From 2ce0f6e402f071c4059be8a8f16842ffd73cdfb1 Mon Sep 17 00:00:00 2001 From: Pedram Navid <1045990+PedramNavid@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:42:48 -0700 Subject: [PATCH] update api docs --- docs/content/api/modules.json | 2 +- docs/content/api/searchindex.json | 2 +- docs/content/api/sections.json | 2 +- docs/next/public/objects.inv | Bin 25426 -> 25428 bytes 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/content/api/modules.json b/docs/content/api/modules.json index b1970d102c04e..b6aa3a0f22f4a 100644 --- a/docs/content/api/modules.json +++ b/docs/content/api/modules.json @@ -1 +1 @@ -{"": {"dagster_pandera": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandera

\nimport itertools\nimport re\nfrom typing import TYPE_CHECKING, Callable, Sequence, Type, Union\n\nimport dagster._check as check\nimport pandas as pd\nimport pandera as pa\nfrom dagster import (\n    DagsterType,\n    TableColumn,\n    TableColumnConstraints,\n    TableConstraints,\n    TableSchema,\n    TypeCheck,\n    TypeCheckContext,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.libraries import DagsterLibraryRegistry\n\nfrom .version import __version__\n\n# NOTE: Pandera supports multiple dataframe libraries. Most of the alternatives\n# to pandas implement a pandas-like API wrapper around an underlying library\n# that can handle big data (a weakness of pandas). Typically this means the\n# data is only partly loaded into memory, or is distributed across multiple\n# nodes. Because Dagster types perform runtime validation within a single\n# Python process, it's not clear at present how to interface the more complex\n# validation computations on distributed dataframes with Dagster Types.\n\n# Therefore, for the time being dagster-pandera only supports pandas dataframes.\n# However, some commented-out scaffolding has been left in place for support of\n# alternatives in the future. These sections are marked with "TODO: pending\n# alternative dataframe support".\n\nif TYPE_CHECKING:\n    ValidatableDataFrame = pd.DataFrame\n\nDagsterLibraryRegistry.register("dagster-pandera", __version__)\n\n# ########################\n# ##### VALID DATAFRAME CLASSES\n# ########################\n\n# This layer of indirection is used because we may support alternative dataframe classes in the\n# future.\nVALID_DATAFRAME_CLASSES = (pd.DataFrame,)\n\n\n# ########################\n# ##### PANDERA SCHEMA TO DAGSTER TYPE\n# ########################\n\n\n
[docs]def pandera_schema_to_dagster_type(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> DagsterType:\n """Convert a Pandera dataframe schema to a `DagsterType`.\n\n The generated Dagster type will be given an automatically generated `name`. The schema's `title`\n property, `name` property, or class name (in that order) will be used. If neither `title` or\n `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.\n\n Additional metadata is also extracted from the Pandera schema and attached to the returned\n `DagsterType` as a metadata dictionary. The extracted metadata includes:\n\n - Descriptions on the schema and constituent columns and checks.\n - Data types for each column.\n - String representations of all column-wise checks.\n - String representations of all row-wise (i.e. "wide") checks.\n\n The returned `DagsterType` type will call the Pandera schema's `validate()` method in its type\n check function. Validation is done in `lazy` mode, i.e. 
pandera will attempt to validate all\n values in the dataframe, rather than stopping on the first error.\n\n If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:\n\n - `num_failures` total number of validation errors.\n - `failure_sample` a table containing up to the first 10 validation errors.\n\n Args:\n schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]):\n\n Returns:\n DagsterType: Dagster Type constructed from the Pandera schema.\n\n """\n if not (\n isinstance(schema, pa.DataFrameSchema)\n or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))\n ):\n raise TypeError(\n "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"\n )\n\n name = _extract_name_from_pandera_schema(schema)\n norm_schema = (\n schema.to_schema()\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)\n else schema\n )\n tschema = _pandera_schema_to_table_schema(norm_schema)\n type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)\n\n return DagsterType(\n type_check_fn=type_check_fn,\n name=name,\n description=norm_schema.description,\n metadata={\n "schema": MetadataValue.table_schema(tschema),\n },\n typing_type=pd.DataFrame,\n )
\n\n\n# call next() on this to generate next unique Dagster Type name for anonymous schemas\n_anonymous_schema_name_generator = (f"DagsterPanderaDataframe{i}" for i in itertools.count(start=1))\n\n\ndef _extract_name_from_pandera_schema(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> str:\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel):\n return (\n getattr(schema.Config, "title", None)\n or getattr(schema.Config, "name", None)\n or schema.__name__\n )\n elif isinstance(schema, pa.DataFrameSchema):\n return schema.title or schema.name or next(_anonymous_schema_name_generator)\n\n\ndef _pandera_schema_to_type_check_fn(\n schema: pa.DataFrameSchema,\n table_schema: TableSchema,\n) -> Callable[[TypeCheckContext, object], TypeCheck]:\n def type_check_fn(_context, value: object) -> TypeCheck:\n if isinstance(value, VALID_DATAFRAME_CLASSES):\n try:\n # `lazy` instructs pandera to capture every (not just the first) validation error\n schema.validate(value, lazy=True)\n except pa.errors.SchemaErrors as e:\n return _pandera_errors_to_type_check(e, table_schema)\n except Exception as e:\n return TypeCheck(\n success=False,\n description=f"Unexpected error during validation: {e}",\n )\n else:\n return TypeCheck(\n success=False,\n description=(\n f"Must be one of {VALID_DATAFRAME_CLASSES}, not {type(value).__name__}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check_fn\n\n\nPANDERA_FAILURE_CASES_SCHEMA = TableSchema(\n columns=[\n TableColumn(\n name="schema_context",\n type="string",\n description="`Column` for column-wise checks, or `DataFrameSchema`",\n ),\n TableColumn(\n name="column",\n type="string",\n description="Column of value that failed the check, or `None` for wide checks.",\n ),\n TableColumn(\n name="check", type="string", description="Description of the failed Pandera check."\n ),\n TableColumn(name="check_number", description="Index of the failed check."),\n TableColumn(\n name="failure_case", 
type="number | string", description="Value that failed a check."\n ),\n TableColumn(\n name="index",\n type="number | string",\n description="Index (row) of value that failed a check.",\n ),\n ]\n)\n\n\ndef _pandera_errors_to_type_check(\n error: pa.errors.SchemaErrors, _table_schema: TableSchema\n) -> TypeCheck:\n return TypeCheck(\n success=False,\n description=str(error),\n )\n\n\ndef _pandera_schema_to_table_schema(schema: pa.DataFrameSchema) -> TableSchema:\n df_constraints = _pandera_schema_wide_checks_to_table_constraints(schema.checks)\n columns = [_pandera_column_to_table_column(col) for k, col in schema.columns.items()]\n return TableSchema(columns=columns, constraints=df_constraints)\n\n\ndef _pandera_schema_wide_checks_to_table_constraints(\n checks: Sequence[Union[pa.Check, pa.Hypothesis]]\n) -> TableConstraints:\n return TableConstraints(other=[_pandera_check_to_table_constraint(check) for check in checks])\n\n\ndef _pandera_check_to_table_constraint(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _pandera_column_to_table_column(pa_column: pa.Column) -> TableColumn:\n constraints = TableColumnConstraints(\n nullable=pa_column.nullable,\n unique=pa_column.unique,\n other=[_pandera_check_to_column_constraint(pa_check) for pa_check in pa_column.checks],\n )\n name = check.not_none(pa_column.name, "name")\n name = name if isinstance(name, str) else "/".join(name)\n return TableColumn(\n name=name,\n type=str(pa_column.dtype),\n description=pa_column.description,\n constraints=constraints,\n )\n\n\nCHECK_OPERATORS = {\n "equal_to": "==",\n "not_equal_to": "!=",\n "less_than": "<",\n "less_than_or_equal_to": "<=",\n "greater_than": ">",\n "greater_than_or_equal_to": ">=",\n}\n\n\ndef _extract_operand(error_str: str) -> str:\n match = re.search(r"(?<=\\().+(?=\\))", error_str)\n return match.group(0) if match else ""\n\n\ndef _pandera_check_to_column_constraint(pa_check: pa.Check) -> str:\n if 
pa_check.description:\n return pa_check.description\n elif pa_check.name in CHECK_OPERATORS:\n assert isinstance(\n pa_check.error, str\n ), "Expected pandera check to have string `error` attr."\n return f"{CHECK_OPERATORS[pa_check.name]} {_extract_operand(pa_check.error)}"\n else:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _get_pandera_check_identifier(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return pa_check.description or pa_check.error or pa_check.name or str(pa_check)\n\n\n__all__ = [\n "pandera_schema_to_dagster_type",\n]\n
", "current_page_name": "_modules/dagster_pandera", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandera"}, "index": {"alabaster_version": "0.7.13", "body": "

All modules for which code is available

\n", "current_page_name": "_modules/index", "customsidebar": null, "favicon_url": null, "logo_url": null, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "Overview: module code"}}, "dagster": {"_config": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_schema

\nfrom typing import TYPE_CHECKING, Any, Dict, List, Mapping, Sequence, Type, Union\n\nfrom typing_extensions import TypeAlias\n\nif TYPE_CHECKING:\n    from dagster._config import ConfigType, Field\n\n# Eventually, the below `UserConfigSchema` should be renamed to `ConfigSchema` and the class\n# definition should be dropped. The reason we don't do this now is that sphinx autodoc doesn't\n# support type aliases, so there is no good way to gracefully attach a docstring to this and have it\n# show up in the docs. See: https://github.com/sphinx-doc/sphinx/issues/8934\n#\n# Unfortunately mypy doesn't support recursive types, which would be used to properly define the\n# List/Dict elements of this union: `Dict[str, ConfigSchema]`, `List[ConfigSchema]`.\nUserConfigSchema: TypeAlias = Union[\n    Type[Union[bool, float, int, str]],\n    Type[Union[Dict[Any, Any], List[Any]]],\n    "ConfigType",\n    "Field",\n    Mapping[str, Any],\n    Sequence[Any],\n]\n\n\n
[docs]class ConfigSchema:\n """Placeholder type for config schemas.\n\n Any time that it appears in documentation, it means that any of the following types are\n acceptable:\n\n #. A Python scalar type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`). For example:\n\n * ``@op(config_schema=int)``\n * ``@op(config_schema=str)``\n\n #. A built-in python collection (:py:class:`~python:list`, or :py:class:`~python:dict`).\n :py:class:`~python:list` is exactly equivalent to :py:class:`~dagster.Array` [\n :py:class:`~dagster.Any` ] and :py:class:`~python:dict` is equivalent to\n :py:class:`~dagster.Permissive`. For example:\n\n * ``@op(config_schema=list)``\n * ``@op(config_schema=dict)``\n\n #. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.Map`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n\n #. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules. For example:\n\n * ``{'some_config': str}`` is equivalent to ``Shape({'some_config: str})``.\n\n * ``{'some_config1': {'some_config2': str}}`` is equivalent to\n ``Shape({'some_config1: Shape({'some_config2: str})})``.\n\n #. A bare python list of length one, whose single element will be wrapped in a\n :py:class:`~dagster.Array` is resolved recursively according to the same\n rules. 
For example:\n\n * ``[str]`` is equivalent to ``Array[str]``.\n\n * ``[[str]]`` is equivalent to ``Array[Array[str]]``.\n\n * ``[{'some_config': str}]`` is equivalent to ``Array(Shape({'some_config: str}))``.\n\n #. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self):\n raise NotImplementedError(\n "ConfigSchema is a placeholder type and should not be instantiated."\n )
\n
", "current_page_name": "_modules/dagster/_config/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_schema"}, "config_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_type

\nimport typing\nfrom enum import Enum as PythonEnum\nfrom typing import TYPE_CHECKING, Dict, Iterator, Optional, Sequence, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from .snap import ConfigSchemaSnapshot, ConfigTypeSnap\n\n\n@whitelist_for_serdes\nclass ConfigTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    ENUM = "ENUM"\n\n    SELECTOR = "SELECTOR"\n    STRICT_SHAPE = "STRICT_SHAPE"\n    PERMISSIVE_SHAPE = "PERMISSIVE_SHAPE"\n    SCALAR_UNION = "SCALAR_UNION"\n\n    MAP = "MAP"\n\n    # Closed generic types\n    ARRAY = "ARRAY"\n    NONEABLE = "NONEABLE"\n\n    @staticmethod\n    def has_fields(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR or ConfigTypeKind.is_shape(kind)\n\n    @staticmethod\n    def is_closed_generic(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return (\n            kind == ConfigTypeKind.ARRAY\n            or kind == ConfigTypeKind.NONEABLE\n            or kind == ConfigTypeKind.SCALAR_UNION\n            or kind == ConfigTypeKind.MAP\n        )\n\n    @staticmethod\n    def is_shape(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.STRICT_SHAPE or kind == ConfigTypeKind.PERMISSIVE_SHAPE\n\n    @staticmethod\n    def is_selector(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR\n\n\nclass ConfigType:\n    """The class backing DagsterTypes as they are used processing configuration data."""\n\n    def __init__(\n        self,\n        key: str,\n        kind: ConfigTypeKind,\n        given_name: Optional[str] = None,\n        
description: Optional[str] = None,\n        type_params: Optional[Sequence["ConfigType"]] = None,\n    ):\n        self.key: str = check.str_param(key, "key")\n        self.kind: ConfigTypeKind = check.inst_param(kind, "kind", ConfigTypeKind)\n        self.given_name: Optional[str] = check.opt_str_param(given_name, "given_name")\n        self._description: Optional[str] = check.opt_str_param(description, "description")\n        self.type_params: Optional[Sequence[ConfigType]] = (\n            check.sequence_param(type_params, "type_params", of_type=ConfigType)\n            if type_params\n            else None\n        )\n\n        # memoized snap representation\n        self._snap: Optional["ConfigTypeSnap"] = None\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @staticmethod\n    def from_builtin_enum(builtin_enum: typing.Any) -> "ConfigType":\n        check.invariant(BuiltinEnum.contains(builtin_enum), "param must be member of BuiltinEnum")\n        return _CONFIG_MAP[builtin_enum]\n\n    def post_process(self, value):\n        """Implement this in order to take a value provided by the user\n        and perform computation on it. This can be done to coerce data types,\n        fetch things from the environment (e.g. environment variables), or\n        to do custom validation. If the value is not valid, throw a\n        PostProcessingError. 
Otherwise return the coerced value.\n        """\n        return value\n\n    def get_snapshot(self) -> "ConfigTypeSnap":\n        from .snap import snap_from_config_type\n\n        if self._snap is None:\n            self._snap = snap_from_config_type(self)\n\n        return self._snap\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        yield self\n\n    def get_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n        from .snap import ConfigSchemaSnapshot\n\n        return ConfigSchemaSnapshot({ct.key: ct.get_snapshot() for ct in self.type_iterator()})\n\n\n@whitelist_for_serdes\nclass ConfigScalarKind(PythonEnum):\n    INT = "INT"\n    STRING = "STRING"\n    FLOAT = "FLOAT"\n    BOOL = "BOOL"\n\n\n# Scalars, Composites, Selectors, Lists, Optional, Any\n\n\nclass ConfigScalar(ConfigType):\n    def __init__(\n        self,\n        key: str,\n        given_name: Optional[str],\n        scalar_kind: ConfigScalarKind,\n        **kwargs: typing.Any,\n    ):\n        self.scalar_kind = check.inst_param(scalar_kind, "scalar_kind", ConfigScalarKind)\n        super(ConfigScalar, self).__init__(\n            key, kind=ConfigTypeKind.SCALAR, given_name=given_name, **kwargs\n        )\n\n\nclass BuiltinConfigScalar(ConfigScalar):\n    def __init__(self, scalar_kind, description=None):\n        super(BuiltinConfigScalar, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=scalar_kind,\n            description=description,\n        )\n\n\nclass Int(BuiltinConfigScalar):\n    def __init__(self):\n        super(Int, self).__init__(scalar_kind=ConfigScalarKind.INT, description="")\n\n\nclass String(BuiltinConfigScalar):\n    def __init__(self):\n        super(String, self).__init__(scalar_kind=ConfigScalarKind.STRING, description="")\n\n\nclass Bool(BuiltinConfigScalar):\n    def __init__(self):\n        super(Bool, self).__init__(scalar_kind=ConfigScalarKind.BOOL, description="")\n\n\nclass 
Float(BuiltinConfigScalar):\n    def __init__(self):\n        super(Float, self).__init__(scalar_kind=ConfigScalarKind.FLOAT, description="")\n\n    def post_process(self, value):\n        return float(value)\n\n\nclass Any(ConfigType):\n    def __init__(self):\n        super(Any, self).__init__(\n            key="Any",\n            given_name="Any",\n            kind=ConfigTypeKind.ANY,\n        )\n\n\n
[docs]class Noneable(ConfigType):\n """Defines a configuration type that is the union of ``NoneType`` and the type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n\n **Examples:**\n\n .. code-block:: python\n\n config_schema={"name": Noneable(str)}\n\n config={"name": "Hello"} # Ok\n config={"name": None} # Ok\n config={} # Error\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Noneable, self).__init__(\n key=f"Noneable.{self.inner_type.key}",\n kind=ConfigTypeKind.NONEABLE,\n type_params=[self.inner_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class Array(ConfigType):\n """Defines an array (list) configuration type that contains values of type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Array, self).__init__(\n key=f"Array.{self.inner_type.key}",\n type_params=[self.inner_type],\n kind=ConfigTypeKind.ARRAY,\n )\n\n @public\n @property\n def description(self) -> str:\n """A human-readable description of this Array type."""\n return f"List of {self.key}"\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class EnumValue:\n """Define an entry in a :py:class:`Enum`.\n\n Args:\n config_value (str):\n The string representation of the config to accept when passed.\n python_value (Optional[Any]):\n The python value to convert the enum entry in to. Defaults to the ``config_value``.\n description (Optional[str]):\n A human-readable description of the enum entry.\n\n """\n\n def __init__(\n self,\n config_value: str,\n python_value: Optional[object] = None,\n description: Optional[str] = None,\n ):\n self.config_value = check.str_param(config_value, "config_value")\n self.python_value = config_value if python_value is None else python_value\n self.description = check.opt_str_param(description, "description")
\n\n\n
[docs]class Enum(ConfigType):\n """Defines a enum configuration type that allows one of a defined set of possible values.\n\n Args:\n name (str):\n The name of the enum configuration type.\n enum_values (List[EnumValue]):\n The set of possible values for the enum configuration type.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Enum(\n 'CowboyType',\n [\n EnumValue('good'),\n EnumValue('bad'),\n EnumValue('ugly'),\n ]\n )\n )\n )\n def resolve_standoff(context):\n # ...\n """\n\n def __init__(self, name: str, enum_values: Sequence[EnumValue]):\n check.str_param(name, "name")\n super(Enum, self).__init__(key=name, given_name=name, kind=ConfigTypeKind.ENUM)\n self.enum_values = check.sequence_param(enum_values, "enum_values", of_type=EnumValue)\n self._valid_python_values = {ev.python_value for ev in enum_values}\n check.invariant(len(self._valid_python_values) == len(enum_values))\n self._valid_config_values = {ev.config_value for ev in enum_values}\n check.invariant(len(self._valid_config_values) == len(enum_values))\n\n @property\n def config_values(self):\n return [ev.config_value for ev in self.enum_values]\n\n def is_valid_config_enum_value(self, config_value):\n return config_value in self._valid_config_values\n\n def post_process(self, value: typing.Any) -> typing.Any:\n if isinstance(value, PythonEnum):\n value = value.name\n\n for ev in self.enum_values:\n if ev.config_value == value:\n return ev.python_value\n\n check.failed(f"Should never reach this. config_value should be pre-validated. Got {value}")\n\n @classmethod\n def from_python_enum(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. 
code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v) for v in enum])\n\n @classmethod\n def from_python_enum_direct_values(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum, where the direct values are passed instead of symbolic values (IE, enum.symbol.value as opposed to enum.symbol).\n\n This is necessary for internal usage, as the symbolic values are not serializable.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED.value\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v.value) for v in enum])
\n\n\n
[docs]class ScalarUnion(ConfigType):\n """Defines a configuration type that accepts a scalar value OR a non-scalar value like a\n :py:class:`~dagster.List`, :py:class:`~dagster.Dict`, or :py:class:`~dagster.Selector`.\n\n This allows runtime scalars to be configured without a dictionary with the key ``value`` and\n instead just use the scalar value directly. However this still leaves the option to\n load scalars from a json or pickle file.\n\n Args:\n scalar_type (type):\n The scalar type of values that this configuration type can hold. For example,\n :py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`.\n non_scalar_schema (ConfigSchema):\n The schema of a non-scalar Dagster configuration type. For example, :py:class:`List`,\n :py:class:`Dict`, or :py:class:`~dagster.Selector`.\n key (Optional[str]):\n The configuation type's unique key. If not set, then the key will be set to\n ``ScalarUnion.{scalar_type}-{non_scalar_schema}``.\n\n **Examples:**\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word:\n value: foobar\n\n\n becomes, optionally,\n\n\n .. 
code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word: foobar\n """\n\n def __init__(\n self,\n scalar_type: typing.Any,\n non_scalar_schema: UserConfigSchema,\n _key: Optional[str] = None,\n ):\n from .field import resolve_to_config_type\n\n self.scalar_type = check.inst(\n cast(ConfigType, resolve_to_config_type(scalar_type)), ConfigType\n )\n self.non_scalar_type = resolve_to_config_type(non_scalar_schema)\n\n check.param_invariant(self.scalar_type.kind == ConfigTypeKind.SCALAR, "scalar_type")\n check.param_invariant(\n self.non_scalar_type.kind\n in {ConfigTypeKind.STRICT_SHAPE, ConfigTypeKind.SELECTOR, ConfigTypeKind.ARRAY},\n "non_scalar_type",\n )\n\n # https://github.com/dagster-io/dagster/issues/2133\n key = check.opt_str_param(\n _key, "_key", f"ScalarUnion.{self.scalar_type.key}-{self.non_scalar_type.key}"\n )\n\n super(ScalarUnion, self).__init__(\n key=key,\n kind=ConfigTypeKind.SCALAR_UNION,\n type_params=[self.scalar_type, self.non_scalar_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.scalar_type.type_iterator()\n yield from self.non_scalar_type.type_iterator()\n yield from super().type_iterator()
\n\n\nConfigAnyInstance: Any = Any()\nConfigBoolInstance: Bool = Bool()\nConfigFloatInstance: Float = Float()\nConfigIntInstance: Int = Int()\nConfigStringInstance: String = String()\n\n_CONFIG_MAP: Dict[check.TypeOrTupleOfTypes, ConfigType] = {\n BuiltinEnum.ANY: ConfigAnyInstance,\n BuiltinEnum.BOOL: ConfigBoolInstance,\n BuiltinEnum.FLOAT: ConfigFloatInstance,\n BuiltinEnum.INT: ConfigIntInstance,\n BuiltinEnum.STRING: ConfigStringInstance,\n}\n\n\n_CONFIG_MAP_BY_NAME: Dict[str, ConfigType] = {\n "Any": ConfigAnyInstance,\n "Bool": ConfigBoolInstance,\n "Float": ConfigFloatInstance,\n "Int": ConfigIntInstance,\n "String": ConfigStringInstance,\n}\n\nALL_CONFIG_BUILTINS = set(_CONFIG_MAP.values())\n\n\ndef get_builtin_scalar_by_name(type_name: str):\n if type_name not in _CONFIG_MAP_BY_NAME:\n check.failed(f"Scalar {type_name} is not supported")\n return _CONFIG_MAP_BY_NAME[type_name]\n
", "current_page_name": "_modules/dagster/_config/config_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_type"}, "field": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field

\nfrom typing import Any, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster._serdes import serialize_value\nfrom dagster._seven import is_subclass\nfrom dagster._utils import is_enum_value\nfrom dagster._utils.typing_api import is_closed_python_optional_type, is_typing_type\n\nfrom .config_type import Array, ConfigAnyInstance, ConfigType, ConfigTypeKind\nfrom .field_utils import FIELD_NO_DEFAULT_PROVIDED, Map, all_optional_type\n\n\ndef _is_config_type_class(obj) -> bool:\n    return isinstance(obj, type) and is_subclass(obj, ConfigType)\n\n\ndef helpful_list_error_string() -> str:\n    return "Please use a python list (e.g. [int]) or dagster.Array (e.g. Array(int)) instead."\n\n\nVALID_CONFIG_DESC = """\n1. A Python primitive type that resolve to dagster config\n   types: int, float, bool, str.\n\n2. A dagster config type: Int, Float, Bool, String, StringSource, Path, Any,\n   Array, Noneable, Selector, Shape, Permissive, etc.\n\n3. A bare python dictionary, which is wrapped in Shape. Any\n   values in the dictionary get resolved by the same rules, recursively.\n\n4. 
A bare python list of length one which itself is config type.\n   Becomes Array with list element as an argument.\n"""\n\n\n@overload\ndef resolve_to_config_type(obj: Union[ConfigType, UserConfigSchema]) -> ConfigType:\n    pass\n\n\n@overload\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    pass\n\n\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    from .field_utils import convert_fields_to_dict_type\n\n    # Short circuit if it's already a Config Type\n    if isinstance(obj, ConfigType):\n        return obj\n\n    if isinstance(obj, dict):\n        # Dicts of the special form {type: value} are treated as Maps\n        # mapping from the type to value type, otherwise treat as dict type\n        if len(obj) == 1:\n            key = next(iter(obj.keys()))\n            key_type = resolve_to_config_type(key)\n            if not isinstance(key, str):\n                if not key_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid key in map specification: {key!r} in map {obj}"\n                    )\n\n                if not key_type.kind == ConfigTypeKind.SCALAR:  # type: ignore\n                    raise DagsterInvalidDefinitionError(\n                        f"Non-scalar key in map specification: {key!r} in map {obj}"\n                    )\n\n                inner_type = resolve_to_config_type(obj[key])\n\n                if not inner_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid value in map specification: {obj[str]!r} in map {obj}"\n                    )\n                return Map(key_type, inner_type)\n        return convert_fields_to_dict_type(obj)\n\n    if isinstance(obj, list):\n        if len(obj) != 1:\n            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")\n\n        inner_type = resolve_to_config_type(obj[0])\n\n        if not inner_type:\n            raise 
DagsterInvalidDefinitionError(\n                f"Invalid member of array specification: {obj[0]!r} in list {obj}"\n            )\n        return Array(inner_type)\n\n    if BuiltinEnum.contains(obj):\n        return ConfigType.from_builtin_enum(obj)\n\n    from .primitive_mapping import (\n        is_supported_config_python_builtin,\n        remap_python_builtin_for_config,\n    )\n\n    if is_supported_config_python_builtin(obj):\n        return remap_python_builtin_for_config(obj)\n\n    if obj is None:\n        return ConfigAnyInstance\n\n    # Special error messages for passing a DagsterType\n    from dagster._core.types.dagster_type import DagsterType, List, ListType\n    from dagster._core.types.python_set import Set, _TypedPythonSet\n    from dagster._core.types.python_tuple import Tuple, _TypedPythonTuple\n\n    if _is_config_type_class(obj):\n        check.param_invariant(\n            False,\n            "dagster_type",\n            f"Cannot pass config type class {obj} to resolve_to_config_type. This error usually"\n            " occurs when you pass a dagster config type class instead of a class instance into"\n            ' another dagster config type. E.g. "Noneable(Permissive)" should instead be'\n            ' "Noneable(Permissive())".',\n        )\n\n    if isinstance(obj, type) and is_subclass(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed a DagsterType class {obj!r} to the config system. "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}"\n        )\n\n    if is_closed_python_optional_type(obj):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use typing.Optional as a config type. 
If you want this field to be "\n            "optional, please use Field(<type>, is_required=False), and if you want this field to "\n            "be required, but accept a value of None, use dagster.Noneable(<type>)."\n        )\n\n    if is_typing_type(obj):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed in {obj} to the config system. Types from "\n            "the typing module in python are not allowed in the config system. "\n            "You must use types that are imported from dagster or primitive types "\n            "such as bool, int, etc."\n        )\n\n    if obj is List or isinstance(obj, ListType):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use List in the context of config. " + helpful_list_error_string()\n        )\n\n    if obj is Set or isinstance(obj, _TypedPythonSet):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Set in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if obj is Tuple or isinstance(obj, _TypedPythonTuple):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if isinstance(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed an instance of DagsterType {obj.display_name} to the config "\n            f"system (Repr of type: {obj!r}). "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}",\n        )\n\n    # This means that this is an error and we are return False to a callsite\n    # We do the error reporting there because those callsites have more context\n    return False\n\n\ndef has_implicit_default(config_type):\n    if config_type.kind == ConfigTypeKind.NONEABLE:\n        return True\n\n    return all_optional_type(config_type)\n\n\n
[docs]class Field:\n """Defines the schema for a configuration field.\n\n Fields are used in config schema instead of bare types when one wants to add a description,\n a default value, or to mark it as not required.\n\n Config fields are parsed according to their schemas in order to yield values available at\n job execution time through the config system. Config fields can be set on ops, on\n loaders for custom, and on other pluggable components of the system, such as resources, loggers,\n and executors.\n\n\n Args:\n config (Any): The schema for the config. This value can be any of:\n\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n default_value (Any):\n A default value for this field, conformant to the schema set by the ``dagster_type``\n argument. 
If a default value is provided, ``is_required`` should be ``False``.\n\n Note: for config types that do post processing such as Enum, this value must be\n the pre processed version, ie use ``ExampleEnum.VALUE.name`` instead of\n ``ExampleEnum.VALUE``\n\n is_required (bool):\n Whether the presence of this field is required. Defaults to true. If ``is_required``\n is ``True``, no default value should be provided.\n\n description (str):\n A human-readable description of this config field.\n\n Examples:\n .. code-block:: python\n\n @op(\n config_schema={\n 'word': Field(str, description='I am a word.'),\n 'repeats': Field(Int, default_value=1, is_required=False),\n }\n )\n def repeat_word(context):\n return context.op_config['word'] * context.op_config['repeats']\n """\n\n def _resolve_config_arg(self, config):\n if isinstance(config, ConfigType):\n return config\n\n config_type = resolve_to_config_type(config)\n if not config_type:\n raise DagsterInvalidDefinitionError(\n f"Attempted to pass {config!r} to a Field that expects a valid "\n "dagster type usable in config (e.g. 
Dict, Int, String et al)."\n )\n return config_type\n\n def __init__(\n self,\n config: Any,\n default_value: Any = FIELD_NO_DEFAULT_PROVIDED,\n is_required: Optional[bool] = None,\n description: Optional[str] = None,\n ):\n from .post_process import resolve_defaults\n from .validate import validate_config\n\n self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)\n\n self._description = check.opt_str_param(description, "description")\n\n check.opt_bool_param(is_required, "is_required")\n\n if default_value != FIELD_NO_DEFAULT_PROVIDED:\n check.param_invariant(\n not (callable(default_value)), "default_value", "default_value cannot be a callable"\n )\n\n if is_required is True:\n check.param_invariant(\n default_value == FIELD_NO_DEFAULT_PROVIDED,\n "default_value",\n "required arguments should not specify default values",\n )\n\n self._default_value = default_value\n\n # check explicit default value\n if self.default_provided:\n if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed into a python enum value as the default value "\n "into of a config enum type {name}. You must pass in the underlying "\n "string represention as the default value. 
One of {value_set}."\n ).format(\n value_set=[ev.config_value for ev in self.config_type.enum_values],\n name=self.config_type.given_name,\n )\n )\n\n evr = validate_config(self.config_type, default_value)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Invalid default_value for Field.",\n evr.errors,\n default_value,\n )\n\n if is_required is None:\n is_optional = has_implicit_default(self.config_type) or self.default_provided\n is_required = not is_optional\n\n # on implicitly optional - set the default value\n # by resolving the defaults of the type\n if is_optional and not self.default_provided:\n evr = resolve_defaults(self.config_type, None)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Unable to resolve implicit default_value for Field.",\n evr.errors,\n None,\n )\n self._default_value = evr.value\n self._is_required = is_required\n\n @public\n @property\n def is_required(self) -> bool:\n """Whether a value for this field must be provided at runtime.\n\n Cannot be True if a default value is provided.\n """\n return self._is_required\n\n @public\n @property\n def default_provided(self) -> bool:\n """Was a default value provided.\n\n Returns:\n bool: Yes or no\n """\n return self._default_value != FIELD_NO_DEFAULT_PROVIDED\n\n @public\n @property\n def default_value(self) -> Any:\n """The default value for the field.\n\n Raises an exception if no default value was provided.\n """\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return self._default_value\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of this config field, if provided."""\n return self._description\n\n @property\n def default_value_as_json_str(self) -> str:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return serialize_value(self.default_value)\n\n def __repr__(self) -> str:\n return ("Field({config_type}, default={default}, 
is_required={is_required})").format(\n config_type=self.config_type,\n default=(\n "@" if self._default_value == FIELD_NO_DEFAULT_PROVIDED else self._default_value\n ),\n is_required=self.is_required,\n )
\n\n\ndef check_opt_field_param(obj: object, param_name: str) -> Optional[Field]:\n return check.opt_inst_param(cast(Optional[Field], obj), param_name, Field)\n
", "current_page_name": "_modules/dagster/_config/field", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field"}, "field_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field_utils

\n# encoding: utf-8\nimport hashlib\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Mapping, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidConfigDefinitionError\n\nfrom .config_type import Array, ConfigType, ConfigTypeKind\n\nif TYPE_CHECKING:\n    from dagster._config import Field\n\n\ndef all_optional_type(config_type: ConfigType) -> bool:\n    check.inst_param(config_type, "config_type", ConfigType)\n\n    if ConfigTypeKind.is_shape(config_type.kind):\n        for field in config_type.fields.values():  # type: ignore\n            if field.is_required:\n                return False\n        return True\n\n    if ConfigTypeKind.is_selector(config_type.kind):\n        if len(config_type.fields) == 1:  # type: ignore\n            for field in config_type.fields.values():  # type: ignore\n                if field.is_required:\n                    return False\n            return True\n\n    return False\n\n\nclass __FieldValueSentinel:\n    pass\n\n\nclass __InferOptionalCompositeFieldSentinel:\n    pass\n\n\nFIELD_NO_DEFAULT_PROVIDED = __FieldValueSentinel\n\nINFER_OPTIONAL_COMPOSITE_FIELD = __InferOptionalCompositeFieldSentinel\n\n\nclass _ConfigHasFields(ConfigType):\n    def __init__(self, fields, **kwargs):\n        self.fields = expand_fields_dict(fields)\n        super(_ConfigHasFields, self).__init__(**kwargs)\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        for field in self.fields.values():\n            yield from field.config_type.type_iterator()\n        yield from super().type_iterator()\n\n\nFIELD_HASH_CACHE: Dict[str, Any] = {}\n\n\ndef _memoize_inst_in_field_cache(passed_cls, defined_cls, key):\n    if key in FIELD_HASH_CACHE:\n        return FIELD_HASH_CACHE[key]\n\n    defined_cls_inst = super(defined_cls, passed_cls).__new__(defined_cls)\n    defined_cls_inst._initialized = False  # noqa: SLF001\n    FIELD_HASH_CACHE[key] 
= defined_cls_inst\n    return defined_cls_inst\n\n\ndef _add_hash(m, string):\n    m.update(string.encode("utf-8"))\n\n\ndef compute_fields_hash(fields, description, field_aliases=None):\n    m = hashlib.sha1()  # so that hexdigest is 40, not 64 bytes\n    if description:\n        _add_hash(m, ":description: " + description)\n\n    for field_name in sorted(list(fields.keys())):\n        field = fields[field_name]\n        _add_hash(m, ":fieldname:" + field_name)\n        if field.default_provided:\n            _add_hash(m, ":default_value: " + field.default_value_as_json_str)\n        _add_hash(m, ":is_required: " + str(field.is_required))\n        _add_hash(m, ":type_key: " + field.config_type.key)\n        if field.description:\n            _add_hash(m, ":description: " + field.description)\n\n    field_aliases = check.opt_dict_param(\n        field_aliases, "field_aliases", key_type=str, value_type=str\n    )\n    for field_name in sorted(list(field_aliases.keys())):\n        field_alias = field_aliases[field_name]\n        _add_hash(m, ":fieldname: " + field_name)\n        _add_hash(m, ":fieldalias: " + field_alias)\n\n    return m.hexdigest()\n\n\ndef _define_shape_key_hash(fields, description, field_aliases):\n    return "Shape." + compute_fields_hash(fields, description, field_aliases=field_aliases)\n\n\n
[docs]class Shape(_ConfigHasFields):\n """Schema for configuration data with string keys and typed values via :py:class:`Field`.\n\n Unlike :py:class:`Permissive`, unspecified fields are not allowed and will throw a\n :py:class:`~dagster.DagsterInvalidConfigError`.\n\n Args:\n fields (Dict[str, Field]):\n The specification of the config dict.\n field_aliases (Dict[str, str]):\n Maps a string key to an alias that can be used instead of the original key. For example,\n an entry {"foo": "bar"} means that someone could use "bar" instead of "foo" as a\n top level string key.\n """\n\n def __new__(\n cls,\n fields,\n description=None,\n field_aliases=None,\n ):\n return _memoize_inst_in_field_cache(\n cls,\n Shape,\n _define_shape_key_hash(expand_fields_dict(fields), description, field_aliases),\n )\n\n def __init__(\n self,\n fields,\n description=None,\n field_aliases=None,\n ):\n # if we hit in the field cache - skip double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Shape, self).__init__(\n kind=ConfigTypeKind.STRICT_SHAPE,\n key=_define_shape_key_hash(fields, description, field_aliases),\n description=description,\n fields=fields,\n )\n self.field_aliases = check.opt_dict_param(\n field_aliases, "field_aliases", key_type=str, value_type=str\n )\n self._initialized = True
\n\n\n
[docs]class Map(ConfigType):\n """Defines a config dict with arbitrary scalar keys and typed values.\n\n A map can contrain arbitrary keys of the specified scalar type, each of which has\n type checked values. Unlike :py:class:`Shape` and :py:class:`Permissive`, scalar\n keys other than strings can be used, and unlike :py:class:`Permissive`, all\n values are type checked.\n\n Args:\n key_type (type):\n The type of keys this map can contain. Must be a scalar type.\n inner_type (type):\n The type of the values that this map type can contain.\n key_label_name (string):\n Optional name which describes the role of keys in the map.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Map({str: int})))\n def partially_specified_config(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __init__(self, key_type, inner_type, key_label_name=None):\n from .field import resolve_to_config_type\n\n self.key_type = resolve_to_config_type(key_type)\n self.inner_type = resolve_to_config_type(inner_type)\n self.given_name = key_label_name\n\n check.inst_param(self.key_type, "key_type", ConfigType)\n check.inst_param(self.inner_type, "inner_type", ConfigType)\n check.param_invariant(\n self.key_type.kind == ConfigTypeKind.SCALAR, "key_type", "Key type must be a scalar"\n )\n check.opt_str_param(self.given_name, "name")\n\n super(Map, self).__init__(\n key="Map.{key_type}.{inner_type}{name_key}".format(\n key_type=self.key_type.key,\n inner_type=self.inner_type.key,\n name_key=f":name: {key_label_name}" if key_label_name else "",\n ),\n # We use the given name field to store the key label name\n # this is used elsewhere to give custom types names\n given_name=key_label_name,\n type_params=[self.key_type, self.inner_type],\n kind=ConfigTypeKind.MAP,\n )\n\n @public\n @property\n def key_label_name(self) -> Optional[str]:\n """Name which describes the role of keys in the map, if provided."""\n return self.given_name\n\n def 
type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.key_type.type_iterator()\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\ndef _define_permissive_dict_key(fields, description):\n return (\n "Permissive." + compute_fields_hash(fields, description=description)\n if fields\n else "Permissive"\n )\n\n\n
[docs]class Permissive(_ConfigHasFields):\n """Defines a config dict with a partially specified schema.\n\n A permissive dict allows partial specification of the config schema. Any fields with a\n specified schema will be type checked. Other fields will be allowed, but will be ignored by\n the type checker.\n\n Args:\n fields (Dict[str, Field]): The partial specification of the config dict.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Permissive({'required': Field(String)})))\n def map_config_op(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __new__(cls, fields=None, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Permissive,\n _define_permissive_dict_key(\n expand_fields_dict(fields) if fields else None, description\n ),\n )\n\n def __init__(self, fields=None, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields) if fields else None\n super(Permissive, self).__init__(\n key=_define_permissive_dict_key(fields, description),\n kind=ConfigTypeKind.PERMISSIVE_SHAPE,\n fields=fields or dict(),\n description=description,\n )\n self._initialized = True
\n\n\ndef _define_selector_key(fields, description):\n return "Selector." + compute_fields_hash(fields, description=description)\n\n\n
[docs]class Selector(_ConfigHasFields):\n """Define a config field requiring the user to select one option.\n\n Selectors are used when you want to be able to present several different options in config but\n allow only one to be selected. For example, a single input might be read in from either a csv\n file or a parquet file, but not both at once.\n\n Note that in some other type systems this might be called an 'input union'.\n\n Functionally, a selector is like a :py:class:`Dict`, except that only one key from the dict can\n be specified in valid config.\n\n Args:\n fields (Dict[str, Field]): The fields from which the user must select.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Selector(\n {\n 'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n 'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n 'en': {'whom': Field(String, default_value='world', is_required=False)},\n }\n ),\n is_required=False,\n default_value={'en': {'whom': 'world'}},\n )\n )\n def hello_world_with_default(context):\n if 'haw' in context.op_config:\n return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n if 'cn' in context.op_config:\n return '\u4f60\u597d, {whom}!'.format(whom=context.op_config['cn']['whom'])\n if 'en' in context.op_config:\n return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n """\n\n def __new__(cls, fields, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Selector,\n _define_selector_key(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Selector, self).__init__(\n key=_define_selector_key(fields, description),\n kind=ConfigTypeKind.SELECTOR,\n fields=fields,\n description=description,\n )\n self._initialized = True
\n\n\n# Config syntax expansion code below\n\n\ndef is_potential_field(potential_field: object) -> bool:\n from .field import Field, resolve_to_config_type\n\n return isinstance(potential_field, (Field, dict, list)) or bool(\n resolve_to_config_type(potential_field)\n )\n\n\ndef convert_fields_to_dict_type(fields: Mapping[str, object]):\n return _convert_fields_to_dict_type(fields, fields, [])\n\n\ndef _convert_fields_to_dict_type(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Shape:\n return Shape(_expand_fields_dict(original_root, fields, stack))\n\n\ndef expand_fields_dict(fields: Mapping[str, object]) -> Mapping[str, "Field"]:\n return _expand_fields_dict(fields, fields, [])\n\n\ndef _expand_fields_dict(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Mapping[str, "Field"]:\n check.mapping_param(fields, "fields")\n return {\n name: _convert_potential_field(original_root, value, stack + [name])\n for name, value in fields.items()\n }\n\n\ndef expand_list(original_root: object, the_list: Sequence[object], stack: List[str]) -> Array:\n if len(the_list) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_list, stack, "List must be of length 1"\n )\n\n inner_type = _convert_potential_type(original_root, the_list[0], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_list,\n stack,\n "List have a single item and contain a valid type i.e. [int]. 
Got item {}".format(\n repr(the_list[0])\n ),\n )\n\n return Array(inner_type)\n\n\ndef expand_map(original_root: object, the_dict: Mapping[object, object], stack: List[str]) -> Map:\n if len(the_dict) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_dict, stack, "Map dict must be of length 1"\n )\n\n key = next(iter(the_dict.keys()))\n key_type = _convert_potential_type(original_root, key, stack)\n if not key_type or not key_type.kind == ConfigTypeKind.SCALAR:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n f"Map dict must have a scalar type as its only key. Got key {key!r}",\n )\n\n inner_type = _convert_potential_type(original_root, the_dict[key], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map must have a single value and contain a valid type i.e. {{str: int}}. Got item {}"\n .format(repr(the_dict[key])),\n )\n\n return Map(key_type, inner_type)\n\n\ndef convert_potential_field(potential_field: object) -> "Field":\n return _convert_potential_field(potential_field, potential_field, [])\n\n\ndef _convert_potential_type(original_root: object, potential_type, stack: List[str]):\n from .field import resolve_to_config_type\n\n if isinstance(potential_type, Mapping):\n # A dictionary, containing a single key which is a type (int, str, etc) and not a string is interpreted as a Map\n if len(potential_type) == 1:\n key = next(iter(potential_type.keys()))\n if not isinstance(key, str) and _convert_potential_type(original_root, key, stack):\n return expand_map(original_root, potential_type, stack)\n\n # Otherwise, the dictionary is interpreted as a Shape\n return Shape(_expand_fields_dict(original_root, potential_type, stack))\n\n if isinstance(potential_type, list):\n return expand_list(original_root, potential_type, stack)\n\n return resolve_to_config_type(potential_type)\n\n\ndef _convert_potential_field(\n original_root: object, 
potential_field: object, stack: List[str]\n) -> "Field":\n from .field import Field\n\n if potential_field is None:\n raise DagsterInvalidConfigDefinitionError(\n original_root, potential_field, stack, reason="Fields cannot be None"\n )\n\n if not is_potential_field(potential_field):\n raise DagsterInvalidConfigDefinitionError(original_root, potential_field, stack)\n\n if isinstance(potential_field, Field):\n return potential_field\n\n return Field(_convert_potential_type(original_root, potential_field, stack))\n\n\ndef config_dictionary_from_values(\n values: Mapping[str, Any], config_field: "Field"\n) -> Dict[str, Any]:\n """Converts a set of config values into a dictionary representation,\n in particular converting EnvVar objects into Dagster config inputs\n and processing data structures such as dicts, lists, and structured Config classes.\n """\n assert ConfigTypeKind.is_shape(config_field.config_type.kind)\n\n from dagster._config.pythonic_config import _config_value_to_dict_representation\n\n return check.is_dict(_config_value_to_dict_representation(None, values))\n\n\nclass IntEnvVar(int):\n """Class used to represent an environment variable in the Dagster config system.\n\n The environment variable will be resolved to an int value when the config is\n loaded.\n """\n\n name: str\n\n @classmethod\n def create(cls, name: str) -> "IntEnvVar":\n var = IntEnvVar(0)\n var.name = name\n return var\n\n\nclass EnvVar(str):\n """Class used to represent an environment variable in the Dagster config system.\n\n The environment variable will be resolved to a string value when the config is\n loaded.\n """\n\n @classmethod\n def int(cls, name: str) -> "IntEnvVar":\n return IntEnvVar.create(name=name)\n
", "current_page_name": "_modules/dagster/_config/field_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field_utils"}, "pythonic_config": {"config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.config

\nimport re\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Dict,\n    Mapping,\n    Optional,\n    Set,\n    Type,\n    cast,\n)\n\nfrom pydantic import BaseModel, Extra\nfrom pydantic.fields import (\n    ModelField,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    Field,\n    Field as DagsterField,\n    Shape,\n)\nfrom dagster._config.field_utils import Permissive\nfrom dagster._core.definitions.definition_config_schema import (\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidConfigDefinitionError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidPythonicConfigDefinitionError,\n)\nfrom dagster._utils.cached_method import CACHED_METHOD_FIELD_SUFFIX\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .conversion_utils import _convert_pydantic_field, _is_pydantic_field_required, safe_is_subclass\nfrom .typing_utils import BaseConfigMeta\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nINTERNAL_MARKER = "__internal__"\n\n# ensure that this ends with the internal marker so we can do a single check\nassert CACHED_METHOD_FIELD_SUFFIX.endswith(INTERNAL_MARKER)\n\n\nclass MakeConfigCacheable(BaseModel):\n    """This class centralizes and implements all the chicanery we need in order\n    to support caching decorators. 
If we decide this is a bad idea we can remove it\n    all in one go.\n    """\n\n    # Pydantic config for this class\n    # Cannot use kwargs for base class as this is not support for pydnatic<1.8\n    class Config:\n        # Various pydantic model config (https://docs.pydantic.dev/usage/model_config/)\n        # Necessary to allow for caching decorators\n        arbitrary_types_allowed = True\n        # Avoid pydantic reading a cached property class as part of the schema\n        keep_untouched = (cached_property,)\n        # Ensure the class is serializable, for caching purposes\n        frozen = True\n\n    def __setattr__(self, name: str, value: Any):\n        from .resource import ConfigurableResourceFactory\n\n        # This is a hack to allow us to set attributes on the class that are not part of the\n        # config schema. Pydantic will normally raise an error if you try to set an attribute\n        # that is not part of the schema.\n\n        if self._is_field_internal(name):\n            object.__setattr__(self, name, value)\n            return\n\n        try:\n            return super().__setattr__(name, value)\n        except (TypeError, ValueError) as e:\n            clsname = self.__class__.__name__\n            if "is immutable and does not support item assignment" in str(e):\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support item assignment,"\n                        " as it inherits from 'pydantic.BaseModel' with frozen=True. 
If trying to"\n                        " maintain state on this resource, consider building a separate, stateful"\n                        " client class, and provide a method on the resource to construct and"\n                        " return the stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support item"\n                        " assignment, as it inherits from 'pydantic.BaseModel' with frozen=True."\n                    ) from e\n            elif "object has no field" in str(e):\n                field_name = check.not_none(\n                    re.search(r"object has no field \\"(.*)\\"", str(e))\n                ).group(1)\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\". If trying to maintain"\n                        " state on this resource, consider building a separate, stateful client"\n                        " class, and provide a method on the resource to construct and return the"\n                        " stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\"."\n                    ) from e\n            else:\n                raise\n\n    def _is_field_internal(self, name: str) -> bool:\n        return name.endswith(INTERNAL_MARKER)\n\n\n
[docs]class Config(MakeConfigCacheable, metaclass=BaseConfigMeta):\n """Base class for Dagster configuration models, used to specify config schema for\n ops and assets. Subclasses :py:class:`pydantic.BaseModel`.\n\n Example definition:\n\n .. code-block:: python\n\n from pydantic import Field\n\n class MyAssetConfig(Config):\n my_str: str = "my_default_string"\n my_int_list: List[int]\n my_bool_with_metadata: bool = Field(default=False, description="A bool field")\n\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_with_config(config: MyAssetConfig):\n assert config.my_str == "my_default_string"\n assert config.my_int_list == [1, 2, 3]\n assert config.my_bool_with_metadata == False\n\n asset_with_config(MyAssetConfig(my_int_list=[1, 2, 3], my_bool_with_metadata=True))\n\n """\n\n def __init__(self, **config_dict) -> None:\n """This constructor is overridden to handle any remapping of raw config dicts to\n the appropriate config classes. For example, discriminated unions are represented\n in Dagster config as dicts with a single key, which is the discriminator value.\n """\n modified_data = {}\n for key, value in config_dict.items():\n field = self.__fields__.get(key)\n if field and field.field_info.discriminator:\n nested_dict = value\n\n discriminator_key = check.not_none(field.discriminator_key)\n if isinstance(value, Config):\n nested_dict = _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key,\n value._get_non_none_public_field_values(), # noqa: SLF001\n )\n\n nested_items = list(check.is_dict(nested_dict).items())\n check.invariant(\n len(nested_items) == 1,\n "Discriminated union must have exactly one key",\n )\n discriminated_value, nested_values = nested_items[0]\n\n modified_data[key] = {\n **nested_values,\n discriminator_key: discriminated_value,\n }\n else:\n modified_data[key] = value\n super().__init__(**modified_data)\n\n def _convert_to_config_dictionary(self) -> Mapping[str, Any]:\n """Converts this 
Config object to a Dagster config dictionary, in the same format as the dictionary\n accepted as run config or as YAML in the launchpad.\n\n Inner fields are recursively converted to dictionaries, meaning nested config objects\n or EnvVars will be converted to the appropriate dictionary representation.\n """\n public_fields = self._get_non_none_public_field_values()\n return {\n k: _config_value_to_dict_representation(self.__fields__.get(k), v)\n for k, v in public_fields.items()\n }\n\n def _get_non_none_public_field_values(self) -> Mapping[str, Any]:\n """Returns a dictionary representation of this config object,\n ignoring any private fields, and any optional fields that are None.\n\n Inner fields are returned as-is in the dictionary,\n meaning any nested config objects will be returned as config objects, not dictionaries.\n """\n output = {}\n for key, value in self.__dict__.items():\n if self._is_field_internal(key):\n continue\n field = self.__fields__.get(key)\n if field and value is None and not _is_pydantic_field_required(field):\n continue\n\n if field:\n output[field.alias] = value\n else:\n output[key] = value\n return output\n\n @classmethod\n def to_config_schema(cls) -> DefinitionConfigSchema:\n """Converts the config structure represented by this class into a DefinitionConfigSchema."""\n return DefinitionConfigSchema(infer_schema_from_config_class(cls))\n\n @classmethod\n def to_fields_dict(cls) -> Dict[str, DagsterField]:\n """Converts the config structure represented by this class into a dictionary of dagster.Fields.\n This is useful when interacting with legacy code that expects a dictionary of fields but you\n want the source of truth to be a config class.\n """\n return cast(Shape, cls.to_config_schema().as_field().config_type).fields
\n\n\ndef _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key: str, config_dict: Mapping[str, Any]\n):\n """Remaps a config dictionary which is a member of a discriminated union to\n the appropriate structure for a Dagster config selector.\n\n A discriminated union with key "my_key" and value "my_value" will be represented\n as {"my_key": "my_value", "my_field": "my_field_value"}. When converted to a selector,\n this should be represented as {"my_value": {"my_field": "my_field_value"}}.\n """\n updated_dict = dict(config_dict)\n discriminator_value = updated_dict.pop(discriminator_key)\n wrapped_dict = {discriminator_value: updated_dict}\n return wrapped_dict\n\n\ndef _config_value_to_dict_representation(field: Optional[ModelField], value: Any):\n """Converts a config value to a dictionary representation. If a field is provided, it will be used\n to determine the appropriate dictionary representation in the case of discriminated unions.\n """\n from dagster._config.field_utils import EnvVar, IntEnvVar\n\n if isinstance(value, dict):\n return {k: _config_value_to_dict_representation(None, v) for k, v in value.items()}\n elif isinstance(value, list):\n return [_config_value_to_dict_representation(None, v) for v in value]\n elif isinstance(value, EnvVar):\n return {"env": str(value)}\n elif isinstance(value, IntEnvVar):\n return {"env": value.name}\n if isinstance(value, Config):\n if field and field.discriminator_key:\n return {\n k: v\n for k, v in _discriminated_union_config_dict_to_selector_config_dict(\n field.discriminator_key,\n value._convert_to_config_dictionary(), # noqa: SLF001\n ).items()\n }\n else:\n return {k: v for k, v in value._convert_to_config_dictionary().items()} # noqa: SLF001\n elif isinstance(value, Enum):\n return value.name\n\n return value\n\n\n
[docs]class PermissiveConfig(Config):\n """Subclass of :py:class:`Config` that allows arbitrary extra fields. This is useful for\n config classes which may have open-ended inputs.\n\n Example definition:\n\n .. code-block:: python\n\n class MyPermissiveOpConfig(PermissiveConfig):\n my_explicit_parameter: bool\n my_other_explicit_parameter: str\n\n\n Example usage:\n\n .. code-block:: python\n\n @op\n def op_with_config(config: MyPermissiveOpConfig):\n assert config.my_explicit_parameter == True\n assert config.my_other_explicit_parameter == "foo"\n assert config.dict().get("my_implicit_parameter") == "bar"\n\n op_with_config(\n MyPermissiveOpConfig(\n my_explicit_parameter=True,\n my_other_explicit_parameter="foo",\n my_implicit_parameter="bar"\n )\n )\n\n """\n\n # Pydantic config for this class\n # Cannot use kwargs for base class as this is not support for pydantic<1.8\n class Config:\n extra = "allow"
\n\n\ndef infer_schema_from_config_class(\n model_cls: Type["Config"],\n description: Optional[str] = None,\n fields_to_omit: Optional[Set[str]] = None,\n) -> Field:\n from .config import Config\n from .resource import ConfigurableResourceFactory\n\n """Parses a structured config class and returns a corresponding Dagster config Field."""\n fields_to_omit = fields_to_omit or set()\n\n check.param_invariant(\n safe_is_subclass(model_cls, Config),\n "Config type annotation must inherit from dagster.Config",\n )\n\n fields: Dict[str, Field] = {}\n for pydantic_field in model_cls.__fields__.values():\n if pydantic_field.name not in fields_to_omit:\n if isinstance(pydantic_field.default, Field):\n raise DagsterInvalidDefinitionError(\n "Using 'dagster.Field' is not supported within a Pythonic config or resource"\n " definition. 'dagster.Field' should only be used in legacy Dagster config"\n " schemas. Did you mean to use 'pydantic.Field' instead?"\n )\n\n try:\n fields[pydantic_field.alias] = _convert_pydantic_field(\n pydantic_field,\n )\n except DagsterInvalidConfigDefinitionError as e:\n raise DagsterInvalidPythonicConfigDefinitionError(\n config_class=model_cls,\n field_name=pydantic_field.name,\n invalid_type=e.current_value,\n is_resource=model_cls is not None\n and safe_is_subclass(model_cls, ConfigurableResourceFactory),\n )\n\n shape_cls = Permissive if model_cls.__config__.extra == Extra.allow else Shape\n\n docstring = model_cls.__doc__.strip() if model_cls.__doc__ else None\n\n return Field(config=shape_cls(fields), description=description or docstring)\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.config"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.io_manager

\nfrom abc import abstractmethod\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Mapping,\n    Optional,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeVar\n\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n)\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n)\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .config import Config\nfrom .conversion_utils import TResValue\nfrom .inheritance_utils import safe_is_subclass\nfrom .resource import (\n    AllowDelayedDependencies,\n    ConfigurableResourceFactory,\n    PartialResource,\n    ResourceId,\n    ResourceWithKeyMapping,\n    Self,\n)\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nTIOManagerValue = TypeVar("TIOManagerValue", bound=IOManager)\n\n\nclass ConfigurableIOManagerFactoryResourceDefinition(IOManagerDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        input_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        output_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        dagster_maintained: bool = False,\n    ):\n        input_config_schema_resolved: CoercableToConfigSchema = 
(\n            cast(Type[Config], input_config_schema).to_config_schema()\n            if safe_is_subclass(input_config_schema, Config)\n            else cast(CoercableToConfigSchema, input_config_schema)\n        )\n        output_config_schema_resolved: CoercableToConfigSchema = (\n            cast(Type[Config], output_config_schema).to_config_schema()\n            if safe_is_subclass(output_config_schema, Config)\n            else cast(CoercableToConfigSchema, output_config_schema)\n        )\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n            input_config_schema=input_config_schema_resolved,\n            output_config_schema=output_config_schema_resolved,\n        )\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._configurable_resource_cls = configurable_resource_cls\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n\nclass IOManagerWithKeyMapping(ResourceWithKeyMapping, IOManagerDefinition):\n    """Version of ResourceWithKeyMapping wrapper that also implements IOManagerDefinition."""\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        ResourceWithKeyMapping.__init__(self, resource, resource_id_to_key_mapping)\n        IOManagerDefinition.__init__(\n            self, resource_fn=self.resource_fn, config_schema=resource.config_schema\n        )\n\n\n
[docs]class ConfigurableIOManagerFactory(ConfigurableResourceFactory[TIOManagerValue]):\n """Base class for Dagster IO managers that utilize structured config. This base class\n is useful for cases in which the returned IO manager is not the same as the class itself\n (e.g. when it is a wrapper around the actual IO manager implementation).\n\n This class is a subclass of both :py:class:`IOManagerDefinition` and :py:class:`Config`.\n Implementers should provide an implementation of the :py:meth:`resource_function` method,\n which should return an instance of :py:class:`IOManager`.\n\n\n Example definition:\n\n .. code-block:: python\n\n class ExternalIOManager(IOManager):\n\n def __init__(self, connection):\n self._connection = connection\n\n def handle_output(self, context, obj):\n ...\n\n def load_input(self, context):\n ...\n\n class ConfigurableExternalIOManager(ConfigurableIOManagerFactory):\n username: str\n password: str\n\n def create_io_manager(self, context) -> IOManager:\n with database.connect(username, password) as connection:\n return MyExternalIOManager(connection)\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": ConfigurableExternalIOManager(\n username="dagster",\n password=EnvVar("DB_PASSWORD")\n )\n }\n )\n\n """\n\n def __init__(self, **data: Any):\n ConfigurableResourceFactory.__init__(self, **data)\n\n @abstractmethod\n def create_io_manager(self, context) -> TIOManagerValue:\n """Implement as one would implement a @io_manager decorator function."""\n raise NotImplementedError()\n\n def create_resource(self, context: InitResourceContext) -> TIOManagerValue:\n return self.create_io_manager(context)\n\n @classmethod\n def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialIOManager[Self]":\n """Returns a partially initialized copy of the IO manager, with remaining config fields\n set at runtime.\n """\n return PartialIOManager(cls, data=kwargs)\n\n @cached_method\n def get_resource_definition(self) -> 
ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self._get_initialize_and_run_fn(),\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n input_config_schema=self.__class__.input_config_schema(),\n output_config_schema=self.__class__.output_config_schema(),\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n @classmethod\n def input_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None\n\n @classmethod\n def output_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None
\n\n\nclass PartialIOManager(Generic[TResValue], PartialResource[TResValue]):\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n PartialResource.__init__(self, resource_cls, data)\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n input_config_schema = None\n output_config_schema = None\n if safe_is_subclass(self.resource_cls, ConfigurableIOManagerFactory):\n factory_cls: Type[ConfigurableIOManagerFactory] = cast(\n Type[ConfigurableIOManagerFactory], self.resource_cls\n )\n input_config_schema = factory_cls.input_config_schema()\n output_config_schema = factory_cls.output_config_schema()\n\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self._state__internal__.nested_resources,\n input_config_schema=input_config_schema,\n output_config_schema=output_config_schema,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\n
[docs]class ConfigurableIOManager(ConfigurableIOManagerFactory, IOManager):\n """Base class for Dagster IO managers that utilize structured config.\n\n This class is a subclass of both :py:class:`IOManagerDefinition`, :py:class:`Config`,\n and :py:class:`IOManager`. Implementers must provide an implementation of the\n :py:meth:`handle_output` and :py:meth:`load_input` methods.\n\n Example definition:\n\n .. code-block:: python\n\n class MyIOManager(ConfigurableIOManager):\n path_prefix: List[str]\n\n def _get_path(self, context) -> str:\n return "/".join(context.asset_key.path)\n\n def handle_output(self, context, obj):\n write_csv(self._get_path(context), obj)\n\n def load_input(self, context):\n return read_csv(self._get_path(context))\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": MyIOManager(path_prefix=["my", "prefix"])\n }\n )\n\n """\n\n def create_io_manager(self, context) -> IOManager:\n return self
\n\n\nclass ConfigurableLegacyIOManagerAdapter(ConfigurableIOManagerFactory):\n """Adapter base class for wrapping a decorated, function-style I/O manager\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_io_manager`` method.\n\n Example:\n .. code-block:: python\n\n class OldIOManager(IOManager):\n def __init__(self, base_path: str):\n ...\n\n @io_manager(config_schema={"base_path": str})\n def old_io_manager(context):\n base_path = context.resource_config["base_path"]\n\n return OldIOManager(base_path)\n\n class MyIOManager(ConfigurableLegacyIOManagerAdapter):\n base_path: str\n\n @property\n def wrapped_io_manager(self) -> IOManagerDefinition:\n return old_io_manager\n """\n\n @property\n @abstractmethod\n def wrapped_io_manager(self) -> IOManagerDefinition:\n raise NotImplementedError()\n\n def create_io_manager(self, context) -> IOManager:\n raise NotImplementedError(\n "Because we override resource_fn in the adapter, this is never called."\n )\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_io_manager.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.resource

\nimport contextlib\nimport inspect\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generator,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeGuard, get_args, get_origin\n\nfrom dagster import (\n    Field as DagsterField,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.field_utils import config_dictionary_from_values\nfrom dagster._config.pythonic_config.typing_utils import (\n    TypecheckAllowPartialResourceInitParams,\n)\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.definition_config_schema import (\n    ConfiguredDefinitionConfigSchema,\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.context.init import InitResourceContext, build_init_resource_context\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nfrom abc import ABC, abstractmethod\n\nfrom pydantic import BaseModel\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n    has_at_least_one_parameter,\n)\nfrom dagster._core.storage.io_manager import IOManagerDefinition\n\nfrom .config import Config, MakeConfigCacheable, infer_schema_from_config_class\nfrom .conversion_utils import (\n    TResValue,\n    _curry_config_schema,\n)\nfrom .typing_utils import BaseResourceMeta, 
LateBoundTypesForResourceTypeChecking\n\nSelf = TypeVar("Self", bound="ConfigurableResourceFactory")\nResourceId: TypeAlias = int\n\n\nclass AllowDelayedDependencies:\n    _nested_partial_resources: Mapping[str, ResourceDefinition] = {}\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        # All dependent resources which are not fully configured\n        # must be specified to the Definitions object so that the\n        # resource can be configured at runtime by the user\n        nested_partial_resource_keys = {\n            attr_name: resource_mapping.get(id(resource_def))\n            for attr_name, resource_def in self._nested_partial_resources.items()\n        }\n        check.invariant(\n            all(pointer_key is not None for pointer_key in nested_partial_resource_keys.values()),\n            "Any partially configured, nested resources must be provided to Definitions"\n            f" object: {nested_partial_resource_keys}",\n        )\n\n        # Recursively get all nested resource keys\n        nested_resource_required_keys: Set[str] = set()\n        for v in self._nested_partial_resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(v, resource_mapping)\n            )\n\n        resources, _ = separate_resource_params(\n            cast(Type[BaseModel], self.__class__), self.__dict__\n        )\n        for v in resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(\n                    wrap_resource_for_execution(v), resource_mapping\n                )\n            )\n\n        out = set(cast(Set[str], nested_partial_resource_keys.values())).union(\n            nested_resource_required_keys\n        )\n        return out\n\n\nclass 
InitResourceContextWithKeyMapping(InitResourceContext):\n    """Passes along a mapping from ResourceDefinition id to resource key alongside the\n    InitResourceContext. This is used to resolve the required resource keys for\n    resources which may hold nested partial resources.\n    """\n\n    def __init__(\n        self,\n        context: InitResourceContext,\n        resource_id_to_key_mapping: Mapping[ResourceId, str],\n    ):\n        super().__init__(\n            resource_config=context.resource_config,\n            resources=context.resources,\n            instance=context.instance,\n            resource_def=context.resource_def,\n            dagster_run=context.dagster_run,\n            log_manager=context.log,\n        )\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n        self._resources_by_id = {\n            resource_id: getattr(context.resources, resource_key, None)\n            for resource_id, resource_key in resource_id_to_key_mapping.items()\n        }\n\n    @property\n    def resources_by_id(self) -> Mapping[ResourceId, Any]:\n        return self._resources_by_id\n\n    def replace_config(self, config: Any) -> "InitResourceContext":\n        return InitResourceContextWithKeyMapping(\n            super().replace_config(config), self._resource_id_to_key_mapping\n        )\n\n\nclass ResourceWithKeyMapping(ResourceDefinition):\n    """Wrapper around a ResourceDefinition which helps the inner resource resolve its required\n    resource keys. This is useful for resources which may hold nested resources. 
At construction\n    time, they are unaware of the resource keys of their nested resources - the resource id to\n    key mapping is used to resolve this.\n    """\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        self._resource = resource\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n\n        ResourceDefinition.__init__(\n            self,\n            resource_fn=self.setup_context_resources_and_call,\n            config_schema=resource.config_schema,\n            description=resource.description,\n            version=resource.version,\n        )\n\n    def setup_context_resources_and_call(self, context: InitResourceContext):\n        """Wrapper around the wrapped resource's resource_fn which attaches its\n        resource id to key mapping to the context, and then calls the nested resource's resource_fn.\n        """\n        context_with_key_mapping = InitResourceContextWithKeyMapping(\n            context, self._resource_id_to_key_mapping\n        )\n\n        if has_at_least_one_parameter(self._resource.resource_fn):\n            return self._resource.resource_fn(context_with_key_mapping)\n        else:\n            return cast(ResourceFunctionWithoutContext, self._resource.resource_fn)()\n\n    @property\n    def required_resource_keys(self) -> AbstractSet[str]:\n        return _resolve_required_resource_keys_for_resource(\n            self._resource, self._resource_id_to_key_mapping\n        )\n\n    @property\n    def wrapped_resource(self) -> ResourceDefinition:\n        return self._resource\n\n    @property\n    def inner_resource(self):\n        return self._resource\n\n\ndef attach_resource_id_to_key_mapping(\n    resource_def: Any, resource_id_to_key_mapping: Dict[ResourceId, str]\n) -> Any:\n    from .io_manager import IOManagerWithKeyMapping\n\n    if isinstance(resource_def, (ConfigurableResourceFactory, 
PartialResource)):\n        defn = resource_def.get_resource_definition()\n        return (\n            IOManagerWithKeyMapping(defn, resource_id_to_key_mapping)\n            if isinstance(defn, IOManagerDefinition)\n            else ResourceWithKeyMapping(defn, resource_id_to_key_mapping)\n        )\n    return resource_def\n\n\nCoercibleToResource: TypeAlias = Union[\n    ResourceDefinition, "ConfigurableResourceFactory", "PartialResource"\n]\n\n\ndef is_coercible_to_resource(val: Any) -> TypeGuard[CoercibleToResource]:\n    return isinstance(val, (ResourceDefinition, ConfigurableResourceFactory, PartialResource))\n\n\nclass ConfigurableResourceFactoryResourceDefinition(ResourceDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        dagster_maintained: bool = False,\n    ):\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n        )\n        self._configurable_resource_cls = configurable_resource_cls\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n    def _is_dagster_maintained(self) -> bool:\n        return 
self._dagster_maintained\n\n\nclass ConfigurableResourceFactoryState(NamedTuple):\n    nested_partial_resources: Mapping[str, Any]\n    resolved_config_dict: Dict[str, Any]\n    config_schema: DefinitionConfigSchema\n    schema: DagsterField\n    nested_resources: Dict[str, Any]\n    resource_context: Optional[InitResourceContext]\n\n\nclass ConfigurableResourceFactory(\n    Generic[TResValue],\n    Config,\n    TypecheckAllowPartialResourceInitParams,\n    AllowDelayedDependencies,\n    ABC,\n    metaclass=BaseResourceMeta,\n):\n    """Base class for creating and managing the lifecycle of Dagster resources that utilize structured config.\n\n    Users should directly inherit from this class when they want the object passed to user-defined\n    code (such as an asset or op) to be different than the object that defines the configuration\n    schema and is passed to the :py:class:`Definitions` object. Cases where this is useful include is\n    when the object passed to user code is:\n\n    * An existing class from a third-party library that the user does not control.\n    * A complex class that requires substantial internal state management or itself requires arguments beyond its config values.\n    * A class with expensive initialization that should not be invoked on code location load, but rather lazily on first use in an op or asset during a run.\n    * A class that you desire to be a plain Python class, rather than a Pydantic class, for whatever reason.\n\n    This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`, and\n    must implement ``create_resource``, which creates the resource to pass to user code.\n\n    Example definition:\n\n    .. 
code-block:: python\n\n        class DatabaseResource(ConfigurableResourceFactory[Database]):\n            connection_uri: str\n\n            def create_resource(self, _init_context) -> Database:\n                # For example Database could be from a third-party library or require expensive setup.\n                # Or you could just prefer to separate the concerns of configuration and runtime representation\n                return Database(self.connection_uri)\n\n    To use a resource created by a factory in a job, you must use the Resource type annotation.\n\n    Example usage:\n\n\n    .. code-block:: python\n\n        @asset\n        def asset_that_uses_database(database: ResourceParam[Database]):\n            # Database used directly in user code\n            database.query("SELECT * FROM table")\n\n        defs = Definitions(\n            assets=[asset_that_uses_database],\n            resources={"database": DatabaseResource(connection_uri="some_uri")},\n        )\n\n    """\n\n    def __init__(self, **data: Any):\n        resource_pointers, data_without_resources = separate_resource_params(self.__class__, data)\n\n        schema = infer_schema_from_config_class(\n            self.__class__, fields_to_omit=set(resource_pointers.keys())\n        )\n\n        # Populate config values\n        Config.__init__(self, **{**data_without_resources, **resource_pointers})\n\n        # We pull the values from the Pydantic config object, which may cast values\n        # to the correct type under the hood - useful in particular for enums\n        casted_data_without_resources = {\n            k: v\n            for k, v in self._convert_to_config_dictionary().items()\n            if k in data_without_resources\n        }\n        resolved_config_dict = config_dictionary_from_values(casted_data_without_resources, schema)\n\n        self._state__internal__ = ConfigurableResourceFactoryState(\n            # We keep track of any resources we depend on which are not fully 
configured\n            # so that we can retrieve them at runtime\n            nested_partial_resources={\n                k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n            },\n            resolved_config_dict=resolved_config_dict,\n            # These are unfortunately named very similarily\n            config_schema=_curry_config_schema(schema, resolved_config_dict),\n            schema=schema,\n            nested_resources={k: v for k, v in resource_pointers.items()},\n            resource_context=None,\n        )\n\n    @property\n    def _schema(self):\n        return self._state__internal__.schema\n\n    @property\n    def _config_schema(self):\n        return self._state__internal__.config_schema\n\n    @property\n    def _nested_partial_resources(self):\n        return self._state__internal__.nested_partial_resources\n\n    @property\n    def _nested_resources(self):\n        return self._state__internal__.nested_resources\n\n    @property\n    def _resolved_config_dict(self):\n        return self._state__internal__.resolved_config_dict\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        """This should be overridden to return True by all dagster maintained resources and IO managers."""\n        return False\n\n    @classmethod\n    def _is_cm_resource_cls(cls: Type["ConfigurableResourceFactory"]) -> bool:\n        return (\n            cls.yield_for_execution != ConfigurableResourceFactory.yield_for_execution\n            or cls.teardown_after_execution != ConfigurableResourceFactory.teardown_after_execution\n        )\n\n    @property\n    def _is_cm_resource(self) -> bool:\n        return self.__class__._is_cm_resource_cls()  # noqa: SLF001\n\n    def _get_initialize_and_run_fn(self) -> Callable:\n        return self._initialize_and_run_cm if self._is_cm_resource else self._initialize_and_run\n\n    @cached_method\n    def get_resource_definition(self) -> 
ConfigurableResourceFactoryResourceDefinition:\n        return ConfigurableResourceFactoryResourceDefinition(\n            self.__class__,\n            resource_fn=self._get_initialize_and_run_fn(),\n            config_schema=self._config_schema,\n            description=self.__doc__,\n            resolve_resource_keys=self._resolve_required_resource_keys,\n            nested_resources=self.nested_resources,\n            dagster_maintained=self._is_dagster_maintained(),\n        )\n\n    @abstractmethod\n    def create_resource(self, context: InitResourceContext) -> TResValue:\n        """Returns the object that this resource hands to user code, accessible by ops or assets\n        through the context or resource parameters. This works like the function decorated\n        with @resource when using function-based resources.\n        """\n        raise NotImplementedError()\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    @classmethod\n    def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialResource[Self]":\n        """Returns a partially initialized copy of the resource, with remaining config fields\n        set at runtime.\n        """\n        return PartialResource(cls, data=kwargs)\n\n    def _with_updated_values(\n        self, values: Optional[Mapping[str, Any]]\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given values.\n        Used when initializing a resource at runtime.\n        """\n        values = check.opt_mapping_param(values, "values", key_type=str)\n        # Since Resource extends BaseModel and is a dataclass, we know that the\n        # signature of any __init__ method will always consist of the fields\n        # of this class. 
We can therefore safely pass in the values as kwargs.\n        out = self.__class__(**{**self._get_non_none_public_field_values(), **values})\n        out._state__internal__ = out._state__internal__._replace(  # noqa: SLF001\n            resource_context=self._state__internal__.resource_context\n        )\n        return out\n\n    @contextlib.contextmanager\n    def _resolve_and_update_nested_resources(\n        self, context: InitResourceContext\n    ) -> Generator["ConfigurableResourceFactory[TResValue]", None, None]:\n        """Updates any nested resources with the resource values from the context.\n        In this case, populating partially configured resources or\n        resources that return plain Python types.\n\n        Returns a new instance of the resource.\n        """\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        partial_resources_to_update: Dict[str, Any] = {}\n        if self._nested_partial_resources:\n            context_with_mapping = cast(\n                InitResourceContextWithKeyMapping,\n                check.inst(\n                    context,\n                    InitResourceContextWithKeyMapping,\n                    "This ConfiguredResource contains unresolved partially-specified nested"\n                    " resources, and so can only be initialized using a"\n                    " InitResourceContextWithKeyMapping",\n                ),\n            )\n            partial_resources_to_update = {\n                attr_name: context_with_mapping.resources_by_id[id(resource)]\n                for attr_name, resource in self._nested_partial_resources.items()\n            }\n\n        # Also evaluate any resources that are not partial\n        with contextlib.ExitStack() as stack:\n            resources_to_update, _ = separate_resource_params(self.__class__, self.__dict__)\n            resources_to_update = {\n                attr_name: _call_resource_fn_with_default(\n                   
 stack, wrap_resource_for_execution(resource), context\n                )\n                for attr_name, resource in resources_to_update.items()\n                if attr_name not in partial_resources_to_update\n            }\n\n            to_update = {**resources_to_update, **partial_resources_to_update}\n            yield self._with_updated_values(to_update)\n\n    @deprecated(\n        breaking_version="2.0", additional_warn_text="Use `with_replaced_resource_context` instead"\n    )\n    def with_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        return self.with_replaced_resource_context(resource_context)\n\n    def with_replaced_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given resource init context bound."""\n        # This utility is used to create a copy of this resource, without adjusting\n        # any values in this case\n        copy = self._with_updated_values({})\n        copy._state__internal__ = copy._state__internal__._replace(  # noqa: SLF001\n            resource_context=resource_context\n        )\n        return copy\n\n    def _initialize_and_run(self, context: InitResourceContext) -> TResValue:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n            updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            updated_resource.setup_for_execution(context)\n            return updated_resource.create_resource(context)\n\n    @contextlib.contextmanager\n    def _initialize_and_run_cm(\n        self, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n 
           updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            with updated_resource.yield_for_execution(context) as value:\n                yield value\n\n    def setup_for_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any pre-execution steps\n        needed before the resource is used in execution.\n        """\n        pass\n\n    def teardown_after_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any post-execution steps\n        needed after the resource is used in execution.\n\n        teardown_after_execution will be called even if any part of the run fails.\n        It will not be called if setup_for_execution fails.\n        """\n        pass\n\n    @contextlib.contextmanager\n    def yield_for_execution(self, context: InitResourceContext) -> Generator[TResValue, None, None]:\n        """Optionally override this method to perform any lifecycle steps\n        before or after the resource is used in execution. 
By default, calls\n        setup_for_execution before yielding, and teardown_after_execution after yielding.\n\n        Note that if you override this method and want setup_for_execution or\n        teardown_after_execution to be called, you must invoke them yourself.\n        """\n        self.setup_for_execution(context)\n        try:\n            yield self.create_resource(context)\n        finally:\n            self.teardown_after_execution(context)\n\n    def get_resource_context(self) -> InitResourceContext:\n        """Returns the context that this resource was initialized with."""\n        return check.not_none(\n            self._state__internal__.resource_context,\n            additional_message="Attempted to get context before resource was initialized.",\n        )\n\n    def process_config_and_initialize(self) -> TResValue:\n        """Initializes this resource, fully processing its config and returning the prepared\n        resource value.\n        """\n        from dagster._config.post_process import post_process_config\n\n        return self.from_resource_context(\n            build_init_resource_context(\n                config=post_process_config(\n                    self._config_schema.config_type, self._convert_to_config_dictionary()\n                ).value\n            )\n        )\n\n    @classmethod\n    def from_resource_context(cls, context: InitResourceContext) -> TResValue:\n        """Creates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes.\n\n        For resources that have custom teardown behavior, use from_resource_context_cm instead.\n\n        Example usage:\n\n        .. 
code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> MyResource:\n                return MyResource.from_resource_context(context)\n\n        """\n        check.invariant(\n            not cls._is_cm_resource_cls(),\n            "Use from_resource_context_cm for resources which have custom teardown behavior,"\n            " e.g. overriding yield_for_execution or teardown_after_execution",\n        )\n        return cls(**context.resource_config or {})._initialize_and_run(context)  # noqa: SLF001\n\n    @classmethod\n    @contextlib.contextmanager\n    def from_resource_context_cm(\n        cls, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        """Context which generates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes. Handles custom teardown behavior.\n\n        Example usage:\n\n        .. code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> Generator[MyResource, None, None]:\n                with MyResource.from_resource_context_cm(context) as my_resource:\n                    yield my_resource\n\n        """\n        with cls(**context.resource_config or {})._initialize_and_run_cm(  # noqa: SLF001\n            context\n        ) as value:\n            yield value\n\n\n
[docs]class ConfigurableResource(ConfigurableResourceFactory[TResValue]):\n """Base class for Dagster resources that utilize structured config.\n\n This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`.\n\n Example definition:\n\n .. code-block:: python\n\n class WriterResource(ConfigurableResource):\n prefix: str\n\n def output(self, text: str) -> None:\n print(f"{self.prefix}{text}")\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_that_uses_writer(writer: WriterResource):\n writer.output("text")\n\n defs = Definitions(\n assets=[asset_that_uses_writer],\n resources={"writer": WriterResource(prefix="a_prefix")},\n )\n\n """\n\n def create_resource(self, context: InitResourceContext) -> TResValue:\n """Returns the object that this resource hands to user code, accessible by ops or assets\n through the context or resource parameters. This works like the function decorated\n with @resource when using function-based resources.\n\n For ConfigurableResource, this function will return itself, passing\n the actual ConfigurableResource object to user code.\n """\n return cast(TResValue, self)
\n\n\ndef _is_fully_configured(resource: CoercibleToResource) -> bool:\n from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n actual_resource = wrap_resource_for_execution(resource)\n res = (\n validate_config(\n actual_resource.config_schema.config_type,\n (\n actual_resource.config_schema.default_value\n if actual_resource.config_schema.default_provided\n else {}\n ),\n ).success\n is True\n )\n\n return res\n\n\nclass PartialResourceState(NamedTuple):\n nested_partial_resources: Dict[str, Any]\n config_schema: DagsterField\n resource_fn: Callable[[InitResourceContext], Any]\n description: Optional[str]\n nested_resources: Dict[str, Any]\n\n\nclass PartialResource(Generic[TResValue], AllowDelayedDependencies, MakeConfigCacheable):\n data: Dict[str, Any]\n resource_cls: Type[ConfigurableResourceFactory[TResValue]]\n\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n resource_pointers, _data_without_resources = separate_resource_params(resource_cls, data)\n\n MakeConfigCacheable.__init__(self, data=data, resource_cls=resource_cls) # type: ignore # extends BaseModel, takes kwargs\n\n def resource_fn(context: InitResourceContext):\n instantiated = resource_cls(\n **{**data, **context.resource_config}\n ) # So that collisions are resolved in favor of the latest provided run config\n return instantiated._get_initialize_and_run_fn()(context) # noqa: SLF001\n\n self._state__internal__ = PartialResourceState(\n # We keep track of any resources we depend on which are not fully configured\n # so that we can retrieve them at runtime\n nested_partial_resources={\n k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n },\n config_schema=infer_schema_from_config_class(\n resource_cls, fields_to_omit=set(resource_pointers.keys())\n ),\n resource_fn=resource_fn,\n description=resource_cls.__doc__,\n nested_resources={k: v for k, v in 
resource_pointers.items()},\n )\n\n # to make AllowDelayedDependencies work\n @property\n def _nested_partial_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_partial_resources\n\n @property\n def nested_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_resources\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\nResourceOrPartial: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue], PartialResource[TResValue]\n]\nResourceOrPartialOrValue: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue],\n PartialResource[TResValue],\n ResourceDefinition,\n TResValue,\n]\n\n\nV = TypeVar("V")\n\n\nclass ResourceDependency(Generic[V]):\n def __set_name__(self, _owner, name):\n self._name = name\n\n def __get__(self, obj: "ConfigurableResourceFactory", __owner: Any) -> V:\n return getattr(obj, self._name)\n\n def __set__(self, obj: Optional[object], value: ResourceOrPartialOrValue[V]) -> None:\n setattr(obj, self._name, value)\n\n\nclass ConfigurableLegacyResourceAdapter(ConfigurableResource, ABC):\n """Adapter base class for wrapping a decorated, function-style resource\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_resource`` method.\n\n Example:\n .. 
code-block:: python\n\n @resource(config_schema={"prefix": str})\n def writer_resource(context):\n prefix = context.resource_config["prefix"]\n\n def output(text: str) -> None:\n out_txt.append(f"{prefix}{text}")\n\n return output\n\n class WriterResource(ConfigurableLegacyResourceAdapter):\n prefix: str\n\n @property\n def wrapped_resource(self) -> ResourceDefinition:\n return writer_resource\n """\n\n @property\n @abstractmethod\n def wrapped_resource(self) -> ResourceDefinition:\n raise NotImplementedError()\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_resource.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n def __call__(self, *args, **kwargs):\n return self.wrapped_resource(*args, **kwargs)\n\n\nclass SeparatedResourceParams(NamedTuple):\n resources: Dict[str, Any]\n non_resources: Dict[str, Any]\n\n\ndef _is_annotated_as_resource_type(annotation: Type) -> bool:\n """Determines if a field in a structured config class is annotated as a resource type or not."""\n from .inheritance_utils import safe_is_subclass\n\n is_annotated_as_resource_dependency = get_origin(annotation) == ResourceDependency or getattr(\n annotation, "__metadata__", None\n ) == ("resource_dependency",)\n\n return is_annotated_as_resource_dependency or safe_is_subclass(\n annotation, (ResourceDefinition, ConfigurableResourceFactory)\n )\n\n\ndef separate_resource_params(cls: Type[BaseModel], data: Dict[str, Any]) -> SeparatedResourceParams:\n """Separates out the key/value inputs of fields in a structured config Resource class which\n are marked as resources (ie, using ResourceDependency) from those which are not.\n """\n keys_by_alias = 
{field.alias: field for field in cls.__fields__.values()}\n data_with_annotation: List[Tuple[str, Any, Type]] = [\n # No longer exists in Pydantic 2.x, will need to be updated when we upgrade\n (k, v, keys_by_alias[k].outer_type_)\n for k, v in data.items()\n if k in keys_by_alias\n ]\n out = SeparatedResourceParams(\n resources={k: v for k, v, t in data_with_annotation if _is_annotated_as_resource_type(t)},\n non_resources={\n k: v for k, v, t in data_with_annotation if not _is_annotated_as_resource_type(t)\n },\n )\n return out\n\n\ndef _call_resource_fn_with_default(\n stack: contextlib.ExitStack, obj: ResourceDefinition, context: InitResourceContext\n) -> Any:\n from dagster._config.validate import process_config\n\n if isinstance(obj.config_schema, ConfiguredDefinitionConfigSchema):\n value = cast(Dict[str, Any], obj.config_schema.resolve_config({}).value)\n context = context.replace_config(value["config"])\n elif obj.config_schema.default_provided:\n # To explain why we need to process config here;\n # - The resource available on the init context (context.resource_config) has already been processed\n # - The nested resource's config has also already been processed, but is only available in the broader run config dictionary.\n # - The only information we have access to here is the unprocessed default value, so we need to process it a second time.\n unprocessed_config = obj.config_schema.default_value\n evr = process_config(\n {"config": obj.config_schema.config_type}, {"config": unprocessed_config}\n )\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Error in config for nested resource ",\n evr.errors,\n unprocessed_config,\n )\n context = context.replace_config(cast(dict, evr.value)["config"])\n\n if has_at_least_one_parameter(obj.resource_fn):\n result = cast(ResourceFunctionWithContext, obj.resource_fn)(context)\n else:\n result = cast(ResourceFunctionWithoutContext, obj.resource_fn)()\n\n is_fn_generator = inspect.isgenerator(obj.resource_fn) or 
isinstance(\n obj.resource_fn, contextlib.ContextDecorator\n )\n if is_fn_generator:\n return stack.enter_context(cast(contextlib.AbstractContextManager, result))\n else:\n return result\n\n\nLateBoundTypesForResourceTypeChecking.set_actual_types_for_type_checking(\n resource_dep_type=ResourceDependency,\n resource_type=ConfigurableResourceFactory,\n partial_resource_type=PartialResource,\n)\n\n\ndef validate_resource_annotated_function(fn) -> None:\n """Validates any parameters on the decorated function that are annotated with\n :py:class:`dagster.ResourceDefinition`, raising a :py:class:`dagster.DagsterInvalidDefinitionError`\n if any are not also instances of :py:class:`dagster.ConfigurableResource` (these resources should\n instead be wrapped in the :py:func:`dagster.Resource` Annotation).\n """\n from dagster import DagsterInvalidDefinitionError\n from dagster._config.pythonic_config.resource import (\n ConfigurableResource,\n ConfigurableResourceFactory,\n TResValue,\n )\n\n from .inheritance_utils import safe_is_subclass\n\n malformed_params = [\n param\n for param in get_function_params(fn)\n if safe_is_subclass(param.annotation, (ResourceDefinition, ConfigurableResourceFactory))\n and not safe_is_subclass(param.annotation, ConfigurableResource)\n ]\n if len(malformed_params) > 0:\n malformed_param = malformed_params[0]\n output_type = None\n if safe_is_subclass(malformed_param.annotation, ConfigurableResourceFactory):\n orig_bases = getattr(malformed_param.annotation, "__orig_bases__", None)\n output_type = get_args(orig_bases[0])[0] if orig_bases and len(orig_bases) > 0 else None\n if output_type == TResValue:\n output_type = None\n\n output_type_name = getattr(output_type, "__name__", str(output_type))\n raise DagsterInvalidDefinitionError(\n """Resource param '{param_name}' is annotated as '{annotation_type}', but '{annotation_type}' outputs {value_message} value to user code such as @ops and @assets. 
This annotation should instead be {annotation_suggestion}""".format(\n param_name=malformed_param.name,\n annotation_type=malformed_param.annotation,\n value_message=f"a '{output_type}'" if output_type else "an unknown",\n annotation_suggestion=(\n f"'ResourceParam[{output_type_name}]'"\n if output_type\n else "'ResourceParam[Any]' or 'ResourceParam[<output type>]'"\n ),\n )\n )\n\n\ndef _resolve_required_resource_keys_for_resource(\n resource: ResourceDefinition, resource_id_to_key_mapping: Mapping[ResourceId, str]\n) -> AbstractSet[str]:\n """Gets the required resource keys for the provided resource, with the assistance of the passed\n resource-id-to-key mapping. For resources which may hold nested partial resources,\n this mapping is used to obtain the top-level resource keys to depend on.\n """\n if isinstance(resource, AllowDelayedDependencies):\n return resource._resolve_required_resource_keys(resource_id_to_key_mapping) # noqa: SLF001\n return resource.required_resource_keys\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.resource"}}, "source": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.source

\nimport os\n\nimport dagster._check as check\n\nfrom .config_type import ScalarUnion\nfrom .errors import PostProcessingError\nfrom .field_utils import Selector\n\nVALID_STRING_SOURCE_TYPES = (str, dict)\n\n\ndef _ensure_env_variable(var):\n    check.str_param(var, "var")\n    value = os.getenv(var)\n    if value is None:\n        raise PostProcessingError(\n            f'You have attempted to fetch the environment variable "{var}" '\n            "which is not set. In order for this execution to succeed it "\n            "must be set in this environment."\n        )\n    return value\n\n\nclass StringSourceType(ScalarUnion):\n    def __init__(self):\n        super(StringSourceType, self).__init__(\n            scalar_type=str,\n            non_scalar_schema=Selector({"env": str}),\n            _key="StringSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, VALID_STRING_SOURCE_TYPES), "value")\n\n        if not isinstance(value, dict):\n            return value\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        return str(_ensure_env_variable(cfg))\n\n\nclass IntSourceType(ScalarUnion):\n    def __init__(self):\n        super(IntSourceType, self).__init__(\n            scalar_type=int,\n            non_scalar_schema=Selector({"env": str}),\n            _key="IntSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, int)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return int(value)\n        except ValueError as e:\n            raise PostProcessingError(\n       
         f'Value "{value}" stored in env variable "{cfg}" cannot be coerced into an int.'\n            ) from e\n\n\nclass BoolSourceType(ScalarUnion):\n    def __init__(self):\n        super(BoolSourceType, self).__init__(\n            scalar_type=bool,\n            non_scalar_schema=Selector({"env": str}),\n            _key="BoolSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, bool)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return bool(value)\n        except ValueError as e:\n            raise PostProcessingError(\n                (\n                    'Value "{value}" stored in env variable "{var}" cannot be coerced into an bool.'\n                ).format(value=value, var=cfg)\n            ) from e\n\n\nStringSource: StringSourceType = StringSourceType()\nIntSource: IntSourceType = IntSourceType()\nBoolSource: BoolSourceType = BoolSourceType()\n
", "current_page_name": "_modules/dagster/_config/source", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.source"}}, "_core": {"definitions": {"asset_check_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_result

\nfrom typing import TYPE_CHECKING, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationTargetMaterializationData,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSeverity\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.compute import StepExecutionContext\n\n\n
[docs]@experimental\nclass AssetCheckResult(\n NamedTuple(\n "_AssetCheckResult",\n [\n ("success", PublicAttr[bool]),\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("check_name", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("severity", PublicAttr[AssetCheckSeverity]),\n ],\n )\n):\n """The result of an asset check.\n\n Attributes:\n asset_key (Optional[AssetKey]):\n The asset key that was checked.\n check_name (Optional[str]):\n The name of the check.\n success (bool):\n The pass/fail result of the check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n severity (AssetCheckSeverity):\n Severity of the check. Defaults to ERROR.\n\n """\n\n def __new__(\n cls,\n *,\n success: bool,\n asset_key: Optional[CoercibleToAssetKey] = None,\n check_name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n severity: AssetCheckSeverity = AssetCheckSeverity.ERROR,\n ):\n normalized_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n return super().__new__(\n cls,\n asset_key=AssetKey.from_coercible(asset_key) if asset_key is not None else None,\n check_name=check.opt_str_param(check_name, "check_name"),\n success=check.bool_param(success, "success"),\n metadata=normalized_metadata,\n severity=check.inst_param(severity, "severity", AssetCheckSeverity),\n )\n\n def to_asset_check_evaluation(\n self, step_context: "StepExecutionContext"\n ) -> AssetCheckEvaluation:\n spec_check_names_by_asset_key = (\n step_context.job_def.asset_layer.get_check_names_by_asset_key_for_node_handle(\n step_context.node_handle.root\n )\n )\n\n asset_keys_with_specs = spec_check_names_by_asset_key.keys()\n\n if 
self.asset_key is not None:\n if self.asset_key not in asset_keys_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. It targets asset"\n f" '{self.asset_key.to_user_string()}' which is not targeted by any of the"\n " checks currently being evaluated. Targeted assets:"\n f" {[asset_key.to_user_string() for asset_key in asset_keys_with_specs]}."\n )\n\n resolved_asset_key = self.asset_key\n\n else:\n if len(spec_check_names_by_asset_key) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult didn't specify an asset key, but there are multiple assets"\n " to choose from:"\n f" {[asset_key.to_user_string() for asset_key in spec_check_names_by_asset_key.keys()]}"\n )\n\n resolved_asset_key = next(iter(asset_keys_with_specs))\n\n check_names_with_specs = spec_check_names_by_asset_key[resolved_asset_key]\n if self.check_name is not None:\n if self.check_name not in check_names_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. No checks currently being evaluated"\n f" target asset '{resolved_asset_key.to_user_string()}' and have name"\n f" '{self.check_name}'. 
Checks being evaluated for this asset:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = self.check_name\n else:\n if len(check_names_with_specs) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult result didn't specify a check name, but there are multiple"\n " checks to choose from for the this asset key:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = next(iter(check_names_with_specs))\n\n input_asset_info = step_context.get_input_asset_version_info(resolved_asset_key)\n if input_asset_info is not None:\n target_materialization_data = AssetCheckEvaluationTargetMaterializationData(\n run_id=input_asset_info.run_id,\n storage_id=input_asset_info.storage_id,\n timestamp=input_asset_info.timestamp,\n )\n else:\n target_materialization_data = None\n\n return AssetCheckEvaluation(\n check_name=resolved_check_name,\n asset_key=resolved_asset_key,\n success=self.success,\n metadata=self.metadata,\n target_materialization_data=target_materialization_data,\n severity=self.severity,\n )\n\n def get_spec_python_identifier(\n self, *, asset_key: Optional[AssetKey] = None, check_name: Optional[str] = None\n ) -> str:\n """Returns a string uniquely identifying the asset check spec associated with this result.\n This is used for the output name associated with an `AssetCheckResult`.\n """\n asset_key = asset_key or self.asset_key\n check_name = check_name or self.check_name\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n return f"{asset_key.to_python_identifier()}_{self.check_name}"
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_result"}, "asset_check_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._serdes.serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass AssetCheckSeverity(Enum):\n """Severity level for an asset check.\n\n Severities:\n\n - WARN: If the check fails, don't fail the step.\n - ERROR: If the check fails, fail the step and, within the run, skip materialization of any\n assets that are downstream of the asset being checked.\n """\n\n WARN = "WARN"\n ERROR = "ERROR"
\n\n\n
[docs]@experimental\n@whitelist_for_serdes(old_storage_names={"AssetCheckHandle"})\nclass AssetCheckKey(NamedTuple):\n """Check names are expected to be unique per-asset. Thus, this combination of asset key and\n check name uniquely identifies an asset check within a deployment.\n """\n\n asset_key: PublicAttr[AssetKey]\n name: PublicAttr[str]\n\n @staticmethod\n def from_graphql_input(graphql_input: Mapping[str, Any]) -> "AssetCheckKey":\n return AssetCheckKey(\n asset_key=AssetKey.from_graphql_input(graphql_input["assetKey"]),\n name=graphql_input["name"],\n )
\n\n\n
[docs]@experimental\nclass AssetCheckSpec(\n NamedTuple(\n "_AssetCheckSpec",\n [\n ("name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines information about an check, except how to execute it.\n\n AssetCheckSpec is often used as an argument to decorators that decorator a function that can\n execute multiple checks - e.g. `@asset`, and `@multi_asset`. It defines one of the checks that\n will be executed inside that function.\n\n Args:\n name (str): Name of the check.\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The asset that\n the check applies to.\n description (Optional[str]): Description for the check.\n """\n\n def __new__(\n cls,\n name: str,\n *,\n asset: Union[CoercibleToAssetKey, "AssetsDefinition", "SourceAsset"],\n description: Optional[str] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n asset_key=AssetKey.from_coercible_or_definition(asset),\n description=check.opt_str_param(description, "description"),\n )\n\n def get_python_identifier(self) -> str:\n """Returns a string uniquely identifying the asset check, that uses only the characters\n allowed in a Python identifier.\n """\n return f"{self.asset_key.to_python_identifier()}_{self.name}"\n\n @property\n def key(self) -> AssetCheckKey:\n return AssetCheckKey(self.asset_key, self.name)
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_spec"}, "asset_dep": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_dep

\nfrom typing import NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_spec import AssetSpec\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\n\nCoercibleToAssetDep = Union[\n    CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset, "AssetDep"\n]\n\n\n
[docs]@experimental\nclass AssetDep(\n NamedTuple(\n "_AssetDep",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ],\n )\n):\n """Specifies a dependency on an upstream asset.\n\n Attributes:\n asset (Union[AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset]): The upstream asset to depend on.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. If not provided and the upstream asset is partitioned, defaults to\n the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n\n Examples:\n .. code-block:: python\n\n upstream_asset = AssetSpec("upstream_asset")\n downstream_asset = AssetSpec(\n "downstream_asset",\n deps=[\n AssetDep(\n upstream_asset,\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1)\n )\n ]\n )\n """\n\n def __new__(\n cls,\n asset: Union[CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset],\n *,\n partition_mapping: Optional[PartitionMapping] = None,\n ):\n if isinstance(asset, list):\n check.list_param(asset, "asset", of_type=str)\n else:\n check.inst_param(\n asset, "asset", (AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset)\n )\n if isinstance(asset, AssetsDefinition) and len(asset.keys) > 1:\n # Only AssetsDefinition with a single asset can be passed\n raise DagsterInvalidDefinitionError(\n "Cannot create an AssetDep from a multi_asset AssetsDefinition."\n " Instead, specify dependencies on the assets created by the multi_asset"\n f" via AssetKeys or strings. 
For the multi_asset {asset.node_def.name}, the"\n f" available keys are: {asset.keys}."\n )\n\n asset_key = _get_asset_key(asset)\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n partition_mapping=check.opt_inst_param(\n partition_mapping,\n "partition_mapping",\n PartitionMapping,\n ),\n )\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetDep") -> "AssetDep":\n # if arg is AssetDep, return the original object to retain partition_mapping\n return arg if isinstance(arg, AssetDep) else AssetDep(asset=arg)
\n\n\ndef _get_asset_key(arg: "CoercibleToAssetDep") -> AssetKey:\n if isinstance(arg, (AssetsDefinition, SourceAsset, AssetSpec)):\n return arg.key\n elif isinstance(arg, AssetDep):\n return arg.asset_key\n else:\n return AssetKey.from_coercible(arg)\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_dep", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_dep"}, "asset_in": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_in

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\nfrom .partition_mapping import PartitionMapping\n\n\n
[docs]class AssetIn(\n NamedTuple(\n "_AssetIn",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[ArbitraryMetadataMapping]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ],\n )\n):\n """Defines an asset dependency.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the input name. Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the input.\n For example, if you only need a subset of columns from an upstream table, you could\n include that in metadata and the IO manager that loads the upstream table could use the\n metadata to determine which columns to load.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. 
If not provided, defaults to the default partition mapping for the\n partitions definition, which is typically maps partition keys to the same partition keys\n in upstream assets.\n dagster_type (DagsterType): Allows specifying type validation functions that\n will be executed on the input of the decorated function before it runs.\n """\n\n def __new__(\n cls,\n key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n input_manager_key: Optional[str] = None,\n partition_mapping: Optional[PartitionMapping] = None,\n dagster_type: Union[DagsterType, Type[NoValueSentinel]] = NoValueSentinel,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n check.invariant(\n not (key and key_prefix), "key and key_prefix cannot both be set on AssetIn"\n )\n\n return super(AssetIn, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n metadata=check.opt_inst_param(metadata, "metadata", Mapping),\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n partition_mapping=check.opt_inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_in", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_in"}, "asset_out": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_out

\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\n\n
[docs]class AssetOut(\n NamedTuple(\n "_AssetOut",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("io_manager_key", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ("backfill_policy", PublicAttr[Optional[BackfillPolicy]]),\n ],\n )\n):\n """Defines one of the assets produced by a :py:func:`@multi_asset <multi_asset>`.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name. When using ``@multi_asset``, the\n asset name defaults to the key of the "outs" dictionary Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. 
(default: True)\n io_manager_key (Optional[str]): The resource key of the IO manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code that generates this asset.\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n key: Optional[CoercibleToAssetKey] = None,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n return super(AssetOut, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n 
is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy, "freshness_policy", FreshnessPolicy\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n ),\n backfill_policy=check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n ),\n )\n\n def to_out(self) -> Out:\n return Out(\n dagster_type=self.dagster_type,\n description=self.description,\n metadata=self.metadata,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n code_version=self.code_version,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_out", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_out"}, "asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_selection

\nimport collections.abc\nimport operator\nfrom abc import ABC, abstractmethod\nfrom functools import reduce\nfrom typing import AbstractSet, Iterable, Optional, Sequence, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, public\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._core.selector.subset_selector import (\n    fetch_connected,\n    fetch_sinks,\n    fetch_sources,\n    parse_clause,\n)\n\nfrom .asset_check_spec import AssetCheckKey\nfrom .asset_graph import AssetGraph, InternalAssetGraph\nfrom .assets import AssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n    key_prefix_from_coercible,\n)\nfrom .source_asset import SourceAsset\n\nCoercibleToAssetSelection: TypeAlias = Union[\n    str,\n    Sequence[str],\n    Sequence[AssetKey],\n    Sequence[Union["AssetsDefinition", "SourceAsset"]],\n    "AssetSelection",\n]\n\n\n
[docs]class AssetSelection(ABC):\n """An AssetSelection defines a query over a set of assets and asset checks, normally all that are defined in a code location.\n\n You can use the "|", "&", and "-" operators to create unions, intersections, and differences of selections, respectively.\n\n AssetSelections are typically used with :py:func:`define_asset_job`.\n\n By default, selecting assets will also select all of the asset checks that target those assets.\n\n Examples:\n .. code-block:: python\n\n # Select all assets in group "marketing":\n AssetSelection.groups("marketing")\n\n # Select all assets in group "marketing", as well as the asset with key "promotion":\n AssetSelection.groups("marketing") | AssetSelection.keys("promotion")\n\n # Select all assets in group "marketing" that are downstream of asset "leads":\n AssetSelection.groups("marketing") & AssetSelection.keys("leads").downstream()\n\n # Select a list of assets:\n AssetSelection.assets(*my_assets_list)\n\n # Select all assets except for those in group "marketing"\n AssetSelection.all() - AssetSelection.groups("marketing")\n\n # Select all assets which are materialized by the same op as "projections":\n AssetSelection.keys("projections").required_multi_asset_neighbors()\n\n # Select all assets in group "marketing" and exclude their asset checks:\n AssetSelection.groups("marketing") - AssetSelection.all_asset_checks()\n\n # Select all asset checks that target a list of assets:\n AssetSelection.checks_for_assets(*my_assets_list)\n\n # Select a specific asset check:\n AssetSelection.checks(my_asset_check)\n\n """\n\n
[docs] @public\n @staticmethod\n def all() -> "AllSelection":\n """Returns a selection that includes all assets and asset checks."""\n return AllSelection()
\n\n
[docs] @public\n @staticmethod\n def all_asset_checks() -> "AllAssetCheckSelection":\n """Returns a selection that includes all asset checks."""\n return AllAssetCheckSelection()
\n\n
[docs] @public\n @staticmethod\n def assets(*assets_defs: AssetsDefinition) -> "KeysAssetSelection":\n """Returns a selection that includes all of the provided assets and asset checks that target them."""\n return KeysAssetSelection(*(key for assets_def in assets_defs for key in assets_def.keys))
\n\n
[docs] @public\n @staticmethod\n def keys(*asset_keys: CoercibleToAssetKey) -> "KeysAssetSelection":\n """Returns a selection that includes assets with any of the provided keys and all asset checks that target them.\n\n Examples:\n .. code-block:: python\n\n AssetSelection.keys(AssetKey(["a"]))\n\n AssetSelection.keys("a")\n\n AssetSelection.keys(AssetKey(["a"]), AssetKey(["b"]))\n\n AssetSelection.keys("a", "b")\n\n asset_key_list = [AssetKey(["a"]), AssetKey(["b"])]\n AssetSelection.keys(*asset_key_list)\n """\n _asset_keys = [\n AssetKey.from_user_string(key) if isinstance(key, str) else AssetKey.from_coercible(key)\n for key in asset_keys\n ]\n return KeysAssetSelection(*_asset_keys)
\n\n
[docs] @public\n @staticmethod\n def key_prefixes(\n *key_prefixes: CoercibleToAssetKeyPrefix, include_sources: bool = False\n ) -> "KeyPrefixesAssetSelection":\n """Returns a selection that includes assets that match any of the provided key prefixes and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the key prefix(es)\n in the selection.\n\n Examples:\n .. code-block:: python\n\n # match any asset key where the first segment is equal to "a" or "b"\n # e.g. AssetKey(["a", "b", "c"]) would match, but AssetKey(["abc"]) would not.\n AssetSelection.key_prefixes("a", "b")\n\n # match any asset key where the first two segments are ["a", "b"] or ["a", "c"]\n AssetSelection.key_prefixes(["a", "b"], ["a", "c"])\n """\n _asset_key_prefixes = [key_prefix_from_coercible(key_prefix) for key_prefix in key_prefixes]\n return KeyPrefixesAssetSelection(*_asset_key_prefixes, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def groups(*group_strs, include_sources: bool = False) -> "GroupsAssetSelection":\n """Returns a selection that includes materializable assets that belong to any of the\n provided groups and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the group in the\n selection.\n """\n check.tuple_param(group_strs, "group_strs", of_type=str)\n return GroupsAssetSelection(*group_strs, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def checks_for_assets(*assets_defs: AssetsDefinition) -> "AssetChecksForAssetKeys":\n """Returns a selection with the asset checks that target the provided assets."""\n return AssetChecksForAssetKeys(\n [key for assets_def in assets_defs for key in assets_def.keys]\n )
\n\n
[docs] @public\n @staticmethod\n def checks(*asset_checks: AssetChecksDefinition) -> "AssetChecksForHandles":\n """Returns a selection that includes all of the provided asset checks."""\n return AssetChecksForHandles(\n [\n AssetCheckKey(asset_key=AssetKey.from_coercible(spec.asset_key), name=spec.name)\n for checks_def in asset_checks\n for spec in checks_def.specs\n ]\n )
\n\n
[docs] @public\n def downstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "DownstreamAssetSelection":\n """Returns a selection that includes all assets that are downstream of any of the assets in\n this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates through each\n asset in this selection and returns the union of all downstream assets.\n\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are children or grandchildren of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each downstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return DownstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def upstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "UpstreamAssetSelection":\n """Returns a selection that includes all materializable assets that are upstream of any of\n the assets in this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates\n through each asset in this selection and returns the union of all upstream assets.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as upstream of regular assets.\n\n Args:\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are parents or grandparents of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each upstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return UpstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def sinks(self) -> "SinkAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the sink\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A sink asset is an asset that has no downstream dependencies within the asset selection.\n The sink asset can have downstream dependencies outside of the asset selection.\n """\n return SinkAssetSelection(self)
\n\n
[docs] @public\n def required_multi_asset_neighbors(self) -> "RequiredNeighborsAssetSelection":\n """Given an asset selection in which some assets are output from a multi-asset compute op\n which cannot be subset, returns a new asset selection that contains all of the assets\n required to execute the original asset selection. Includes the asset checks targeting the returned assets.\n """\n return RequiredNeighborsAssetSelection(self)
\n\n
[docs] @public\n def roots(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is an asset that has no upstream dependencies within the asset selection.\n The root asset can have downstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return RootAssetSelection(self)
\n\n
[docs] @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use AssetSelection.roots instead.")\n def sources(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is a materializable asset that has no upstream dependencies within the asset\n selection. The root asset can have downstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return self.roots()
\n\n
[docs] @public\n def upstream_source_assets(self) -> "SourceAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the source\n assets upstream of assets in the original selection. Includes the asset checks targeting the returned assets.\n """\n return SourceAssetSelection(self)
\n\n
[docs] @public\n def without_checks(self) -> "AssetSelection":\n """Removes all asset checks in the selection."""\n return self - AssetSelection.all_asset_checks()
\n\n def __or__(self, other: "AssetSelection") -> "OrAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return OrAssetSelection(self, other)\n\n def __and__(self, other: "AssetSelection") -> "AndAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return AndAssetSelection(self, other)\n\n def __sub__(self, other: "AssetSelection") -> "SubAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return SubAssetSelection(self, other)\n\n def resolve(\n self, all_assets: Union[Iterable[Union[AssetsDefinition, SourceAsset]], AssetGraph]\n ) -> AbstractSet[AssetKey]:\n if isinstance(all_assets, AssetGraph):\n asset_graph = all_assets\n else:\n check.iterable_param(all_assets, "all_assets", (AssetsDefinition, SourceAsset))\n asset_graph = AssetGraph.from_assets(all_assets)\n\n resolved = self.resolve_inner(asset_graph)\n resolved_source_assets = asset_graph.source_asset_keys & resolved\n resolved_regular_assets = resolved - asset_graph.source_asset_keys\n check.invariant(\n not (len(resolved_source_assets) > 0 and len(resolved_regular_assets) > 0),\n "Asset selection specified both regular assets and source assets. This is not"\n " currently supported. Selections must be all regular assets or all source assets.",\n )\n return resolved\n\n @abstractmethod\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n raise NotImplementedError()\n\n def resolve_checks(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """We don't need this method currently, but it makes things consistent with resolve_inner. Currently\n we don't store checks in the ExternalAssetGraph, so we only support InternalAssetGraph.\n """\n return self.resolve_checks_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """By default, resolve to checks that target the selected assets. 
This is overriden for particular selections."""\n asset_keys = self.resolve(asset_graph)\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in asset_keys}\n\n @staticmethod\n def _selection_from_string(string: str) -> "AssetSelection":\n from dagster._core.definitions import AssetSelection\n\n if string == "*":\n return AssetSelection.all()\n\n parts = parse_clause(string)\n if not parts:\n check.failed(f"Invalid selection string: {string}")\n u, item, d = parts\n\n selection: AssetSelection = AssetSelection.keys(item)\n if u:\n selection = selection.upstream(u)\n if d:\n selection = selection.downstream(d)\n return selection\n\n @classmethod\n def from_coercible(cls, selection: CoercibleToAssetSelection) -> "AssetSelection":\n if isinstance(selection, str):\n return cls._selection_from_string(selection)\n elif isinstance(selection, AssetSelection):\n return selection\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, str) for el in selection\n ):\n return reduce(\n operator.or_, [cls._selection_from_string(cast(str, s)) for s in selection]\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, (AssetsDefinition, SourceAsset)) for el in selection\n ):\n return AssetSelection.keys(\n *(\n key\n for el in selection\n for key in (\n el.keys if isinstance(el, AssetsDefinition) else [cast(SourceAsset, el).key]\n )\n )\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, AssetKey) for el in selection\n ):\n return cls.keys(*cast(Sequence[AssetKey], selection))\n else:\n check.failed(\n "selection argument must be one of str, Sequence[str], Sequence[AssetKey],"\n " Sequence[AssetsDefinition], Sequence[SourceAsset], AssetSelection. Was"\n f" {type(selection)}."\n )
\n\n\nclass AllSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return asset_graph.materializable_asset_keys\n\n\nclass AllAssetCheckSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return asset_graph.asset_check_keys\n\n\nclass AssetChecksForAssetKeys(AssetSelection):\n def __init__(self, keys: Sequence[AssetKey]):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in self._keys}\n\n\nclass AssetChecksForHandles(AssetSelection):\n def __init__(self, asset_check_keys: Sequence[AssetCheckKey]):\n self._asset_check_keys = asset_check_keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {\n handle for handle in asset_graph.asset_check_keys if handle in self._asset_check_keys\n }\n\n\nclass AndAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) & self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) & self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SubAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n 
def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) - self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) - self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SinkAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sinks(asset_graph.asset_dep_graph, selection)\n\n\nclass RequiredNeighborsAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n output = set(selection)\n for asset_key in selection:\n output.update(asset_graph.get_required_multi_asset_keys(asset_key))\n return output\n\n\nclass RootAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sources(asset_graph.asset_dep_graph, selection)\n\n\nclass DownstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: Optional[bool] = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="downstream",\n depth=self.depth,\n )\n for asset_key in 
selection\n ],\n ),\n selection if not self.include_self else set(),\n )\n\n\nclass GroupsAssetSelection(AssetSelection):\n def __init__(self, *groups: str, include_sources: bool):\n self._groups = groups\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n asset_key\n for asset_key, group in asset_graph.group_names_by_key.items()\n if group in self._groups and asset_key in base_set\n }\n\n\nclass KeysAssetSelection(AssetSelection):\n def __init__(self, *keys: AssetKey):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n specified_keys = set(self._keys)\n invalid_keys = {key for key in specified_keys if key not in asset_graph.all_asset_keys}\n if invalid_keys:\n raise DagsterInvalidSubsetError(\n f"AssetKey(s) {invalid_keys} were selected, but no AssetsDefinition objects supply "\n "these keys. 
Make sure all keys are spelled correctly, and all AssetsDefinitions "\n "are correctly added to the `Definitions`."\n )\n return specified_keys\n\n\nclass KeyPrefixesAssetSelection(AssetSelection):\n def __init__(self, *key_prefixes: Sequence[str], include_sources: bool):\n self._key_prefixes = key_prefixes\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n key for key in base_set if any(key.has_prefix(prefix) for prefix in self._key_prefixes)\n }\n\n\nclass OrAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) | self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) | self._right.resolve_checks_inner(\n asset_graph\n )\n\n\ndef _fetch_all_upstream(\n selection: AbstractSet[AssetKey],\n asset_graph: AssetGraph,\n depth: Optional[int] = None,\n include_self: bool = True,\n) -> AbstractSet[AssetKey]:\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="upstream",\n depth=depth,\n )\n for asset_key in selection\n ],\n set(),\n ),\n selection if not include_self else set(),\n )\n\n\nclass UpstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: bool = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = 
self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph, self.depth, self.include_self)\n return {key for key in all_upstream if key not in asset_graph.source_asset_keys}\n\n\nclass SourceAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph)\n return {key for key in all_upstream if key in asset_graph.source_asset_keys}\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_selection"}, "asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_sensor_definition

\nimport inspect\nfrom typing import Any, Callable, NamedTuple, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_annotation import get_resource_args\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    SensorDefinition,\n    SensorType,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\n\nclass AssetSensorParamNames(NamedTuple):\n    context_param_name: Optional[str]\n    event_log_entry_param_name: Optional[str]\n\n\ndef get_asset_sensor_param_names(fn: Callable) -> AssetSensorParamNames:\n    """Determines the names of the context and event log entry parameters for an asset sensor function.\n    These are assumed to be the first two non-resource params, in order (context param before event log entry).\n    """\n    resource_params = {param.name for param in get_resource_args(fn)}\n\n    non_resource_params = [\n        param.name for param in get_function_params(fn) if param.name not in resource_params\n    ]\n\n    context_param_name = non_resource_params[0] if len(non_resource_params) > 0 else None\n    event_log_entry_param_name = non_resource_params[1] if len(non_resource_params) > 1 else None\n\n    return AssetSensorParamNames(\n        context_param_name=context_param_name, event_log_entry_param_name=event_log_entry_param_name\n    )\n\n\n
[docs]class AssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a given\n asset.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n Args:\n name (str): The name of the sensor to create.\n asset_key (AssetKey): The asset_key this sensor monitors.\n asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.SensorEvaluationContext` and\n an EventLogEntry corresponding to an AssetMaterialization event.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_key: AssetKey,\n job_name: Optional[str],\n asset_materialization_fn: Callable[\n ...,\n RawSensorEvaluationFunctionReturn,\n ],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn) -> Any:\n def _fn(context) -> Any:\n after_cursor = None\n if context.cursor:\n try:\n after_cursor = int(context.cursor)\n except ValueError:\n after_cursor = None\n\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=self._asset_key,\n after_cursor=after_cursor,\n ),\n ascending=False,\n limit=1,\n )\n\n if not event_records:\n yield SkipReason(\n f"No new materialization events found for asset key {self._asset_key}"\n )\n return\n\n event_record = event_records[0]\n\n (\n context_param_name,\n event_log_entry_param_name,\n ) = get_asset_sensor_param_names(materialization_fn)\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n # Build asset sensor function args, which can include any subset of\n # context arg, event log entry arg, and any resource args\n 
args = resource_args_populated\n if context_param_name:\n args[context_param_name] = context\n if event_log_entry_param_name:\n args[event_log_entry_param_name] = event_record.event_log_entry\n\n result = materialization_fn(**args)\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n context.update_cursor(str(event_record.storage_id))\n\n return _fn\n\n super(AssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn"),\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """AssetKey: The key of the asset targeted by this sensor."""\n return self._asset_key\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_sensor_definition"}, "asset_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Iterable, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .auto_materialize_policy import AutoMaterializePolicy\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .freshness_policy import FreshnessPolicy\nfrom .metadata import MetadataUserInput\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\n\n# SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE lives on the metadata of an asset\n# (which currently ends up on the Output associated with the asset key)\n# whih encodes the execution type the of asset. "Unexecutable" assets are assets\n# that cannot be materialized in Dagster, but can have events in the event\n# log keyed off of them, making Dagster usable as a observability and lineage tool\n# for externally materialized assets.\nSYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE = "dagster/asset_execution_type"\n\n\nclass AssetExecutionType(Enum):\n    UNEXECUTABLE = "UNEXECUTABLE"\n    MATERIALIZATION = "MATERIALIZATION"\n\n    @staticmethod\n    def is_executable(varietal_str: Optional[str]) -> bool:\n        return AssetExecutionType.str_to_enum(varietal_str) in {AssetExecutionType.MATERIALIZATION}\n\n    @staticmethod\n    def str_to_enum(varietal_str: Optional[str]) -> "AssetExecutionType":\n        return (\n            AssetExecutionType.MATERIALIZATION\n            if varietal_str is None\n            else AssetExecutionType(varietal_str)\n        )\n\n\n
[docs]@experimental\nclass AssetSpec(\n NamedTuple(\n "_AssetSpec",\n [\n ("key", PublicAttr[AssetKey]),\n ("deps", PublicAttr[Iterable["AssetDep"]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("skippable", PublicAttr[bool]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ],\n )\n):\n """Specifies the core attributes of an asset. This object is attached to the decorated\n function that defines how it materialized.\n\n Attributes:\n key (AssetKey): The unique identifier for this asset.\n deps (Optional[AbstractSet[AssetKey]]): The asset keys for the upstream assets that\n materializing this asset depends on.\n description (Optional[str]): Human-readable description of this asset.\n metadata (Optional[Dict[str, Any]]): A dict of static metadata for this asset.\n For example, users can provide information about the database table this\n asset corresponds to.\n skippable (bool): Whether this asset can be omitted during materialization, causing downstream\n dependencies to skip.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. 
If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code for this specific asset,\n overriding the code version of the materialization function\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key: CoercibleToAssetKey,\n *,\n deps: Optional[Iterable["CoercibleToAssetDep"]] = None,\n description: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n skippable: bool = False,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n ):\n from dagster._core.definitions.asset_dep import AssetDep\n\n dep_set = {}\n if deps:\n for dep in deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys.\n if asset_dep.asset_key in dep_set.keys():\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once for"\n f" AssetSpec {key}"\n )\n dep_set[asset_dep.asset_key] = asset_dep\n\n return super().__new__(\n cls,\n key=AssetKey.from_coercible(key),\n deps=list(dep_set.values()),\n description=check.opt_str_param(description, "description"),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n skippable=check.bool_param(skippable, "skippable"),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy,\n "freshness_policy",\n FreshnessPolicy,\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy,\n "auto_materialize_policy",\n AutoMaterializePolicy,\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_spec"}, "assets": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.assets

\nimport hashlib\nimport json\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_layer import get_dep_node_handles_of_graph_backed_asset\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.op_selection import get_graph_subset\nfrom dagster._core.definitions.partition_mapping import MultiPartitionMapping\nfrom dagster._core.definitions.resource_requirement import (\n    RequiresResources,\n    ResourceAddable,\n    ResourceRequirement,\n    merge_resource_defs,\n)\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom .dependency import NodeHandle\nfrom .events import AssetKey, CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom .node_definition import NodeDefinition\nfrom 
.op_definition import OpDefinition\nfrom .partition import PartitionsDefinition\nfrom .partition_mapping import (\n    PartitionMapping,\n    get_builtin_partition_mapping_types,\n    infer_partition_mapping,\n)\nfrom .resource_definition import ResourceDefinition\nfrom .source_asset import SourceAsset\nfrom .utils import DEFAULT_GROUP_NAME, validate_group_name\n\nif TYPE_CHECKING:\n    from .graph_definition import GraphDefinition\n\n\n
[docs]class AssetsDefinition(ResourceAddable, RequiresResources, IHasInternalInit):\n """Defines a set of assets that are produced by the same op or graph.\n\n AssetsDefinitions are typically not instantiated directly, but rather produced using the\n :py:func:`@asset <asset>` or :py:func:`@multi_asset <multi_asset>` decorators.\n """\n\n _node_def: NodeDefinition\n _keys_by_input_name: Mapping[str, AssetKey]\n _keys_by_output_name: Mapping[str, AssetKey]\n _partitions_def: Optional[PartitionsDefinition]\n _partition_mappings: Mapping[AssetKey, PartitionMapping]\n _asset_deps: Mapping[AssetKey, AbstractSet[AssetKey]]\n _resource_defs: Mapping[str, ResourceDefinition]\n _group_names_by_key: Mapping[AssetKey, str]\n _selected_asset_keys: AbstractSet[AssetKey]\n _can_subset: bool\n _metadata_by_key: Mapping[AssetKey, ArbitraryMetadataMapping]\n _freshness_policies_by_key: Mapping[AssetKey, FreshnessPolicy]\n _auto_materialize_policies_by_key: Mapping[AssetKey, AutoMaterializePolicy]\n _backfill_policy: Optional[BackfillPolicy]\n _code_versions_by_key: Mapping[AssetKey, Optional[str]]\n _descriptions_by_key: Mapping[AssetKey, str]\n _selected_asset_check_keys: AbstractSet[AssetCheckKey]\n\n def __init__(\n self,\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]] = None,\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]] = None,\n selected_asset_keys: Optional[AbstractSet[AssetKey]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_names_by_key: Optional[Mapping[AssetKey, str]] = None,\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]] = None,\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, 
AutoMaterializePolicy]] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]] = None,\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]] = None,\n # if adding new fields, make sure to handle them in the with_attributes, from_graph, and\n # get_attributes_dict methods\n ):\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .graph_definition import GraphDefinition\n\n if isinstance(node_def, GraphDefinition):\n _validate_graph_def(node_def)\n\n self._node_def = node_def\n self._keys_by_input_name = check.mapping_param(\n keys_by_input_name,\n "keys_by_input_name",\n key_type=str,\n value_type=AssetKey,\n )\n self._keys_by_output_name = check.mapping_param(\n keys_by_output_name,\n "keys_by_output_name",\n key_type=str,\n value_type=AssetKey,\n )\n\n check.opt_mapping_param(\n check_specs_by_output_name,\n "check_specs_by_output_name",\n key_type=str,\n value_type=AssetCheckSpec,\n )\n\n # if not specified assume all output assets depend on all input assets\n all_asset_keys = set(keys_by_output_name.values())\n input_asset_keys = set(keys_by_input_name.values())\n\n self._partitions_def = partitions_def\n self._partition_mappings = partition_mappings or {}\n builtin_partition_mappings = get_builtin_partition_mapping_types()\n for asset_key, partition_mapping in self._partition_mappings.items():\n if not isinstance(partition_mapping, builtin_partition_mappings):\n warnings.warn(\n f"Non-built-in PartitionMappings, such as {type(partition_mapping).__name__} "\n "are deprecated and will not work with asset reconciliation. 
The built-in "\n "partition mappings are "\n + ", ".join(\n builtin_partition_mapping.__name__\n for builtin_partition_mapping in builtin_partition_mappings\n )\n + ".",\n category=DeprecationWarning,\n )\n\n if asset_key not in input_asset_keys:\n check.failed(\n f"While constructing AssetsDefinition outputting {all_asset_keys}, received a"\n f" partition mapping for {asset_key} that is not defined in the set of upstream"\n f" assets: {input_asset_keys}"\n )\n\n self._asset_deps = asset_deps or {\n out_asset_key: set(keys_by_input_name.values()) for out_asset_key in all_asset_keys\n }\n check.invariant(\n set(self._asset_deps.keys()) == all_asset_keys,\n "The set of asset keys with dependencies specified in the asset_deps argument must "\n "equal the set of asset keys produced by this AssetsDefinition. \\n"\n f"asset_deps keys: {set(self._asset_deps.keys())} \\n"\n f"expected keys: {all_asset_keys}",\n )\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs")\n )\n\n group_names_by_key = (\n check.mapping_param(group_names_by_key, "group_names_by_key")\n if group_names_by_key\n else {}\n )\n self._group_names_by_key = {}\n # assets that don't have a group name get a DEFAULT_GROUP_NAME\n for key in all_asset_keys:\n group_name = group_names_by_key.get(key)\n self._group_names_by_key[key] = validate_group_name(group_name)\n\n if selected_asset_keys is not None:\n self._selected_asset_keys = selected_asset_keys\n else:\n self._selected_asset_keys = all_asset_keys\n self._can_subset = can_subset\n\n self._code_versions_by_key = {}\n self._metadata_by_key = dict(\n check.opt_mapping_param(\n metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict\n )\n )\n self._descriptions_by_key = dict(\n check.opt_mapping_param(\n descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str\n )\n )\n for output_name, asset_key in keys_by_output_name.items():\n output_def, _ = 
node_def.resolve_output_to_origin(output_name, None)\n self._metadata_by_key[asset_key] = merge_dicts(\n output_def.metadata,\n self._metadata_by_key.get(asset_key, {}),\n )\n description = (\n self._descriptions_by_key.get(asset_key, output_def.description)\n or node_def.description\n )\n if description:\n self._descriptions_by_key[asset_key] = description\n self._code_versions_by_key[asset_key] = output_def.code_version\n\n for key, freshness_policy in (freshness_policies_by_key or {}).items():\n check.param_invariant(\n not (\n freshness_policy\n and self._partitions_def is not None\n and not isinstance(self._partitions_def, TimeWindowPartitionsDefinition)\n ),\n "freshness_policies_by_key",\n "FreshnessPolicies are currently unsupported for assets with partitions of type"\n f" {type(self._partitions_def)}.",\n )\n\n self._freshness_policies_by_key = check.opt_mapping_param(\n freshness_policies_by_key,\n "freshness_policies_by_key",\n key_type=AssetKey,\n value_type=FreshnessPolicy,\n )\n\n self._auto_materialize_policies_by_key = check.opt_mapping_param(\n auto_materialize_policies_by_key,\n "auto_materialize_policies_by_key",\n key_type=AssetKey,\n value_type=AutoMaterializePolicy,\n )\n\n self._backfill_policy = check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n )\n\n if selected_asset_check_keys is None:\n self._check_specs_by_output_name = check_specs_by_output_name or {}\n else:\n self._check_specs_by_output_name = {\n output_name: check_spec\n for output_name, check_spec in (check_specs_by_output_name or {}).items()\n if check_spec.key in selected_asset_check_keys\n }\n\n self._check_specs_by_handle = {\n spec.key: spec for spec in self._check_specs_by_output_name.values()\n }\n if selected_asset_check_keys is not None:\n self._selected_asset_check_keys = selected_asset_check_keys\n else:\n self._selected_asset_check_keys = self._check_specs_by_handle.keys()\n\n if self._partitions_def is None:\n # check if backfill policy is 
BackfillPolicyType.SINGLE_RUN if asset is not partitioned\n check.param_invariant(\n (\n backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n _validate_self_deps(\n input_keys=self._keys_by_input_name.values(),\n output_keys=self._selected_asset_keys,\n partition_mappings=self._partition_mappings,\n partitions_def=self._partitions_def,\n )\n\n @staticmethod\n def dagster_internal_init(\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition],\n partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]],\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]],\n selected_asset_keys: Optional[AbstractSet[AssetKey]],\n can_subset: bool,\n resource_defs: Optional[Mapping[str, object]],\n group_names_by_key: Optional[Mapping[AssetKey, str]],\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]],\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]],\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, AutoMaterializePolicy]],\n backfill_policy: Optional[BackfillPolicy],\n descriptions_by_key: Optional[Mapping[AssetKey, str]],\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]],\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],\n ) -> "AssetsDefinition":\n return AssetsDefinition(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=node_def,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n asset_deps=asset_deps,\n selected_asset_keys=selected_asset_keys,\n can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n metadata_by_key=metadata_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n 
auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n descriptions_by_key=descriptions_by_key,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=selected_asset_check_keys,\n )\n\n def __call__(self, *args: object, **kwargs: object) -> object:\n from .composition import is_in_composition\n from .graph_definition import GraphDefinition\n\n # defer to GraphDefinition.__call__ for graph backed assets, or if invoked in composition\n if isinstance(self.node_def, GraphDefinition) or is_in_composition():\n return self._node_def(*args, **kwargs)\n\n # invoke against self to allow assets def information to be used\n return direct_invocation_result(self, *args, **kwargs)\n\n
[docs] @public\n @experimental_param(param="resource_defs")\n @staticmethod\n def from_graph(\n graph_def: "GraphDefinition",\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ) -> "AssetsDefinition":\n """Constructs an AssetsDefinition from a GraphDefinition.\n\n Args:\n graph_def (GraphDefinition): The GraphDefinition that is an asset.\n keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input\n names of the decorated graph to their corresponding asset keys. If not provided,\n the input asset keys will be created from the graph input names.\n keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output\n names of the decorated graph to their corresponding asset keys. If not provided,\n the output asset keys will be created from the graph output names.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be prepended\n to each key in keys_by_output_name. 
Each item in key_prefix must be a valid name in\n dagster (ie only contains letters, numbers, and _) and may not contain python\n reserved keywords.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by the graph depend on all assets that are consumed by that\n graph. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\n either used as input to the asset or produced within the graph.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n correponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]):\n (Experimental) A mapping of resource keys to resource definitions. These resources\n will be initialized during execution, and can be accessed from the\n body of ops in the graph during execution.\n group_name (Optional[str]): A group name for the constructed asset. Assets without a\n group name are assigned to a group called "default".\n group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group name to be\n associated with some or all of the output assets for this node. Keys are names of the\n outputs, and values are the group name. 
Cannot be used with the group_name argument.\n descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a description to be\n associated with each of the output asstes for this graph.\n metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines metadata to\n be associated with each of the output assets for this node. Keys are names of the\n outputs, and values are dictionaries of metadata to be associated with the related\n asset.\n freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]): Defines a\n FreshnessPolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the FreshnessPolicies to be attached\n to the associated asset.\n auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]): Defines an\n AutoMaterializePolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\n to the associated asset.\n backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy\n """\n return AssetsDefinition._from_node(\n node_def=graph_def,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n key_prefix=key_prefix,\n internal_asset_deps=internal_asset_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n resource_defs=resource_defs,\n group_name=group_name,\n group_names_by_output_name=group_names_by_output_name,\n descriptions_by_output_name=descriptions_by_output_name,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n can_subset=can_subset,\n check_specs=check_specs,\n )
\n\n
[docs] @public\n @staticmethod\n def from_op(\n op_def: OpDefinition,\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n ) -> "AssetsDefinition":\n """Constructs an AssetsDefinition from an OpDefinition.\n\n Args:\n op_def (OpDefinition): The OpDefinition that is an asset.\n keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input\n names of the decorated op to their corresponding asset keys. If not provided,\n the input asset keys will be created from the op input names.\n keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output\n names of the decorated op to their corresponding asset keys. If not provided,\n the output asset keys will be created from the op output names.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be prepended\n to each key in keys_by_output_name. 
Each item in key_prefix must be a valid name in\n dagster (ie only contains letters, numbers, and _) and may not contain python\n reserved keywords.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by the op depend on all assets that are consumed by that\n op. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\n either used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n correponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n group_name (Optional[str]): A group name for the constructed asset. Assets without a\n group name are assigned to a group called "default".\n group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group name to be\n associated with some or all of the output assets for this node. Keys are names of the\n outputs, and values are the group name. Cannot be used with the group_name argument.\n descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a description to be\n associated with each of the output asstes for this graph.\n metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines metadata to\n be associated with each of the output assets for this node. 
Keys are names of the\n outputs, and values are dictionaries of metadata to be associated with the related\n asset.\n freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]): Defines a\n FreshnessPolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the FreshnessPolicies to be attached\n to the associated asset.\n auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]): Defines an\n AutoMaterializePolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\n to the associated asset.\n backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy\n """\n return AssetsDefinition._from_node(\n node_def=op_def,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n key_prefix=key_prefix,\n internal_asset_deps=internal_asset_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n group_name=group_name,\n group_names_by_output_name=group_names_by_output_name,\n descriptions_by_output_name=descriptions_by_output_name,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n can_subset=can_subset,\n )
\n\n @staticmethod\n def _from_node(\n node_def: Union[OpDefinition, "GraphDefinition"],\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ) -> "AssetsDefinition":\n from dagster._core.definitions.decorators.asset_decorator import (\n _validate_and_assign_output_names_to_check_specs,\n )\n\n node_def = check.inst_param(node_def, "node_def", NodeDefinition)\n keys_by_input_name = _infer_keys_by_input_names(\n node_def,\n check.opt_mapping_param(\n keys_by_input_name, "keys_by_input_name", key_type=str, value_type=AssetKey\n ),\n )\n keys_by_output_name = check.opt_mapping_param(\n keys_by_output_name,\n "keys_by_output_name",\n key_type=str,\n value_type=AssetKey,\n )\n internal_asset_deps = check.opt_mapping_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n transformed_internal_asset_deps: 
Dict[AssetKey, AbstractSet[AssetKey]] = {}\n if internal_asset_deps:\n for output_name, asset_keys in internal_asset_deps.items():\n check.invariant(\n output_name in keys_by_output_name,\n f"output_name {output_name} specified in internal_asset_deps does not exist"\n " in the decorated function",\n )\n transformed_internal_asset_deps[keys_by_output_name[output_name]] = asset_keys\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(keys_by_output_name.values())\n )\n\n keys_by_output_name = _infer_keys_by_output_names(\n node_def, keys_by_output_name or {}, check_specs_by_output_name\n )\n\n keys_by_output_name_with_prefix: Dict[str, AssetKey] = {}\n key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix\n for output_name, key in keys_by_output_name.items():\n # add key_prefix to the beginning of each asset key\n key_with_key_prefix = AssetKey(\n list(filter(None, [*(key_prefix_list or []), *key.path]))\n )\n keys_by_output_name_with_prefix[output_name] = key_with_key_prefix\n\n check.param_invariant(\n group_name is None or group_names_by_output_name is None,\n "group_name",\n "Cannot use both group_name and group_names_by_output_name",\n )\n\n if group_name:\n group_names_by_key = {\n asset_key: group_name for asset_key in keys_by_output_name_with_prefix.values()\n }\n elif group_names_by_output_name:\n group_names_by_key = {\n keys_by_output_name_with_prefix[output_name]: group_name\n for output_name, group_name in group_names_by_output_name.items()\n if group_name is not None\n }\n else:\n group_names_by_key = None\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name_with_prefix,\n node_def=node_def,\n asset_deps=transformed_internal_asset_deps or None,\n partitions_def=check.opt_inst_param(\n partitions_def,\n "partitions_def",\n PartitionsDefinition,\n ),\n group_names_by_key=group_names_by_key,\n 
resource_defs=resource_defs,\n partition_mappings=(\n {\n keys_by_input_name[input_name]: partition_mapping\n for input_name, partition_mapping in partition_mappings.items()\n }\n if partition_mappings\n else None\n ),\n metadata_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: metadata\n for output_name, metadata in metadata_by_output_name.items()\n if metadata is not None\n }\n if metadata_by_output_name\n else None\n ),\n freshness_policies_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: freshness_policy\n for output_name, freshness_policy in freshness_policies_by_output_name.items()\n if freshness_policy is not None\n }\n if freshness_policies_by_output_name\n else None\n ),\n auto_materialize_policies_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: auto_materialize_policy\n for output_name, auto_materialize_policy in auto_materialize_policies_by_output_name.items()\n if auto_materialize_policy is not None\n }\n if auto_materialize_policies_by_output_name\n else None\n ),\n backfill_policy=check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n ),\n descriptions_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: description\n for output_name, description in descriptions_by_output_name.items()\n if description is not None\n }\n if descriptions_by_output_name\n else None\n ),\n can_subset=can_subset,\n selected_asset_keys=None, # node has no subselection info\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None,\n )\n\n @public\n @property\n def can_subset(self) -> bool:\n """bool: If True, indicates that this AssetsDefinition may materialize any subset of its\n asset keys in a given computation (as opposed to being required to materialize all asset\n keys).\n """\n return self._can_subset\n\n @public\n @property\n def group_names_by_key(self) -> Mapping[AssetKey, str]:\n """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this 
AssetsDefinition\n to the group names assigned to them. If there is no assigned group name for a given AssetKey,\n it will not be present in this dictionary.\n """\n return self._group_names_by_key\n\n @public\n @property\n def descriptions_by_key(self) -> Mapping[AssetKey, str]:\n """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this AssetsDefinition\n to the descriptions assigned to them. If there is no assigned description for a given AssetKey,\n it will not be present in this dictionary.\n """\n return self._descriptions_by_key\n\n @public\n @property\n def op(self) -> OpDefinition:\n """OpDefinition: Returns the OpDefinition that is used to materialize the assets in this\n AssetsDefinition.\n """\n check.invariant(\n isinstance(self._node_def, OpDefinition),\n "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",\n )\n return cast(OpDefinition, self._node_def)\n\n @public\n @property\n def node_def(self) -> NodeDefinition:\n """NodeDefinition: Returns the OpDefinition or GraphDefinition that is used to materialize\n the assets in this AssetsDefinition.\n """\n return self._node_def\n\n @public\n @property\n def asset_deps(self) -> Mapping[AssetKey, AbstractSet[AssetKey]]:\n """Maps assets that are produced by this definition to assets that they depend on. The\n dependencies can be either "internal", meaning that they refer to other assets that are\n produced by this definition, or "external", meaning that they refer to assets that aren't\n produced by this definition.\n """\n return self._asset_deps\n\n @property\n def input_names(self) -> Iterable[str]:\n """Iterable[str]: The set of input names of the underlying NodeDefinition for this\n AssetsDefinition.\n """\n return self.keys_by_input_name.keys()\n\n @public\n @property\n def key(self) -> AssetKey:\n """AssetKey: The asset key associated with this AssetsDefinition. 
If this AssetsDefinition\n has more than one asset key, this will produce an error.\n """\n check.invariant(\n len(self.keys) == 1,\n "Tried to retrieve asset key from an assets definition with multiple asset keys: "\n + ", ".join([str(ak.to_string()) for ak in self._keys_by_output_name.values()]),\n )\n\n return next(iter(self.keys))\n\n @public\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n """Mapping[str, ResourceDefinition]: A mapping from resource name to ResourceDefinition for\n the resources bound to this AssetsDefinition.\n """\n return dict(self._resource_defs)\n\n @public\n @property\n def keys(self) -> AbstractSet[AssetKey]:\n """AbstractSet[AssetKey]: The asset keys associated with this AssetsDefinition."""\n return self._selected_asset_keys\n\n @public\n @property\n def dependency_keys(self) -> Iterable[AssetKey]:\n """Iterable[AssetKey]: The asset keys which are upstream of any asset included in this\n AssetsDefinition.\n """\n # the input asset keys that are directly upstream of a selected asset key\n upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}\n input_keys = set(self._keys_by_input_name.values())\n return upstream_keys.intersection(input_keys)\n\n @property\n def node_keys_by_output_name(self) -> Mapping[str, AssetKey]:\n """AssetKey for each output on the underlying NodeDefinition."""\n return self._keys_by_output_name\n\n @property\n def node_keys_by_input_name(self) -> Mapping[str, AssetKey]:\n """AssetKey for each input on the underlying NodeDefinition."""\n return self._keys_by_input_name\n\n @property\n def check_specs_by_output_name(self) -> Mapping[str, AssetCheckSpec]:\n return self._check_specs_by_output_name\n\n def get_spec_for_check_key(self, asset_check_key: AssetCheckKey) -> AssetCheckSpec:\n return self._check_specs_by_handle[asset_check_key]\n\n @property\n def keys_by_output_name(self) -> Mapping[str, AssetKey]:\n return {\n name: key for name, key in 
self.node_keys_by_output_name.items() if key in self.keys\n }\n\n @property\n def keys_by_input_name(self) -> Mapping[str, AssetKey]:\n upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}\n return {\n name: key for name, key in self.node_keys_by_input_name.items() if key in upstream_keys\n }\n\n @property\n def freshness_policies_by_key(self) -> Mapping[AssetKey, FreshnessPolicy]:\n return self._freshness_policies_by_key\n\n @property\n def auto_materialize_policies_by_key(self) -> Mapping[AssetKey, AutoMaterializePolicy]:\n return self._auto_materialize_policies_by_key\n\n @property\n def backfill_policy(self) -> Optional[BackfillPolicy]:\n return self._backfill_policy\n\n @public\n @property\n def partitions_def(self) -> Optional[PartitionsDefinition]:\n """Optional[PartitionsDefinition]: The PartitionsDefinition for this AssetsDefinition (if any)."""\n return self._partitions_def\n\n @property\n def metadata_by_key(self) -> Mapping[AssetKey, ArbitraryMetadataMapping]:\n return self._metadata_by_key\n\n @property\n def code_versions_by_key(self) -> Mapping[AssetKey, Optional[str]]:\n return self._code_versions_by_key\n\n @property\n def partition_mappings(self) -> Mapping[AssetKey, PartitionMapping]:\n return self._partition_mappings\n\n
[docs] @public\n def get_partition_mapping(self, in_asset_key: AssetKey) -> Optional[PartitionMapping]:\n """Returns the partition mapping between keys in this AssetsDefinition and a given input\n asset key (if any).\n """\n return self._partition_mappings.get(in_asset_key)
\n\n @public\n @property\n def check_specs(self) -> Iterable[AssetCheckSpec]:\n """Returns the asset check specs defined on this AssetsDefinition, i.e. the checks that can\n be executed while materializing the assets.\n\n Returns:\n Iterable[AssetsCheckSpec]:\n """\n return self._check_specs_by_output_name.values()\n\n @property\n def check_keys(self) -> AbstractSet[AssetCheckKey]:\n """Returns the selected asset checks associated by this AssetsDefinition.\n\n Returns:\n AbstractSet[Tuple[AssetKey, str]]: The selected asset checks. An asset check is\n identified by the asset key and the name of the check.\n """\n return self._selected_asset_check_keys\n\n def is_asset_executable(self, asset_key: AssetKey) -> bool:\n """Returns True if the asset key is materializable by this AssetsDefinition.\n\n Args:\n asset_key (AssetKey): The asset key to check.\n\n Returns:\n bool: True if the asset key is materializable by this AssetsDefinition.\n """\n from dagster._core.definitions.asset_spec import (\n SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE,\n AssetExecutionType,\n )\n\n return AssetExecutionType.is_executable(\n self._metadata_by_key.get(asset_key, {}).get(SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE)\n )\n\n def get_partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:\n return self._partition_mappings.get(self._keys_by_input_name[input_name])\n\n def infer_partition_mapping(\n self, upstream_asset_key: AssetKey, upstream_partitions_def: Optional[PartitionsDefinition]\n ) -> PartitionMapping:\n with disable_dagster_warnings():\n partition_mapping = self._partition_mappings.get(upstream_asset_key)\n return infer_partition_mapping(\n partition_mapping, self._partitions_def, upstream_partitions_def\n )\n\n def get_output_name_for_asset_key(self, key: AssetKey) -> str:\n for output_name, asset_key in self.keys_by_output_name.items():\n if key == asset_key:\n return output_name\n\n raise DagsterInvariantViolationError(\n f"Asset key 
{key.to_user_string()} not found in AssetsDefinition"\n )\n\n def get_op_def_for_asset_key(self, key: AssetKey) -> OpDefinition:\n """If this is an op-backed asset, returns the op def. If it's a graph-backed asset,\n returns the op def within the graph that produces the given asset key.\n """\n output_name = self.get_output_name_for_asset_key(key)\n return self.node_def.resolve_output_to_origin_op_def(output_name)\n\n def with_attributes(\n self,\n *,\n output_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,\n input_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,\n group_names_by_key: Optional[Mapping[AssetKey, str]] = None,\n descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,\n freshness_policy: Optional[\n Union[FreshnessPolicy, Mapping[AssetKey, FreshnessPolicy]]\n ] = None,\n auto_materialize_policy: Optional[\n Union[AutoMaterializePolicy, Mapping[AssetKey, AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n ) -> "AssetsDefinition":\n output_asset_key_replacements = check.opt_mapping_param(\n output_asset_key_replacements,\n "output_asset_key_replacements",\n key_type=AssetKey,\n value_type=AssetKey,\n )\n input_asset_key_replacements = check.opt_mapping_param(\n input_asset_key_replacements,\n "input_asset_key_replacements",\n key_type=AssetKey,\n value_type=AssetKey,\n )\n group_names_by_key = check.opt_mapping_param(\n group_names_by_key, "group_names_by_key", key_type=AssetKey, value_type=str\n )\n descriptions_by_key = check.opt_mapping_param(\n descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str\n )\n metadata_by_key = check.opt_mapping_param(\n metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict\n )\n\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n if group_names_by_key:\n 
group_name_conflicts = [\n asset_key\n for asset_key in group_names_by_key\n if asset_key in self.group_names_by_key\n and self.group_names_by_key[asset_key] != DEFAULT_GROUP_NAME\n ]\n if group_name_conflicts:\n raise DagsterInvalidDefinitionError(\n "Group name already exists on assets"\n f" {', '.join(asset_key.to_user_string() for asset_key in group_name_conflicts)}"\n )\n\n replaced_group_names_by_key = {\n output_asset_key_replacements.get(key, key): group_name\n for key, group_name in self.group_names_by_key.items()\n }\n\n if freshness_policy:\n freshness_policy_conflicts = (\n self.freshness_policies_by_key.keys()\n if isinstance(freshness_policy, FreshnessPolicy)\n else (freshness_policy.keys() & self.freshness_policies_by_key.keys())\n )\n if freshness_policy_conflicts:\n raise DagsterInvalidDefinitionError(\n "FreshnessPolicy already exists on assets"\n f" {', '.join(key.to_string() for key in freshness_policy_conflicts)}"\n )\n\n replaced_freshness_policies_by_key = {}\n for key in self.keys:\n if isinstance(freshness_policy, FreshnessPolicy):\n replaced_freshness_policy = freshness_policy\n elif freshness_policy:\n replaced_freshness_policy = freshness_policy.get(key)\n else:\n replaced_freshness_policy = self.freshness_policies_by_key.get(key)\n\n if replaced_freshness_policy:\n replaced_freshness_policies_by_key[output_asset_key_replacements.get(key, key)] = (\n replaced_freshness_policy\n )\n\n if auto_materialize_policy:\n auto_materialize_policy_conflicts = (\n self.auto_materialize_policies_by_key.keys()\n if isinstance(auto_materialize_policy, AutoMaterializePolicy)\n else (auto_materialize_policy.keys() & self.auto_materialize_policies_by_key.keys())\n )\n if auto_materialize_policy_conflicts:\n raise DagsterInvalidDefinitionError(\n "AutoMaterializePolicy already exists on assets"\n f" {', '.join(key.to_string() for key in auto_materialize_policy_conflicts)}"\n )\n\n replaced_auto_materialize_policies_by_key = {}\n for key in self.keys:\n if 
isinstance(auto_materialize_policy, AutoMaterializePolicy):\n replaced_auto_materialize_policy = auto_materialize_policy\n elif auto_materialize_policy:\n replaced_auto_materialize_policy = auto_materialize_policy.get(key)\n else:\n replaced_auto_materialize_policy = self.auto_materialize_policies_by_key.get(key)\n\n if replaced_auto_materialize_policy:\n replaced_auto_materialize_policies_by_key[\n output_asset_key_replacements.get(key, key)\n ] = replaced_auto_materialize_policy\n\n replaced_descriptions_by_key = {\n output_asset_key_replacements.get(key, key): description\n for key, description in descriptions_by_key.items()\n }\n\n if not metadata_by_key:\n metadata_by_key = self.metadata_by_key\n\n replaced_metadata_by_key = {\n output_asset_key_replacements.get(key, key): metadata\n for key, metadata in metadata_by_key.items()\n }\n\n replaced_attributes = dict(\n keys_by_input_name={\n input_name: input_asset_key_replacements.get(key, key)\n for input_name, key in self._keys_by_input_name.items()\n },\n keys_by_output_name={\n output_name: output_asset_key_replacements.get(key, key)\n for output_name, key in self._keys_by_output_name.items()\n },\n partition_mappings={\n input_asset_key_replacements.get(key, key): partition_mapping\n for key, partition_mapping in self._partition_mappings.items()\n },\n asset_deps={\n # replace both the keys and the values in this mapping\n output_asset_key_replacements.get(key, key): {\n input_asset_key_replacements.get(\n upstream_key,\n output_asset_key_replacements.get(upstream_key, upstream_key),\n )\n for upstream_key in value\n }\n for key, value in self.asset_deps.items()\n },\n selected_asset_keys={\n output_asset_key_replacements.get(key, key) for key in self._selected_asset_keys\n },\n group_names_by_key={\n **replaced_group_names_by_key,\n **group_names_by_key,\n },\n metadata_by_key=replaced_metadata_by_key,\n freshness_policies_by_key=replaced_freshness_policies_by_key,\n 
auto_materialize_policies_by_key=replaced_auto_materialize_policies_by_key,\n backfill_policy=backfill_policy if backfill_policy else self.backfill_policy,\n descriptions_by_key=replaced_descriptions_by_key,\n )\n\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n\n def _subset_graph_backed_asset(\n self,\n selected_asset_keys: AbstractSet[AssetKey],\n ):\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n if not isinstance(self.node_def, GraphDefinition):\n raise DagsterInvalidInvocationError(\n "Method _subset_graph_backed_asset cannot subset an asset that is not a graph"\n )\n\n # All asset keys in selected_asset_keys are outputted from the same top-level graph backed asset\n dep_node_handles_by_asset_key = get_dep_node_handles_of_graph_backed_asset(\n self.node_def, self\n )\n op_selection: List[str] = []\n for asset_key in selected_asset_keys:\n dep_node_handles = dep_node_handles_by_asset_key[asset_key]\n for dep_node_handle in dep_node_handles:\n op_selection.append(".".join(dep_node_handle.path[1:]))\n\n return get_graph_subset(self.node_def, op_selection)\n\n def subset_for(\n self,\n selected_asset_keys: AbstractSet[AssetKey],\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],\n ) -> "AssetsDefinition":\n """Create a subset of this AssetsDefinition that will only materialize the assets and checks\n in the selected set.\n\n Args:\n selected_asset_keys (AbstractSet[AssetKey]): The total set of asset keys\n selected_asset_check_keys (AbstractSet[AssetCheckKey]): The selected asset checks\n """\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n check.invariant(\n self.can_subset,\n f"Attempted to subset AssetsDefinition for {self.node_def.name}, but can_subset=False.",\n )\n\n # Set of assets within selected_asset_keys which are outputted by this AssetDefinition\n asset_subselection = selected_asset_keys & self.keys\n if selected_asset_check_keys is 
None:\n # filter to checks that target selected asset keys\n asset_check_subselection = {\n key for key in self.check_keys if key.asset_key in asset_subselection\n }\n else:\n asset_check_subselection = selected_asset_check_keys & self.check_keys\n\n # Early escape if all assets in AssetsDefinition are selected\n if asset_subselection == self.keys and asset_check_subselection == self.check_keys:\n return self\n elif isinstance(self.node_def, GraphDefinition): # Node is graph-backed asset\n check.invariant(\n selected_asset_check_keys == self.check_keys,\n "Subsetting graph-backed assets with checks is not yet supported",\n )\n\n subsetted_node = self._subset_graph_backed_asset(\n asset_subselection,\n )\n\n # The subsetted node should only include asset inputs that are dependencies of the\n # selected set of assets.\n subsetted_input_names = [input_def.name for input_def in subsetted_node.input_defs]\n subsetted_keys_by_input_name = {\n key: value\n for key, value in self.node_keys_by_input_name.items()\n if key in subsetted_input_names\n }\n\n subsetted_output_names = [output_def.name for output_def in subsetted_node.output_defs]\n subsetted_keys_by_output_name = {\n key: value\n for key, value in self.node_keys_by_output_name.items()\n if key in subsetted_output_names\n }\n\n # An op within the graph-backed asset that yields multiple assets will be run\n # any time any of its output assets are selected. Thus, if an op yields multiple assets\n # and only one of them is selected, the op will still run and potentially unexpectedly\n # materialize the unselected asset.\n #\n # Thus, we include unselected assets that may be accidentally materialized in\n # keys_by_output_name and asset_deps so that the webserver can populate an warning when\n # this occurs. 
This is the same behavior as multi-asset subsetting.\n\n subsetted_asset_deps = {\n out_asset_key: set(self._keys_by_input_name.values())\n for out_asset_key in subsetted_keys_by_output_name.values()\n }\n\n replaced_attributes = dict(\n keys_by_input_name=subsetted_keys_by_input_name,\n keys_by_output_name=subsetted_keys_by_output_name,\n node_def=subsetted_node,\n asset_deps=subsetted_asset_deps,\n selected_asset_keys=selected_asset_keys & self.keys,\n )\n\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n else:\n # multi_asset subsetting\n replaced_attributes = {\n "selected_asset_keys": asset_subselection,\n "selected_asset_check_keys": asset_check_subselection,\n }\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n\n
[docs] @public\n def to_source_assets(self) -> Sequence[SourceAsset]:\n """Returns a SourceAsset for each asset in this definition.\n\n Each produced SourceAsset will have the same key, metadata, io_manager_key, etc. as the\n corresponding asset\n """\n return [\n self._output_to_source_asset(output_name)\n for output_name in self.keys_by_output_name.keys()\n ]
\n\n
[docs] @public\n def to_source_asset(self, key: Optional[CoercibleToAssetKey] = None) -> SourceAsset:\n """Returns a representation of this asset as a :py:class:`SourceAsset`.\n\n If this is a multi-asset, the "key" argument allows selecting which asset to return a\n SourceAsset representation of.\n\n Args:\n key (Optional[Union[str, Sequence[str], AssetKey]]]): If this is a multi-asset, select\n which asset to return a SourceAsset representation of. If not a multi-asset, this\n can be left as None.\n\n Returns:\n SourceAsset\n """\n if len(self.keys) > 1:\n check.invariant(\n key is not None,\n "The 'key' argument is required when there are multiple assets to choose from",\n )\n\n if key is not None:\n resolved_key = AssetKey.from_coercible(key)\n check.invariant(\n resolved_key in self.keys, f"Key {resolved_key} not found in AssetsDefinition"\n )\n else:\n resolved_key = self.key\n\n output_names = [\n output_name\n for output_name, ak in self.keys_by_output_name.items()\n if ak == resolved_key\n ]\n check.invariant(len(output_names) == 1)\n return self._output_to_source_asset(output_names[0])
\n\n def _output_to_source_asset(self, output_name: str) -> SourceAsset:\n with disable_dagster_warnings():\n output_def = self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0]\n key = self._keys_by_output_name[output_name]\n\n return SourceAsset(\n key=key,\n metadata=output_def.metadata,\n io_manager_key=output_def.io_manager_key,\n description=output_def.description,\n resource_defs=self.resource_defs,\n partitions_def=self.partitions_def,\n group_name=self.group_names_by_key[key],\n )\n\n def get_io_manager_key_for_asset_key(self, key: AssetKey) -> str:\n output_name = self.get_output_name_for_asset_key(key)\n return self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0].io_manager_key\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n yield from self.node_def.get_resource_requirements() # type: ignore[attr-defined]\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this AssetsDefinition."""\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n def __str__(self):\n if len(self.keys) == 1:\n return f"AssetsDefinition with key {self.key.to_string()}"\n else:\n asset_keys = ", ".join(sorted(([asset_key.to_string() for asset_key in self.keys])))\n return f"AssetsDefinition with keys {asset_keys}"\n\n @property\n def unique_id(self) -> str:\n """A unique identifier for the AssetsDefinition that's stable across processes."""\n return hashlib.md5((json.dumps(sorted(self.keys))).encode("utf-8")).hexdigest()\n\n def with_resources(self, resource_defs: Mapping[str, ResourceDefinition]) -> "AssetsDefinition":\n attributes_dict = self.get_attributes_dict()\n attributes_dict["resource_defs"] = 
merge_resource_defs(\n old_resource_defs=self.resource_defs,\n resource_defs_to_merge_in=resource_defs,\n requires_resources=self,\n )\n return self.__class__(**attributes_dict)\n\n def get_attributes_dict(self) -> Dict[str, Any]:\n return dict(\n keys_by_input_name=self._keys_by_input_name,\n keys_by_output_name=self._keys_by_output_name,\n node_def=self._node_def,\n partitions_def=self._partitions_def,\n partition_mappings=self._partition_mappings,\n asset_deps=self.asset_deps,\n selected_asset_keys=self._selected_asset_keys,\n can_subset=self._can_subset,\n resource_defs=self._resource_defs,\n group_names_by_key=self._group_names_by_key,\n metadata_by_key=self._metadata_by_key,\n freshness_policies_by_key=self._freshness_policies_by_key,\n auto_materialize_policies_by_key=self._auto_materialize_policies_by_key,\n backfill_policy=self._backfill_policy,\n descriptions_by_key=self._descriptions_by_key,\n check_specs_by_output_name=self._check_specs_by_output_name,\n selected_asset_check_keys=self._selected_asset_check_keys,\n )
\n\n\ndef _infer_keys_by_input_names(\n node_def: Union["GraphDefinition", OpDefinition], keys_by_input_name: Mapping[str, AssetKey]\n) -> Mapping[str, AssetKey]:\n all_input_names = [input_def.name for input_def in node_def.input_defs]\n if keys_by_input_name:\n check.invariant(\n set(keys_by_input_name.keys()) == set(all_input_names),\n "The set of input names keys specified in the keys_by_input_name argument must "\n f"equal the set of asset keys inputted by '{node_def.name}'. \\n"\n f"keys_by_input_name keys: {set(keys_by_input_name.keys())} \\n"\n f"expected keys: {all_input_names}",\n )\n\n # If asset key is not supplied in keys_by_input_name, create asset key\n # from input name\n inferred_input_names_by_asset_key: Dict[str, AssetKey] = {\n input_name: keys_by_input_name.get(input_name, AssetKey([input_name]))\n for input_name in all_input_names\n }\n\n return inferred_input_names_by_asset_key\n\n\ndef _infer_keys_by_output_names(\n node_def: Union["GraphDefinition", OpDefinition],\n keys_by_output_name: Mapping[str, AssetKey],\n check_specs_by_output_name: Mapping[str, AssetCheckSpec],\n) -> Mapping[str, AssetKey]:\n output_names = [output_def.name for output_def in node_def.output_defs]\n if keys_by_output_name:\n overlapping_asset_and_check_outputs = set(keys_by_output_name.keys()) & set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n not overlapping_asset_and_check_outputs,\n "The set of output names associated with asset keys and checks overlap:"\n f" {overlapping_asset_and_check_outputs}",\n )\n\n union_asset_and_check_outputs = set(keys_by_output_name.keys()) | set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n union_asset_and_check_outputs == set(output_names),\n "The union of the set of output names keys specified in the keys_by_output_name and"\n " check_specs_by_output_name arguments must equal the set of asset keys outputted by"\n f" {node_def.name}. 
union keys:"\n f" {union_asset_and_check_outputs} \\nexpected keys: {set(output_names)}",\n )\n\n inferred_keys_by_output_names: Dict[str, AssetKey] = {\n output_name: asset_key for output_name, asset_key in keys_by_output_name.items()\n }\n\n if (\n len(output_names) == 1\n and output_names[0] not in keys_by_output_name\n and output_names[0] not in check_specs_by_output_name\n and output_names[0] == "result"\n ):\n # If there is only one output and the name is the default "result", generate asset key\n # from the name of the node\n inferred_keys_by_output_names[output_names[0]] = AssetKey([node_def.name])\n\n for output_name in output_names:\n if (\n output_name not in inferred_keys_by_output_names\n and output_name not in check_specs_by_output_name\n ):\n inferred_keys_by_output_names[output_name] = AssetKey([output_name])\n return inferred_keys_by_output_names\n\n\ndef _validate_graph_def(graph_def: "GraphDefinition", prefix: Optional[Sequence[str]] = None):\n """Ensure that all leaf nodes are mapped to graph outputs."""\n from dagster._core.definitions.graph_definition import GraphDefinition, create_adjacency_lists\n\n prefix = check.opt_sequence_param(prefix, "prefix")\n\n # recursively validate any sub-graphs\n for inner_node_def in graph_def.node_defs:\n if isinstance(inner_node_def, GraphDefinition):\n _validate_graph_def(inner_node_def, prefix=[*prefix, graph_def.name])\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph_def.nodes, graph_def.dependency_structure)\n leaf_nodes = {\n node_name for node_name, downstream_nodes in forward_edges.items() if not downstream_nodes\n }\n\n # set of nodes that have outputs mapped to a graph output\n mapped_output_nodes = {\n output_mapping.maps_from.node_name for output_mapping in graph_def.output_mappings\n }\n\n # leaf nodes which do not have an associated mapped output\n unmapped_leaf_nodes = {".".join([*prefix, node]) for node in leaf_nodes - mapped_output_nodes}\n\n 
check.invariant(\n not unmapped_leaf_nodes,\n f"All leaf nodes within graph '{graph_def.name}' must generate outputs which are mapped"\n " to outputs of the graph, and produce assets. The following leaf node(s) are"\n f" non-asset producing ops: {unmapped_leaf_nodes}. This behavior is not currently"\n " supported because these ops are not required for the creation of the associated"\n " asset(s).",\n )\n\n\ndef _validate_self_deps(\n input_keys: Iterable[AssetKey],\n output_keys: Iterable[AssetKey],\n partition_mappings: Mapping[AssetKey, PartitionMapping],\n partitions_def: Optional[PartitionsDefinition],\n) -> None:\n output_keys_set = set(output_keys)\n for input_key in input_keys:\n if input_key in output_keys_set:\n if input_key in partition_mappings:\n partition_mapping = partition_mappings[input_key]\n time_window_partition_mapping = get_self_dep_time_window_partition_mapping(\n partition_mapping, partitions_def\n )\n if (\n time_window_partition_mapping is not None\n and (time_window_partition_mapping.start_offset or 0) < 0\n and (time_window_partition_mapping.end_offset or 0) < 0\n ):\n continue\n\n raise DagsterInvalidDefinitionError(\n f'Asset "{input_key.to_user_string()}" depends on itself. 
Assets can only depend'\n " on themselves if they are:\\n(a) time-partitioned and each partition depends on"\n " earlier partitions\\n(b) multipartitioned, with one time dimension that depends"\n " on earlier time partitions"\n )\n\n\ndef get_self_dep_time_window_partition_mapping(\n partition_mapping: Optional[PartitionMapping], partitions_def: Optional[PartitionsDefinition]\n) -> Optional[TimeWindowPartitionMapping]:\n """Returns a time window partition mapping dimension of the provided partition mapping,\n if exists.\n """\n if isinstance(partition_mapping, TimeWindowPartitionMapping):\n return partition_mapping\n elif isinstance(partition_mapping, MultiPartitionMapping):\n if not isinstance(partitions_def, MultiPartitionsDefinition):\n return None\n\n time_partition_mapping = partition_mapping.downstream_mappings_by_upstream_dimension.get(\n partitions_def.time_window_dimension.name\n )\n\n if time_partition_mapping is None or not isinstance(\n time_partition_mapping.partition_mapping, TimeWindowPartitionMapping\n ):\n return None\n\n return time_partition_mapping.partition_mapping\n return None\n
", "current_page_name": "_modules/dagster/_core/definitions/assets", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.assets"}, "auto_materialize_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_policy

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, AbstractSet, Dict, FrozenSet, NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.auto_materialize_rule import (\n        AutoMaterializeRule,\n        AutoMaterializeRuleSnapshot,\n    )\n\n\nclass AutoMaterializePolicySerializer(NamedTupleSerializer):\n    def before_unpack(\n        self, context: UnpackContext, unpacked_dict: Dict[str, UnpackedValue]\n    ) -> Dict[str, UnpackedValue]:\n        from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n        backcompat_map = {\n            "on_missing": AutoMaterializeRule.materialize_on_missing(),\n            "on_new_parent_data": AutoMaterializeRule.materialize_on_parent_updated(),\n            "for_freshness": AutoMaterializeRule.materialize_on_required_for_freshness(),\n        }\n\n        # determine if this namedtuple was serialized with the old format (booleans for rules)\n        if any(backcompat_key in unpacked_dict for backcompat_key in backcompat_map):\n            # all old policies had these rules by default\n            rules = {\n                AutoMaterializeRule.skip_on_parent_outdated(),\n                AutoMaterializeRule.skip_on_parent_missing(),\n            }\n            for backcompat_key, rule in backcompat_map.items():\n                if unpacked_dict.get(backcompat_key):\n                    rules.add(rule)\n            unpacked_dict["rules"] = frozenset(rules)\n\n        return unpacked_dict\n\n\nclass AutoMaterializePolicyType(Enum):\n    EAGER = "EAGER"\n    LAZY = "LAZY"\n\n\n
[docs]@experimental\n@whitelist_for_serdes(\n old_fields={"time_window_partition_scope_minutes": 1e-6},\n serializer=AutoMaterializePolicySerializer,\n)\nclass AutoMaterializePolicy(\n NamedTuple(\n "_AutoMaterializePolicy",\n [\n ("rules", FrozenSet["AutoMaterializeRule"]),\n ("max_materializations_per_minute", Optional[int]),\n ],\n )\n):\n """An AutoMaterializePolicy specifies how Dagster should attempt to keep an asset up-to-date.\n\n Each policy consists of a set of AutoMaterializeRules, which are used to determine whether an\n asset or a partition of an asset should or should not be auto-materialized.\n\n The most common policy is `AutoMaterializePolicy.eager()`, which consists of the following rules:\n\n - `AutoMaterializeRule.materialize_on_missing()`\n Materialize an asset or a partition if it has never been materialized.\n - `AutoMaterializeRule.materialize_on_parent_updated()`\n Materialize an asset or a partition if one of its parents have been updated more recently\n than it has.\n - `AutoMaterializeRule.materialize_on_required_for_freshness()`\n Materialize an asset or a partition if it is required to satisfy a freshness policy.\n - `AutoMaterializeRule.skip_on_parent_outdated()`\n Skip materializing an asset or partition if any of its parents have ancestors that have\n been materialized more recently.\n - `AutoMaterializeRule.skip_on_parent_missing()`\n Skip materializing an asset or a partition if any parent has never been materialized or\n observed.\n\n Policies can be customized by adding or removing rules. For example, if you'd like to allow\n an asset to be materialized even if some of its parent partitions are missing:\n\n .. code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().without_rules(\n AutoMaterializeRule.skip_on_parent_missing(),\n )\n\n If you'd like an asset to wait for all of its parents to be updated before materializing:\n\n .. 
code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().with_rules(\n AutoMaterializeRule.skip_on_not_all_parents_updated(),\n )\n\n Lastly, the `max_materializations_per_minute` parameter, which is set to 1 by default,\n rate-limits the number of auto-materializations that can occur for a particular asset within\n a short time interval. This mainly matters for partitioned assets. Its purpose is to provide a\n safeguard against "surprise backfills", where user-error causes auto-materialize to be\n accidentally triggered for large numbers of partitions at once.\n\n **Warning:**\n\n Constructing an AutoMaterializePolicy directly is not recommended as the API is subject to change.\n AutoMaterializePolicy.eager() and AutoMaterializePolicy.lazy() are the recommended API.\n\n """\n\n def __new__(\n cls,\n rules: AbstractSet["AutoMaterializeRule"],\n max_materializations_per_minute: Optional[int] = 1,\n ):\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n check.invariant(\n max_materializations_per_minute is None or max_materializations_per_minute > 0,\n "max_materializations_per_minute must be positive. To disable rate-limiting, set it"\n " to None. 
To disable auto materializing, remove the policy.",\n )\n\n return super(AutoMaterializePolicy, cls).__new__(\n cls,\n rules=frozenset(check.set_param(rules, "rules", of_type=AutoMaterializeRule)),\n max_materializations_per_minute=max_materializations_per_minute,\n )\n\n @property\n def materialize_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule\n for rule in self.rules\n if rule.decision_type == AutoMaterializeDecisionType.MATERIALIZE\n }\n\n @property\n def skip_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule for rule in self.rules if rule.decision_type == AutoMaterializeDecisionType.SKIP\n }\n\n
[docs] @public\n @staticmethod\n def eager(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs an eager AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_missing(),\n AutoMaterializeRule.materialize_on_parent_updated(),\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n @staticmethod\n def lazy(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs a lazy AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n def without_rules(self, *rules_to_remove: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules removed. Raises an error\n if any of the arguments are not rules in this policy.\n """\n non_matching_rules = set(rules_to_remove).difference(self.rules)\n check.param_invariant(\n not non_matching_rules,\n "rules_to_remove",\n f"Rules {[rule for rule in rules_to_remove if rule in non_matching_rules]} do not"\n " exist in this policy.",\n )\n return self._replace(\n rules=self.rules.difference(set(rules_to_remove)),\n )
\n\n
[docs] @public\n def with_rules(self, *rules_to_add: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules added."""\n return self._replace(rules=self.rules.union(set(rules_to_add)))
\n\n @property\n def policy_type(self) -> AutoMaterializePolicyType:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n if AutoMaterializeRule.materialize_on_parent_updated() in self.rules:\n return AutoMaterializePolicyType.EAGER\n return AutoMaterializePolicyType.LAZY\n\n @property\n def rule_snapshots(self) -> Sequence["AutoMaterializeRuleSnapshot"]:\n return [rule.to_snapshot() for rule in self.rules]
\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_policy"}, "auto_materialize_rule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_rule

\nimport datetime\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Dict,\n    FrozenSet,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey, AssetKeyPartitionKey\nfrom dagster._core.definitions.freshness_based_auto_materialize import (\n    freshness_evaluation_results_for_asset_key,\n)\nfrom dagster._core.definitions.partition_mapping import IdentityPartitionMapping\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    WhitelistMap,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.caching_instance_queryer import CachingInstanceQueryer\n\nfrom .asset_graph import AssetGraph, sort_key_for_asset_partition\nfrom .partition import SerializedPartitionsSubset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_daemon_context import AssetDaemonContext\n    from dagster._core.definitions.asset_daemon_cursor import AssetDaemonCursor\n    from dagster._core.instance import DynamicPartitionsStore\n\n\n@whitelist_for_serdes\nclass AutoMaterializeDecisionType(Enum):\n    """Represents the set of results of the auto-materialize logic.\n\n    MATERIALIZE: The asset should be materialized by a run kicked off on this tick\n    SKIP: The asset should not be materialized by a run kicked off on this tick, because future\n        ticks are expected to materialize it.\n    DISCARD: The asset should not be materialized by a run kicked off on this tick, but future\n        ticks are not expected to materialize it.\n    """\n\n    MATERIALIZE = 
"MATERIALIZE"\n    SKIP = "SKIP"\n    DISCARD = "DISCARD"\n\n\nclass AutoMaterializeRuleEvaluationData(ABC):\n    pass\n\n\n@whitelist_for_serdes\nclass TextRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple("_TextRuleEvaluationData", [("text", str)]),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass ParentUpdatedRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_ParentUpdatedRuleEvaluationData",\n        [\n            ("updated_asset_keys", FrozenSet[AssetKey]),\n            ("will_update_asset_keys", FrozenSet[AssetKey]),\n        ],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass WaitingOnAssetsRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_WaitingOnParentRuleEvaluationData",\n        [("waiting_on_asset_keys", FrozenSet[AssetKey])],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleSnapshot(NamedTuple):\n    """A serializable snapshot of an AutoMaterializeRule for historical evaluations."""\n\n    class_name: str\n    description: str\n    decision_type: AutoMaterializeDecisionType\n\n    @staticmethod\n    def from_rule(rule: "AutoMaterializeRule") -> "AutoMaterializeRuleSnapshot":\n        return AutoMaterializeRuleSnapshot(\n            class_name=rule.__class__.__name__,\n            description=rule.description,\n            decision_type=rule.decision_type,\n        )\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleEvaluation(NamedTuple):\n    rule_snapshot: AutoMaterializeRuleSnapshot\n    evaluation_data: Optional[AutoMaterializeRuleEvaluationData]\n\n\nclass RuleEvaluationContext(NamedTuple):\n    asset_key: AssetKey\n    cursor: "AssetDaemonCursor"\n    instance_queryer: CachingInstanceQueryer\n    data_time_resolver: CachingDataTimeResolver\n    will_materialize_mapping: Mapping[AssetKey, AbstractSet[AssetKeyPartitionKey]]\n    expected_data_time_mapping: Mapping[AssetKey, Optional[datetime.datetime]]\n    
candidates: AbstractSet[AssetKeyPartitionKey]\n    daemon_context: "AssetDaemonContext"\n\n    @property\n    def asset_graph(self) -> AssetGraph:\n        return self.instance_queryer.asset_graph\n\n    def materializable_in_same_run(self, child_key: AssetKey, parent_key: AssetKey) -> bool:\n        """Returns whether a child asset can be materialized in the same run as a parent asset."""\n        from dagster._core.definitions.external_asset_graph import ExternalAssetGraph\n\n        return (\n            # both assets must be materializable\n            child_key in self.asset_graph.materializable_asset_keys\n            and parent_key in self.asset_graph.materializable_asset_keys\n            # the parent must have the same partitioning\n            and self.asset_graph.have_same_partitioning(child_key, parent_key)\n            # the parent must have a simple partition mapping to the child\n            and (\n                not self.asset_graph.is_partitioned(parent_key)\n                or isinstance(\n                    self.asset_graph.get_partition_mapping(child_key, parent_key),\n                    (TimeWindowPartitionMapping, IdentityPartitionMapping),\n                )\n            )\n            # the parent must be in the same repository to be materialized alongside the candidate\n            and (\n                not isinstance(self.asset_graph, ExternalAssetGraph)\n                or self.asset_graph.get_repository_handle(child_key)\n                == self.asset_graph.get_repository_handle(parent_key)\n            )\n        )\n\n    def get_parents_that_will_not_be_materialized_on_current_tick(\n        self, *, asset_partition: AssetKeyPartitionKey\n    ) -> AbstractSet[AssetKeyPartitionKey]:\n        """Returns the set of parent asset partitions that will not be updated in the same run of\n        this asset partition if we launch a run of this asset partition on this tick.\n        """\n        return {\n            parent\n            for 
parent in self.asset_graph.get_parents_partitions(\n                dynamic_partitions_store=self.instance_queryer,\n                current_time=self.instance_queryer.evaluation_time,\n                asset_key=asset_partition.asset_key,\n                partition_key=asset_partition.partition_key,\n            ).parent_partitions\n            if parent not in self.will_materialize_mapping.get(parent.asset_key, set())\n            or not self.materializable_in_same_run(asset_partition.asset_key, parent.asset_key)\n        }\n\n    def get_asset_partitions_by_asset_key(\n        self,\n        asset_partitions: AbstractSet[AssetKeyPartitionKey],\n    ) -> Mapping[AssetKey, Set[AssetKeyPartitionKey]]:\n        asset_partitions_by_asset_key: Dict[AssetKey, Set[AssetKeyPartitionKey]] = defaultdict(set)\n        for parent in asset_partitions:\n            asset_partitions_by_asset_key[parent.asset_key].add(parent)\n\n        return asset_partitions_by_asset_key\n\n\nRuleEvaluationResults = Sequence[Tuple[Optional[AutoMaterializeRuleEvaluationData], AbstractSet]]\n\n\n
[docs]class AutoMaterializeRule(ABC):\n """An AutoMaterializeRule defines a bit of logic which helps determine if a materialization\n should be kicked off for a given asset partition.\n\n Each rule can have one of two decision types, `MATERIALIZE` (indicating that an asset partition\n should be materialized) or `SKIP` (indicating that the asset partition should not be\n materialized).\n\n Materialize rules are evaluated first, and skip rules operate over the set of candidates that\n are produced by the materialize rules. Other than that, there is no ordering between rules.\n """\n\n @abstractproperty\n def decision_type(self) -> AutoMaterializeDecisionType:\n """The decision type of the rule (either `MATERIALIZE` or `SKIP`)."""\n ...\n\n @abstractproperty\n def description(self) -> str:\n """A human-readable description of this rule. As a basic guideline, this string should\n complete the sentence: 'Indicates an asset should be (materialized/skipped) when ____'.\n """\n ...\n\n @abstractmethod\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """The core evaluation function for the rule. This function takes in a context object and\n returns a mapping from evaluated rules to the set of asset partitions that the rule applies\n to.\n """\n ...\n\n
[docs] @public\n @staticmethod\n def materialize_on_required_for_freshness() -> "MaterializeOnRequiredForFreshnessRule":\n """Materialize an asset partition if it is required to satisfy a freshness policy of this\n asset or one of its downstream assets.\n\n Note: This rule has no effect on partitioned assets.\n """\n return MaterializeOnRequiredForFreshnessRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_parent_updated() -> "MaterializeOnParentUpdatedRule":\n """Materialize an asset partition if one of its parents has been updated more recently\n than it has.\n\n Note: For time-partitioned or dynamic-partitioned assets downstream of an unpartitioned\n asset, this rule will only fire for the most recent partition of the downstream.\n """\n return MaterializeOnParentUpdatedRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_missing() -> "MaterializeOnMissingRule":\n """Materialize an asset partition if it has never been materialized before. This rule will\n not fire for non-root assets unless that asset's parents have been updated.\n """\n return MaterializeOnMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_missing() -> "SkipOnParentMissingRule":\n """Skip materializing an asset partition if one of its parent asset partitions has never\n been materialized (for regular assets) or observed (for observable source assets).\n """\n return SkipOnParentMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_outdated() -> "SkipOnParentOutdatedRule":\n """Skip materializing an asset partition if any of its parents has not incorporated the\n latest data from its ancestors.\n """\n return SkipOnParentOutdatedRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_not_all_parents_updated(\n require_update_for_all_parent_partitions: bool = False,\n ) -> "SkipOnNotAllParentsUpdatedRule":\n """Skip materializing an asset partition if any of its parents have not been updated since\n the asset's last materialization.\n\n Args:\n require_update_for_all_parent_partitions (bool): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n return SkipOnNotAllParentsUpdatedRule(require_update_for_all_parent_partitions)
\n\n def to_snapshot(self) -> AutoMaterializeRuleSnapshot:\n """Returns a serializable snapshot of this rule for historical evaluations."""\n return AutoMaterializeRuleSnapshot.from_rule(self)\n\n def __eq__(self, other) -> bool:\n # override the default NamedTuple __eq__ method to factor in types\n return type(self) == type(other) and super().__eq__(other)\n\n def __hash__(self) -> int:\n # override the default NamedTuple __hash__ method to factor in types\n return hash(hash(type(self)) + super().__hash__())
\n\n\n@whitelist_for_serdes\nclass MaterializeOnRequiredForFreshnessRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnRequiredForFreshnessRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "required to meet this or downstream asset's freshness policy"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n freshness_conditions = freshness_evaluation_results_for_asset_key(\n asset_key=context.asset_key,\n data_time_resolver=context.data_time_resolver,\n asset_graph=context.asset_graph,\n current_time=context.instance_queryer.evaluation_time,\n will_materialize_mapping=context.will_materialize_mapping,\n expected_data_time_mapping=context.expected_data_time_mapping,\n )\n return freshness_conditions\n\n\n@whitelist_for_serdes\nclass MaterializeOnParentUpdatedRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnParentUpdatedRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "upstream data has changed since latest materialization"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions of this asset whose parents have been updated,\n or will update on this tick.\n """\n conditions = defaultdict(set)\n has_parents_that_will_update = set()\n\n # first, get the set of parents that will be materialized this tick, and see if we\n # can materialize this asset with those parents\n will_update_parents_by_asset_partition = defaultdict(set)\n for parent_key in context.asset_graph.get_parents(context.asset_key):\n if not context.materializable_in_same_run(context.asset_key, parent_key):\n continue\n for parent_partition in context.will_materialize_mapping.get(parent_key, set()):\n 
asset_partition = AssetKeyPartitionKey(\n context.asset_key, parent_partition.partition_key\n )\n will_update_parents_by_asset_partition[asset_partition].add(parent_key)\n has_parents_that_will_update.add(asset_partition)\n\n # next, for each asset partition of this asset which has newly-updated parents, or\n # has a parent that will update, create a ParentUpdatedRuleEvaluationData\n has_or_will_update = (\n context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n )\n | has_parents_that_will_update\n )\n for asset_partition in has_or_will_update:\n parent_asset_partitions = context.asset_graph.get_parents_partitions(\n dynamic_partitions_store=context.instance_queryer,\n current_time=context.instance_queryer.evaluation_time,\n asset_key=asset_partition.asset_key,\n partition_key=asset_partition.partition_key,\n ).parent_partitions\n\n updated_parent_asset_partitions = context.instance_queryer.get_updated_parent_asset_partitions(\n asset_partition,\n parent_asset_partitions,\n # do a precise check for updated parents, factoring in data versions, as long as\n # we're within reasonable limits on the number of partitions to check\n respect_materialization_data_versions=context.daemon_context.respect_materialization_data_versions\n and len(parent_asset_partitions | has_or_will_update) < 100,\n # ignore self-dependencies when checking for updated parents, to avoid historical\n # rematerializations from causing a chain of materializations to be kicked off\n ignored_parent_keys={context.asset_key},\n )\n updated_parents = {parent.asset_key for parent in updated_parent_asset_partitions}\n will_update_parents = will_update_parents_by_asset_partition[asset_partition]\n\n if updated_parents or will_update_parents:\n conditions[\n ParentUpdatedRuleEvaluationData(\n updated_asset_keys=frozenset(updated_parents),\n will_update_asset_keys=frozenset(will_update_parents),\n )\n ].add(asset_partition)\n if conditions:\n return [(k, v) for 
k, v in conditions.items()]\n return []\n\n\n@whitelist_for_serdes\nclass MaterializeOnMissingRule(AutoMaterializeRule, NamedTuple("_MaterializeOnMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "materialization is missing"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions for this asset which are missing and were not\n previously discarded. Currently only applies to root asset partitions and asset partitions\n with updated parents.\n """\n missing_asset_partitions = (\n context.daemon_context.get_never_handled_root_asset_partitions_for_key(\n context.asset_key\n )\n )\n # in addition to missing root asset partitions, check any asset partitions with updated\n # parents to see if they're missing\n for (\n candidate\n ) in context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n ):\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n candidate\n ):\n missing_asset_partitions |= {candidate}\n if missing_asset_partitions:\n return [(None, missing_asset_partitions)]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentOutdatedRule(AutoMaterializeRule, NamedTuple("_SkipOnParentOutdatedRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be up to date"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n unreconciled_ancestors = set()\n # find the root cause of why this asset partition's parents are outdated (if any)\n for parent in 
context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n unreconciled_ancestors.update(\n context.instance_queryer.get_root_unreconciled_ancestors(\n asset_partition=parent,\n )\n )\n if unreconciled_ancestors:\n asset_partitions_by_waiting_on_asset_keys[frozenset(unreconciled_ancestors)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentMissingRule(AutoMaterializeRule, NamedTuple("_SkipOnParentMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be present"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n missing_parent_asset_keys = set()\n for parent in context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n # ignore non-observable sources, which will never have a materialization or observation\n if context.asset_graph.is_source(\n parent.asset_key\n ) and not context.asset_graph.is_observable(parent.asset_key):\n continue\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n parent\n ):\n missing_parent_asset_keys.add(parent.asset_key)\n if missing_parent_asset_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(missing_parent_asset_keys)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass 
SkipOnNotAllParentsUpdatedRule(\n AutoMaterializeRule,\n NamedTuple(\n "_SkipOnNotAllParentsUpdatedRule", [("require_update_for_all_parent_partitions", bool)]\n ),\n):\n """An auto-materialize rule that enforces that an asset can only be materialized if all parents\n have been materialized since the asset's last materialization.\n\n Attributes:\n require_update_for_all_parent_partitions (Optional[bool]): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n if self.require_update_for_all_parent_partitions is False:\n return "waiting on upstream data to be updated"\n else:\n return "waiting until all upstream partitions are updated"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n parent_partitions = context.asset_graph.get_parents_partitions(\n context.instance_queryer,\n context.instance_queryer.evaluation_time,\n context.asset_key,\n candidate.partition_key,\n ).parent_partitions\n\n updated_parent_partitions = (\n context.instance_queryer.get_updated_parent_asset_partitions(\n candidate,\n parent_partitions,\n context.daemon_context.respect_materialization_data_versions,\n ignored_parent_keys=set(),\n )\n | set().union(\n *[\n context.will_materialize_mapping.get(parent, set())\n for parent in 
context.asset_graph.get_parents(context.asset_key)\n ]\n )\n )\n\n if self.require_update_for_all_parent_partitions:\n # All upstream partitions must be updated in order for the candidate to be updated\n non_updated_parent_keys = {\n parent.asset_key for parent in parent_partitions - updated_parent_partitions\n }\n else:\n # At least one upstream partition in each upstream asset must be updated in order\n # for the candidate to be updated\n parent_asset_keys = context.asset_graph.get_parents(context.asset_key)\n updated_parent_partitions_by_asset_key = context.get_asset_partitions_by_asset_key(\n updated_parent_partitions\n )\n non_updated_parent_keys = {\n parent\n for parent in parent_asset_keys\n if not updated_parent_partitions_by_asset_key.get(parent)\n }\n\n # do not require past partitions of this asset to be updated\n non_updated_parent_keys -= {context.asset_key}\n\n if non_updated_parent_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(non_updated_parent_keys)].add(\n candidate\n )\n\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass DiscardOnMaxMaterializationsExceededRule(\n AutoMaterializeRule, NamedTuple("_DiscardOnMaxMaterializationsExceededRule", [("limit", int)])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.DISCARD\n\n @property\n def description(self) -> str:\n return f"exceeds {self.limit} materialization(s) per minute"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n # the set of asset partitions which exceed the limit\n rate_limited_asset_partitions = set(\n sorted(\n context.candidates,\n key=lambda x: sort_key_for_asset_partition(context.asset_graph, x),\n )[self.limit :]\n )\n if rate_limited_asset_partitions:\n return [(None, 
rate_limited_asset_partitions)]\n return []\n\n\n@whitelist_for_serdes\nclass AutoMaterializeAssetEvaluation(NamedTuple):\n """Represents the results of the auto-materialize logic for a single asset.\n\n Properties:\n asset_key (AssetKey): The asset key that was evaluated.\n partition_subsets_by_condition: The rule evaluations that impact if the asset should be\n materialized, skipped, or discarded. If the asset is partitioned, this will be a list of\n tuples, where the first element is the condition and the second element is the\n serialized subset of partitions that the condition applies to. If it's not partitioned,\n the second element will be None.\n """\n\n asset_key: AssetKey\n partition_subsets_by_condition: Sequence[\n Tuple["AutoMaterializeRuleEvaluation", Optional[SerializedPartitionsSubset]]\n ]\n num_requested: int\n num_skipped: int\n num_discarded: int\n run_ids: Set[str] = set()\n rule_snapshots: Optional[Sequence[AutoMaterializeRuleSnapshot]] = None\n\n @staticmethod\n def from_rule_evaluation_results(\n asset_graph: AssetGraph,\n asset_key: AssetKey,\n asset_partitions_by_rule_evaluation: Sequence[\n Tuple[AutoMaterializeRuleEvaluation, AbstractSet[AssetKeyPartitionKey]]\n ],\n num_requested: int,\n num_skipped: int,\n num_discarded: int,\n dynamic_partitions_store: "DynamicPartitionsStore",\n ) -> "AutoMaterializeAssetEvaluation":\n auto_materialize_policy = asset_graph.auto_materialize_policies_by_key.get(asset_key)\n\n if not auto_materialize_policy:\n check.failed(f"Expected auto materialize policy on asset {asset_key}")\n\n partitions_def = asset_graph.get_partitions_def(asset_key)\n if partitions_def is None:\n return AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (rule_evaluation, None)\n for rule_evaluation, _ in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n 
rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n else:\n return AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (\n rule_evaluation,\n SerializedPartitionsSubset.from_subset(\n subset=partitions_def.empty_subset().with_partition_keys(\n check.not_none(ap.partition_key) for ap in asset_partitions\n ),\n partitions_def=partitions_def,\n dynamic_partitions_store=dynamic_partitions_store,\n ),\n )\n for rule_evaluation, asset_partitions in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n\n\n# BACKCOMPAT GRAVEYARD\n\n\nclass BackcompatAutoMaterializeConditionSerializer(NamedTupleSerializer):\n """This handles backcompat for the old AutoMaterializeCondition objects, turning them into the\n proper AutoMaterializeRuleEvaluation objects. This is necessary because old\n AutoMaterializeAssetEvaluation objects will have serialized AutoMaterializeCondition objects,\n and we need to be able to deserialize them.\n\n In theory, as these serialized objects happen to be purged periodically, we can remove this\n backcompat logic at some point in the future.\n """\n\n def unpack(\n self,\n unpacked_dict: Dict[str, UnpackedValue],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> AutoMaterializeRuleEvaluation:\n if self.klass in (\n FreshnessAutoMaterializeCondition,\n DownstreamFreshnessAutoMaterializeCondition,\n ):\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_required_for_freshness().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == MissingAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_missing().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == ParentMaterializedAutoMaterializeCondition:\n updated_asset_keys = 
unpacked_dict.get("updated_asset_keys")\n if isinstance(updated_asset_keys, set):\n updated_asset_keys = cast(FrozenSet[AssetKey], frozenset(updated_asset_keys))\n else:\n updated_asset_keys = frozenset()\n will_update_asset_keys = unpacked_dict.get("will_update_asset_keys")\n if isinstance(will_update_asset_keys, set):\n will_update_asset_keys = cast(\n FrozenSet[AssetKey], frozenset(will_update_asset_keys)\n )\n else:\n will_update_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_parent_updated().to_snapshot(),\n evaluation_data=ParentUpdatedRuleEvaluationData(\n updated_asset_keys=updated_asset_keys,\n will_update_asset_keys=will_update_asset_keys,\n ),\n )\n elif self.klass == ParentOutdatedAutoMaterializeCondition:\n waiting_on_asset_keys = unpacked_dict.get("waiting_on_asset_keys")\n if isinstance(waiting_on_asset_keys, set):\n waiting_on_asset_keys = cast(FrozenSet[AssetKey], frozenset(waiting_on_asset_keys))\n else:\n waiting_on_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.skip_on_parent_outdated().to_snapshot(),\n evaluation_data=WaitingOnAssetsRuleEvaluationData(\n waiting_on_asset_keys=waiting_on_asset_keys\n ),\n )\n elif self.klass == MaxMaterializationsExceededAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=DiscardOnMaxMaterializationsExceededRule(limit=1).to_snapshot(),\n evaluation_data=None,\n )\n check.failed(f"Unexpected class {self.klass}")\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass FreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass DownstreamFreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentMaterializedAutoMaterializeCondition(NamedTuple): 
...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass MissingAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentOutdatedAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass MaxMaterializationsExceededAutoMaterializeCondition(NamedTuple): ...\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_rule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_rule"}, "backfill_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.backfill_policy

\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass BackfillPolicyType(Enum):\n    SINGLE_RUN = "SINGLE_RUN"\n    MULTI_RUN = "MULTI_RUN"\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass BackfillPolicy(\n NamedTuple(\n "_BackfillPolicy",\n [\n ("max_partitions_per_run", Optional[int]),\n ],\n )\n):\n """A BackfillPolicy specifies how Dagster should attempt to backfill a partitioned asset.\n\n There are two main kinds of backfill policies: single-run and multi-run.\n\n An asset with a single-run backfill policy will take a single run to backfill all of its\n partitions at once.\n\n An asset with a multi-run backfill policy will take multiple runs to backfill all of its\n partitions. Each run will backfill a subset of the partitions. The number of partitions to\n backfill in each run is controlled by the `max_partitions_per_run` parameter.\n\n For example:\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 10, then it will\n be backfilled in 10 runs; each run will backfill 10 partitions.\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 11, then it will\n be backfilled in 10 runs; the first 9 runs will backfill 11 partitions, and the last one run\n will backfill the remaining 9 partitions.\n\n **Warning:**\n\n Constructing an BackfillPolicy directly is not recommended as the API is subject to change.\n BackfillPolicy.single_run() and BackfillPolicy.multi_run(max_partitions_per_run=x) are the\n recommended APIs.\n """\n\n def __new__(cls, max_partitions_per_run: Optional[int] = 1):\n return super(BackfillPolicy, cls).__new__(\n cls,\n max_partitions_per_run=max_partitions_per_run,\n )\n\n
[docs] @public\n @staticmethod\n def single_run() -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in a single run."""\n return BackfillPolicy(max_partitions_per_run=None)
\n\n
[docs] @public\n @staticmethod\n def multi_run(max_partitions_per_run: int = 1) -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in multiple runs.\n Each run will backfill [max_partitions_per_run] number of partitions.\n\n Args:\n max_partitions_per_run (Optional[int]): The maximum number of partitions in each run of\n the multiple runs. Defaults to 1.\n """\n return BackfillPolicy(\n max_partitions_per_run=check.int_param(max_partitions_per_run, "max_partitions_per_run")\n )
\n\n @property\n def policy_type(self) -> BackfillPolicyType:\n if self.max_partitions_per_run:\n return BackfillPolicyType.MULTI_RUN\n else:\n return BackfillPolicyType.SINGLE_RUN
\n
", "current_page_name": "_modules/dagster/_core/definitions/backfill_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.backfill_policy"}, "config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.config

\nfrom typing import Any, Callable, Mapping, NamedTuple, Optional, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    ConfigType,\n    is_supported_config_python_builtin,\n    process_config,\n    resolve_defaults,\n    validate_config,\n)\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nConfigMappingFn: TypeAlias = Callable[[Any], Any]\n\n\ndef is_callable_valid_config_arg(config: Union[Callable[..., Any], Mapping[str, object]]) -> bool:\n    return BuiltinEnum.contains(config) or is_supported_config_python_builtin(config)\n\n\n
[docs]class ConfigMapping(\n NamedTuple(\n "_ConfigMapping",\n [\n ("config_fn", Callable[[Any], Any]),\n ("config_schema", IDefinitionConfigSchema),\n ("receive_processed_config_values", Optional[bool]),\n ],\n )\n):\n """Defines a config mapping for a graph (or job).\n\n By specifying a config mapping function, you can override the configuration for the child\n ops and graphs contained within a graph.\n\n Config mappings require the configuration schema to be specified as ``config_schema``, which will\n be exposed as the configuration schema for the graph, as well as a configuration mapping\n function, ``config_fn``, which maps the config provided to the graph to the config\n that will be provided to the child nodes.\n\n Args:\n config_fn (Callable[[dict], dict]): The function that will be called\n to map the graph config to a config appropriate for the child nodes.\n config_schema (ConfigSchema): The schema of the graph config.\n receive_processed_config_values (Optional[bool]): If true, config values provided to the config_fn\n will be converted to their dagster types before being passed in. 
For example, if this\n value is true, enum config passed to config_fn will be actual enums, while if false,\n then enum config passed to config_fn will be strings.\n """\n\n def __new__(\n cls,\n config_fn: ConfigMappingFn,\n config_schema: Optional[Any] = None,\n receive_processed_config_values: Optional[bool] = None,\n ):\n return super(ConfigMapping, cls).__new__(\n cls,\n config_fn=check.callable_param(config_fn, "config_fn"),\n config_schema=convert_user_facing_definition_config_schema(config_schema),\n receive_processed_config_values=check.opt_bool_param(\n receive_processed_config_values, "receive_processed_config_values"\n ),\n )\n\n def resolve_from_unvalidated_config(self, config: Any) -> Any:\n """Validates config against outer config schema, and calls mapping against validated config."""\n receive_processed_config_values = check.opt_bool_param(\n self.receive_processed_config_values, "receive_processed_config_values", default=True\n )\n if receive_processed_config_values:\n outer_evr = process_config(\n self.config_schema.config_type,\n config,\n )\n else:\n outer_evr = validate_config(\n self.config_schema.config_type,\n config,\n )\n if not outer_evr.success:\n raise DagsterInvalidConfigError(\n "Error in config mapping ",\n outer_evr.errors,\n config,\n )\n\n outer_config = outer_evr.value\n if not receive_processed_config_values:\n outer_config = resolve_defaults(\n cast(ConfigType, self.config_schema.config_type),\n outer_config,\n ).value\n\n return self.config_fn(outer_config)\n\n def resolve_from_validated_config(self, config: Any) -> Any:\n if self.receive_processed_config_values is not None:\n check.failed(\n "`receive_processed_config_values` parameter has been set, but only applies to "\n "unvalidated config."\n )\n\n return self.config_fn(config)
\n
", "current_page_name": "_modules/dagster/_core/definitions/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.config"}, "configurable": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.configurable

\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Callable, NamedTuple, Optional, Type, TypeVar, Union, cast\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    Field,\n    _check as check,\n)\nfrom dagster._config import EvaluateValueResult\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params\n\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    ConfiguredDefinitionConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\n\nclass ConfigurableDefinition(ABC):\n    @property\n    @abstractmethod\n    def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n        raise NotImplementedError()\n\n    @property\n    def has_config_field(self) -> bool:\n        return self.config_schema is not None and bool(self.config_schema.as_field())\n\n    @property\n    def config_field(self) -> Optional[Field]:\n        return None if not self.config_schema else self.config_schema.as_field()\n\n    # getter for typed access\n    def get_config_field(self) -> Field:\n        field = self.config_field\n        if field is None:\n            check.failed("Must check has_config_Field before calling get_config_field")\n        return field\n\n    def apply_config_mapping(self, config: Any) -> EvaluateValueResult:\n        """Applies user-provided config mapping functions to the given configuration and validates the\n        results against the respective config schema.\n\n        Expects incoming config to be validated and have fully-resolved values (StringSource values\n        resolved, Enum types hydrated, etc.) 
via process_config() during ResolvedRunConfig\n        construction and Graph config mapping.\n\n        Args:\n            config (Any): A validated and resolved configuration dictionary matching this object's\n            config_schema\n\n        Returns (EvaluateValueResult):\n            If successful, the value is a validated and resolved configuration dictionary for the\n            innermost wrapped object after applying the config mapping transformation function.\n        """\n        # If schema is on a mapped schema this is the innermost resource (base case),\n        # so we aren't responsible for validating against anything farther down.\n        # Returns an EVR for type consistency with config_mapping_fn.\n        return (\n            self.config_schema.resolve_config(config)\n            if isinstance(self.config_schema, ConfiguredDefinitionConfigSchema)\n            else EvaluateValueResult.for_value(config)\n        )\n\n\nclass AnonymousConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method not accept a name argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        config_schema: CoercableToConfigSchema = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  
In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self:\n        raise NotImplementedError()\n\n\nclass NamedConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method require a positional `name` argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        name: str,\n        config_schema: Optional[UserConfigSchema] = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this 
object's\n                config schema.  In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            name (str): Name of the new definition. This is a required argument, as this definition\n                type has a name uniqueness constraint.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        name = check.str_param(name, "name")\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(name, description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        name: str,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self: ...\n\n\ndef _check_configurable_param(configurable: ConfigurableDefinition) -> None:\n    from dagster._core.definitions.composition import PendingNodeInvocation\n\n    check.param_invariant(\n        not isinstance(configurable, PendingNodeInvocation),\n        "configurable",\n        "You have invoked `configured` on a PendingNodeInvocation (an intermediate type), which"\n        " is produced by aliasing or tagging a node definition. To configure a node, you must"\n        " call `configured` on either an OpDefinition and GraphDefinition. To fix"\n        " this error, make sure to call `configured` on the definition object *before* using"\n        " the `tag` or `alias` methods. 
For usage examples, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n    check.inst_param(\n        configurable,\n        "configurable",\n        ConfigurableDefinition,\n        "Only the following types can be used with the `configured` method: ResourceDefinition,"\n        " ExecutorDefinition, GraphDefinition, NodeDefinition, and LoggerDefinition."\n        " For usage examples of `configured`, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n\n\nT_Configurable = TypeVar(\n    "T_Configurable", bound=Union["AnonymousConfigurableDefinition", "NamedConfigurableDefinition"]\n)\n\n\nclass FunctionAndConfigSchema(NamedTuple):\n    function: Callable[[Any], Any]\n    config_schema: Optional[UserConfigSchema]\n\n\ndef _wrap_user_fn_if_pythonic_config(\n    user_fn: Any, config_schema: Optional[UserConfigSchema]\n) -> FunctionAndConfigSchema:\n    """Helper function which allows users to provide a Pythonic config object to a @configurable\n    function. Detects if the function has a single parameter annotated with a Config class.\n    If so, wraps the function to convert the config dictionary into the appropriate Config object.\n    """\n    from dagster._config.pythonic_config import (\n        Config,\n        infer_schema_from_config_annotation,\n        safe_is_subclass,\n    )\n\n    if not isinstance(user_fn, Callable):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    config_fn_params = get_function_params(user_fn)\n    check.invariant(\n        len(config_fn_params) == 1, "@configured function should have exactly one parameter"\n    )\n\n    param = config_fn_params[0]\n\n    # If the parameter is a subclass of Config, we can infer the config schema from the\n    # type annotation. 
We'll also wrap the config mapping function to convert the config\n    # dictionary into the appropriate Config object.\n    if not safe_is_subclass(param.annotation, Config):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    check.invariant(\n        config_schema is None,\n        "Cannot provide config_schema to @configured function with Config-annotated param",\n    )\n\n    config_schema_from_class = infer_schema_from_config_annotation(param.annotation, param.default)\n    config_cls = cast(Type[Config], param.annotation)\n\n    param_name = param.name\n\n    def wrapped_fn(config_as_dict) -> Any:\n        config_input = config_cls(**config_as_dict)\n        output = user_fn(**{param_name: config_input})\n\n        if isinstance(output, Config):\n            return output._convert_to_config_dictionary()  # noqa: SLF001\n        else:\n            return output\n\n    return FunctionAndConfigSchema(function=wrapped_fn, config_schema=config_schema_from_class)\n\n\n
[docs]def configured(\n configurable: T_Configurable,\n config_schema: Optional[UserConfigSchema] = None,\n **kwargs: Any,\n) -> Callable[[object], T_Configurable]:\n """A decorator that makes it easy to create a function-configured version of an object.\n\n The following definition types can be configured using this function:\n\n * :py:class:`GraphDefinition`\n * :py:class:`ExecutorDefinition`\n * :py:class:`LoggerDefinition`\n * :py:class:`ResourceDefinition`\n * :py:class:`OpDefinition`\n\n Using ``configured`` may result in config values being displayed in the Dagster UI,\n so it is not recommended to use this API with sensitive values, such as\n secrets.\n\n If the config that will be supplied to the object is constant, you may alternatively invoke this\n and call the result with a dict of config values to be curried. Examples of both strategies\n below.\n\n Args:\n configurable (ConfigurableDefinition): An object that can be configured.\n config_schema (ConfigSchema): The config schema that the inputs to the decorated function\n must satisfy. Alternatively, annotate the config parameter to the decorated function\n with a subclass of :py:class:`Config` and omit this argument.\n **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned\n object.\n\n Returns:\n (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])\n\n **Examples:**\n\n .. code-block:: python\n\n class GreetingConfig(Config):\n message: str\n\n @op\n def greeting_op(config: GreetingConfig):\n print(config.message)\n\n class HelloConfig(Config):\n name: str\n\n @configured(greeting_op)\n def hello_op(config: HelloConfig):\n return GreetingConfig(message=f"Hello, {config.name}!")\n\n .. 
code-block:: python\n\n dev_s3 = configured(S3Resource, name="dev_s3")({'bucket': 'dev'})\n\n @configured(S3Resource)\n def dev_s3(_):\n return {'bucket': 'dev'}\n\n @configured(S3Resource, {'bucket_prefix', str})\n def dev_s3(config):\n return {'bucket': config['bucket_prefix'] + 'dev'}\n\n """\n _check_configurable_param(configurable)\n\n if isinstance(configurable, NamedConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n fn_name = (\n getattr(config_or_config_fn, "__name__", None)\n if callable(config_or_config_fn)\n else None\n )\n name: str = check.not_none(kwargs.get("name") or fn_name)\n\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_or_config_fn=updated_fn,\n name=name,\n config_schema=new_config_schema,\n **{k: v for k, v in kwargs.items() if k != "name"},\n )\n\n return _configured\n elif isinstance(configurable, AnonymousConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_schema=new_config_schema, config_or_config_fn=updated_fn, **kwargs\n )\n\n return _configured\n else:\n check.failed(f"Invalid configurable definition type: {type(configurable)}")
\n
", "current_page_name": "_modules/dagster/_core/definitions/configurable", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.configurable"}, "decorators": {"asset_check_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_check_decorator

\nfrom typing import Any, Callable, Mapping, Optional, Set, Tuple, Union, cast\n\nfrom dagster import _check as check\nfrom dagster._annotations import experimental\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import (\n    AssetChecksDefinition,\n    AssetChecksDefinitionInputOutputProps,\n)\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import NoValueSentinel\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..input import In\nfrom .asset_decorator import (\n    get_function_params_without_context_or_config_or_resources,\n    stringify_asset_key_to_input_name,\n)\nfrom .op_decorator import _Op\n\nAssetCheckFunctionReturn = AssetCheckResult\nAssetCheckFunction = Callable[..., AssetCheckFunctionReturn]\n\n\ndef _build_asset_check_input(\n    name: str, asset_key: AssetKey, fn: Callable\n) -> Mapping[AssetKey, Tuple[str, In]]:\n    asset_params = get_function_params_without_context_or_config_or_resources(fn)\n\n    if len(asset_params) == 0:\n        input_name = stringify_asset_key_to_input_name(asset_key)\n        in_def = In(cast(type, Nothing))\n    elif len(asset_params) == 1:\n        input_name = asset_params[0].name\n        in_def = In(metadata={}, input_manager_key=None, dagster_type=NoValueSentinel)\n    else:\n        raise DagsterInvalidDefinitionError(\n            f"When defining check '{name}', multiple target assets provided as parameters:"\n            f" {[param.name for param in asset_params]}. 
Only one"\n            " is allowed."\n        )\n\n    return {\n        asset_key: (\n            input_name,\n            in_def,\n        )\n    }\n\n\n
[docs]@experimental\ndef asset_check(\n *,\n asset: Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset],\n name: Optional[str] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n compute_kind: Optional[str] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n) -> Callable[[AssetCheckFunction], AssetChecksDefinition]:\n """Create a definition for how to execute an asset check.\n\n Args:\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The\n asset that the check applies to.\n name (Optional[str]): The name of the check. If not specified, the name of the decorated\n function will be used. Checks for the same asset must have unique names.\n description (Optional[str]): The description of the check.\n required_resource_keys (Optional[Set[str]]): A set of keys for resources that are required\n by the function that execute the check. These can alternatively be specified by\n including resource-typed parameters in the function signature.\n config_schema (Optional[ConfigSchema): The configuration schema for the check's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that executes the check.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n compute_kind (Optional[str]): A string to represent the kind of computation that executes\n the check, e.g. 
"dbt" or "spark".\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that executes the check.\n\n\n Produces an :py:class:`AssetChecksDefinition` object.\n\n\n Example:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n\n @asset\n def my_asset() -> None:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows() -> AssetCheckResult:\n num_rows = ...\n return AssetCheckResult(success=num_rows > 5, metadata={"num_rows": num_rows})\n\n\n Example with a DataFrame Output:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n from pandas import DataFrame\n\n @asset\n def my_asset() -> DataFrame:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows(my_asset: DataFrame) -> AssetCheckResult:\n num_rows = my_asset.shape[0]\n return AssetCheckResult(success=num_rows > 5, metadata={"num_rows": num_rows})\n """\n\n def inner(fn: AssetCheckFunction) -> AssetChecksDefinition:\n check.callable_param(fn, "fn")\n resolved_name = name or fn.__name__\n asset_key = AssetKey.from_coercible_or_definition(asset)\n\n out = Out(dagster_type=None)\n input_tuples_by_asset_key = _build_asset_check_input(resolved_name, asset_key, fn)\n if len(input_tuples_by_asset_key) == 0:\n raise DagsterInvalidDefinitionError(\n f"No target asset provided when defining check '{resolved_name}'"\n )\n\n if len(input_tuples_by_asset_key) > 1:\n raise DagsterInvalidDefinitionError(\n f"When defining check '{resolved_name}', Multiple target assets provided:"\n f" {[key.to_user_string() for key in input_tuples_by_asset_key.keys()]}. 
Only one"\n " is allowed."\n )\n\n resolved_asset_key = next(iter(input_tuples_by_asset_key.keys()))\n spec = AssetCheckSpec(\n name=resolved_name,\n description=description,\n asset=resolved_asset_key,\n )\n\n op_def = _Op(\n name=spec.get_python_identifier(),\n ins=dict(input_tuples_by_asset_key.values()),\n out=out,\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=config_schema,\n retry_policy=retry_policy,\n )(fn)\n\n checks_def = AssetChecksDefinition(\n node_def=op_def,\n resource_defs={},\n specs=[spec],\n input_output_props=AssetChecksDefinitionInputOutputProps(\n asset_keys_by_input_name={\n input_tuples_by_asset_key[resolved_asset_key][0]: resolved_asset_key\n },\n asset_check_keys_by_output_name={op_def.output_defs[0].name: spec.key},\n ),\n )\n\n return checks_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_check_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_check_decorator"}, "asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_decorator

\nfrom collections import Counter\nfrom inspect import Parameter\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, experimental_param\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params, get_valid_name_permutations\nfrom dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping, MetadataUserInput\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.types.dagster_type import DagsterType\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom ..asset_check_spec import AssetCheckSpec\nfrom ..asset_in import AssetIn\nfrom ..asset_out import AssetOut\nfrom ..asset_spec import AssetSpec\nfrom ..assets import AssetsDefinition\nfrom ..backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom ..decorators.graph_decorator import graph\nfrom ..decorators.op_decorator import _Op\nfrom ..events import AssetKey, CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom ..input import GraphIn, In\nfrom ..output import GraphOut, Out\nfrom ..partition import PartitionsDefinition\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..utils import 
DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, NoValueSentinel\n\n\n@overload\ndef asset(\n    compute_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef asset(\n    *,\n    name: Optional[str] = ...,\n    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n    ins: Optional[Mapping[str, AssetIn]] = ...,\n    deps: Optional[Iterable[CoercibleToAssetDep]] = ...,\n    metadata: Optional[Mapping[str, Any]] = ...,\n    description: Optional[str] = ...,\n    config_schema: Optional[UserConfigSchema] = None,\n    required_resource_keys: Optional[Set[str]] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    io_manager_def: Optional[object] = ...,\n    io_manager_key: Optional[str] = ...,\n    compute_kind: Optional[str] = ...,\n    dagster_type: Optional[DagsterType] = ...,\n    partitions_def: Optional[PartitionsDefinition] = ...,\n    op_tags: Optional[Mapping[str, Any]] = ...,\n    group_name: Optional[str] = ...,\n    output_required: bool = ...,\n    freshness_policy: Optional[FreshnessPolicy] = ...,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n    backfill_policy: Optional[BackfillPolicy] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n    key: Optional[CoercibleToAssetKey] = None,\n    non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = ...,\n    check_specs: Optional[Sequence[AssetCheckSpec]] = ...,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\n@experimental_param(param="auto_materialize_policy")\n@experimental_param(param="backfill_policy")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef asset(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_def: Optional[object] = None,\n io_manager_key: Optional[str] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: Optional[CoercibleToAssetKey] = None,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Create a definition for how to compute an asset.\n\n A software-defined asset is the combination of:\n 1. An asset key, e.g. the name of a table.\n 2. A function, which can be run to compute the contents of the asset.\n 3. 
A set of upstream assets that are provided as inputs to the function when computing the asset.\n\n Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\n about the upstream assets it depends on. The upstream assets are inferred from the arguments\n to the decorated function. The name of the argument designates the name of the upstream asset.\n\n An asset has an op inside it to represent the function that computes it. The name of the op\n will be the segments of the asset key, separated by double-underscores.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in dagster (ie only contains\n letters, numbers, and _) and may not contain python reserved keywords.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetDep, AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used\n for storing the output of the op as an asset, and for loading it in downstream ops\n (default: "io_manager"). Only one of io_manager_key and io_manager_def can be provided.\n io_manager_def (Optional[object]): (Experimental) The IOManager used for\n storing the output of the op as an asset, and for loading it in\n downstream ops. Only one of io_manager_def and io_manager_key can be provided.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n dagster_type (Optional[DagsterType]): Allows specifying type validation functions that\n will be executed on the output of the decorated function after it runs.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n output_required (bool): Whether the decorated function will always materialize an asset.\n Defaults to True. 
If False, the function can return None, which will not be materialized to\n storage and will halt execution of downstream assets.\n freshness_policy (FreshnessPolicy): A constraint telling Dagster how often this asset is intended to be updated\n with respect to its root data.\n auto_materialize_policy (AutoMaterializePolicy): (Experimental) Configure Dagster to automatically materialize\n this asset according to its FreshnessPolicy and when upstream dependencies change.\n backfill_policy (BackfillPolicy): (Experimental) Configure Dagster to backfill this asset according to its\n BackfillPolicy.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code that generates this asset. In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the asset.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead.\n Set of asset keys that are upstream dependencies, but do not pass an input to the asset.\n key (Optional[CoeercibleToAssetKey]): The key for this asset. If provided, cannot specify key_prefix or name.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def my_asset(my_upstream_asset: int) -> int:\n return my_upstream_asset + 1\n """\n\n def create_asset():\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n return _Asset(\n name=cast(Optional[str], name), # (mypy bug that it can't infer name is Optional[str])\n key_prefix=key_prefix,\n ins=ins,\n deps=upstream_asset_deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n io_manager_key=io_manager_key,\n io_manager_def=io_manager_def,\n compute_kind=check.opt_str_param(compute_kind, "compute_kind"),\n dagster_type=dagster_type,\n partitions_def=partitions_def,\n op_tags=op_tags,\n group_name=group_name,\n output_required=output_required,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n retry_policy=retry_policy,\n code_version=code_version,\n check_specs=check_specs,\n key=key,\n )\n\n if compute_fn is not None:\n return create_asset()(compute_fn)\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n check.invariant(\n not (io_manager_key and io_manager_def),\n "Both io_manager_key and io_manager_def were provided to `@asset` decorator. Please"\n " provide one or the other. ",\n )\n return create_asset()(fn)\n\n return inner
\n\n\ndef _resolve_key_and_name(\n *,\n key: Optional[CoercibleToAssetKey],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n name: Optional[str],\n decorator: str,\n fn: Callable[..., Any],\n) -> Tuple[AssetKey, str]:\n if (name or key_prefix) and key:\n raise DagsterInvalidDefinitionError(\n f"Cannot specify a name or key prefix for {decorator} when the key"\n " argument is provided."\n )\n key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix\n key = AssetKey.from_coercible(key) if key else None\n assigned_name = name or fn.__name__\n return (\n (\n # the filter here appears unnecessary per typing, but this exists\n # historically so keeping it here to be conservative in case users\n # can get Nones into the key_prefix_list somehow\n AssetKey(list(filter(None, [*(key_prefix_list or []), assigned_name])))\n if not key\n else key\n ),\n assigned_name,\n )\n\n\nclass _Asset:\n def __init__(\n self,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[AssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: Optional[object] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: 
Optional[CoercibleToAssetKey] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ):\n self.name = name\n self.key_prefix = key_prefix\n self.ins = ins or {}\n self.deps = deps or []\n self.metadata = metadata\n self.description = description\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self.io_manager_key = io_manager_key\n self.io_manager_def = io_manager_def\n self.config_schema = config_schema\n self.compute_kind = compute_kind\n self.dagster_type = dagster_type\n self.partitions_def = partitions_def\n self.op_tags = op_tags\n self.resource_defs = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n self.group_name = group_name\n self.output_required = output_required\n self.freshness_policy = freshness_policy\n self.retry_policy = retry_policy\n self.auto_materialize_policy = auto_materialize_policy\n self.backfill_policy = backfill_policy\n self.code_version = code_version\n self.check_specs = check_specs\n self.key = key\n\n def __call__(self, fn: Callable) -> AssetsDefinition:\n from dagster._config.pythonic_config import (\n validate_resource_annotated_function,\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n validate_resource_annotated_function(fn)\n\n asset_ins = build_asset_ins(fn, self.ins or {}, {dep.asset_key for dep in self.deps})\n\n out_asset_key, asset_name = _resolve_key_and_name(\n key=self.key,\n key_prefix=self.key_prefix,\n name=self.name,\n fn=fn,\n decorator="@asset",\n )\n\n with disable_dagster_warnings():\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n\n bare_required_resource_keys = set(self.required_resource_keys)\n\n resource_defs_dict = self.resource_defs\n resource_defs_keys = set(resource_defs_dict.keys())\n decorator_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n io_manager_key = self.io_manager_key\n if self.io_manager_def:\n if not io_manager_key:\n 
io_manager_key = out_asset_key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in self.resource_defs\n and self.resource_defs[io_manager_key] != self.io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = self.io_manager_def\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n check.param_invariant(\n len(bare_required_resource_keys) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @asset decorator and as arguments"\n " to the decorated function",\n )\n\n io_manager_key = cast(str, io_manager_key) if io_manager_key else DEFAULT_IO_MANAGER_KEY\n\n out = Out(\n metadata=self.metadata or {},\n io_manager_key=io_manager_key,\n dagster_type=self.dagster_type if self.dagster_type else NoValueSentinel,\n description=self.description,\n is_required=self.output_required,\n code_version=self.code_version,\n )\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n self.check_specs, [out_asset_key]\n )\n check_outs: Mapping[str, Out] = {\n output_name: Out(dagster_type=None)\n for output_name in check_specs_by_output_name.keys()\n }\n\n op_required_resource_keys = decorator_resource_keys - arg_resource_keys\n\n op = _Op(\n name=out_asset_key.to_python_identifier(),\n description=self.description,\n ins=dict(asset_ins.values()),\n out={DEFAULT_OUTPUT: out, **check_outs},\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": self.compute_kind} if self.compute_kind else {}),\n **(self.op_tags or {}),\n },\n config_schema=self.config_schema,\n retry_policy=self.retry_policy,\n code_version=self.code_version,\n )(fn)\n\n # check backfill policy is 
BackfillPolicyType.SINGLE_RUN for non-partitioned asset\n if self.partitions_def is None:\n check.param_invariant(\n (\n self.backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if self.backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in self.ins.items()\n if asset_in.partition_mapping is not None\n }\n\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=self.deps, asset_name=asset_name\n )\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n node_def=op,\n partitions_def=self.partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n resource_defs=wrapped_resource_defs,\n group_names_by_key={out_asset_key: self.group_name} if self.group_name else None,\n freshness_policies_by_key=(\n {out_asset_key: self.freshness_policy} if self.freshness_policy else None\n ),\n auto_materialize_policies_by_key=(\n {out_asset_key: self.auto_materialize_policy}\n if self.auto_materialize_policy\n else None\n ),\n backfill_policy=self.backfill_policy,\n asset_deps=None, # no asset deps in single-asset decorator\n selected_asset_keys=None, # no subselection in decorator\n can_subset=False,\n metadata_by_key={out_asset_key: self.metadata} if self.metadata else None,\n descriptions_by_key=None, # not supported for now\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n\n
[docs]@experimental_param(param="resource_defs")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef multi_asset(\n *,\n outs: Optional[Mapping[str, AssetOut]] = None,\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n compute_kind: Optional[str] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_name: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n specs: Optional[Sequence[AssetSpec]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n # deprecated\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same op and same\n upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n You can set I/O managers keys, auto-materialize policies, freshness policies, group names, etc.\n on an individual asset within the multi-asset by attaching them to the :py:class:`AssetOut`\n corresponding to that asset in the `outs` parameter.\n\n Args:\n name (Optional[str]): The name of the op.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the assets materialized by\n this function. 
AssetOuts detail the output, IO management, and core asset properties.\n This argument is required except when AssetSpecs are used.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the underlying op.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by a multi_asset depend on all assets that are consumed by that\n multi asset. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\n used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the op that computes the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. 
Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n can_subset (bool): If this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. Defaults to False.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the multi-asset. If set,\n this is used as a default code version for all defined assets.\n specs (Optional[Sequence[AssetSpec]]): (Experimental) The specifications for the assets materialized\n by this function.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the assets.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are upstream\n dependencies, but do not pass an input to the multi_asset.\n\n Examples:\n .. 
code-block:: python\n\n # Use IO managers to handle I/O:\n @multi_asset(\n outs={\n "my_string_asset": AssetOut(),\n "my_int_asset": AssetOut(),\n }\n )\n def my_function(upstream_asset: int):\n result = upstream_asset + 1\n return str(result), result\n\n # Handle I/O on your own:\n @multi_asset(\n outs={\n "asset1": AssetOut(),\n "asset2": AssetOut(),\n },\n deps=["asset0"],\n )\n def my_function():\n asset0_value = load(path="asset0")\n asset1_result, asset2_result = do_some_transformation(asset0_value)\n write(asset1_result, path="asset1")\n write(asset2_result, path="asset2")\n return None, None\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n specs = check.opt_list_param(specs, "specs", of_type=AssetSpec)\n\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n asset_deps = check.opt_mapping_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs", key_type=str)\n )\n\n _config_schema = check.opt_mapping_param(\n config_schema, # type: ignore\n "config_schema",\n additional_message="Only dicts are supported for asset config_schema.",\n )\n\n bare_required_resource_keys = set(required_resource_keys)\n resource_defs_keys = set(resource_defs.keys())\n required_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n asset_out_map: Mapping[str, AssetOut] = {} if outs is None else outs\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n op_name = name or fn.__name__\n\n if asset_out_map and specs:\n raise DagsterInvalidDefinitionError("Must specify only outs or specs but not both.")\n elif specs:\n output_tuples_by_asset_key = {}\n for asset_spec in specs:\n # output names are asset keys 
joined with _\n output_name = "_".join(asset_spec.key.path)\n output_tuples_by_asset_key[asset_spec.key] = (\n output_name,\n Out(\n Nothing,\n is_required=not (can_subset or asset_spec.skippable),\n ),\n )\n if upstream_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass deps and specs to @multi_asset, specify deps on the AssetSpecs"\n " directly."\n )\n if internal_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass internal_asset_deps and specs to @multi_asset, specify deps on"\n " the AssetSpecs directly."\n )\n\n upstream_keys = set()\n for spec in specs:\n for dep in spec.deps:\n if dep.asset_key not in output_tuples_by_asset_key:\n upstream_keys.add(dep.asset_key)\n if (\n dep.asset_key in output_tuples_by_asset_key\n and dep.partition_mapping is not None\n ):\n # self-dependent asset also needs to be considered an upstream_key\n upstream_keys.add(dep.asset_key)\n\n explicit_ins = ins or {}\n # get which asset keys have inputs set\n loaded_upstreams = build_asset_ins(fn, explicit_ins, deps=set())\n unexpected_upstreams = {\n key for key in loaded_upstreams.keys() if key not in upstream_keys\n }\n if unexpected_upstreams:\n raise DagsterInvalidDefinitionError(\n f"Asset inputs {unexpected_upstreams} do not have dependencies on the passed"\n " AssetSpec(s). 
Set the deps on the appropriate AssetSpec(s)."\n )\n remaining_upstream_keys = {key for key in upstream_keys if key not in loaded_upstreams}\n asset_ins = build_asset_ins(fn, explicit_ins, deps=remaining_upstream_keys)\n else:\n asset_ins = build_asset_ins(\n fn,\n ins or {},\n deps=(\n {dep.asset_key for dep in upstream_asset_deps} if upstream_asset_deps else set()\n ),\n )\n output_tuples_by_asset_key = build_asset_outs(asset_out_map)\n # validate that the asset_deps make sense\n valid_asset_deps = set(asset_ins.keys()) | set(output_tuples_by_asset_key.keys())\n for out_name, asset_keys in asset_deps.items():\n if asset_out_map and out_name not in asset_out_map:\n check.failed(\n f"Invalid out key '{out_name}' supplied to `internal_asset_deps` argument"\n f" for multi-asset {op_name}. Must be one of the outs for this multi-asset"\n f" {list(asset_out_map.keys())[:20]}.",\n )\n invalid_asset_deps = asset_keys.difference(valid_asset_deps)\n check.invariant(\n not invalid_asset_deps,\n f"Invalid asset dependencies: {invalid_asset_deps} specified in"\n f" `internal_asset_deps` argument for multi-asset '{op_name}' on key"\n f" '{out_name}'. Each specified asset key must be associated with an input to"\n " the asset or produced by this asset. 
Valid keys:"\n f" {list(valid_asset_deps)[:20]}",\n )\n\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n check.param_invariant(\n len(bare_required_resource_keys or []) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @multi_asset decorator and as"\n " arguments to the decorated function",\n )\n\n asset_outs_by_output_name: Mapping[str, Out] = dict(output_tuples_by_asset_key.values())\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(output_tuples_by_asset_key.keys())\n )\n check_outs_by_output_name: Mapping[str, Out] = {\n output_name: Out(dagster_type=None, is_required=not can_subset)\n for output_name in check_specs_by_output_name.keys()\n }\n overlapping_output_names = (\n asset_outs_by_output_name.keys() & check_outs_by_output_name.keys()\n )\n check.invariant(\n len(overlapping_output_names) == 0,\n f"Check output names overlap with asset output names: {overlapping_output_names}",\n )\n combined_outs_by_output_name: Mapping[str, Out] = {\n **asset_outs_by_output_name,\n **check_outs_by_output_name,\n }\n\n with disable_dagster_warnings():\n op_required_resource_keys = required_resource_keys - arg_resource_keys\n\n op = _Op(\n name=op_name,\n description=description,\n ins=dict(asset_ins.values()),\n out=combined_outs_by_output_name,\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=_config_schema,\n retry_policy=retry_policy,\n code_version=code_version,\n )(fn)\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n keys_by_output_name = {\n output_name: asset_key\n for asset_key, (output_name, _) in output_tuples_by_asset_key.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if 
asset_in.partition_mapping is not None\n }\n\n if upstream_asset_deps:\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=upstream_asset_deps, asset_name=op_name\n )\n\n if specs:\n internal_deps = {\n spec.key: {dep.asset_key for dep in spec.deps}\n for spec in specs\n if spec.deps is not None\n }\n props_by_asset_key: Mapping[AssetKey, Union[AssetSpec, AssetOut]] = {\n spec.key: spec for spec in specs\n }\n # Add PartitionMappings specified via AssetSpec.deps to partition_mappings dictionary. Error on duplicates\n for spec in specs:\n for dep in spec.deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" multi_asset {op_name}. 
Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n else:\n internal_deps = {keys_by_output_name[name]: asset_deps[name] for name in asset_deps}\n props_by_asset_key = {\n keys_by_output_name[output_name]: asset_out\n for output_name, asset_out in asset_out_map.items()\n }\n\n # handle properties defined ons AssetSpecs or AssetOuts\n group_names_by_key = {\n asset_key: props.group_name\n for asset_key, props in props_by_asset_key.items()\n if props.group_name is not None\n }\n if group_name:\n check.invariant(\n not group_names_by_key,\n "Cannot set group_name parameter on multi_asset if one or more of the"\n " AssetSpecs/AssetOuts supplied to this multi_asset have a group_name defined.",\n )\n group_names_by_key = {asset_key: group_name for asset_key in props_by_asset_key}\n\n freshness_policies_by_key = {\n asset_key: props.freshness_policy\n for asset_key, props in props_by_asset_key.items()\n if props.freshness_policy is not None\n }\n auto_materialize_policies_by_key = {\n asset_key: props.auto_materialize_policy\n for asset_key, props in props_by_asset_key.items()\n if props.auto_materialize_policy is not None\n }\n metadata_by_key = {\n asset_key: props.metadata\n for asset_key, props in props_by_asset_key.items()\n if props.metadata is not None\n }\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=op,\n asset_deps=internal_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n selected_asset_keys=None, # no subselection in decorator\n descriptions_by_key=None, # not supported for now\n metadata_by_key=metadata_by_key,\n 
check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n return inner
\n\n\ndef get_function_params_without_context_or_config_or_resources(fn: Callable) -> List[Parameter]:\n params = get_function_params(fn)\n is_context_provided = len(params) > 0 and params[0].name in get_valid_name_permutations(\n "context"\n )\n input_params = params[1:] if is_context_provided else params\n\n resource_arg_names = {arg.name for arg in get_resource_args(fn)}\n\n new_input_args = []\n for input_arg in input_params:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n\n return new_input_args\n\n\ndef stringify_asset_key_to_input_name(asset_key: AssetKey) -> str:\n return "_".join(asset_key.path).replace("-", "_")\n\n\ndef build_asset_ins(\n fn: Callable,\n asset_ins: Mapping[str, AssetIn],\n deps: Optional[AbstractSet[AssetKey]],\n) -> Mapping[AssetKey, Tuple[str, In]]:\n """Creates a mapping from AssetKey to (name of input, In object)."""\n deps = check.opt_set_param(deps, "deps", AssetKey)\n\n new_input_args = get_function_params_without_context_or_config_or_resources(fn)\n\n non_var_input_param_names = [\n param.name for param in new_input_args if param.kind == Parameter.POSITIONAL_OR_KEYWORD\n ]\n has_kwargs = any(param.kind == Parameter.VAR_KEYWORD for param in new_input_args)\n\n all_input_names = set(non_var_input_param_names) | asset_ins.keys()\n\n if not has_kwargs:\n for in_key, asset_in in asset_ins.items():\n if in_key not in non_var_input_param_names and (\n not isinstance(asset_in.dagster_type, DagsterType)\n or not asset_in.dagster_type.is_nothing\n ):\n raise DagsterInvalidDefinitionError(\n f"Key '{in_key}' in provided ins dict does not correspond to any of the names "\n "of the arguments to the decorated function"\n )\n\n ins_by_asset_key: Dict[AssetKey, Tuple[str, In]] = {}\n for input_name in all_input_names:\n asset_key = None\n\n if input_name in asset_ins:\n asset_key = asset_ins[input_name].key\n metadata = asset_ins[input_name].metadata or {}\n key_prefix = 
asset_ins[input_name].key_prefix\n input_manager_key = asset_ins[input_name].input_manager_key\n dagster_type = asset_ins[input_name].dagster_type\n else:\n metadata = {}\n key_prefix = None\n input_manager_key = None\n dagster_type = NoValueSentinel\n\n asset_key = asset_key or AssetKey(list(filter(None, [*(key_prefix or []), input_name])))\n\n ins_by_asset_key[asset_key] = (\n input_name.replace("-", "_"),\n In(metadata=metadata, input_manager_key=input_manager_key, dagster_type=dagster_type),\n )\n\n for asset_key in deps:\n if asset_key in ins_by_asset_key:\n raise DagsterInvalidDefinitionError(\n f"deps value {asset_key} also declared as input/AssetIn"\n )\n # mypy doesn't realize that Nothing is a valid type here\n ins_by_asset_key[asset_key] = (\n stringify_asset_key_to_input_name(asset_key),\n In(cast(type, Nothing)),\n )\n\n return ins_by_asset_key\n\n\n@overload\ndef graph_asset(\n compose_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef graph_asset(\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = ...,\n freshness_policy: Optional[FreshnessPolicy] = ...,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n backfill_policy: Optional[BackfillPolicy] = ...,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = ...,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]def graph_asset(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Creates a software-defined asset that's computed using a graph of ops.\n\n This decorator is meant to decorate a function that composes a set of ops or graphs to define\n the dependencies between them.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in Dagster (ie only contains\n letters, numbers, and underscores) and may not contain Python reserved keywords.\n description (Optional[str]):\n A human-readable description of the asset.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n config (Optional[Union[ConfigMapping], Mapping[str, Any]):\n Describes how the graph underlying the asset is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. 
This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets. its constituent nodes.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in Dagster (ie only\n contains letters, numbers, and underscores) and may not contain Python reserved keywords.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n metadata (Optional[MetadataUserInput]): Dictionary of metadata to be associated with\n the asset.\n freshness_policy (Optional[FreshnessPolicy]): A constraint telling Dagster how often this asset is\n intended to be updated with respect to its root data.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): The AutoMaterializePolicy to use\n for this asset.\n backfill_policy (Optional[BackfillPolicy]): The BackfillPolicy to use for this asset.\n key (Optional[CoeercibleToAssetKey]): The key for this asset. If provided, cannot specify key_prefix or name.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def fetch_files_from_slack(context) -> pd.DataFrame:\n ...\n\n @op\n def store_files_in_table(files) -> None:\n files.to_sql(name="slack_files", con=create_db_connection())\n\n @graph_asset\n def slack_files_table():\n return store_files(fetch_files_from_slack())\n """\n if compose_fn is None:\n return lambda fn: graph_asset( # type: ignore # (decorator pattern)\n fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )\n else:\n return graph_asset_no_defaults(\n compose_fn=compose_fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )
\n\n\ndef graph_asset_no_defaults(\n *,\n compose_fn: Callable,\n name: Optional[str],\n description: Optional[str],\n ins: Optional[Mapping[str, AssetIn]],\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n group_name: Optional[str],\n partitions_def: Optional[PartitionsDefinition],\n metadata: Optional[MetadataUserInput],\n freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n resource_defs: Optional[Mapping[str, ResourceDefinition]],\n check_specs: Optional[Sequence[AssetCheckSpec]],\n key: Optional[CoercibleToAssetKey],\n) -> AssetsDefinition:\n ins = ins or {}\n asset_ins = build_asset_ins(compose_fn, ins or {}, set())\n out_asset_key, _asset_name = _resolve_key_and_name(\n key=key,\n key_prefix=key_prefix,\n name=name,\n decorator="@graph_asset",\n fn=compose_fn,\n )\n\n keys_by_input_name = {input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()}\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in ins.items()\n if asset_in.partition_mapping\n }\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, [out_asset_key]\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name: Mapping = {\n "result": GraphOut(),\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=out_asset_key.to_python_identifier(),\n description=description,\n config=config,\n ins={input_name: GraphIn() for _, (input_name, _) in asset_ins.items()},\n out=combined_outs_by_output_name,\n )(compose_fn)\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n partitions_def=partitions_def,\n 
partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n metadata_by_output_name={"result": metadata} if metadata else None,\n freshness_policies_by_output_name=(\n {"result": freshness_policy} if freshness_policy else None\n ),\n auto_materialize_policies_by_output_name=(\n {"result": auto_materialize_policy} if auto_materialize_policy else None\n ),\n backfill_policy=backfill_policy,\n descriptions_by_output_name={"result": description} if description else None,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n\n
[docs]def graph_multi_asset(\n *,\n outs: Mapping[str, AssetOut],\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n group_name: Optional[str] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same graph of\n ops, and the same upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the graph.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the produced assets.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the asset.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n can_subset (bool): Whether this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. 
Defaults to False.\n """\n\n def inner(fn: Callable) -> AssetsDefinition:\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if asset_in.partition_mapping\n }\n\n asset_ins = build_asset_ins(fn, ins or {}, set())\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n asset_outs = build_asset_outs(outs)\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(asset_outs.keys())\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name = {\n **{output_name: GraphOut() for output_name, _ in asset_outs.values()},\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=name or fn.__name__,\n out=combined_outs_by_output_name,\n )(fn)\n\n # source metadata from the AssetOuts (if any)\n metadata_by_output_name = {\n output_name: out.metadata\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.metadata is not None\n }\n\n # source freshness policies from the AssetOuts (if any)\n freshness_policies_by_output_name = {\n output_name: out.freshness_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.freshness_policy is not None\n }\n\n # source auto materialize policies from the AssetOuts (if any)\n auto_materialize_policies_by_output_name = {\n output_name: out.auto_materialize_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.auto_materialize_policy is not None\n }\n\n # source descriptions from the AssetOuts (if any)\n descriptions_by_output_name = {\n output_name: out.description\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.description is not None\n }\n\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n 
keys_by_output_name={\n output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n },\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n can_subset=can_subset,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n descriptions_by_output_name=descriptions_by_output_name,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n return inner
\n\n\ndef build_asset_outs(asset_outs: Mapping[str, AssetOut]) -> Mapping[AssetKey, Tuple[str, Out]]:\n """Creates a mapping from AssetKey to (name of output, Out object)."""\n outs_by_asset_key: Dict[AssetKey, Tuple[str, Out]] = {}\n for output_name, asset_out in asset_outs.items():\n out = asset_out.to_out()\n asset_key = asset_out.key or AssetKey(\n list(filter(None, [*(asset_out.key_prefix or []), output_name]))\n )\n\n outs_by_asset_key[asset_key] = (output_name.replace("-", "_"), out)\n\n return outs_by_asset_key\n\n\ndef _deps_and_non_argument_deps_to_asset_deps(\n deps: Optional[Iterable[CoercibleToAssetDep]],\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]],\n) -> Optional[Iterable[AssetDep]]:\n """Helper function for managing deps and non_argument_deps while non_argument_deps is still an accepted parameter.\n Ensures only one of deps and non_argument_deps is provided, then converts the deps to AssetDeps.\n """\n if non_argument_deps is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and non_argument_deps to @asset. Use only deps instead."\n )\n\n if deps is not None:\n return _make_asset_deps(deps)\n\n if non_argument_deps is not None:\n check.set_param(non_argument_deps, "non_argument_deps", of_type=(AssetKey, str))\n return _make_asset_deps(non_argument_deps)\n\n\ndef _make_asset_deps(deps: Optional[Iterable[CoercibleToAssetDep]]) -> Optional[Iterable[AssetDep]]:\n if deps is None:\n return None\n\n # expand any multi_assets into a list of keys\n all_deps = []\n for dep in deps:\n if isinstance(dep, AssetsDefinition) and len(dep.keys) > 1:\n all_deps.extend(dep.keys)\n else:\n all_deps.append(dep)\n\n with disable_dagster_warnings():\n dep_dict = {}\n for dep in all_deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys.\n if asset_dep.asset_key in dep_dict.keys():\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once per"\n " asset."\n )\n dep_dict[asset_dep.asset_key] = asset_dep\n\n return list(dep_dict.values())\n\n\ndef _validate_and_assign_output_names_to_check_specs(\n check_specs: Optional[Sequence[AssetCheckSpec]], valid_asset_keys: Sequence[AssetKey]\n) -> Mapping[str, AssetCheckSpec]:\n check_specs_by_output_name = {spec.get_python_identifier(): spec for spec in check_specs or []}\n if check_specs and len(check_specs_by_output_name) != len(check_specs):\n duplicates = {\n item: count\n for item, count in Counter(\n [(spec.asset_key, spec.name) for spec in check_specs]\n ).items()\n if count > 1\n }\n\n raise DagsterInvalidDefinitionError(f"Duplicate check specs: {duplicates}")\n\n for spec in check_specs_by_output_name.values():\n if spec.asset_key not in valid_asset_keys:\n raise DagsterInvalidDefinitionError(\n f"Invalid asset key {spec.asset_key} in check spec {spec.name}. Must be one of"\n f" {valid_asset_keys}"\n )\n\n return check_specs_by_output_name\n\n\ndef _get_partition_mappings_from_deps(\n partition_mappings: Dict[AssetKey, PartitionMapping], deps: Iterable[AssetDep], asset_name: str\n):\n # Add PartitionMappings specified via AssetDeps to partition_mappings dictionary. Error on duplicates\n for dep in deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" asset {asset_name}. Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n return partition_mappings\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_decorator"}, "graph_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.graph_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Mapping, Optional, Sequence, Union, overload\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..input import GraphIn, InputDefinition\nfrom ..output import GraphOut, OutputDefinition\n\n\nclass _Graph:\n    name: Optional[str]\n    description: Optional[str]\n    input_defs: Sequence[InputDefinition]\n    output_defs: Optional[Sequence[OutputDefinition]]\n    ins: Optional[Mapping[str, GraphIn]]\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]]\n    tags: Optional[Mapping[str, str]]\n    config_mapping: Optional[ConfigMapping]\n\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        ins: Optional[Mapping[str, GraphIn]] = None,\n        out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        config_mapping: Optional[ConfigMapping] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.input_defs = check.opt_sequence_param(\n            input_defs, "input_defs", of_type=InputDefinition\n        )\n        self.did_pass_outputs = output_defs is not None or out is not None\n        self.output_defs = check.opt_nullable_sequence_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.ins = ins\n        self.out = out\n        self.tags = tags\n        self.config_mapping = check.opt_inst_param(config_mapping, "config_mapping", ConfigMapping)\n\n    def __call__(self, fn: Callable[..., Any]) -> GraphDefinition:\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if self.ins is not None:\n            input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        if self.out is None:\n            output_defs = self.output_defs\n        elif isinstance(self.out, GraphOut):\n            output_defs = [self.out.to_definition(name=None)]\n        else:\n            check.dict_param(self.out, "out", key_type=str, value_type=GraphOut)\n            output_defs = [out.to_definition(name=name) for name, out in self.out.items()]\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@graph",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=input_defs,\n            provided_output_defs=output_defs,\n            ignore_output_from_composition_fn=False,\n            config_mapping=self.config_mapping,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n        update_wrapper(graph_def, fn)\n        return graph_def\n\n\n@overload\ndef graph(compose_fn: Callable) -> 
GraphDefinition: ...\n\n\n@overload\ndef graph(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[Sequence[InputDefinition]] = ...,\n    output_defs: Optional[Sequence[OutputDefinition]] = ...,\n    ins: Optional[Mapping[str, GraphIn]] = ...,\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = ...,\n) -> _Graph: ...\n\n\n
[docs]def graph(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_defs: Optional[Sequence[InputDefinition]] = None,\n output_defs: Optional[Sequence[OutputDefinition]] = None,\n ins: Optional[Mapping[str, GraphIn]] = None,\n out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n) -> Union[GraphDefinition, _Graph]:\n """Create an op graph with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up a dependency graph by writing a\n function that invokes ops (or other graphs) and passes the output to subsequent invocations.\n\n Args:\n name (Optional[str]):\n The name of the op graph. Must be unique within any :py:class:`RepositoryDefinition` containing the graph.\n description (Optional[str]):\n A human-readable description of the graph.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`GraphDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Output definitions for the graph. 
If not provided explicitly, these will be inferred from typehints.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`GraphDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n ins (Optional[Dict[str, GraphIn]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit GraphIn taking precedence.\n out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):\n Information about the outputs that this graph maps. Information provided here will be\n combined with what can be inferred from the return type signature if the function does\n not use yield.\n\n To map multiple outputs, return a dictionary from the composition function.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n config (Optional[Union[ConfigMapping], Mapping[str, Any]):\n Describes how the graph is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets. 
its constituent nodes.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Graph()(compose_fn)\n\n config_mapping = None\n # Case 1: a dictionary of config is provided, convert to config mapping.\n if config is not None and not isinstance(config, ConfigMapping):\n config = check.dict_param(config, "config", key_type=str)\n config_mapping = ConfigMapping(config_fn=lambda _: config, config_schema=None)\n # Case 2: actual config mapping is provided.\n else:\n config_mapping = config\n\n return _Graph(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n ins=ins,\n out=out,\n tags=tags,\n config_mapping=config_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/graph_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.graph_decorator"}, "hook_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.hook_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ...decorator_utils import get_function_params, validate_expected_params\nfrom ..events import HookExecutionResult\nfrom ..hook_definition import HookDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.hook import HookContext\n\n\ndef _validate_hook_fn_params(fn, expected_positionals):\n    params = get_function_params(fn)\n    missing_positional = validate_expected_params(params, expected_positionals)\n    if missing_positional:\n        raise DagsterInvalidDefinitionError(\n            f"'{fn.__name__}' decorated function does not have required positional "\n            f"parameter '{missing_positional}'. Hook functions should only have keyword arguments "\n            "that match input names and a first positional parameter named 'context' and "\n            "a second positional parameter named 'event_list'."\n        )\n\n\nclass _Hook:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        decorated_fn: Optional[Callable[..., Any]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.required_resource_keys = check.opt_set_param(\n            required_resource_keys, "required_resource_keys"\n        )\n        self.decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n    def __call__(self, fn) -> HookDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        expected_positionals = ["context", "event_list"]\n\n        _validate_hook_fn_params(fn, expected_positionals)\n\n        
hook_def = HookDefinition(\n            name=self.name or "",\n            hook_fn=fn,\n            required_resource_keys=self.required_resource_keys,\n            decorated_fn=self.decorated_fn or fn,\n        )\n        update_wrapper(cast(Callable[..., Any], hook_def), fn)\n        return hook_def\n\n\n@overload\ndef event_list_hook(\n    hook_fn: Callable,\n) -> HookDefinition:\n    pass\n\n\n@overload\ndef event_list_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    decorated_fn: Optional[Callable[..., Any]] = ...,\n) -> _Hook:\n    pass\n\n\ndef event_list_hook(\n    hook_fn: Optional[Callable] = None,\n    *,\n    name: Optional[str] = None,\n    required_resource_keys: Optional[AbstractSet[str]] = None,\n    decorated_fn: Optional[Callable[..., Any]] = None,\n) -> Union[HookDefinition, _Hook]:\n    """Create a generic hook with the specified parameters from the decorated function.\n\n    This decorator is currently used internally by Dagster machinery to support success_hook and\n    failure_hook.\n\n    The user-defined hook function requires two parameters:\n    - A `context` object is passed as the first parameter. The context is an instance of\n        :py:class:`context <HookContext>`, and provides access to system\n        information, such as loggers (context.log), resources (context.resources), the op\n        (context.op) and its execution step (context.step) which triggers this hook.\n    - An `event_list` object is passed as the second paramter. It provides the full event list of the\n        associated execution step.\n\n    Args:\n        name (Optional[str]): The name of this hook.\n        required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n            hook.\n\n    Examples:\n        .. 
code-block:: python\n\n            @event_list_hook(required_resource_keys={'slack'})\n            def slack_on_materializations(context, event_list):\n                for event in event_list:\n                    if event.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n                        message = f'{context.op_name} has materialized an asset {event.asset_key}.'\n                        # send a slack message every time a materialization event occurs\n                        context.resources.slack.send_message(message)\n\n\n    """\n    # This case is for when decorator is used bare, without arguments.\n    # e.g. @event_list_hook versus @event_list_hook()\n    if hook_fn is not None:\n        check.invariant(required_resource_keys is None)\n        return _Hook()(hook_fn)\n\n    return _Hook(\n        name=name, required_resource_keys=required_resource_keys, decorated_fn=decorated_fn\n    )\n\n\nSuccessOrFailureHookFn = Callable[["HookContext"], Any]\n\n\n@overload\ndef success_hook(hook_fn: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef success_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def success_hook(\n hook_fn: Optional[SuccessOrFailureHookFn] = None,\n *,\n name: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step success events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @success_hook(required_resource_keys={'slack'})\n def slack_message_on_success(context):\n message = 'op {} succeeded'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @success_hook\n def do_something_on_success(context):\n do_something()\n\n\n """\n\n def wrapper(fn: SuccessOrFailureHookFn) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _success_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_success:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _success_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @success_hook\n if hook_fn is not None:\n check.invariant(required_resource_keys is None)\n return wrapper(hook_fn)\n\n return wrapper
\n\n\n@overload\ndef failure_hook(name: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef failure_hook(\n name: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def failure_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step failure events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @failure_hook(required_resource_keys={'slack'})\n def slack_message_on_failure(context):\n message = 'op {} failed'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @failure_hook\n def do_something_on_failure(context):\n do_something()\n\n\n """\n\n def wrapper(fn: Callable[["HookContext"], Any]) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _failure_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_failure:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _failure_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @failure_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/hook_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.hook_decorator"}, "job_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.job_decorator

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Mapping, Optional, Union, overload\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..metadata import RawMetadataValue\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from ..executor_definition import ExecutorDefinition\n    from ..partition import PartitionedConfig, PartitionsDefinition\n    from ..run_config import RunConfig\n\n\nclass _Job:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n        ] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet[HookDefinition]] = None,\n        op_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        input_values: Optional[Mapping[str, object]] = None,\n    ):\n        from dagster._core.definitions.run_config import convert_config_input\n\n        self.name = name\n        self.description = description\n        self.tags = tags\n        self.metadata = 
metadata\n        self.resource_defs = resource_defs\n        self.config = convert_config_input(config)\n        self.logger_defs = logger_defs\n        self.executor_def = executor_def\n        self.hooks = hooks\n        self.op_retry_policy = op_retry_policy\n        self.version_strategy = version_strategy\n        self.partitions_def = partitions_def\n        self.input_values = input_values\n\n    def __call__(self, fn: Callable[..., Any]) -> JobDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@job",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=[],\n            provided_output_defs=[],\n            ignore_output_from_composition_fn=False,\n            config_mapping=None,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n\n        job_def = graph_def.to_job(\n            description=self.description or format_docstring_for_description(fn),\n            resource_defs=self.resource_defs,\n            config=self.config,\n            tags=self.tags,\n            metadata=self.metadata,\n            logger_defs=self.logger_defs,\n            
executor_def=self.executor_def,\n            hooks=self.hooks,\n            op_retry_policy=self.op_retry_policy,\n            version_strategy=self.version_strategy,\n            partitions_def=self.partitions_def,\n            input_values=self.input_values,\n        )\n        update_wrapper(job_def, fn)\n        return job_def\n\n\n@overload\ndef job(compose_fn: Callable[..., Any]) -> JobDefinition: ...\n\n\n@overload\ndef job(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    config: Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    metadata: Optional[Mapping[str, RawMetadataValue]] = ...,\n    logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    executor_def: Optional["ExecutorDefinition"] = ...,\n    hooks: Optional[AbstractSet[HookDefinition]] = ...,\n    op_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n    partitions_def: Optional["PartitionsDefinition"] = ...,\n    input_values: Optional[Mapping[str, object]] = ...,\n) -> _Job: ...\n\n\n
[docs]@deprecated_param(\n param="version_strategy",\n breaking_version="2.0",\n additional_warn_text="Use asset versioning instead.",\n)\ndef job(\n compose_fn: Optional[Callable[..., Any]] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n) -> Union[JobDefinition, _Job]:\n """Creates a job with the specified parameters from the decorated graph/op invocation function.\n\n Using this decorator allows you to build an executable job by writing a function that invokes\n ops (or graphs).\n\n Args:\n compose_fn (Callable[..., Any]:\n The decorated function. The body should contain op or graph invocations. Unlike op\n functions, does not accept a context argument.\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of the this graph.\n resource_defs (Optional[Mapping[str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`RunConfig` object is provided, then it will be used directly as the run config\n for the job whenever the job is executed, similar to providing a dictionary.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multiprocess_executor` .\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys\n that can parameterize the job. If this argument is supplied, the config argument\n can't also be supplied.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(in1):\n return in1 + 1\n\n @job\n def job1():\n add_one(return_one())\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Job()(compose_fn)\n\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return _Job(\n name=name,\n description=description,\n resource_defs=wrap_resources_for_execution(resource_defs),\n config=config,\n tags=tags,\n metadata=metadata,\n logger_defs=logger_defs,\n executor_def=executor_def,\n hooks=hooks,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n partitions_def=partitions_def,\n input_values=input_values,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/job_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.job_decorator"}, "op_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.op_decorator

\nfrom functools import lru_cache, update_wrapper\nfrom inspect import Parameter\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import (\n    format_docstring_for_description,\n    get_function_params,\n    get_valid_name_permutations,\n    param_is_var_keyword,\n    positional_arg_name_list,\n)\nfrom dagster._core.definitions.inference import infer_input_props\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import DagsterTypeKind\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom ..input import In, InputDefinition\nfrom ..output import Out\nfrom ..policy import RetryPolicy\nfrom ..utils import DEFAULT_OUTPUT\n\nif TYPE_CHECKING:\n    from ..op_definition import OpDefinition\n\n\nclass _Op:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        code_version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n        ins: Optional[Mapping[str, In]] = None,\n        out: Optional[Union[Out, Mapping[str, Out]]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = 
check.opt_str_param(description, "description")\n\n        # these will be checked within OpDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.code_version = code_version\n        self.retry_policy = retry_policy\n\n        # config will be checked within OpDefinition\n        self.config_schema = config_schema\n\n        self.ins = check.opt_nullable_mapping_param(ins, "ins", key_type=str, value_type=In)\n        self.out = out\n\n    def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":\n        from dagster._config.pythonic_config import validate_resource_annotated_function\n\n        from ..op_definition import OpDefinition\n\n        validate_resource_annotated_function(fn)\n\n        if not self.name:\n            self.name = fn.__name__\n\n        compute_fn = (\n            DecoratedOpFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedOpFunction(decorated_fn=fn)\n        )\n\n        if compute_fn.has_config_arg():\n            check.param_invariant(\n                self.config_schema is None or self.config_schema == {},\n                "If the @op has a config arg, you cannot specify a config schema",\n            )\n\n            from dagster._config.pythonic_config import infer_schema_from_config_annotation\n\n            # Parse schema from the type annotation of the config arg\n            config_arg = compute_fn.get_config_arg()\n            config_arg_type = config_arg.annotation\n            config_arg_default = config_arg.default\n            self.config_schema = infer_schema_from_config_annotation(\n                config_arg_type, config_arg_default\n            )\n\n        outs: Optional[Mapping[str, Out]] = None\n        if self.out is not None and isinstance(self.out, Out):\n            outs = {DEFAULT_OUTPUT: self.out}\n        elif self.out is not None:\n            outs = check.mapping_param(self.out, "out", 
key_type=str, value_type=Out)\n\n        arg_resource_keys = {arg.name for arg in compute_fn.get_resource_args()}\n        decorator_resource_keys = set(self.required_resource_keys or [])\n        check.param_invariant(\n            len(decorator_resource_keys) == 0 or len(arg_resource_keys) == 0,\n            "Cannot specify resource requirements in both @op decorator and as arguments to the"\n            " decorated function",\n        )\n        resolved_resource_keys = decorator_resource_keys.union(arg_resource_keys)\n\n        op_def = OpDefinition.dagster_internal_init(\n            name=self.name,\n            ins=self.ins,\n            outs=outs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=resolved_resource_keys,\n            tags=self.tags,\n            code_version=self.code_version,\n            retry_policy=self.retry_policy,\n            version=None,  # code_version has replaced version\n        )\n        update_wrapper(op_def, compute_fn.decorated_fn)\n        return op_def\n\n\n@overload\ndef op(compute_fn: Callable[..., Any]) -> "OpDefinition": ...\n\n\n@overload\ndef op(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    ins: Optional[Mapping[str, In]] = ...,\n    out: Optional[Union[Out, Mapping[str, Out]]] = ...,\n    config_schema: Optional[UserConfigSchema] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n) -> _Op: ...\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead"\n)\ndef op(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, In]] = None,\n out: Optional[Union[Out, Mapping[str, Out]]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n) -> Union["OpDefinition", _Op]:\n """Create an op with the specified parameters from the decorated function.\n\n Ins and outs will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the op's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @op supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async ops will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of op. 
Must be unique within any :py:class:`GraphDefinition`\n using the op.\n description (Optional[str]): Human-readable description of this op. If not provided, and\n the decorated function has docstring, that docstring will be used as the description.\n ins (Optional[Dict[str, In]]):\n Information about the inputs to the op. Information provided here will be combined\n with what can be inferred from the function signature.\n out (Optional[Union[Out, Dict[str, Out]]]):\n Information about the op outputs. Information provided here will be combined with\n what can be inferred from the return type signature if the function does not use yield.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the op matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n code_version (Optional[str]): (Experimental) Version of the logic encapsulated by the op. If set,\n this is used as a default version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def hello_world():\n print('hello')\n\n @op\n def echo(msg: str) -> str:\n return msg\n\n @op(\n ins={'msg': In(str)},\n out=Out(str)\n )\n def echo_2(msg): # same as above\n return msg\n\n @op(\n out={'word': Out(), 'num': Out()}\n )\n def multi_out() -> Tuple[str, int]:\n return 'cool', 4\n """\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n\n if compute_fn is not None:\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Op()(compute_fn)\n\n return _Op(\n name=name,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n code_version=code_version,\n retry_policy=retry_policy,\n ins=ins,\n out=out,\n )
\n\n\nclass DecoratedOpFunction(NamedTuple):\n """Wrapper around the decorated op function to provide commonly used util methods."""\n\n decorated_fn: Callable[..., Any]\n\n @property\n def name(self):\n return self.decorated_fn.__name__\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return is_context_provided(get_function_params(self.decorated_fn))\n\n def get_context_arg(self) -> Parameter:\n if self.has_context_arg():\n return get_function_params(self.decorated_fn)[0]\n check.failed("Requested context arg on function that does not have one")\n\n @lru_cache(maxsize=1)\n def _get_function_params(self) -> Sequence[Parameter]:\n return get_function_params(self.decorated_fn)\n\n def has_config_arg(self) -> bool:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return True\n\n return False\n\n def get_config_arg(self) -> Parameter:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return param\n\n check.failed("Requested config arg on function that does not have one")\n\n def get_resource_args(self) -> Sequence[Parameter]:\n return get_resource_args(self.decorated_fn)\n\n def positional_inputs(self) -> Sequence[str]:\n params = self._get_function_params()\n input_args = params[1:] if self.has_context_arg() else params\n resource_arg_names = [arg.name for arg in self.get_resource_args()]\n input_args_filtered = [\n input_arg\n for input_arg in input_args\n if input_arg.name != "config" and input_arg.name not in resource_arg_names\n ]\n return positional_arg_name_list(input_args_filtered)\n\n def has_var_kwargs(self) -> bool:\n params = self._get_function_params()\n # var keyword arg has to be the last argument\n return len(params) > 0 and param_is_var_keyword(params[-1])\n\n def get_output_annotation(self) -> Any:\n from ..inference import infer_output_props\n\n return infer_output_props(self.decorated_fn).annotation\n\n\nclass NoContextDecoratedOpFunction(DecoratedOpFunction):\n 
"""Wrapper around a decorated op function, when the decorator does not permit a context\n parameter.\n """\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return False\n\n\ndef is_context_provided(params: Sequence[Parameter]) -> bool:\n if len(params) == 0:\n return False\n return params[0].name in get_valid_name_permutations("context")\n\n\ndef resolve_checked_op_fn_inputs(\n decorator_name: str,\n fn_name: str,\n compute_fn: DecoratedOpFunction,\n explicit_input_defs: Sequence[InputDefinition],\n exclude_nothing: bool,\n) -> Sequence[InputDefinition]:\n """Validate provided input definitions and infer the remaining from the type signature of the compute_fn.\n Returns the resolved set of InputDefinitions.\n\n Args:\n decorator_name (str): Name of the decorator that is wrapping the op function.\n fn_name (str): Name of the decorated function.\n compute_fn (DecoratedOpFunction): The decorated function, wrapped in the\n DecoratedOpFunction wrapper.\n explicit_input_defs (List[InputDefinition]): The input definitions that were explicitly\n provided in the decorator.\n exclude_nothing (bool): True if Nothing type inputs should be excluded from compute_fn\n arguments.\n """\n explicit_names = set()\n if exclude_nothing:\n explicit_names = set(\n inp.name\n for inp in explicit_input_defs\n if not inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n nothing_names = set(\n inp.name\n for inp in explicit_input_defs\n if inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n else:\n explicit_names = set(inp.name for inp in explicit_input_defs)\n nothing_names = set()\n\n params = get_function_params(compute_fn.decorated_fn)\n\n input_args = params[1:] if compute_fn.has_context_arg() else params\n\n # filter out config arg\n resource_arg_names = {arg.name for arg in compute_fn.get_resource_args()}\n explicit_names = explicit_names - resource_arg_names\n\n if compute_fn.has_config_arg() or resource_arg_names:\n new_input_args = []\n for input_arg in 
input_args:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n input_args = new_input_args\n\n # Validate input arguments\n used_inputs = set()\n inputs_to_infer = set()\n has_kwargs = False\n\n for param in cast(List[Parameter], input_args):\n if param.kind == Parameter.VAR_KEYWORD:\n has_kwargs = True\n elif param.kind == Parameter.VAR_POSITIONAL:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has positional vararg parameter "\n f"'{param}'. {decorator_name} decorated functions should only have keyword "\n "arguments that match input names and, if system information is required, a first "\n "positional parameter named 'context'."\n )\n\n else:\n if param.name not in explicit_names:\n if param.name in nothing_names:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has parameter"\n f" '{param.name}' that is one of the input_defs of type 'Nothing' which"\n " should not be included since no data will be passed for it. "\n )\n else:\n inputs_to_infer.add(param.name)\n\n else:\n used_inputs.add(param.name)\n\n undeclared_inputs = explicit_names - used_inputs\n if not has_kwargs and undeclared_inputs:\n undeclared_inputs_printed = ", '".join(undeclared_inputs)\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function does not have argument(s)"\n f" '{undeclared_inputs_printed}'. {decorator_name}-decorated functions should have a"\n " keyword argument for each of their Ins, except for Ins that have the Nothing"\n " dagster_type. 
Alternatively, they can accept **kwargs."\n )\n\n inferred_props = {\n inferred.name: inferred\n for inferred in infer_input_props(compute_fn.decorated_fn, compute_fn.has_context_arg())\n }\n input_defs = []\n for input_def in explicit_input_defs:\n if input_def.name in inferred_props:\n # combine any information missing on the explicit def that can be inferred\n input_defs.append(input_def.combine_with_inferred(inferred_props[input_def.name]))\n else:\n # pass through those that don't have any inference info, such as Nothing type inputs\n input_defs.append(input_def)\n\n # build defs from the inferred props for those without explicit entries\n inferred_input_defs = [\n InputDefinition.create_from_inferred(inferred)\n for inferred in inferred_props.values()\n if inferred.name in inputs_to_infer\n ]\n\n if exclude_nothing:\n for in_def in inferred_input_defs:\n if in_def.dagster_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input parameter {in_def.name} is annotated with"\n f" {in_def.dagster_type.display_name} which is a type that represents passing"\n " no data. This type must be used via In() and no parameter should be included"\n f" in the {decorator_name} decorated function."\n )\n\n input_defs.extend(inferred_input_defs)\n\n return input_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/op_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.op_decorator"}, "repository_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.repository_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.metadata import (\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..asset_checks import AssetChecksDefinition\nfrom ..executor_definition import ExecutorDefinition\nfrom ..graph_definition import GraphDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom ..repository_definition import (\n    VALID_REPOSITORY_DATA_DICT_KEYS,\n    CachingRepositoryData,\n    PendingRepositoryDefinition,\n    PendingRepositoryListDefinition,\n    RepositoryData,\n    RepositoryDefinition,\n    RepositoryListDefinition,\n)\nfrom ..schedule_definition import ScheduleDefinition\nfrom ..sensor_definition import SensorDefinition\nfrom ..unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nT = TypeVar("T")\n\nRepositoryDictSpec: TypeAlias = Dict[str, Dict[str, RepositoryListDefinition]]\n\n\ndef _flatten(items: Iterable[Union[T, List[T]]]) -> Iterator[T]:\n    for x in items:\n        if isinstance(x, List):\n            # switch to `yield from _flatten(x)` to support multiple layers of nesting\n            yield from x\n        else:\n            yield x\n\n\nclass _Repository:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        metadata: Optional[Dict[str, RawMetadataValue]] = None,\n        default_executor_def: 
Optional[ExecutorDefinition] = None,\n        default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n        resource_key_mapping: Optional[Mapping[int, str]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.metadata = normalize_metadata(\n            check.opt_mapping_param(metadata, "metadata", key_type=str)\n        )\n        self.default_executor_def = check.opt_inst_param(\n            default_executor_def, "default_executor_def", ExecutorDefinition\n        )\n        self.default_logger_defs = check.opt_mapping_param(\n            default_logger_defs, "default_logger_defs", key_type=str, value_type=LoggerDefinition\n        )\n        self.top_level_resources = check.opt_mapping_param(\n            top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n        )\n        self.resource_key_mapping = check.opt_mapping_param(\n            resource_key_mapping, "resource_key_mapping", key_type=int, value_type=str\n        )\n\n    @overload\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[RepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> RepositoryDefinition: ...\n\n    @overload\n    def __call__(\n        self, fn: Callable[[], Sequence[PendingRepositoryListDefinition]]\n    ) -> PendingRepositoryDefinition: ...\n\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[PendingRepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n        from dagster._core.definitions import AssetsDefinition, SourceAsset\n        from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        repository_definitions = fn()\n\n        repository_data: Optional[Union[CachingRepositoryData, RepositoryData]]\n        if isinstance(repository_definitions, list):\n            bad_defns = []\n            repository_defns = []\n            defer_repository_data = False\n            for i, definition in enumerate(_flatten(repository_definitions)):\n                if isinstance(definition, CacheableAssetsDefinition):\n                    defer_repository_data = True\n                elif not isinstance(\n                    definition,\n                    (\n                        JobDefinition,\n                        ScheduleDefinition,\n                        UnresolvedPartitionedAssetScheduleDefinition,\n                        SensorDefinition,\n                        GraphDefinition,\n                        AssetsDefinition,\n                        SourceAsset,\n                        UnresolvedAssetJobDefinition,\n                        AssetChecksDefinition,\n                    ),\n                ):\n                    bad_defns.append((i, type(definition)))\n                else:\n                    repository_defns.append(definition)\n\n            if bad_defns:\n                bad_definitions_str = ", ".join(\n                    [f"value of type {type_} at index {i}" for i, type_ in bad_defns]\n                )\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: all elements of list "\n                    "must be of type JobDefinition, GraphDefinition, "\n                    "ScheduleDefinition, SensorDefinition, "\n                    "AssetsDefinition, SourceAsset, or AssetChecksDefinition."\n                    f"Got {bad_definitions_str}."\n                )\n\n            repository_data = (\n                None\n                if 
defer_repository_data\n                else CachingRepositoryData.from_list(\n                    repository_defns,\n                    default_executor_def=self.default_executor_def,\n                    default_logger_defs=self.default_logger_defs,\n                    top_level_resources=self.top_level_resources,\n                    resource_key_mapping=self.resource_key_mapping,\n                )\n            )\n\n        elif isinstance(repository_definitions, dict):\n            if not set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS):\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: dict must not contain "\n                    "keys other than {{'schedules', 'sensors', 'jobs'}}: found "\n                    "{bad_keys}".format(\n                        bad_keys=", ".join(\n                            [\n                                f"'{key}'"\n                                for key in repository_definitions.keys()\n                                if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n                            ]\n                        )\n                    )\n                )\n            repository_data = CachingRepositoryData.from_dict(repository_definitions)\n        elif isinstance(repository_definitions, RepositoryData):\n            repository_data = repository_definitions\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Bad return value of type {type_} from repository construction function: must "\n                "return list, dict, or RepositoryData. 
See the @repository decorator docstring for "\n                "details and examples".format(type_=type(repository_definitions)),\n            )\n\n        if isinstance(repository_definitions, list) and repository_data is None:\n            return PendingRepositoryDefinition(\n                self.name,\n                repository_definitions=list(_flatten(repository_definitions)),\n                description=self.description,\n                metadata=self.metadata,\n                default_executor_def=self.default_executor_def,\n                default_logger_defs=self.default_logger_defs,\n                _top_level_resources=self.top_level_resources,\n            )\n        else:\n            repository_def = RepositoryDefinition(\n                name=self.name,\n                description=self.description,\n                metadata=self.metadata,\n                repository_data=repository_data,\n            )\n\n            update_wrapper(repository_def, fn)\n            return repository_def\n\n\n@overload\ndef repository(\n    definitions_fn: Union[\n        Callable[[], Sequence[RepositoryListDefinition]], Callable[[], RepositoryDictSpec]\n    ],\n) -> RepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    definitions_fn: Callable[..., Sequence[PendingRepositoryListDefinition]]\n) -> PendingRepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    metadata: Optional[Dict[str, RawMetadataValue]] = ...,\n    default_executor_def: Optional[ExecutorDefinition] = ...,\n    default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = ...,\n    _resource_key_mapping: Optional[Mapping[int, str]] = ...,\n) -> _Repository: ...\n\n\n
[docs]def repository(\n definitions_fn: Optional[\n Union[\n Callable[[], Sequence[PendingRepositoryListDefinition]],\n Callable[[], RepositoryDictSpec],\n ]\n ] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition, _Repository]:\n """Create a repository from the decorated function.\n\n The decorated function should take no arguments and its return value should one of:\n\n 1. ``List[Union[JobDefinition, ScheduleDefinition, SensorDefinition]]``.\n Use this form when you have no need to lazy load jobs or other definitions. This is the\n typical use case.\n\n 2. A dict of the form:\n\n .. code-block:: python\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n 'sensors': Dict[str, Callable[[], SensorDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n\n 3. A :py:class:`RepositoryData`. Return this object if you need fine-grained\n control over the construction and indexing of definitions within the repository, e.g., to\n create definitions dynamically from .yaml files in a directory.\n\n Args:\n name (Optional[str]): The name of the repository. 
Defaults to the name of the decorated\n function.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[Dict[str, RawMetadataValue]]): Arbitrary metadata for the repository.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n\n Example:\n .. code-block:: python\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n ######################################################################\n\n @op(config_schema={n: Field(Int)})\n def return_n(context):\n return context.op_config['n']\n\n @job\n def simple_job():\n return_n()\n\n @job\n def some_job():\n ...\n\n @sensor(job=some_job)\n def some_sensor():\n if foo():\n yield RunRequest(\n run_key= ...,\n run_config={\n 'ops': {'return_n': {'config': {'n': bar()}}}\n }\n )\n\n @job\n def my_job():\n ...\n\n my_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n @repository\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n # and custom metadata that will be displayed in the UI\n ######################################################################\n\n ...\n\n @repository(\n name='my_repo',\n metadata={\n 'team': 'Team A',\n 'repository_version': '1.2.3',\n 'environment': 'production',\n })\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A lazy-loaded repository\n ######################################################################\n\n def make_expensive_job():\n @job\n def expensive_job():\n for i in range(10000):\n return_n.alias(f'return_n_{i}')()\n\n return expensive_job\n\n 
def make_expensive_schedule():\n @job\n def other_expensive_job():\n for i in range(11000):\n return_n.alias(f'my_return_n_{i}')()\n\n return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n @repository\n def lazy_loaded_repository():\n return {\n 'jobs': {'expensive_job': make_expensive_job},\n 'schedules': {'expensive_schedule': make_expensive_schedule}\n }\n\n\n ######################################################################\n # A complex repository that lazily constructs jobs from a directory\n # of files in a bespoke YAML format\n ######################################################################\n\n class ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_all_jobs(self):\n return [\n self._construct_job_def_from_yaml_file(\n self._yaml_file_for_job_name(file_name)\n )\n for file_name in os.listdir(self._yaml_directory)\n ]\n\n ...\n\n @repository\n def complex_repository():\n return ComplexRepositoryData('some_directory')\n """\n if definitions_fn is not None:\n check.invariant(description is None)\n check.invariant(len(get_function_params(definitions_fn)) == 0)\n\n return _Repository()(definitions_fn)\n\n return _Repository(\n name=name,\n description=description,\n metadata=metadata,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=_top_level_resources,\n resource_key_mapping=_resource_key_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/repository_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.repository_decorator"}, "schedule_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.schedule_decorator

\nimport copy\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.sensor_definition import get_context_param_name\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._utils import ensure_gen\n\nfrom ..run_request import RunRequest, SkipReason\nfrom ..schedule_definition import (\n    DecoratedScheduleFunction,\n    DefaultScheduleStatus,\n    RawScheduleEvaluationFunction,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n    has_at_least_one_parameter,\n    validate_and_get_schedule_resource_dict,\n)\nfrom ..target import ExecutableDefinition\nfrom ..utils import validate_tags\n\n\n
[docs]def schedule(\n cron_schedule: Union[str, Sequence[str]],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]] = None,\n should_execute: Optional[Callable[[ScheduleEvaluationContext], bool]] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawScheduleEvaluationFunction], ScheduleDefinition]:\n """Creates a schedule following the provided cron schedule and requests runs for the provided job.\n\n The decorated function takes in a :py:class:`~dagster.ScheduleEvaluationContext` as its only\n argument, and does one of the following:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Return a run config dictionary.\n 6. Yield a `SkipReason` or yield one ore more `RunRequest` objects.\n\n Returns a :py:class:`~dagster.ScheduleDefinition`.\n\n Args:\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. 
If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n name (Optional[str]): The name of the schedule to create.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A function\n that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs at\n schedule execution time to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n that should execute when this schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def inner(fn: RawScheduleEvaluationFunction) -> ScheduleDefinition:\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n check.callable_param(fn, "fn")\n validate_resource_annotated_function(fn)\n\n schedule_name = name or fn.__name__\n\n validated_tags = None\n\n # perform upfront validation of schedule tags\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n validated_tags = validate_tags(tags, allow_reserved_tags=False)\n\n context_param_name = get_context_param_name(fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n if should_execute:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n "Error occurred during the execution of should_execute for schedule"\n f" {schedule_name}"\n ),\n ):\n if not should_execute(context):\n yield SkipReason(\n f"should_execute function for {schedule_name} returned false."\n )\n return\n resources = validate_and_get_schedule_resource_dict(\n context.resources, schedule_name, resource_arg_names\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the evaluation of schedule {schedule_name}",\n ):\n context_param = {context_param_name: context} if context_param_name else {}\n result = fn(**context_param, **resources)\n\n if isinstance(result, dict):\n # this is the run-config based decorated function, wrap the evaluated run config\n # and tags in a RunRequest\n evaluated_run_config = copy.deepcopy(result)\n evaluated_tags = (\n validated_tags\n or (tags_fn and validate_tags(tags_fn(context), 
allow_reserved_tags=False))\n or None\n )\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n elif isinstance(result, list):\n yield from cast(List[RunRequest], result)\n else:\n # this is a run-request based decorated function\n yield from cast(RunRequestIterator, ensure_gen(result))\n\n has_context_arg = has_at_least_one_parameter(fn)\n evaluation_fn = DecoratedScheduleFunction(\n decorated_fn=fn,\n wrapped_fn=_wrapped_fn,\n has_context_arg=has_context_arg,\n )\n\n schedule_def = ScheduleDefinition.dagster_internal_init(\n name=schedule_name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n description=description,\n execution_fn=evaluation_fn,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n run_config=None, # cannot supply run_config or run_config_fn to decorator\n run_config_fn=None,\n tags=None, # cannot supply tags or tags_fn to decorator\n tags_fn=None,\n should_execute=None, # already encompassed in evaluation_fn\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n\n return schedule_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/schedule_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.schedule_decorator"}, "sensor_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.sensor_decorator

\nimport collections.abc\nimport inspect\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\n\nfrom ...errors import DagsterInvariantViolationError\nfrom ..asset_sensor_definition import AssetSensorDefinition\nfrom ..events import AssetKey\nfrom ..multi_asset_sensor_definition import (\n    AssetMaterializationFunction,\n    MultiAssetMaterializationFunction,\n    MultiAssetSensorDefinition,\n)\nfrom ..run_request import SensorResult\nfrom ..sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunction,\n    RunRequest,\n    SensorDefinition,\n    SkipReason,\n)\nfrom ..target import ExecutableDefinition\n\n\n
[docs]def sensor(\n job_name: Optional[str] = None,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawSensorEvaluationFunction], SensorDefinition]:\n """Creates a sensor where the decorated function is used as the sensor's evaluation function.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n Args:\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]):\n The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: RawSensorEvaluationFunction) -> SensorDefinition:\n check.callable_param(fn, "fn")\n\n sensor_def = SensorDefinition.dagster_internal_init(\n name=name,\n job_name=job_name,\n evaluation_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n update_wrapper(sensor_def, wrapped=fn)\n\n return sensor_def\n\n return inner
\n\n\n
[docs]def asset_sensor(\n asset_key: AssetKey,\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[AssetMaterializationFunction,], AssetSensorDefinition,]:\n """Creates an asset sensor where the decorated function is used as the asset sensor's evaluation\n function.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext` and an EventLogEntry corresponding to an\n AssetMaterialization event.\n\n Args:\n asset_key (AssetKey): The asset_key this sensor monitors.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n\n Example:\n .. code-block:: python\n\n from dagster import AssetKey, EventLogEntry, SensorEvaluationContext, asset_sensor\n\n\n @asset_sensor(asset_key=AssetKey("my_table"), job=my_job)\n def my_asset_sensor(context: SensorEvaluationContext, asset_event: EventLogEntry):\n return RunRequest(\n run_key=context.cursor,\n run_config={\n "ops": {\n "read_materialization": {\n "config": {\n "asset_key": asset_event.dagster_event.asset_key.path,\n }\n }\n }\n },\n )\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: AssetMaterializationFunction) -> AssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(*args, **kwargs) -> Any:\n result = fn(*args, **kwargs)\n\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (RunRequest, SkipReason)):\n yield result\n\n elif isinstance(result, SensorResult):\n if result.cursor:\n raise DagsterInvariantViolationError(\n f"Error in asset sensor {sensor_name}: Sensor returned a SensorResult"\n " with a cursor value. 
The cursor is managed by the asset sensor and"\n " should not be modified by a user."\n )\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{result} of type {type(result)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n # Preserve any resource arguments from the underlying function, for when we inspect the\n # wrapped function later on\n _wrapped_fn = update_wrapper(_wrapped_fn, wrapped=fn)\n\n return AssetSensorDefinition(\n name=sensor_name,\n asset_key=asset_key,\n job_name=job_name,\n asset_materialization_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n return inner
\n\n\n
[docs]@experimental\ndef multi_asset_sensor(\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[MultiAssetMaterializationFunction,], MultiAssetSensorDefinition,]:\n """Creates an asset sensor that can monitor multiple assets.\n\n The decorated function is used as the asset sensor's evaluation\n function. The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n Args:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets this\n sensor monitors. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n if not isinstance(monitored_assets, AssetSelection) and not (\n isinstance(monitored_assets, collections.abc.Sequence)\n and all(isinstance(el, AssetKey) for el in monitored_assets)\n ):\n check.failed(\n "The value passed to monitored_assets param must be either an AssetSelection"\n f" or a Sequence of AssetKeys, but was a {type(monitored_assets)}"\n )\n\n def inner(fn: MultiAssetMaterializationFunction) -> MultiAssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n sensor_def = MultiAssetSensorDefinition(\n name=sensor_name,\n monitored_assets=monitored_assets,\n job_name=job_name,\n asset_materialization_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n request_assets=request_assets,\n required_resource_keys=required_resource_keys,\n )\n update_wrapper(sensor_def, wrapped=fn)\n return sensor_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/sensor_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.sensor_decorator"}}, "definitions_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.definitions_class

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._config.pythonic_config import (\n    attach_resource_id_to_key_mapping,\n)\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.asset_graph import InternalAssetGraph\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._core.execution.with_resources import with_resources\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils.cached_method import cached_method\n\nfrom .assets import AssetsDefinition, SourceAsset\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .decorators import repository\nfrom .job_definition import JobDefinition, default_job_io_manager\nfrom .partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom .repository_definition import (\n    SINGLETON_REPOSITORY_NAME,\n    PendingRepositoryDefinition,\n    RepositoryDefinition,\n)\nfrom .schedule_definition import ScheduleDefinition\nfrom .sensor_definition import SensorDefinition\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n
[docs]@public\n@experimental\ndef create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """Create a named repository using the same arguments as :py:class:`Definitions`. In older\n versions of Dagster, repositories were the mechanism for organizing assets, schedules, sensors,\n and jobs. There could be many repositories per code location. This was a complicated ontology but\n gave users a way to organize code locations that contained large numbers of heterogenous definitions.\n\n As a stopgap for those who both want to 1) use the new :py:class:`Definitions` API and 2) but still\n want multiple logical groups of assets in the same code location, we have introduced this function.\n\n Example usage:\n\n .. code-block:: python\n\n named_repo = create_repository_using_definitions_args(\n name="a_repo",\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n }\n )\n\n """\n return _create_repository_using_definitions_args(\n name=name,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )
\n\n\nclass _AttachedObjects(NamedTuple):\n jobs: Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]\n schedules: Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n sensors: Iterable[SensorDefinition]\n\n\ndef _io_manager_needs_replacement(job: JobDefinition, resource_defs: Mapping[str, Any]) -> bool:\n """Explicitly replace the default IO manager in jobs that don't specify one, if a top-level\n I/O manager is provided to Definitions.\n """\n return (\n job.resource_defs.get("io_manager") == default_job_io_manager\n and "io_manager" in resource_defs\n )\n\n\ndef _jobs_which_will_have_io_manager_replaced(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n resource_defs: Mapping[str, Any],\n) -> List[Union[JobDefinition, UnresolvedAssetJobDefinition]]:\n """Returns whether any jobs will have their I/O manager replaced by an `io_manager` override from\n the top-level `resource_defs` provided to `Definitions` in 1.3. We will warn users if this is\n the case.\n """\n jobs = jobs or []\n return [\n job\n for job in jobs\n if isinstance(job, JobDefinition) and _io_manager_needs_replacement(job, resource_defs)\n ]\n\n\ndef _attach_resources_to_jobs_and_instigator_jobs(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ],\n sensors: Optional[Iterable[SensorDefinition]],\n resource_defs: Mapping[str, Any],\n) -> _AttachedObjects:\n """Given a list of jobs, schedules, and sensors along with top-level resource definitions,\n attach the resource definitions to the jobs, schedules, and sensors which require them.\n """\n jobs = jobs or []\n schedules = schedules or []\n sensors = sensors or []\n\n # Add jobs in schedules and sensors as well\n jobs = [\n *jobs,\n *[\n schedule.job\n for schedule in schedules\n if isinstance(schedule, ScheduleDefinition)\n and 
schedule.has_loadable_target()\n and isinstance(schedule.job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n *[\n job\n for sensor in sensors\n if sensor.has_loadable_targets()\n for job in sensor.jobs\n if isinstance(job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n ]\n # Dedupe\n jobs = list({id(job): job for job in jobs}.values())\n\n # Find unsatisfied jobs\n unsatisfied_jobs = [\n job\n for job in jobs\n if isinstance(job, JobDefinition)\n and (\n job.is_missing_required_resources() or _io_manager_needs_replacement(job, resource_defs)\n )\n ]\n\n # Create a mapping of job id to a version of the job with the resource defs bound\n unsatisfied_job_to_resource_bound_job = {\n id(job): job.with_top_level_resources(\n {\n **resource_defs,\n **job.resource_defs,\n # special case for IO manager - the job-level IO manager does not take precedence\n # if it is the default and a top-level IO manager is provided\n **(\n {"io_manager": resource_defs["io_manager"]}\n if _io_manager_needs_replacement(job, resource_defs)\n else {}\n ),\n }\n )\n for job in jobs\n if job in unsatisfied_jobs\n }\n\n # Update all jobs to use the resource bound version\n jobs_with_resources = [\n unsatisfied_job_to_resource_bound_job[id(job)] if job in unsatisfied_jobs else job\n for job in jobs\n ]\n\n # Update all schedules and sensors to use the resource bound version\n updated_schedules = [\n (\n schedule.with_updated_job(unsatisfied_job_to_resource_bound_job[id(schedule.job)])\n if (\n isinstance(schedule, ScheduleDefinition)\n and schedule.has_loadable_target()\n and schedule.job in unsatisfied_jobs\n )\n else schedule\n )\n for schedule in schedules\n ]\n updated_sensors = [\n (\n sensor.with_updated_jobs(\n [\n (\n unsatisfied_job_to_resource_bound_job[id(job)]\n if job in unsatisfied_jobs\n else job\n )\n for job in sensor.jobs\n ]\n )\n if sensor.has_loadable_targets() and any(job in unsatisfied_jobs for job in sensor.jobs)\n else sensor\n )\n for sensor in sensors\n 
]\n\n return _AttachedObjects(jobs_with_resources, updated_schedules, updated_sensors)\n\n\ndef _create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n):\n check.opt_iterable_param(\n assets, "assets", (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)\n )\n check.opt_iterable_param(\n schedules, "schedules", (ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition)\n )\n check.opt_iterable_param(sensors, "sensors", SensorDefinition)\n check.opt_iterable_param(jobs, "jobs", (JobDefinition, UnresolvedAssetJobDefinition))\n\n check.opt_inst_param(executor, "executor", (ExecutorDefinition, Executor))\n executor_def = (\n executor\n if isinstance(executor, ExecutorDefinition) or executor is None\n else ExecutorDefinition.hardcoded_executor(executor)\n )\n\n # Generate a mapping from each top-level resource instance ID to its resource key\n resource_key_mapping = {id(v): k for k, v in resources.items()} if resources else {}\n\n # Provide this mapping to each resource instance so that it can be used to resolve\n # nested resources\n resources_with_key_mapping = (\n {\n k: attach_resource_id_to_key_mapping(v, resource_key_mapping)\n for k, v in resources.items()\n }\n if resources\n else {}\n )\n\n resource_defs = wrap_resources_for_execution(resources_with_key_mapping)\n\n check.opt_mapping_param(loggers, "loggers", key_type=str, 
value_type=LoggerDefinition)\n\n # Binds top-level resources to jobs and any jobs attached to schedules or sensors\n (\n jobs_with_resources,\n schedules_with_resources,\n sensors_with_resources,\n ) = _attach_resources_to_jobs_and_instigator_jobs(jobs, schedules, sensors, resource_defs)\n\n @repository(\n name=name,\n default_executor_def=executor_def,\n default_logger_defs=loggers,\n _top_level_resources=resource_defs,\n _resource_key_mapping=resource_key_mapping,\n )\n def created_repo():\n return [\n *with_resources(assets or [], resource_defs),\n *with_resources(asset_checks or [], resource_defs),\n *(schedules_with_resources),\n *(sensors_with_resources),\n *(jobs_with_resources),\n ]\n\n return created_repo\n\n\n@deprecated(\n breaking_version="2.0",\n additional_warn_text=(\n "Instantiations can be removed. Since it's behavior is now the default, this class is now a"\n " no-op."\n ),\n)\nclass BindResourcesToJobs(list):\n """Used to instruct Dagster to bind top-level resources to jobs and any jobs attached to schedules\n and sensors. Now deprecated since this behavior is the default.\n """\n\n\n
[docs]class Definitions:\n """A set of definitions explicitly available and loadable by Dagster tools.\n\n Parameters:\n assets (Optional[Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]]):\n A list of assets. Assets can be created by annotating\n a function with :py:func:`@asset <asset>` or\n :py:func:`@observable_source_asset <observable_source_asset>`.\n Or they can by directly instantiating :py:class:`AssetsDefinition`,\n :py:class:`SourceAsset`, or :py:class:`CacheableAssetsDefinition`.\n\n asset_checks (Optional[Iterable[AssetChecksDefinition]]):\n A list of asset checks.\n\n schedules (Optional[Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]]):\n List of schedules.\n\n sensors (Optional[Iterable[SensorDefinition]]):\n List of sensors, typically created with :py:func:`@sensor <sensor>`.\n\n jobs (Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]]):\n List of jobs. Typically created with :py:func:`define_asset_job <define_asset_job>`\n or with :py:func:`@job <job>` for jobs defined in terms of ops directly.\n Jobs created with :py:func:`@job <job>` must already have resources bound\n at job creation time. They do not respect the `resources` argument here.\n\n resources (Optional[Mapping[str, Any]]): Dictionary of resources to bind to assets.\n The resources dictionary takes raw Python objects,\n not just instances of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n These resources will be automatically bound\n to any assets passed to this Definitions instance using\n :py:func:`with_resources <with_resources>`. 
Assets passed to Definitions with\n resources already bound using :py:func:`with_resources <with_resources>` will\n override this dictionary.\n\n executor (Optional[Union[ExecutorDefinition, Executor]]):\n Default executor for jobs. Individual jobs can override this and define their own executors\n by setting the executor on :py:func:`@job <job>` or :py:func:`define_asset_job <define_asset_job>`\n explicitly. This executor will also be used for materializing assets directly\n outside of the context of jobs. If an :py:class:`Executor` is passed, it is coerced into\n an :py:class:`ExecutorDefinition`.\n\n loggers (Optional[Mapping[str, LoggerDefinition]):\n Default loggers for jobs. Individual jobs\n can define their own loggers by setting them explictly.\n\n Example usage:\n\n .. code-block:: python\n\n defs = Definitions(\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n },\n asset_checks=[asset_one_check_one]\n )\n\n Dagster separates user-defined code from system tools such the web server and\n the daemon. Rather than loading code directly into process, a tool such as the\n webserver interacts with user-defined code over a serialization boundary.\n\n These tools must be able to locate and load this code when they start. 
Via CLI\n arguments or config, they specify a Python module to inspect.\n\n A Python module is loadable by Dagster tools if there is a top-level variable\n that is an instance of :py:class:`Definitions`.\n\n Before the introduction of :py:class:`Definitions`,\n :py:func:`@repository <repository>` was the API for organizing defintions.\n :py:class:`Definitions` provides a few conveniences for dealing with resources\n that do not apply to old-style :py:func:`@repository <repository>` declarations:\n\n * It takes a dictionary of top-level resources which are automatically bound\n (via :py:func:`with_resources <with_resources>`) to any asset passed to it.\n If you need to apply different resources to different assets, use legacy\n :py:func:`@repository <repository>` and use\n :py:func:`with_resources <with_resources>` as before.\n * The resources dictionary takes raw Python objects, not just instances\n of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n """\n\n def __init__(\n self,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n ):\n self._created_pending_or_normal_repo = _create_repository_using_definitions_args(\n name=SINGLETON_REPOSITORY_NAME,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n 
executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )\n\n
[docs] @public\n def get_job_def(self, name: str) -> JobDefinition:\n """Get a job definition by name. If you passed in a an :py:class:`UnresolvedAssetJobDefinition`\n (return value of :py:func:`define_asset_job`) it will be resolved to a :py:class:`JobDefinition` when returned\n from this function.\n """\n check.str_param(name, "name")\n return self.get_repository_def().get_job(name)
\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_sensor_def(name)
\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_schedule_def(name)
\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.Definitions.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n\n Returns:\n The contents of an asset as a Python object.\n """\n return self.get_repository_def().load_asset_value(\n asset_key=asset_key,\n python_type=python_type,\n instance=instance,\n partition_key=partition_key,\n metadata=metadata,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with defs.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n """\n return self.get_repository_def().get_asset_value_loader(\n instance=instance,\n )
\n\n def get_all_job_defs(self) -> Sequence[JobDefinition]:\n """Get all the Job definitions in the code location."""\n return self.get_repository_def().get_all_jobs()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n return self.get_repository_def().has_implicit_global_asset_job_def()\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method when there is a single defined global asset job.\n This occurs when all assets in the code location use a single partitioning scheme.\n If there are multiple partitioning schemes you must use get_implicit_job_def_for_assets\n instead to access to the correct implicit asset one.\n """\n return self.get_repository_def().get_implicit_global_asset_job_def()\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n return self.get_repository_def().get_implicit_job_def_for_assets(asset_keys)\n\n def get_assets_def(self, key: CoercibleToAssetKey) -> AssetsDefinition:\n asset_key = AssetKey.from_coercible(key)\n for assets_def in self.get_asset_graph().assets:\n if asset_key in assets_def.keys:\n return assets_def\n\n raise DagsterInvariantViolationError(f"Could not find asset {asset_key}")\n\n @cached_method\n def get_repository_def(self) -> RepositoryDefinition:\n """Definitions is implemented by wrapping RepositoryDefinition. Get that underlying object\n in order to access an functionality which is not exposed on Definitions. 
This method\n also resolves a PendingRepositoryDefinition to a RepositoryDefinition.\n """\n return (\n self._created_pending_or_normal_repo.compute_repository_definition()\n if isinstance(self._created_pending_or_normal_repo, PendingRepositoryDefinition)\n else self._created_pending_or_normal_repo\n )\n\n def get_inner_repository_for_loading_process(\n self,\n ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """This method is used internally to access the inner repository during the loading process\n at CLI entry points. We explicitly do not want to resolve the pending repo because the entire\n point is to defer that resolution until later.\n """\n return self._created_pending_or_normal_repo\n\n def get_asset_graph(self) -> InternalAssetGraph:\n """Get the AssetGraph for this set of definitions."""\n return self.get_repository_def().asset_graph
\n
", "current_page_name": "_modules/dagster/_core/definitions/definitions_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.definitions_class"}, "dependency": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.dependency

\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    DefaultDict,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\nfrom dagster._utils import hash_collection\n\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .output import OutputDefinition\nfrom .utils import DEFAULT_OUTPUT, struct_to_string, validate_tags\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.op_definition import OpDefinition\n\n    from .asset_layer import AssetLayer\n    from .composition import MappedInputPlaceholder\n    from .graph_definition import GraphDefinition\n    from .node_definition import NodeDefinition\n    from .resource_requirement import ResourceRequirement\n\nT_DependencyKey = TypeVar("T_DependencyKey", str, "NodeInvocation")\nDependencyMapping: TypeAlias = Mapping[T_DependencyKey, Mapping[str, "IDependencyDefinition"]]\n\n\n
[docs]class NodeInvocation(\n NamedTuple(\n "Node",\n [\n ("name", PublicAttr[str]),\n ("alias", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, Any]]),\n ("hook_defs", PublicAttr[AbstractSet[HookDefinition]]),\n ("retry_policy", PublicAttr[Optional[RetryPolicy]]),\n ],\n )\n):\n """Identifies an instance of a node in a graph dependency structure.\n\n Args:\n name (str): Name of the node of which this is an instance.\n alias (Optional[str]): Name specific to this instance of the node. Necessary when there are\n multiple instances of the same node.\n tags (Optional[Dict[str, Any]]): Optional tags values to extend or override those\n set on the node definition.\n hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n node instance.\n\n Examples:\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n from dagster import job\n\n @job\n def my_job():\n other_name = some_op.alias('other_name')\n some_graph(other_name(some_op))\n\n """\n\n def __new__(\n cls,\n name: str,\n alias: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n alias=check.opt_str_param(alias, "alias"),\n tags=check.opt_mapping_param(tags, "tags", value_type=str, key_type=str),\n hook_defs=check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition),\n retry_policy=check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy),\n )\n\n # Needs to be hashable because this class is used as a key in dependencies dicts\n def __hash__(self) -> int:\n if not hasattr(self, "_hash"):\n self._hash = hash_collection(self)\n return self._hash
\n\n\nclass Node(ABC):\n """Node invocation within a graph. Identified by its name inside the graph."""\n\n name: str\n definition: "NodeDefinition"\n graph_definition: "GraphDefinition"\n _additional_tags: Mapping[str, str]\n _hook_defs: AbstractSet[HookDefinition]\n _retry_policy: Optional[RetryPolicy]\n _inputs: Mapping[str, "NodeInput"]\n _outputs: Mapping[str, "NodeOutput"]\n\n def __init__(\n self,\n name: str,\n definition: "NodeDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n from .node_definition import NodeDefinition\n\n self.name = check.str_param(name, "name")\n self.definition = check.inst_param(definition, "definition", NodeDefinition)\n self.graph_definition = check.inst_param(\n graph_definition,\n "graph_definition",\n GraphDefinition,\n )\n self._additional_tags = validate_tags(tags)\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n self._inputs = {\n name: NodeInput(self, input_def)\n for name, input_def in self.definition.input_dict.items()\n }\n self._outputs = {\n name: NodeOutput(self, output_def)\n for name, output_def in self.definition.output_dict.items()\n }\n\n def inputs(self) -> Iterable["NodeInput"]:\n return self._inputs.values()\n\n def outputs(self) -> Iterable["NodeOutput"]:\n return self._outputs.values()\n\n def get_input(self, name: str) -> "NodeInput":\n check.str_param(name, "name")\n return self._inputs[name]\n\n def get_output(self, name: str) -> "NodeOutput":\n check.str_param(name, "name")\n return self._outputs[name]\n\n def has_input(self, name: str) -> bool:\n return self.definition.has_input(name)\n\n def input_def_named(self, name: str) -> InputDefinition:\n return 
self.definition.input_def_named(name)\n\n def has_output(self, name: str) -> bool:\n return self.definition.has_output(name)\n\n def output_def_named(self, name: str) -> OutputDefinition:\n return self.definition.output_def_named(name)\n\n @property\n def input_dict(self) -> Mapping[str, InputDefinition]:\n return self.definition.input_dict\n\n @property\n def output_dict(self) -> Mapping[str, OutputDefinition]:\n return self.definition.output_dict\n\n @property\n def tags(self) -> Mapping[str, str]:\n return {**self.definition.tags, **self._additional_tags}\n\n def container_maps_input(self, input_name: str) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n is not None\n )\n\n def container_mapped_input(self, input_name: str) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n InputPointer(self.name, input_name)\n )\n if mapping is None:\n check.failed(\n f"container does not map input {input_name}, check container_maps_input first"\n )\n return mapping\n\n def container_maps_fan_in_input(self, input_name: str, fan_in_index: int) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n is not None\n )\n\n def container_mapped_fan_in_input(self, input_name: str, fan_in_index: int) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n if mapping is None:\n check.failed(\n f"container does not map fan-in {input_name} idx {fan_in_index}, check "\n "container_maps_fan_in_input first"\n )\n\n return mapping\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n return self._retry_policy\n\n @abstractmethod\n def describe_node(self) -> str: ...\n\n @abstractmethod\n def get_resource_requirements(\n self,\n 
outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]: ...\n\n\nclass GraphNode(Node):\n definition: "GraphDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "GraphDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n\n check.inst_param(definition, "definition", GraphDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for node in self.definition.node_dict.values():\n yield from node.get_resource_requirements(\n asset_layer=asset_layer,\n outer_container=self.definition,\n parent_handle=cur_node_handle,\n )\n\n def describe_node(self) -> str:\n return f"graph '{self.name}'"\n\n\nclass OpNode(Node):\n definition: "OpDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "OpDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .op_definition import OpDefinition\n\n check.inst_param(definition, "definition", OpDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n from .resource_requirement import 
InputManagerRequirement\n\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for requirement in self.definition.get_resource_requirements(\n (cur_node_handle, asset_layer)\n ):\n # If requirement is a root input manager requirement, but the corresponding node has an upstream output, then ignore the requirement.\n if (\n isinstance(requirement, InputManagerRequirement)\n and outer_container.dependency_structure.has_deps(\n NodeInput(self, self.definition.input_def_named(requirement.input_name))\n )\n and requirement.root_input\n ):\n continue\n yield requirement\n for hook_def in self.hook_defs:\n yield from hook_def.get_resource_requirements(self.describe_node())\n\n def describe_node(self) -> str:\n return f"op '{self.name}'"\n\n\n@whitelist_for_serdes(storage_name="SolidHandle")\nclass NodeHandle(NamedTuple("_NodeHandle", [("name", str), ("parent", Optional["NodeHandle"])])):\n """A structured object to identify nodes in the potentially recursive graph structure."""\n\n def __new__(cls, name: str, parent: Optional["NodeHandle"]):\n return super(NodeHandle, cls).__new__(\n cls,\n check.str_param(name, "name"),\n check.opt_inst_param(parent, "parent", NodeHandle),\n )\n\n def __str__(self):\n return self.to_string()\n\n @property\n def root(self):\n if self.parent:\n return self.parent.root\n else:\n return self\n\n @property\n def path(self) -> Sequence[str]:\n """Return a list representation of the handle.\n\n Inverse of NodeHandle.from_path.\n\n Returns:\n List[str]:\n """\n path: List[str] = []\n cur = self\n while cur:\n path.append(cur.name)\n cur = cur.parent\n path.reverse()\n return path\n\n def to_string(self) -> str:\n """Return a unique string representation of the handle.\n\n Inverse of NodeHandle.from_string.\n """\n return self.parent.to_string() + "." 
+ self.name if self.parent else self.name\n\n def is_or_descends_from(self, handle: "NodeHandle") -> bool:\n """Check if the handle is or descends from another handle.\n\n Args:\n handle (NodeHandle): The handle to check against.\n\n Returns:\n bool:\n """\n check.inst_param(handle, "handle", NodeHandle)\n\n for idx in range(len(handle.path)):\n if idx >= len(self.path):\n return False\n if self.path[idx] != handle.path[idx]:\n return False\n return True\n\n def pop(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n """Return a copy of the handle with some of its ancestors pruned.\n\n Args:\n ancestor (NodeHandle): Handle to an ancestor of the current handle.\n\n Returns:\n NodeHandle:\n\n Example:\n .. code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('bar', NodeHandle('foo', None))\n assert handle.pop(ancestor) == NodeHandle('baz', None)\n """\n check.inst_param(ancestor, "ancestor", NodeHandle)\n check.invariant(\n self.is_or_descends_from(ancestor),\n f"Handle {self.to_string()} does not descend from {ancestor.to_string()}",\n )\n\n return NodeHandle.from_path(self.path[len(ancestor.path) :])\n\n def with_ancestor(self, ancestor: Optional["NodeHandle"]) -> "NodeHandle":\n """Returns a copy of the handle with an ancestor grafted on.\n\n Args:\n ancestor (NodeHandle): Handle to the new ancestor.\n\n Returns:\n NodeHandle:\n\n Example:\n .. 
code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('quux' None)\n assert handle.with_ancestor(ancestor) == NodeHandle(\n 'baz', NodeHandle('bar', NodeHandle('foo', NodeHandle('quux', None)))\n )\n """\n check.opt_inst_param(ancestor, "ancestor", NodeHandle)\n\n return NodeHandle.from_path([*(ancestor.path if ancestor else []), *self.path])\n\n @staticmethod\n def from_path(path: Sequence[str]) -> "NodeHandle":\n check.sequence_param(path, "path", of_type=str)\n\n cur: Optional["NodeHandle"] = None\n _path = list(path)\n while len(_path) > 0:\n cur = NodeHandle(name=_path.pop(0), parent=cur)\n\n if cur is None:\n check.failed(f"Invalid handle path {path}")\n\n return cur\n\n @staticmethod\n def from_string(handle_str: str) -> "NodeHandle":\n check.str_param(handle_str, "handle_str")\n\n path = handle_str.split(".")\n return NodeHandle.from_path(path)\n\n @classmethod\n def from_dict(cls, dict_repr: Mapping[str, Any]) -> "NodeHandle":\n """This method makes it possible to load a potentially nested NodeHandle after a\n roundtrip through json.loads(json.dumps(NodeHandle._asdict())).\n """\n check.dict_param(dict_repr, "dict_repr", key_type=str)\n check.invariant(\n "name" in dict_repr, "Dict representation of NodeHandle must have a 'name' key"\n )\n check.invariant(\n "parent" in dict_repr, "Dict representation of NodeHandle must have a 'parent' key"\n )\n\n if isinstance(dict_repr["parent"], (list, tuple)):\n parent = NodeHandle.from_dict(\n {\n "name": dict_repr["parent"][0],\n "parent": dict_repr["parent"][1],\n }\n )\n else:\n parent = dict_repr["parent"]\n\n return NodeHandle(name=dict_repr["name"], parent=parent)\n\n\nclass NodeInputHandle(\n NamedTuple("_NodeInputHandle", [("node_handle", NodeHandle), ("input_name", str)])\n):\n """A structured object to uniquely identify inputs in the potentially recursive graph structure."""\n\n\nclass NodeOutputHandle(\n NamedTuple("_NodeOutputHandle", 
[("node_handle", NodeHandle), ("output_name", str)])\n):\n """A structured object to uniquely identify outputs in the potentially recursive graph structure."""\n\n\nclass NodeInput(NamedTuple("_NodeInput", [("node", Node), ("input_def", InputDefinition)])):\n def __new__(cls, node: Node, input_def: InputDefinition):\n return super(NodeInput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(input_def, "input_def", InputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeInput",\n node_name=self.node.name,\n input_name=self.input_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.node.name, self.input_def.name))\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, NodeInput)\n and self.node.name == other.node.name\n and self.input_def.name == other.input_def.name\n )\n\n @property\n def node_name(self) -> str:\n return self.node.name\n\n @property\n def input_name(self) -> str:\n return self.input_def.name\n\n\nclass NodeOutput(NamedTuple("_NodeOutput", [("node", Node), ("output_def", OutputDefinition)])):\n def __new__(cls, node: Node, output_def: OutputDefinition):\n return super(NodeOutput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(output_def, "output_def", OutputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeOutput",\n node_name=self.node.name,\n output_name=self.output_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self) -> int:\n return hash((self.node.name, self.output_def.name))\n\n def __eq__(self, other: Any) -> bool:\n return self.node.name == other.node.name and self.output_def.name == other.output_def.name\n\n def describe(self) -> str:\n return f"{self.node_name}:{self.output_def.name}"\n\n @property\n def 
node_name(self) -> str:\n return self.node.name\n\n @property\n def is_dynamic(self) -> bool:\n return self.output_def.is_dynamic\n\n @property\n def output_name(self) -> str:\n return self.output_def.name\n\n\nclass DependencyType(Enum):\n DIRECT = "DIRECT"\n FAN_IN = "FAN_IN"\n DYNAMIC_COLLECT = "DYNAMIC_COLLECT"\n\n\nclass IDependencyDefinition(ABC):\n @abstractmethod\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n pass\n\n @abstractmethod\n def is_fan_in(self) -> bool:\n """The result passed to the corresponding input will be a List made from different node outputs."""\n\n\n
[docs]class DependencyDefinition(\n NamedTuple(\n "_DependencyDefinition", [("node", str), ("output", str), ("description", Optional[str])]\n ),\n IDependencyDefinition,\n):\n """Represents an edge in the DAG of nodes (ops or graphs) forming a job.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent node and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_b depends on the output named 'result' of\n op_a, and the output named 'other_result' of graph_a, the structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_op', 'result')\n }\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_graph', 'result')\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n node_b(node_a())\n\n\n Args:\n node (str): The name of the node (op or graph) that is depended on, that is, from which the value\n passed between the two nodes originates.\n output (Optional[str]): The name of the output that is depended on. (default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n """\n\n def __new__(\n cls,\n node: str,\n output: str = DEFAULT_OUTPUT,\n description: Optional[str] = None,\n ):\n return super(DependencyDefinition, cls).__new__(\n cls,\n check.str_param(node, "node"),\n check.str_param(output, "output"),\n check.opt_str_param(description, "description"),\n )\n\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return True if the dependency is fan-in (always False for DependencyDefinition)."""\n return False
\n\n def get_op_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]
\n\n\n
[docs]class MultiDependencyDefinition(\n NamedTuple(\n "_MultiDependencyDefinition",\n [\n (\n "dependencies",\n PublicAttr[Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]],\n )\n ],\n ),\n IDependencyDefinition,\n):\n """Represents a fan-in edge in the DAG of op instances forming a job.\n\n This object is used only when an input of type ``List[T]`` is assembled by fanning-in multiple\n upstream outputs of type ``T``.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent ops or graphs and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_c depends on the outputs named 'result' of\n op_a and op_b, this structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'op_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('op_a', 'result'),\n DependencyDefinition('op_b', 'result')\n ]\n )\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. 
code-block:: python\n\n @job\n def the_job():\n op_c(op_a(), op_b())\n\n Args:\n dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]): List of\n upstream dependencies fanned in to this input.\n """\n\n def __new__(\n cls,\n dependencies: Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]],\n ):\n from .composition import MappedInputPlaceholder\n\n deps = check.sequence_param(dependencies, "dependencies")\n seen = {}\n for dep in deps:\n if isinstance(dep, DependencyDefinition):\n key = dep.node + ":" + dep.output\n if key in seen:\n raise DagsterInvalidDefinitionError(\n f'Duplicate dependencies on node "{dep.node}" output "{dep.output}" '\n "used in the same MultiDependencyDefinition."\n )\n seen[key] = True\n elif dep is MappedInputPlaceholder:\n pass\n else:\n check.failed(f"Unexpected dependencies entry {dep}")\n\n return super(MultiDependencyDefinition, cls).__new__(cls, deps)\n\n
[docs] @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n return [dep for dep in self.dependencies if isinstance(dep, DependencyDefinition)]
\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return `True` if the dependency is fan-in (always True for MultiDependencyDefinition)."""\n return True
\n\n
[docs] @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n """Return the combined list of dependencies contained by this object, inculding of :py:class:`DependencyDefinition` and :py:class:`MappedInputPlaceholder` objects."""\n return self.dependencies
\n\n\nclass BlockingAssetChecksDependencyDefinition(\n IDependencyDefinition,\n NamedTuple(\n "_BlockingAssetChecksDependencyDefinition",\n [\n (\n "asset_check_dependencies",\n Sequence[DependencyDefinition],\n ),\n ("other_dependency", Optional[DependencyDefinition]),\n ],\n ),\n):\n """An input that depends on a set of outputs that correspond to upstream asset checks, and also\n optionally depends on a single upstream output that does not correspond to an asset check.\n\n We model this with a different kind of DependencyDefinition than MultiDependencyDefinition,\n because we treat the value that's passed to the input parameter differently: we ignore the asset\n check dependencies and only pass a single value, instead of a fanned-in list.\n """\n\n @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n if self.other_dependency:\n return [*self.asset_check_dependencies, self.other_dependency]\n else:\n return self.asset_check_dependencies\n\n @public\n def is_fan_in(self) -> bool:\n return False\n\n @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n return self.get_node_dependencies()\n\n\nclass DynamicCollectDependencyDefinition(\n NamedTuple("_DynamicCollectDependencyDefinition", [("node_name", str), ("output_name", str)]),\n IDependencyDefinition,\n):\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n return [DependencyDefinition(self.node_name, self.output_name)]\n\n def is_fan_in(self) -> bool:\n return True\n\n\nDepTypeAndOutputs: TypeAlias = Tuple[\n DependencyType,\n Union[NodeOutput, List[Union[NodeOutput, Type["MappedInputPlaceholder"]]]],\n]\n\nInputToOutputMap: TypeAlias = Dict[NodeInput, DepTypeAndOutputs]\n\n\ndef _create_handle_dict(\n node_dict: Mapping[str, Node],\n dep_dict: DependencyMapping[str],\n) -> InputToOutputMap:\n from 
.composition import MappedInputPlaceholder\n\n check.mapping_param(node_dict, "node_dict", key_type=str, value_type=Node)\n check.two_dim_mapping_param(dep_dict, "dep_dict", value_type=IDependencyDefinition)\n\n handle_dict: InputToOutputMap = {}\n\n for node_name, input_dict in dep_dict.items():\n from_node = node_dict[node_name]\n for input_name, dep_def in input_dict.items():\n if isinstance(\n dep_def, (MultiDependencyDefinition, BlockingAssetChecksDependencyDefinition)\n ):\n handles: List[Union[NodeOutput, Type[MappedInputPlaceholder]]] = []\n for inner_dep in dep_def.get_dependencies_and_mappings():\n if isinstance(inner_dep, DependencyDefinition):\n handles.append(node_dict[inner_dep.node].get_output(inner_dep.output))\n elif inner_dep is MappedInputPlaceholder:\n handles.append(inner_dep)\n else:\n check.failed(\n f"Unexpected MultiDependencyDefinition dependencies type {inner_dep}"\n )\n\n handle_dict[from_node.get_input(input_name)] = (DependencyType.FAN_IN, handles)\n\n elif isinstance(dep_def, DependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DIRECT,\n node_dict[dep_def.node].get_output(dep_def.output),\n )\n elif isinstance(dep_def, DynamicCollectDependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DYNAMIC_COLLECT,\n node_dict[dep_def.node_name].get_output(dep_def.output_name),\n )\n\n else:\n check.failed(f"Unknown dependency type {dep_def}")\n\n return handle_dict\n\n\nclass DependencyStructure:\n @staticmethod\n def from_definitions(\n nodes: Mapping[str, Node], dep_dict: DependencyMapping[str]\n ) -> "DependencyStructure":\n return DependencyStructure(\n list(dep_dict.keys()),\n _create_handle_dict(nodes, dep_dict),\n dep_dict,\n )\n\n _node_input_index: DefaultDict[str, Dict[NodeInput, List[NodeOutput]]]\n _node_output_index: Dict[str, DefaultDict[NodeOutput, List[NodeInput]]]\n _dynamic_fan_out_index: Dict[str, NodeOutput]\n _collect_index: Dict[str, 
Set[NodeOutput]]\n _deps_by_node_name: DependencyMapping[str]\n\n def __init__(\n self,\n node_names: Sequence[str],\n input_to_output_map: InputToOutputMap,\n deps_by_node_name: DependencyMapping[str],\n ):\n self._node_names = node_names\n self._input_to_output_map = input_to_output_map\n self._deps_by_node_name = deps_by_node_name\n\n # Building up a couple indexes here so that one can look up all the upstream output handles\n # or downstream input handles in O(1). Without this, this can become O(N^2) where N is node\n # count during the GraphQL query in particular\n\n # node_name => input_handle => list[output_handle]\n self._node_input_index = defaultdict(dict)\n\n # node_name => output_handle => list[input_handle]\n self._node_output_index = defaultdict(lambda: defaultdict(list))\n\n # node_name => dynamic output_handle that this node will dupe for\n self._dynamic_fan_out_index = {}\n\n # node_name => set of dynamic output_handle this collects over\n self._collect_index = defaultdict(set)\n\n for node_input, (dep_type, node_output_or_list) in self._input_to_output_map.items():\n if dep_type == DependencyType.FAN_IN:\n node_output_list: List[NodeOutput] = []\n for node_output in node_output_or_list:\n if not isinstance(node_output, NodeOutput):\n continue\n\n if node_output.is_dynamic:\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. Problematic dependency on dynamic output"\n f' "{node_output.describe()}".'\n )\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. 
Problematic dependency on output"\n f' "{node_output.describe()}", downstream of'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}".'\n )\n\n node_output_list.append(node_output)\n elif dep_type == DependencyType.DIRECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_fan_out(node_input, node_output)\n\n if self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_fan_out(\n node_input, self._dynamic_fan_out_index[node_output.node_name]\n )\n\n node_output_list = [node_output]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_collect(node_input, node_output)\n\n elif self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_collect(\n node_input,\n self._dynamic_fan_out_index[node_output.node_name],\n )\n else:\n check.failed(\n f"Unexpected dynamic fan in dep created {node_output} -> {node_input}"\n )\n\n node_output_list = [node_output]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n self._node_input_index[node_input.node.name][node_input] = node_output_list\n for node_output in node_output_list:\n self._node_output_index[node_output.node.name][node_output].append(node_input)\n\n def _validate_and_set_fan_out(self, node_input: NodeInput, node_output: NodeOutput) -> None:\n """Helper function for populating _dynamic_fan_out_index."""\n if not node_input.node.definition.input_supports_dynamic_output_dep(node_input.input_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of dynamic output"\n f' "{node_output.describe()}" since input "{node_input.input_name}" maps to a'\n " node that is already downstream of another dynamic output. 
Nodes cannot be"\n " downstream of more than one dynamic output"\n )\n\n if self._collect_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be both downstream of dynamic output "\n f"{node_output.describe()} and collect over dynamic output "\n f"{next(iter(self._collect_index[node_input.node_name])).describe()}."\n )\n\n if self._dynamic_fan_out_index.get(node_input.node_name) is None:\n self._dynamic_fan_out_index[node_input.node_name] = node_output\n return\n\n if self._dynamic_fan_out_index[node_input.node_name] != node_output:\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_input.node_name].describe()}"'\n )\n\n def _validate_and_set_collect(\n self,\n node_input: NodeInput,\n node_output: NodeOutput,\n ) -> None:\n if self._dynamic_fan_out_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot both collect over dynamic output "\n f"{node_output.describe()} and be downstream of the dynamic output "\n f"{self._dynamic_fan_out_index[node_input.node_name].describe()}."\n )\n\n self._collect_index[node_input.node_name].add(node_output)\n\n # if the output is already fanned out\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. 
It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}"'\n )\n\n def all_upstream_outputs_from_node(self, node_name: str) -> Sequence[NodeOutput]:\n check.str_param(node_name, "node_name")\n\n # flatten out all outputs that feed into the inputs of this node\n return [\n output_handle\n for output_handle_list in self._node_input_index[node_name].values()\n for output_handle in output_handle_list\n ]\n\n def input_to_upstream_outputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeInput, Sequence[NodeOutput]]:\n """Returns a Dict[NodeInput, List[NodeOutput]] that encodes\n where all the the inputs are sourced from upstream. Usually the\n List[NodeOutput] will be a list of one, except for the\n multi-dependency case.\n """\n check.str_param(node_name, "node_name")\n return self._node_input_index[node_name]\n\n def output_to_downstream_inputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeOutput, Sequence[NodeInput]]:\n """Returns a Dict[NodeOutput, List[NodeInput]] that\n represents all the downstream inputs for each output in the\n dictionary.\n """\n check.str_param(node_name, "node_name")\n return self._node_output_index[node_name]\n\n def has_direct_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DIRECT\n\n def get_direct_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DIRECT,\n f"Cannot call get_direct_dep when dep is not singular, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def get_dependency_definition(self, node_input: NodeInput) -> Optional[IDependencyDefinition]:\n return 
self._deps_by_node_name[node_input.node_name].get(node_input.input_name)\n\n def has_fan_in_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.FAN_IN\n\n def get_fan_in_deps(\n self, node_input: NodeInput\n ) -> Sequence[Union[NodeOutput, Type["MappedInputPlaceholder"]]]:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, deps = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.FAN_IN,\n f"Cannot call get_multi_dep when dep is not fan in, got {dep_type}",\n )\n return cast(List[Union[NodeOutput, Type["MappedInputPlaceholder"]]], deps)\n\n def has_dynamic_fan_in_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DYNAMIC_COLLECT\n\n def get_dynamic_fan_in_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DYNAMIC_COLLECT,\n f"Cannot call get_dynamic_fan_in_dep when dep is not, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def has_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n return node_input in self._input_to_output_map\n\n def get_deps_list(self, node_input: NodeInput) -> Sequence[NodeOutput]:\n check.inst_param(node_input, "node_input", NodeInput)\n check.invariant(self.has_deps(node_input))\n dep_type, handle_or_list = self._input_to_output_map[node_input]\n if dep_type == DependencyType.DIRECT:\n return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n 
return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.FAN_IN:\n return [handle for handle in handle_or_list if isinstance(handle, NodeOutput)]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n def inputs(self) -> Sequence[NodeInput]:\n return list(self._input_to_output_map.keys())\n\n def get_upstream_dynamic_output_for_node(self, node_name: str) -> Optional[NodeOutput]:\n return self._dynamic_fan_out_index.get(node_name)\n\n def get_dependency_type(self, node_input: NodeInput) -> Optional[DependencyType]:\n result = self._input_to_output_map.get(node_input)\n if result is None:\n return None\n dep_type, _ = result\n return dep_type\n\n def is_dynamic_mapped(self, node_name: str) -> bool:\n return node_name in self._dynamic_fan_out_index\n\n def has_dynamic_downstreams(self, node_name: str) -> bool:\n for node_output in self._dynamic_fan_out_index.values():\n if node_output.node_name == node_name:\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/definitions/dependency", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.dependency"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.events

\nimport re\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, experimental_param, public\nfrom dagster._core.definitions.data_version import DataVersion\nfrom dagster._core.storage.tags import MULTIDIMENSIONAL_PARTITION_PREFIX, SYSTEM_TAG_PREFIX\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\n\nfrom .metadata import (\n    MetadataFieldSerializer,\n    MetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n    from dagster._core.execution.context.output import OutputContext\n\n\nASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")\nASSET_KEY_DELIMITER = "/"\n\n\ndef parse_asset_key_string(s: str) -> Sequence[str]:\n    return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))\n\n\n
[docs]@whitelist_for_serdes\nclass AssetKey(NamedTuple("_AssetKey", [("path", PublicAttr[Sequence[str]])])):\n """Object representing the structure of an asset key. Takes in a sanitized string, list of\n strings, or tuple of strings.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import op\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey('flat_asset_key'),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(['parent', 'child', 'grandchild']),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key_2(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(('parent', 'child', 'grandchild')),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n Args:\n path (Sequence[str]): String, list of strings, or tuple of strings. A list of strings\n represent the hierarchical structure of the asset_key.\n """\n\n def __new__(cls, path: Sequence[str]):\n if isinstance(path, str):\n path = [path]\n else:\n path = list(check.sequence_param(path, "path", of_type=str))\n\n return super(AssetKey, cls).__new__(cls, path=path)\n\n def __str__(self):\n return f"AssetKey({self.path})"\n\n def __repr__(self):\n return f"AssetKey({self.path})"\n\n def __hash__(self):\n return hash(tuple(self.path))\n\n def __eq__(self, other):\n if not isinstance(other, AssetKey):\n return False\n if len(self.path) != len(other.path):\n return False\n for i in range(0, len(self.path)):\n if self.path[i] != other.path[i]:\n return False\n return True\n\n def to_string(self) -> str:\n """E.g. '["first_component", "second_component"]'."""\n return seven.json.dumps(self.path)\n\n def to_user_string(self) -> str:\n """E.g. 
"first_component/second_component"."""\n return ASSET_KEY_DELIMITER.join(self.path)\n\n def to_python_identifier(self, suffix: Optional[str] = None) -> str:\n """Build a valid Python identifier based on the asset key that can be used for\n operation names or I/O manager keys.\n """\n path = list(self.path)\n\n if suffix is not None:\n path.append(suffix)\n\n return "__".join(path).replace("-", "_")\n\n @staticmethod\n def from_user_string(asset_key_string: str) -> "AssetKey":\n return AssetKey(asset_key_string.split(ASSET_KEY_DELIMITER))\n\n @staticmethod\n def from_db_string(asset_key_string: Optional[str]) -> Optional["AssetKey"]:\n if not asset_key_string:\n return None\n if asset_key_string[0] == "[":\n # is a json string\n try:\n path = seven.json.loads(asset_key_string)\n except seven.JSONDecodeError:\n path = parse_asset_key_string(asset_key_string)\n else:\n path = parse_asset_key_string(asset_key_string)\n return AssetKey(path)\n\n @staticmethod\n def get_db_prefix(path: Sequence[str]):\n check.sequence_param(path, "path", of_type=str)\n return seven.json.dumps(path)[:-2] # strip trailing '"]' from json string\n\n @staticmethod\n def from_graphql_input(graphql_input_asset_key: Mapping[str, Sequence[str]]) -> "AssetKey":\n return AssetKey(graphql_input_asset_key["path"])\n\n def to_graphql_input(self) -> Mapping[str, Sequence[str]]:\n return {"path": self.path}\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetKey") -> "AssetKey":\n if isinstance(arg, AssetKey):\n return check.inst_param(arg, "arg", AssetKey)\n elif isinstance(arg, str):\n return AssetKey([arg])\n elif isinstance(arg, list):\n check.list_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n elif isinstance(arg, tuple):\n check.tuple_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n else:\n check.failed(f"Unexpected type for AssetKey: {type(arg)}")\n\n @staticmethod\n def from_coercible_or_definition(\n arg: Union["CoercibleToAssetKey", "AssetsDefinition", "SourceAsset"]\n 
) -> "AssetKey":\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n if isinstance(arg, AssetsDefinition):\n return arg.key\n elif isinstance(arg, SourceAsset):\n return arg.key\n else:\n return AssetKey.from_coercible(arg)\n\n # @staticmethod\n # def from_coercible_to_asset_dep(arg: "CoercibleToAssetDep") -> "AssetKey":\n # from dagster._core.definitions.asset_dep import AssetDep\n # from dagster._core.definitions.asset_spec import AssetSpec\n # from dagster._core.definitions.assets import AssetsDefinition\n # from dagster._core.definitions.source_asset import SourceAsset\n\n # if isinstance(arg, AssetsDefinition):\n # if len(arg.keys) > 1:\n # # Only AssetsDefinition with a single asset can be passed\n # raise DagsterInvalidDefinitionError(\n # "Cannot pass a multi_asset AssetsDefinition as an argument to deps."\n # " Instead, specify dependencies on the assets created by the multi_asset"\n # f" via AssetKeys or strings. For the multi_asset {arg.node_def.name}, the"\n # f" available keys are: {arg.keys}."\n # )\n # return arg.key\n # elif isinstance(arg, SourceAsset):\n # return arg.key\n # elif isinstance(arg, AssetDep):\n # return arg.asset_key\n # elif isinstance(arg, AssetSpec):\n # return arg.asset_key\n # else:\n # return AssetKey.from_coercible(arg)\n\n def has_prefix(self, prefix: Sequence[str]) -> bool:\n return len(self.path) >= len(prefix) and self.path[: len(prefix)] == prefix\n\n def with_prefix(self, prefix: "CoercibleToAssetKeyPrefix") -> "AssetKey":\n prefix = key_prefix_from_coercible(prefix)\n return AssetKey(list(prefix) + list(self.path))
\n\n\nclass AssetKeyPartitionKey(NamedTuple):\n """An AssetKey with an (optional) partition key. Refers either to a non-partitioned asset or a\n partition of a partitioned asset.\n """\n\n asset_key: AssetKey\n partition_key: Optional[str] = None\n\n\nCoercibleToAssetKey = Union[AssetKey, str, Sequence[str]]\nCoercibleToAssetKeyPrefix = Union[str, Sequence[str]]\n\n\ndef check_opt_coercible_to_asset_key_prefix_param(\n prefix: Optional[CoercibleToAssetKeyPrefix], param_name: str\n) -> Optional[Sequence[str]]:\n try:\n return key_prefix_from_coercible(prefix) if prefix is not None else None\n except check.CheckError:\n raise check.ParameterCheckError(\n f'Param "{param_name}" is not a string or a sequence of strings'\n )\n\n\ndef key_prefix_from_coercible(key_prefix: CoercibleToAssetKeyPrefix) -> Sequence[str]:\n if isinstance(key_prefix, str):\n return [key_prefix]\n elif isinstance(key_prefix, list):\n return key_prefix\n else:\n check.failed(f"Unexpected type for key_prefix: {type(key_prefix)}")\n\n\nDynamicAssetKey = Callable[["OutputContext"], Optional[AssetKey]]\n\n\n@whitelist_for_serdes\nclass AssetLineageInfo(\n NamedTuple("_AssetLineageInfo", [("asset_key", AssetKey), ("partitions", AbstractSet[str])])\n):\n def __new__(cls, asset_key: AssetKey, partitions: Optional[AbstractSet[str]] = None):\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partitions = check.opt_set_param(partitions, "partitions", str)\n return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)\n\n\nT = TypeVar("T")\n\n\n
[docs]@experimental_param(param="data_version")\nclass Output(Generic[T]):\n """Event corresponding to one of a op's outputs.\n\n Op compute functions must explicitly yield events of this type when they have more than\n one output, or when they also yield events of other types, or when defining a op using the\n :py:class:`OpDefinition` API directly.\n\n Outputs are values produced by ops that will be consumed by downstream ops in a job.\n They are type-checked at op boundaries when their corresponding :py:class:`Out`\n or the downstream :py:class:`In` is typed.\n\n Args:\n value (Any): The value returned by the compute function.\n output_name (Optional[str]): Name of the corresponding out. (default:\n "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n data_version (Optional[DataVersion]): (Experimental) A data version to manually set\n for the asset.\n """\n\n def __init__(\n self,\n value: T,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n self._value = value\n self._output_name = check.str_param(output_name, "output_name")\n self._data_version = check.opt_inst_param(data_version, "data_version", DataVersion)\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> MetadataMapping:\n return self._metadata\n\n @public\n @property\n def value(self) -> Any:\n """Any: The value returned by the compute function."""\n return self._value\n\n @public\n @property\n def output_name(self) -> str:\n """str: Name of the corresponding :py:class:`Out`."""\n return self._output_name\n\n @public\n 
@property\n def data_version(self) -> Optional[DataVersion]:\n """Optional[DataVersion]: A data version that was manually set on the `Output`."""\n return self._data_version\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, Output)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.metadata == other.metadata\n )
\n\n\n
[docs]class DynamicOutput(Generic[T]):\n """Variant of :py:class:`Output <dagster.Output>` used to support\n dynamic mapping & collect. Each ``DynamicOutput`` produced by an op represents\n one item in a set that can be processed individually with ``map`` or gathered\n with ``collect``.\n\n Each ``DynamicOutput`` must have a unique ``mapping_key`` to distinguish it with it's set.\n\n Args:\n value (Any):\n The value returned by the compute function.\n mapping_key (str):\n The key that uniquely identifies this dynamic value relative to its peers.\n This key will be used to identify the downstream ops when mapped, ie\n ``mapped_op[example_mapping_key]``\n output_name (Optional[str]):\n Name of the corresponding :py:class:`DynamicOut` defined on the op.\n (default: "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n value: T,\n mapping_key: str,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n self._mapping_key = check_valid_name(check.str_param(mapping_key, "mapping_key"))\n self._output_name = check.str_param(output_name, "output_name")\n self._value = value\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> str:\n """The mapping_key that was set for this DynamicOutput at instantiation."""\n return self._mapping_key\n\n @public\n @property\n def value(self) -> T:\n """The value that is returned by the compute function for this DynamicOut."""\n return self._value\n\n @public\n @property\n def 
output_name(self) -> str:\n """Name of the :py:class:`DynamicOut` defined on the op that this DynamicOut is associated with."""\n return self._output_name\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, DynamicOutput)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.mapping_key == other.mapping_key\n and self.metadata == other.metadata\n )
\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetObservation(\n NamedTuple(\n "_AssetObservation",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ],\n )\n):\n """Event that captures metadata about an asset at a point in time.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the asset.\n partition (Optional[str]): The name of a partition of the asset that the metadata\n corresponds to.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n observation. Users should not pass values into this argument.\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n if any([not tag.startswith(SYSTEM_TAG_PREFIX) for tag in tags or {}]):\n check.failed(\n "Users should not pass values into the tags argument for AssetMaterializations. 
"\n "The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(AssetObservation, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n\nUNDEFINED_ASSET_KEY_PATH = ["__undefined__"]\n\n\nclass AssetMaterializationSerializer(NamedTupleSerializer):\n # There are old `Materialization` objects in storage. We set the default value for asset key to\n # be `AssetKey(["__undefined__"])` to ensure that we can load these objects, without needing to\n # allow for the construction of new `AssetMaterialization` objects with no defined AssetKey.\n def before_unpack(self, context, unpacked_dict: Any) -> Any:\n # cover both the case where "asset_key" is not present at all and where it is None\n if unpacked_dict.get("asset_key") is None:\n unpacked_dict["asset_key"] = AssetKey(UNDEFINED_ASSET_KEY_PATH)\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n old_storage_names={"Materialization"},\n serializer=AssetMaterializationSerializer,\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetMaterialization(\n NamedTuple(\n "_AssetMaterialization",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", Optional[Mapping[str, str]]),\n ],\n )\n):\n """Event indicating that an op has materialized an asset.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, asset materializations can not be passed to other\n ops, and their persistence is controlled by op logic, rather than by the Dagster\n framework.\n\n Op authors should use these events to organize metadata about the side effects of their\n computations, enabling tooling like the Assets dashboard in the Dagster UI.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the materialized asset across\n job runs\n description (Optional[str]): A longer human-readable description of the materialized value.\n partition (Optional[str]): The name of the partition\n that was materialized.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n materialization. Users should not pass values into this argument.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. 
Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionKey\n\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n invalid_tags = [tag for tag in tags or {} if not tag.startswith(SYSTEM_TAG_PREFIX)]\n if len(invalid_tags) > 0:\n check.failed(\n f"Invalid tags: {tags} Users should not pass values into the tags argument for"\n " AssetMaterializations. 
The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n partition = check.opt_str_param(partition, "partition")\n\n if not isinstance(partition, MultiPartitionKey):\n # When event log records are unpacked from storage, cast the partition key as a\n # MultiPartitionKey if multi-dimensional partition tags exist\n multi_dimensional_partitions = {\n dimension[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]: partition_key\n for dimension, partition_key in (tags or {}).items()\n if dimension.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX)\n }\n if multi_dimensional_partitions:\n partition = MultiPartitionKey(multi_dimensional_partitions)\n\n return super(AssetMaterialization, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=partition,\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n
[docs] @public\n @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, Sequence[str], AssetKey]] = None,\n ) -> "AssetMaterialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n if not asset_key:\n asset_key = path\n\n return AssetMaterialization(\n asset_key=cast(Union[str, AssetKey, List[str]], asset_key),\n description=description,\n metadata={"path": MetadataValue.path(path)},\n )
\n\n\n
[docs]@deprecated(\n breaking_version="1.7",\n additional_warn_text="Please use AssetCheckResult and @asset_check instead.",\n)\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ExpectationResult(\n NamedTuple(\n "_ExpectationResult",\n [\n ("success", PublicAttr[bool]),\n ("label", PublicAttr[Optional[str]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a data quality test.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that a data quality test has produced a (positive or\n negative) result.\n\n Args:\n success (bool): Whether the expectation passed or not.\n label (Optional[str]): Short display name for expectation. Defaults to "result".\n description (Optional[str]): A longer human-readable description of the expectation.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(ExpectationResult, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n label=check.opt_str_param(label, "label", "result"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\n@whitelist_for_serdes\nclass TypeCheck(\n NamedTuple(\n "_TypeCheck",\n [\n ("success", PublicAttr[bool]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a successful typecheck.\n\n Events of this type should be returned by user-defined type checks when they need to encapsulate\n additional metadata about a type check's success or failure. (i.e., when using\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or the underlying\n :py:func:`PythonObjectDagsterType` API.)\n\n Op compute functions should generally avoid yielding events of this type to avoid confusion.\n\n Args:\n success (bool): ``True`` if the type check succeeded, ``False`` otherwise.\n description (Optional[str]): A human-readable description of the type check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(TypeCheck, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]class Failure(Exception):\n """Event indicating op failure.\n\n Raise events of this type from within op compute functions or custom type checks in order to\n indicate an unrecoverable failure in user code to the Dagster machinery and return\n structured metadata about the failure.\n\n Args:\n description (Optional[str]): A human-readable description of the failure.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n allow_retries (Optional[bool]):\n Whether this Failure should respect the retry policy or bypass it and immediately fail.\n Defaults to True, respecting the retry policy and allowing retries.\n """\n\n def __init__(\n self,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n allow_retries: Optional[bool] = None,\n ):\n super(Failure, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n self.allow_retries = check.opt_bool_param(allow_retries, "allow_retries", True)
\n\n\n
[docs]class RetryRequested(Exception):\n """An exception to raise from an op to indicate that it should be retried.\n\n Args:\n max_retries (Optional[int]):\n The max number of retries this step should attempt before failing\n seconds_to_wait (Optional[Union[float,int]]):\n Seconds to wait before restarting the step after putting the step in\n to the up_for_retry state\n\n Example:\n .. code-block:: python\n\n @op\n def flakes():\n try:\n flakey_operation()\n except Exception as e:\n raise RetryRequested(max_retries=3) from e\n """\n\n def __init__(\n self, max_retries: Optional[int] = 1, seconds_to_wait: Optional[Union[float, int]] = None\n ):\n super(RetryRequested, self).__init__()\n self.max_retries = check.int_param(max_retries, "max_retries")\n self.seconds_to_wait = check.opt_numeric_param(seconds_to_wait, "seconds_to_wait")
\n\n\nclass ObjectStoreOperationType(Enum):\n SET_OBJECT = "SET_OBJECT"\n GET_OBJECT = "GET_OBJECT"\n RM_OBJECT = "RM_OBJECT"\n CP_OBJECT = "CP_OBJECT"\n\n\nclass ObjectStoreOperation(\n NamedTuple(\n "_ObjectStoreOperation",\n [\n ("op", ObjectStoreOperationType),\n ("key", str),\n ("dest_key", Optional[str]),\n ("obj", Any),\n ("serialization_strategy_name", Optional[str]),\n ("object_store_name", Optional[str]),\n ("value_name", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n """This event is used internally by Dagster machinery when values are written to and read from\n an ObjectStore.\n\n Users should not import this class or yield events of this type from user code.\n\n Args:\n op (ObjectStoreOperationType): The type of the operation on the object store.\n key (str): The key of the object on which the operation was performed.\n dest_key (Optional[str]): The destination key, if any, to which the object was copied.\n obj (Any): The object, if any, retrieved by the operation.\n serialization_strategy_name (Optional[str]): The name of the serialization strategy, if any,\n employed by the operation\n object_store_name (Optional[str]): The name of the object store that performed the\n operation.\n value_name (Optional[str]): The name of the input/output\n version (Optional[str]): (Experimental) The version of the stored data.\n mapping_key (Optional[str]): The mapping key when a dynamic output is used.\n """\n\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n key: str,\n dest_key: Optional[str] = None,\n obj: Any = None,\n serialization_strategy_name: Optional[str] = None,\n object_store_name: Optional[str] = None,\n value_name: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperation, cls).__new__(\n cls,\n op=op,\n key=check.str_param(key, "key"),\n dest_key=check.opt_str_param(dest_key, "dest_key"),\n obj=obj,\n 
serialization_strategy_name=check.opt_str_param(\n serialization_strategy_name, "serialization_strategy_name"\n ),\n object_store_name=check.opt_str_param(object_store_name, "object_store_name"),\n value_name=check.opt_str_param(value_name, "value_name"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n @classmethod\n def serializable(cls, inst, **kwargs):\n return cls(\n **dict(\n {\n "op": inst.op.value,\n "key": inst.key,\n "dest_key": inst.dest_key,\n "obj": None,\n "serialization_strategy_name": inst.serialization_strategy_name,\n "object_store_name": inst.object_store_name,\n "value_name": inst.value_name,\n "version": inst.version,\n },\n **kwargs,\n )\n )\n\n\nclass HookExecutionResult(\n NamedTuple("_HookExecutionResult", [("hook_name", str), ("is_skipped", bool)])\n):\n """This event is used internally to indicate the execution result of a hook, e.g. whether the\n user-defined hook function is skipped.\n\n Args:\n hook_name (str): The name of the hook.\n is_skipped (bool): ``False`` if the hook_fn is executed, ``True`` otheriwse.\n """\n\n def __new__(cls, hook_name: str, is_skipped: Optional[bool] = None):\n return super(HookExecutionResult, cls).__new__(\n cls,\n hook_name=check.str_param(hook_name, "hook_name"),\n is_skipped=cast(bool, check.opt_bool_param(is_skipped, "is_skipped", default=False)),\n )\n\n\nUserEvent = Union[AssetMaterialization, AssetObservation, ExpectationResult]\n
", "current_page_name": "_modules/dagster/_core/definitions/events", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.events"}, "executor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.executor_definition

\nfrom enum import Enum as PyEnum\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, Sequence, Union, overload\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import Int\nfrom dagster._config import Field, Noneable, Selector, UserConfigSchema\nfrom dagster._core.definitions.configurable import (\n    ConfiguredDefinitionConfigSchema,\n    NamedConfigurableDefinition,\n)\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.executor.base import Executor\n    from dagster._core.executor.in_process import InProcessExecutor\n    from dagster._core.executor.init import InitExecutorContext\n    from dagster._core.executor.multiprocess import MultiprocessExecutor\n    from dagster._core.instance import DagsterInstance\n\n\nclass ExecutorRequirement(PyEnum):\n    """An ExecutorDefinition can include a list of requirements that the system uses to\n    check whether the executor will be able to work for a particular job execution.\n    """\n\n    # The passed in IJob must be reconstructable across process boundaries\n    RECONSTRUCTABLE_PIPELINE = (  # This needs to still exist for folks who may have written their own executor\n        "RECONSTRUCTABLE_PIPELINE"\n    )\n    RECONSTRUCTABLE_JOB = "RECONSTRUCTABLE_PIPELINE"\n\n    # The DagsterInstance must be loadable in a different process\n    NON_EPHEMERAL_INSTANCE = "NON_EPHEMERAL_INSTANCE"\n\n    # Any op 
outputs on the job must be persisted\n    PERSISTENT_OUTPUTS = "PERSISTENT_OUTPUTS"\n\n\ndef multiple_process_executor_requirements() -> Sequence[ExecutorRequirement]:\n    return [\n        ExecutorRequirement.RECONSTRUCTABLE_JOB,\n        ExecutorRequirement.NON_EPHEMERAL_INSTANCE,\n        ExecutorRequirement.PERSISTENT_OUTPUTS,\n    ]\n\n\nExecutorConfig = Mapping[str, object]\nExecutorCreationFunction: TypeAlias = Callable[["InitExecutorContext"], "Executor"]\nExecutorRequirementsFunction: TypeAlias = Callable[[ExecutorConfig], Sequence[ExecutorRequirement]]\n\n\n
[docs]class ExecutorDefinition(NamedConfigurableDefinition):\n """An executor is responsible for executing the steps of a job.\n\n Args:\n name (str): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.executor_config`. If not set, Dagster will accept any config\n provided.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n executor_creation_fn(Optional[Callable]): Should accept an :py:class:`InitExecutorContext`\n and return an instance of :py:class:`Executor`\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n executor.\n description (Optional[str]): A description of the executor.\n """\n\n def __init__(\n self,\n name: str,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Union[\n ExecutorRequirementsFunction, Optional[Sequence[ExecutorRequirement]]\n ] = None,\n executor_creation_fn: Optional[ExecutorCreationFunction] = None,\n description: Optional[str] = None,\n ):\n self._name = check.str_param(name, "name")\n self._requirements_fn: ExecutorRequirementsFunction\n if callable(requirements):\n self._requirements_fn = requirements\n else:\n requirements_lst = check.opt_list_param(\n requirements, "requirements", of_type=ExecutorRequirement\n )\n self._requirements_fn = lambda _: requirements_lst\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._executor_creation_fn = check.opt_callable_param(\n executor_creation_fn, "executor_creation_fn"\n )\n self._description = check.opt_str_param(description, "description")\n\n @public\n @property\n def name(self) -> str:\n """Name of the executor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Description of executor, if provided."""\n return self._description\n\n @property\n def 
config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n def get_requirements(\n self, executor_config: Mapping[str, object]\n ) -> Sequence[ExecutorRequirement]:\n return self._requirements_fn(executor_config)\n\n @public\n @property\n def executor_creation_fn(self) -> Optional[ExecutorCreationFunction]:\n """Callable that takes an :py:class:`InitExecutorContext` and returns an instance of\n :py:class:`Executor`.\n """\n return self._executor_creation_fn\n\n def copy_for_configured(self, name, description, config_schema) -> "ExecutorDefinition":\n return ExecutorDefinition(\n name=name,\n config_schema=config_schema, # type: ignore\n executor_creation_fn=self.executor_creation_fn,\n description=description or self.description,\n requirements=self._requirements_fn,\n )\n\n @staticmethod\n def hardcoded_executor(executor: "Executor"):\n return ExecutorDefinition(\n # Executor name was only relevant in the pipeline/solid/mode world, so we\n # can put a dummy value\n name="__executor__",\n executor_creation_fn=lambda _init_context: executor,\n )\n\n # Backcompat: Overrides configured method to provide name as a keyword argument.\n # If no name is provided, the name is pulled off of this ExecutorDefinition.\n
[docs] @public\n def configured(\n self,\n config_or_config_fn: Any,\n name: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n description: Optional[str] = None,\n ) -> Self:\n """Wraps this object in an object of the same type that provides configuration to the inner\n object.\n\n Using ``configured`` may result in config values being displayed in\n the Dagster UI, so it is not recommended to use this API with sensitive values,\n such as secrets.\n\n Args:\n config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n that fully satisfies this object's config schema or (2) A function that accepts run\n configuration and returns run configuration that fully satisfies this object's\n config schema. In the latter case, config_schema must be specified. When\n passing a function, it's easiest to use :py:func:`configured`.\n name (Optional[str]): Name of the new definition. If not provided, the emitted\n definition will inherit the name of the `ExecutorDefinition` upon which this\n function is called.\n config_schema (Optional[ConfigSchema]): If config_or_config_fn is a function, the config\n schema that its input must satisfy. If not set, Dagster will accept any config\n provided.\n description (Optional[str]): Description of the new definition. If not specified,\n inherits the description of the definition being configured.\n\n Returns (ConfigurableDefinition): A configured version of this object.\n """\n name = check.opt_str_param(name, "name")\n\n new_config_schema = ConfiguredDefinitionConfigSchema(\n self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n )\n\n return self.copy_for_configured(name or self.name, description, new_config_schema)
\n\n\n@overload\ndef executor(name: ExecutorCreationFunction) -> ExecutorDefinition: ...\n\n\n@overload\ndef executor(\n name: Optional[str] = ...,\n config_schema: Optional[UserConfigSchema] = ...,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = ...,\n) -> "_ExecutorDecoratorCallable": ...\n\n\n
[docs]def executor(\n name: Union[ExecutorCreationFunction, Optional[str]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = None,\n) -> Union[ExecutorDefinition, "_ExecutorDecoratorCallable"]:\n """Define an executor.\n\n The decorated function should accept an :py:class:`InitExecutorContext` and return an instance\n of :py:class:`Executor`.\n\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.executor_config`. If not set, Dagster will accept any config provided for.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n """\n if callable(name):\n check.invariant(config_schema is None)\n check.invariant(requirements is None)\n return _ExecutorDecoratorCallable()(name)\n\n return _ExecutorDecoratorCallable(\n name=name, config_schema=config_schema, requirements=requirements\n )
\n\n\nclass _ExecutorDecoratorCallable:\n def __init__(self, name=None, config_schema=None, requirements=None):\n self.name = check.opt_str_param(name, "name")\n self.config_schema = config_schema # type check in definition\n self.requirements = requirements\n\n def __call__(self, fn: ExecutorCreationFunction) -> ExecutorDefinition:\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n executor_def = ExecutorDefinition(\n name=self.name,\n config_schema=self.config_schema,\n executor_creation_fn=fn,\n requirements=self.requirements,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(executor_def, wrapped=fn) # type: ignore\n\n return executor_def\n\n\ndef _core_in_process_executor_creation(config: ExecutorConfig) -> "InProcessExecutor":\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n # shouldn't need to .get() here - issue with defaults in config setup\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore # (possible none)\n marker_to_close=config.get("marker_to_close"), # type: ignore # (should be str)\n )\n\n\nIN_PROC_CONFIG = Field(\n {\n "retries": get_retries_config(),\n "marker_to_close": Field(\n str,\n is_required=False,\n description="[DEPRECATED]",\n ),\n },\n description="Execute all steps in a single process.",\n)\n\n\n
[docs]@executor(\n name="in_process",\n config_schema=IN_PROC_CONFIG,\n)\ndef in_process_executor(init_context):\n """The in-process executor executes all steps in a single process.\n\n To select it, include the following top-level fragment in config:\n\n .. code-block:: yaml\n\n execution:\n in_process:\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_in_process_executor_creation(init_context.executor_config)
\n\n\n@executor(name="execute_in_process_executor")\ndef execute_in_process_executor(_) -> "InProcessExecutor":\n """Executor used by execute_in_process.\n\n Use of this executor triggers special behavior in the config system that ignores all incoming\n executor config. This is because someone might set executor config on a job, and when we foist\n this executor onto the job for `execute_in_process`, that config becomes nonsensical.\n """\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n retries=RetryMode.ENABLED,\n marker_to_close=None,\n )\n\n\ndef _core_multiprocess_executor_creation(config: ExecutorConfig) -> "MultiprocessExecutor":\n from dagster._core.executor.multiprocess import MultiprocessExecutor\n\n # unpack optional selector\n start_method = None\n start_cfg: Dict[str, object] = {}\n start_selector = check.opt_dict_elem(config, "start_method")\n if start_selector:\n start_method, start_cfg = next(iter(start_selector.items()))\n\n return MultiprocessExecutor(\n max_concurrent=check.opt_int_elem(config, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(config, "tag_concurrency_limits"),\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore\n start_method=start_method,\n explicit_forkserver_preload=check.opt_list_elem(start_cfg, "preload_modules", of_type=str),\n )\n\n\nMULTI_PROC_CONFIG = Field(\n {\n "max_concurrent": Field(\n Noneable(Int),\n default_value=None,\n description=(\n "The number of processes that may run concurrently. 
"\n "By default, this is set to be the return value of `multiprocessing.cpu_count()`."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n "start_method": Field(\n Selector(\n fields={\n "spawn": Field(\n {},\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `spawn`."\n ),\n ),\n "forkserver": Field(\n {\n "preload_modules": Field(\n [str],\n is_required=False,\n description=(\n "Explicitly specify the modules to preload in the forkserver."\n " Otherwise, there are two cases for default values if modules"\n " are not specified. If the Dagster job was loaded from a"\n " module, the same module will be preloaded. If not, the"\n " `dagster` module is preloaded."\n ),\n ),\n },\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `forkserver`."\n ),\n ),\n # fork currently unsupported due to threads usage\n }\n ),\n is_required=False,\n description=(\n "Select how subprocesses are created. By default, `spawn` is selected. See "\n "https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods."\n ),\n ),\n "retries": get_retries_config(),\n },\n description="Execute each step in an individual process.",\n)\n\n\n
[docs]@executor(\n name="multiprocess",\n config_schema=MULTI_PROC_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef multiprocess_executor(init_context):\n """The multiprocess executor executes each step in an individual process.\n\n Any job that does not specify custom executors will use the multiprocess_executor by default.\n To configure the multiprocess executor, include a fragment such as the following in your run\n config:\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be None or 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_multiprocess_executor_creation(init_context.executor_config)
\n\n\ndef check_cross_process_constraints(init_context: "InitExecutorContext") -> None:\n from dagster._core.executor.init import InitExecutorContext\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n requirements_lst = init_context.executor_def.get_requirements(init_context.executor_config)\n\n if ExecutorRequirement.RECONSTRUCTABLE_JOB in requirements_lst:\n _check_intra_process_job(init_context.job)\n\n if ExecutorRequirement.NON_EPHEMERAL_INSTANCE in requirements_lst:\n _check_non_ephemeral_instance(init_context.instance)\n\n\ndef _check_intra_process_job(job: IJob) -> None:\n if not isinstance(job, ReconstructableJob):\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with the job"\n f' "{job.get_definition().name}" that is not reconstructable. Job must be loaded in a'\n " way that allows dagster to reconstruct them in a new process. This means: \\n *"\n " using the file, module, or workspace.yaml arguments of"\n " dagster-webserver/dagster-graphql/dagster\\n * loading the job through the"\n " reconstructable() function\\n"\n )\n\n\ndef _check_non_ephemeral_instance(instance: "DagsterInstance") -> None:\n if instance.is_ephemeral:\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with an ephemeral"\n " DagsterInstance. A non-ephemeral instance is needed to coordinate execution between"\n " multiple processes. You can configure your default instance via $DAGSTER_HOME or"\n " ensure a valid one is passed when invoking the python APIs. 
You can learn more about"\n " setting up a persistent DagsterInstance from the DagsterInstance docs here:"\n " https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"\n )\n\n\ndef _get_default_executor_requirements(\n executor_config: ExecutorConfig,\n) -> Sequence[ExecutorRequirement]:\n return multiple_process_executor_requirements() if "multiprocess" in executor_config else []\n\n\n
[docs]@executor(\n name="multi_or_in_process_executor",\n config_schema=Field(\n Selector(\n {"multiprocess": MULTI_PROC_CONFIG, "in_process": IN_PROC_CONFIG},\n ),\n default_value={"multiprocess": {}},\n ),\n requirements=_get_default_executor_requirements,\n)\ndef multi_or_in_process_executor(init_context: "InitExecutorContext") -> "Executor":\n """The default executor for a job.\n\n This is the executor available by default on a :py:class:`JobDefinition`\n that does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\n single-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\n mode and in-process mode can be achieved via config.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n\n\n execution:\n config:\n in_process:\n\n When using the multiprocess mode, ``max_concurrent`` and ``retries`` can also be configured.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n retries:\n enabled:\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n When using the in_process mode, then only retries can be configured.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n if "multiprocess" in init_context.executor_config:\n return _core_multiprocess_executor_creation(\n check.dict_elem(init_context.executor_config, "multiprocess")\n )\n else:\n return _core_in_process_executor_creation(\n check.dict_elem(init_context.executor_config, "in_process")\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/executor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.executor_definition"}, "freshness_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy

\nimport datetime\nfrom typing import AbstractSet, NamedTuple, Optional\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.schedules import (\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom .events import AssetKey\n\n\nclass FreshnessConstraint(NamedTuple):\n    asset_keys: AbstractSet[AssetKey]\n    required_data_time: datetime.datetime\n    required_by_time: datetime.datetime\n\n\nclass FreshnessMinutes(NamedTuple):\n    overdue_minutes: float\n    lag_minutes: float\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass FreshnessPolicy(\n NamedTuple(\n "_FreshnessPolicy",\n [\n ("maximum_lag_minutes", float),\n ("cron_schedule", Optional[str]),\n ("cron_schedule_timezone", Optional[str]),\n ],\n )\n):\n """A FreshnessPolicy specifies how up-to-date you want a given asset to be.\n\n Attaching a FreshnessPolicy to an asset definition encodes an expectation on the upstream data\n that you expect to be incorporated into the current state of that asset at certain points in time.\n How this is calculated differs depending on if the asset is unpartitioned or time-partitioned\n (other partitioning schemes are not supported).\n\n For time-partitioned assets, the current data time for the asset is simple to calculate. The\n upstream data that is incorporated into the asset is exactly the set of materialized partitions\n for that asset. Thus, the current data time for the asset is simply the time up to which all\n partitions have been materialized.\n\n For unpartitioned assets, the current data time is based on the upstream materialization records\n that were read to generate the current state of the asset. More specifically,\n imagine you have two assets, where A depends on B. If `B` has a FreshnessPolicy defined, this\n means that at time T, the most recent materialization of `B` should have come after a\n materialization of `A` which was no more than `maximum_lag_minutes` ago. This calculation is\n recursive: any given asset is expected to incorporate up-to-date data from all of its upstream\n assets.\n\n It is assumed that all asset definitions with no upstream asset definitions consume from some\n always-updating source. That is, if you materialize that asset at time T, it will incorporate\n all data up to time T.\n\n If `cron_schedule` is not defined, the given asset will be expected to incorporate upstream\n data from no more than `maximum_lag_minutes` ago at all points in time. 
For example, "The events\n table should always have data from at most 1 hour ago".\n\n If `cron_schedule` is defined, the given asset will be expected to incorporate upstream data\n from no more than `maximum_lag_minutes` ago at each cron schedule tick. For example, "By 9AM,\n the signups table should contain all of yesterday's data".\n\n The freshness status of assets with policies defined will be visible in the UI. If you are using\n an asset reconciliation sensor, this sensor will kick off runs to help keep your assets up to\n date with respect to their FreshnessPolicy.\n\n Args:\n maximum_lag_minutes (float): An upper bound for how old the data contained within this\n asset may be.\n cron_schedule (Optional[str]): A cron schedule string (e.g. ``"0 1 * * *"``) specifying a\n series of times by which the `maximum_lag_minutes` constraint must be satisfied. If\n no cron schedule is provided, then this constraint must be satisfied at all times.\n cron_schedule_timezone (Optional[str]): Timezone in which the cron schedule should be evaluated.\n If not specified, defaults to UTC. Supported strings for timezones are the ones provided\n by the `IANA time zone database <https://www.iana.org/time-zones>` - e.g.\n "America/Los_Angeles".\n\n .. 
code-block:: python\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def fresh_asset():\n ...\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def cron_up_to_date_asset():\n ...\n\n """\n\n def __new__(\n cls,\n *,\n maximum_lag_minutes: float,\n cron_schedule: Optional[str] = None,\n cron_schedule_timezone: Optional[str] = None,\n ):\n if cron_schedule is not None:\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(f"Invalid cron schedule '{cron_schedule}'.")\n check.param_invariant(\n is_valid_cron_schedule(cron_schedule),\n "cron_schedule",\n f"Invalid cron schedule '{cron_schedule}'.",\n )\n if cron_schedule_timezone is not None:\n check.param_invariant(\n cron_schedule is not None,\n "cron_schedule_timezone",\n "Cannot specify cron_schedule_timezone without a cron_schedule.",\n )\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(cron_schedule_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n "Invalid cron schedule timezone '{cron_schedule_timezone}'. "\n ) from e\n return super(FreshnessPolicy, cls).__new__(\n cls,\n maximum_lag_minutes=float(\n check.numeric_param(maximum_lag_minutes, "maximum_lag_minutes")\n ),\n cron_schedule=check.opt_str_param(cron_schedule, "cron_schedule"),\n cron_schedule_timezone=check.opt_str_param(\n cron_schedule_timezone, "cron_schedule_timezone"\n ),\n )\n\n @classmethod\n def _create(cls, *args):\n """Pickle requires a method with positional arguments to construct\n instances of a class. 
Since the constructor for this class has\n keyword arguments only, we define this method to be used by pickle.\n """\n return cls(maximum_lag_minutes=args[0], cron_schedule=args[1])\n\n def __reduce__(self):\n return (self._create, (self.maximum_lag_minutes, self.cron_schedule))\n\n @property\n def maximum_lag_delta(self) -> datetime.timedelta:\n return datetime.timedelta(minutes=self.maximum_lag_minutes)\n\n def get_evaluation_tick(\n self,\n evaluation_time: datetime.datetime,\n ) -> Optional[datetime.datetime]:\n if self.cron_schedule:\n # most recent cron schedule tick\n schedule_ticks = reverse_cron_string_iterator(\n end_timestamp=evaluation_time.timestamp(),\n cron_string=self.cron_schedule,\n execution_timezone=self.cron_schedule_timezone,\n )\n return next(schedule_ticks)\n else:\n return evaluation_time\n\n def minutes_overdue(\n self,\n data_time: Optional[datetime.datetime],\n evaluation_time: datetime.datetime,\n ) -> Optional[FreshnessMinutes]:\n """Returns a number of minutes past the specified freshness policy that this asset currently\n is. If the asset is missing upstream data, or is not materialized at all, then it is unknown\n how overdue it is, and this will return None.\n\n Args:\n data_time (Optional[datetime]): The timestamp of the data that was used to create the\n current version of this asset.\n evaluation_time (datetime): The time at which we're evaluating the overdueness of this\n asset. Generally, this is the current time.\n """\n if data_time is None:\n return None\n evaluation_tick = self.get_evaluation_tick(evaluation_time)\n if evaluation_tick is None:\n return None\n required_time = evaluation_tick - self.maximum_lag_delta\n\n return FreshnessMinutes(\n lag_minutes=max(0.0, (evaluation_tick - data_time).total_seconds() / 60),\n overdue_minutes=max(0.0, (required_time - data_time).total_seconds() / 60),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy"}, "freshness_policy_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy_sensor_definition

\nfrom typing import Callable, Dict, Mapping, NamedTuple, Optional, Set, cast\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    FreshnessPolicySensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\n\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\n\n\n@whitelist_for_serdes\nclass FreshnessPolicySensorCursor(\n    NamedTuple(\n        "_FreshnessPolicySensorCursor",\n        [("minutes_late_by_key_str", Mapping[str, Optional[float]])],\n    )\n):\n    def __new__(cls, minutes_late_by_key_str: Mapping[str, Optional[float]]):\n        return super(FreshnessPolicySensorCursor, cls).__new__(\n            cls,\n            minutes_late_by_key_str=check.mapping_param(\n                minutes_late_by_key_str, "minutes_late_by_key_str", key_type=str\n            ),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            
deserialize_value(json_str, FreshnessPolicySensorCursor)\n            return True\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    @staticmethod\n    def from_dict(\n        minutes_late_by_key: Mapping[AssetKey, Optional[float]]\n    ) -> "FreshnessPolicySensorCursor":\n        return FreshnessPolicySensorCursor(\n            minutes_late_by_key_str={k.to_user_string(): v for k, v in minutes_late_by_key.items()}\n        )\n\n    @property\n    def minutes_late_by_key(self) -> Mapping[AssetKey, Optional[float]]:\n        return {AssetKey.from_user_string(k): v for k, v in self.minutes_late_by_key_str.items()}\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "FreshnessPolicySensorCursor":\n        return deserialize_value(json_str, FreshnessPolicySensorCursor)\n\n\n
[docs]class FreshnessPolicySensorContext(\n NamedTuple(\n "_FreshnessPolicySensorContext",\n [\n ("sensor_name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("freshness_policy", PublicAttr[FreshnessPolicy]),\n ("minutes_overdue", PublicAttr[Optional[float]]),\n ("previous_minutes_overdue", PublicAttr[Optional[float]]),\n ("instance", PublicAttr[DagsterInstance]),\n ("resources", Resources),\n ],\n )\n):\n """The ``context`` object available to a decorated function of ``freshness_policy_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n asset_key (AssetKey): the key of the asset being monitored\n freshness_policy (FreshnessPolicy): the freshness policy of the asset being monitored\n minutes_overdue (Optional[float])\n previous_minutes_overdue (Optional[float]): the minutes_overdue value for this asset on the\n previous sensor tick.\n instance (DagsterInstance): the current instance.\n """\n\n def __new__(\n cls,\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float],\n instance: DagsterInstance,\n resources: Optional[Resources] = None,\n ):\n minutes_overdue = check.opt_numeric_param(minutes_overdue, "minutes_overdue")\n previous_minutes_overdue = check.opt_numeric_param(\n previous_minutes_overdue, "previous_minutes_overdue"\n )\n return super(FreshnessPolicySensorContext, cls).__new__(\n cls,\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n freshness_policy=check.inst_param(freshness_policy, "FreshnessPolicy", FreshnessPolicy),\n minutes_overdue=float(minutes_overdue) if minutes_overdue is not None else None,\n previous_minutes_overdue=(\n float(previous_minutes_overdue) if previous_minutes_overdue is not None else None\n ),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n resources=resources or 
ScopedResourcesBuilder.build_empty(),\n )
\n\n\n
[docs]@experimental\ndef build_freshness_policy_sensor_context(\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float] = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Resources] = None,\n) -> FreshnessPolicySensorContext:\n """Builds freshness policy sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@freshness_policy_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n asset_key (AssetKey): The AssetKey for the monitored asset\n freshness_policy (FreshnessPolicy): The FreshnessPolicy for the monitored asset\n minutes_overdue (Optional[float]): How overdue the monitored asset currently is\n previous_minutes_overdue (Optional[float]): How overdue the monitored asset was on the\n previous tick.\n instance (DagsterInstance): The dagster instance configured for the context.\n\n Examples:\n .. code-block:: python\n\n context = build_freshness_policy_sensor_context(\n sensor_name="freshness_policy_sensor_to_invoke",\n asset_key=AssetKey("some_asset"),\n freshness_policy=FreshnessPolicy(maximum_lag_minutes=30)<\n minutes_overdue=10.0,\n )\n freshness_policy_sensor_to_invoke(context)\n """\n return FreshnessPolicySensorContext(\n sensor_name=sensor_name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_overdue,\n previous_minutes_overdue=previous_minutes_overdue,\n instance=instance or DagsterInstance.ephemeral(),\n resources=resources,\n )
\n\n\n
[docs]class FreshnessPolicySensorDefinition(SensorDefinition):\n """Define a sensor that reacts to the status of a given set of asset freshness policies,\n where the decorated function will be evaluated on every sensor tick.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_selection: AssetSelection,\n freshness_policy_sensor_fn: Callable[..., None],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n check.str_param(name, "name")\n check.inst_param(asset_selection, "asset_selection", AssetSelection)\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n self._freshness_policy_sensor_fn = check.callable_param(\n freshness_policy_sensor_fn, "freshness_policy_sensor_fn"\n )\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(freshness_policy_sensor_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def 
_wrapped_fn(context: SensorEvaluationContext):\n from dagster._utils.caching_instance_queryer import (\n CachingInstanceQueryer, # expensive import\n )\n\n if context.repository_def is None:\n raise DagsterInvalidInvocationError(\n "The `repository_def` property on the `SensorEvaluationContext` passed into a "\n "`FreshnessPolicySensorDefinition` must not be None."\n )\n\n if context.cursor is None or not FreshnessPolicySensorCursor.is_valid(context.cursor):\n new_cursor = FreshnessPolicySensorCursor({})\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initializing {name}.")\n return\n\n evaluation_time = pendulum.now("UTC")\n asset_graph = context.repository_def.asset_graph\n instance_queryer = CachingInstanceQueryer(\n context.instance, asset_graph, evaluation_time\n )\n data_time_resolver = CachingDataTimeResolver(instance_queryer=instance_queryer)\n monitored_keys = asset_selection.resolve(asset_graph)\n\n # get the previous status from the cursor\n previous_minutes_late_by_key = FreshnessPolicySensorCursor.from_json(\n context.cursor\n ).minutes_late_by_key\n\n minutes_late_by_key: Dict[AssetKey, Optional[float]] = {}\n for asset_key in monitored_keys:\n freshness_policy = asset_graph.freshness_policies_by_key.get(asset_key)\n if freshness_policy is None:\n continue\n\n # get the current minutes_overdue value for this asset\n result = data_time_resolver.get_minutes_overdue(\n evaluation_time=evaluation_time,\n asset_key=asset_key,\n )\n minutes_late_by_key[asset_key] = result.overdue_minutes if result else None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n context_param_name = get_context_param_name(freshness_policy_sensor_fn)\n freshness_context = FreshnessPolicySensorContext(\n sensor_name=name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_late_by_key[asset_key],\n 
previous_minutes_overdue=previous_minutes_late_by_key.get(asset_key),\n instance=context.instance,\n resources=context.resources,\n )\n\n with user_code_error_boundary(\n FreshnessPolicySensorExecutionError,\n lambda: f'Error occurred during the execution of sensor "{name}".',\n ):\n context_param = (\n {context_param_name: freshness_context} if context_param_name else {}\n )\n result = freshness_policy_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is not None:\n raise DagsterInvalidDefinitionError(\n "Functions decorated by `@freshness_policy_sensor` may not return or yield"\n " a value."\n )\n\n context.update_cursor(\n FreshnessPolicySensorCursor.from_dict(minutes_late_by_key).to_json()\n )\n\n super(FreshnessPolicySensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> None:\n context_param_name = get_context_param_name(self._freshness_policy_sensor_fn)\n\n sensor_context = get_sensor_context_from_args_or_kwargs(\n self._freshness_policy_sensor_fn,\n args,\n kwargs,\n context_type=FreshnessPolicySensorContext,\n )\n context_param = (\n {context_param_name: sensor_context} if context_param_name and sensor_context else {}\n )\n\n resources = validate_and_get_resource_dict(\n sensor_context.resources if sensor_context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n return self._freshness_policy_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.FRESHNESS_POLICY
\n\n\n
[docs]@experimental\ndef freshness_policy_sensor(\n asset_selection: AssetSelection,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[[Callable[..., None]], FreshnessPolicySensorDefinition,]:\n """Define a sensor that reacts to the status of a given set of asset freshness policies, where the\n decorated function will be evaluated on every tick for each asset in the selection that has a\n FreshnessPolicy defined.\n\n Note: returning or yielding a value from the annotated function will result in an error.\n\n Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n\n Args:\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def inner(fn: Callable[..., None]) -> FreshnessPolicySensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n return FreshnessPolicySensorDefinition(\n name=sensor_name,\n freshness_policy_sensor_fn=fn,\n asset_selection=asset_selection,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy_sensor_definition"}, "graph_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.graph_definition

\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom toposort import CircularDependencyError, toposort_flatten\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.selector.subset_selector import AssetSelectionData\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    DagsterTypeKind,\n    construct_dagster_type_dictionary,\n)\n\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    GraphNode,\n    Node,\n    NodeHandle,\n    NodeInput,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import RawMetadataValue\nfrom .node_container import create_execution_structure, normalize_dependency_dict\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .resource_requirement import ResourceRequirement\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.instance import DagsterInstance\n\n    from .asset_layer import AssetLayer\n    from .composition import PendingNodeInvocation\n    from .executor_definition import ExecutorDefinition\n    from .job_definition import 
JobDefinition\n    from .op_definition import OpDefinition\n    from .partition import PartitionedConfig, PartitionsDefinition\n    from .run_config import RunConfig\n    from .source_asset import SourceAsset\n\nT = TypeVar("T")\n\n\ndef _check_node_defs_arg(\n    graph_name: str, node_defs: Optional[Sequence[NodeDefinition]]\n) -> Sequence[NodeDefinition]:\n    node_defs = node_defs or []\n\n    _node_defs = check.opt_sequence_param(node_defs, "node_defs")\n    for node_def in _node_defs:\n        if isinstance(node_def, NodeDefinition):\n            continue\n        elif callable(node_def):\n            raise DagsterInvalidDefinitionError(\n                """You have passed a lambda or function {func} into {name} that is\n                not a node. You have likely forgetten to annotate this function with\n                the @op or @graph decorators.'\n                """.format(name=graph_name, func=node_def.__name__)\n            )\n        else:\n            raise DagsterInvalidDefinitionError(f"Invalid item in node list: {node_def!r}")\n\n    return node_defs\n\n\ndef create_adjacency_lists(\n    nodes: Sequence[Node],\n    dep_structure: DependencyStructure,\n) -> Tuple[Mapping[str, Set[str]], Mapping[str, Set[str]]]:\n    visit_dict = {s.name: False for s in nodes}\n    forward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n    backward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n\n    def visit(node_name: str) -> None:\n        if visit_dict[node_name]:\n            return\n\n        visit_dict[node_name] = True\n\n        for node_output in dep_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = node_output.node.name\n            backward_node = node_name\n            if forward_node in forward_edges:\n                forward_edges[forward_node].add(backward_node)\n                backward_edges[backward_node].add(forward_node)\n                visit(forward_node)\n\n    for s in nodes:\n      
  visit(s.name)\n\n    return (forward_edges, backward_edges)\n\n\n
[docs]class GraphDefinition(NodeDefinition):\n """Defines a Dagster op graph.\n\n An op graph is made up of\n\n - Nodes, which can either be an op (the functional unit of computation), or another graph.\n - Dependencies, which determine how the values produced by nodes as outputs flow from\n one node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n (DAG) of compute.\n\n End users should prefer the :func:`@graph <graph>` decorator. GraphDefinition is generally\n intended to be used by framework authors or for programatically generated graphs.\n\n Args:\n name (str): The name of the graph. Must be unique within any :py:class:`GraphDefinition`\n or :py:class:`JobDefinition` containing the graph.\n description (Optional[str]): A human-readable description of the job.\n node_defs (Optional[Sequence[NodeDefinition]]): The set of ops / graphs used in this graph.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the graph. 
Keys of the top level dict are either the string names of ops in the\n graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Defines the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Defines the outputs of the nested graph,\n and how they map from the outputs of its constituent ops.\n config (Optional[ConfigMapping]): Defines the config of the graph, and how its schema maps\n to the config of its constituent ops.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(num):\n return num + 1\n\n graph_def = GraphDefinition(\n name='basic',\n node_defs=[return_one, add_one],\n dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n )\n """\n\n _node_defs: Sequence[NodeDefinition]\n _dagster_type_dict: Mapping[str, DagsterType]\n _dependencies: DependencyMapping[NodeInvocation]\n _dependency_structure: DependencyStructure\n _node_dict: Mapping[str, Node]\n _input_mappings: Sequence[InputMapping]\n _output_mappings: Sequence[OutputMapping]\n _config_mapping: Optional[ConfigMapping]\n _nodes_in_topological_order: Sequence[Node]\n\n # (node name within the graph -> (input name -> SourceAsset to load that input from))\n # Does NOT include keys for:\n # - Inputs to the graph itself\n # - Inputs to nodes within sub-graphs of the graph\n _node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]]\n\n def __init__(\n self,\n name: str,\n *,\n description: Optional[str] = None,\n node_defs: Optional[Sequence[NodeDefinition]] = None,\n dependencies: Optional[\n Union[DependencyMapping[str], DependencyMapping[NodeInvocation]]\n ] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n **kwargs: Any,\n ):\n self._node_defs = _check_node_defs_arg(name, node_defs)\n\n # `dependencies` will be converted to `dependency_structure` and `node_dict`, which may\n # alternatively be passed directly (useful when copying)\n self._dependencies = normalize_dependency_dict(dependencies)\n self._dependency_structure, self._node_dict = create_execution_structure(\n self._node_defs, self._dependencies, graph_definition=self\n )\n\n # Sequence[InputMapping]\n self._input_mappings = 
check.opt_sequence_param(input_mappings, "input_mappings")\n input_defs = _validate_in_mappings(\n self._input_mappings,\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n\n # Sequence[OutputMapping]\n self._output_mappings, output_defs = _validate_out_mappings(\n check.opt_sequence_param(output_mappings, "output_mappings"),\n self._node_dict,\n name,\n class_name=type(self).__name__,\n )\n\n self._config_mapping = check.opt_inst_param(config, "config", ConfigMapping)\n\n super(GraphDefinition, self).__init__(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n tags=tags,\n **kwargs,\n )\n\n # must happen after base class construction as properties are assumed to be there\n # eager computation to detect cycles\n self._nodes_in_topological_order = self._get_nodes_in_topological_order()\n self._dagster_type_dict = construct_dagster_type_dictionary([self])\n self._node_input_source_assets = check.opt_mapping_param(\n node_input_source_assets, "node_input_source_assets", key_type=str, value_type=dict\n )\n\n def _get_nodes_in_topological_order(self) -> Sequence[Node]:\n _forward_edges, backward_edges = create_adjacency_lists(\n self.nodes, self.dependency_structure\n )\n\n try:\n order = toposort_flatten(backward_edges)\n except CircularDependencyError as err:\n raise DagsterInvalidDefinitionError(str(err)) from err\n\n return [self.node_named(node_name) for node_name in order]\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n unresolveable_input_defs: List[InputDefinition] = []\n for node in self.node_dict.values():\n cur_handle = NodeHandle(node.name, handle)\n for input_def in node.definition.get_inputs_must_be_resolved_top_level(\n asset_layer, cur_handle\n ):\n if self.dependency_structure.has_deps(NodeInput(node, input_def)):\n continue\n elif not 
node.container_maps_input(input_def.name):\n raise DagsterInvalidDefinitionError(\n f"Input '{input_def.name}' of {node.describe_node()} "\n "has no way of being resolved. Must provide a resolution to this "\n "input via another op/graph, or via a direct input value mapped from the "\n "top-level graph. To "\n "learn more, see the docs for unconnected inputs: "\n "https://docs.dagster.io/concepts/io-management/unconnected-inputs#unconnected-inputs."\n )\n else:\n mapped_input = node.container_mapped_input(input_def.name)\n unresolveable_input_defs.append(mapped_input.get_definition())\n return unresolveable_input_defs\n\n @property\n def node_type_str(self) -> str:\n return "graph"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def nodes(self) -> Sequence[Node]:\n return list(set(self._node_dict.values()))\n\n @property\n def node_dict(self) -> Mapping[str, Node]:\n return self._node_dict\n\n @property\n def node_defs(self) -> Sequence[NodeDefinition]:\n return self._node_defs\n\n @property\n def nodes_in_topological_order(self) -> Sequence[Node]:\n return self._nodes_in_topological_order\n\n @property\n def node_input_source_assets(self) -> Mapping[str, Mapping[str, "SourceAsset"]]:\n return self._node_input_source_assets\n\n def has_node_named(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._node_dict\n\n def node_named(self, name: str) -> Node:\n check.str_param(name, "name")\n if name not in self._node_dict:\n raise DagsterInvariantViolationError(f"{self._name} has no op named {name}.")\n\n return self._node_dict[name]\n\n def get_node(self, handle: NodeHandle) -> Node:\n check.inst_param(handle, "handle", NodeHandle)\n current = handle\n lineage: List[str] = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n name = lineage.pop()\n node = self.node_named(name)\n while lineage:\n name = lineage.pop()\n # We know that this is a current node is a graph while 
ascending lineage\n definition = cast(GraphDefinition, node.definition)\n node = definition.node_named(name)\n\n return node\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_node_defs()\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_op_defs()\n\n def iterate_node_handles(\n self, parent_node_handle: Optional[NodeHandle] = None\n ) -> Iterator[NodeHandle]:\n for node in self.node_dict.values():\n cur_node_handle = NodeHandle(node.name, parent_node_handle)\n if isinstance(node, GraphNode):\n yield from node.definition.iterate_node_handles(cur_node_handle)\n yield cur_node_handle\n\n @public\n @property\n def input_mappings(self) -> Sequence[InputMapping]:\n """Input mappings for the graph.\n\n An input mapping is a mapping from an input of the graph to an input of a child node.\n """\n return self._input_mappings\n\n @public\n @property\n def output_mappings(self) -> Sequence[OutputMapping]:\n """Output mappings for the graph.\n\n An output mapping is a mapping from an output of the graph to an output of a child node.\n """\n return self._output_mappings\n\n @public\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n """The config mapping for the graph, if present.\n\n By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.\n """\n return self._config_mapping\n\n @property\n def has_config_mapping(self) -> bool:\n return self._config_mapping is not None\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._dagster_type_dict.values()\n\n def has_dagster_type(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._dagster_type_dict\n\n def dagster_type_named(self, name: str) -> DagsterType:\n check.str_param(name, "name")\n return 
self._dagster_type_dict[name]\n\n def get_input_mapping(self, input_name: str) -> InputMapping:\n check.str_param(input_name, "input_name")\n for mapping in self._input_mappings:\n if mapping.graph_input_name == input_name:\n return mapping\n check.failed(f"Could not find input mapping {input_name}")\n\n def input_mapping_for_pointer(\n self, pointer: Union[InputPointer, FanInInputPointer]\n ) -> Optional[InputMapping]:\n check.inst_param(pointer, "pointer", (InputPointer, FanInInputPointer))\n\n for mapping in self._input_mappings:\n if mapping.maps_to == pointer:\n return mapping\n return None\n\n def get_output_mapping(self, output_name: str) -> OutputMapping:\n check.str_param(output_name, "output_name")\n for mapping in self._output_mappings:\n if mapping.graph_output_name == output_name:\n return mapping\n check.failed(f"Could not find output mapping {output_name}")\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: Optional[NodeHandle]\n ) -> Tuple[OutputDefinition, Optional[NodeHandle]]:\n check.str_param(output_name, "output_name")\n check.opt_inst_param(handle, "handle", NodeHandle)\n\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n mapped_node = self.node_named(mapping.maps_from.node_name)\n return mapped_node.definition.resolve_output_to_origin(\n mapping.maps_from.output_name,\n NodeHandle(mapped_node.name, handle),\n )\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n return self.node_named(\n mapping.maps_from.node_name\n ).definition.resolve_output_to_origin_op_def(output_name)\n\n def default_value_for_input(self, input_name: str) -> object:\n check.str_param(input_name, "input_name")\n\n # base case\n if 
self.input_def_named(input_name).has_default_value:\n return self.input_def_named(input_name).default_value\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.default_value_for_input(mapping.maps_to.input_name)\n\n def input_has_default(self, input_name: str) -> bool:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return True\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.input_has_default(mapping.maps_to.input_name)\n\n @property\n def dependencies(self) -> DependencyMapping[NodeInvocation]:\n return self._dependencies\n\n @property\n def dependency_structure(self) -> DependencyStructure:\n return self._dependency_structure\n\n @property\n def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self.config_mapping.config_schema if self.config_mapping is not None else None\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n mapping = self.get_input_mapping(input_name)\n target_node = mapping.maps_to.node_name\n # check if input mapped to node which is downstream of another dynamic output within\n if self.dependency_structure.is_dynamic_mapped(target_node):\n return False\n\n # check if input mapped to node which starts new dynamic downstream\n if self.dependency_structure.has_dynamic_downstreams(target_node):\n return False\n\n return self.node_named(target_node).definition.input_supports_dynamic_output_dep(\n mapping.maps_to.input_name\n )\n\n def copy(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: 
Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n ) -> Self:\n return GraphDefinition(\n node_defs=self.node_defs,\n dependencies=self.dependencies,\n name=name or self.name,\n description=description or self.description,\n input_mappings=input_mappings or self._input_mappings,\n output_mappings=output_mappings or self._output_mappings,\n config=config or self.config_mapping,\n tags=tags or self.tags,\n node_input_source_assets=node_input_source_assets or self.node_input_source_assets,\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: Any,\n ) -> "GraphDefinition":\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only graphs utilizing config mapping can be pre-configured. The graph "\n f'"{self.name}" does not have a config mapping, and thus has nothing to be '\n "configured."\n )\n config_mapping = cast(ConfigMapping, self.config_mapping)\n return self.copy(\n name=name,\n description=check.opt_str_param(description, "description", default=self.description),\n config=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n )\n\n def node_names(self) -> Sequence[str]:\n return list(self._node_dict.keys())\n\n
[docs] @public\n def to_job(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union["RunConfig", ConfigMapping, Mapping[str, object], "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, str]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n op_selection: Optional[Sequence[str]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _asset_selection_data: Optional[AssetSelectionData] = None,\n ) -> "JobDefinition":\n """Make this graph in to an executable Job by providing remaining components required for execution.\n\n Args:\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of the this graph.\n resource_defs (Optional[Mapping [str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types\n logger_defs (Optional[Mapping[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoizaton will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition\n keys that can parameterize the job. If this argument is supplied, the config\n argument can't also be supplied.\n asset_layer (Optional[AssetLayer]): Top level information about the assets this job\n will produce. 
Generally should not be set manually.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Returns:\n JobDefinition\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .job_definition import JobDefinition\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs)\n\n return JobDefinition.dagster_internal_init(\n name=name,\n description=description or self.description,\n graph_def=self,\n resource_defs=wrapped_resource_defs,\n logger_defs=logger_defs,\n executor_def=executor_def,\n config=config,\n partitions_def=partitions_def,\n tags=tags,\n metadata=metadata,\n hook_defs=hooks,\n version_strategy=version_strategy,\n op_retry_policy=op_retry_policy,\n asset_layer=asset_layer,\n input_values=input_values,\n _subset_selection_data=_asset_selection_data,\n _was_explicitly_provided_resources=None, # None means this is determined by whether resource_defs contains any explicitly provided resources\n ).get_subset(op_selection=op_selection)
\n\n def coerce_to_job(self) -> "JobDefinition":\n # attempt to coerce a Graph in to a Job, raising a useful error if it doesn't work\n try:\n return self.to_job()\n except DagsterInvalidDefinitionError as err:\n raise DagsterInvalidDefinitionError(\n f"Failed attempting to coerce Graph {self.name} in to a Job. "\n "Use to_job instead, passing the required information."\n ) from err\n\n
[docs] @public\n def execute_in_process(\n self,\n run_config: Any = None,\n instance: Optional["DagsterInstance"] = None,\n resources: Optional[Mapping[str, object]] = None,\n raise_on_error: bool = True,\n op_selection: Optional[Sequence[str]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """Execute this graph in-process, collecting results in-memory.\n\n Args:\n run_config (Optional[Mapping[str, Any]]):\n Run config to provide to execution. The configuration for the underlying graph\n should exist under the "ops" key.\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n resources (Optional[Mapping[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. 
For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the graph.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n from dagster._core.instance import DagsterInstance\n\n from .executor_definition import execute_in_process_executor\n from .job_definition import JobDefinition\n\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n resource_defs = wrap_resources_for_execution(resources)\n\n ephemeral_job = JobDefinition(\n name=self._name,\n graph_def=self,\n executor_def=execute_in_process_executor,\n resource_defs=resource_defs,\n input_values=input_values,\n ).get_subset(op_selection=op_selection)\n\n run_config = run_config if run_config is not None else {}\n op_selection = check.opt_sequence_param(op_selection, "op_selection", str)\n\n return ephemeral_job.execute_in_process(\n run_config=run_config,\n instance=instance,\n raise_on_error=raise_on_error,\n run_id=run_id,\n )
\n\n @property\n def parent_graph_def(self) -> Optional["GraphDefinition"]:\n return None\n\n @property\n def is_subselected(self) -> bool:\n return False\n\n def get_resource_requirements(\n self, asset_layer: Optional["AssetLayer"] = None\n ) -> Iterator[ResourceRequirement]:\n for node in self.node_dict.values():\n yield from node.get_resource_requirements(outer_container=self, asset_layer=asset_layer)\n\n for dagster_type in self.all_dagster_types():\n yield from dagster_type.get_resource_requirements()\n\n @public\n @property\n def name(self) -> str:\n """The name of the graph."""\n return super(GraphDefinition, self).name\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """The tags associated with the graph."""\n return super(GraphDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Aliases the graph with a new name.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.alias("my_graph_alias")\n """\n return super(GraphDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Attaches the provided tags to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.tag({"my_tag": "my_value"})\n """\n return super(GraphDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Attaches the provided hooks to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_hooks({my_hook})\n """\n return super(GraphDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Attaches the provided retry policy to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n """\n return super(GraphDefinition, self).with_retry_policy(retry_policy)
\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n all_destinations: List[NodeInputHandle] = []\n for mapping in self.input_mappings:\n if mapping.graph_input_name != input_handle.input_name:\n continue\n # recurse into graph structure\n all_destinations += self.node_named(\n mapping.maps_to.node_name\n ).definition.resolve_input_to_destinations(\n NodeInputHandle(\n NodeHandle(mapping.maps_to.node_name, parent=input_handle.node_handle),\n mapping.maps_to.input_name,\n ),\n )\n\n return all_destinations
\n\n\nclass SubselectedGraphDefinition(GraphDefinition):\n """Defines a subselected graph.\n\n Args:\n parent_graph_def (GraphDefinition): The parent graph that this current graph is subselected\n from. This is used for tracking where the subselected graph originally comes from.\n Note that we allow subselecting a subselected graph, and this field refers to the direct\n parent graph of the current subselection, rather than the original root graph.\n node_defs (Optional[Sequence[NodeDefinition]]): A list of all top level nodes in the graph. A\n node can be an op or a graph that contains other nodes.\n dependencies (Optional[Mapping[Union[str, NodeInvocation], Mapping[str, IDependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the subselected graph. Keys of the top level dict are either the string names of\n ops in the graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Define the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Define the outputs of the nested graph, and\n how they map from the outputs of its constituent ops.\n """\n\n def __init__(\n self,\n parent_graph_def: GraphDefinition,\n node_defs: Optional[Sequence[NodeDefinition]],\n dependencies: Optional[\n Union[\n DependencyMapping[str],\n DependencyMapping[NodeInvocation],\n ]\n ],\n input_mappings: Optional[Sequence[InputMapping]],\n output_mappings: Optional[Sequence[OutputMapping]],\n ):\n self._parent_graph_def = check.inst_param(\n parent_graph_def, "parent_graph_def", GraphDefinition\n )\n super(SubselectedGraphDefinition, self).__init__(\n name=parent_graph_def.name, # should we 
create special name for subselected graphs\n node_defs=node_defs,\n dependencies=dependencies,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=parent_graph_def.config_mapping,\n tags=parent_graph_def.tags,\n )\n\n @property\n def parent_graph_def(self) -> GraphDefinition:\n return self._parent_graph_def\n\n def get_top_level_omitted_nodes(self) -> Sequence[Node]:\n return [node for node in self.parent_graph_def.nodes if not self.has_node_named(node.name)]\n\n @property\n def is_subselected(self) -> bool:\n return True\n\n\ndef _validate_in_mappings(\n input_mappings: Sequence[InputMapping],\n nodes_by_name: Mapping[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> Sequence[InputDefinition]:\n from .composition import MappedInputPlaceholder\n\n input_defs_by_name: Dict[str, InputDefinition] = OrderedDict()\n mapping_keys: Set[str] = set()\n\n target_input_types_by_graph_input_name: Dict[str, Set[DagsterType]] = defaultdict(set)\n\n for mapping in input_mappings:\n # handle incorrect objects passed in as mappings\n if not isinstance(mapping, InputMapping):\n if isinstance(mapping, InputDefinition):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' you passed an InputDefinition "\n f"named '{mapping.name}' directly in to input_mappings. Return "\n "an InputMapping by calling mapping_to on the InputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' received unexpected type '{type(mapping)}' in"\n " input_mappings. 
Provide an InputMapping using InputMapping(...)"\n )\n\n input_defs_by_name[mapping.graph_input_name] = mapping.get_definition()\n\n target_node = nodes_by_name.get(mapping.maps_to.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping references node "\n f"'{mapping.maps_to.node_name}' which it does not contain."\n )\n if not target_node.has_input(mapping.maps_to.input_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping to node '{mapping.maps_to.node_name}' "\n f"which contains no input named '{mapping.maps_to.input_name}'"\n )\n\n target_input_def = target_node.input_def_named(mapping.maps_to.input_name)\n node_input = NodeInput(target_node, target_input_def)\n\n if mapping.maps_to_fan_in:\n maps_to = cast(FanInInputPointer, mapping.maps_to)\n if not dependency_structure.has_fan_in_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target"\n f' "{maps_to.node_name}.{maps_to.input_name}" (index'\n f" {maps_to.fan_in_index} of fan-in) is not a MultiDependencyDefinition."\n )\n inner_deps = dependency_structure.get_fan_in_deps(node_input)\n if (maps_to.fan_in_index >= len(inner_deps)) or (\n inner_deps[maps_to.fan_in_index] is not MappedInputPlaceholder\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.node_name}.{maps_to.input_name}" index {maps_to.fan_in_index} in '\n "the MultiDependencyDefinition is not a MappedInputPlaceholder"\n )\n mapping_keys.add(f"{maps_to.node_name}.{maps_to.input_name}.{maps_to.fan_in_index}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type.get_inner_type_for_fan_in()\n )\n else:\n if dependency_structure.has_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n 
f'"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}" '\n "is already satisfied by output"\n )\n\n mapping_keys.add(f"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type\n )\n\n for node_input in dependency_structure.inputs():\n if dependency_structure.has_fan_in_deps(node_input):\n for idx, dep in enumerate(dependency_structure.get_fan_in_deps(node_input)):\n if dep is MappedInputPlaceholder:\n mapping_str = f"{node_input.node_name}.{node_input.input_name}.{idx}"\n if mapping_str not in mapping_keys:\n raise DagsterInvalidDefinitionError(\n f"Unsatisfied MappedInputPlaceholder at index {idx} in"\n " MultiDependencyDefinition for"\n f" '{node_input.node_name}.{node_input.input_name}'"\n )\n\n # if the dagster type on a graph input is Any and all its target inputs have the\n # same dagster type, then use that dagster type for the graph input\n for graph_input_name, graph_input_def in input_defs_by_name.items():\n if graph_input_def.dagster_type.kind == DagsterTypeKind.ANY:\n target_input_types = target_input_types_by_graph_input_name[graph_input_name]\n if len(target_input_types) == 1:\n input_defs_by_name[graph_input_name] = graph_input_def.with_dagster_type(\n next(iter(target_input_types))\n )\n\n return list(input_defs_by_name.values())\n\n\ndef _validate_out_mappings(\n output_mappings: Sequence[OutputMapping],\n node_dict: Mapping[str, Node],\n name: str,\n class_name: str,\n) -> Tuple[Sequence[OutputMapping], Sequence[OutputDefinition]]:\n output_defs: List[OutputDefinition] = []\n for mapping in output_mappings:\n if isinstance(mapping, OutputMapping):\n target_node = node_dict.get(mapping.maps_from.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output mapping references node "\n f"'{mapping.maps_from.node_name}' which it does not contain."\n )\n if not 
target_node.has_output(mapping.maps_from.output_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} {name} output mapping from {target_node.describe_node()} "\n f"which contains no output named '{mapping.maps_from.output_name}'"\n )\n\n target_output = target_node.output_def_named(mapping.maps_from.output_name)\n output_def = mapping.get_definition(is_dynamic=target_output.is_dynamic)\n output_defs.append(output_def)\n\n if (\n mapping.dagster_type\n and mapping.dagster_type.kind != DagsterTypeKind.ANY\n and (target_output.dagster_type != mapping.dagster_type)\n and class_name != "GraphDefinition"\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output '{mapping.graph_output_name}' of type"\n f" {mapping.dagster_type.display_name} maps from"\n f" {mapping.maps_from.node_name}.{mapping.maps_from.output_name} of different"\n f" type {target_output.dagster_type.display_name}. OutputMapping source and"\n " destination must have the same type."\n )\n\n elif isinstance(mapping, OutputDefinition):\n raise DagsterInvalidDefinitionError(\n f"You passed an OutputDefinition named '{mapping.name}' directly "\n "in to output_mappings. Return an OutputMapping by calling "\n "mapping_from on the OutputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"Received unexpected type '{type(mapping)}' in output_mappings. "\n "Provide an OutputMapping using OutputDefinition(...).mapping_from(...)"\n )\n return output_mappings, output_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/graph_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.graph_definition"}, "hook_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.hook_definition

\nfrom typing import AbstractSet, Any, Callable, Iterator, NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\n\nfrom ..decorator_utils import get_function_params\nfrom ..errors import DagsterInvalidInvocationError\nfrom .resource_requirement import HookResourceRequirement, RequiresResources, ResourceRequirement\nfrom .utils import check_valid_name\n\n\n
[docs]class HookDefinition(\n NamedTuple(\n "_HookDefinition",\n [\n ("name", PublicAttr[str]),\n ("hook_fn", PublicAttr[Callable]),\n ("required_resource_keys", PublicAttr[AbstractSet[str]]),\n ("decorated_fn", PublicAttr[Optional[Callable]]),\n ],\n ),\n RequiresResources,\n):\n """Define a hook which can be triggered during a op execution (e.g. a callback on the step\n execution failure event during a op execution).\n\n Args:\n name (str): The name of this hook.\n hook_fn (Callable): The callback function that will be triggered.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n """\n\n def __new__(\n cls,\n *,\n name: str,\n hook_fn: Callable[..., Any],\n required_resource_keys: Optional[AbstractSet[str]] = None,\n decorated_fn: Optional[Callable[..., Any]] = None,\n ):\n return super(HookDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n hook_fn=check.callable_param(hook_fn, "hook_fn"),\n required_resource_keys=frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n ),\n decorated_fn=check.opt_callable_param(decorated_fn, "decorated_fn"),\n )\n\n def __call__(self, *args, **kwargs):\n """This is invoked when the hook is used as a decorator.\n\n We currently support hooks to decorate the following:\n\n - JobDefinition: when the hook decorates a job definition, it will be added to\n all the op invocations within the job.\n\n Example:\n .. 
code-block:: python\n\n @success_hook\n def slack_message_on_success(_):\n ...\n\n @slack_message_on_success\n @job\n def a_job():\n foo(bar())\n\n """\n from ..execution.context.hook import HookContext\n from .graph_definition import GraphDefinition\n from .hook_invocation import hook_invocation_result\n from .job_definition import JobDefinition\n\n if len(args) > 0 and isinstance(args[0], (JobDefinition, GraphDefinition)):\n # when it decorates a job, we apply this hook to all the op invocations within\n # the job.\n return args[0].with_hooks({self})\n else:\n if not self.decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only hook definitions created using one of the hook decorators can be invoked."\n )\n fxn_args = get_function_params(self.decorated_fn)\n # If decorated fxn has two arguments, then this is an event list hook fxn, and parameter\n # names are always context and event_list\n if len(fxn_args) == 2:\n context_arg_name = fxn_args[0].name\n event_list_arg_name = fxn_args[1].name\n if len(args) + len(kwargs) != 2:\n raise DagsterInvalidInvocationError(\n "Decorated function expects two parameters, context and event_list, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], "context", HookContext)\n event_list = check.opt_list_param(\n args[1] if len(args) > 1 else kwargs[event_list_arg_name],\n event_list_arg_name,\n )\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n if event_list_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{event_list_arg_name}'. 
Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n event_list = check.opt_list_param(\n kwargs[event_list_arg_name], event_list_arg_name\n )\n return hook_invocation_result(self, context, event_list)\n else:\n context_arg_name = fxn_args[0].name\n if len(args) + len(kwargs) != 1:\n raise DagsterInvalidInvocationError(\n f"Decorated function expects one parameter, {context_arg_name}, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], context_arg_name, HookContext)\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n return hook_invocation_result(self, context)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n # outer_context in this case is a string of (job, job name) or (node, node name)\n attached_to = cast(Optional[str], outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield HookResourceRequirement(\n key=resource_key, attached_to=attached_to, hook_name=self.name\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/hook_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.hook_definition"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.input

\nimport inspect\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param, experimental_param\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (  # BuiltinScalarDagsterType,\n    DagsterType,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredInputProps\nfrom .utils import NoValueSentinel, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nT = TypeVar("T")\n\n\n# unfortunately since type_check functions need TypeCheckContext which is only available\n# at runtime, we can only check basic types before runtime\ndef _check_default_value(input_name: str, dagster_type: DagsterType, default_value: T) -> T:\n    from dagster._core.types.dagster_type import BuiltinScalarDagsterType\n\n    if default_value is not NoValueSentinel:\n        if dagster_type.is_nothing:\n            raise DagsterInvalidDefinitionError(\n                "Setting a default_value is invalid on InputDefinitions of type Nothing"\n            )\n\n        if isinstance(dagster_type, BuiltinScalarDagsterType):\n            type_check = dagster_type.type_check_scalar_value(default_value)\n            if not type_check.success:\n                raise DagsterInvalidDefinitionError(\n                    "Type check failed for the default_value of InputDefinition "\n                    f"{input_name} of type {dagster_type.display_name}. 
"\n                    f"Received value {default_value} of type {type(default_value)}",\n                )\n\n    return default_value\n\n\n@experimental_param(param="asset_key")\n@experimental_param(param="asset_partitions")\nclass InputDefinition:\n    """Defines an argument to an op's compute function.\n\n    Inputs may flow from previous op outputs, or be stubbed using config. They may optionally\n    be typed using the Dagster type system.\n\n    Args:\n        name (str): Name of the input.\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this input.\n            Users should provide the Python type of the objects that they expect to be passed for\n            this input, or a :py:class:`DagsterType` that defines a runtime check that they want\n            to be run on this input. Defaults to :py:class:`Any`.\n        description (Optional[str]): Human-readable description of the input.\n        default_value (Optional[Any]): The default value to use if no input is provided.\n        metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n        asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n            (or function that produces an AssetKey from the InputContext) which should be associated\n            with this InputDefinition. 
Used for tracking lineage information through Dagster.\n        asset_partitions (Optional[Union[AbstractSet[str], InputContext -> AbstractSet[str]]]): (Experimental) A\n            set of partitions of the given asset_key (or a function that produces this list of\n            partitions from the InputContext) which should be associated with this InputDefinition.\n        input_manager_key (Optional[str]): (Experimental) The resource key for the\n            :py:class:`InputManager` used for loading this input when it is not connected to an\n            upstream output.\n    """\n\n    _name: str\n    _type_not_set: bool\n    _dagster_type: DagsterType\n    _description: Optional[str]\n    _default_value: Any\n    _input_manager_key: Optional[str]\n    _raw_metadata: ArbitraryMetadataMapping\n    _metadata: Mapping[str, MetadataValue]\n    _asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]\n    _asset_partitions_fn: Optional[Callable[["InputContext"], Set[str]]]\n\n    def __init__(\n        self,\n        name: str,\n        dagster_type: object = None,\n        description: Optional[str] = None,\n        default_value: object = NoValueSentinel,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n        asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n        input_manager_key: Optional[str] = None,\n        # when adding new params, make sure to update combine_with_inferred and with_dagster_type below\n    ):\n        self._name = check_valid_name(name, allow_list=["config"])\n\n        self._type_not_set = dagster_type is None\n        self._dagster_type = check.inst(resolve_dagster_type(dagster_type), DagsterType)\n\n        self._description = check.opt_str_param(description, "description")\n\n        self._default_value = _check_default_value(self._name, self._dagster_type, 
default_value)\n\n        self._input_manager_key = check.opt_str_param(input_manager_key, "input_manager_key")\n\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n        if not callable(asset_key):\n            check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n        self._asset_key = asset_key\n\n        if asset_partitions:\n            check.param_invariant(\n                asset_key is not None,\n                "asset_partitions",\n                'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n            )\n        if callable(asset_partitions):\n            self._asset_partitions_fn = asset_partitions\n        elif asset_partitions is not None:\n            _asset_partitions = check.set_param(asset_partitions, "asset_partitions", of_type=str)\n            self._asset_partitions_fn = lambda _: _asset_partitions\n        else:\n            self._asset_partitions_fn = None\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def has_default_value(self) -> bool:\n        return self._default_value is not NoValueSentinel\n\n    @property\n    def default_value(self) -> Any:\n        check.invariant(self.has_default_value, "Can only fetch default_value if has_default_value")\n        return self._default_value\n\n    @property\n    def input_manager_key(self) -> Optional[str]:\n        return self._input_manager_key\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_asset(self) -> bool:\n        return self._asset_key is not None\n\n    @property\n    def hardcoded_asset_key(self) 
-> Optional[AssetKey]:\n        if not callable(self._asset_key):\n            return self._asset_key\n        else:\n            return None\n\n    def get_asset_key(self, context: "InputContext") -> Optional[AssetKey]:\n        """Get the AssetKey associated with this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if callable(self._asset_key):\n            return self._asset_key(context)\n        else:\n            return self.hardcoded_asset_key\n\n    def get_asset_partitions(self, context: "InputContext") -> Optional[Set[str]]:\n        """Get the set of partitions that this op will read from this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if self._asset_partitions_fn is None:\n            return None\n\n        return self._asset_partitions_fn(context)\n\n    def mapping_to(\n        self, node_name: str, input_name: str, fan_in_index: Optional[int] = None\n    ) -> "InputMapping":\n        """Create an input mapping to an input of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`InputMapping` to the input of a child node.\n\n        Args:\n            node_name (str): The name of the child node to which to map this input.\n            input_name (str): The name of the child node' input to which to map this input.\n            fan_in_index (Optional[int]): The index in to a fanned in input, else None\n\n        Examples:\n            .. 
code-block:: python\n\n                input_mapping = InputDefinition('composite_input', Int).mapping_to(\n                    'child_node', 'int_input'\n                )\n        """\n        check.str_param(node_name, "node_name")\n        check.str_param(input_name, "input_name")\n        check.opt_int_param(fan_in_index, "fan_in_index")\n\n        return InputMapping(\n            graph_input_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_input_name=input_name,\n            fan_in_index=fan_in_index,\n            graph_input_description=self.description,\n            dagster_type=self.dagster_type,\n        )\n\n    @staticmethod\n    def create_from_inferred(inferred: InferredInputProps) -> "InputDefinition":\n        return InputDefinition(\n            name=inferred.name,\n            dagster_type=_checked_inferred_type(inferred),\n            description=inferred.description,\n            default_value=inferred.default_value,\n        )\n\n    def combine_with_inferred(self, inferred: InferredInputProps) -> "InputDefinition":\n        """Return a new InputDefinition that merges this ones properties with those inferred from type signature.\n        This can update: dagster_type, description, and default_value if they are not set.\n        """\n        check.invariant(\n            self.name == inferred.name,\n            f"InferredInputProps name {inferred.name} did not align with InputDefinition name"\n            f" {self.name}",\n        )\n\n        dagster_type = self._dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred)\n\n        description = self._description\n        if description is None and inferred.description is not None:\n            description = inferred.description\n\n        default_value = self._default_value\n        if not self.has_default_value:\n            default_value = inferred.default_value\n\n        return InputDefinition(\n            
name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            default_value=default_value,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n    def with_dagster_type(self, dagster_type: DagsterType) -> "InputDefinition":\n        return InputDefinition(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=self.description,\n            default_value=self.default_value if self.has_default_value else NoValueSentinel,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n\ndef _checked_inferred_type(inferred: InferredInputProps) -> DagsterType:\n    try:\n        if inferred.annotation == inspect.Parameter.empty:\n            resolved_type = resolve_dagster_type(None)\n        elif inferred.annotation is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            resolved_type = resolve_dagster_type(type(None))\n        else:\n            resolved_type = resolve_dagster_type(inferred.annotation)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred.annotation}' from type annotation for argument "\n            f"'{inferred.name}', correct the issue or explicitly set the dagster_type "\n            "via In()."\n        ) from e\n\n    return resolved_type\n\n\nclass InputPointer(NamedTuple("_InputPointer", [("node_name", str), ("input_name", str)])):\n    def __new__(cls, node_name: str, input_name: str):\n        return super(InputPointer, cls).__new__(\n            cls,\n  
          check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n        )\n\n\nclass FanInInputPointer(\n    NamedTuple(\n        "_FanInInputPointer", [("node_name", str), ("input_name", str), ("fan_in_index", int)]\n    )\n):\n    def __new__(cls, node_name: str, input_name: str, fan_in_index: int):\n        return super(FanInInputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n            check.int_param(fan_in_index, "fan_in_index"),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the upstream op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass InputMapping(NamedTuple):\n """Defines an input mapping for a graph.\n\n Args:\n graph_input_name (str): Name of the input in the graph being mapped from.\n mapped_node_name (str): Named of the node (op/graph) that the input is being mapped to.\n mapped_node_input_name (str): Name of the input in the node (op/graph) that is being mapped to.\n fan_in_index (Optional[int]): The index in to a fanned input, otherwise None.\n graph_input_description (Optional[str]): A description of the input in the graph being mapped from.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's input\n being mapped from.\n\n Examples:\n .. code-block:: python\n\n from dagster import InputMapping, GraphDefinition, op, graph\n\n @op\n def needs_input(x):\n return x + 1\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[needs_input],\n input_mappings=[\n InputMapping(\n graph_input_name="maps_x", mapped_node_name="needs_input",\n mapped_node_input_name="x"\n )\n ]\n )\n\n @graph\n def the_graph(maps_x):\n needs_input(maps_x)\n """\n\n graph_input_name: str\n mapped_node_name: str\n mapped_node_input_name: str\n fan_in_index: Optional[int] = None\n graph_input_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n\n @property\n def maps_to(self) -> Union[InputPointer, FanInInputPointer]:\n if self.fan_in_index is not None:\n return FanInInputPointer(\n self.mapped_node_name, self.mapped_node_input_name, self.fan_in_index\n )\n return InputPointer(self.mapped_node_name, self.mapped_node_input_name)\n\n @property\n def maps_to_fan_in(self) -> bool:\n return 
isinstance(self.maps_to, FanInInputPointer)\n\n def describe(self) -> str:\n idx = self.maps_to.fan_in_index if isinstance(self.maps_to, FanInInputPointer) else ""\n return f"{self.graph_input_name} -> {self.maps_to.node_name}:{self.maps_to.input_name}{idx}"\n\n def get_definition(self) -> "InputDefinition":\n return InputDefinition(\n name=self.graph_input_name,\n description=self.graph_input_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class In(\n NamedTuple(\n "_In",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("default_value", PublicAttr[Any]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n (\n "asset_key",\n PublicAttr[Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]],\n ),\n (\n "asset_partitions",\n PublicAttr[Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]]],\n ),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an argument to an op's compute function.\n\n Inputs may flow from previous op's outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this input. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n metadata (Optional[Dict[str, RawMetadataValue]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this In. 
Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this In.\n input_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`InputManager` used for loading this input when it is not connected to an\n upstream output.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n default_value: Any = NoValueSentinel,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n input_manager_key: Optional[str] = None,\n ):\n return super(In, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n default_value=default_value,\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n asset_key=check.opt_inst_param(asset_key, "asset_key", (AssetKey, FunctionType)),\n asset_partitions=asset_partitions,\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n )\n\n @staticmethod\n def from_definition(input_def: InputDefinition) -> "In":\n return In(\n dagster_type=input_def.dagster_type,\n description=input_def.description,\n default_value=input_def._default_value, # noqa: SLF001\n metadata=input_def.metadata,\n asset_key=input_def._asset_key, # noqa: SLF001\n asset_partitions=input_def._asset_partitions_fn, # noqa: SLF001\n input_manager_key=input_def.input_manager_key,\n )\n\n def to_definition(self, name: str) -> InputDefinition:\n dagster_type = self.dagster_type if 
self.dagster_type is not NoValueSentinel else None\n return InputDefinition(\n name=name,\n dagster_type=dagster_type,\n description=self.description,\n default_value=self.default_value,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n input_manager_key=self.input_manager_key,\n )
\n\n\n
[docs]class GraphIn(NamedTuple("_GraphIn", [("description", PublicAttr[Optional[str]])])):\n """Represents information about an input that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the input.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphIn, cls).__new__(cls, description=description)\n\n def to_definition(self, name: str) -> InputDefinition:\n return InputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.input"}, "job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.job_definition

\nimport importlib\nimport os\nimport warnings\nfrom datetime import datetime\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental_param, public\nfrom dagster._config import Field, Shape, StringSource\nfrom dagster._config.config_type import ConfigType\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.dependency import (\n    Node,\n    NodeHandle,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.op_selection import OpSelection, get_graph_subset\nfrom dagster._core.definitions.partition import DynamicPartitionsDefinition\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceRequirement,\n    ensure_requirements_satisfied,\n)\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import (\n    DagsterInvalidConfigError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.selector.subset_selector import (\n    AssetSelectionData,\n    OpSelectionData,\n)\nfrom dagster._core.storage.io_manager import (\n    IOManagerDefinition,\n    dagster_maintained_io_manager,\n    io_manager,\n)\nfrom dagster._core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster._core.types.dagster_type import DagsterType\nfrom 
dagster._core.utils import str_format_set\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\n\nfrom .asset_layer import AssetLayer, build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    OpNode,\n)\nfrom .executor_definition import ExecutorDefinition, multi_or_in_process_executor\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import MetadataValue, RawMetadataValue, normalize_metadata\nfrom .partition import PartitionedConfig, PartitionsDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .run_request import RunRequest\nfrom .utils import DEFAULT_IO_MANAGER_KEY, validate_tags\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._config.snap import ConfigSchemaSnapshot\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.execution.resources_init import InitResourceContext\n    from dagster._core.host_representation.job_index import JobIndex\n    from dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n    from dagster._core.snap import JobSnapshot\n\n    from .run_config_schema import RunConfigSchema\n\nDEFAULT_EXECUTOR_DEF = multi_or_in_process_executor\n\n\n
[docs]@experimental_param(param="version_strategy")\nclass JobDefinition(IHasInternalInit):\n """Defines a Dagster job."""\n\n _name: str\n _graph_def: GraphDefinition\n _description: Optional[str]\n _tags: Mapping[str, str]\n _metadata: Mapping[str, MetadataValue]\n _current_level_node_defs: Sequence[NodeDefinition]\n _hook_defs: AbstractSet[HookDefinition]\n _op_retry_policy: Optional[RetryPolicy]\n _asset_layer: AssetLayer\n _resource_requirements: Mapping[str, AbstractSet[str]]\n _all_node_defs: Mapping[str, NodeDefinition]\n _cached_run_config_schemas: Dict[str, "RunConfigSchema"]\n _version_strategy: VersionStrategy\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]]\n input_values: Mapping[str, object]\n\n def __init__(\n self,\n *,\n graph_def: GraphDefinition,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n name: Optional[str] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]\n ] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]] = None,\n asset_layer: Optional[AssetLayer] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _was_explicitly_provided_resources: Optional[bool] = None,\n ):\n from dagster._core.definitions.run_config import RunConfig, convert_config_input\n\n self._graph_def = graph_def\n self._current_level_node_defs = self._graph_def.node_defs\n # Recursively explore all nodes in the this job\n self._all_node_defs = 
_build_all_node_defs(self._current_level_node_defs)\n self._asset_layer = check.opt_inst_param(\n asset_layer, "asset_layer", AssetLayer\n ) or _infer_asset_layer_from_source_asset_deps(graph_def)\n\n # validates\n self._graph_def.get_inputs_must_be_resolved_top_level(self._asset_layer)\n\n self._name = check_valid_name(check.str_param(name, "name")) if name else graph_def.name\n self._executor_def = check.opt_inst_param(executor_def, "executor_def", ExecutorDefinition)\n self._loggers = check.opt_nullable_mapping_param(\n logger_defs,\n "logger_defs",\n key_type=str,\n value_type=LoggerDefinition,\n )\n\n config = check.opt_inst_param(\n config, "config", (Mapping, ConfigMapping, PartitionedConfig, RunConfig)\n )\n config = convert_config_input(config)\n\n partitions_def = check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n )\n # tags and description can exist on graph as well, but since\n # same graph may be in multiple jobs, keep separate layer\n self._description = check.opt_str_param(description, "description")\n self._tags = validate_tags(tags)\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n )\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs")\n self._op_retry_policy = check.opt_inst_param(\n op_retry_policy, "op_retry_policy", RetryPolicy\n )\n self.version_strategy = check.opt_inst_param(\n version_strategy, "version_strategy", VersionStrategy\n )\n\n _subset_selection_data = check.opt_inst_param(\n _subset_selection_data, "_subset_selection_data", (OpSelectionData, AssetSelectionData)\n )\n input_values = check.opt_mapping_param(input_values, "input_values", key_type=str)\n\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n for key in resource_defs.keys():\n if not key.isidentifier():\n check.failed(f"Resource key '{key}' must be a valid Python identifier.")\n was_provided_resources = 
(\n bool(resource_defs)\n if _was_explicitly_provided_resources is None\n else _was_explicitly_provided_resources\n )\n self._resource_defs = {\n DEFAULT_IO_MANAGER_KEY: default_job_io_manager,\n **resource_defs,\n }\n self._required_resource_keys = self._get_required_resource_keys(was_provided_resources)\n\n self._config_mapping = None\n self._partitioned_config = None\n self._run_config = None\n self._run_config_schema = None\n self._original_config_argument = config\n\n if partitions_def:\n self._partitioned_config = PartitionedConfig.from_flexible_config(\n config, partitions_def\n )\n else:\n if isinstance(config, ConfigMapping):\n self._config_mapping = config\n elif isinstance(config, PartitionedConfig):\n self._partitioned_config = config\n elif isinstance(config, dict):\n self._run_config = config\n # Using config mapping here is a trick to make it so that the preset will be used even\n # when no config is supplied for the job.\n self._config_mapping = _config_mapping_with_default_value(\n get_run_config_schema_for_job(\n graph_def,\n self.resource_defs,\n self.executor_def,\n self.loggers,\n asset_layer,\n was_explicitly_provided_resources=was_provided_resources,\n ),\n config,\n self.name,\n )\n elif config is not None:\n check.failed(\n "config param must be a ConfigMapping, a PartitionedConfig, or a dictionary,"\n f" but is an object of type {type(config)}"\n )\n\n self._subset_selection_data = _subset_selection_data\n self.input_values = input_values\n for input_name in sorted(list(self.input_values.keys())):\n if not graph_def.has_input(input_name):\n raise DagsterInvalidDefinitionError(\n f"Error when constructing JobDefinition '{self.name}': Input value provided for"\n f" key '{input_name}', but job has no top-level input with that name."\n )\n\n def dagster_internal_init(\n *,\n graph_def: GraphDefinition,\n resource_defs: Optional[Mapping[str, ResourceDefinition]],\n executor_def: Optional[ExecutorDefinition],\n logger_defs: Optional[Mapping[str, 
LoggerDefinition]],\n name: Optional[str],\n config: Optional[\n Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]\n ],\n description: Optional[str],\n partitions_def: Optional[PartitionsDefinition],\n tags: Optional[Mapping[str, Any]],\n metadata: Optional[Mapping[str, RawMetadataValue]],\n hook_defs: Optional[AbstractSet[HookDefinition]],\n op_retry_policy: Optional[RetryPolicy],\n version_strategy: Optional[VersionStrategy],\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]],\n asset_layer: Optional[AssetLayer],\n input_values: Optional[Mapping[str, object]],\n _was_explicitly_provided_resources: Optional[bool],\n ) -> "JobDefinition":\n return JobDefinition(\n graph_def=graph_def,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n name=name,\n config=config,\n description=description,\n partitions_def=partitions_def,\n tags=tags,\n metadata=metadata,\n hook_defs=hook_defs,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n _subset_selection_data=_subset_selection_data,\n asset_layer=asset_layer,\n input_values=input_values,\n _was_explicitly_provided_resources=_was_explicitly_provided_resources,\n )\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def tags(self) -> Mapping[str, str]:\n return merge_dicts(self._graph_def.tags, self._tags)\n\n @property\n def metadata(self) -> Mapping[str, MetadataValue]:\n return self._metadata\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def graph(self) -> GraphDefinition:\n return self._graph_def\n\n @property\n def dependency_structure(self) -> DependencyStructure:\n return self._graph_def.dependency_structure\n\n @property\n def dependencies(self) -> DependencyMapping[NodeInvocation]:\n return self._graph_def.dependencies\n\n @public\n @property\n def executor_def(self) -> ExecutorDefinition:\n """Returns the default 
:py:class:`ExecutorDefinition` for the job.\n\n If the user has not specified an executor definition, then this will default to the :py:func:`multi_or_in_process_executor`. If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.\n """\n return self._executor_def or DEFAULT_EXECUTOR_DEF\n\n @public\n @property\n def has_specified_executor(self) -> bool:\n """Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the :py:class:`Definitions` object the job was provided to."""\n return self._executor_def is not None\n\n @public\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n """Returns the set of ResourceDefinition objects specified on the job.\n\n This may not be the complete set of resources required by the job, since those can also be provided on the :py:class:`Definitions` object the job may be provided to.\n """\n return self._resource_defs\n\n @public\n @property\n def partitioned_config(self) -> Optional[PartitionedConfig]:\n """The partitioned config for the job, if it has one.\n\n A partitioned config defines a way to map partition keys to run config for the job.\n """\n return self._partitioned_config\n\n @public\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n """The config mapping for the job, if it has one.\n\n A config mapping defines a way to map a top-level config schema to run config for the job.\n """\n return self._config_mapping\n\n @public\n @property\n def loggers(self) -> Mapping[str, LoggerDefinition]:\n """Returns the set of LoggerDefinition objects specified on the job.\n\n If the user has not specified a mapping of :py:class:`LoggerDefinition` objects, then this will default to the :py:func:`colored_console_logger` under the key `console`. 
If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.\n """\n from dagster._loggers import default_loggers\n\n return self._loggers or default_loggers()\n\n @public\n @property\n def has_specified_loggers(self) -> bool:\n """Returns true if the job explicitly set loggers, and False if loggers were inherited through defaults or the :py:class:`Definitions` object the job was provided to."""\n return self._loggers is not None\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def run_config(self) -> Optional[Mapping[str, Any]]:\n return self._run_config\n\n @property\n def run_config_schema(self) -> "RunConfigSchema":\n if self._run_config_schema is None:\n self._run_config_schema = _create_run_config_schema(self, self.required_resource_keys)\n return self._run_config_schema\n\n @public\n @property\n def partitions_def(self) -> Optional[PartitionsDefinition]:\n """Returns the :py:class:`PartitionsDefinition` for the job, if it has one.\n\n A partitions definition defines the set of partition keys the job operates on.\n """\n return None if not self.partitioned_config else self.partitioned_config.partitions_def\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def asset_layer(self) -> AssetLayer:\n return self._asset_layer\n\n @property\n def all_node_defs(self) -> Sequence[NodeDefinition]:\n return list(self._all_node_defs.values())\n\n @property\n def top_level_node_defs(self) -> Sequence[NodeDefinition]:\n return self._current_level_node_defs\n\n def node_def_named(self, name: str) -> NodeDefinition:\n check.str_param(name, "name")\n\n check.invariant(name in self._all_node_defs, f"{name} not found")\n return self._all_node_defs[name]\n\n def has_node(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._all_node_defs\n\n def get_node(self, 
handle: NodeHandle) -> Node:\n return self._graph_def.get_node(handle)\n\n def get_op(self, handle: NodeHandle) -> OpNode:\n node = self.get_node(handle)\n assert isinstance(\n node, OpNode\n ), f"Tried to retrieve node {handle} as op, but it represents a nested graph."\n return node\n\n def has_node_named(self, name: str) -> bool:\n return self._graph_def.has_node_named(name)\n\n def get_node_named(self, name: str) -> Node:\n return self._graph_def.node_named(name)\n\n @property\n def nodes(self) -> Sequence[Node]:\n return self._graph_def.nodes\n\n @property\n def nodes_in_topological_order(self) -> Sequence[Node]:\n return self._graph_def.nodes_in_topological_order\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._graph_def.all_dagster_types()\n\n def has_dagster_type(self, name: str) -> bool:\n return self._graph_def.has_dagster_type(name)\n\n def dagster_type_named(self, name: str) -> DagsterType:\n return self._graph_def.dagster_type_named(name)\n\n def describe_target(self) -> str:\n return f"job '{self.name}'"\n\n def is_using_memoization(self, run_tags: Mapping[str, str]) -> bool:\n tags = merge_dicts(self.tags, run_tags)\n # If someone provides a false value for memoized run tag, then they are intentionally\n # switching off memoization.\n if tags.get(MEMOIZED_RUN_TAG) == "false":\n return False\n return (\n MEMOIZED_RUN_TAG in tags and tags.get(MEMOIZED_RUN_TAG) == "true"\n ) or self.version_strategy is not None\n\n def get_required_resource_defs(self) -> Mapping[str, ResourceDefinition]:\n return {\n resource_key: resource\n for resource_key, resource in self.resource_defs.items()\n if resource_key in self.required_resource_keys\n }\n\n def _get_required_resource_keys(self, validate_requirements: bool = False) -> AbstractSet[str]:\n from ..execution.resources_init import get_transitive_required_resource_keys\n\n requirements = self._get_resource_requirements()\n if validate_requirements:\n 
ensure_requirements_satisfied(self.resource_defs, requirements)\n required_keys = {req.key for req in requirements}\n if validate_requirements:\n return required_keys.union(\n get_transitive_required_resource_keys(required_keys, self.resource_defs)\n )\n else:\n return required_keys\n\n def _get_resource_requirements(self) -> Sequence[ResourceRequirement]:\n return [\n *self._graph_def.get_resource_requirements(self.asset_layer),\n *[\n req\n for hook_def in self._hook_defs\n for req in hook_def.get_resource_requirements(outer_context=f"job '{self._name}'")\n ],\n ]\n\n def validate_resource_requirements_satisfied(self) -> None:\n resource_requirements = self._get_resource_requirements()\n ensure_requirements_satisfied(self.resource_defs, resource_requirements)\n\n def is_missing_required_resources(self) -> bool:\n requirements = self._get_resource_requirements()\n for requirement in requirements:\n if not requirement.resources_contain_key(self.resource_defs):\n return True\n return False\n\n def get_all_hooks_for_handle(self, handle: NodeHandle) -> AbstractSet[HookDefinition]:\n """Gather all the hooks for the given node from all places possibly attached with a hook.\n\n A hook can be attached to any of the following objects\n * Node (node invocation)\n * JobDefinition\n\n Args:\n handle (NodeHandle): The node's handle\n\n Returns:\n FrozenSet[HookDefinition]\n """\n check.inst_param(handle, "handle", NodeHandle)\n hook_defs: Set[HookDefinition] = set()\n\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n # hooks on top-level node\n name = lineage.pop()\n node = self._graph_def.node_named(name)\n hook_defs = hook_defs.union(node.hook_defs)\n\n # hooks on non-top-level nodes\n while lineage:\n name = lineage.pop()\n # While lineage is non-empty, definition is guaranteed to be a graph\n definition = cast(GraphDefinition, node.definition)\n node = definition.node_named(name)\n hook_defs = 
hook_defs.union(node.hook_defs)\n\n # hooks applied to a job definition will run on every node\n hook_defs = hook_defs.union(self.hook_defs)\n\n return frozenset(hook_defs)\n\n def get_retry_policy_for_handle(self, handle: NodeHandle) -> Optional[RetryPolicy]:\n node = self.get_node(handle)\n definition = node.definition\n\n if node.retry_policy:\n return node.retry_policy\n elif isinstance(definition, OpDefinition) and definition.retry_policy:\n return definition.retry_policy\n\n # could be expanded to look in graph containers\n else:\n return self._op_retry_policy\n\n # make Callable for decorator reference updates\n def __call__(self, *args, **kwargs):\n raise DagsterInvariantViolationError(\n f"Attempted to call job '{self.name}' directly. Jobs should be invoked by "\n "using an execution API function (e.g. `job.execute_in_process`)."\n )\n\n
[docs] @public\n def execute_in_process(\n self,\n run_config: Optional[Union[Mapping[str, Any], "RunConfig"]] = None,\n instance: Optional["DagsterInstance"] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n op_selection: Optional[Sequence[str]] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n tags: Optional[Mapping[str, str]] = None,\n resources: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """Execute the Job in-process, gathering results in-memory.\n\n The `executor_def` on the Job will be ignored, and replaced with the in-process executor.\n If using the default `io_manager`, it will switch from filesystem to in-memory.\n\n\n Args:\n run_config (Optional[Mapping[str, Any]]:\n The configuration for the run\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for jobs with partitioned config.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[Sequence[str]]): A list of op selection queries (including single op\n names) to execute. For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the job. 
Input values provided here will override input values that have been provided to the job directly.\n resources (Optional[Mapping[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n\n """\n from dagster._core.definitions.executor_definition import execute_in_process_executor\n from dagster._core.definitions.run_config import convert_config_input\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n from dagster._core.execution.execute_in_process import core_execute_in_process\n\n run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")\n op_selection = check.opt_sequence_param(op_selection, "op_selection", str)\n asset_selection = check.opt_sequence_param(asset_selection, "asset_selection", AssetKey)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n\n resource_defs = wrap_resources_for_execution(resources)\n\n check.invariant(\n not (op_selection and asset_selection),\n "op_selection and asset_selection cannot both be provided as args to"\n " execute_in_process",\n )\n\n partition_key = check.opt_str_param(partition_key, "partition_key")\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n # Combine provided input values at execute_in_process with input values\n # provided to the definition. 
Input values provided at\n # execute_in_process will override those provided on the definition.\n input_values = merge_dicts(self.input_values, input_values)\n\n bound_resource_defs = dict(self.resource_defs)\n ephemeral_job = JobDefinition.dagster_internal_init(\n name=self._name,\n graph_def=self._graph_def,\n resource_defs={**_swap_default_io_man(bound_resource_defs, self), **resource_defs},\n executor_def=execute_in_process_executor,\n logger_defs=self._loggers,\n hook_defs=self.hook_defs,\n config=self.config_mapping or self.partitioned_config or self.run_config,\n tags=self.tags,\n op_retry_policy=self._op_retry_policy,\n version_strategy=self.version_strategy,\n asset_layer=self.asset_layer,\n input_values=input_values,\n description=self.description,\n partitions_def=self.partitions_def,\n metadata=self.metadata,\n _subset_selection_data=None, # this is added below\n _was_explicitly_provided_resources=True,\n )\n\n ephemeral_job = ephemeral_job.get_subset(\n op_selection=op_selection,\n asset_selection=frozenset(asset_selection) if asset_selection else None,\n )\n\n merged_tags = merge_dicts(self.tags, tags or {})\n if partition_key:\n if not (self.partitions_def and self.partitioned_config):\n check.failed("Attempted to execute a partitioned run for a non-partitioned job")\n self.partitions_def.validate_partition_key(\n partition_key, dynamic_partitions_store=instance\n )\n\n run_config = (\n run_config\n if run_config\n else self.partitioned_config.get_run_config_for_partition_key(partition_key)\n )\n merged_tags.update(\n self.partitioned_config.get_tags_for_partition_key(\n partition_key, job_name=self.name\n )\n )\n\n return core_execute_in_process(\n ephemeral_job=ephemeral_job,\n run_config=run_config,\n instance=instance,\n output_capturing_enabled=True,\n raise_on_error=raise_on_error,\n run_tags=merged_tags,\n run_id=run_id,\n asset_selection=frozenset(asset_selection),\n )
\n\n @property\n def op_selection_data(self) -> Optional[OpSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, OpSelectionData)\n else None\n )\n\n @property\n def asset_selection_data(self) -> Optional[AssetSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, AssetSelectionData)\n else None\n )\n\n @property\n def is_subset(self) -> bool:\n return bool(self._subset_selection_data)\n\n def get_subset(\n self,\n *,\n op_selection: Optional[Iterable[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ) -> Self:\n check.invariant(\n not (op_selection and (asset_selection or asset_check_selection)),\n "op_selection cannot be provided with asset_selection or asset_check_selection to"\n " execute_in_process",\n )\n if op_selection:\n return self._get_job_def_for_op_selection(op_selection)\n if asset_selection or asset_check_selection:\n return self._get_job_def_for_asset_selection(\n asset_selection=asset_selection, asset_check_selection=asset_check_selection\n )\n else:\n return self\n\n def _get_job_def_for_asset_selection(\n self,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ) -> Self:\n asset_selection = check.opt_set_param(asset_selection, "asset_selection", AssetKey)\n check.opt_set_param(asset_check_selection, "asset_check_selection", AssetCheckKey)\n\n nonexistent_assets = [\n asset\n for asset in asset_selection\n if asset not in self.asset_layer.asset_keys\n and asset not in self.asset_layer.source_assets_by_key\n ]\n nonexistent_asset_strings = [\n asset_str\n for asset_str in (asset.to_string() for asset in nonexistent_assets)\n if asset_str\n ]\n if nonexistent_assets:\n raise DagsterInvalidSubsetError(\n "Assets provided in asset_selection argument "\n f"{', 
'.join(nonexistent_asset_strings)} do not exist in parent asset group or job."\n )\n\n # Test that selected asset checks exist\n all_check_keys = self.asset_layer.node_output_handles_by_asset_check_key.keys()\n\n nonexistent_asset_checks = [\n asset_check\n for asset_check in asset_check_selection or set()\n if asset_check not in all_check_keys\n ]\n nonexistent_asset_check_strings = [\n str(asset_check) for asset_check in nonexistent_asset_checks\n ]\n if nonexistent_asset_checks:\n raise DagsterInvalidSubsetError(\n "Asset checks provided in asset_check_selection argument"\n f" {', '.join(nonexistent_asset_check_strings)} do not exist in parent asset group"\n " or job."\n )\n\n # Test that selected asset checks can be run individually. Currently this is only supported\n # on checks defined with @asset_check, which will have an AssetChecksDefinition.\n all_check_keys_in_checks_defs = set()\n for asset_checks_def in self.asset_layer.asset_checks_defs:\n for spec in asset_checks_def.specs:\n all_check_keys_in_checks_defs.add(spec.key)\n\n non_checks_defs_asset_checks = [\n asset_check\n for asset_check in asset_check_selection or set()\n if asset_check not in all_check_keys_in_checks_defs\n ]\n non_checks_defs_asset_check_strings = [\n asset_check.name for asset_check in non_checks_defs_asset_checks\n ]\n if non_checks_defs_asset_checks:\n raise DagsterInvalidSubsetError(\n f"Can't execute asset checks [{', '.join(non_checks_defs_asset_check_strings)}],"\n " because they weren't defined with @asset_check or AssetChecksDefinition. 
To"\n " execute these checks, materialize the asset."\n )\n\n asset_selection_data = AssetSelectionData(\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n parent_job_def=self,\n )\n\n check.invariant(\n self.asset_layer.assets_defs_by_key is not None,\n "Asset layer must have _asset_defs argument defined",\n )\n\n new_job = build_asset_selection_job(\n name=self.name,\n assets=set(self.asset_layer.assets_defs_by_key.values()),\n source_assets=self.asset_layer.source_assets_by_key.values(),\n executor_def=self.executor_def,\n resource_defs=self.resource_defs,\n description=self.description,\n tags=self.tags,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n asset_selection_data=asset_selection_data,\n config=self.config_mapping or self.partitioned_config,\n asset_checks=self.asset_layer.asset_checks_defs,\n )\n return new_job\n\n def _get_job_def_for_op_selection(self, op_selection: Iterable[str]) -> Self:\n try:\n sub_graph = get_graph_subset(self.graph, op_selection)\n\n # if explicit config was passed the config_mapping that resolves the defaults implicitly is\n # very unlikely to work. The job will still present the default config in the Dagster UI.\n config = (\n None\n if self.run_config is not None\n else self.config_mapping or self.partitioned_config\n )\n\n return self._copy(\n config=config,\n graph_def=sub_graph,\n _subset_selection_data=OpSelectionData(\n op_selection=list(op_selection),\n resolved_op_selection=OpSelection(op_selection).resolve(self.graph),\n parent_job_def=self, # used by job snapshot lineage\n ),\n # TODO: subset this structure.\n # https://github.com/dagster-io/dagster/issues/7541\n asset_layer=self.asset_layer,\n )\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. 
Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n node_paths = OpSelection(op_selection).resolve(self.graph)\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(node_paths)} for graph "\n f"{self.graph.name} results in an invalid graph."\n ) from exc\n\n
[docs] @public\n @deprecated(\n breaking_version="2.0.0",\n additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",\n )\n def run_request_for_partition(\n self,\n partition_key: str,\n run_key: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n run_config: Optional[Mapping[str, Any]] = None,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional["DynamicPartitionsStore"] = None,\n ) -> RunRequest:\n """Creates a RunRequest object for a run that processes the given partition.\n\n Args:\n partition_key: The key of the partition to request a run for.\n run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n a :py:class:`PartitionedConfig`, this value will override replace the config\n provided by it.\n current_time (Optional[datetime]): Used to determine which time-partitions exist.\n Defaults to now.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. 
Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n\n Returns:\n RunRequest: an object that requests a run to process the given partition.\n """\n if not (self.partitions_def and self.partitioned_config):\n check.failed("Called run_request_for_partition on a non-partitioned job")\n\n if (\n isinstance(self.partitions_def, DynamicPartitionsDefinition)\n and self.partitions_def.name\n ):\n # Do not support using run_request_for_partition with dynamic partitions,\n # since this requires querying the instance once per run request for the\n # existent dynamic partitions\n check.failed(\n "run_request_for_partition is not supported for dynamic partitions. Instead, use"\n " RunRequest(partition_key=...)"\n )\n\n self.partitions_def.validate_partition_key(\n partition_key,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n run_config = (\n run_config\n if run_config is not None\n else self.partitioned_config.get_run_config_for_partition_key(partition_key)\n )\n run_request_tags = {\n **(tags or {}),\n **self.partitioned_config.get_tags_for_partition_key(\n partition_key,\n job_name=self.name,\n ),\n }\n\n return RunRequest(\n run_key=run_key,\n run_config=run_config,\n tags=run_request_tags,\n job_name=self.name,\n asset_selection=asset_selection,\n partition_key=partition_key,\n )
\n\n def get_config_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n return self.get_job_snapshot().config_schema_snapshot\n\n def get_job_snapshot(self) -> "JobSnapshot":\n return self.get_job_index().job_snapshot\n\n def get_job_index(self) -> "JobIndex":\n from dagster._core.host_representation import JobIndex\n from dagster._core.snap import JobSnapshot\n\n return JobIndex(JobSnapshot.from_job_def(self), self.get_parent_job_snapshot())\n\n def get_job_snapshot_id(self) -> str:\n return self.get_job_index().job_snapshot_id\n\n def get_parent_job_snapshot(self) -> Optional["JobSnapshot"]:\n if self.op_selection_data:\n return self.op_selection_data.parent_job_def.get_job_snapshot()\n elif self.asset_selection_data:\n return self.asset_selection_data.parent_job_def.get_job_snapshot()\n else:\n return None\n\n def has_direct_input_value(self, input_name: str) -> bool:\n return input_name in self.input_values\n\n def get_direct_input_value(self, input_name: str) -> object:\n if input_name not in self.input_values:\n raise DagsterInvalidInvocationError(\n f"On job '{self.name}', attempted to retrieve input value for input named"\n f" '{input_name}', but no value was provided. 
Provided input values:"\n f" {sorted(list(self.input_values.keys()))}"\n )\n return self.input_values[input_name]\n\n def _copy(self, **kwargs: Any) -> "JobDefinition":\n # dict() calls copy dict props\n base_kwargs = dict(\n graph_def=self.graph,\n resource_defs=dict(self.resource_defs),\n executor_def=self._executor_def,\n logger_defs=self._loggers,\n config=self._original_config_argument,\n name=self._name,\n description=self.description,\n tags=self.tags,\n metadata=self._metadata,\n hook_defs=self.hook_defs,\n op_retry_policy=self._op_retry_policy,\n version_strategy=self.version_strategy,\n _subset_selection_data=self._subset_selection_data,\n asset_layer=self.asset_layer,\n input_values=self.input_values,\n partitions_def=self.partitions_def,\n _was_explicitly_provided_resources=None,\n )\n resolved_kwargs = {**base_kwargs, **kwargs} # base kwargs overwritten for conflicts\n job_def = JobDefinition.dagster_internal_init(**resolved_kwargs)\n update_wrapper(job_def, self, updated=())\n return job_def\n\n
[docs] @public\n def with_top_level_resources(\n self, resource_defs: Mapping[str, ResourceDefinition]\n ) -> "JobDefinition":\n """Apply a set of resources to all op instances within the job."""\n resource_defs = check.mapping_param(resource_defs, "resource_defs", key_type=str)\n return self._copy(resource_defs=resource_defs)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "JobDefinition":\n """Apply a set of hooks to all op instances within the job."""\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n return self._copy(hook_defs=(hook_defs | self.hook_defs))
\n\n def with_executor_def(self, executor_def: ExecutorDefinition) -> "JobDefinition":\n return self._copy(executor_def=executor_def)\n\n def with_logger_defs(self, logger_defs: Mapping[str, LoggerDefinition]) -> "JobDefinition":\n return self._copy(logger_defs=logger_defs)\n\n @property\n def op_selection(self) -> Optional[AbstractSet[str]]:\n return set(self.op_selection_data.op_selection) if self.op_selection_data else None\n\n @property\n def asset_selection(self) -> Optional[AbstractSet[AssetKey]]:\n return self.asset_selection_data.asset_selection if self.asset_selection_data else None\n\n @property\n def resolved_op_selection(self) -> Optional[AbstractSet[str]]:\n return self.op_selection_data.resolved_op_selection if self.op_selection_data else None
\n\n\ndef _swap_default_io_man(resources: Mapping[str, ResourceDefinition], job: JobDefinition):\n """Used to create the user facing experience of the default io_manager\n switching to in-memory when using execute_in_process.\n """\n from dagster._core.storage.mem_io_manager import mem_io_manager\n\n if (\n resources.get(DEFAULT_IO_MANAGER_KEY) in [default_job_io_manager]\n and job.version_strategy is None\n ):\n updated_resources = dict(resources)\n updated_resources[DEFAULT_IO_MANAGER_KEY] = mem_io_manager\n return updated_resources\n\n return resources\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling."\n)\ndef default_job_io_manager(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n\n # normally, default to the fs_io_manager\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n instance = check.not_none(init_context.instance)\n return 
PickledObjectFilesystemIOManager(base_dir=instance.storage_directory())\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n config_schema={"base_dir": Field(StringSource, is_required=False)},\n)\ndef default_job_io_manager_with_fs_io_manager_schema(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n # normally, default to the fs_io_manager\n base_dir = init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory() if init_context.instance else None\n )\n\n return PickledObjectFilesystemIOManager(base_dir=base_dir)\n\n\ndef _config_mapping_with_default_value(\n inner_schema: ConfigType,\n default_config: Mapping[str, Any],\n job_name: str,\n) -> ConfigMapping:\n if not isinstance(inner_schema, Shape):\n check.failed("Only Shape (dictionary) config_schema allowed on 
Job ConfigMapping")\n\n def config_fn(x):\n return x\n\n updated_fields = {}\n field_aliases = inner_schema.field_aliases\n for name, field in inner_schema.fields.items():\n if name in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[name],\n description=field.description,\n )\n elif name in field_aliases and field_aliases[name] in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[field_aliases[name]],\n description=field.description,\n )\n else:\n updated_fields[name] = field\n\n config_schema = Shape(\n fields=updated_fields,\n description=(\n "This run config schema was automatically populated with default values "\n "from `default_config`."\n ),\n field_aliases=inner_schema.field_aliases,\n )\n\n config_evr = validate_config(config_schema, default_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error in config when building job '{job_name}' ",\n config_evr.errors,\n default_config,\n )\n\n return ConfigMapping(\n config_fn=config_fn, config_schema=config_schema, receive_processed_config_values=False\n )\n\n\ndef get_run_config_schema_for_job(\n graph_def: GraphDefinition,\n resource_defs: Mapping[str, ResourceDefinition],\n executor_def: "ExecutorDefinition",\n logger_defs: Mapping[str, LoggerDefinition],\n asset_layer: Optional[AssetLayer],\n was_explicitly_provided_resources: bool = False,\n) -> ConfigType:\n return JobDefinition(\n name=graph_def.name,\n graph_def=graph_def,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n asset_layer=asset_layer,\n _was_explicitly_provided_resources=was_explicitly_provided_resources,\n ).run_config_schema.run_config_schema_type\n\n\ndef _infer_asset_layer_from_source_asset_deps(job_graph_def: GraphDefinition) -> AssetLayer:\n """For non-asset jobs that have some inputs that are fed from SourceAssets, constructs an\n AssetLayer that includes 
those SourceAssets.\n """\n asset_keys_by_node_input_handle: Dict[NodeInputHandle, AssetKey] = {}\n source_assets_list = []\n source_asset_keys_set = set()\n io_manager_keys_by_asset_key: Mapping[AssetKey, str] = {}\n\n # each entry is a graph definition and its handle relative to the job root\n stack: List[Tuple[GraphDefinition, Optional[NodeHandle]]] = [(job_graph_def, None)]\n\n while stack:\n graph_def, parent_node_handle = stack.pop()\n\n for node_name, input_source_assets in graph_def.node_input_source_assets.items():\n node_handle = NodeHandle(node_name, parent_node_handle)\n for input_name, source_asset in input_source_assets.items():\n if source_asset.key not in source_asset_keys_set:\n source_asset_keys_set.add(source_asset.key)\n source_assets_list.append(source_asset)\n\n input_handle = NodeInputHandle(node_handle, input_name)\n asset_keys_by_node_input_handle[input_handle] = source_asset.key\n for resolved_input_handle in graph_def.node_dict[\n node_name\n ].definition.resolve_input_to_destinations(input_handle):\n asset_keys_by_node_input_handle[resolved_input_handle] = source_asset.key\n\n if source_asset.io_manager_key:\n io_manager_keys_by_asset_key[source_asset.key] = source_asset.io_manager_key\n\n for node_name, node in graph_def.node_dict.items():\n if isinstance(node.definition, GraphDefinition):\n stack.append((node.definition, NodeHandle(node_name, parent_node_handle)))\n\n return AssetLayer(\n assets_defs_by_node_handle={},\n asset_keys_by_node_input_handle=asset_keys_by_node_input_handle,\n asset_info_by_node_output_handle={},\n asset_deps={},\n dependency_node_handles_by_asset_key={},\n assets_defs_by_key={},\n source_assets_by_key={\n source_asset.key: source_asset for source_asset in source_assets_list\n },\n io_manager_keys_by_asset_key=io_manager_keys_by_asset_key,\n dep_asset_keys_by_node_output_handle={},\n partition_mappings_by_asset_dep={},\n asset_checks_defs_by_node_handle={},\n node_output_handles_by_asset_check_key={},\n 
check_names_by_asset_key_by_node_handle={},\n check_key_by_node_output_handle={},\n )\n\n\ndef _build_all_node_defs(node_defs: Sequence[NodeDefinition]) -> Mapping[str, NodeDefinition]:\n all_defs: Dict[str, NodeDefinition] = {}\n for current_level_node_def in node_defs:\n for node_def in current_level_node_def.iterate_node_defs():\n if node_def.name in all_defs:\n if all_defs[node_def.name] != node_def:\n raise DagsterInvalidDefinitionError(\n 'Detected conflicting node definitions with the same name "{name}"'.format(\n name=node_def.name\n )\n )\n else:\n all_defs[node_def.name] = node_def\n\n return all_defs\n\n\ndef _create_run_config_schema(\n job_def: JobDefinition,\n required_resources: AbstractSet[str],\n) -> "RunConfigSchema":\n from .run_config import (\n RunConfigSchemaCreationData,\n construct_config_type_dictionary,\n define_run_config_schema_type,\n )\n from .run_config_schema import RunConfigSchema\n\n # When executing with a subset job, include the missing nodes\n # from the original job as ignored to allow execution with\n # run config that is valid for the original\n ignored_nodes: Sequence[Node] = []\n if job_def.is_subset:\n if isinstance(job_def.graph, SubselectedGraphDefinition): # op selection provided\n ignored_nodes = job_def.graph.get_top_level_omitted_nodes()\n elif job_def.asset_selection_data:\n parent_job = job_def\n while parent_job.asset_selection_data:\n parent_job = parent_job.asset_selection_data.parent_job_def\n\n ignored_nodes = [\n node for node in parent_job.graph.nodes if not job_def.has_node_named(node.name)\n ]\n else:\n ignored_nodes = []\n\n run_config_schema_type = define_run_config_schema_type(\n RunConfigSchemaCreationData(\n job_name=job_def.name,\n nodes=job_def.graph.nodes,\n graph_def=job_def.graph,\n dependency_structure=job_def.graph.dependency_structure,\n executor_def=job_def.executor_def,\n resource_defs=job_def.resource_defs,\n logger_defs=job_def.loggers,\n ignored_nodes=ignored_nodes,\n 
required_resources=required_resources,\n direct_inputs=job_def.input_values,\n asset_layer=job_def.asset_layer,\n )\n )\n\n if job_def.config_mapping:\n outer_config_type = job_def.config_mapping.config_schema.config_type\n else:\n outer_config_type = run_config_schema_type\n\n if outer_config_type is None:\n check.failed("Unexpected outer_config_type value of None")\n\n config_type_dict_by_name, config_type_dict_by_key = construct_config_type_dictionary(\n job_def.all_node_defs,\n outer_config_type,\n )\n\n return RunConfigSchema(\n run_config_schema_type=run_config_schema_type,\n config_type_dict_by_name=config_type_dict_by_name,\n config_type_dict_by_key=config_type_dict_by_key,\n config_mapping=job_def.config_mapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.job_definition"}, "load_assets_from_modules": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.load_assets_from_modules

\nimport inspect\nimport os\nimport pkgutil\nfrom importlib import import_module\nfrom types import ModuleType\nfrom typing import Dict, Generator, Iterable, List, Optional, Sequence, Set, Tuple, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .assets import AssetsDefinition\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\nfrom .source_asset import SourceAsset\n\n\ndef _find_assets_in_module(\n    module: ModuleType,\n) -> Generator[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition], None, None]:\n    """Finds assets in the given module and adds them to the given sets of assets and source assets."""\n    for attr in dir(module):\n        value = getattr(module, attr)\n        if isinstance(value, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)):\n            yield value\n        elif isinstance(value, list) and all(\n            isinstance(el, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition))\n            for el in value\n        ):\n            yield from value\n\n\ndef assets_from_modules(\n    modules: Iterable[ModuleType], extra_source_assets: Optional[Sequence[SourceAsset]] = None\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n    """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable\n    assets from the given modules.\n\n    Args:\n        modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n        extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n     
       group in addition to the source assets found in the modules.\n\n    Returns:\n        Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]]:\n            A tuple containing a list of assets, a list of source assets, and a list of\n            cacheable assets defined in the given modules.\n    """\n    asset_ids: Set[int] = set()\n    asset_keys: Dict[AssetKey, ModuleType] = dict()\n    source_assets: List[SourceAsset] = list(\n        check.opt_sequence_param(extra_source_assets, "extra_source_assets", of_type=SourceAsset)\n    )\n    cacheable_assets: List[CacheableAssetsDefinition] = []\n    assets: Dict[AssetKey, AssetsDefinition] = {}\n    for module in modules:\n        for asset in _find_assets_in_module(module):\n            if id(asset) not in asset_ids:\n                asset_ids.add(id(asset))\n                if isinstance(asset, CacheableAssetsDefinition):\n                    cacheable_assets.append(asset)\n                else:\n                    keys = asset.keys if isinstance(asset, AssetsDefinition) else [asset.key]\n                    for key in keys:\n                        if key in asset_keys:\n                            modules_str = ", ".join(\n                                set([asset_keys[key].__name__, module.__name__])\n                            )\n                            error_str = (\n                                f"Asset key {key} is defined multiple times. Definitions found in"\n                                f" modules: {modules_str}. 
"\n                            )\n\n                            if key in assets and isinstance(asset, AssetsDefinition):\n                                if assets[key].node_def == asset.node_def:\n                                    error_str += (\n                                        "One possible cause of this bug is a call to with_resources"\n                                        " outside of a repository definition, causing a duplicate"\n                                        " asset definition."\n                                    )\n\n                            raise DagsterInvalidDefinitionError(error_str)\n                        else:\n                            asset_keys[key] = module\n                            if isinstance(asset, AssetsDefinition):\n                                assets[key] = asset\n                    if isinstance(asset, SourceAsset):\n                        source_assets.append(asset)\n    return list(set(assets.values())), source_assets, cacheable_assets\n\n\n
[docs]def load_assets_from_modules(\n modules: Iterable[ModuleType],\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets from the given modules.\n\n Args:\n modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset]]:\n A list containing assets and source assets defined in the given modules.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_modules(modules)\n\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_current_module(\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets from the module where\n this function is called.\n\n Args:\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CachableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n caller = inspect.stack()[1]\n module = inspect.getmodule(caller[0])\n if module is None:\n check.failed("Could not find a module for the caller")\n\n return load_assets_from_modules(\n [module],\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef assets_from_package_module(\n package_module: ModuleType,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable assets\n from the given package module.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the modules.\n\n Returns:\n Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n A tuple containing a list of assets, a list of source assets, and a list of cacheable assets\n defined in the given modules.\n """\n return assets_from_modules(\n _find_modules_in_package(package_module), extra_source_assets=extra_source_assets\n )\n\n\n
[docs]def load_assets_from_package_module(\n package_module: ModuleType,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets that includes all asset\n definitions, source assets, and cacheable assets in all sub-modules of the given package module.\n\n A package module is the result of importing a package.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_package_module(package_module)\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_package_name(\n package_name: str,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets that includes all asset\n definitions and source assets in all sub-modules of the given package.\n\n Args:\n package_name (str): The name of a Python package to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n package_module = import_module(package_name)\n return load_assets_from_package_module(\n package_module,\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef _find_modules_in_package(package_module: ModuleType) -> Iterable[ModuleType]:\n yield package_module\n package_path = package_module.__file__\n if package_path:\n for _, modname, is_pkg in pkgutil.walk_packages([os.path.dirname(package_path)]):\n submodule = import_module(f"{package_module.__name__}.{modname}")\n if is_pkg:\n yield from _find_modules_in_package(submodule)\n else:\n yield submodule\n else:\n raise ValueError(\n f"Tried to find modules in package {package_module}, but its __file__ is None"\n )\n\n\ndef prefix_assets(\n assets_defs: Sequence[AssetsDefinition],\n key_prefix: CoercibleToAssetKeyPrefix,\n source_assets: Sequence[SourceAsset],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset]]:\n """Given a list of assets, prefix the input and output asset keys with key_prefix.\n The prefix is not added to source assets.\n\n Input asset keys that reference other assets within assets_defs are "brought along" -\n i.e. prefixed as well.\n\n Example with a single asset:\n\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n result = prefixed_asset_key_replacements([asset_1], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n\n Example with dependencies within the list of assets:\n\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n result = prefixed_asset_key_replacements([asset1, asset2], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n assert result.assets[1].asset_key == AssetKey(["my_prefix", "asset2"])\n assert result.assets[1].dependency_keys == {AssetKey(["my_prefix", "asset1"])}\n\n """\n asset_keys = {asset_key for assets_def in assets_defs for asset_key in assets_def.keys}\n source_asset_keys = {source_asset.key for source_asset in source_assets}\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.is_list(key_prefix, of_type=str)\n\n result_assets: List[AssetsDefinition] = []\n for assets_def in assets_defs:\n output_asset_key_replacements = {\n asset_key: AssetKey([*key_prefix, *asset_key.path]) for asset_key in assets_def.keys\n }\n input_asset_key_replacements = {}\n for dep_asset_key in assets_def.dependency_keys:\n if dep_asset_key in asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*key_prefix, *dep_asset_key.path]\n )\n elif source_key_prefix and dep_asset_key in source_asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*source_key_prefix, *dep_asset_key.path]\n )\n\n result_assets.append(\n assets_def.with_attributes(\n output_asset_key_replacements=output_asset_key_replacements,\n input_asset_key_replacements=input_asset_key_replacements,\n )\n )\n\n if source_key_prefix:\n result_source_assets = [\n source_asset.with_attributes(key=AssetKey([*source_key_prefix, *source_asset.key.path]))\n for source_asset in source_assets\n ]\n else:\n result_source_assets = source_assets\n\n return result_assets, result_source_assets\n\n\ndef assets_with_attributes(\n assets_defs: Sequence[AssetsDefinition],\n source_assets: Sequence[SourceAsset],\n cacheable_assets: Sequence[CacheableAssetsDefinition],\n key_prefix: Optional[Sequence[str]],\n group_name: Optional[str],\n 
freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n source_key_prefix: Optional[Sequence[str]],\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n # There is a tricky edge case here where if a non-cacheable asset depends on a cacheable asset,\n # and the assets are prefixed, the non-cacheable asset's dependency will not be prefixed since\n # at prefix-time it is not known that its dependency is one of the cacheable assets.\n # https://github.com/dagster-io/dagster/pull/10389#pullrequestreview-1170913271\n if key_prefix:\n assets_defs, source_assets = prefix_assets(\n assets_defs, key_prefix, source_assets, source_key_prefix\n )\n cacheable_assets = [\n cached_asset.with_prefix_for_all(key_prefix) for cached_asset in cacheable_assets\n ]\n\n if group_name or freshness_policy or auto_materialize_policy or backfill_policy:\n assets_defs = [\n asset.with_attributes(\n group_names_by_key=(\n {asset_key: group_name for asset_key in asset.keys} if group_name else None\n ),\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for asset in assets_defs\n ]\n if group_name:\n source_assets = [\n source_asset.with_attributes(group_name=group_name)\n for source_asset in source_assets\n ]\n cacheable_assets = [\n cached_asset.with_attributes_for_all(\n group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for cached_asset in cacheable_assets\n ]\n\n return [*assets_defs, *source_assets, *cacheable_assets]\n
", "current_page_name": "_modules/dagster/_core/definitions/load_assets_from_modules", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.load_assets_from_modules"}, "logger_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.logger_definition

\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidInvocationError\n\nfrom ..decorator_utils import get_function_params\nfrom .config import is_callable_valid_config_arg\nfrom .configurable import AnonymousConfigurableDefinition\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    import logging\n\n    from dagster._core.definitions import JobDefinition\n    from dagster._core.execution.context.logger import InitLoggerContext, UnboundInitLoggerContext\n\n    InitLoggerFunction = Callable[[InitLoggerContext], logging.Logger]\n\n\n
[docs]class LoggerDefinition(AnonymousConfigurableDefinition):\n """Core class for defining loggers.\n\n Loggers are job-scoped logging handlers, which will be automatically invoked whenever\n dagster messages are logged from within a job.\n\n Args:\n logger_fn (Callable[[InitLoggerContext], logging.Logger]): User-provided function to\n instantiate the logger. This logger will be automatically invoked whenever the methods\n on ``context.log`` are called from within job compute logic.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of this logger.\n """\n\n def __init__(\n self,\n logger_fn: "InitLoggerFunction",\n config_schema: Any = None,\n description: Optional[str] = None,\n ):\n self._logger_fn = check.callable_param(logger_fn, "logger_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n from .logger_invocation import logger_invocation_result\n\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Logger initialization function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of logger received multiple arguments. 
Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.logger_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n return logger_invocation_result(self, context)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Logger initialization expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n\n return logger_invocation_result(self, context)\n\n @public\n @property\n def logger_fn(self) -> "InitLoggerFunction":\n """Callable[[InitLoggerContext], logging.Logger]: The function that will be invoked to\n instantiate the logger.\n """\n return self._logger_fn\n\n @public\n @property\n def config_schema(self) -> Any:\n """Any: The schema for the logger's config. Configuration data available in `init_context.logger_config`."""\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the logger."""\n return self._description\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: Any,\n ) -> "LoggerDefinition":\n return LoggerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n logger_fn=self.logger_fn,\n )
\n\n\n@overload\ndef logger(\n config_schema: CoercableToConfigSchema, description: Optional[str] = ...\n) -> Callable[["InitLoggerFunction"], "LoggerDefinition"]: ...\n\n\n@overload\ndef logger(\n config_schema: "InitLoggerFunction", description: Optional[str] = ...\n) -> "LoggerDefinition": ...\n\n\n
[docs]def logger(\n config_schema: Union[CoercableToConfigSchema, "InitLoggerFunction"] = None,\n description: Optional[str] = None,\n) -> Union["LoggerDefinition", Callable[["InitLoggerFunction"], "LoggerDefinition"]]:\n """Define a logger.\n\n The decorated function should accept an :py:class:`InitLoggerContext` and return an instance of\n :py:class:`python:logging.Logger`. This function will become the ``logger_fn`` of an underlying\n :py:class:`LoggerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the logger.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. @logger versus @logger()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return LoggerDefinition(logger_fn=cast("InitLoggerFunction", config_schema))\n\n def _wrap(logger_fn: "InitLoggerFunction") -> "LoggerDefinition":\n return LoggerDefinition(\n logger_fn=logger_fn,\n config_schema=config_schema,\n description=description,\n )\n\n return _wrap
\n\n\n
[docs]def build_init_logger_context(\n logger_config: Any = None,\n job_def: Optional["JobDefinition"] = None,\n) -> "UnboundInitLoggerContext":\n """Builds logger initialization context from provided parameters.\n\n This function can be used to provide the context argument to the invocation of a logger\n definition.\n\n Note that you may only specify one of pipeline_def and job_def.\n\n Args:\n logger_config (Any): The config to provide during initialization of logger.\n job_def (Optional[JobDefinition]): The job definition that the logger will be used with.\n\n Examples:\n .. code-block:: python\n\n context = build_init_logger_context()\n logger_to_init(context)\n """\n from dagster._core.definitions import JobDefinition\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n check.opt_inst_param(job_def, "job_def", JobDefinition)\n\n return UnboundInitLoggerContext(logger_config=logger_config, job_def=job_def)
\n
", "current_page_name": "_modules/dagster/_core/definitions/logger_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.logger_definition"}, "materialize": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.materialize

\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.unresolved_asset_job_definition import define_asset_job\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..errors import DagsterInvariantViolationError\nfrom ..instance import DagsterInstance\nfrom ..storage.io_manager import IOManagerDefinition\nfrom ..storage.mem_io_manager import mem_io_manager\nfrom .assets import AssetsDefinition\nfrom .source_asset import SourceAsset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.events import AssetKey\n\n    from ..execution.execute_in_process_result import ExecuteInProcessResult\n\nEPHEMERAL_JOB_NAME = "__ephemeral_asset_job__"\n\n\n
[docs]def materialize(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets.\n\n By default, will materialize assets to the local filesystem.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize.\n\n Unless you're using `deps` or `non_argument_deps`, you must also include all assets that are\n upstream of the assets that you want to materialize. This is because those upstream\n asset definitions have information that is needed to load their contents while\n materializing the downstream assets.\n\n You can use the `selection` argument to distinguish between assets that you want to\n materialize and assets that are just present for loading.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. Note that if provided resources\n conflict with resources directly on assets, an error will be thrown.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. 
Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset2, loading its input from asset1\n materialize([asset1, asset2], selection=[asset2])\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n partition_key = check.opt_str_param(partition_key, "partition_key")\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n\n all_executable_keys: Set[AssetKey] = set()\n for asset in assets:\n if isinstance(asset, AssetsDefinition):\n all_executable_keys = all_executable_keys.union(set(asset.keys))\n\n defs = Definitions(\n jobs=[define_asset_job(name=EPHEMERAL_JOB_NAME, selection=selection)],\n assets=assets,\n resources=resources,\n )\n return check.not_none(\n defs.get_job_def(EPHEMERAL_JOB_NAME),\n "This should always return a job",\n ).execute_in_process(\n run_config=run_config,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n )
\n\n\n
[docs]def materialize_to_memory(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets in memory.\n\n Will explicitly use :py:func:`mem_io_manager` for all required io manager\n keys. If any io managers are directly provided using the `resources`\n argument, a :py:class:`DagsterInvariantViolationError` will be thrown.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize. Can also provide :py:class:`SourceAsset` objects to fill dependencies for asset defs.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. If provided resources\n conflict with resources directly on assets, an error will be thrown.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset1\n materialize([asset1, asset2], selection=[asset1])\n """\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n\n # Gather all resource defs for the purpose of checking io managers.\n resources_dict = resources or {}\n all_resource_keys = set(resources_dict.keys())\n for asset in assets:\n all_resource_keys = all_resource_keys.union(asset.resource_defs.keys())\n\n io_manager_keys = _get_required_io_manager_keys(assets)\n for io_manager_key in io_manager_keys:\n if io_manager_key in all_resource_keys:\n raise DagsterInvariantViolationError(\n "Attempted to call `materialize_to_memory` with a resource "\n f"provided for io manager key '{io_manager_key}'. Do not "\n "provide resources for io manager keys when calling "\n "`materialize_to_memory`, as it will override io management "\n "behavior for all keys."\n )\n\n resource_defs = merge_dicts({key: mem_io_manager for key in io_manager_keys}, resources_dict)\n\n return materialize(\n assets=assets,\n run_config=run_config,\n resources=resource_defs,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n selection=selection,\n )
\n\n\ndef _get_required_io_manager_keys(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]]\n) -> Set[str]:\n io_manager_keys = set()\n for asset in assets:\n for requirement in asset.get_resource_requirements():\n if requirement.expected_type == IOManagerDefinition:\n io_manager_keys.add(requirement.key)\n return io_manager_keys\n
", "current_page_name": "_modules/dagster/_core/definitions/materialize", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.materialize"}, "metadata": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata

\nimport os\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self, TypeAlias, TypeVar\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, experimental, public\nfrom dagster._core.errors import DagsterInvalidMetadata\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import (\n    FieldSerializer,\n    PackableValue,\n    UnpackContext,\n    WhitelistMap,\n    pack_value,\n)\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom .table import (  # re-exported\n    TableColumn as TableColumn,\n    TableColumnConstraints as TableColumnConstraints,\n    TableConstraints as TableConstraints,\n    TableRecord as TableRecord,\n    TableSchema as TableSchema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import AssetKey\n\nArbitraryMetadataMapping: TypeAlias = Mapping[str, Any]\n\nRawMetadataValue = Union[\n    "MetadataValue",\n    TableSchema,\n    "AssetKey",\n    os.PathLike,\n    Dict[Any, Any],\n    float,\n    int,\n    List[Any],\n    str,\n    None,\n]\n\nMetadataMapping: TypeAlias = Mapping[str, "MetadataValue"]\nMetadataUserInput: TypeAlias = Mapping[str, RawMetadataValue]\n\nT_Packable = TypeVar("T_Packable", bound=PackableValue, default=PackableValue, covariant=True)\n\n# ########################\n# ##### NORMALIZATION\n# ########################\n\n\ndef normalize_metadata(\n    metadata: Mapping[str, RawMetadataValue],\n    allow_invalid: bool = False,\n) -> Mapping[str, "MetadataValue"]:\n    # This is a stopgap measure to deal with unsupported metadata values, which occur when we try\n    # to convert arbitrary metadata (on e.g. 
OutputDefinition) to a MetadataValue, which is required\n    # for serialization. This will cause unsupported values to be silently replaced with a\n    # string placeholder.\n    normalized_metadata: Dict[str, MetadataValue] = {}\n    for k, v in metadata.items():\n        try:\n            normalized_value = normalize_metadata_value(v)\n        except DagsterInvalidMetadata as e:\n            if allow_invalid:\n                deprecation_warning(\n                    "Support for arbitrary metadata values",\n                    "2.0.0",\n                    additional_warn_text=(\n                        "In the future, all user-supplied metadata values must be one of"\n                        f" {RawMetadataValue}"\n                    ),\n                    stacklevel=4,  # to get the caller of `normalize_metadata`\n                )\n                normalized_value = TextMetadataValue(f"[{v.__class__.__name__}] (unserializable)")\n            else:\n                raise DagsterInvalidMetadata(\n                    f'Could not resolve the metadata value for "{k}" to a known type. 
{e}'\n                ) from None\n        normalized_metadata[k] = normalized_value\n\n    return normalized_metadata\n\n\ndef normalize_metadata_value(raw_value: RawMetadataValue) -> "MetadataValue[Any]":\n    from dagster._core.definitions.events import AssetKey\n\n    if isinstance(raw_value, MetadataValue):\n        return raw_value\n    elif isinstance(raw_value, str):\n        return MetadataValue.text(raw_value)\n    elif isinstance(raw_value, float):\n        return MetadataValue.float(raw_value)\n    elif isinstance(raw_value, bool):\n        return MetadataValue.bool(raw_value)\n    elif isinstance(raw_value, int):\n        return MetadataValue.int(raw_value)\n    elif isinstance(raw_value, (list, dict)):\n        return MetadataValue.json(raw_value)\n    elif isinstance(raw_value, os.PathLike):\n        return MetadataValue.path(raw_value)\n    elif isinstance(raw_value, AssetKey):\n        return MetadataValue.asset(raw_value)\n    elif isinstance(raw_value, TableSchema):\n        return MetadataValue.table_schema(raw_value)\n    elif raw_value is None:\n        return MetadataValue.null()\n\n    raise DagsterInvalidMetadata(\n        f"Its type was {type(raw_value)}. Consider wrapping the value with the appropriate "\n        "MetadataValue type."\n    )\n\n\n# ########################\n# ##### METADATA VALUE\n# ########################\n\n\n
[docs]class MetadataValue(ABC, Generic[T_Packable]):\n """Utility class to wrap metadata values passed into Dagster events so that they can be\n displayed in the Dagster UI and other tooling.\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": "hello",\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n "num_rows": 0,\n },\n )\n """\n\n @public\n @property\n @abstractmethod\n def value(self) -> T_Packable:\n """The wrapped value."""\n raise NotImplementedError()\n\n
[docs] @public\n @staticmethod\n def text(text: str) -> "TextMetadataValue":\n """Static constructor for a metadata value wrapping text as\n :py:class:`TextMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": MetadataValue.text("hello")\n },\n )\n\n Args:\n text (str): The text string for a metadata entry.\n """\n return TextMetadataValue(text)
\n\n
[docs] @public\n @staticmethod\n def url(url: str) -> "UrlMetadataValue":\n """Static constructor for a metadata value wrapping a URL as\n :py:class:`UrlMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata={\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n }\n )\n\n Args:\n url (str): The URL for a metadata entry.\n """\n return UrlMetadataValue(url)
\n\n
[docs] @public\n @staticmethod\n def path(path: Union[str, os.PathLike]) -> "PathMetadataValue":\n """Static constructor for a metadata value wrapping a path as\n :py:class:`PathMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "filepath": MetadataValue.path("path/to/file"),\n }\n )\n\n Args:\n path (str): The path for a metadata entry.\n """\n return PathMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def notebook(path: Union[str, os.PathLike]) -> "NotebookMetadataValue":\n """Static constructor for a metadata value wrapping a notebook path as\n :py:class:`NotebookMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "notebook_path": MetadataValue.notebook("path/to/notebook.ipynb"),\n }\n )\n\n Args:\n path (str): The path to a notebook for a metadata entry.\n """\n return NotebookMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def json(data: Union[Sequence[Any], Mapping[str, Any]]) -> "JsonMetadataValue":\n """Static constructor for a metadata value wrapping a json-serializable list or dict\n as :py:class:`JsonMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata={\n "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n },\n )\n\n Args:\n data (Union[Sequence[Any], Mapping[str, Any]]): The JSON data for a metadata entry.\n """\n return JsonMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def md(data: str) -> "MarkdownMetadataValue":\n """Static constructor for a metadata value wrapping markdown data as\n :py:class:`MarkdownMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata={\n 'Details': MetadataValue.md(md_str)\n },\n )\n\n Args:\n data (str): The markdown for a metadata entry.\n """\n return MarkdownMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def python_artifact(python_artifact: Callable) -> "PythonArtifactMetadataValue":\n """Static constructor for a metadata value wrapping a python artifact as\n :py:class:`PythonArtifactMetadataValue`. Can be used as the value type for the\n `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "class": MetadataValue.python_artifact(MyClass),\n "function": MetadataValue.python_artifact(my_function),\n }\n )\n\n Args:\n python_artifact (Callable): The python class or function for a metadata entry.\n """\n check.callable_param(python_artifact, "python_artifact")\n return PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__)
\n\n
[docs] @public\n @staticmethod\n def float(value: float) -> "FloatMetadataValue":\n """Static constructor for a metadata value wrapping a float as\n :py:class:`FloatMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n }\n )\n\n Args:\n value (float): The float value for a metadata entry.\n """\n return FloatMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def int(value: int) -> "IntMetadataValue":\n """Static constructor for a metadata value wrapping an int as\n :py:class:`IntMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "number of rows": MetadataValue.int(len(df)),\n },\n )\n\n Args:\n value (int): The int value for a metadata entry.\n """\n return IntMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def bool(value: bool) -> "BoolMetadataValue":\n """Static constructor for a metadata value wrapping a bool as\n :py:class:`BoolMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n },\n )\n\n Args:\n value (bool): The bool value for a metadata entry.\n """\n return BoolMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def dagster_run(run_id: str) -> "DagsterRunMetadataValue":\n """Static constructor for a metadata value wrapping a reference to a Dagster run.\n\n Args:\n run_id (str): The ID of the run.\n """\n return DagsterRunMetadataValue(run_id)
\n\n
[docs] @public\n @staticmethod\n def asset(asset_key: "AssetKey") -> "DagsterAssetMetadataValue":\n """Static constructor for a metadata value referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata={\n "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n },\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n """\n from dagster._core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return DagsterAssetMetadataValue(asset_key)
\n\n
[docs] @public\n @staticmethod\n @experimental\n def table(\n records: Sequence[TableRecord], schema: Optional[TableSchema] = None\n ) -> "TableMetadataValue":\n """Static constructor for a metadata value wrapping arbitrary tabular data as\n :py:class:`TableMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata={\n "errors": MetadataValue.table(\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name"),\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n },\n )\n """\n return TableMetadataValue(records, schema)
\n\n
[docs] @public\n @staticmethod\n def table_schema(\n schema: TableSchema,\n ) -> "TableSchemaMetadataValue":\n """Static constructor for a metadata value wrapping a table schema as\n :py:class:`TableSchemaMetadataValue`. Can be used as the value type\n for the `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata={\n 'my_table_schema': MetadataValue.table_schema(schema),\n }\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n """\n return TableSchemaMetadataValue(schema)
\n\n
[docs] @public\n @staticmethod\n def null() -> "NullMetadataValue":\n """Static constructor for a metadata value representing null. Can be used as the value type\n for the `metadata` parameter for supported events.\n """\n return NullMetadataValue()
\n\n\n# ########################\n# ##### METADATA VALUE TYPES\n# ########################\n\n# NOTE: We have `type: ignore` in a few places below because mypy complains about an instance method\n# (e.g. `text`) overriding a static method on the superclass of the same name. This is not a concern\n# for us because these static methods should never be called on instances.\n\n# NOTE: `XMetadataValue` classes are serialized with a storage name of `XMetadataEntryData` to\n# maintain backward compatibility. See docstring of `whitelist_for_serdes` for more info.\n\n\n
[docs]@whitelist_for_serdes(storage_name="TextMetadataEntryData")\nclass TextMetadataValue(\n NamedTuple(\n "_TextMetadataValue",\n [\n ("text", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for text metadata entry data.\n\n Args:\n text (Optional[str]): The text data.\n """\n\n def __new__(cls, text: Optional[str]):\n return super(TextMetadataValue, cls).__new__(\n cls, check.opt_str_param(text, "text", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped text data."""\n return self.text
\n\n\n
[docs]@whitelist_for_serdes(storage_name="UrlMetadataEntryData")\nclass UrlMetadataValue(\n NamedTuple(\n "_UrlMetadataValue",\n [\n ("url", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for URL metadata entry data.\n\n Args:\n url (Optional[str]): The URL as a string.\n """\n\n def __new__(cls, url: Optional[str]):\n return super(UrlMetadataValue, cls).__new__(\n cls, check.opt_str_param(url, "url", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped URL."""\n return self.url
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PathMetadataEntryData")\nclass PathMetadataValue(\n NamedTuple("_PathMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for path metadata entry data.\n\n Args:\n path (Optional[str]): The path as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(PathMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="NotebookMetadataEntryData")\nclass NotebookMetadataValue(\n NamedTuple("_NotebookMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for notebook metadata entry data.\n\n Args:\n path (Optional[str]): The path to the notebook as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(NotebookMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path to the notebook as a string."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="JsonMetadataEntryData")\nclass JsonMetadataValue(\n NamedTuple(\n "_JsonMetadataValue",\n [\n ("data", PublicAttr[Optional[Union[Sequence[Any], Mapping[str, Any]]]]),\n ],\n ),\n MetadataValue[Union[Sequence[Any], Mapping[str, Any]]],\n):\n """Container class for JSON metadata entry data.\n\n Args:\n data (Union[Sequence[Any], Dict[str, Any]]): The JSON data.\n """\n\n def __new__(cls, data: Optional[Union[Sequence[Any], Mapping[str, Any]]]):\n data = check.opt_inst_param(data, "data", (Sequence, Mapping))\n try:\n # check that the value is JSON serializable\n seven.dumps(data)\n except TypeError:\n raise DagsterInvalidMetadata("Value is not JSON serializable.")\n return super(JsonMetadataValue, cls).__new__(cls, data)\n\n @public\n @property\n def value(self) -> Optional[Union[Sequence[Any], Mapping[str, Any]]]:\n """Optional[Union[Sequence[Any], Dict[str, Any]]]: The wrapped JSON data."""\n return self.data
\n\n\n
[docs]@whitelist_for_serdes(storage_name="MarkdownMetadataEntryData")\nclass MarkdownMetadataValue(\n NamedTuple(\n "_MarkdownMetadataValue",\n [\n ("md_str", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for markdown metadata entry data.\n\n Args:\n md_str (Optional[str]): The markdown as a string.\n """\n\n def __new__(cls, md_str: Optional[str]):\n return super(MarkdownMetadataValue, cls).__new__(\n cls, check.opt_str_param(md_str, "md_str", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped markdown as a string."""\n return self.md_str
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@whitelist_for_serdes(storage_name="PythonArtifactMetadataEntryData")\nclass PythonArtifactMetadataValue(\n NamedTuple(\n "_PythonArtifactMetadataValue",\n [\n ("module", PublicAttr[str]),\n ("name", PublicAttr[str]),\n ],\n ),\n MetadataValue["PythonArtifactMetadataValue"],\n):\n """Container class for python artifact metadata entry data.\n\n Args:\n module (str): The module where the python artifact can be found\n name (str): The name of the python artifact\n """\n\n def __new__(cls, module: str, name: str):\n return super(PythonArtifactMetadataValue, cls).__new__(\n cls, check.str_param(module, "module"), check.str_param(name, "name")\n )\n\n @public\n @property\n def value(self) -> Self:\n """PythonArtifactMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="FloatMetadataEntryData")\nclass FloatMetadataValue(\n NamedTuple(\n "_FloatMetadataValue",\n [\n ("value", PublicAttr[Optional[float]]),\n ],\n ),\n MetadataValue[float],\n):\n """Container class for float metadata entry data.\n\n Args:\n value (Optional[float]): The float value.\n """\n\n def __new__(cls, value: Optional[float]):\n return super(FloatMetadataValue, cls).__new__(cls, check.opt_float_param(value, "value"))
\n\n\n
[docs]@whitelist_for_serdes(storage_name="IntMetadataEntryData")\nclass IntMetadataValue(\n NamedTuple(\n "_IntMetadataValue",\n [\n ("value", PublicAttr[Optional[int]]),\n ],\n ),\n MetadataValue[int],\n):\n """Container class for int metadata entry data.\n\n Args:\n value (Optional[int]): The int value.\n """\n\n def __new__(cls, value: Optional[int]):\n return super(IntMetadataValue, cls).__new__(cls, check.opt_int_param(value, "value"))
\n\n\n@whitelist_for_serdes(storage_name="BoolMetadataEntryData")\nclass BoolMetadataValue(\n NamedTuple("_BoolMetadataValue", [("value", PublicAttr[Optional[bool]])]),\n MetadataValue[bool],\n):\n """Container class for bool metadata entry data.\n\n Args:\n value (Optional[bool]): The bool value.\n """\n\n def __new__(cls, value: Optional[bool]):\n return super(BoolMetadataValue, cls).__new__(cls, check.opt_bool_param(value, "value"))\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterPipelineRunMetadataEntryData")\nclass DagsterRunMetadataValue(\n NamedTuple(\n "_DagsterRunMetadataValue",\n [\n ("run_id", PublicAttr[str]),\n ],\n ),\n MetadataValue[str],\n):\n """Representation of a dagster run.\n\n Args:\n run_id (str): The run id\n """\n\n def __new__(cls, run_id: str):\n return super(DagsterRunMetadataValue, cls).__new__(cls, check.str_param(run_id, "run_id"))\n\n @public\n @property\n def value(self) -> str:\n """str: The wrapped run id."""\n return self.run_id
\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterAssetMetadataEntryData")\nclass DagsterAssetMetadataValue(\n NamedTuple("_DagsterAssetMetadataValue", [("asset_key", PublicAttr["AssetKey"])]),\n MetadataValue["AssetKey"],\n):\n """Representation of a dagster asset.\n\n Args:\n asset_key (AssetKey): The dagster asset key\n """\n\n def __new__(cls, asset_key: "AssetKey"):\n from dagster._core.definitions.events import AssetKey\n\n return super(DagsterAssetMetadataValue, cls).__new__(\n cls, check.inst_param(asset_key, "asset_key", AssetKey)\n )\n\n @public\n @property\n def value(self) -> "AssetKey":\n """AssetKey: The wrapped :py:class:`AssetKey`."""\n return self.asset_key
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@experimental\n@whitelist_for_serdes(storage_name="TableMetadataEntryData")\nclass TableMetadataValue(\n NamedTuple(\n "_TableMetadataValue",\n [\n ("records", PublicAttr[Sequence[TableRecord]]),\n ("schema", PublicAttr[TableSchema]),\n ],\n ),\n MetadataValue["TableMetadataValue"],\n):\n """Container class for table metadata entry data.\n\n Args:\n records (TableRecord): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n\n
[docs] @public\n @staticmethod\n def infer_column_type(value: object) -> str:\n """str: Infer the :py:class:`TableSchema` column type that will be used for a value."""\n if isinstance(value, bool):\n return "bool"\n elif isinstance(value, int):\n return "int"\n elif isinstance(value, float):\n return "float"\n else:\n return "string"
\n\n def __new__(cls, records: Sequence[TableRecord], schema: Optional[TableSchema]):\n check.sequence_param(records, "records", of_type=TableRecord)\n check.opt_inst_param(schema, "schema", TableSchema)\n\n if len(records) == 0:\n schema = check.not_none(schema, "schema must be provided if records is empty")\n else:\n columns = set(records[0].data.keys())\n for record in records[1:]:\n check.invariant(\n set(record.data.keys()) == columns, "All records must have the same fields"\n )\n schema = schema or TableSchema(\n columns=[\n TableColumn(name=k, type=TableMetadataValue.infer_column_type(v))\n for k, v in records[0].data.items()\n ]\n )\n\n return super(TableMetadataValue, cls).__new__(\n cls,\n records,\n schema,\n )\n\n @public\n @property\n def value(self) -> Self:\n """TableMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="TableSchemaMetadataEntryData")\nclass TableSchemaMetadataValue(\n NamedTuple("_TableSchemaMetadataValue", [("schema", PublicAttr[TableSchema])]),\n MetadataValue[TableSchema],\n):\n """Representation of a schema for arbitrary tabular data.\n\n Args:\n schema (TableSchema): The dictionary containing the schema representation.\n """\n\n def __new__(cls, schema: TableSchema):\n return super(TableSchemaMetadataValue, cls).__new__(\n cls, check.inst_param(schema, "schema", TableSchema)\n )\n\n @public\n @property\n def value(self) -> TableSchema:\n """TableSchema: The wrapped :py:class:`TableSchema`."""\n return self.schema
\n\n\n@whitelist_for_serdes(storage_name="NullMetadataEntryData")\nclass NullMetadataValue(NamedTuple("_NullMetadataValue", []), MetadataValue[None]):\n """Representation of null."""\n\n @public\n @property\n def value(self) -> None:\n """None: The wrapped null value."""\n return None\n\n\n# ########################\n# ##### METADATA BACKCOMPAT\n# ########################\n\n# Metadata used to be represented as a `List[MetadataEntry]`, but that class has been deleted. But\n# we still serialize metadata dicts to the serialized representation of `List[MetadataEntry]` for\n# backcompat purposes.\n\n\nclass MetadataFieldSerializer(FieldSerializer):\n """Converts between metadata dict (new) and metadata entries list (old)."""\n\n storage_name = "metadata_entries"\n loaded_name = "metadata"\n\n def pack(\n self,\n metadata_dict: Mapping[str, MetadataValue],\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Sequence[Mapping[str, Any]]:\n return [\n {\n "__class__": "EventMetadataEntry",\n "label": k,\n # MetadataValue itself can't inherit from NamedTuple and so isn't a PackableValue,\n # but one of its subclasses will always be returned here.\n "entry_data": pack_value(v, whitelist_map, descent_path), # type: ignore\n "description": None,\n }\n for k, v in metadata_dict.items()\n ]\n\n def unpack(\n self,\n metadata_entries: List["MetadataEntry"],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> Mapping[str, MetadataValue]:\n return {e.label: e.entry_data for e in metadata_entries}\n\n\nT_MetadataValue = TypeVar("T_MetadataValue", bound=MetadataValue, covariant=True)\n\n\n# NOTE: MetadataEntry is no longer accessible via the public API-- all metadata APIs use metadata\n# dicts. This clas shas only been preserved to adhere strictly to our backcompat guarantees. It is\n# still instantiated in the above `MetadataFieldSerializer` but that can easily be changed.\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use a dict with `MetadataValue` values instead.",\n)\n@deprecated_param(\n param="entry_data", breaking_version="2.0", additional_warn_text="Use `value` instead."\n)\n@whitelist_for_serdes(storage_name="EventMetadataEntry")\nclass MetadataEntry(\n NamedTuple(\n "_MetadataEntry",\n [\n ("label", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("entry_data", PublicAttr[MetadataValue]),\n ],\n ),\n Generic[T_MetadataValue],\n):\n """A structure for describing metadata for Dagster events.\n\n .. note:: This class is no longer usable in any Dagster API, and will be completely removed in 2.0.\n\n Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\n in the Dagster UI and other tooling.\n\n Should be yielded from within an IO manager to append metadata for a given input/output event.\n For other event types, passing a dict with `MetadataValue` values to the `metadata` argument\n is preferred.\n\n Args:\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n value (MetadataValue): Typed metadata entry data. The different types allow\n for customized display in tools like the Dagster UI.\n """\n\n def __new__(\n cls,\n label: str,\n description: Optional[str] = None,\n entry_data: Optional["RawMetadataValue"] = None,\n value: Optional["RawMetadataValue"] = None,\n ):\n value = cast(\n RawMetadataValue,\n normalize_renamed_param(\n new_val=value,\n new_arg="value",\n old_val=entry_data,\n old_arg="entry_data",\n ),\n )\n value = normalize_metadata_value(value)\n\n return super(MetadataEntry, cls).__new__(\n cls,\n check.str_param(label, "label"),\n check.opt_str_param(description, "description"),\n check.inst_param(value, "value", MetadataValue),\n )\n\n @property\n def value(self):\n """Alias of `entry_data`."""\n return self.entry_data
\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "table": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata.table

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\n\n# ########################\n# ##### TABLE RECORD\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableRecord(\n NamedTuple("TableRecord", [("data", PublicAttr[Mapping[str, Union[str, int, float, bool]]])])\n):\n """Represents one record in a table. Field keys are arbitrary strings-- field values must be\n strings, integers, floats, or bools.\n """\n\n def __new__(cls, data: Mapping[str, Union[str, int, float, bool]]):\n check.dict_param(\n data,\n "data",\n value_type=(str, float, int, bool, type(None)),\n additional_message="Record fields must be one of types: (str, float, int, bool)",\n )\n return super(TableRecord, cls).__new__(cls, data=data)
\n\n\n# ########################\n# ##### TABLE SCHEMA\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableSchema(\n NamedTuple(\n "TableSchema",\n [\n ("columns", PublicAttr[Sequence["TableColumn"]]),\n ("constraints", PublicAttr["TableConstraints"]),\n ],\n )\n):\n """Representation of a schema for tabular data.\n\n Schema is composed of two parts:\n\n - A required list of columns (`TableColumn`). Each column specifies a\n `name`, `type`, set of `constraints`, and (optional) `description`. `type`\n defaults to `string` if unspecified. Column constraints\n (`TableColumnConstraints`) consist of boolean properties `unique` and\n `nullable`, as well as a list of strings `other` containing string\n descriptions of all additional constraints (e.g. `"<= 5"`).\n - An optional list of table-level constraints (`TableConstraints`). A\n table-level constraint cannot be expressed in terms of a single column,\n e.g. col a > col b. Presently, all table-level constraints must be\n expressed as strings under the `other` attribute of a `TableConstraints`\n object.\n\n .. 
code-block:: python\n\n # example schema\n TableSchema(\n constraints = TableConstraints(\n other = [\n "foo > bar",\n ],\n ),\n columns = [\n TableColumn(\n name = "foo",\n type = "string",\n description = "Foo description",\n constraints = TableColumnConstraints(\n required = True,\n other = [\n "starts with the letter 'a'",\n ],\n ),\n ),\n TableColumn(\n name = "bar",\n type = "string",\n ),\n TableColumn(\n name = "baz",\n type = "custom_type",\n constraints = TableColumnConstraints(\n unique = True,\n )\n ),\n ],\n )\n\n Args:\n columns (List[TableColumn]): The columns of the table.\n constraints (Optional[TableConstraints]): The constraints of the table.\n """\n\n def __new__(\n cls,\n columns: Sequence["TableColumn"],\n constraints: Optional["TableConstraints"] = None,\n ):\n return super(TableSchema, cls).__new__(\n cls,\n columns=check.sequence_param(columns, "columns", of_type=TableColumn),\n constraints=check.opt_inst_param(\n constraints, "constraints", TableConstraints, default=_DEFAULT_TABLE_CONSTRAINTS\n ),\n )\n\n
[docs] @public\n @staticmethod\n def from_name_type_dict(name_type_dict: Mapping[str, str]):\n """Constructs a TableSchema from a dictionary whose keys are column names and values are the\n names of data types of those columns.\n """\n return TableSchema(\n columns=[\n TableColumn(name=name, type=type_str) for name, type_str in name_type_dict.items()\n ]\n )
\n\n\n# ########################\n# ##### TABLE CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableConstraints(\n NamedTuple(\n "TableConstraints",\n [\n ("other", PublicAttr[Sequence[str]]),\n ],\n )\n):\n """Descriptor for "table-level" constraints. Presently only one property,\n `other` is supported. This contains strings describing arbitrary\n table-level constraints. A table-level constraint is a constraint defined\n in terms of multiple columns (e.g. col_A > col_B) or in terms of rows.\n\n Args:\n other (List[str]): Descriptions of arbitrary table-level constraints.\n """\n\n def __new__(\n cls,\n other: Sequence[str],\n ):\n return super(TableConstraints, cls).__new__(\n cls,\n other=check.sequence_param(other, "other", of_type=str),\n )
\n\n\n_DEFAULT_TABLE_CONSTRAINTS = TableConstraints(other=[])\n\n# ########################\n# ##### TABLE COLUMN\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumn(\n NamedTuple(\n "TableColumn",\n [\n ("name", PublicAttr[str]),\n ("type", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("constraints", PublicAttr["TableColumnConstraints"]),\n ],\n )\n):\n """Descriptor for a table column. The only property that must be specified\n by the user is `name`. If no `type` is specified, `string` is assumed. If\n no `constraints` are specified, the column is assumed to be nullable\n (i.e. `required = False`) and have no other constraints beyond the data type.\n\n Args:\n name (List[str]): Descriptions of arbitrary table-level constraints.\n type (Optional[str]): The type of the column. Can be an arbitrary\n string. Defaults to `"string"`.\n description (Optional[str]): Description of this column. Defaults to `None`.\n constraints (Optional[TableColumnConstraints]): Column-level constraints.\n If unspecified, column is nullable with no constraints.\n """\n\n def __new__(\n cls,\n name: str,\n type: str = "string", # noqa: A002\n description: Optional[str] = None,\n constraints: Optional["TableColumnConstraints"] = None,\n ):\n return super(TableColumn, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n type=check.str_param(type, "type"),\n description=check.opt_str_param(description, "description"),\n constraints=cast(\n "TableColumnConstraints",\n check.opt_inst_param(\n constraints,\n "constraints",\n TableColumnConstraints,\n default=_DEFAULT_TABLE_COLUMN_CONSTRAINTS,\n ),\n ),\n )
\n\n\n# ########################\n# ##### TABLE COLUMN CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumnConstraints(\n NamedTuple(\n "TableColumnConstraints",\n [\n ("nullable", PublicAttr[bool]),\n ("unique", PublicAttr[bool]),\n ("other", PublicAttr[Optional[Sequence[str]]]),\n ],\n )\n):\n """Descriptor for a table column's constraints. Nullability and uniqueness are specified with\n boolean properties. All other constraints are described using arbitrary strings under the\n `other` property.\n\n Args:\n nullable (Optional[bool]): If true, this column can hold null values.\n unique (Optional[bool]): If true, all values in this column must be unique.\n other (List[str]): Descriptions of arbitrary column-level constraints\n not expressible by the predefined properties.\n """\n\n def __new__(\n cls,\n nullable: bool = True,\n unique: bool = False,\n other: Optional[Sequence[str]] = None,\n ):\n return super(TableColumnConstraints, cls).__new__(\n cls,\n nullable=check.bool_param(nullable, "nullable"),\n unique=check.bool_param(unique, "unique"),\n other=check.opt_sequence_param(other, "other"),\n )
\n\n\n_DEFAULT_TABLE_COLUMN_CONSTRAINTS = TableColumnConstraints()\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata/table", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.definitions.metadata"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.metadata.table"}, "title": "dagster._core.definitions.metadata"}, "multi_asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_asset_sensor_definition

\nimport inspect\nimport json\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._utils import normalize_to_repository\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SensorResult, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n    from dagster._core.storage.event_log.base import EventLogRecord\n\nMAX_NUM_UNCONSUMED_EVENTS = 25\n\n\nclass MultiAssetSensorAssetCursorComponent(\n    NamedTuple(\n        "_MultiAssetSensorAssetCursorComponent",\n        [\n            
("latest_consumed_event_partition", Optional[str]),\n            ("latest_consumed_event_id", Optional[int]),\n            ("trailing_unconsumed_partitioned_event_ids", Dict[str, int]),\n        ],\n    )\n):\n    """A cursor component that is used to track the cursor for a particular asset in a multi-asset\n    sensor.\n\n    Here's an illustration to help explain how this representation works:\n\n    partition_1  ---|----------a----\n    partition_2  -t-----|-x---------\n    partition_3  ----t------|---a---\n\n\n    The "|", "a", "t", and "x" characters represent materialization events.\n    The x-axis is storage_id, which is basically time. The cursor has been advanced to the "|" event\n    for each partition. latest_evaluated_event_partition would be "partition_3", and\n    "latest_evaluated_event_id" would be the storage_id of the "|" event for partition_3.\n\n    The "t" events aren't directly represented in the cursor, because they trail the event that the\n    the cursor for their partition has advanced to. The "a" events aren't directly represented\n    in the cursor, because they occurred after the "latest_evaluated_event_id".  
The "x" event is\n    included in "unevaluated_partitioned_event_ids", because it's after the event that the cursor\n    for its partition has advanced to, but trails "latest_evaluated_event_id".\n\n    Attributes:\n        latest_consumed_event_partition (Optional[str]): The partition of the latest consumed event\n            for this asset.\n        latest_consumed_event_id (Optional[int]): The event ID of the latest consumed event for\n            this asset.\n        trailing_unconsumed_partitioned_event_ids (Dict[str, int]): A mapping containing\n            the partition key mapped to the latest unconsumed materialization event for this\n            partition with an ID less than latest_consumed_event_id.\n    """\n\n    def __new__(\n        cls,\n        latest_consumed_event_partition,\n        latest_consumed_event_id,\n        trailing_unconsumed_partitioned_event_ids,\n    ):\n        return super(MultiAssetSensorAssetCursorComponent, cls).__new__(\n            cls,\n            latest_consumed_event_partition=check.opt_str_param(\n                latest_consumed_event_partition, "latest_consumed_event_partition"\n            ),\n            latest_consumed_event_id=check.opt_int_param(\n                latest_consumed_event_id, "latest_consumed_event_id"\n            ),\n            trailing_unconsumed_partitioned_event_ids=check.dict_param(\n                trailing_unconsumed_partitioned_event_ids,\n                "trailing_unconsumed_partitioned_event_ids",\n                key_type=str,\n                value_type=int,\n            ),\n        )\n\n\nclass MultiAssetSensorContextCursor:\n    # Tracks the state of the cursor within the tick, created for utility purposes.\n    # Must call MultiAssetSensorEvaluationContext._update_cursor_after_evaluation at end of tick\n    # to serialize the cursor.\n    def __init__(self, cursor: Optional[str], context: "MultiAssetSensorEvaluationContext"):\n        loaded_cursor = json.loads(cursor) if cursor else 
{}\n        self._cursor_component_by_asset_key: Dict[str, MultiAssetSensorAssetCursorComponent] = {}\n\n        # The initial latest consumed event ID at the beginning of the tick\n        self.initial_latest_consumed_event_ids_by_asset_key: Dict[str, Optional[int]] = {}\n\n        for str_asset_key, cursor_list in loaded_cursor.items():\n            if len(cursor_list) != 3:\n                # In this case, the cursor object is not a multi asset sensor asset cursor\n                # component. This cursor is maintained by the asset reconciliation sensor.\n                break\n            else:\n                partition_key, event_id, trailing_unconsumed_partitioned_event_ids = cursor_list\n                self._cursor_component_by_asset_key[str_asset_key] = (\n                    MultiAssetSensorAssetCursorComponent(\n                        latest_consumed_event_partition=partition_key,\n                        latest_consumed_event_id=event_id,\n                        trailing_unconsumed_partitioned_event_ids=trailing_unconsumed_partitioned_event_ids,\n                    )\n                )\n\n                self.initial_latest_consumed_event_ids_by_asset_key[str_asset_key] = event_id\n\n        check.dict_param(self._cursor_component_by_asset_key, "unpacked_cursor", key_type=str)\n        self._context = context\n\n    def get_cursor_for_asset(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:\n        return self._cursor_component_by_asset_key.get(\n            str(asset_key), MultiAssetSensorAssetCursorComponent(None, None, {})\n        )\n\n    def get_stringified_cursor(self) -> str:\n        return json.dumps(self._cursor_component_by_asset_key)\n\n\n
[docs]@experimental\nclass MultiAssetSensorEvaluationContext(SensorEvaluationContext):\n """The context object available as the argument to the evaluation function of a\n :py:class:`dagster.MultiAssetSensorDefinition`.\n\n Users should not instantiate this object directly. To construct a\n `MultiAssetSensorEvaluationContext` for testing purposes, use :py:func:`dagster.\n build_multi_asset_sensor_context`.\n\n The `MultiAssetSensorEvaluationContext` contains a cursor object that tracks the state of\n consumed event logs for each monitored asset. For each asset, the cursor stores the storage ID\n of the latest materialization that has been marked as "consumed" (via a call to `advance_cursor`)\n in a `latest_consumed_event_id` field.\n\n For each monitored asset, the cursor will store the latest unconsumed event ID for up to 25\n partitions. Each event ID must be before the `latest_consumed_event_id` field for the asset.\n\n Events marked as consumed via `advance_cursor` will be returned in future ticks until they\n are marked as consumed.\n\n To update the cursor to the latest materialization and clear the unconsumed events, call\n `advance_all_cursors`.\n\n Attributes:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored\n by the sensor. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.\n If needed by the sensor top-level resource definitions will be pulled from this repository.\n You can provide either this or `definitions`.\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest. 
Must be a dictionary of asset key\n strings to a stringified tuple of (latest_event_partition, latest_event_storage_id,\n trailing_unconsumed_partitioned_event_ids).\n last_completion_time (float): DEPRECATED The last time that the sensor was consumed (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n\n Example:\n .. code-block:: python\n\n from dagster import multi_asset_sensor, MultiAssetSensorEvaluationContext\n\n @multi_asset_sensor(monitored_assets=[AssetKey("asset_1), AssetKey("asset_2)])\n def the_sensor(context: MultiAssetSensorEvaluationContext):\n ...\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n repository_def: Optional["RepositoryDefinition"],\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n instance: Optional[DagsterInstance] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n definitions: Optional["Definitions"] = None,\n ):\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n )\n 
self._monitored_asset_keys: Sequence[AssetKey]\n if isinstance(monitored_assets, AssetSelection):\n repo_assets = self._repository_def.assets_defs_by_key.values()\n repo_source_assets = self._repository_def.source_assets_by_key.values()\n self._monitored_asset_keys = list(\n monitored_assets.resolve([*repo_assets, *repo_source_assets])\n )\n else:\n self._monitored_asset_keys = monitored_assets\n\n self._assets_by_key: Dict[AssetKey, Optional[AssetsDefinition]] = {}\n self._partitions_def_by_asset_key: Dict[AssetKey, Optional[PartitionsDefinition]] = {}\n for asset_key in self._monitored_asset_keys:\n assets_def = self._repository_def.assets_defs_by_key.get(asset_key)\n self._assets_by_key[asset_key] = assets_def\n\n source_asset_def = self._repository_def.source_assets_by_key.get(asset_key)\n self._partitions_def_by_asset_key[asset_key] = (\n assets_def.partitions_def\n if assets_def\n else source_asset_def.partitions_def if source_asset_def else None\n )\n\n # Cursor object with utility methods for updating and retrieving cursor information.\n # At the end of each tick, must call update_cursor_after_evaluation to update the serialized\n # cursor.\n self._unpacked_cursor = MultiAssetSensorContextCursor(cursor, self)\n self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()\n\n self._initial_unconsumed_events_by_id: Dict[int, EventLogRecord] = {}\n self._fetched_initial_unconsumed_events = False\n\n super(MultiAssetSensorEvaluationContext, self).__init__(\n instance_ref=instance_ref,\n last_completion_time=last_completion_time,\n last_run_key=last_run_key,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n repository_def=repository_def,\n resources=resource_defs,\n )\n\n def _cache_initial_unconsumed_events(self) -> None:\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n # This method caches the initial unconsumed events for each asset key. 
To generate the\n # current unconsumed events, call get_trailing_unconsumed_events instead.\n if self._fetched_initial_unconsumed_events:\n return\n\n for asset_key in self._monitored_asset_keys:\n unconsumed_event_ids = list(\n self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values()\n )\n if unconsumed_event_ids:\n event_records = self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n storage_ids=unconsumed_event_ids,\n )\n )\n self._initial_unconsumed_events_by_id.update(\n {event_record.storage_id: event_record for event_record in event_records}\n )\n\n self._fetched_initial_unconsumed_events = True\n\n def _get_unconsumed_events_with_ids(\n self, event_ids: Sequence[int]\n ) -> Sequence["EventLogRecord"]:\n self._cache_initial_unconsumed_events()\n unconsumed_events = []\n for event_id in sorted(event_ids):\n event = self._initial_unconsumed_events_by_id.get(event_id)\n unconsumed_events.extend([event] if event else [])\n\n return unconsumed_events\n\n
[docs] @public\n def get_trailing_unconsumed_events(self, asset_key: AssetKey) -> Sequence["EventLogRecord"]:\n """Fetches the unconsumed events for a given asset key. Returns only events\n before the latest consumed event ID for the given asset. To mark an event as consumed,\n pass the event to `advance_cursor`. Returns events in ascending order by storage ID.\n\n Args:\n asset_key (AssetKey): The asset key to get unconsumed events for.\n\n Returns:\n Sequence[EventLogRecord]: The unconsumed events for the given asset key.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n return self._get_unconsumed_events_with_ids(\n list(self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values())\n )
\n\n def _get_partitions_after_cursor(self, asset_key: AssetKey) -> Sequence[str]:\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partition_key = self._get_cursor(asset_key).latest_consumed_event_partition\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n\n if not isinstance(partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(f"No partitions defined for asset key {asset_key}")\n\n partitions_to_fetch = list(\n partitions_def.get_partition_keys(dynamic_partitions_store=self.instance)\n )\n\n if partition_key is not None:\n # Return partitions after the cursor partition, not including the cursor partition\n partitions_to_fetch = partitions_to_fetch[\n partitions_to_fetch.index(partition_key) + 1 :\n ]\n return partitions_to_fetch\n\n def update_cursor_after_evaluation(self) -> None:\n """Updates the cursor after the sensor evaluation function has been called. This method\n should be called at most once per evaluation.\n """\n new_cursor = self._cursor_advance_state_mutation.get_cursor_with_advances(\n self, self._unpacked_cursor\n )\n\n if new_cursor is not None:\n # Cursor was not updated by this context object, so we do not need to update it\n self._cursor = new_cursor\n self._unpacked_cursor = MultiAssetSensorContextCursor(new_cursor, self)\n self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()\n self._fetched_initial_unconsumed_events = False\n\n
[docs] @public\n def latest_materialization_records_by_key(\n self,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n ) -> Mapping[AssetKey, Optional["EventLogRecord"]]:\n """Fetches the most recent materialization event record for each asset in asset_keys.\n Only fetches events after the latest consumed event ID for the given asset key.\n\n Args:\n asset_keys (Optional[Sequence[AssetKey]]): list of asset keys to fetch events for. If\n not specified, the latest materialization will be fetched for all assets the\n multi_asset_sensor monitors.\n\n Returns: Mapping of AssetKey to EventLogRecord where the EventLogRecord is the latest\n materialization event for the asset. If there is no materialization event for the asset,\n the value in the mapping will be None.\n """\n # Do not evaluate unconsumed events, only events newer than the cursor\n # if there are no new events after the cursor, the cursor points to the most\n # recent event.\n\n if asset_keys is None:\n asset_keys = self._monitored_asset_keys\n else:\n asset_keys = check.opt_sequence_param(asset_keys, "asset_keys", of_type=AssetKey)\n\n asset_records = self.instance.get_asset_records(asset_keys)\n\n asset_event_records: Dict[AssetKey, Optional[EventLogRecord]] = {\n asset_key: None for asset_key in asset_keys\n }\n for record in asset_records:\n if (\n record.asset_entry.last_materialization_record\n and record.asset_entry.last_materialization_record.storage_id\n > (self._get_cursor(record.asset_entry.asset_key).latest_consumed_event_id or 0)\n ):\n asset_event_records[record.asset_entry.asset_key] = (\n record.asset_entry.last_materialization_record\n )\n\n return asset_event_records
\n\n
[docs] @public\n def materialization_records_for_key(\n self, asset_key: AssetKey, limit: Optional[int] = None\n ) -> Iterable["EventLogRecord"]:\n """Fetches asset materialization event records for asset_key, with the earliest event first.\n\n Only fetches events after the latest consumed event ID for the given asset key.\n\n Args:\n asset_key (AssetKey): The asset to fetch materialization events for\n limit (Optional[int]): The number of events to fetch\n """\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n if asset_key not in self._assets_by_key:\n raise DagsterInvalidInvocationError(f"Asset key {asset_key} not monitored by sensor.")\n\n events = list(\n self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,\n ),\n ascending=True,\n limit=limit,\n )\n )\n\n return events
\n\n def _get_cursor(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:\n """Returns the MultiAssetSensorAssetCursorComponent for the asset key.\n\n For more information, view the docstring for the MultiAssetSensorAssetCursorComponent class.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n return self._unpacked_cursor.get_cursor_for_asset(asset_key)\n\n
[docs] @public\n def latest_materialization_records_by_partition(\n self,\n asset_key: AssetKey,\n after_cursor_partition: Optional[bool] = False,\n ) -> Mapping[str, "EventLogRecord"]:\n """Given an asset, returns a mapping of partition key to the latest materialization event\n for that partition. Fetches only materializations that have not been marked as "consumed"\n via a call to `advance_cursor`.\n\n Args:\n asset_key (AssetKey): The asset to fetch events for.\n after_cursor_partition (Optional[bool]): If True, only materializations with partitions\n after the cursor's current partition will be returned. By default, set to False.\n\n Returns:\n Mapping[str, EventLogRecord]:\n Mapping of AssetKey to a mapping of partitions to EventLogRecords where the\n EventLogRecord is the most recent materialization event for the partition.\n The mapping preserves the order that the materializations occurred.\n\n Example:\n .. code-block:: python\n\n @asset(partitions_def=DailyPartitionsDefinition("2022-07-01"))\n def july_asset():\n return 1\n\n @multi_asset_sensor(asset_keys=[july_asset.key])\n def my_sensor(context):\n context.latest_materialization_records_by_partition(july_asset.key)\n\n # After materializing july_asset for 2022-07-05, latest_materialization_by_partition\n # returns {"2022-07-05": EventLogRecord(...)}\n\n """\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventLogRecord, EventRecordsFilter\n\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n if asset_key not in self._assets_by_key:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} not monitored in sensor definition"\n )\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n if not isinstance(partitions_def, PartitionsDefinition):\n raise DagsterInvariantViolationError(\n "Cannot get latest materialization by partition for assets with no partitions"\n )\n\n partitions_to_fetch = (\n 
self._get_partitions_after_cursor(asset_key)\n if after_cursor_partition\n else list(partitions_def.get_partition_keys(dynamic_partitions_store=self.instance))\n )\n\n # Retain ordering of materializations\n materialization_by_partition: Dict[str, EventLogRecord] = OrderedDict()\n\n # Add unconsumed events to the materialization by partition dictionary\n # These events came before the cursor, so should be inserted in storage ID ascending order\n for unconsumed_event in sorted(\n self._get_unconsumed_events_with_ids(\n list(self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values())\n )\n ):\n partition = unconsumed_event.partition_key\n if isinstance(partition, str) and partition in partitions_to_fetch:\n if partition in materialization_by_partition:\n # Remove partition to ensure materialization_by_partition preserves\n # the order of materializations\n materialization_by_partition.pop(partition)\n # Add partition and materialization to the end of the OrderedDict\n materialization_by_partition[partition] = unconsumed_event\n\n partition_materializations = self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n asset_partitions=partitions_to_fetch,\n after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,\n ),\n ascending=True,\n )\n for materialization in partition_materializations:\n partition = materialization.partition_key\n\n if isinstance(partition, str):\n if partition in materialization_by_partition:\n # Remove partition to ensure materialization_by_partition preserves\n # the order of materializations\n materialization_by_partition.pop(partition)\n # Add partition and materialization to the end of the OrderedDict\n materialization_by_partition[partition] = materialization\n\n return materialization_by_partition
\n\n
[docs] @public\n def latest_materialization_records_by_partition_and_asset(\n self,\n ) -> Mapping[str, Mapping[AssetKey, "EventLogRecord"]]:\n """Finds the most recent unconsumed materialization for each partition for each asset\n monitored by the sensor. Aggregates all materializations into a mapping of partition key\n to a mapping of asset key to the materialization event for that partition.\n\n For example, if the sensor monitors two partitioned assets A and B that are materialized\n for partition_x after the cursor, this function returns:\n\n .. code-block:: python\n\n {\n "partition_x": {asset_a.key: EventLogRecord(...), asset_b.key: EventLogRecord(...)}\n }\n\n This method can only be called when all monitored assets are partitioned and share\n the same partition definition.\n """\n partitions_defs = list(self._partitions_def_by_asset_key.values())\n if not partitions_defs or not all(x == partitions_defs[0] for x in partitions_defs):\n raise DagsterInvalidInvocationError(\n "All assets must be partitioned and share the same partitions definition"\n )\n\n asset_and_materialization_tuple_by_partition: Dict[\n str, Dict[AssetKey, "EventLogRecord"]\n ] = defaultdict(dict)\n\n for asset_key in self._monitored_asset_keys:\n materialization_by_partition = self.latest_materialization_records_by_partition(\n asset_key\n )\n for partition, materialization in materialization_by_partition.items():\n asset_and_materialization_tuple_by_partition[partition][asset_key] = materialization\n\n return asset_and_materialization_tuple_by_partition
\n\n
[docs] @public\n def get_cursor_partition(self, asset_key: Optional[AssetKey]) -> Optional[str]:\n """A utility method to get the current partition the cursor is on."""\n asset_key = check.opt_inst_param(asset_key, "asset_key", AssetKey)\n if asset_key not in self._monitored_asset_keys:\n raise DagsterInvalidInvocationError(\n "Provided asset key must correspond to a provided asset"\n )\n if asset_key:\n partition_key = self._get_cursor(asset_key).latest_consumed_event_partition\n elif self._monitored_asset_keys is not None and len(self._monitored_asset_keys) == 1:\n partition_key = self._get_cursor(\n self._monitored_asset_keys[0]\n ).latest_consumed_event_partition\n else:\n raise DagsterInvalidInvocationError(\n "Asset key must be provided when multiple assets are defined"\n )\n\n return partition_key
\n\n
[docs] @public\n def all_partitions_materialized(\n self, asset_key: AssetKey, partitions: Optional[Sequence[str]] = None\n ) -> bool:\n """A utility method to check if a provided list of partitions have been materialized\n for a particular asset. This method ignores the cursor and checks all materializations\n for the asset.\n\n Args:\n asset_key (AssetKey): The asset to check partitions for.\n partitions (Optional[Sequence[str]]): A list of partitions to check. If not provided,\n all partitions for the asset will be checked.\n\n Returns:\n bool: True if all selected partitions have been materialized, False otherwise.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n if partitions is not None:\n check.sequence_param(partitions, "partitions", of_type=str)\n if len(partitions) == 0:\n raise DagsterInvalidInvocationError("Must provide at least one partition in list")\n\n materialization_count_by_partition = self.instance.get_materialization_count_by_partition(\n [asset_key]\n ).get(asset_key, {})\n if not partitions:\n if asset_key not in self._monitored_asset_keys:\n raise DagsterInvariantViolationError(\n f"Asset key {asset_key} not monitored by sensor"\n )\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n if not partitions_def:\n raise DagsterInvariantViolationError(\n f"Asset key {asset_key} is not partitioned. Cannot check if partitions have"\n " been materialized."\n )\n partitions = partitions_def.get_partition_keys(dynamic_partitions_store=self.instance)\n\n return all(\n [materialization_count_by_partition.get(partition, 0) != 0 for partition in partitions]\n )
\n\n def _get_asset(self, asset_key: AssetKey, fn_name: str) -> AssetsDefinition:\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n repo_def = cast(RepositoryDefinition, self._repository_def)\n repository_assets = repo_def.assets_defs_by_key\n if asset_key in self._assets_by_key:\n asset_def = self._assets_by_key[asset_key]\n if asset_def is None:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} does not have an AssetDefinition in this repository"\n f" (likely because it is a SourceAsset). fn context.{fn_name} can only be"\n " called for assets with AssetDefinitions in the repository."\n )\n else:\n return asset_def\n elif asset_key in repository_assets:\n return repository_assets[asset_key]\n else:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} not monitored in sensor and does not exist in target jobs"\n )\n\n
[docs] @public\n def get_downstream_partition_keys(\n self, partition_key: str, from_asset_key: AssetKey, to_asset_key: AssetKey\n ) -> Sequence[str]:\n """Converts a partition key from one asset to the corresponding partition key in a downstream\n asset. Uses the existing partition mapping between the upstream asset and the downstream\n asset if it exists, otherwise, uses the default partition mapping.\n\n Args:\n partition_key (str): The partition key to convert.\n from_asset_key (AssetKey): The asset key of the upstream asset, which the provided\n partition key belongs to.\n to_asset_key (AssetKey): The asset key of the downstream asset. The provided partition\n key will be mapped to partitions within this asset.\n\n Returns:\n Sequence[str]: A list of the corresponding downstream partitions in to_asset_key that\n partition_key maps to.\n """\n partition_key = check.str_param(partition_key, "partition_key")\n\n to_asset = self._get_asset(to_asset_key, fn_name="get_downstream_partition_keys")\n from_asset = self._get_asset(from_asset_key, fn_name="get_downstream_partition_keys")\n\n to_partitions_def = to_asset.partitions_def\n\n if not isinstance(to_partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(\n f"Asset key {to_asset_key} is not partitioned. Cannot get partition keys."\n )\n if not isinstance(from_asset.partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(\n f"Asset key {from_asset_key} is not partitioned. Cannot get partition keys."\n )\n\n partition_mapping = to_asset.infer_partition_mapping(\n from_asset_key, from_asset.partitions_def\n )\n downstream_partition_key_subset = (\n partition_mapping.get_downstream_partitions_for_partitions(\n from_asset.partitions_def.empty_subset().with_partition_keys([partition_key]),\n downstream_partitions_def=to_partitions_def,\n dynamic_partitions_store=self.instance,\n )\n )\n\n return list(downstream_partition_key_subset.get_partition_keys())
\n\n
[docs] @public\n def advance_cursor(\n self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]\n ):\n """Marks the provided materialization records as having been consumed by the sensor.\n\n At the end of the tick, the cursor will be updated to advance past all materializations\n records provided via `advance_cursor`. In the next tick, records that have been consumed\n will no longer be returned.\n\n Passing a partitioned materialization record into this function will mark prior materializations\n with the same asset key and partition as having been consumed.\n\n Args:\n materialization_records_by_key (Mapping[AssetKey, Optional[EventLogRecord]]): Mapping of\n AssetKeys to EventLogRecord or None. If an EventLogRecord is provided, the cursor\n for the AssetKey will be updated and future calls to fetch asset materialization events\n will not fetch this event again. If None is provided, the cursor for the AssetKey\n will not be updated.\n """\n self._cursor_advance_state_mutation.add_advanced_records(materialization_records_by_key)\n self._cursor_updated = True
\n\n
[docs] @public\n def advance_all_cursors(self):\n """Updates the cursor to the most recent materialization event for all assets monitored by\n the multi_asset_sensor.\n\n Marks all materialization events as consumed by the sensor, including unconsumed events.\n """\n materializations_by_key = self.latest_materialization_records_by_key()\n\n self._cursor_advance_state_mutation.add_advanced_records(materializations_by_key)\n self._cursor_advance_state_mutation.advance_all_cursors_called = True\n self._cursor_updated = True
\n\n @public\n @property\n def assets_defs_by_key(self) -> Mapping[AssetKey, Optional[AssetsDefinition]]:\n """Mapping[AssetKey, Optional[AssetsDefinition]]: A mapping from AssetKey to the\n AssetsDefinition object which produces it. If a given asset is monitored by this sensor, but\n is not produced within the same code location as this sensor, then the value will be None.\n """\n return self._assets_by_key\n\n @public\n @property\n def asset_keys(self) -> Sequence[AssetKey]:\n """Sequence[AssetKey]: The asset keys which are monitored by this sensor."""\n return self._monitored_asset_keys
\n\n\nclass MultiAssetSensorCursorAdvances:\n _advanced_record_ids_by_key: Dict[AssetKey, Set[int]]\n _partition_key_by_record_id: Dict[int, Optional[str]]\n advance_all_cursors_called: bool\n\n def __init__(self):\n self._advanced_record_ids_by_key = defaultdict(set)\n self._partition_key_by_record_id = {}\n self.advance_all_cursors_called = False\n\n def add_advanced_records(\n self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]\n ):\n for asset_key, materialization in materialization_records_by_key.items():\n if materialization:\n self._advanced_record_ids_by_key[asset_key].add(materialization.storage_id)\n\n self._partition_key_by_record_id[materialization.storage_id] = (\n materialization.partition_key\n )\n\n def get_cursor_with_advances(\n self,\n context: MultiAssetSensorEvaluationContext,\n initial_cursor: MultiAssetSensorContextCursor,\n ) -> Optional[str]:\n """Given the multi asset sensor context and the cursor at the start of the tick,\n returns the cursor that should be used in the next tick.\n\n If the cursor has not been updated, returns None\n """\n if len(self._advanced_record_ids_by_key) == 0:\n # No events marked as advanced\n return None\n\n return json.dumps(\n {\n str(asset_key): self.get_asset_cursor_with_advances(\n asset_key, context, initial_cursor\n )\n for asset_key in context.asset_keys\n }\n )\n\n def get_asset_cursor_with_advances(\n self,\n asset_key: AssetKey,\n context: MultiAssetSensorEvaluationContext,\n initial_cursor: MultiAssetSensorContextCursor,\n ) -> MultiAssetSensorAssetCursorComponent:\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n advanced_records: Set[int] = self._advanced_record_ids_by_key.get(asset_key, set())\n if len(advanced_records) == 0:\n # No events marked as advanced for this asset key\n return initial_cursor.get_cursor_for_asset(asset_key)\n\n initial_asset_cursor = 
initial_cursor.get_cursor_for_asset(asset_key)\n\n latest_consumed_event_id_at_tick_start = initial_asset_cursor.latest_consumed_event_id\n\n greatest_consumed_event_id_in_tick = max(advanced_records)\n latest_consumed_partition_in_tick = self._partition_key_by_record_id[\n greatest_consumed_event_id_in_tick\n ]\n latest_unconsumed_record_by_partition: Dict[str, int] = {}\n\n if not self.advance_all_cursors_called:\n latest_unconsumed_record_by_partition = (\n initial_asset_cursor.trailing_unconsumed_partitioned_event_ids\n )\n unconsumed_events = list(context.get_trailing_unconsumed_events(asset_key)) + list(\n context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n after_cursor=latest_consumed_event_id_at_tick_start,\n before_cursor=greatest_consumed_event_id_in_tick,\n ),\n ascending=True,\n )\n if greatest_consumed_event_id_in_tick\n > (latest_consumed_event_id_at_tick_start or 0)\n else []\n )\n\n # Iterate through events in ascending order, storing the latest unconsumed\n # event for each partition. 
If an advanced event exists for a partition, clear\n # the prior unconsumed event for that partition.\n for event in unconsumed_events:\n partition = event.partition_key\n if partition is not None: # Ignore unpartitioned events\n if event.storage_id not in advanced_records:\n latest_unconsumed_record_by_partition[partition] = event.storage_id\n elif partition in latest_unconsumed_record_by_partition:\n latest_unconsumed_record_by_partition.pop(partition)\n\n if (\n latest_consumed_partition_in_tick is not None\n and latest_consumed_partition_in_tick in latest_unconsumed_record_by_partition\n ):\n latest_unconsumed_record_by_partition.pop(latest_consumed_partition_in_tick)\n\n if len(latest_unconsumed_record_by_partition.keys()) >= MAX_NUM_UNCONSUMED_EVENTS:\n raise DagsterInvariantViolationError(f"""\n You have reached the maximum number of trailing unconsumed events\n ({MAX_NUM_UNCONSUMED_EVENTS}) for asset {asset_key} and no more events can be\n added. You can access the unconsumed events by calling the\n `get_trailing_unconsumed_events` method on the sensor context, and\n mark events as consumed by passing them to `advance_cursor`.\n\n Otherwise, you can clear all unconsumed events and reset the cursor to the latest\n materialization for each asset by calling `advance_all_cursors`.\n """)\n\n return MultiAssetSensorAssetCursorComponent(\n latest_consumed_event_partition=(\n latest_consumed_partition_in_tick\n if greatest_consumed_event_id_in_tick\n > (latest_consumed_event_id_at_tick_start or 0)\n else initial_asset_cursor.latest_consumed_event_partition\n ),\n latest_consumed_event_id=(\n greatest_consumed_event_id_in_tick\n if greatest_consumed_event_id_in_tick\n > (latest_consumed_event_id_at_tick_start or 0)\n else latest_consumed_event_id_at_tick_start\n ),\n trailing_unconsumed_partitioned_event_ids=latest_unconsumed_record_by_partition,\n )\n\n\ndef get_cursor_from_latest_materializations(\n asset_keys: Sequence[AssetKey], instance: DagsterInstance\n) -> 
str:\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n cursor_dict: Dict[str, MultiAssetSensorAssetCursorComponent] = {}\n\n for asset_key in asset_keys:\n materializations = instance.get_event_records(\n EventRecordsFilter(\n DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n ),\n limit=1,\n )\n if materializations:\n last_materialization = list(materializations)[-1]\n\n cursor_dict[str(asset_key)] = MultiAssetSensorAssetCursorComponent(\n last_materialization.partition_key,\n last_materialization.storage_id,\n {},\n )\n\n cursor_str = json.dumps(cursor_dict)\n return cursor_str\n\n\n
[docs]@experimental\ndef build_multi_asset_sensor_context(\n *,\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n repository_def: Optional["RepositoryDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n cursor_from_latest_materializations: bool = False,\n resources: Optional[Mapping[str, object]] = None,\n definitions: Optional["Definitions"] = None,\n) -> MultiAssetSensorEvaluationContext:\n """Builds multi asset sensor execution context for testing purposes using the provided parameters.\n\n This function can be used to provide a context to the invocation of a multi asset sensor definition. If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored\n by the sensor. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n repository_def (RepositoryDefinition): `RepositoryDefinition` object that\n the sensor is defined in. Must provide `definitions` if this is not provided.\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A string cursor to provide to the evaluation of the sensor. Must be\n a dictionary of asset key strings to ints that has been converted to a json string\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n cursor_from_latest_materializations (bool): If True, the cursor will be set to the latest\n materialization for each monitored asset. By default, set to False.\n resources (Optional[Mapping[str, object]]): The resource definitions\n to provide to the sensor.\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n Must provide `repository_def` if this is not provided.\n\n Examples:\n .. 
code-block:: python\n\n with instance_for_test() as instance:\n context = build_multi_asset_sensor_context(\n monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")],\n instance=instance,\n )\n my_asset_sensor(context)\n\n """\n from dagster._core.definitions import RepositoryDefinition\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n )\n\n check.bool_param(cursor_from_latest_materializations, "cursor_from_latest_materializations")\n\n if cursor_from_latest_materializations:\n if cursor:\n raise DagsterInvalidInvocationError(\n "Cannot provide both cursor and cursor_from_latest_materializations objects."\n " Dagster will override the provided cursor based on the"\n " cursor_from_latest_materializations object."\n )\n if not instance:\n raise DagsterInvalidInvocationError(\n "Cannot provide cursor_from_latest_materializations object without a Dagster"\n " instance."\n )\n\n asset_keys: Sequence[AssetKey]\n if isinstance(monitored_assets, AssetSelection):\n asset_keys = cast(\n List[AssetKey],\n list(\n monitored_assets.resolve(list(set(repository_def.assets_defs_by_key.values())))\n ),\n )\n else:\n asset_keys = monitored_assets\n\n cursor = get_cursor_from_latest_materializations(asset_keys, instance)\n\n return MultiAssetSensorEvaluationContext(\n instance_ref=None,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n monitored_assets=monitored_assets,\n repository_def=repository_def,\n 
resource_defs=wrap_resources_for_execution(resources),\n )
\n\n\nAssetMaterializationFunctionReturn = Union[\n Iterator[Union[RunRequest, SkipReason, SensorResult]],\n Sequence[RunRequest],\n RunRequest,\n SkipReason,\n None,\n SensorResult,\n]\nAssetMaterializationFunction = Callable[\n ...,\n AssetMaterializationFunctionReturn,\n]\n\nMultiAssetMaterializationFunction = Callable[\n ...,\n AssetMaterializationFunctionReturn,\n]\n\n\n
[docs]@experimental\nclass MultiAssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a list of\n assets.\n\n Users should not instantiate this object directly. To construct a\n `MultiAssetSensorDefinition`, use :py:func:`dagster.\n multi_asset_sensor`.\n\n Args:\n name (str): The name of the sensor to create.\n asset_keys (Sequence[AssetKey]): The asset_keys this sensor monitors.\n asset_materialization_fn (Callable[[MultiAssetSensorEvaluationContext], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def __init__(\n self,\n name: str,\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n job_name: Optional[str],\n asset_materialization_fn: MultiAssetMaterializationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn):\n def _fn(context):\n def _check_cursor_not_set(sensor_result: SensorResult):\n if sensor_result.cursor:\n raise DagsterInvariantViolationError(\n "Cannot set cursor in a multi_asset_sensor. 
Cursor is set automatically"\n " based on the latest materialization for each monitored asset."\n )\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n with MultiAssetSensorEvaluationContext(\n instance_ref=context.instance_ref,\n last_completion_time=context.last_completion_time,\n last_run_key=context.last_run_key,\n cursor=context.cursor,\n repository_name=context.repository_def.name,\n repository_def=context.repository_def,\n monitored_assets=monitored_assets,\n instance=context.instance,\n resource_defs=context.resource_defs,\n ) as multi_asset_sensor_context:\n context_param_name = get_context_param_name(materialization_fn)\n context_param = (\n {context_param_name: multi_asset_sensor_context}\n if context_param_name\n else {}\n )\n result = materialization_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is None:\n return\n\n # because the materialization_fn can yield results (see _wrapped_fn in multi_asset_sensor decorator),\n # even if you return None in a sensor, it will still cause in inspect.isgenerator(result) to be True.\n # So keep track to see if we actually return any values and should update the cursor\n runs_yielded = False\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n if isinstance(item, RunRequest):\n runs_yielded = True\n if isinstance(item, SensorResult):\n raise DagsterInvariantViolationError(\n "Cannot yield a SensorResult from a multi_asset_sensor. 
Instead"\n " return the SensorResult."\n )\n yield item\n elif isinstance(result, RunRequest):\n runs_yielded = True\n yield result\n elif isinstance(result, SkipReason):\n # if result is a SkipReason, we don't update the cursor, so don't set runs_yielded = True\n yield result\n elif isinstance(result, SensorResult):\n _check_cursor_not_set(result)\n if result.run_requests:\n runs_yielded = True\n yield result\n\n if runs_yielded and not multi_asset_sensor_context.cursor_updated:\n raise DagsterInvalidDefinitionError(\n "Asset materializations have been handled in this sensor, but the cursor"\n " was not updated. This means the same materialization events will be"\n " handled in the next sensor tick. Use context.advance_cursor or"\n " context.advance_all_cursors to update the cursor."\n )\n\n multi_asset_sensor_context.update_cursor_after_evaluation()\n context.update_cursor(multi_asset_sensor_context.cursor)\n\n return _fn\n\n self._raw_asset_materialization_fn = asset_materialization_fn\n\n super(MultiAssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn")\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=request_assets,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> AssetMaterializationFunctionReturn:\n context_param_name = get_context_param_name(self._raw_asset_materialization_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._raw_asset_materialization_fn,\n args,\n kwargs,\n context_type=MultiAssetSensorEvaluationContext,\n )\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n context_param = {context_param_name: 
context} if context_param_name and context else {}\n result = self._raw_asset_materialization_fn(**context_param, **resources)\n\n if context:\n context.update_cursor_after_evaluation()\n return result\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.MULTI_ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_asset_sensor_definition"}, "multi_dimensional_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_dimensional_partitions

\nimport hashlib\nimport itertools\nfrom datetime import datetime\nfrom functools import lru_cache, reduce\nfrom typing import (\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.tags import (\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    get_multidimensional_partition_tag,\n)\n\nfrom .partition import (\n    DefaultPartitionsSubset,\n    DynamicPartitionsDefinition,\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom .time_window_partitions import TimeWindow, TimeWindowPartitionsDefinition\n\nINVALID_STATIC_PARTITIONS_KEY_CHARACTERS = set(["|", ",", "[", "]"])\n\nMULTIPARTITION_KEY_DELIMITER = "|"\n\n\nclass PartitionDimensionKey(\n    NamedTuple("_PartitionDimensionKey", [("dimension_name", str), ("partition_key", str)])\n):\n    """Representation of a single dimension of a multi-dimensional partition key."""\n\n    def __new__(cls, dimension_name: str, partition_key: str):\n        return super(PartitionDimensionKey, cls).__new__(\n            cls,\n            dimension_name=check.str_param(dimension_name, "dimension_name"),\n            partition_key=check.str_param(partition_key, "partition_key"),\n        )\n\n\n
[docs]class MultiPartitionKey(str):\n """A multi-dimensional partition key stores the partition key for each dimension.\n Subclasses the string class to keep partition key type as a string.\n\n Contains additional methods to access the partition key for each dimension.\n Creates a string representation of the partition key for each dimension, separated by a pipe (|).\n Orders the dimensions by name, to ensure consistent string representation.\n """\n\n dimension_keys: List[PartitionDimensionKey] = []\n\n def __new__(cls, keys_by_dimension: Mapping[str, str]):\n check.mapping_param(\n keys_by_dimension, "partitions_by_dimension", key_type=str, value_type=str\n )\n\n dimension_keys: List[PartitionDimensionKey] = [\n PartitionDimensionKey(dimension, keys_by_dimension[dimension])\n for dimension in sorted(list(keys_by_dimension.keys()))\n ]\n\n str_key = super(MultiPartitionKey, cls).__new__(\n cls,\n MULTIPARTITION_KEY_DELIMITER.join(\n [dim_key.partition_key for dim_key in dimension_keys]\n ),\n )\n\n str_key.dimension_keys = dimension_keys\n\n return str_key\n\n def __getnewargs__(self):\n # When this instance is pickled, replace the argument to __new__ with the\n # dimension key mapping instead of the string representation.\n return ({dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys},)\n\n @property\n def keys_by_dimension(self) -> Mapping[str, str]:\n return {dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys}
\n\n\nclass PartitionDimensionDefinition(\n NamedTuple(\n "_PartitionDimensionDefinition",\n [\n ("name", str),\n ("partitions_def", PartitionsDefinition),\n ],\n )\n):\n def __new__(\n cls,\n name: str,\n partitions_def: PartitionsDefinition,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n partitions_def=check.inst_param(partitions_def, "partitions_def", PartitionsDefinition),\n )\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, PartitionDimensionDefinition)\n and self.name == other.name\n and self.partitions_def == other.partitions_def\n )\n\n\nALLOWED_PARTITION_DIMENSION_TYPES = (\n StaticPartitionsDefinition,\n TimeWindowPartitionsDefinition,\n DynamicPartitionsDefinition,\n)\n\n\ndef _check_valid_partitions_dimensions(\n partitions_dimensions: Mapping[str, PartitionsDefinition]\n) -> None:\n for dim_name, partitions_def in partitions_dimensions.items():\n if not any(isinstance(partitions_def, t) for t in ALLOWED_PARTITION_DIMENSION_TYPES):\n raise DagsterInvalidDefinitionError(\n f"Invalid partitions definition type {type(partitions_def)}. "\n "Only the following partitions definition types are supported: "\n f"{ALLOWED_PARTITION_DIMENSION_TYPES}."\n )\n if isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name is None:\n raise DagsterInvalidDefinitionError(\n "DynamicPartitionsDefinition must have a name to be used in a"\n " MultiPartitionsDefinition."\n )\n\n if isinstance(partitions_def, StaticPartitionsDefinition):\n if any(\n [\n INVALID_STATIC_PARTITIONS_KEY_CHARACTERS & set(key)\n for key in partitions_def.get_partition_keys()\n ]\n ):\n raise DagsterInvalidDefinitionError(\n f"Invalid character in partition key for dimension {dim_name}. "\n "A multi-partitions definition cannot contain partition keys with "\n "the following characters: |, [, ], ,"\n )\n\n\n
[docs]class MultiPartitionsDefinition(PartitionsDefinition[MultiPartitionKey]):\n """Takes the cross-product of partitions from two partitions definitions.\n\n For example, with a static partitions definition where the partitions are ["a", "b", "c"]\n and a daily partitions definition, this partitions definition will have the following\n partitions:\n\n 2020-01-01|a\n 2020-01-01|b\n 2020-01-01|c\n 2020-01-02|a\n 2020-01-02|b\n ...\n\n Args:\n partitions_defs (Mapping[str, PartitionsDefinition]):\n A mapping of dimension name to partitions definition. The total set of partitions will\n be the cross-product of the partitions from each PartitionsDefinition.\n\n Attributes:\n partitions_defs (Sequence[PartitionDimensionDefinition]):\n A sequence of PartitionDimensionDefinition objects, each of which contains a dimension\n name and a PartitionsDefinition. The total set of partitions will be the cross-product\n of the partitions from each PartitionsDefinition. This sequence is ordered by\n dimension name, to ensure consistent ordering of the partitions.\n """\n\n def __init__(self, partitions_defs: Mapping[str, PartitionsDefinition]):\n if not len(partitions_defs.keys()) == 2:\n raise DagsterInvalidInvocationError(\n "Dagster currently only supports multi-partitions definitions with 2 partitions"\n " definitions. 
Your multi-partitions definition has"\n f" {len(partitions_defs.keys())} partitions definitions."\n )\n check.mapping_param(\n partitions_defs, "partitions_defs", key_type=str, value_type=PartitionsDefinition\n )\n\n _check_valid_partitions_dimensions(partitions_defs)\n\n self._partitions_defs: List[PartitionDimensionDefinition] = sorted(\n [\n PartitionDimensionDefinition(name, partitions_def)\n for name, partitions_def in partitions_defs.items()\n ],\n key=lambda x: x.name,\n )\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return MultiPartitionsSubset\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(\n str(\n {\n dim_def.name: dim_def.partitions_def.get_serializable_unique_identifier(\n dynamic_partitions_store\n )\n for dim_def in self.partitions_defs\n }\n ).encode("utf-8")\n ).hexdigest()\n\n @property\n def partition_dimension_names(self) -> List[str]:\n return [dim_def.name for dim_def in self._partitions_defs]\n\n @property\n def partitions_defs(self) -> Sequence[PartitionDimensionDefinition]:\n return self._partitions_defs\n\n def get_partitions_def_for_dimension(self, dimension_name: str) -> PartitionsDefinition:\n for dim_def in self._partitions_defs:\n if dim_def.name == dimension_name:\n return dim_def.partitions_def\n check.failed(f"Invalid dimension name {dimension_name}")\n\n # We override the default implementation of `has_partition_key` for performance.\n def has_partition_key(\n self,\n partition_key: Union[MultiPartitionKey, str],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n partition_key = (\n partition_key\n if isinstance(partition_key, MultiPartitionKey)\n else self.get_partition_key_from_str(partition_key)\n )\n if partition_key.keys_by_dimension.keys() != set(self.partition_dimension_names):\n raise 
DagsterUnknownPartitionError(\n f"Invalid partition key {partition_key}. The dimensions of the partition key are"\n " not the dimensions of the partitions definition."\n )\n\n for dimension in self.partitions_defs:\n if not dimension.partitions_def.has_partition_key(\n partition_key.keys_by_dimension[dimension.name],\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ):\n return False\n return True\n\n # store results for repeated calls with the same current_time\n @lru_cache(maxsize=1)\n def _get_partition_keys(\n self, current_time: datetime, dynamic_partitions_store: Optional[DynamicPartitionsStore]\n ) -> Sequence[MultiPartitionKey]:\n partition_key_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in self._partitions_defs\n ]\n\n return [\n MultiPartitionKey(\n {self._partitions_defs[i].name: key for i, key in enumerate(partition_key_tuple)}\n )\n for partition_key_tuple in itertools.product(*partition_key_sequences)\n ]\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[MultiPartitionKey]:\n """Returns a list of MultiPartitionKeys representing the partition keys of the\n PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partition dimensions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when a\n dimension is a DynamicPartitionsDefinition with a name defined. Users can pass the\n DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[MultiPartitionKey]\n """\n return self._get_partition_keys(\n current_time or pendulum.now("UTC"), dynamic_partitions_store\n )
\n\n def filter_valid_partition_keys(\n self, partition_keys: Set[str], dynamic_partitions_store: DynamicPartitionsStore\n ) -> Set[MultiPartitionKey]:\n partition_keys_by_dimension = {\n dim.name: dim.partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n }\n validated_partitions = set()\n for partition_key in partition_keys:\n partition_key_strs = partition_key.split(MULTIPARTITION_KEY_DELIMITER)\n if len(partition_key_strs) != len(self.partitions_defs):\n continue\n\n multipartition_key = MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n if all(\n key in partition_keys_by_dimension.get(dim, [])\n for dim, key in multipartition_key.keys_by_dimension.items()\n ):\n validated_partitions.add(partition_key)\n\n return validated_partitions\n\n def __eq__(self, other):\n return (\n isinstance(other, MultiPartitionsDefinition)\n and self.partitions_defs == other.partitions_defs\n )\n\n def __hash__(self):\n return hash(\n tuple(\n [\n (partitions_def.name, partitions_def.__repr__())\n for partitions_def in self.partitions_defs\n ]\n )\n )\n\n def __str__(self) -> str:\n dimension_1 = self._partitions_defs[0]\n dimension_2 = self._partitions_defs[1]\n partition_str = (\n "Multi-partitioned, with dimensions: \\n"\n f"{dimension_1.name.capitalize()}: {dimension_1.partitions_def} \\n"\n f"{dimension_2.name.capitalize()}: {dimension_2.partitions_def}"\n )\n return partition_str\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(dimensions={[str(dim) for dim in self.partitions_defs]}"\n\n def get_partition_key_from_str(self, partition_key_str: str) -> MultiPartitionKey:\n """Given a string representation of a partition key, returns a MultiPartitionKey object."""\n check.str_param(partition_key_str, "partition_key_str")\n\n partition_key_strs = partition_key_str.split(MULTIPARTITION_KEY_DELIMITER)\n check.invariant(\n 
len(partition_key_strs) == len(self.partitions_defs),\n f"Expected {len(self.partitions_defs)} partition keys in partition key string"\n f" {partition_key_str}, but got {len(partition_key_strs)}",\n )\n\n return MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n def _get_primary_and_secondary_dimension(\n self,\n ) -> Tuple[PartitionDimensionDefinition, PartitionDimensionDefinition]:\n # Multipartitions subsets are serialized by primary dimension. If changing\n # the selection of primary/secondary dimension, will need to also update the\n # serialization of MultiPartitionsSubsets\n\n time_dimensions = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_dimensions) == 1:\n primary_dimension, secondary_dimension = time_dimensions[0], next(\n iter([dim for dim in self.partitions_defs if dim != time_dimensions[0]])\n )\n else:\n primary_dimension, secondary_dimension = (\n self.partitions_defs[0],\n self.partitions_defs[1],\n )\n\n return primary_dimension, secondary_dimension\n\n @property\n def primary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[0]\n\n @property\n def secondary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[1]\n\n def get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:\n partition_key = cast(MultiPartitionKey, self.get_partition_key_from_str(partition_key))\n tags = {**super().get_tags_for_partition_key(partition_key)}\n tags.update(get_tags_from_multi_partition_key(partition_key))\n return tags\n\n @property\n def time_window_dimension(self) -> PartitionDimensionDefinition:\n time_window_dims = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n check.invariant(\n len(time_window_dims) == 1, "Expected exactly one time 
window partitioned dimension"\n )\n return next(iter(time_window_dims))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n if not isinstance(partition_key, MultiPartitionKey):\n partition_key = self.get_partition_key_from_str(partition_key)\n\n time_window_dimension = self.time_window_dimension\n return cast(\n TimeWindowPartitionsDefinition, time_window_dimension.partitions_def\n ).time_window_for_partition_key(\n cast(MultiPartitionKey, partition_key).keys_by_dimension[time_window_dimension.name]\n )\n\n def get_multipartition_keys_with_dimension_value(\n self,\n dimension_name: str,\n dimension_partition_key: str,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Sequence[MultiPartitionKey]:\n check.str_param(dimension_name, "dimension_name")\n check.str_param(dimension_partition_key, "dimension_partition_key")\n\n matching_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name == dimension_name\n ]\n other_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name != dimension_name\n ]\n\n check.invariant(\n len(matching_dimensions) == 1,\n f"Dimension {dimension_name} not found in MultiPartitionsDefinition with dimensions"\n f" {[dim.name for dim in self.partitions_defs]}",\n )\n\n partition_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in other_dimensions\n ] + [[dimension_partition_key]]\n\n # Names of partitions dimensions in the same order as partition_sequences\n partition_dim_names = [dim.name for dim in other_dimensions] + [dimension_name]\n\n return [\n MultiPartitionKey(\n {\n partition_dim_names[i]: partition_key\n for i, partition_key in enumerate(partitions_tuple)\n }\n )\n for partitions_tuple in itertools.product(*partition_sequences)\n ]\n\n def get_num_partitions(\n self,\n 
current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Static partitions definitions can contain duplicate keys (will throw error in 1.3.0)\n # In the meantime, relying on get_num_partitions to handle duplicates to display\n # correct counts in the Dagster UI.\n dimension_counts = [\n dim.partitions_def.get_num_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n ]\n return reduce(lambda x, y: x * y, dimension_counts, 1)
\n\n\nclass MultiPartitionsSubset(DefaultPartitionsSubset):\n def __init__(\n self,\n partitions_def: MultiPartitionsDefinition,\n subset: Optional[Set[str]] = None,\n ):\n check.inst_param(partitions_def, "partitions_def", MultiPartitionsDefinition)\n subset = (\n set(\n [\n partitions_def.get_partition_key_from_str(key)\n for key in subset\n if MULTIPARTITION_KEY_DELIMITER in key\n ]\n )\n if subset\n else set()\n )\n super(MultiPartitionsSubset, self).__init__(partitions_def, subset)\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "MultiPartitionsSubset":\n return MultiPartitionsSubset(\n cast(MultiPartitionsDefinition, self._partitions_def),\n self._subset | set(partition_keys),\n )\n\n\ndef get_tags_from_multi_partition_key(multi_partition_key: MultiPartitionKey) -> Mapping[str, str]:\n check.inst_param(multi_partition_key, "multi_partition_key", MultiPartitionKey)\n\n return {\n get_multidimensional_partition_tag(dimension.dimension_name): dimension.partition_key\n for dimension in multi_partition_key.dimension_keys\n }\n\n\ndef get_multipartition_key_from_tags(tags: Mapping[str, str]) -> str:\n partitions_by_dimension: Dict[str, str] = {}\n for tag in tags:\n if tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX):\n dimension = tag[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]\n partitions_by_dimension[dimension] = tags[tag]\n\n return MultiPartitionKey(partitions_by_dimension)\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_dimensional_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_dimensional_partitions"}, "op_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.op_definition

\nimport inspect\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, deprecated_param, public\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.dependency import NodeHandle, NodeInputHandle\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    InputManagerRequirement,\n    OpDefinitionResourceRequirement,\n    OutputManagerRequirement,\n    ResourceRequirement,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.types.dagster_type import DagsterType, DagsterTypeKind\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .hook_definition import HookDefinition\nfrom .inference import infer_output_props\nfrom .input import In, InputDefinition\nfrom .output import Out, OutputDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_layer import AssetLayer\n\n    from .composition import PendingNodeInvocation\n    from .decorators.op_decorator import DecoratedOpFunction\n\nOpComputeFunction: TypeAlias = Callable[..., Any]\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead."\n)\nclass OpDefinition(NodeDefinition, IHasInternalInit):\n """Defines an op, the functional unit of user-defined computation.\n\n For more details on what a op is, refer to the\n `Ops Overview <../../concepts/ops-jobs-graphs/ops>`_ .\n\n End users should prefer the :func:`@op <op>` decorator. OpDefinition is generally intended to be\n used by framework authors or for programatically generated ops.\n\n Args:\n name (str): Name of the op. Must be unique within any :py:class:`GraphDefinition` or\n :py:class:`JobDefinition` that contains the op.\n input_defs (List[InputDefinition]): Inputs of the op.\n compute_fn (Callable): The core of the op, the function that performs the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information\n provided by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the op's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`AssetMaterialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the op.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that the config provided for the op matches this schema and will fail if it does not. If\n not set, Dagster will accept any config provided for the op.\n description (Optional[str]): Human-readable description of the op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this op.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the op. If set,\n this is used as a default code version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n\n Examples:\n .. code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n OpDefinition(\n name="add_one",\n ins={"num": In(int)},\n outs={"result": Out(int)},\n compute_fn=_add_one,\n )\n """\n\n _compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"]\n _config_schema: IDefinitionConfigSchema\n _required_resource_keys: AbstractSet[str]\n _version: Optional[str]\n _retry_policy: Optional[RetryPolicy]\n\n def __init__(\n self,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n description: Optional[str] = None,\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n ):\n from .decorators.op_decorator import DecoratedOpFunction, resolve_checked_op_fn_inputs\n\n ins = check.opt_mapping_param(ins, "ins")\n input_defs = [\n inp.to_definition(name) for name, inp in sorted(ins.items(), key=lambda inp: inp[0])\n ] # sort so that input definition order is deterministic\n\n if isinstance(compute_fn, DecoratedOpFunction):\n resolved_input_defs: Sequence[InputDefinition] = resolve_checked_op_fn_inputs(\n decorator_name="@op",\n fn_name=name,\n compute_fn=cast(DecoratedOpFunction, compute_fn),\n explicit_input_defs=input_defs,\n 
exclude_nothing=True,\n )\n self._compute_fn = compute_fn\n _validate_context_type_hint(self._compute_fn.decorated_fn)\n else:\n resolved_input_defs = input_defs\n self._compute_fn = check.callable_param(compute_fn, "compute_fn")\n _validate_context_type_hint(self._compute_fn)\n\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n self._version = code_version\n\n check.opt_mapping_param(outs, "outs")\n output_defs = _resolve_output_defs_from_outs(\n compute_fn=compute_fn, outs=outs, default_code_version=code_version\n )\n\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._required_resource_keys = frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n positional_inputs = (\n self._compute_fn.positional_inputs()\n if isinstance(self._compute_fn, DecoratedOpFunction)\n else None\n )\n\n super(OpDefinition, self).__init__(\n name=name,\n input_defs=check.sequence_param(resolved_input_defs, "input_defs", InputDefinition),\n output_defs=check.sequence_param(output_defs, "output_defs", OutputDefinition),\n description=description,\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n )\n\n def dagster_internal_init(\n *,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]],\n outs: Optional[Mapping[str, Out]],\n description: Optional[str],\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]],\n required_resource_keys: Optional[AbstractSet[str]],\n tags: Optional[Mapping[str, Any]],\n version: Optional[str],\n retry_policy: Optional[RetryPolicy],\n code_version: Optional[str],\n ) -> "OpDefinition":\n return OpDefinition(\n compute_fn=compute_fn,\n name=name,\n ins=ins,\n outs=outs,\n description=description,\n 
config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n code_version=code_version,\n )\n\n @property\n def node_type_str(self) -> str:\n return "op"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this op."""\n return super(OpDefinition, self).name\n\n @public\n @property\n def ins(self) -> Mapping[str, In]:\n """Mapping[str, In]: A mapping from input name to the In object that represents that input."""\n return {input_def.name: In.from_definition(input_def) for input_def in self.input_defs}\n\n @public\n @property\n def outs(self) -> Mapping[str, Out]:\n """Mapping[str, Out]: A mapping from output name to the Out object that represents that output."""\n return {output_def.name: Out.from_definition(output_def) for output_def in self.output_defs}\n\n @property\n def compute_fn(self) -> Union[Callable[..., Any], "DecoratedOpFunction"]:\n return self._compute_fn\n\n @public\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n """IDefinitionConfigSchema: The config schema for this op."""\n return self._config_schema\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """AbstractSet[str]: A set of keys for resources that must be provided to this OpDefinition."""\n return frozenset(self._required_resource_keys)\n\n @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use `code_version` instead.")\n @property\n def version(self) -> Optional[str]:\n """str: Version of the code encapsulated by the op. 
If set, this is used as a\n default code version for all outputs.\n """\n return self._version\n\n @public\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n """Optional[RetryPolicy]: The RetryPolicy for this op."""\n return self._retry_policy\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for this op."""\n return super(OpDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given name."""\n return super(OpDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given tags."""\n return super(OpDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given hook definitions."""\n return super(OpDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given retry policy."""\n return super(OpDefinition, self).with_retry_policy(retry_policy)
\n\n def is_from_decorator(self) -> bool:\n from .decorators.op_decorator import DecoratedOpFunction\n\n return isinstance(self._compute_fn, DecoratedOpFunction)\n\n def get_output_annotation(self) -> Any:\n if not self.is_from_decorator():\n raise DagsterInvalidInvocationError(\n f"Attempted to get output annotation for {self.node_type_str} '{self.name}', "\n "which was not constructed from a decorated function."\n )\n return cast("DecoratedOpFunction", self.compute_fn).get_output_annotation()\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n yield self\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: T_Handle\n ) -> Tuple[OutputDefinition, T_Handle]:\n return self.output_def_named(output_name), handle\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n return self\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n handle = cast(NodeHandle, check.inst_param(handle, "handle", NodeHandle))\n unresolveable_input_defs = []\n for input_def in self.input_defs:\n if (\n not input_def.dagster_type.loader\n and not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n and not input_def.has_default_value\n and not input_def.input_manager_key\n ):\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n # If input_asset_key is present, this input can be resolved\n # by a source asset, so input does not need to be resolved\n # at the top level.\n if input_asset_key:\n continue\n unresolveable_input_defs.append(input_def)\n return unresolveable_input_defs\n\n def input_has_default(self, input_name: str) -> bool:\n return 
self.input_def_named(input_name).has_default_value\n\n def default_value_for_input(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).default_value\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n return True\n\n def with_replaced_properties(\n self,\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[IDefinitionConfigSchema] = None,\n description: Optional[str] = None,\n ) -> "OpDefinition":\n return OpDefinition.dagster_internal_init(\n name=name,\n ins=ins\n or {input_def.name: In.from_definition(input_def) for input_def in self.input_defs},\n outs=outs\n or {\n output_def.name: Out.from_definition(output_def) for output_def in self.output_defs\n },\n compute_fn=self.compute_fn,\n config_schema=config_schema or self.config_schema,\n description=description or self.description,\n tags=self.tags,\n required_resource_keys=self.required_resource_keys,\n code_version=self._version,\n retry_policy=self.retry_policy,\n version=None, # code_version replaces version\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n ) -> "OpDefinition":\n return self.with_replaced_properties(\n name=name,\n description=description,\n config_schema=config_schema,\n )\n\n def get_resource_requirements(\n self,\n outer_context: Optional[object] = None,\n ) -> Iterator[ResourceRequirement]:\n # Outer requiree in this context is the outer-calling node handle. 
If not provided, then\n # just use the op name.\n outer_context = cast(Optional[Tuple[NodeHandle, Optional["AssetLayer"]]], outer_context)\n if not outer_context:\n handle = None\n asset_layer = None\n else:\n handle, asset_layer = outer_context\n node_description = f"{self.node_type_str} '{handle or self.name}'"\n for resource_key in sorted(list(self.required_resource_keys)):\n yield OpDefinitionResourceRequirement(\n key=resource_key, node_description=node_description\n )\n for input_def in self.input_defs:\n if input_def.input_manager_key:\n yield InputManagerRequirement(\n key=input_def.input_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n elif asset_layer and handle:\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n if input_asset_key:\n io_manager_key = asset_layer.io_manager_key_for_asset(input_asset_key)\n yield InputManagerRequirement(\n key=io_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n\n for output_def in self.output_defs:\n yield OutputManagerRequirement(\n key=output_def.io_manager_key,\n node_description=node_description,\n output_name=output_def.name,\n )\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n return [input_handle]\n\n def __call__(self, *args, **kwargs) -> Any:\n from .composition import is_in_composition\n\n if is_in_composition():\n return super(OpDefinition, self).__call__(*args, **kwargs)\n\n return direct_invocation_result(self, *args, **kwargs)
\n\n\ndef _resolve_output_defs_from_outs(\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n outs: Optional[Mapping[str, Out]],\n default_code_version: Optional[str],\n) -> Sequence[OutputDefinition]:\n from .decorators.op_decorator import DecoratedOpFunction\n\n if isinstance(compute_fn, DecoratedOpFunction):\n inferred_output_props = infer_output_props(compute_fn.decorated_fn)\n annotation = inferred_output_props.annotation\n description = inferred_output_props.description\n else:\n inferred_output_props = None\n annotation = inspect.Parameter.empty\n description = None\n\n if outs is None:\n return [OutputDefinition.create_from_inferred(inferred_output_props, default_code_version)]\n\n # If only a single entry has been provided to the out dict, then slurp the\n # annotation into the entry.\n if len(outs) == 1:\n name = next(iter(outs.keys()))\n only_out = outs[name]\n return [only_out.to_definition(annotation, name, description, default_code_version)]\n\n output_defs: List[OutputDefinition] = []\n\n # Introspection on type annotations is experimental, so checking\n # metaclass is the best we can do.\n if annotation != inspect.Parameter.empty and not get_origin(annotation) == tuple:\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation for multiple outputs, but received non-tuple annotation."\n )\n if annotation != inspect.Parameter.empty and not len(get_args(annotation)) == len(outs):\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation to have number of entries matching the "\n f"number of outputs for more than one output. Expected {len(outs)} "\n f"outputs but annotation has {len(get_args(annotation))}."\n )\n for idx, (name, cur_out) in enumerate(outs.items()):\n annotation_type = (\n get_args(annotation)[idx]\n if annotation != inspect.Parameter.empty\n else inspect.Parameter.empty\n )\n # Don't provide description when using multiple outputs. 
Introspection\n # is challenging when faced with multiple inputs.\n output_defs.append(\n cur_out.to_definition(\n annotation_type, name=name, description=None, code_version=default_code_version\n )\n )\n\n return output_defs\n\n\ndef _validate_context_type_hint(fn):\n from inspect import _empty as EmptyAnnotation\n\n from dagster._core.decorator_utils import get_function_params\n from dagster._core.definitions.decorators.op_decorator import is_context_provided\n from dagster._core.execution.context.compute import AssetExecutionContext, OpExecutionContext\n\n params = get_function_params(fn)\n if is_context_provided(params):\n if (\n params[0].annotation is not AssetExecutionContext\n and params[0].annotation is not OpExecutionContext\n and params[0].annotation is not EmptyAnnotation\n ):\n raise DagsterInvalidDefinitionError(\n f"Cannot annotate `context` parameter with type {params[0].annotation}. `context`"\n " must be annotated with AssetExecutionContext, OpExecutionContext, or left blank."\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/op_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.op_definition"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.output

\nimport inspect\nfrom typing import (\n    Any,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataUserInput,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    is_dynamic_output_annotation,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredOutputProps\nfrom .input import NoValueSentinel\nfrom .utils import DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, check_valid_name\n\nTOutputDefinition = TypeVar("TOutputDefinition", bound="OutputDefinition")\nTOut = TypeVar("TOut", bound="Out")\n\n\nclass OutputDefinition:\n    """Defines an output from an op's compute function.\n\n    Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n    Many ops have only one output, in which case the user can provide a single output definition\n    that will be given the default name, "result".\n\n    Output definitions may be typed using the Dagster type system.\n\n    Args:\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this output.\n            Users should provide the Python type of the objects that they expect the op to yield\n            for this output, or a :py:class:`DagsterType` that defines a runtime check that they\n            want to be run on this output. Defaults to :py:class:`Any`.\n        name (Optional[str]): Name of the output. (default: "result")\n        description (Optional[str]): Human-readable description of the output.\n        is_required (Optional[bool]): Whether the presence of this field is required. 
(default: True)\n        io_manager_key (Optional[str]): The resource key of the IOManager used for storing this\n            output and loading it in downstream steps (default: "io_manager").\n        metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n            For example, users can provide a file path if the data object will be stored in a\n            filesystem, or provide information of a database table when it is going to load the data\n            into the table.\n        code_version (Optional[str]): (Experimental) Version of the code that generates this output. In\n            general, versions should be set only for code that deterministically produces the same\n            output when given the same inputs.\n\n    """\n\n    def __init__(\n        self,\n        dagster_type=None,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        is_required: bool = True,\n        io_manager_key: Optional[str] = None,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        code_version: Optional[str] = None,\n        # make sure new parameters are updated in combine_with_inferred below\n    ):\n        self._name = check_valid_name(check.opt_str_param(name, "name", DEFAULT_OUTPUT))\n        self._type_not_set = dagster_type is None\n        self._dagster_type = resolve_dagster_type(dagster_type)\n        self._description = check.opt_str_param(description, "description")\n        self._is_required = check.bool_param(is_required, "is_required")\n        self._io_manager_key = check.opt_str_param(\n            io_manager_key,\n            "io_manager_key",\n            default=DEFAULT_IO_MANAGER_KEY,\n        )\n        self._code_version = check.opt_str_param(code_version, "code_version")\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n    @property\n    def 
name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def is_required(self) -> bool:\n        return self._is_required\n\n    @property\n    def io_manager_key(self) -> str:\n        return self._io_manager_key\n\n    @property\n    def code_version(self) -> Optional[str]:\n        return self._code_version\n\n    @property\n    def optional(self) -> bool:\n        return not self.is_required\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_dynamic(self) -> bool:\n        return False\n\n    def mapping_from(\n        self, node_name: str, output_name: Optional[str] = None, from_dynamic_mapping: bool = False\n    ) -> "OutputMapping":\n        """Create an output mapping from an output of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`OutputMapping` from the output of a child node.\n\n        Args:\n            node_name (str): The name of the child node from which to map this output.\n            output_name (str): The name of the child node's output from which to map this output.\n\n        Examples:\n            .. 
code-block:: python\n\n                output_mapping = OutputDefinition(Int).mapping_from('child_node')\n        """\n        return OutputMapping(\n            graph_output_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_output_name=output_name or DEFAULT_OUTPUT,\n            graph_output_description=self.description,\n            dagster_type=self.dagster_type,\n            from_dynamic_mapping=from_dynamic_mapping or self.is_dynamic,\n        )\n\n    @staticmethod\n    def create_from_inferred(\n        inferred: Optional[InferredOutputProps], code_version: Optional[str] = None\n    ) -> "OutputDefinition":\n        if not inferred:\n            return OutputDefinition(code_version=code_version)\n        if is_dynamic_output_annotation(inferred.annotation):\n            return DynamicOutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n        else:\n            return OutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n\n    def combine_with_inferred(\n        self: TOutputDefinition, inferred: InferredOutputProps\n    ) -> TOutputDefinition:\n        dagster_type = self.dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred.annotation)\n        if self.description is None:\n            description = inferred.description\n        else:\n            description = self.description\n\n        return self.__class__(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            is_required=self.is_required,\n            io_manager_key=self.io_manager_key,\n            metadata=self._metadata,\n        )\n\n\ndef 
_checked_inferred_type(inferred: Any) -> DagsterType:\n    try:\n        if inferred == inspect.Parameter.empty:\n            return resolve_dagster_type(None)\n        elif inferred is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            return resolve_dagster_type(type(None))\n        else:\n            return resolve_dagster_type(inferred)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred}' from return type annotation, correct the issue "\n            "or explicitly set the dagster_type via Out()."\n        ) from e\n\n\nclass DynamicOutputDefinition(OutputDefinition):\n    """Variant of :py:class:`OutputDefinition <dagster.OutputDefinition>` for an\n    output that will dynamically alter the graph at runtime.\n\n    When using in a composition function such as :py:func:`@job <dagster.job>`,\n    dynamic outputs must be used with either:\n\n    * ``map`` - clone downstream nodes for each separate :py:class:`DynamicOutput`\n    * ``collect`` - gather across all :py:class:`DynamicOutput` in to a list\n\n    Uses the same constructor as :py:class:`OutputDefinition <dagster.OutputDefinition>`\n\n        .. 
code-block:: python\n\n            @op(\n                config_schema={\n                    "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n                },\n                output_defs=[DynamicOutputDefinition(str)],\n            )\n            def files_in_directory(context):\n                path = context.op_config["path"]\n                dirname, _, filenames = next(os.walk(path))\n                for file in filenames:\n                    yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n            @job\n            def process_directory():\n                files = files_in_directory()\n\n                # use map to invoke an op on each dynamic output\n                file_results = files.map(process_file)\n\n                # use collect to gather the results in to a list\n                summarize_directory(file_results.collect())\n    """\n\n    @property\n    def is_dynamic(self) -> bool:\n        return True\n\n\nclass OutputPointer(NamedTuple("_OutputPointer", [("node_name", str), ("output_name", str)])):\n    def __new__(cls, node_name: str, output_name: Optional[str] = None):\n        return super(OutputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the underlying op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass OutputMapping(NamedTuple):\n """Defines an output mapping for a graph.\n\n Args:\n graph_output_name (str): Name of the output in the graph being mapped to.\n mapped_node_name (str): Named of the node (op/graph) that the output is being mapped from.\n mapped_node_output_name (str): Name of the output in the node (op/graph) that is being mapped from.\n graph_output_description (Optional[str]): A description of the output in the graph being mapped from.\n from_dynamic_mapping (bool): Set to true if the node being mapped to is a mapped dynamic node.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's output being mapped to.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import OutputMapping, GraphDefinition, op, graph, GraphOut\n\n @op\n def emit_five(x):\n return 5\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[emit_five],\n output_mappings=[\n OutputMapping(\n graph_output_name="result", # Default output name\n mapped_node_name="emit_five",\n mapped_node_output_name="result"\n )\n ]\n )\n\n @graph(out=GraphOut())\n def the_graph:\n return emit_five()\n """\n\n graph_output_name: str\n mapped_node_name: str\n mapped_node_output_name: str\n graph_output_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n from_dynamic_mapping: bool = False\n\n @property\n def maps_from(self) -> OutputPointer:\n return OutputPointer(self.mapped_node_name, self.mapped_node_output_name)\n\n def get_definition(self, is_dynamic: bool) -> "OutputDefinition":\n check.invariant(not is_dynamic or self.from_dynamic_mapping)\n is_dynamic = is_dynamic or self.from_dynamic_mapping\n klass = DynamicOutputDefinition if is_dynamic else OutputDefinition\n return klass(\n name=self.graph_output_name,\n description=self.graph_output_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class Out(\n NamedTuple(\n "_Out",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("io_manager_key", PublicAttr[str]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("code_version", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an output from an op's compute function.\n\n Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many ops have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Outs may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the output manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n code_version (Optional[str]): (Experimental) Version of the code that generates this output. 
In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n code_version: Optional[str] = None,\n # make sure new parameters are updated in combine_with_inferred below\n ):\n return super(Out, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=description,\n is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=metadata,\n code_version=code_version,\n )\n\n @classmethod\n def from_definition(cls, output_def: "OutputDefinition"):\n klass = Out if not output_def.is_dynamic else DynamicOut\n return klass(\n dagster_type=output_def.dagster_type,\n description=output_def.description,\n is_required=output_def.is_required,\n io_manager_key=output_def.io_manager_key,\n metadata=output_def.metadata,\n code_version=output_def.code_version,\n )\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n klass = OutputDefinition if not self.is_dynamic else DynamicOutputDefinition\n\n return klass(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return False
\n\n\n
[docs]class DynamicOut(Out):\n """Variant of :py:class:`Out <dagster.Out>` for an output that will dynamically alter the graph at\n runtime.\n\n When using in a composition function such as :py:func:`@graph <dagster.graph>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream ops for each separate :py:class:`DynamicOut`\n * ``collect`` - gather across all :py:class:`DynamicOut` in to a list\n\n Uses the same constructor as :py:class:`Out <dagster.Out>`\n\n .. code-block:: python\n\n @op(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n out=DynamicOut(str),\n )\n def files_in_directory(context):\n path = context.op_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @job\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke an op on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results in to a list\n summarize_directory(file_results.collect())\n """\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n return DynamicOutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return True
\n\n\n
[docs]class GraphOut(NamedTuple("_GraphOut", [("description", PublicAttr[Optional[str]])])):\n """Represents information about the outputs that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the output.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphOut, cls).__new__(cls, description=description)\n\n def to_definition(self, name: Optional[str]) -> "OutputDefinition":\n return OutputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.output"}, "partition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition

\nimport copy\nimport hashlib\nimport json\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import (\n    datetime,\n    timedelta,\n)\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    Union,\n    cast,\n)\n\nfrom dateutil.relativedelta import relativedelta\nfrom typing_extensions import TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, public\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.run_request import (\n    AddDynamicPartitionsRequest,\n    DeleteDynamicPartitionsRequest,\n)\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG, PARTITION_SET_TAG\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import xor\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import (\n    normalize_renamed_param,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom .config import ConfigMapping\nfrom .utils import validate_tags\n\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\n\nT_cov = TypeVar("T_cov", default=Any, covariant=True)\nT_str = TypeVar("T_str", bound=str, default=str, covariant=True)\nT_PartitionsDefinition = TypeVar(\n    "T_PartitionsDefinition",\n    bound="PartitionsDefinition",\n    default="PartitionsDefinition",\n    covariant=True,\n)\n\n# In the Dagster UI users can select partition ranges following the format '2022-01-13...2022-01-14'\n# "..." 
is an invalid substring in partition keys\n# The other escape characters are characters that may not display in the Dagster UI.\nINVALID_PARTITION_SUBSTRINGS = ["...", "\\a", "\\b", "\\f", "\\n", "\\r", "\\t", "\\v", "\\0"]\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use string partition keys instead.")\nclass Partition(Generic[T_cov]):\n    """A Partition represents a single slice of the entire set of a job's possible work. It consists\n    of a value, which is an object that represents that partition, and an optional name, which is\n    used to label the partition in a human-readable way.\n\n    Args:\n        value (Any): The object for this partition\n        name (str): Name for this partition\n    """\n\n    def __init__(self, value: Any, name: Optional[str] = None):\n        self._value = value\n        self._name = check.str_param(name or str(value), "name")\n\n    @property\n    def value(self) -> T_cov:\n        return self._value\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, Partition):\n            return False\n        else:\n            return self.value == other.value and self.name == other.name\n\n\n@whitelist_for_serdes\nclass ScheduleType(Enum):\n    HOURLY = "HOURLY"\n    DAILY = "DAILY"\n    WEEKLY = "WEEKLY"\n    MONTHLY = "MONTHLY"\n\n    @property\n    def ordinal(self):\n        return {"HOURLY": 1, "DAILY": 2, "WEEKLY": 3, "MONTHLY": 4}[self.value]\n\n    @property\n    def delta(self):\n        if self == ScheduleType.HOURLY:\n            return timedelta(hours=1)\n        elif self == ScheduleType.DAILY:\n            return timedelta(days=1)\n        elif self == ScheduleType.WEEKLY:\n            return timedelta(weeks=1)\n        elif self == ScheduleType.MONTHLY:\n            return relativedelta(months=1)\n        else:\n            check.failed(f"Unexpected ScheduleType {self}")\n\n    def __gt__(self, other: 
"ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal > other.ordinal\n\n    def __lt__(self, other: "ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal < other.ordinal\n\n\n
[docs]class PartitionsDefinition(ABC, Generic[T_str]):\n """Defines a set of partitions, which can be attached to a software-defined asset or job.\n\n Abstract class with implementations for different kinds of partitions.\n """\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset[T_str]"]:\n return DefaultPartitionsSubset[T_str]\n\n
[docs] @abstractmethod\n @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[T_str]:\n """Returns a list of strings representing the partition keys of the PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[str]\n """\n ...
\n\n def __str__(self) -> str:\n joined_keys = ", ".join([f"'{key}'" for key in self.get_partition_keys()])\n return joined_keys\n\n def get_last_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[T_str]:\n partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)\n return partition_keys[-1] if partition_keys else None\n\n def get_first_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[T_str]:\n partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)\n return partition_keys[0] if partition_keys else None\n\n def get_partition_keys_in_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[T_str]:\n keys_exist = {\n partition_key_range.start: self.has_partition_key(\n partition_key_range.start, dynamic_partitions_store=dynamic_partitions_store\n ),\n partition_key_range.end: self.has_partition_key(\n partition_key_range.end, dynamic_partitions_store=dynamic_partitions_store\n ),\n }\n if not all(keys_exist.values()):\n raise DagsterInvalidInvocationError(\n f"""Partition range {partition_key_range.start} to {partition_key_range.end} is\n not a valid range. 
Nonexistent partition keys:\n {list(key for key in keys_exist if keys_exist[key] is False)}"""\n )\n\n # in the simple case, simply return the single key in the range\n if partition_key_range.start == partition_key_range.end:\n return [cast(T_str, partition_key_range.start)]\n\n # defer this call as it is potentially expensive\n partition_keys = self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)\n return partition_keys[\n partition_keys.index(partition_key_range.start) : partition_keys.index(\n partition_key_range.end\n )\n + 1\n ]\n\n def empty_subset(self) -> "PartitionsSubset[T_str]":\n return self.partitions_subset_class.empty_subset(self)\n\n def subset_with_partition_keys(\n self, partition_keys: Iterable[str]\n ) -> "PartitionsSubset[T_str]":\n return self.empty_subset().with_partition_keys(partition_keys)\n\n def subset_with_all_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset[T_str]":\n return self.subset_with_partition_keys(\n self.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n def deserialize_subset(self, serialized: str) -> "PartitionsSubset[T_str]":\n return self.partitions_subset_class.from_serialized(self, serialized)\n\n def can_deserialize_subset(\n self,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n return self.partitions_subset_class.can_deserialize(\n self,\n serialized,\n serialized_partitions_def_unique_id,\n serialized_partitions_def_class_name,\n )\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(\n json.dumps(\n self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)\n ).encode("utf-8")\n ).hexdigest()\n\n def 
get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:\n tags = {PARTITION_NAME_TAG: partition_key}\n return tags\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n return len(self.get_partition_keys(current_time, dynamic_partitions_store))\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n return partition_key in self.get_partition_keys(\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n def validate_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> None:\n if not self.has_partition_key(partition_key, current_time, dynamic_partitions_store):\n raise DagsterUnknownPartitionError(\n f"Could not find a partition with key `{partition_key}`."\n )
\n\n\ndef raise_error_on_invalid_partition_key_substring(partition_keys: Sequence[str]) -> None:\n for partition_key in partition_keys:\n found_invalid_substrs = [\n invalid_substr\n for invalid_substr in INVALID_PARTITION_SUBSTRINGS\n if invalid_substr in partition_key\n ]\n if found_invalid_substrs:\n raise DagsterInvalidDefinitionError(\n f"{found_invalid_substrs} are invalid substrings in a partition key"\n )\n\n\ndef raise_error_on_duplicate_partition_keys(partition_keys: Sequence[str]) -> None:\n counts: Dict[str, int] = defaultdict(lambda: 0)\n for partition_key in partition_keys:\n counts[partition_key] += 1\n found_duplicates = [key for key in counts.keys() if counts[key] > 1]\n if found_duplicates:\n raise DagsterInvalidDefinitionError(\n "Partition keys must be unique. Duplicate instances of partition keys:"\n f" {found_duplicates}."\n )\n\n\n
[docs]class StaticPartitionsDefinition(PartitionsDefinition[str]):\n """A statically-defined set of partitions.\n\n Example:\n .. code-block:: python\n\n from dagster import StaticPartitionsDefinition, asset\n\n oceans_partitions_def = StaticPartitionsDefinition(\n ["arctic", "atlantic", "indian", "pacific", "southern"]\n )\n\n @asset(partitions_def=oceans_partitions_defs)\n def ml_model_for_each_ocean():\n ...\n """\n\n def __init__(self, partition_keys: Sequence[str]):\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n\n raise_error_on_invalid_partition_key_substring(partition_keys)\n raise_error_on_duplicate_partition_keys(partition_keys)\n\n self._partition_keys = partition_keys\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n """Returns a list of strings representing the partition keys of the PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Only applicable to\n DynamicPartitionsDefinitions.\n\n Returns:\n Sequence[str]\n\n """\n return self._partition_keys
\n\n def __hash__(self):\n return hash(self.__repr__())\n\n def __eq__(self, other) -> bool:\n return isinstance(other, StaticPartitionsDefinition) and (\n self is other or self._partition_keys == other.get_partition_keys()\n )\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(partition_keys={self._partition_keys})"\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # We don't currently throw an error when a duplicate partition key is defined\n # in a static partitions definition, though we will at 1.3.0.\n # This ensures that partition counts are correct in the Dagster UI.\n return len(set(self.get_partition_keys(current_time, dynamic_partitions_store)))
\n\n\nclass CachingDynamicPartitionsLoader(DynamicPartitionsStore):\n """A batch loader that caches the partition keys for a given dynamic partitions definition,\n to avoid repeated calls to the database for the same partitions definition.\n """\n\n def __init__(self, instance: DagsterInstance):\n self._instance = instance\n\n @cached_method\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n return self._instance.get_dynamic_partitions(partitions_def_name)\n\n @cached_method\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n return self._instance.has_dynamic_partition(partitions_def_name, partition_key)\n\n\n
[docs]@deprecated_param(\n param="partition_fn",\n breaking_version="2.0",\n additional_warn_text="Provide partition definition name instead.",\n)\nclass DynamicPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_DynamicPartitionsDefinition",\n [\n (\n "partition_fn",\n PublicAttr[\n Optional[\n Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]\n ]\n ],\n ),\n ("name", PublicAttr[Optional[str]]),\n ],\n ),\n):\n """A partitions definition whose partition keys can be dynamically added and removed.\n\n This is useful for cases where the set of partitions is not known at definition time,\n but is instead determined at runtime.\n\n Partitions can be added and removed using `instance.add_dynamic_partitions` and\n `instance.delete_dynamic_partition` methods.\n\n Args:\n name (Optional[str]): The name of the partitions definition.\n partition_fn (Optional[Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]]):\n A function that returns the current set of partitions. This argument is deprecated and\n will be removed in 2.0.0.\n\n Examples:\n .. 
code-block:: python\n\n fruits = DynamicPartitionsDefinition(name="fruits")\n\n @sensor(job=my_job)\n def my_sensor(context):\n return SensorResult(\n run_requests=[RunRequest(partition_key="apple")],\n dynamic_partitions_requests=[fruits.build_add_request(["apple"])]\n )\n """\n\n def __new__(\n cls,\n partition_fn: Optional[\n Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]\n ] = None,\n name: Optional[str] = None,\n ):\n partition_fn = check.opt_callable_param(partition_fn, "partition_fn")\n name = check.opt_str_param(name, "name")\n\n if partition_fn is None and name is None:\n raise DagsterInvalidDefinitionError(\n "Must provide either partition_fn or name to DynamicPartitionsDefinition."\n )\n\n if partition_fn and name:\n raise DagsterInvalidDefinitionError(\n "Cannot provide both partition_fn and name to DynamicPartitionsDefinition."\n )\n\n return super(DynamicPartitionsDefinition, cls).__new__(\n cls,\n partition_fn=check.opt_callable_param(partition_fn, "partition_fn"),\n name=check.opt_str_param(name, "name"),\n )\n\n def _validated_name(self) -> str:\n if self.name is None:\n check.failed(\n "Dynamic partitions definition must have a name to fetch dynamic partitions"\n )\n return self.name\n\n def __eq__(self, other):\n return (\n isinstance(other, DynamicPartitionsDefinition)\n and self.name == other.name\n and self.partition_fn == other.partition_fn\n )\n\n def __hash__(self):\n return hash(tuple(self.__repr__()))\n\n def __str__(self) -> str:\n if self.name:\n return f'Dynamic partitions: "{self._validated_name()}"'\n else:\n return super().__str__()\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n """Returns a list of strings representing the partition keys of the\n PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partitions definitions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[str]\n """\n if self.partition_fn:\n partitions = self.partition_fn(current_time)\n if all(isinstance(partition, Partition) for partition in partitions):\n return [partition.name for partition in partitions] # type: ignore # (illegible conditional)\n else:\n return partitions # type: ignore # (illegible conditional)\n else:\n check.opt_inst_param(\n dynamic_partitions_store, "dynamic_partitions_store", DynamicPartitionsStore\n )\n\n if dynamic_partitions_store is None:\n check.failed(\n "The instance is not available to load partitions. You may be seeing this error"\n " when using dynamic partitions with a version of dagster-webserver or"\n " dagster-cloud that is older than 1.1.18."\n )\n\n return dynamic_partitions_store.get_dynamic_partitions(\n partitions_def_name=self._validated_name()\n )
\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n if self.partition_fn:\n return partition_key in self.get_partition_keys(current_time)\n else:\n if dynamic_partitions_store is None:\n check.failed(\n "The instance is not available to load partitions. You may be seeing this error"\n " when using dynamic partitions with a version of dagster-webserver or"\n " dagster-cloud that is older than 1.1.18."\n )\n\n return dynamic_partitions_store.has_dynamic_partition(\n partitions_def_name=self._validated_name(), partition_key=partition_key\n )\n\n def build_add_request(self, partition_keys: Sequence[str]) -> AddDynamicPartitionsRequest:\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n validated_name = self._validated_name()\n return AddDynamicPartitionsRequest(validated_name, partition_keys)\n\n def build_delete_request(self, partition_keys: Sequence[str]) -> DeleteDynamicPartitionsRequest:\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n validated_name = self._validated_name()\n return DeleteDynamicPartitionsRequest(validated_name, partition_keys)
\n\n\n
[docs]@deprecated_param(\n param="run_config_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use `run_config_for_partition_key_fn` instead.",\n)\n@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use `tags_for_partition_key_fn` instead.",\n)\nclass PartitionedConfig(Generic[T_PartitionsDefinition]):\n """Defines a way of configuring a job where the job can be run on one of a discrete set of\n partitions, and each partition corresponds to run configuration for the job.\n\n Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\n and view the run history across partitions.\n """\n\n def __init__(\n self,\n partitions_def: T_PartitionsDefinition,\n run_config_for_partition_fn: Optional[Callable[[Partition], Mapping[str, Any]]] = None,\n decorated_fn: Optional[Callable[..., Mapping[str, Any]]] = None,\n tags_for_partition_fn: Optional[Callable[[Partition[Any]], Mapping[str, str]]] = None,\n run_config_for_partition_key_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n ):\n self._partitions = check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)\n self._decorated_fn = decorated_fn\n\n check.invariant(\n xor(run_config_for_partition_fn, run_config_for_partition_key_fn),\n "Must provide exactly one of run_config_for_partition_fn or"\n " run_config_for_partition_key_fn",\n )\n check.invariant(\n not (tags_for_partition_fn and tags_for_partition_key_fn),\n "Cannot provide both of tags_for_partition_fn or tags_for_partition_key_fn",\n )\n\n self._run_config_for_partition_fn = check.opt_callable_param(\n run_config_for_partition_fn, "run_config_for_partition_fn"\n )\n self._run_config_for_partition_key_fn = check.opt_callable_param(\n run_config_for_partition_key_fn, "run_config_for_partition_key_fn"\n )\n self._tags_for_partition_fn = 
check.opt_callable_param(\n tags_for_partition_fn, "tags_for_partition_fn"\n )\n self._tags_for_partition_key_fn = check.opt_callable_param(\n tags_for_partition_key_fn, "tags_for_partition_key_fn"\n )\n\n @public\n @property\n def partitions_def(\n self,\n ) -> T_PartitionsDefinition:\n """T_PartitionsDefinition: The partitions definition associated with this PartitionedConfig."""\n return self._partitions\n\n @deprecated(\n breaking_version="2.0",\n additional_warn_text="Use `run_config_for_partition_key_fn` instead.",\n )\n @public\n @property\n def run_config_for_partition_fn(\n self,\n ) -> Optional[Callable[[Partition], Mapping[str, Any]]]:\n """Optional[Callable[[Partition], Mapping[str, Any]]]: A function that accepts a partition\n and returns a dictionary representing the config to attach to runs for that partition.\n Deprecated as of 1.3.3.\n """\n return self._run_config_for_partition_fn\n\n @public\n @property\n def run_config_for_partition_key_fn(\n self,\n ) -> Optional[Callable[[str], Mapping[str, Any]]]:\n """Optional[Callable[[str], Mapping[str, Any]]]: A function that accepts a partition key\n and returns a dictionary representing the config to attach to runs for that partition.\n """\n\n @deprecated(\n breaking_version="2.0", additional_warn_text="Use `tags_for_partition_key_fn` instead."\n )\n @public\n @property\n def tags_for_partition_fn(self) -> Optional[Callable[[Partition], Mapping[str, str]]]:\n """Optional[Callable[[Partition], Mapping[str, str]]]: A function that\n accepts a partition and returns a dictionary of tags to attach to runs for\n that partition. 
Deprecated as of 1.3.3.\n """\n return self._tags_for_partition_fn\n\n @public\n @property\n def tags_for_partition_key_fn(\n self,\n ) -> Optional[Callable[[str], Mapping[str, str]]]:\n """Optional[Callable[[str], Mapping[str, str]]]: A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for\n that partition.\n """\n return self._tags_for_partition_key_fn\n\n
[docs] @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Sequence[str]:\n """Returns a list of partition keys, representing the full set of partitions that\n config can be applied to.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time. Only\n applicable to time-based partitions definitions.\n\n Returns:\n Sequence[str]\n """\n return self.partitions_def.get_partition_keys(current_time)
\n\n # Assumes partition key already validated\n def get_run_config_for_partition_key(\n self,\n partition_key: str,\n ) -> Mapping[str, Any]:\n """Generates the run config corresponding to a partition key.\n\n Args:\n partition_key (str): the key for a partition that should be used to generate a run config.\n """\n # _run_config_for_partition_fn is deprecated, we can remove this branching logic in 2.0\n if self._run_config_for_partition_fn:\n run_config = self._run_config_for_partition_fn(Partition(partition_key))\n elif self._run_config_for_partition_key_fn:\n run_config = self._run_config_for_partition_key_fn(partition_key)\n else:\n check.failed("Unreachable.") # one of the above funcs always defined\n return copy.deepcopy(run_config)\n\n # Assumes partition key already validated\n def get_tags_for_partition_key(\n self,\n partition_key: str,\n job_name: Optional[str] = None,\n ) -> Mapping[str, str]:\n from dagster._core.host_representation.external_data import (\n external_partition_set_name_for_job_name,\n )\n\n # _tags_for_partition_fn is deprecated, we can remove this branching logic in 2.0\n if self._tags_for_partition_fn:\n user_tags = self._tags_for_partition_fn(Partition(partition_key))\n elif self._tags_for_partition_key_fn:\n user_tags = self._tags_for_partition_key_fn(partition_key)\n else:\n user_tags = {}\n user_tags = validate_tags(user_tags, allow_reserved_tags=False)\n\n system_tags = {\n **self.partitions_def.get_tags_for_partition_key(partition_key),\n **(\n # `PartitionSetDefinition` has been deleted but we still need to attach this special tag in\n # order for reexecution against partitions to work properly.\n {PARTITION_SET_TAG: external_partition_set_name_for_job_name(job_name)}\n if job_name\n else {}\n ),\n }\n\n return {**user_tags, **system_tags}\n\n @classmethod\n def from_flexible_config(\n cls,\n config: Optional[Union[ConfigMapping, Mapping[str, object], "PartitionedConfig"]],\n partitions_def: PartitionsDefinition,\n ) -> 
"PartitionedConfig":\n check.invariant(\n not isinstance(config, ConfigMapping),\n "Can't supply a ConfigMapping for 'config' when 'partitions_def' is supplied.",\n )\n\n if isinstance(config, PartitionedConfig):\n check.invariant(\n config.partitions_def == partitions_def,\n "Can't supply a PartitionedConfig for 'config' with a different "\n "PartitionsDefinition than supplied for 'partitions_def'.",\n )\n return config\n else:\n hardcoded_config = config if config else {}\n return cls(\n partitions_def,\n run_config_for_partition_key_fn=lambda _: cast(Mapping, hardcoded_config),\n )\n\n def __call__(self, *args, **kwargs):\n if self._decorated_fn is None:\n raise DagsterInvalidInvocationError(\n "Only PartitionedConfig objects created using one of the partitioned config "\n "decorators can be directly invoked."\n )\n else:\n return self._decorated_fn(*args, **kwargs)
\n\n\n
[docs]@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use tags_for_partition_key_fn instead.",\n)\ndef static_partitioned_config(\n partition_keys: Sequence[str],\n tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig[StaticPartitionsDefinition]]:\n """Creates a static partitioned config for a job.\n\n The provided partition_keys is a static list of strings identifying the set of partitions. The\n list of partitions is static, so while the run config returned by the decorated function may\n change over time, the list of valid partition keys does not.\n\n This has performance advantages over `dynamic_partitioned_config` in terms of loading different\n partition views in the Dagster UI.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_keys (Sequence[str]): A list of valid partition keys, which serve as the range of\n values that can be provided to the decorated run config function.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.sequence_param(partition_keys, "partition_keys", str)\n\n tags_for_partition_key_fn = normalize_renamed_param(\n tags_for_partition_key_fn,\n "tags_for_partition_key_fn",\n tags_for_partition_fn,\n "tags_for_partition_fn",\n )\n\n def inner(\n fn: Callable[[str], Mapping[str, Any]]\n ) -> PartitionedConfig[StaticPartitionsDefinition]:\n return 
PartitionedConfig(\n partitions_def=StaticPartitionsDefinition(partition_keys),\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner
\n\n\ndef partitioned_config(\n partitions_def: PartitionsDefinition,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:\n """Creates a partitioned config for a job given a PartitionsDefinition.\n\n The partitions_def provides the set of partitions, which may change over time\n (for example, when using a DynamicPartitionsDefinition).\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partitions_def: (Optional[DynamicPartitionsDefinition]): PartitionsDefinition for the job\n tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.opt_callable_param(tags_for_partition_key_fn, "tags_for_partition_key_fn")\n\n def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:\n return PartitionedConfig(\n partitions_def=partitions_def,\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner\n\n\n
[docs]@deprecated_param(\n param="tags_for_partition_fn",\n breaking_version="2.0",\n additional_warn_text="Use tags_for_partition_key_fn instead.",\n)\ndef dynamic_partitioned_config(\n partition_fn: Callable[[Optional[datetime]], Sequence[str]],\n tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,\n) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:\n """Creates a dynamic partitioned config for a job.\n\n The provided partition_fn returns a list of strings identifying the set of partitions, given\n an optional datetime argument (representing the current time). The list of partitions returned\n may change over time.\n\n The decorated function takes in a partition key and returns a valid run config for a particular\n target job.\n\n Args:\n partition_fn (Callable[[datetime.datetime], Sequence[str]]): A function that generates a\n list of valid partition keys, which serve as the range of values that can be provided\n to the decorated run config function.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition key and returns a dictionary of tags to attach to runs for that\n partition.\n\n Returns:\n PartitionedConfig\n """\n check.callable_param(partition_fn, "partition_fn")\n\n tags_for_partition_key_fn = normalize_renamed_param(\n tags_for_partition_key_fn,\n "tags_for_partition_key_fn",\n tags_for_partition_fn,\n "tags_for_partition_fn",\n )\n\n def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:\n return PartitionedConfig(\n partitions_def=DynamicPartitionsDefinition(partition_fn),\n run_config_for_partition_key_fn=fn,\n decorated_fn=fn,\n tags_for_partition_key_fn=tags_for_partition_key_fn,\n )\n\n return inner
\n\n\ndef cron_schedule_from_schedule_type_and_offsets(\n schedule_type: ScheduleType,\n minute_offset: int,\n hour_offset: int,\n day_offset: Optional[int],\n) -> str:\n if schedule_type is ScheduleType.HOURLY:\n return f"{minute_offset} * * * *"\n elif schedule_type is ScheduleType.DAILY:\n return f"{minute_offset} {hour_offset} * * *"\n elif schedule_type is ScheduleType.WEEKLY:\n return f"{minute_offset} {hour_offset} * * {day_offset if day_offset is not None else 0}"\n elif schedule_type is ScheduleType.MONTHLY:\n return f"{minute_offset} {hour_offset} {day_offset if day_offset is not None else 1} * *"\n else:\n check.assert_never(schedule_type)\n\n\nclass PartitionsSubset(ABC, Generic[T_str]):\n """Represents a subset of the partitions within a PartitionsDefinition."""\n\n @abstractmethod\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[T_str]: ...\n\n @abstractmethod\n @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[T_str]: ...\n\n @abstractmethod\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]: ...\n\n @abstractmethod\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset[T_str]": ...\n\n def with_partition_key_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset[T_str]":\n return self.with_partition_keys(\n self.partitions_def.get_partition_keys_in_range(\n partition_key_range, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n def __or__(self, other: "PartitionsSubset") -> "PartitionsSubset[T_str]":\n if self is other:\n return self\n return self.with_partition_keys(other.get_partition_keys())\n\n @abstractmethod\n 
def serialize(self) -> str: ...\n\n @classmethod\n @abstractmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition[T_str], serialized: str\n ) -> "PartitionsSubset[T_str]": ...\n\n @classmethod\n @abstractmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool: ...\n\n @property\n @abstractmethod\n def partitions_def(self) -> PartitionsDefinition[T_str]: ...\n\n @abstractmethod\n def __len__(self) -> int: ...\n\n @abstractmethod\n def __contains__(self, value) -> bool: ...\n\n @classmethod\n @abstractmethod\n def empty_subset(\n cls, partitions_def: PartitionsDefinition[T_str]\n ) -> "PartitionsSubset[T_str]": ...\n\n\n@whitelist_for_serdes\nclass SerializedPartitionsSubset(NamedTuple):\n serialized_subset: str\n serialized_partitions_def_unique_id: str\n serialized_partitions_def_class_name: str\n\n @classmethod\n def from_subset(\n cls,\n subset: PartitionsSubset,\n partitions_def: PartitionsDefinition,\n dynamic_partitions_store: DynamicPartitionsStore,\n ):\n return cls(\n serialized_subset=subset.serialize(),\n serialized_partitions_def_unique_id=partitions_def.get_serializable_unique_identifier(\n dynamic_partitions_store\n ),\n serialized_partitions_def_class_name=partitions_def.__class__.__name__,\n )\n\n def can_deserialize(self, partitions_def: Optional[PartitionsDefinition]) -> bool:\n if not partitions_def:\n # Asset had a partitions definition at storage time, but no longer does\n return False\n\n return partitions_def.can_deserialize_subset(\n self.serialized_subset,\n serialized_partitions_def_unique_id=self.serialized_partitions_def_unique_id,\n serialized_partitions_def_class_name=self.serialized_partitions_def_class_name,\n )\n\n def deserialize(self, partitions_def: PartitionsDefinition) -> PartitionsSubset:\n return 
partitions_def.deserialize_subset(self.serialized_subset)\n\n\nclass DefaultPartitionsSubset(PartitionsSubset[T_str]):\n # Every time we change the serialization format, we should increment the version number.\n # This will ensure that we can gracefully degrade when deserializing old data.\n SERIALIZATION_VERSION = 1\n\n def __init__(\n self, partitions_def: PartitionsDefinition[T_str], subset: Optional[Set[T_str]] = None\n ):\n check.opt_set_param(subset, "subset")\n self._partitions_def = partitions_def\n self._subset = subset or set()\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n return (\n set(\n self._partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n )\n - self._subset\n )\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n return self._subset\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n partition_keys = self._partitions_def.get_partition_keys(\n current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n cur_range_start = None\n cur_range_end = None\n result = []\n for partition_key in partition_keys:\n if partition_key in self._subset:\n if cur_range_start is None:\n cur_range_start = partition_key\n cur_range_end = partition_key\n else:\n if cur_range_start is not None and cur_range_end is not None:\n result.append(PartitionKeyRange(cur_range_start, cur_range_end))\n cur_range_start = cur_range_end = None\n\n if cur_range_start is not None and cur_range_end is not None:\n result.append(PartitionKeyRange(cur_range_start, cur_range_end))\n\n return result\n\n def with_partition_keys(\n self, partition_keys: Iterable[T_str]\n ) -> 
"DefaultPartitionsSubset[T_str]":\n return DefaultPartitionsSubset(\n self._partitions_def,\n self._subset | set(partition_keys),\n )\n\n def serialize(self) -> str:\n # Serialize version number, so attempting to deserialize old versions can be handled gracefully.\n # Any time the serialization format changes, we should increment the version number.\n return json.dumps({"version": self.SERIALIZATION_VERSION, "subset": list(self._subset)})\n\n @classmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition[T_str], serialized: str\n ) -> "PartitionsSubset[T_str]":\n # Check the version number, so only valid versions can be deserialized.\n data = json.loads(serialized)\n\n if isinstance(data, list):\n # backwards compatibility\n return cls(subset=set(data), partitions_def=partitions_def)\n else:\n if data.get("version") != cls.SERIALIZATION_VERSION:\n raise DagsterInvalidDeserializationVersionError(\n f"Attempted to deserialize partition subset with version {data.get('version')},"\n f" but only version {cls.SERIALIZATION_VERSION} is supported."\n )\n return cls(subset=set(data.get("subset")), partitions_def=partitions_def)\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition[T_str],\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n if serialized_partitions_def_class_name is not None:\n return serialized_partitions_def_class_name == partitions_def.__class__.__name__\n\n data = json.loads(serialized)\n return isinstance(data, list) or (\n data.get("subset") is not None and data.get("version") == cls.SERIALIZATION_VERSION\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition[T_str]:\n return self._partitions_def\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, DefaultPartitionsSubset)\n and self._partitions_def == other._partitions_def\n and self._subset == other._subset\n )\n\n def 
__len__(self) -> int:\n return len(self._subset)\n\n def __contains__(self, value) -> bool:\n return value in self._subset\n\n def __repr__(self) -> str:\n return (\n f"DefaultPartitionsSubset(subset={self._subset}, partitions_def={self._partitions_def})"\n )\n\n @classmethod\n def empty_subset(cls, partitions_def: PartitionsDefinition[T_str]) -> "PartitionsSubset[T_str]":\n return cls(partitions_def=partitions_def)\n
", "current_page_name": "_modules/dagster/_core/definitions/partition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition"}, "partition_key_range": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_key_range

\nfrom typing import NamedTuple\n\nfrom dagster._annotations import PublicAttr\n\n\n
[docs]class PartitionKeyRange(NamedTuple):\n """Defines a range of partitions.\n\n Attributes:\n start (str): The starting partition key in the range (inclusive).\n end (str): The ending partition key in the range (inclusive).\n\n Examples:\n .. code-block:: python\n\n partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])\n partition_key_range = PartitionKeyRange(start="a", end="c") # Represents ["a", "b", "c"]\n """\n\n # Inclusive on both sides\n start: PublicAttr[str]\n end: PublicAttr[str]
\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_key_range", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_key_range"}, "partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_mapping

\nimport collections.abc\nimport itertools\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Collection,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._core.definitions.multi_dimensional_partitions import (\n    MultiPartitionKey,\n    MultiPartitionsDefinition,\n)\nfrom dagster._core.definitions.partition import (\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n\nclass UpstreamPartitionsResult(NamedTuple):\n    """Represents the result of mapping a PartitionsSubset to the corresponding\n    partitions in another PartitionsDefinition.\n\n    partitions_subset (PartitionsSubset): The resulting partitions subset that was\n        mapped to. Only contains partitions for existent partitions, filtering out nonexistent partitions.\n    required_but_nonexistent_partition_keys (Sequence[str]): A list containing invalid partition keys in to_partitions_def\n        that partitions in from_partitions_subset were mapped to.\n    """\n\n    partitions_subset: PartitionsSubset\n    required_but_nonexistent_partition_keys: Sequence[str]\n\n\n
[docs]class PartitionMapping(ABC):\n """Defines a correspondence between the partitions in an asset and the partitions in an asset\n that it depends on.\n\n Overriding PartitionMapping outside of Dagster is not supported. The abstract methods of this\n class may change at any time.\n """\n\n
[docs] @public\n @abstractmethod\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the subset of partition keys in the downstream asset that use the data in the given\n partition key subset of the upstream asset.\n\n Args:\n upstream_partitions_subset (Union[PartitionKeyRange, PartitionsSubset]): The\n subset of partition keys in the upstream asset.\n downstream_partitions_def (PartitionsDefinition): The partitions definition for the\n downstream asset.\n """
\n\n
[docs] @public\n @abstractmethod\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n """Returns a UpstreamPartitionsResult object containing the partition keys the downstream\n partitions subset was mapped to in the upstream partitions definition.\n\n Valid upstream partitions will be included in UpstreamPartitionsResult.partitions_subset.\n Invalid upstream partitions will be included in UpstreamPartitionsResult.required_but_nonexistent_partition_keys.\n\n For example, if an upstream asset is time-partitioned and starts in June 2023, and the\n downstream asset is time-partitioned and starts in May 2023, this function would return a\n UpstreamPartitionsResult(PartitionsSubset("2023-06-01"), required_but_nonexistent_partition_keys=["2023-05-01"])\n when downstream_partitions_subset contains 2023-05-01 and 2023-06-01.\n """
\n\n\n
[docs]@whitelist_for_serdes\nclass IdentityPartitionMapping(PartitionMapping, NamedTuple("_IdentityPartitionMapping", [])):\n """Expects that the upstream and downstream assets are partitioned in the same way, and maps\n partitions in the downstream asset to the same partition in the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n if downstream_partitions_subset.partitions_def == upstream_partitions_def:\n return UpstreamPartitionsResult(downstream_partitions_subset, [])\n\n upstream_partition_keys = set(\n upstream_partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n )\n downstream_partition_keys = set(downstream_partitions_subset.get_partition_keys())\n\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(\n list(upstream_partition_keys & downstream_partition_keys)\n ),\n list(downstream_partition_keys - upstream_partition_keys),\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset is not partitioned")\n\n if upstream_partitions_subset.partitions_def == downstream_partitions_def:\n return upstream_partitions_subset\n\n upstream_partition_keys = set(upstream_partitions_subset.get_partition_keys())\n downstream_partition_keys = set(\n downstream_partitions_def.get_partition_keys(\n 
dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n return downstream_partitions_def.empty_subset().with_partition_keys(\n list(downstream_partition_keys & upstream_partition_keys)\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AllPartitionMapping(PartitionMapping, NamedTuple("_AllPartitionMapping", [])):\n """Maps every partition in the downstream asset to every partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on all partitions of the usptream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n upstream_subset = upstream_partitions_def.subset_with_all_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass LastPartitionMapping(PartitionMapping, NamedTuple("_LastPartitionMapping", [])):\n """Maps all dependencies to the last partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on the last partition of the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n last = upstream_partitions_def.get_last_partition_key(\n current_time=None, dynamic_partitions_store=dynamic_partitions_store\n )\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if last is not None:\n upstream_subset = upstream_subset.with_partition_keys([last])\n\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass SpecificPartitionsPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_SpecificPartitionsPartitionMapping", [("partition_keys", PublicAttr[Sequence[str]])]\n ),\n):\n """Maps to a specific subset of partitions in the upstream asset.\n\n Example:\n .. code-block:: python\n\n from dagster import SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset\n\n @asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c"]))\n def upstream():\n ...\n\n @asset(\n ins={\n "upstream": AssetIn(partition_mapping=SpecificPartitionsPartitionMapping(["a"]))\n }\n )\n def a_downstream(upstream):\n ...\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(self.partition_keys), []\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n # if any of the partition keys in this partition mapping are contained within the upstream\n # partitions subset, then all partitions of the downstream asset are dependencies\n if any(key in upstream_partitions_subset for key in self.partition_keys):\n return downstream_partitions_def.subset_with_all_partitions(\n dynamic_partitions_store=dynamic_partitions_store\n )\n return downstream_partitions_def.empty_subset()
\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiToSingleDimensionPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_MultiToSingleDimensionPartitionMapping", [("partition_dimension_name", Optional[str])]\n ),\n):\n """Defines a correspondence between an single-dimensional partitions definition\n and a MultiPartitionsDefinition. The single-dimensional partitions definition must be\n a dimension of the MultiPartitionsDefinition.\n\n This class handles the case where the upstream asset is multipartitioned and the\n downstream asset is single dimensional, and vice versa.\n\n For a partition key X, this partition mapping assumes that any multi-partition key with\n X in the selected dimension is a dependency.\n\n Args:\n partition_dimension_name (Optional[str]): The name of the partition dimension in the\n MultiPartitionsDefinition that matches the single-dimension partitions definition.\n """\n\n def __new__(cls, partition_dimension_name: Optional[str] = None):\n return super(MultiToSingleDimensionPartitionMapping, cls).__new__(\n cls,\n partition_dimension_name=check.opt_str_param(\n partition_dimension_name, "partition_dimension_name"\n ),\n )\n\n def _check_partitions_defs_and_get_partition_dimension_name(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Tuple[PartitionsDefinition, PartitionsDefinition, str]:\n if not _can_infer_single_to_multi_partition_mapping(\n upstream_partitions_def, downstream_partitions_def\n ):\n check.failed(\n "This partition mapping defines a relationship between a multipartitioned and"\n " single dimensional asset. 
The single dimensional partitions definition must be a"\n " dimension of the MultiPartitionsDefinition."\n )\n\n multipartitions_def = cast(\n MultiPartitionsDefinition,\n (\n upstream_partitions_def\n if isinstance(upstream_partitions_def, MultiPartitionsDefinition)\n else downstream_partitions_def\n ),\n )\n\n single_dimension_partitions_def = (\n upstream_partitions_def\n if not isinstance(upstream_partitions_def, MultiPartitionsDefinition)\n else downstream_partitions_def\n )\n\n if self.partition_dimension_name is None:\n dimension_partitions_defs = [\n partitions_def.partitions_def\n for partitions_def in multipartitions_def.partitions_defs\n ]\n if len(set(dimension_partitions_defs)) != len(dimension_partitions_defs):\n check.failed(\n "Partition dimension name must be specified on the "\n "MultiToSingleDimensionPartitionMapping object when dimensions of a"\n " MultiPartitions definition share the same partitions definition."\n )\n matching_dimension_defs = [\n dimension_def\n for dimension_def in multipartitions_def.partitions_defs\n if dimension_def.partitions_def == single_dimension_partitions_def\n ]\n if len(matching_dimension_defs) != 1:\n check.failed(\n "No partition dimension name was specified and no dimensions of the"\n " MultiPartitionsDefinition match the single dimension"\n " PartitionsDefinition."\n )\n partition_dimension_name = next(iter(matching_dimension_defs)).name\n else:\n matching_dimensions = [\n partitions_def\n for partitions_def in multipartitions_def.partitions_defs\n if partitions_def.name == self.partition_dimension_name\n ]\n if len(matching_dimensions) != 1:\n check.failed(f"Partition dimension '{self.partition_dimension_name}' not found")\n matching_dimension_def = next(iter(matching_dimensions))\n\n if single_dimension_partitions_def != matching_dimension_def.partitions_def:\n check.failed(\n "The single dimension partitions definition does not have the same partitions"\n f" definition as dimension 
{matching_dimension_def.name}"\n )\n partition_dimension_name = self.partition_dimension_name\n\n return (upstream_partitions_def, downstream_partitions_def, partition_dimension_name)\n\n def _get_matching_multipartition_keys_for_single_dim_subset(\n self,\n partitions_subset: PartitionsSubset,\n multipartitions_def: MultiPartitionsDefinition,\n partition_dimension_name: str,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n matching_keys = []\n for key in multipartitions_def.get_partition_keys(\n current_time=None, dynamic_partitions_store=dynamic_partitions_store\n ):\n key = cast(MultiPartitionKey, key)\n if (\n key.keys_by_dimension[partition_dimension_name]\n in partitions_subset.get_partition_keys()\n ):\n matching_keys.append(key)\n return matching_keys\n\n def _get_single_dim_keys_from_multipartitioned_subset(\n self,\n partitions_subset: PartitionsSubset,\n partition_dimension_name: str,\n ) -> Set[str]:\n upstream_partitions = set()\n for partition_key in partitions_subset.get_partition_keys():\n if not isinstance(partition_key, MultiPartitionKey):\n check.failed("Partition keys in subset must be MultiPartitionKeys")\n upstream_partitions.add(partition_key.keys_by_dimension[partition_dimension_name])\n return upstream_partitions\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n (\n upstream_partitions_def,\n _,\n partition_dimension_name,\n ) = self._check_partitions_defs_and_get_partition_dimension_name(\n upstream_partitions_def, downstream_partitions_subset.partitions_def\n )\n\n if isinstance(upstream_partitions_def, 
MultiPartitionsDefinition):\n # upstream partitions def is multipartitioned\n # downstream partitions def has single dimension\n return UpstreamPartitionsResult(\n upstream_partitions_def.empty_subset().with_partition_keys(\n self._get_matching_multipartition_keys_for_single_dim_subset(\n downstream_partitions_subset,\n cast(MultiPartitionsDefinition, upstream_partitions_def),\n partition_dimension_name,\n dynamic_partitions_store,\n )\n ),\n [],\n )\n else:\n # upstream partitions_def has single dimension\n # downstream partitions def is multipartitioned\n return UpstreamPartitionsResult(\n upstream_partitions_def.empty_subset().with_partition_keys(\n self._get_single_dim_keys_from_multipartitioned_subset(\n downstream_partitions_subset, partition_dimension_name\n )\n ),\n [],\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if downstream_partitions_def is None:\n check.failed("downstream asset is not multi-partitioned")\n\n (\n _,\n downstream_partitions_def,\n partition_dimension_name,\n ) = self._check_partitions_defs_and_get_partition_dimension_name(\n upstream_partitions_subset.partitions_def, downstream_partitions_def\n )\n\n if isinstance(downstream_partitions_def, MultiPartitionsDefinition):\n # upstream partitions def has single dimension\n # downstream partitions def is multipartitioned\n return downstream_partitions_def.empty_subset().with_partition_keys(\n self._get_matching_multipartition_keys_for_single_dim_subset(\n upstream_partitions_subset,\n downstream_partitions_def,\n partition_dimension_name,\n dynamic_partitions_store,\n )\n )\n else:\n # upstream partitions def is multipartitioned\n # downstream partitions def has single dimension\n return 
downstream_partitions_def.empty_subset().with_partition_keys(\n self._get_single_dim_keys_from_multipartitioned_subset(\n upstream_partitions_subset, partition_dimension_name\n )\n )
\n\n\n@whitelist_for_serdes\nclass DimensionPartitionMapping(\n NamedTuple(\n "_DimensionPartitionMapping",\n [\n ("dimension_name", str),\n ("partition_mapping", PartitionMapping),\n ],\n )\n):\n """A helper class for MultiPartitionMapping that defines a partition mapping used to calculate\n the dependent partition keys in the selected downstream MultiPartitions definition dimension.\n\n Args:\n dimension_name (str): The name of the dimension in the downstream MultiPartitionsDefinition.\n partition_mapping (PartitionMapping): The partition mapping object used to calculate\n the downstream dimension partitions from the upstream dimension partitions and vice versa.\n """\n\n def __new__(\n cls,\n dimension_name: str,\n partition_mapping: PartitionMapping,\n ):\n return super(DimensionPartitionMapping, cls).__new__(\n cls,\n dimension_name=check.str_param(dimension_name, "dimension_name"),\n partition_mapping=check.inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n )\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_MultiPartitionMapping",\n [("downstream_mappings_by_upstream_dimension", Mapping[str, DimensionPartitionMapping])],\n ),\n):\n """Defines a correspondence between two MultiPartitionsDefinitions.\n\n Accepts a mapping of upstream dimension name to downstream DimensionPartitionMapping, representing\n the explicit correspondence between the upstream and downstream MultiPartitions dimensions\n and the partition mapping used to calculate the downstream partitions.\n\n Examples:\n .. code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "weekly": WeeklyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "abc": DimensionPartitionMapping(\n dimension_name="123",\n partition_mapping=StaticPartitionMapping({"a": "1", "b": "2", "c": "3"}),\n ),\n "weekly": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=TimeWindowPartitionMapping(),\n )\n }\n )\n\n For upstream or downstream dimensions not explicitly defined in the mapping, Dagster will\n assume an `AllPartitionsMapping`, meaning that all upstream partitions in those dimensions\n will be mapped to all downstream partitions in those dimensions.\n\n Examples:\n .. 
code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "daily": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=IdentityPartitionMapping(),\n )\n }\n )\n\n # Will map `daily_123` partition key {"123": "1", "daily": "2023-01-01"} to the upstream:\n # {"abc": "a", "daily": "2023-01-01"}\n # {"abc": "b", "daily": "2023-01-01"}\n # {"abc": "c", "daily": "2023-01-01"}\n\n Args:\n downstream_mappings_by_upstream_dimension (Mapping[str, DimensionPartitionMapping]): A\n mapping that defines an explicit correspondence between one dimension of the upstream\n MultiPartitionsDefinition and one dimension of the downstream MultiPartitionsDefinition.\n Maps a string representing upstream dimension name to downstream DimensionPartitionMapping,\n containing the downstream dimension name and partition mapping.\n """\n\n def __new__(\n cls, downstream_mappings_by_upstream_dimension: Mapping[str, DimensionPartitionMapping]\n ):\n return super(MultiPartitionMapping, cls).__new__(\n cls,\n downstream_mappings_by_upstream_dimension=check.mapping_param(\n downstream_mappings_by_upstream_dimension,\n "downstream_mappings_by_upstream_dimension",\n key_type=str,\n value_type=DimensionPartitionMapping,\n ),\n )\n\n def _check_all_dimensions_accounted_for(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> None:\n if any(\n not isinstance(partitions_def, MultiPartitionsDefinition)\n for partitions_def in (upstream_partitions_def, downstream_partitions_def)\n ):\n check.failed(\n "Both partitions defs provided to a MultiPartitionMapping must be multi-partitioned"\n )\n\n 
upstream_dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, upstream_partitions_def).partitions_defs\n }\n dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, downstream_partitions_def).partitions_defs\n }\n\n for (\n upstream_dimension_name,\n dimension_mapping,\n ) in self.downstream_mappings_by_upstream_dimension.items():\n if upstream_dimension_name not in upstream_dimension_names:\n check.failed(\n "Dimension mapping has an upstream dimension name that is not in the upstream "\n "partitions def"\n )\n if dimension_mapping.dimension_name not in dimension_names:\n check.failed(\n "Dimension mapping has a downstream dimension name that is not in the"\n " downstream partitions def"\n )\n\n upstream_dimension_names.remove(upstream_dimension_name)\n dimension_names.remove(dimension_mapping.dimension_name)\n\n def _get_dependency_partitions_subset(\n self,\n a_partitions_def: MultiPartitionsDefinition,\n a_partition_keys: Sequence[MultiPartitionKey],\n b_partitions_def: MultiPartitionsDefinition,\n a_upstream_of_b: bool,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Union[UpstreamPartitionsResult, PartitionsSubset]:\n """Given two partitions definitions a_partitions_def and b_partitions_def that have a dependency\n relationship (a_upstream_of_b is True if a_partitions_def is upstream of b_partitions_def),\n and a_partition_keys, a list of partition keys in a_partitions_def, returns a list of\n partition keys in the partitions definition b_partitions_def that are\n dependencies of the partition keys in a_partition_keys.\n """\n a_partition_keys_by_dimension = defaultdict(set)\n for partition_key in a_partition_keys:\n for dimension_name, key in partition_key.keys_by_dimension.items():\n a_partition_keys_by_dimension[dimension_name].add(key)\n\n # Maps the dimension name and key of a partition in a_partitions_def to the list of\n # partition keys in 
b_partitions_def that are dependencies of that partition\n dep_b_keys_by_a_dim_and_key: Dict[str, Dict[str, List[str]]] = defaultdict(\n lambda: defaultdict(list)\n )\n required_but_nonexistent_upstream_partitions = set()\n\n b_dimension_partitions_def_by_name = {\n dimension.name: dimension.partitions_def\n for dimension in b_partitions_def.partitions_defs\n }\n\n if a_upstream_of_b:\n # a_partitions_def is upstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependent dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n upstream_dim: (\n dimension_mapping.dimension_name,\n dimension_mapping.partition_mapping,\n )\n for upstream_dim, dimension_mapping in self.downstream_mappings_by_upstream_dimension.items()\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n dimension_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = a_partitions_def.get_partitions_def_for_dimension(\n a_dim_name\n )\n b_dimension_partitions_def = b_partitions_def.get_partitions_def_for_dimension(\n b_dim_name\n )\n for key in keys:\n # if downstream dimension mapping exists, for a given key, get the list of\n # downstream partition keys that are dependencies of that key\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n dimension_mapping.get_downstream_partitions_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ).get_partition_keys()\n )\n\n else:\n # a_partitions_def is downstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependency dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n dimension_mapping.dimension_name: (\n upstream_dim,\n 
dimension_mapping.partition_mapping,\n )\n for upstream_dim, dimension_mapping in self.downstream_mappings_by_upstream_dimension.items()\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n partition_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = a_partitions_def.get_partitions_def_for_dimension(\n a_dim_name\n )\n b_dimension_partitions_def = b_partitions_def.get_partitions_def_for_dimension(\n b_dim_name\n )\n for key in keys:\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n )\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n mapped_partitions_result.partitions_subset.get_partition_keys()\n )\n required_but_nonexistent_upstream_partitions.update(\n set(mapped_partitions_result.required_but_nonexistent_partition_keys)\n )\n\n b_partition_keys = set()\n\n mapped_a_dim_names = a_dim_to_dependency_b_dim.keys()\n mapped_b_dim_names = [mapping[0] for mapping in a_dim_to_dependency_b_dim.values()]\n unmapped_b_dim_names = list(\n set(b_dimension_partitions_def_by_name.keys()) - set(mapped_b_dim_names)\n )\n\n for key in a_partition_keys:\n for b_key_values in itertools.product(\n *(\n [\n dep_b_keys_by_a_dim_and_key[dim_name][key.keys_by_dimension[dim_name]]\n for dim_name in mapped_a_dim_names\n ]\n ),\n *[\n b_dimension_partitions_def_by_name[dim_name].get_partition_keys()\n for dim_name in unmapped_b_dim_names\n ],\n ):\n b_partition_keys.add(\n MultiPartitionKey(\n {\n (mapped_b_dim_names + unmapped_b_dim_names)[i]: key\n for i, key in enumerate(b_key_values)\n }\n )\n )\n\n mapped_subset = b_partitions_def.empty_subset().with_partition_keys(b_partition_keys)\n if a_upstream_of_b:\n return 
mapped_subset\n else:\n return UpstreamPartitionsResult(\n mapped_subset,\n required_but_nonexistent_partition_keys=list(\n required_but_nonexistent_upstream_partitions\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n self._check_all_dimensions_accounted_for(\n upstream_partitions_def,\n downstream_partitions_subset.partitions_def,\n )\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, downstream_partitions_subset.partitions_def),\n list(\n cast(Sequence[MultiPartitionKey], downstream_partitions_subset.get_partition_keys())\n ),\n cast(MultiPartitionsDefinition, upstream_partitions_def),\n a_upstream_of_b=False,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n if not isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected UpstreamPartitionsResult")\n\n return result\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset is not partitioned")\n\n self._check_all_dimensions_accounted_for(\n upstream_partitions_subset.partitions_def,\n downstream_partitions_def,\n )\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, upstream_partitions_subset.partitions_def),\n list(\n cast(Sequence[MultiPartitionKey], upstream_partitions_subset.get_partition_keys())\n ),\n cast(MultiPartitionsDefinition, 
downstream_partitions_def),\n a_upstream_of_b=True,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n if isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected PartitionsSubset")\n\n return result
\n\n\n
[docs]@whitelist_for_serdes\nclass StaticPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_StaticPartitionMapping",\n [\n (\n "downstream_partition_keys_by_upstream_partition_key",\n PublicAttr[Mapping[str, Union[str, Collection[str]]]],\n )\n ],\n ),\n):\n """Define an explicit correspondence between two StaticPartitionsDefinitions.\n\n Args:\n downstream_partition_keys_by_upstream_partition_key (Dict[str, str | Collection[str]]):\n The single or multi-valued correspondence from upstream keys to downstream keys.\n """\n\n def __init__(\n self,\n downstream_partition_keys_by_upstream_partition_key: Mapping[\n str, Union[str, Collection[str]]\n ],\n ):\n check.mapping_param(\n downstream_partition_keys_by_upstream_partition_key,\n "downstream_partition_keys_by_upstream_partition_key",\n key_type=str,\n value_type=(str, collections.abc.Collection),\n )\n\n # cache forward and reverse mappings\n self._mapping = defaultdict(set)\n for (\n upstream_key,\n downstream_keys,\n ) in downstream_partition_keys_by_upstream_partition_key.items():\n self._mapping[upstream_key] = (\n {downstream_keys} if isinstance(downstream_keys, str) else set(downstream_keys)\n )\n\n self._inverse_mapping = defaultdict(set)\n for upstream_key, downstream_keys in self._mapping.items():\n for downstream_key in downstream_keys:\n self._inverse_mapping[downstream_key].add(upstream_key)\n\n @cached_method\n def _check_upstream(self, *, upstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream is only defined on upstream keys."""\n check.inst(\n upstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n upstream_keys = upstream_partitions_def.get_partition_keys()\n extra_keys = set(self._mapping.keys()).difference(upstream_keys)\n if extra_keys:\n raise ValueError(\n f"mapping source partitions not in the upstream partitions definition: {extra_keys}"\n 
)\n\n @cached_method\n def _check_downstream(self, *, downstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream only maps to downstream keys."""\n check.inst(\n downstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n downstream_keys = downstream_partitions_def.get_partition_keys()\n extra_keys = set(self._inverse_mapping.keys()).difference(downstream_keys)\n if extra_keys:\n raise ValueError(\n "mapping target partitions not in the downstream partitions definition:"\n f" {extra_keys}"\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n self._check_downstream(downstream_partitions_def=downstream_partitions_def)\n\n downstream_subset = downstream_partitions_def.empty_subset()\n downstream_keys = set()\n for key in upstream_partitions_subset.get_partition_keys():\n downstream_keys.update(self._mapping[key])\n return downstream_subset.with_partition_keys(downstream_keys)\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n self._check_upstream(upstream_partitions_def=upstream_partitions_def)\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if downstream_partitions_subset is None:\n return UpstreamPartitionsResult(upstream_subset, [])\n\n upstream_keys = set()\n for key in downstream_partitions_subset.get_partition_keys():\n upstream_keys.update(self._inverse_mapping[key])\n\n return 
UpstreamPartitionsResult(upstream_subset.with_partition_keys(upstream_keys), [])
\n\n\ndef _can_infer_single_to_multi_partition_mapping(\n upstream_partitions_def: PartitionsDefinition, downstream_partitions_def: PartitionsDefinition\n) -> bool:\n multipartitions_defs = [\n partitions_def\n for partitions_def in [upstream_partitions_def, downstream_partitions_def]\n if isinstance(partitions_def, MultiPartitionsDefinition)\n ]\n\n if len(multipartitions_defs) != 1:\n return False\n\n multipartitions_def = cast(MultiPartitionsDefinition, next(iter(multipartitions_defs)))\n\n single_dimension_partitions_def = next(\n iter(\n {\n upstream_partitions_def,\n downstream_partitions_def,\n }\n - set(multipartitions_defs)\n )\n )\n\n matching_dimension_defs = [\n dimension_def\n for dimension_def in multipartitions_def.partitions_defs\n if dimension_def.partitions_def == single_dimension_partitions_def\n ]\n\n if not matching_dimension_defs:\n return False\n\n return True\n\n\ndef infer_partition_mapping(\n partition_mapping: Optional[PartitionMapping],\n downstream_partitions_def: Optional[PartitionsDefinition],\n upstream_partitions_def: Optional[PartitionsDefinition],\n) -> PartitionMapping:\n from .time_window_partition_mapping import TimeWindowPartitionMapping\n\n if partition_mapping is not None:\n return partition_mapping\n elif upstream_partitions_def and downstream_partitions_def:\n if _can_infer_single_to_multi_partition_mapping(\n upstream_partitions_def, downstream_partitions_def\n ):\n with disable_dagster_warnings():\n return MultiToSingleDimensionPartitionMapping()\n elif isinstance(upstream_partitions_def, TimeWindowPartitionsDefinition) and isinstance(\n downstream_partitions_def, TimeWindowPartitionsDefinition\n ):\n return TimeWindowPartitionMapping()\n else:\n return IdentityPartitionMapping()\n else:\n return AllPartitionMapping()\n\n\ndef get_builtin_partition_mapping_types() -> Tuple[Type[PartitionMapping], ...]:\n from dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\n\n return (\n 
AllPartitionMapping,\n IdentityPartitionMapping,\n LastPartitionMapping,\n SpecificPartitionsPartitionMapping,\n StaticPartitionMapping,\n TimeWindowPartitionMapping,\n MultiToSingleDimensionPartitionMapping,\n MultiPartitionMapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_mapping"}, "partitioned_schedule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partitioned_schedule

\nfrom typing import Callable, Mapping, NamedTuple, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .decorators.schedule_decorator import schedule\nfrom .job_definition import JobDefinition\nfrom .multi_dimensional_partitions import MultiPartitionsDefinition\nfrom .partition import PartitionsDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .time_window_partitions import (\n    TimeWindowPartitionsDefinition,\n    get_time_partitions_def,\n    has_one_dimension_time_window_partitioning,\n)\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\n\nclass UnresolvedPartitionedAssetScheduleDefinition(NamedTuple):\n    """Points to an unresolved asset job. The asset selection isn't resolved yet, so we can't resolve\n    the PartitionsDefinition, so we can't resolve the schedule cadence.\n    """\n\n    name: str\n    job: UnresolvedAssetJobDefinition\n    description: Optional[str]\n    default_status: DefaultScheduleStatus\n    minute_of_hour: Optional[int]\n    hour_of_day: Optional[int]\n    day_of_week: Optional[int]\n    day_of_month: Optional[int]\n    tags: Optional[Mapping[str, str]]\n\n    def resolve(self, resolved_job: JobDefinition) -> ScheduleDefinition:\n        partitions_def = resolved_job.partitions_def\n        if partitions_def is None:\n            check.failed(\n                f"Job '{resolved_job.name}' provided to build_schedule_from_partitioned_job must"\n                " contain partitioned assets or a partitions definition."\n            )\n\n        partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n        time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n        return ScheduleDefinition(\n            job=resolved_job,\n        
    name=self.name,\n            execution_fn=_get_schedule_evaluation_fn(partitions_def, resolved_job, self.tags),\n            execution_timezone=time_partitions_def.timezone,\n            cron_schedule=time_partitions_def.get_cron_schedule(\n                self.minute_of_hour, self.hour_of_day, self.day_of_week, self.day_of_month\n            ),\n        )\n\n\n
[docs]def build_schedule_from_partitioned_job(\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n description: Optional[str] = None,\n name: Optional[str] = None,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n tags: Optional[Mapping[str, str]] = None,\n) -> Union[UnresolvedPartitionedAssetScheduleDefinition, ScheduleDefinition]:\n """Creates a schedule from a time window-partitioned job or a job that targets\n time window-partitioned assets. The job can also be multipartitioned, as long as one\n of the partitions dimensions is time-partitioned.\n\n The schedule executes at the cadence specified by the time partitioning of the job or assets.\n\n Examples:\n .. code-block:: python\n\n ######################################\n # Job that targets partitioned assets\n ######################################\n\n from dagster import (\n DailyPartitionsDefinition,\n asset,\n build_schedule_from_partitioned_job,\n define_asset_job,\n )\n\n @asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def asset1():\n ...\n\n asset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n # The created schedule will fire daily\n asset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\n defs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n ################\n # Non-asset job\n ################\n\n from dagster import DailyPartitionsDefinition, build_schedule_from_partitioned_job, jog\n\n\n @job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def do_stuff_partitioned():\n ...\n\n # The created schedule will fire daily\n do_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n do_stuff_partitioned,\n )\n\n defs = Definitions(schedules=[do_stuff_partitioned_schedule])\n """\n check.invariant(\n not 
(day_of_week and day_of_month),\n "Cannot provide both day_of_month and day_of_week parameter to"\n " build_schedule_from_partitioned_job.",\n )\n\n if isinstance(job, UnresolvedAssetJobDefinition) and job.partitions_def is None:\n return UnresolvedPartitionedAssetScheduleDefinition(\n job=job,\n default_status=default_status,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n minute_of_hour=minute_of_hour,\n hour_of_day=hour_of_day,\n day_of_week=day_of_week,\n day_of_month=day_of_month,\n tags=tags,\n )\n else:\n partitions_def = job.partitions_def\n if partitions_def is None:\n check.failed("The provided job is not partitioned")\n\n partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n return schedule(\n cron_schedule=time_partitions_def.get_cron_schedule(\n minute_of_hour, hour_of_day, day_of_week, day_of_month\n ),\n job=job,\n default_status=default_status,\n execution_timezone=time_partitions_def.timezone,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n )(_get_schedule_evaluation_fn(partitions_def, job, tags))
\n\n\ndef _get_schedule_evaluation_fn(\n partitions_def: PartitionsDefinition,\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n tags: Optional[Mapping[str, str]] = None,\n) -> Callable[[ScheduleEvaluationContext], Union[SkipReason, RunRequest, RunRequestIterator]]:\n def schedule_fn(context):\n # Run for the latest partition. Prior partitions will have been handled by prior ticks.\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n partition_key = partitions_def.get_last_partition_key(context.scheduled_execution_time)\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return job.run_request_for_partition(\n partition_key=partition_key,\n run_key=partition_key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n )\n else:\n check.invariant(isinstance(partitions_def, MultiPartitionsDefinition))\n time_window_dimension = partitions_def.time_window_dimension\n partition_key = time_window_dimension.partitions_def.get_last_partition_key(\n context.scheduled_execution_time\n )\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return [\n job.run_request_for_partition(\n partition_key=key,\n run_key=key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n for key in partitions_def.get_multipartition_keys_with_dimension_value(\n time_window_dimension.name,\n partition_key,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n ]\n\n return schedule_fn\n\n\ndef _check_valid_schedule_partitions_def(\n partitions_def: PartitionsDefinition,\n) -> Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition]:\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise DagsterInvalidDefinitionError(\n "Tried to build a partitioned schedule from an asset job, but received an invalid"\n " 
partitions definition. The permitted partitions definitions are: \\n1."\n " TimeWindowPartitionsDefinition\\n2. MultiPartitionsDefinition with a single"\n " TimeWindowPartitionsDefinition dimension"\n )\n\n return cast(Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def)\n\n\nschedule_from_partitions = build_schedule_from_partitioned_job\n
", "current_page_name": "_modules/dagster/_core/definitions/partitioned_schedule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partitioned_schedule"}, "policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.policy

\nfrom enum import Enum\nfrom random import random\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\n\n
[docs]class Backoff(Enum):\n """A modifier for delay as a function of attempt number.\n\n LINEAR: `attempt_num * delay`\n EXPONENTIAL: `((2 ^ attempt_num) - 1) * delay`\n """\n\n LINEAR = "LINEAR"\n EXPONENTIAL = "EXPONENTIAL"
\n\n\n
[docs]class Jitter(Enum):\n """A randomizing modifier for delay, applied after backoff calculation.\n\n FULL: between 0 and the calculated delay based on backoff: `random() * backoff_delay`\n PLUS_MINUS: +/- the delay: `backoff_delay + ((2 * (random() * delay)) - delay)`\n """\n\n FULL = "FULL"\n PLUS_MINUS = "PLUS_MINUS"
\n\n\n
[docs]class RetryPolicy(\n NamedTuple(\n "_RetryPolicy",\n [\n ("max_retries", PublicAttr[int]),\n ("delay", PublicAttr[Optional[check.Numeric]]),\n # declarative time modulation to allow calc witout running user function\n ("backoff", PublicAttr[Optional[Backoff]]),\n ("jitter", PublicAttr[Optional[Jitter]]),\n ],\n ),\n):\n """A declarative policy for when to request retries when an exception occurs during op execution.\n\n Args:\n max_retries (int):\n The maximum number of retries to attempt. Defaults to 1.\n delay (Optional[Union[int,float]]):\n The time in seconds to wait between the retry being requested and the next attempt\n being started. This unit of time can be modulated as a function of attempt number\n with backoff and randomly with jitter.\n backoff (Optional[Backoff]):\n A modifier for delay as a function of retry attempt number.\n jitter (Optional[Jitter]):\n A randomizing modifier for delay, applied after backoff calculation.\n """\n\n def __new__(\n cls,\n max_retries: int = 1,\n delay: Optional[check.Numeric] = None,\n backoff: Optional[Backoff] = None,\n jitter: Optional[Jitter] = None,\n ):\n if backoff is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set jitter on RetryPolicy without also setting delay"\n )\n\n if jitter is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set backoff on RetryPolicy without also setting delay"\n )\n\n return super().__new__(\n cls,\n max_retries=check.int_param(max_retries, "max_retries"),\n delay=check.opt_numeric_param(delay, "delay"),\n backoff=check.opt_inst_param(backoff, "backoff", Backoff),\n jitter=check.opt_inst_param(jitter, "jitter", Jitter),\n )\n\n def calculate_delay(self, attempt_num: int) -> check.Numeric:\n return calculate_delay(\n attempt_num=attempt_num,\n backoff=self.backoff,\n jitter=self.jitter,\n base_delay=self.delay or 0,\n )
\n\n\ndef calculate_delay(\n attempt_num: int, backoff: Optional[Backoff], jitter: Optional[Jitter], base_delay: float\n) -> float:\n if backoff is Backoff.EXPONENTIAL:\n calc_delay = ((2**attempt_num) - 1) * base_delay\n elif backoff is Backoff.LINEAR:\n calc_delay = base_delay * attempt_num\n elif backoff is None:\n calc_delay = base_delay\n else:\n check.assert_never(backoff)\n\n if jitter is Jitter.FULL:\n calc_delay = random() * calc_delay\n elif jitter is Jitter.PLUS_MINUS:\n calc_delay = calc_delay + ((2 * (random() * base_delay)) - base_delay)\n elif jitter is None:\n pass\n else:\n check.assert_never(jitter)\n\n return calc_delay\n
", "current_page_name": "_modules/dagster/_core/definitions/policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.policy"}, "reconstruct": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.reconstruct

\nimport inspect\nimport json\nimport os\nimport sys\nfrom functools import lru_cache\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import experimental\nfrom dagster._core.code_pointer import (\n    CodePointer,\n    CustomPointer,\n    FileCodePointer,\n    ModuleCodePointer,\n    get_python_file_from_target,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.origin import (\n    DEFAULT_DAGSTER_ENTRY_POINT,\n    JobPythonOrigin,\n    RepositoryPythonOrigin,\n)\nfrom dagster._serdes import pack_value, unpack_value, whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\nfrom dagster._utils import hash_collection\n\nfrom .events import AssetKey\nfrom .job_base import IJob\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.repository_definition import (\n        PendingRepositoryDefinition,\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.source_asset import SourceAsset\n\n    from .graph_definition import GraphDefinition\n    from .repository_definition import RepositoryDefinition\n\n\ndef get_ephemeral_repository_name(job_name: str) -> str:\n    check.str_param(job_name, "job_name")\n    return f"__repository__{job_name}"\n\n\n@whitelist_for_serdes\nclass ReconstructableRepository(\n    NamedTuple(\n        "_ReconstructableRepository",\n        [\n            ("pointer", CodePointer),\n            ("container_image", Optional[str]),\n         
   ("executable_path", Optional[str]),\n            ("entry_point", Sequence[str]),\n            ("container_context", Optional[Mapping[str, Any]]),\n            ("repository_load_data", Optional["RepositoryLoadData"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        pointer: CodePointer,\n        container_image: Optional[str] = None,\n        executable_path: Optional[str] = None,\n        entry_point: Optional[Sequence[str]] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n        repository_load_data: Optional["RepositoryLoadData"] = None,\n    ):\n        from dagster._core.definitions.repository_definition import RepositoryLoadData\n\n        return super(ReconstructableRepository, cls).__new__(\n            cls,\n            pointer=check.inst_param(pointer, "pointer", CodePointer),\n            container_image=check.opt_str_param(container_image, "container_image"),\n            executable_path=check.opt_str_param(executable_path, "executable_path"),\n            entry_point=(\n                check.sequence_param(entry_point, "entry_point", of_type=str)\n                if entry_point is not None\n                else DEFAULT_DAGSTER_ENTRY_POINT\n            ),\n            container_context=(\n                check.mapping_param(container_context, "container_context")\n                if container_context is not None\n                else None\n            ),\n            repository_load_data=check.opt_inst_param(\n                repository_load_data, "repository_load_data", RepositoryLoadData\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableRepository":\n        return self._replace(repository_load_data=metadata)\n\n    def get_definition(self) -> "RepositoryDefinition":\n        return repository_def_from_pointer(self.pointer, self.repository_load_data)\n\n    def get_reconstructable_job(self, name: str) -> 
"ReconstructableJob":\n        return ReconstructableJob(self, name)\n\n    @classmethod\n    def for_file(\n        cls,\n        file: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        if not working_directory:\n            working_directory = os.getcwd()\n        return cls(\n            FileCodePointer(file, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    @classmethod\n    def for_module(\n        cls,\n        module: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        return cls(\n            ModuleCodePointer(module, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    def get_python_origin(self) -> RepositoryPythonOrigin:\n        return RepositoryPythonOrigin(\n            executable_path=self.executable_path if self.executable_path else sys.executable,\n            code_pointer=self.pointer,\n            container_image=self.container_image,\n            entry_point=self.entry_point,\n            container_context=self.container_context,\n        )\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    # Allow this to be hashed for use in `lru_cache`. 
This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has `Sequence` attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\nclass ReconstructableJobSerializer(NamedTupleSerializer):\n    def before_unpack(self, _, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n        solid_selection_str = unpacked_dict.get("solid_selection_str")\n        solids_to_execute = unpacked_dict.get("solids_to_execute")\n        if solid_selection_str:\n            unpacked_dict["op_selection"] = json.loads(solid_selection_str)\n        elif solids_to_execute:\n            unpacked_dict["op_selection"] = solids_to_execute\n        return unpacked_dict\n\n    def after_pack(self, **packed_dict: Any) -> Dict[str, Any]:\n        if packed_dict["op_selection"]:\n            packed_dict["solid_selection_str"] = json.dumps(packed_dict["op_selection"]["__set__"])\n        else:\n            packed_dict["solid_selection_str"] = None\n        del packed_dict["op_selection"]\n        return packed_dict\n\n\n@whitelist_for_serdes(\n    serializer=ReconstructableJobSerializer,\n    storage_name="ReconstructablePipeline",\n    storage_field_names={\n        "job_name": "pipeline_name",\n    },\n)\nclass ReconstructableJob(\n    NamedTuple(\n        "_ReconstructableJob",\n        [\n            ("repository", ReconstructableRepository),\n            ("job_name", str),\n            ("op_selection", Optional[AbstractSet[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    ),\n    IJob,\n):\n    """Defines a reconstructable job. 
When your job must cross process boundaries, Dagster must know\n    how to reconstruct the job on the other side of the process boundary.\n\n    Args:\n        repository (ReconstructableRepository): The reconstructable representation of the repository\n            the job belongs to.\n        job_name (str): The name of the job.\n        op_selection (Optional[AbstractSet[str]]): A set of op query strings. Ops matching any of\n            these queries will be selected. None if no selection is specified.\n        asset_selection (Optional[AbstractSet[AssetKey]]) A set of assets to execute. None if no selection\n            is specified, i.e. the entire job will be run.\n    """\n\n    def __new__(\n        cls,\n        repository: ReconstructableRepository,\n        job_name: str,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ):\n        op_selection = set(op_selection) if op_selection else None\n        return super(ReconstructableJob, cls).__new__(\n            cls,\n            repository=check.inst_param(repository, "repository", ReconstructableRepository),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_set_param(op_selection, "op_selection", of_type=str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableJob":\n        return self._replace(repository=self.repository.with_repository_load_data(metadata))\n\n    # Keep the most recent 1 definition (globally since this is a 
NamedTuple method)\n    # This allows repeated calls to get_definition in execution paths to not reload the job\n    @lru_cache(maxsize=1)\n    def get_definition(self) -> "JobDefinition":\n        return self.repository.get_definition().get_maybe_subset_job_def(\n            self.job_name,\n            self.op_selection,\n            self.asset_selection,\n        )\n\n    def get_reconstructable_repository(self) -> ReconstructableRepository:\n        return self.repository\n\n    def get_subset(\n        self,\n        *,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ) -> Self:\n        if op_selection and (asset_selection or asset_check_selection):\n            check.failed(\n                "op_selection and asset_selection or asset_check_selection cannot both be provided"\n                " as arguments",\n            )\n        op_selection = set(op_selection) if op_selection else None\n        return ReconstructableJob(\n            repository=self.repository,\n            job_name=self.job_name,\n            op_selection=op_selection,\n            asset_selection=asset_selection,\n            asset_check_selection=asset_check_selection,\n        )\n\n    def describe(self) -> str:\n        return f'"{self.job_name}" in repository ({self.repository.pointer.describe})'\n\n    @staticmethod\n    def for_file(python_file: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(FileCodePointer(python_file, fn_name, os.getcwd()))\n\n    @staticmethod\n    def for_module(module: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(ModuleCodePointer(module, fn_name, os.getcwd()))\n\n    def to_dict(self) -> Mapping[str, object]:\n        return pack_value(self)\n\n    @staticmethod\n    def from_dict(val: Mapping[str, Any]) -> 
"ReconstructableJob":\n        check.mapping_param(val, "val")\n\n        inst = unpack_value(val)\n        check.invariant(\n            isinstance(inst, ReconstructableJob),\n            f"Deserialized object is not instance of ReconstructableJob, got {type(inst)}",\n        )\n        return inst  # type: ignore  # (illegible runtime check)\n\n    def get_python_origin(self) -> JobPythonOrigin:\n        return JobPythonOrigin(self.job_name, self.repository.get_python_origin())\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    def get_module(self) -> Optional[str]:\n        """Return the module the job is found in, the origin is a module code pointer."""\n        pointer = self.get_python_origin().get_repo_pointer()\n        if isinstance(pointer, ModuleCodePointer):\n            return pointer.module\n\n        return None\n\n    # Allow this to be hashed for `lru_cache` in `get_definition`\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]def reconstructable(target: Callable[..., "JobDefinition"]) -> ReconstructableJob:\n """Create a :py:class:`~dagster._core.definitions.reconstructable.ReconstructableJob` from a\n function that returns a :py:class:`~dagster.JobDefinition`/:py:class:`~dagster.JobDefinition`,\n or a function decorated with :py:func:`@job <dagster.job>`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n Passing a job created with ``~dagster.GraphDefinition.to_job`` to ``reconstructable()``,\n requires you to wrap that job's definition in a module-scoped function, and pass that function\n instead:\n\n .. code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def my_graph():\n ...\n\n def define_my_job():\n return my_graph.to_job()\n\n reconstructable(define_my_job)\n\n This function implements a very conservative strategy for reconstruction, so that its behavior\n is easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs\n or jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\n call), or in interactive environments such as the Python REPL or Jupyter notebooks.\n\n If you need to reconstruct objects constructed in these ways, you should use\n :py:func:`~dagster.reconstructable.build_reconstructable_job` instead, which allows you to\n specify your own reconstruction strategy.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job, reconstructable\n\n @job\n def foo_job():\n ...\n\n reconstructable_foo_job = reconstructable(foo_job)\n\n\n @graph\n def foo():\n ...\n\n def make_bar_job():\n return foo.to_job()\n\n reconstructable_bar_job = reconstructable(make_bar_job)\n """\n from dagster._core.definitions import JobDefinition\n\n if not seven.is_function_or_decorator_instance_of(target, JobDefinition):\n if isinstance(target, JobDefinition):\n raise DagsterInvariantViolationError(\n "Reconstructable target was not a function returning a job definition, or a job "\n "definition produced by a decorated function. If your job was constructed using "\n "``GraphDefinition.to_job``, you must wrap the ``to_job`` call in a function at "\n "module scope, ie not within any other functions. "\n "To learn more, check out the docs on ``reconstructable``: "\n "https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n )\n raise DagsterInvariantViolationError(\n "Reconstructable target should be a function or definition produced "\n f"by a decorated function, got {type(target)}.",\n )\n\n if seven.is_lambda(target):\n raise DagsterInvariantViolationError(\n "Reconstructable target can not be a lambda. Use a function or "\n "decorated function defined at module scope instead, or use "\n "build_reconstructable_job."\n )\n\n if seven.qualname_differs(target):\n raise DagsterInvariantViolationError(\n f'Reconstructable target "{target.__name__}" has a different '\n f'__qualname__ "{target.__qualname__}" indicating it is not '\n "defined at module scope. 
Use a function or decorated function "\n "defined at module scope instead, or use build_reconstructable_job."\n )\n\n try:\n if (\n hasattr(target, "__module__")\n and hasattr(target, "__name__")\n and getattr(inspect.getmodule(target), "__name__", None) != "__main__"\n ):\n return ReconstructableJob.for_module(target.__module__, target.__name__)\n except:\n pass\n\n python_file = get_python_file_from_target(target)\n if not python_file:\n raise DagsterInvariantViolationError(\n "reconstructable() can not reconstruct jobs defined in interactive "\n "environments like <stdin>, IPython, or Jupyter notebooks. "\n "Use a job defined in a module or file instead, or use build_reconstructable_job."\n )\n\n pointer = FileCodePointer(\n python_file=python_file, fn_name=target.__name__, working_directory=os.getcwd()\n )\n\n return bootstrap_standalone_recon_job(pointer)
\n\n\n
[docs]@experimental\ndef build_reconstructable_job(\n reconstructor_module_name: str,\n reconstructor_function_name: str,\n reconstructable_args: Optional[Tuple[object]] = None,\n reconstructable_kwargs: Optional[Mapping[str, object]] = None,\n reconstructor_working_directory: Optional[str] = None,\n) -> ReconstructableJob:\n """Create a :py:class:`dagster._core.definitions.reconstructable.ReconstructableJob`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or in\n different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n This function allows you to use the strategy of your choice for reconstructing jobs, so\n that you can reconstruct certain kinds of jobs that are not supported by\n :py:func:`~dagster.reconstructable`, such as those defined by lambdas, in nested scopes (e.g.,\n dynamically within a method call), or in interactive environments such as the Python REPL or\n Jupyter notebooks.\n\n If you need to reconstruct jobs constructed in these ways, use this function instead of\n :py:func:`~dagster.reconstructable`.\n\n Args:\n reconstructor_module_name (str): The name of the module containing the function to use to\n reconstruct the job.\n reconstructor_function_name (str): The name of the function to use to reconstruct the\n job.\n reconstructable_args (Tuple): Args to the function to use to reconstruct the job.\n Values of the tuple must be JSON serializable.\n reconstructable_kwargs (Dict[str, Any]): Kwargs to the function to use to reconstruct the\n job. Values of the dict must be JSON serializable.\n\n Examples:\n .. 
code-block:: python\n\n # module: mymodule\n\n from dagster import JobDefinition, job, build_reconstructable_job\n\n class JobFactory:\n def make_job(*args, **kwargs):\n\n @job\n def _job(...):\n ...\n\n return _job\n\n def reconstruct_job(*args):\n factory = JobFactory()\n return factory.make_job(*args)\n\n factory = JobFactory()\n\n foo_job_args = (...,...)\n\n foo_job_kwargs = {...:...}\n\n foo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\n reconstructable_foo_job = build_reconstructable_job(\n 'mymodule',\n 'reconstruct_job',\n foo_job_args,\n foo_job_kwargs,\n )\n """\n check.str_param(reconstructor_module_name, "reconstructor_module_name")\n check.str_param(reconstructor_function_name, "reconstructor_function_name")\n check.opt_str_param(\n reconstructor_working_directory, "reconstructor_working_directory", os.getcwd()\n )\n\n _reconstructable_args: List[object] = list(\n check.opt_tuple_param(reconstructable_args, "reconstructable_args")\n )\n _reconstructable_kwargs: List[List[Union[str, object]]] = list(\n (\n [key, value]\n for key, value in check.opt_mapping_param(\n reconstructable_kwargs, "reconstructable_kwargs", key_type=str\n ).items()\n )\n )\n\n reconstructor_pointer = ModuleCodePointer(\n reconstructor_module_name,\n reconstructor_function_name,\n working_directory=reconstructor_working_directory,\n )\n\n pointer = CustomPointer(reconstructor_pointer, _reconstructable_args, _reconstructable_kwargs)\n\n job_def = job_def_from_pointer(pointer)\n\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )
\n\n\ndef bootstrap_standalone_recon_job(pointer: CodePointer) -> ReconstructableJob:\n # So this actually straps the the job for the sole\n # purpose of getting the job name. If we changed ReconstructableJob\n # to get the job on demand in order to get name, we could avoid this.\n job_def = job_def_from_pointer(pointer)\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )\n\n\nLoadableDefinition: TypeAlias = Union[\n "JobDefinition",\n "RepositoryDefinition",\n "PendingRepositoryDefinition",\n "GraphDefinition",\n "Sequence[Union[AssetsDefinition, SourceAsset]]",\n]\n\nT_LoadableDefinition = TypeVar("T_LoadableDefinition", bound=LoadableDefinition)\n\n\ndef _is_list_of_assets(\n definition: LoadableDefinition,\n) -> bool:\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n return isinstance(definition, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in definition\n )\n\n\ndef _check_is_loadable(definition: T_LoadableDefinition) -> T_LoadableDefinition:\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if not (\n isinstance(\n definition,\n (\n JobDefinition,\n RepositoryDefinition,\n PendingRepositoryDefinition,\n GraphDefinition,\n Definitions,\n ),\n )\n or _is_list_of_assets(definition)\n ):\n raise DagsterInvariantViolationError(\n "Loadable attributes must be either a JobDefinition, GraphDefinition, "\n f"or RepositoryDefinition. 
Got {definition!r}."\n )\n return definition\n\n\ndef load_def_in_module(\n module_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_module(module_name, attribute, working_directory))\n\n\ndef load_def_in_package(\n package_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(\n CodePointer.from_python_package(package_name, attribute, working_directory)\n )\n\n\ndef load_def_in_python_file(\n python_file: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_python_file(python_file, attribute, working_directory))\n\n\ndef def_from_pointer(\n pointer: CodePointer,\n) -> LoadableDefinition:\n target = pointer.load_target()\n\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if isinstance(\n target,\n (\n GraphDefinition,\n JobDefinition,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n ),\n ) or not callable(target):\n return _check_is_loadable(target) # type: ignore\n\n # if its a function invoke it - otherwise we are pointing to a\n # artifact in module scope, likely decorator output\n\n if seven.get_arg_names(target):\n raise DagsterInvariantViolationError(\n f"Error invoking function at {pointer.describe()} with no arguments. "\n "Reconstructable target must be callable with no arguments"\n )\n\n return _check_is_loadable(target())\n\n\ndef job_def_from_pointer(pointer: CodePointer) -> "JobDefinition":\n from .job_definition import JobDefinition\n\n target = def_from_pointer(pointer)\n\n if isinstance(target, JobDefinition):\n return target\n\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a JobDefinition (or JobDefinition for legacy"\n " code). 
Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n\n\n@overload\ndef repository_def_from_target_def(\n target: Union["RepositoryDefinition", "JobDefinition", "GraphDefinition"],\n repository_load_data: Optional["RepositoryLoadData"] = None,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> None: ...\n\n\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> Optional["RepositoryDefinition"]:\n from .assets import AssetsDefinition\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import (\n SINGLETON_REPOSITORY_NAME,\n CachingRepositoryData,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n )\n from .source_asset import SourceAsset\n\n if isinstance(target, Definitions):\n # reassign to handle both repository and pending repo case\n target = target.get_inner_repository_for_loading_process()\n\n # special case - we can wrap a single job in a repository\n if isinstance(target, (JobDefinition, GraphDefinition)):\n # consider including job name in generated repo name\n return RepositoryDefinition(\n name=get_ephemeral_repository_name(target.name),\n repository_data=CachingRepositoryData.from_list([target]),\n )\n elif isinstance(target, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in target\n ):\n return RepositoryDefinition(\n name=SINGLETON_REPOSITORY_NAME,\n repository_data=CachingRepositoryData.from_list(target),\n )\n elif isinstance(target, RepositoryDefinition):\n return target\n elif isinstance(target, PendingRepositoryDefinition):\n # must load repository from scratch\n if repository_load_data is None:\n return target.compute_repository_definition()\n # can use the cached data to more efficiently load 
data\n return target.reconstruct_repository_definition(repository_load_data)\n else:\n return None\n\n\ndef repository_def_from_pointer(\n pointer: CodePointer, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> "RepositoryDefinition":\n target = def_from_pointer(pointer)\n repo_def = repository_def_from_target_def(target, repository_load_data)\n if not repo_def:\n raise DagsterInvariantViolationError(\n f"CodePointer ({pointer.describe()}) must resolve to a "\n "RepositoryDefinition, JobDefinition, or JobDefinition. "\n f"Received a {type(target)}"\n )\n return repo_def\n
", "current_page_name": "_modules/dagster/_core/definitions/reconstruct", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.reconstruct"}, "repository_definition": {"repository_data": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_data

\nfrom abc import ABC, abstractmethod\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.graph_definition import SubselectedGraphDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\n\nfrom .caching_index import CacheingDefinitionIndex\nfrom .valid_definitions import RepositoryListDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n\n\nT = TypeVar("T")\nResolvable = Callable[[], T]\n\n\n
[docs]class RepositoryData(ABC):\n """Users should usually rely on the :py:func:`@repository <repository>` decorator to create new\n repositories, which will in turn call the static constructors on this class. However, users may\n subclass :py:class:`RepositoryData` for fine-grained control over access to and lazy creation\n of repository members.\n """\n\n @abstractmethod\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n pass\n\n @abstractmethod\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n """Return all top-level resources in the repository as a list,\n such as those provided to the Definitions constructor.\n\n Returns:\n List[ResourceDefinition]: All top-level resources in the repository.\n """\n\n @abstractmethod\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n pass\n\n
[docs] @abstractmethod\n @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """
\n\n
[docs] @public\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return [job_def.name for job_def in self.get_all_jobs()]
\n\n
[docs] @public\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n return job_name in self.get_job_names()
\n\n
[docs] @public\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n match = next(job for job in self.get_all_jobs() if job.name == job_name)\n if match is None:\n raise DagsterInvariantViolationError(f"Could not find job {job_name} in repository")\n return match
\n\n
[docs] @public\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return [schedule.name for schedule in self.get_all_schedules()]
\n\n
[docs] @public\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Returns:\n List[ScheduleDefinition]: All jobs in the repository.\n """\n return []
\n\n
[docs] @public\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n schedules_with_name = [\n schedule for schedule in self.get_all_schedules() if schedule.name == schedule_name\n ]\n if not schedules_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find schedule {schedule_name} in repository"\n )\n return schedules_with_name[0]
\n\n
[docs] @public\n def has_schedule(self, schedule_name: str) -> bool:\n """Check if a schedule with a given name is present in the repository."""\n return schedule_name in self.get_schedule_names()
\n\n
[docs] @public\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: Return all sensors in the repository as a list."""\n return []
\n\n
[docs] @public\n def get_sensor_names(self) -> Sequence[str]:\n """Sequence[str]: Get the names of all sensors in the repository."""\n return [sensor.name for sensor in self.get_all_sensors()]
\n\n
[docs] @public\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n """Get a sensor by name.\n\n Args:\n sensor_name (str): name of the sensor to retrieve.\n\n Returns:\n SensorDefinition: The sensor definition corresponding to the given name.\n """\n sensors_with_name = [\n sensor for sensor in self.get_all_sensors() if sensor.name == sensor_name\n ]\n if not sensors_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find sensor {sensor_name} in repository"\n )\n return sensors_with_name[0]
\n\n
[docs] @public\n def has_sensor(self, sensor_name: str) -> bool:\n """Check if a sensor with a given name is present in the repository."""\n return sensor_name in self.get_sensor_names()
\n\n
[docs] @public\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n """Mapping[AssetKey, SourceAsset]: Get the source assets for the repository."""\n return {}
\n\n
[docs] @public\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n """Mapping[AssetKey, AssetsDefinition]: Get the asset definitions for the repository."""\n return {}
\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self.get_all_jobs()\n self.get_all_schedules()\n self.get_all_sensors()\n self.get_source_assets_by_key()
\n\n\nclass CachingRepositoryData(RepositoryData):\n """Default implementation of RepositoryData used by the :py:func:`@repository <repository>` decorator."""\n\n _all_jobs: Optional[Sequence[JobDefinition]]\n _all_pipelines: Optional[Sequence[JobDefinition]]\n\n def __init__(\n self,\n jobs: Mapping[str, Union[JobDefinition, Resolvable[JobDefinition]]],\n schedules: Mapping[str, Union[ScheduleDefinition, Resolvable[ScheduleDefinition]]],\n sensors: Mapping[str, Union[SensorDefinition, Resolvable[SensorDefinition]]],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n assets_defs_by_key: Mapping[AssetKey, "AssetsDefinition"],\n top_level_resources: Mapping[str, ResourceDefinition],\n utilized_env_vars: Mapping[str, AbstractSet[str]],\n resource_key_mapping: Mapping[int, str],\n ):\n """Constructs a new CachingRepositoryData object.\n\n You may pass pipeline, job, and schedule definitions directly, or you may pass callables\n with no arguments that will be invoked to lazily construct definitions when accessed by\n name. 
This can be helpful for performance when there are many definitions in a repository,\n or when constructing the definitions is costly.\n\n Note that when lazily constructing a definition, the name of the definition must match its\n key in its dictionary index, or a :py:class:`DagsterInvariantViolationError` will be thrown\n at retrieval time.\n\n Args:\n jobs (Mapping[str, Union[JobDefinition, Callable[[], JobDefinition]]]):\n The job definitions belonging to the repository.\n schedules (Mapping[str, Union[ScheduleDefinition, Callable[[], ScheduleDefinition]]]):\n The schedules belonging to the repository.\n sensors (Mapping[str, Union[SensorDefinition, Callable[[], SensorDefinition]]]):\n The sensors belonging to a repository.\n source_assets_by_key (Mapping[AssetKey, SourceAsset]): The source assets belonging to a repository.\n assets_defs_by_key (Mapping[AssetKey, AssetsDefinition]): The assets definitions\n belonging to a repository.\n top_level_resources (Mapping[str, ResourceDefinition]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from dagster._core.definitions import AssetsDefinition\n\n check.mapping_param(jobs, "jobs", key_type=str, value_type=(JobDefinition, FunctionType))\n check.mapping_param(\n schedules, "schedules", key_type=str, value_type=(ScheduleDefinition, FunctionType)\n )\n check.mapping_param(\n sensors, "sensors", key_type=str, value_type=(SensorDefinition, FunctionType)\n )\n check.mapping_param(\n source_assets_by_key, "source_assets_by_key", key_type=AssetKey, value_type=SourceAsset\n )\n check.mapping_param(\n assets_defs_by_key, "assets_defs_by_key", key_type=AssetKey, value_type=AssetsDefinition\n )\n check.mapping_param(\n top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n )\n check.mapping_param(\n utilized_env_vars,\n "utilized_resources",\n key_type=str,\n )\n check.mapping_param(\n resource_key_mapping, "resource_key_mapping", 
key_type=int, value_type=str\n )\n\n self._jobs = CacheingDefinitionIndex(\n JobDefinition,\n "JobDefinition",\n "job",\n jobs,\n self._validate_job,\n )\n\n self._schedules = CacheingDefinitionIndex(\n ScheduleDefinition,\n "ScheduleDefinition",\n "schedule",\n schedules,\n self._validate_schedule,\n )\n # load all schedules to force validation\n self._schedules.get_all_definitions()\n\n self._source_assets_by_key = source_assets_by_key\n self._assets_defs_by_key = assets_defs_by_key\n self._top_level_resources = top_level_resources\n self._utilized_env_vars = utilized_env_vars\n self._resource_key_mapping = resource_key_mapping\n\n self._sensors = CacheingDefinitionIndex(\n SensorDefinition,\n "SensorDefinition",\n "sensor",\n sensors,\n self._validate_sensor,\n )\n # load all sensors to force validation\n self._sensors.get_all_definitions()\n\n self._all_jobs = None\n\n @staticmethod\n def from_dict(repository_definitions: Dict[str, Dict[str, Any]]) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definition (Dict[str, Dict[str, ...]]): A dict of the form:\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n """\n from .repository_data_builder import build_caching_repository_data_from_dict\n\n return build_caching_repository_data_from_dict(repository_definitions)\n\n @classmethod\n def from_list(\n cls,\n repository_definitions: Sequence[RepositoryListDefinition],\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n resource_key_mapping: Optional[Mapping[int, str]] = None,\n ) -> 
"CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definitions (List[Union[JobDefinition, ScheduleDefinition, SensorDefinition, GraphDefinition]]):\n Use this constructor when you have no need to lazy load jobs or other definitions.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from .repository_data_builder import build_caching_repository_data_from_list\n\n return build_caching_repository_data_from_list(\n repository_definitions=repository_definitions,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=top_level_resources,\n resource_key_mapping=resource_key_mapping,\n )\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._utilized_env_vars\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._resource_key_mapping\n\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._jobs.get_definition_names()\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n check.str_param(job_name, "job_name")\n return self._jobs.has_definition(job_name)\n\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n return self._top_level_resources\n\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job that has not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n if self._all_jobs is not None:\n return self._all_jobs\n\n self._all_jobs = self._jobs.get_all_definitions()\n self._check_node_defs(self._all_jobs)\n return self._all_jobs\n\n def 
get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job has not yet been constructed, only this job is constructed, and will\n be cached for future calls.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n check.str_param(job_name, "job_name")\n return self._jobs.get_definition(job_name)\n\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return self._schedules.get_definition_names()\n\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Note that this will construct any schedule that has not yet been constructed.\n\n Returns:\n List[ScheduleDefinition]: All schedules in the repository.\n """\n return self._schedules.get_all_definitions()\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n if this schedule has not yet been constructed, only this schedule is constructed, and will\n be cached for future calls.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.get_definition(schedule_name)\n\n def has_schedule(self, schedule_name: str) -> bool:\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.has_definition(schedule_name)\n\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n return self._sensors.get_all_definitions()\n\n def get_sensor_names(self) -> Sequence[str]:\n return self._sensors.get_definition_names()\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n return self._sensors.get_definition(sensor_name)\n\n def has_sensor(self, sensor_name: str) -> bool:\n return 
self._sensors.has_definition(sensor_name)\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._source_assets_by_key\n\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._assets_defs_by_key\n\n def _check_node_defs(self, job_defs: Sequence[JobDefinition]) -> None:\n node_defs = {}\n node_to_job = {}\n for job_def in job_defs:\n for node_def in [*job_def.all_node_defs, job_def.graph]:\n # skip checks for subselected graphs because they don't have their own names\n if isinstance(node_def, SubselectedGraphDefinition):\n break\n\n if node_def.name not in node_defs:\n node_defs[node_def.name] = node_def\n node_to_job[node_def.name] = job_def.name\n\n if node_defs[node_def.name] is not node_def:\n first_name, second_name = sorted([node_to_job[node_def.name], job_def.name])\n raise DagsterInvalidDefinitionError(\n f"Conflicting definitions found in repository with name '{node_def.name}'."\n " Op/Graph definition names must be unique within a repository."\n f" {node_def.__class__.__name__} is defined in"\n f" job '{first_name}' and in"\n f" job '{second_name}'."\n )\n\n def _validate_job(self, job: JobDefinition) -> JobDefinition:\n return job\n\n def _validate_schedule(self, schedule: ScheduleDefinition) -> ScheduleDefinition:\n job_names = self.get_job_names()\n\n if schedule.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'ScheduleDefinition "{schedule.name}" targets job "{schedule.job_name}" '\n "which was not found in this repository."\n )\n\n return schedule\n\n def _validate_sensor(self, sensor: SensorDefinition) -> SensorDefinition:\n job_names = self.get_job_names()\n if len(sensor.targets) == 0:\n # skip validation when the sensor does not target a job\n return sensor\n\n for target in sensor.targets:\n if target.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'SensorDefinition "{sensor.name}" targets job "{sensor.job_name}" '\n "which was not 
found in this repository."\n )\n\n return sensor\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_data", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_data"}, "repository_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_definition

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.asset_graph import AssetGraph, InternalAssetGraph\nfrom dagster._core.definitions.assets_job import (\n    ASSET_BASE_JOB_PREFIX,\n)\nfrom dagster._core.definitions.cacheable_assets import AssetsDefinitionCacheableData\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.metadata import MetadataMapping\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import hash_collection\n\nfrom .repository_data import CachingRepositoryData, RepositoryData\nfrom .valid_definitions import (\n    RepositoryListDefinition as RepositoryListDefinition,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n    from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n@whitelist_for_serdes\nclass RepositoryLoadData(\n    NamedTuple(\n  
      "_RepositoryLoadData",\n        [\n            ("cached_data_by_key", Mapping[str, Sequence[AssetsDefinitionCacheableData]]),\n        ],\n    )\n):\n    def __new__(cls, cached_data_by_key: Mapping[str, Sequence[AssetsDefinitionCacheableData]]):\n        return super(RepositoryLoadData, cls).__new__(\n            cls,\n            cached_data_by_key=(\n                check.mapping_param(\n                    cached_data_by_key,\n                    "cached_data_by_key",\n                    key_type=str,\n                    value_type=list,\n                )\n            ),\n        )\n\n    # Allow this to be hashed for use in `lru_cache`. This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has a `RepositoryLoadData` attribute\n    # - `RepositoryLoadData` has collection attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]class RepositoryDefinition:\n """Define a repository that contains a group of definitions.\n\n Users should typically not create objects of this class directly. Instead, use the\n :py:func:`@repository` decorator.\n\n Args:\n name (str): The name of the repository.\n repository_data (RepositoryData): Contains the definitions making up the repository.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[MetadataMapping]): A map of arbitrary metadata for the repository.\n """\n\n def __init__(\n self,\n name,\n *,\n repository_data,\n description=None,\n metadata=None,\n repository_load_data=None,\n ):\n self._name = check_valid_name(name)\n self._description = check.opt_str_param(description, "description")\n self._repository_data: RepositoryData = check.inst_param(\n repository_data, "repository_data", RepositoryData\n )\n self._metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self._repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n @property\n def repository_load_data(self) -> Optional[RepositoryLoadData]:\n return self._repository_load_data\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the repository."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the repository."""\n return self._description\n\n @public\n @property\n def metadata(self) -> Optional[MetadataMapping]:\n """Optional[MetadataMapping]: Arbitrary metadata for the repository."""\n return self._metadata\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self._repository_data.load_all_definitions()\n\n @public\n @property\n def job_names(self) -> Sequence[str]:\n """List[str]: Names of all jobs in the repository."""\n return self._repository_data.get_job_names()\n\n def get_top_level_resources(self) -> 
Mapping[str, ResourceDefinition]:\n return self._repository_data.get_top_level_resources()\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._repository_data.get_env_vars_by_top_level_resource()\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._repository_data.get_resource_key_mapping()\n\n
[docs] @public\n def has_job(self, name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n name (str): The name of the job.\n\n Returns:\n bool\n """\n return self._repository_data.has_job(name)
\n\n
[docs] @public\n def get_job(self, name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this job is constructed, and\n will be cached for future calls.\n\n Args:\n name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to\n the given name.\n """\n return self._repository_data.get_job(name)
\n\n
[docs] @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job in the lazily evaluated dictionary that has\n not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return self._repository_data.get_all_jobs()
\n\n @public\n @property\n def schedule_defs(self) -> Sequence[ScheduleDefinition]:\n """List[ScheduleDefinition]: All schedules in the repository."""\n return self._repository_data.get_all_schedules()\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name.\n\n Args:\n name (str): The name of the schedule.\n\n Returns:\n ScheduleDefinition: The schedule definition.\n """\n return self._repository_data.get_schedule(name)
\n\n
[docs] @public\n def has_schedule_def(self, name: str) -> bool:\n """bool: Check if a schedule with a given name is present in the repository."""\n return self._repository_data.has_schedule(name)
\n\n @public\n @property\n def sensor_defs(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: All sensors in the repository."""\n return self._repository_data.get_all_sensors()\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name.\n\n Args:\n name (str): The name of the sensor.\n\n Returns:\n SensorDefinition: The sensor definition.\n """\n return self._repository_data.get_sensor(name)
\n\n
[docs] @public\n def has_sensor_def(self, name: str) -> bool:\n """bool: Check if a sensor with a given name is present in the repository."""\n return self._repository_data.has_sensor(name)
\n\n @property\n def source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._repository_data.get_source_assets_by_key()\n\n @property\n def assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._repository_data.get_assets_defs_by_key()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n """Returns true is there is a single implicit asset job for all asset keys in a repository."""\n return self.has_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method for repositories where there are a set of assets with\n the same partitioning schema and one wants to access their corresponding implicit job\n easily.\n """\n if not self.has_job(ASSET_BASE_JOB_PREFIX):\n raise DagsterInvariantViolationError(\n "There is no single global asset job, likely due to assets using "\n "different partitioning schemes via their partitions_def parameter. You must "\n "use get_implicit_job_def_for_assets in order to access the correct implicit job."\n )\n\n return self.get_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_asset_job_names(self) -> Sequence[str]:\n return [\n job_name for job_name in self.job_names if job_name.startswith(ASSET_BASE_JOB_PREFIX)\n ]\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n """Returns the asset base job that contains all the given assets, or None if there is no such\n job.\n """\n if self.has_job(ASSET_BASE_JOB_PREFIX):\n base_job = self.get_job(ASSET_BASE_JOB_PREFIX)\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for key in asset_keys\n ):\n return base_job\n else:\n i = 0\n while self.has_job(f"{ASSET_BASE_JOB_PREFIX}_{i}"):\n base_job = self.get_job(f"{ASSET_BASE_JOB_PREFIX}_{i}")\n\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for 
key in asset_keys\n ):\n return base_job\n\n i += 1\n\n return None\n\n def get_maybe_subset_job_def(\n self,\n job_name: str,\n op_selection: Optional[Iterable[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ):\n defn = self.get_job(job_name)\n return defn.get_subset(\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n )\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Any] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n with AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n ) as loader:\n return loader.load_asset_value(\n asset_key,\n python_type=python_type,\n partition_key=partition_key,\n metadata=metadata,\n resource_config=resource_config,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with my_repo.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n return AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n )
\n\n @property\n def asset_graph(self) -> InternalAssetGraph:\n return AssetGraph.from_assets(\n [*set(self.assets_defs_by_key.values()), *self.source_assets_by_key.values()]\n )\n\n # If definition comes from the @repository decorator, then the __call__ method will be\n # overwritten. Therefore, we want to maintain the call-ability of repository definitions.\n def __call__(self, *args, **kwargs):\n return self
\n\n\nclass PendingRepositoryDefinition:\n def __init__(\n self,\n name: str,\n repository_definitions: Sequence[\n Union[RepositoryListDefinition, "CacheableAssetsDefinition"]\n ],\n description: Optional[str] = None,\n metadata: Optional[MetadataMapping] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n ):\n self._repository_definitions = check.list_param(\n repository_definitions,\n "repository_definition",\n additional_message=(\n "PendingRepositoryDefinition supports only list-based repository data at this time."\n ),\n )\n self._name = name\n self._description = description\n self._metadata = metadata\n self._default_logger_defs = default_logger_defs\n self._default_executor_def = default_executor_def\n self._top_level_resources = _top_level_resources\n self._resource_key_mapping = _resource_key_mapping\n\n @property\n def name(self) -> str:\n return self._name\n\n def _compute_repository_load_data(self) -> RepositoryLoadData:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n return RepositoryLoadData(\n cached_data_by_key={\n defn.unique_id: defn.compute_cacheable_data()\n for defn in self._repository_definitions\n if isinstance(defn, CacheableAssetsDefinition)\n }\n )\n\n def _get_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n resolved_definitions: List[RepositoryListDefinition] = []\n for defn in self._repository_definitions:\n if isinstance(defn, CacheableAssetsDefinition):\n # should always have metadata for each cached defn at this point\n check.invariant(\n defn.unique_id in repository_load_data.cached_data_by_key,\n "No metadata found for 
CacheableAssetsDefinition with unique_id"\n f" {defn.unique_id}.",\n )\n # use the emtadata to generate definitions\n resolved_definitions.extend(\n defn.build_definitions(\n data=repository_load_data.cached_data_by_key[defn.unique_id]\n )\n )\n else:\n resolved_definitions.append(defn)\n\n repository_data = CachingRepositoryData.from_list(\n resolved_definitions,\n default_executor_def=self._default_executor_def,\n default_logger_defs=self._default_logger_defs,\n top_level_resources=self._top_level_resources,\n resource_key_mapping=self._resource_key_mapping,\n )\n\n return RepositoryDefinition(\n self._name,\n repository_data=repository_data,\n description=self._description,\n metadata=self._metadata,\n repository_load_data=repository_load_data,\n )\n\n def reconstruct_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n """Use the provided RepositoryLoadData to construct and return a RepositoryDefinition."""\n check.inst_param(repository_load_data, "repository_load_data", RepositoryLoadData)\n return self._get_repository_definition(repository_load_data)\n\n def compute_repository_definition(self) -> RepositoryDefinition:\n """Compute the required RepositoryLoadData and use it to construct and return a RepositoryDefinition."""\n repository_load_data = self._compute_repository_load_data()\n return self._get_repository_definition(repository_load_data)\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_definition"}}, "resource_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.resource_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Union,\n    cast,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.decorator_utils import format_docstring_for_description\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._utils import IHasInternalInit\n\nfrom ..decorator_utils import (\n    get_function_params,\n    has_at_least_one_parameter,\n    is_required_param,\n    positional_arg_name_list,\n    validate_expected_params,\n)\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .resource_invocation import resource_invocation_result\nfrom .resource_requirement import (\n    RequiresResources,\n    ResourceDependencyRequirement,\n    ResourceRequirement,\n)\nfrom .scoped_resources_builder import (  # re-exported\n    IContainsGenerator as IContainsGenerator,\n    Resources as Resources,\n    ScopedResourcesBuilder as ScopedResourcesBuilder,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.resources_init import InitResourceContext\n\nResourceFunctionWithContext: TypeAlias = Callable[["InitResourceContext"], Any]\nResourceFunctionWithoutContext: TypeAlias = Callable[[], Any]\nResourceFunction: TypeAlias = Union[\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n]\n\n\n
[docs]@experimental_param(param="version")\nclass ResourceDefinition(AnonymousConfigurableDefinition, RequiresResources, IHasInternalInit):\n """Core class for defining resources.\n\n Resources are scoped ways to make external resources (like database connections) available to\n ops and assets during job execution and to clean up after execution resolves.\n\n If resource_fn yields once rather than returning (in the manner of functions decorable with\n :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then the body of the\n function after the yield will be run after execution resolves, allowing users to write their\n own teardown/cleanup logic.\n\n Depending on your executor, resources may be instantiated and cleaned up more than once in a\n job execution.\n\n Args:\n resource_fn (Callable[[InitResourceContext], Any]): User-provided function to instantiate\n the resource, which will be made available to executions keyed on the\n ``context.resources`` object.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the resource matches this schema and fail if it does not. If\n not set, Dagster will accept any config provided for the resource.\n description (Optional[str]): A human-readable description of the resource.\n required_resource_keys: (Optional[Set[str]]) Keys for the resources required by this\n resource. A DagsterInvariantViolationError will be raised during initialization if\n dependencies are cyclic.\n version (Optional[str]): (Experimental) The version of the resource's definition fn. 
Two\n wrapped resource functions should only have the same version if they produce the same\n resource definition when provided with the same inputs.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._resource_fn = check.callable_param(resource_fn, "resource_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self._version = check.opt_str_param(version, "version")\n\n # this attribute will be updated by the @dagster_maintained_resource and @dagster_maintained_io_manager decorators\n self._dagster_maintained = False\n\n @staticmethod\n def dagster_internal_init(\n *,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema,\n description: Optional[str],\n required_resource_keys: Optional[AbstractSet[str]],\n version: Optional[str],\n ) -> "ResourceDefinition":\n return ResourceDefinition(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def resource_fn(self) -> ResourceFunction:\n return self._resource_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of the resource."""\n return self._description\n\n @public\n @property\n def version(self) -> Optional[str]:\n """A string which can be used to identify a particular code version of a resource definition."""\n return self._version\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """A 
set of the resource keys that this resource depends on. These keys will be made available\n to the resource's init context during execution, and the resource will not be instantiated\n until all required resources are available.\n """\n return self._required_resource_keys\n\n def _is_dagster_maintained(self) -> bool:\n return self._dagster_maintained\n\n
[docs] @public\n @staticmethod\n def none_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that returns a none resource.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that does nothing.\n """\n return ResourceDefinition.hardcoded_resource(value=None, description=description)
\n\n
[docs] @public\n @staticmethod\n def hardcoded_resource(value: Any, description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` with a hardcoded object.\n\n Args:\n value (Any): The value that will be accessible via context.resources.resource_name.\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A hardcoded resource.\n """\n return ResourceDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n
[docs] @public\n @staticmethod\n def mock_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` which wraps a ``mock.MagicMock``.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that creates the magic methods automatically and helps\n you mock existing resources.\n """\n from unittest import mock\n\n return ResourceDefinition(\n resource_fn=lambda _init_context: mock.MagicMock(), description=description\n )
\n\n
[docs] @public\n @staticmethod\n def string_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """Creates a ``ResourceDefinition`` which takes in a single string as configuration\n and returns this configured string to any ops or assets which depend on it.\n\n Args:\n description ([Optional[str]]): The description of the string resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that takes in a single string as configuration and\n returns that string.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=str,\n description=description,\n )
\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "ResourceDefinition":\n resource_def = ResourceDefinition.dagster_internal_init(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n )\n\n resource_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return resource_def\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.init import UnboundInitResourceContext\n\n if has_at_least_one_parameter(self.resource_fn):\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Resource initialization function has context argument, but no context was"\n " provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of resource received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.resource_fn)[0].name\n\n if args:\n check.opt_inst_param(args[0], context_param_name, UnboundInitResourceContext)\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], args[0])\n )\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Resource initialization expected argument '{context_param_name}'."\n )\n check.opt_inst_param(\n kwargs[context_param_name], context_param_name, UnboundInitResourceContext\n )\n\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], kwargs[context_param_name])\n )\n elif len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke resource with argument, but underlying function has no context"\n " argument. 
Either specify a context argument on the resource function, or remove"\n " the passed-in argument."\n )\n else:\n return resource_invocation_result(self, None)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n source_key = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield ResourceDependencyRequirement(key=resource_key, source_key=source_key)
\n\n\ndef dagster_maintained_resource(\n resource_def: ResourceDefinition,\n) -> ResourceDefinition:\n resource_def._dagster_maintained = True # noqa: SLF001\n return resource_def\n\n\nclass _ResourceDecoratorCallable:\n def __init__(\n self,\n config_schema: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self.config_schema = config_schema # checked by underlying definition\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, resource_fn: ResourceFunction) -> ResourceDefinition:\n check.callable_param(resource_fn, "resource_fn")\n\n any_name = ["*"] if has_at_least_one_parameter(resource_fn) else []\n\n params = get_function_params(resource_fn)\n\n missing_positional = validate_expected_params(params, any_name)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects a single "\n "positional argument."\n )\n\n extras = params[len(any_name) :]\n\n required_extras = list(filter(is_required_param, extras))\n if required_extras:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects only a single"\n " positional required argument. 
Got required extra params"\n f" {', '.join(positional_arg_name_list(required_extras))}"\n )\n\n resource_def = ResourceDefinition.dagster_internal_init(\n resource_fn=resource_fn,\n config_schema=self.config_schema,\n description=self.description or format_docstring_for_description(resource_fn),\n version=self.version,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(resource_def, wrapped=resource_fn) # type: ignore\n\n return resource_def\n\n\n@overload\ndef resource(config_schema: ResourceFunction) -> ResourceDefinition: ...\n\n\n@overload\ndef resource(\n config_schema: CoercableToConfigSchema = ...,\n description: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n version: Optional[str] = ...,\n) -> Callable[[ResourceFunction], "ResourceDefinition"]: ...\n\n\n
[docs]def resource(\n config_schema: Union[ResourceFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[Callable[[ResourceFunction], "ResourceDefinition"], "ResourceDefinition"]:\n """Define a resource.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an instance of\n the resource. This function will become the ``resource_fn`` of an underlying\n :py:class:`ResourceDefinition`.\n\n If the decorated function yields once rather than returning (in the manner of functions\n decorable with :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then\n the body of the function after the yield will be run after execution resolves, allowing users\n to write their own teardown/cleanup logic.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.resource_config`. If not set, Dagster will accept any config provided.\n description(Optional[str]): A human-readable description of the resource.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this resource.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. 
@resource versus @resource()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _ResourceDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: ResourceFunction) -> "ResourceDefinition":\n return _ResourceDecoratorCallable(\n config_schema=cast(Optional[Dict[str, Any]], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )(resource_fn)\n\n return _wrap
\n\n\n
[docs]def make_values_resource(**kwargs: Any) -> ResourceDefinition:\n """A helper function that creates a ``ResourceDefinition`` to take in user-defined values.\n\n This is useful for sharing values between ops.\n\n Args:\n **kwargs: Arbitrary keyword arguments that will be passed to the config schema of the\n returned resource definition. If not set, Dagster will accept any config provided for\n the resource.\n\n For example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"globals"})\n def my_op(context):\n print(context.resources.globals["my_str_var"])\n\n @job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\n def my_job():\n my_op()\n\n Returns:\n ResourceDefinition: A resource that passes in user-defined values.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=kwargs or Any,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/resource_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.resource_definition"}, "result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.result

\nfrom typing import NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.data_version import DataVersion\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .metadata import MetadataUserInput\n\n\n
[docs]@experimental\nclass MaterializeResult(\n NamedTuple(\n "_MaterializeResult",\n [\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("check_results", PublicAttr[Optional[Sequence[AssetCheckResult]]]),\n ("data_version", PublicAttr[Optional[DataVersion]]),\n ],\n )\n):\n """An object representing a successful materialization of an asset. These can be returned from\n @asset and @multi_asset decorated functions to pass metadata or specify specific assets were\n materialized.\n\n Attributes:\n asset_key (Optional[AssetKey]): Optional in @asset, required in @multi_asset to discern which asset this refers to.\n metadata (Optional[MetadataUserInput]): Metadata to record with the corresponding AssetMaterialization event.\n """\n\n def __new__(\n cls,\n *, # enforce kwargs\n asset_key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[MetadataUserInput] = None,\n check_results: Optional[Sequence[AssetCheckResult]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n metadata=check.opt_nullable_mapping_param(\n metadata,\n "metadata",\n key_type=str,\n ),\n check_results=check.opt_nullable_sequence_param(\n check_results, "check_results", of_type=AssetCheckResult\n ),\n data_version=check.opt_inst_param(data_version, "data_version", DataVersion),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.result"}, "run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_config

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nfrom dagster._config import (\n    ALL_CONFIG_BUILTINS,\n    ConfigType,\n    Field,\n    Permissive,\n    Selector,\n    Shape,\n)\nfrom dagster._config.pythonic_config import Config\nfrom dagster._core.definitions.asset_layer import AssetLayer\nfrom dagster._core.definitions.executor_definition import (\n    ExecutorDefinition,\n    execute_in_process_executor,\n    in_process_executor,\n)\nfrom dagster._core.definitions.input import InputDefinition\nfrom dagster._core.definitions.output import OutputDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.storage.input_manager import IInputManagerDefinition\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition\nfrom dagster._core.types.dagster_type import ALL_RUNTIME_BUILTINS, construct_dagster_type_dictionary\nfrom dagster._utils import check\n\nfrom .configurable import ConfigurableDefinition\nfrom .definition_config_schema import IDefinitionConfigSchema\nfrom .dependency import DependencyStructure, GraphNode, Node, NodeHandle, NodeInput, OpNode\nfrom .graph_definition import GraphDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .op_definition import NodeDefinition, OpDefinition\nfrom .resource_definition import ResourceDefinition\n\nif TYPE_CHECKING:\n    from .source_asset import SourceAsset\n\n\ndef define_resource_dictionary_cls(\n    resource_defs: Mapping[str, ResourceDefinition],\n    required_resources: AbstractSet[str],\n) -> Shape:\n    fields = {}\n    for resource_name, resource_def in resource_defs.items():\n        if resource_def.config_schema:\n            is_required = None\n            if resource_name not in required_resources:\n                # explicitly make 
section not required if resource is not required\n                # for the current mode\n                is_required = False\n\n            fields[resource_name] = def_config_field(\n                resource_def,\n                is_required=is_required,\n                description=resource_def.description,\n            )\n\n    return Shape(fields=fields)\n\n\ndef remove_none_entries(ddict: Mapping[Any, Any]) -> dict:\n    return {k: v for k, v in ddict.items() if v is not None}\n\n\ndef def_config_field(\n    configurable_def: ConfigurableDefinition,\n    is_required: Optional[bool] = None,\n    description: Optional[str] = None,\n) -> Field:\n    return Field(\n        Shape(\n            {"config": configurable_def.config_field} if configurable_def.has_config_field else {}\n        ),\n        is_required=is_required,\n        description=description,\n    )\n\n\nclass RunConfigSchemaCreationData(NamedTuple):\n    job_name: str\n    nodes: Sequence[Node]\n    graph_def: GraphDefinition\n    dependency_structure: DependencyStructure\n    executor_def: ExecutorDefinition\n    resource_defs: Mapping[str, ResourceDefinition]\n    logger_defs: Mapping[str, LoggerDefinition]\n    ignored_nodes: Sequence[Node]\n    required_resources: AbstractSet[str]\n    direct_inputs: Mapping[str, Any]\n    asset_layer: AssetLayer\n\n\ndef define_logger_dictionary_cls(creation_data: RunConfigSchemaCreationData) -> Shape:\n    return Shape(\n        {\n            logger_name: def_config_field(logger_definition, is_required=False)\n            for logger_name, logger_definition in creation_data.logger_defs.items()\n        }\n    )\n\n\ndef define_execution_field(executor_defs: Sequence[ExecutorDefinition], description: str) -> Field:\n    default_in_process = False\n    for executor_def in executor_defs:\n        if executor_def == in_process_executor:\n            default_in_process = True\n\n    selector = selector_for_named_defs(executor_defs)\n\n    if default_in_process:\n   
     return Field(\n            selector, default_value={in_process_executor.name: {}}, description=description\n        )\n\n    # If we are using the execute_in_process executor, then ignore all executor config.\n    if len(executor_defs) == 1 and executor_defs[0] == execute_in_process_executor:\n        return Field(Permissive(), is_required=False, default_value={}, description=description)\n\n    return Field(selector, description=description)\n\n\ndef define_single_execution_field(executor_def: ExecutorDefinition, description: str) -> Field:\n    return def_config_field(executor_def, description=description)\n\n\ndef define_run_config_schema_type(creation_data: RunConfigSchemaCreationData) -> ConfigType:\n    execution_field = define_single_execution_field(\n        creation_data.executor_def,\n        "Configure how steps are executed within a run.",\n    )\n\n    top_level_node = GraphNode(\n        name=creation_data.graph_def.name,\n        definition=creation_data.graph_def,\n        graph_definition=creation_data.graph_def,\n    )\n\n    fields = {\n        "execution": execution_field,\n        "loggers": Field(\n            define_logger_dictionary_cls(creation_data),\n            description="Configure how loggers emit messages within a run.",\n        ),\n        "resources": Field(\n            define_resource_dictionary_cls(\n                creation_data.resource_defs,\n                creation_data.required_resources,\n            ),\n            description="Configure how shared resources are implemented within a run.",\n        ),\n        "inputs": get_inputs_field(\n            node=top_level_node,\n            handle=NodeHandle(top_level_node.name, parent=None),\n            dependency_structure=creation_data.dependency_structure,\n            resource_defs=creation_data.resource_defs,\n            node_ignored=False,\n            direct_inputs=creation_data.direct_inputs,\n            input_source_assets={},\n            
asset_layer=creation_data.asset_layer,\n        ),\n    }\n\n    if creation_data.graph_def.has_config_mapping:\n        config_schema = cast(IDefinitionConfigSchema, creation_data.graph_def.config_schema)\n        nodes_field = Field(\n            {"config": config_schema.as_field()},\n            description="Configure runtime parameters for ops or assets.",\n        )\n    else:\n        nodes_field = Field(\n            define_node_shape(\n                nodes=creation_data.nodes,\n                ignored_nodes=creation_data.ignored_nodes,\n                dependency_structure=creation_data.dependency_structure,\n                resource_defs=creation_data.resource_defs,\n                asset_layer=creation_data.asset_layer,\n                node_input_source_assets=creation_data.graph_def.node_input_source_assets,\n            ),\n            description="Configure runtime parameters for ops or assets.",\n        )\n\n    fields["ops"] = nodes_field\n\n    return Shape(\n        fields=remove_none_entries(fields),\n    )\n\n\n# Common pattern for a set of named definitions (e.g. 
executors)\n# to build a selector so that one of them is selected\ndef selector_for_named_defs(named_defs) -> Selector:\n    return Selector({named_def.name: def_config_field(named_def) for named_def in named_defs})\n\n\ndef get_inputs_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    node_ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n    direct_inputs: Optional[Mapping[str, Any]] = None,\n) -> Optional[Field]:\n    direct_inputs = check.opt_mapping_param(direct_inputs, "direct_inputs")\n    inputs_field_fields = {}\n    for name, inp in node.definition.input_dict.items():\n        inp_handle = NodeInput(node, inp)\n        has_upstream = input_has_upstream(dependency_structure, inp_handle, node, name)\n        if inp.input_manager_key:\n            input_field = get_input_manager_input_field(node, inp, resource_defs)\n        elif (\n            # if you have asset definitions, input will be loaded from the source asset\n            asset_layer.has_assets_defs\n            or asset_layer.has_asset_check_defs\n            and asset_layer.asset_key_for_input(handle, name)\n            and not has_upstream\n        ):\n            input_field = None\n        elif name in direct_inputs and not has_upstream:\n            input_field = None\n        elif name in input_source_assets and not has_upstream:\n            input_field = None\n        elif inp.dagster_type.loader and not has_upstream:\n            input_field = get_type_loader_input_field(node, name, inp)\n        else:\n            input_field = None\n\n        if input_field:\n            inputs_field_fields[name] = input_field\n\n    if not inputs_field_fields:\n        return None\n    if node_ignored:\n        return Field(\n            Shape(inputs_field_fields),\n            is_required=False,\n            description=(\n                "This op 
is not present in the current op selection, "\n                "the input config values are allowed but ignored."\n            ),\n        )\n    else:\n        return Field(Shape(inputs_field_fields))\n\n\ndef input_has_upstream(\n    dependency_structure: DependencyStructure,\n    input_handle: NodeInput,\n    node: Node,\n    input_name: str,\n) -> bool:\n    return dependency_structure.has_deps(input_handle) or node.container_maps_input(input_name)\n\n\ndef get_input_manager_input_field(\n    node: Node,\n    input_def: InputDefinition,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    if input_def.input_manager_key:\n        if input_def.input_manager_key not in resource_defs:\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key"\n                f" '{input_def.input_manager_key}', but no resource has been provided. Please"\n                " include a resource definition for that key in the provided resource_defs."\n            )\n\n        input_manager = resource_defs[input_def.input_manager_key]\n        if not isinstance(input_manager, IInputManagerDefinition):\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key "\n                f"'{input_def.input_manager_key}', but the resource definition provided is not an "\n                "IInputManagerDefinition"\n            )\n\n        input_config_schema = input_manager.input_config_schema\n        if input_config_schema:\n            return input_config_schema.as_field()\n        return None\n\n    return None\n\n\ndef get_type_loader_input_field(node: Node, input_name: str, input_def: InputDefinition) -> Field:\n    loader = check.not_none(input_def.dagster_type.loader)\n    return Field(\n        loader.schema_type,\n        is_required=(not 
node.definition.input_has_default(input_name)),\n    )\n\n\ndef get_outputs_field(\n    node: Node,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    output_manager_fields = {}\n    for name, output_def in node.definition.output_dict.items():\n        output_manager_output_field = get_output_manager_output_field(\n            node, output_def, resource_defs\n        )\n        if output_manager_output_field:\n            output_manager_fields[name] = output_manager_output_field\n\n    return Field(Shape(output_manager_fields)) if output_manager_fields else None\n\n\ndef get_output_manager_output_field(\n    node: Node, output_def: OutputDefinition, resource_defs: Mapping[str, ResourceDefinition]\n) -> Optional[ConfigType]:\n    if output_def.io_manager_key not in resource_defs:\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but no resource has been provided. 
Please include a '\n            "resource definition for that key in the provided resource_defs."\n        )\n    if not isinstance(resource_defs[output_def.io_manager_key], IOutputManagerDefinition):\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but the resource definition provided is not an '\n            "IOutputManagerDefinition"\n        )\n    output_manager_def = resource_defs[output_def.io_manager_key]\n    if (\n        output_manager_def\n        and isinstance(output_manager_def, IOutputManagerDefinition)\n        and output_manager_def.output_config_schema\n    ):\n        return output_manager_def.output_config_schema.as_field()\n\n    return None\n\n\ndef node_config_field(fields: Mapping[str, Optional[Field]], ignored: bool) -> Optional[Field]:\n    trimmed_fields = remove_none_entries(fields)\n    if trimmed_fields:\n        if ignored:\n            return Field(\n                Shape(trimmed_fields),\n                is_required=False,\n                description=(\n                    "This op is not present in the current op selection, "\n                    "the config values are allowed but ignored."\n                ),\n            )\n        else:\n            return Field(Shape(trimmed_fields))\n    else:\n        return None\n\n\ndef construct_leaf_node_config(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    config_schema: Optional[IDefinitionConfigSchema],\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    return node_config_field(\n        {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                
ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "config": config_schema.as_field() if config_schema else None,\n        },\n        ignored=ignored,\n    )\n\n\ndef define_node_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    # All nodes regardless of compositing status get the same inputs and outputs\n    # config. The only thing the varies is on extra element of configuration\n    # 1) Vanilla op definition: a 'config' key with the config_schema as the value\n    # 2) Graph with field mapping: a 'config' key with the config_schema of\n    #    the config mapping (via GraphDefinition#config_schema)\n    # 3) Graph without field mapping: an 'ops' key with recursively defined\n    #    ops dictionary\n    # 4) `configured` graph with field mapping: a 'config' key with the config_schema that was\n    #    provided when `configured` was called (via GraphDefinition#config_schema)\n\n    assert isinstance(node, (OpNode, GraphNode)), f"Invalid node type: {type(node)}"\n\n    if isinstance(node, OpNode):\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            node.definition.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n\n    graph_def = node.definition\n\n    if graph_def.has_config_mapping:\n        # has_config_mapping covers cases 2 & 4 from above (only config mapped graphs can\n        # be `configured`)...\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            # ...and in both 
cases, the correct schema for 'config' key is exposed by this property:\n            graph_def.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n        # This case omits an 'ops' key, thus if a graph is `configured` or has a field\n        # mapping, the user cannot stub any config, inputs, or outputs for inner (child) nodes.\n    else:\n        fields = {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "ops": Field(\n                define_node_shape(\n                    nodes=graph_def.nodes,\n                    ignored_nodes=None,\n                    dependency_structure=graph_def.dependency_structure,\n                    parent_handle=handle,\n                    resource_defs=resource_defs,\n                    asset_layer=asset_layer,\n                    node_input_source_assets=graph_def.node_input_source_assets,\n                )\n            ),\n        }\n\n        return node_config_field(fields, ignored=ignored)\n\n\ndef define_node_shape(\n    nodes: Sequence[Node],\n    ignored_nodes: Optional[Sequence[Node]],\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    asset_layer: AssetLayer,\n    node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]],\n    parent_handle: Optional[NodeHandle] = None,\n) -> Shape:\n    """Examples of what this method is used to generate the schema for:\n    1.\n        inputs: ...\n        ops:\n      >    op1: ...\n      >    op2: ...\n\n    2.\n        inputs:\n        ops:\n          graph1: ...\n            inputs: ...\n            ops:\n      >       op1: ...\n      >  
     inner_graph: ...\n\n\n    """\n    ignored_nodes = check.opt_sequence_param(ignored_nodes, "ignored_nodes", of_type=Node)\n\n    fields = {}\n    for node in nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=False,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n\n        if node_field:\n            fields[node.name] = node_field\n\n    for node in ignored_nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=True,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n        if node_field:\n            fields[node.name] = node_field\n\n    return Shape(fields)\n\n\ndef iterate_node_def_config_types(node_def: NodeDefinition) -> Iterator[ConfigType]:\n    if isinstance(node_def, OpDefinition):\n        if node_def.has_config_field:\n            yield from node_def.get_config_field().config_type.type_iterator()\n    elif isinstance(node_def, GraphDefinition):\n        for node in node_def.nodes:\n            yield from iterate_node_def_config_types(node.definition)\n\n    else:\n        check.invariant(f"Unexpected NodeDefinition type {type(node_def)}")\n\n\ndef _gather_all_schemas(node_defs: Sequence[NodeDefinition]) -> Iterator[ConfigType]:\n    dagster_types = construct_dagster_type_dictionary(node_defs)\n    for dagster_type in list(dagster_types.values()) + list(ALL_RUNTIME_BUILTINS):\n        if dagster_type.loader:\n            yield from dagster_type.loader.schema_type.type_iterator()\n\n\ndef _gather_all_config_types(\n    node_defs: Sequence[NodeDefinition], run_config_schema_type: ConfigType\n) -> 
Iterator[ConfigType]:\n    for node_def in node_defs:\n        yield from iterate_node_def_config_types(node_def)\n\n    yield from run_config_schema_type.type_iterator()\n\n\ndef construct_config_type_dictionary(\n    node_defs: Sequence[NodeDefinition],\n    run_config_schema_type: ConfigType,\n) -> Tuple[Mapping[str, ConfigType], Mapping[str, ConfigType]]:\n    type_dict_by_name = {t.given_name: t for t in ALL_CONFIG_BUILTINS if t.given_name}\n    type_dict_by_key = {t.key: t for t in ALL_CONFIG_BUILTINS}\n    all_types = list(_gather_all_config_types(node_defs, run_config_schema_type)) + list(\n        _gather_all_schemas(node_defs)\n    )\n\n    for config_type in all_types:\n        name = config_type.given_name\n        if name and name in type_dict_by_name:\n            if type(config_type) is not type(type_dict_by_name[name]):\n                raise DagsterInvalidDefinitionError(\n                    "Type names must be unique. You have constructed two different "\n                    f'instances of types with the same name "{name}".'\n                )\n        elif name:\n            type_dict_by_name[name] = config_type\n\n        type_dict_by_key[config_type.key] = config_type\n\n    return type_dict_by_name, type_dict_by_key\n\n\ndef _convert_config_classes_inner(configs: Any) -> Any:\n    if not isinstance(configs, dict):\n        return configs\n\n    return {\n        k: (\n            {"config": v._convert_to_config_dictionary()}  # noqa: SLF001\n            if isinstance(v, Config)\n            else _convert_config_classes_inner(v)\n        )\n        for k, v in configs.items()\n    }\n\n\ndef _convert_config_classes(configs: Dict[str, Any]) -> Dict[str, Any]:\n    return _convert_config_classes_inner(configs)\n\n\n
[docs]class RunConfig:\n """Container for all the configuration that can be passed to a run. Accepts Pythonic definitions\n for op and asset config and resources and converts them under the hood to the appropriate config dictionaries.\n\n Example usage:\n\n .. code-block:: python\n\n class MyAssetConfig(Config):\n a_str: str\n\n @asset\n def my_asset(config: MyAssetConfig):\n assert config.a_str == "foo"\n\n materialize(\n [my_asset],\n run_config=RunConfig(\n ops={"my_asset": MyAssetConfig(a_str="foo")}\n )\n )\n\n """\n\n def __init__(\n self,\n ops: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n loggers: Optional[Dict[str, Any]] = None,\n execution: Optional[Dict[str, Any]] = None,\n ):\n self.ops = check.opt_dict_param(ops, "ops")\n self.resources = check.opt_dict_param(resources, "resources")\n self.loggers = check.opt_dict_param(loggers, "loggers")\n self.execution = check.opt_dict_param(execution, "execution")\n\n def to_config_dict(self):\n return {\n "loggers": self.loggers,\n "resources": _convert_config_classes(self.resources),\n "ops": _convert_config_classes(self.ops),\n "execution": self.execution,\n }
\n\n\nCoercibleToRunConfig: TypeAlias = Union[Dict[str, Any], RunConfig]\n\nT = TypeVar("T")\n\n\ndef convert_config_input(inp: Union[CoercibleToRunConfig, T]) -> Union[T, Mapping[str, Any]]:\n if isinstance(inp, RunConfig):\n return inp.to_config_dict()\n else:\n return inp\n
", "current_page_name": "_modules/dagster/_core/definitions/run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_config"}, "run_request": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_request

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG\nfrom dagster._serdes.serdes import whitelist_for_serdes\nfrom dagster._utils.error import SerializableErrorInfo\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\n@whitelist_for_serdes(old_storage_names={"JobType"})\nclass InstigatorType(Enum):\n    SCHEDULE = "SCHEDULE"\n    SENSOR = "SENSOR"\n\n\n
[docs]@whitelist_for_serdes\nclass SkipReason(NamedTuple("_SkipReason", [("skip_message", PublicAttr[Optional[str]])])):\n """Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\n why no runs were requested.\n\n Attributes:\n skip_message (Optional[str]): A message displayed in the Dagster UI for why this evaluation resulted\n in no requested runs.\n """\n\n def __new__(cls, skip_message: Optional[str] = None):\n return super(SkipReason, cls).__new__(\n cls,\n skip_message=check.opt_str_param(skip_message, "skip_message"),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AddDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to add partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(AddDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass DeleteDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to delete partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(DeleteDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RunRequest(\n NamedTuple(\n "_RunRequest",\n [\n ("run_key", PublicAttr[Optional[str]]),\n ("run_config", PublicAttr[Mapping[str, Any]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("asset_selection", PublicAttr[Optional[Sequence[AssetKey]]]),\n ("stale_assets_only", PublicAttr[bool]),\n ("partition_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Represents all the information required to launch a single run. Must be returned by a\n SensorDefinition or ScheduleDefinition's evaluation function for a run to be launched.\n\n Attributes:\n run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n a :py:class:`PartitionedConfig`, this value will override replace the config\n provided by it.\n tags (Optional[Dict[str, Any]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n job_name (Optional[str]): (Experimental) The name of the job this run request will launch.\n Required for sensors that target multiple jobs.\n asset_selection (Optional[Sequence[AssetKey]]): A sequence of AssetKeys that should be\n launched with this run.\n stale_assets_only (bool): Set to true to further narrow the asset\n selection to stale assets. If passed without an asset selection, all stale assets in the\n job will be materialized. 
If the job does not materialize assets, this flag is ignored.\n partition_key (Optional[str]): The partition key for this run request.\n """\n\n def __new__(\n cls,\n run_key: Optional[str] = None,\n run_config: Optional[Union["RunConfig", Mapping[str, Any]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n job_name: Optional[str] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n stale_assets_only: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.run_config import convert_config_input\n\n return super(RunRequest, cls).__new__(\n cls,\n run_key=check.opt_str_param(run_key, "run_key"),\n run_config=check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n ),\n tags=validate_tags(check.opt_mapping_param(tags, "tags", key_type=str)),\n job_name=check.opt_str_param(job_name, "job_name"),\n asset_selection=check.opt_nullable_sequence_param(\n asset_selection, "asset_selection", of_type=AssetKey\n ),\n stale_assets_only=check.bool_param(stale_assets_only, "stale_assets_only"),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n )\n\n def with_replaced_attrs(self, **kwargs: Any) -> "RunRequest":\n fields = self._asdict()\n for k in fields.keys():\n if k in kwargs:\n fields[k] = kwargs[k]\n return RunRequest(**fields)\n\n def with_resolved_tags_and_config(\n self,\n target_definition: Union["JobDefinition", "UnresolvedAssetJobDefinition"],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "RunRequest":\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.definitions.partition import (\n PartitionedConfig,\n PartitionsDefinition,\n )\n\n if self.partition_key is None:\n check.failed(\n "Cannot resolve partition for run request without partition 
key",\n )\n\n partitions_def = target_definition.partitions_def\n if partitions_def is None:\n check.failed(\n "Cannot resolve partition for run request when target job"\n f" '{target_definition.name}' is unpartitioned.",\n )\n partitions_def = cast(PartitionsDefinition, partitions_def)\n\n partitioned_config = (\n target_definition.partitioned_config\n if isinstance(target_definition, JobDefinition)\n else PartitionedConfig.from_flexible_config(target_definition.config, partitions_def)\n )\n if partitioned_config is None:\n check.failed(\n "Cannot resolve partition for run request on unpartitioned job",\n )\n\n _check_valid_partition_key_after_dynamic_partitions_requests(\n self.partition_key,\n partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n tags = {\n **(self.tags or {}),\n **partitioned_config.get_tags_for_partition_key(\n self.partition_key,\n job_name=target_definition.name,\n ),\n }\n\n return self.with_replaced_attrs(\n run_config=(\n self.run_config\n if self.run_config\n else partitioned_config.get_run_config_for_partition_key(self.partition_key)\n ),\n tags=tags,\n )\n\n def has_resolved_partition(self) -> bool:\n # Backcompat run requests yielded via `run_request_for_partition` already have resolved\n # partitioning\n return self.tags.get(PARTITION_NAME_TAG) is not None if self.partition_key else True
\n\n\ndef _check_valid_partition_key_after_dynamic_partitions_requests(\n partition_key: str,\n partitions_def: "PartitionsDefinition",\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\n from dagster._core.definitions.partition import (\n DynamicPartitionsDefinition,\n )\n\n if isinstance(partitions_def, MultiPartitionsDefinition):\n multipartition_key = partitions_def.get_partition_key_from_str(partition_key)\n\n for dimension in partitions_def.partitions_defs:\n _check_valid_partition_key_after_dynamic_partitions_requests(\n multipartition_key.keys_by_dimension[dimension.name],\n dimension.partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n elif isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name:\n if not dynamic_partitions_store:\n check.failed(\n "Cannot resolve partition for run request on dynamic partitions without"\n " dynamic_partitions_store"\n )\n\n add_partition_keys: Set[str] = set()\n delete_partition_keys: Set[str] = set()\n for req in dynamic_partitions_requests:\n if isinstance(req, AddDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n add_partition_keys.update(set(req.partition_keys))\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n delete_partition_keys.update(set(req.partition_keys))\n\n partition_keys_after_requests_resolved = (\n set(\n dynamic_partitions_store.get_dynamic_partitions(\n partitions_def_name=partitions_def.name\n )\n )\n | add_partition_keys\n ) - delete_partition_keys\n\n if partition_key not in partition_keys_after_requests_resolved:\n check.failed(\n f"Dynamic partition key {partition_key} 
for partitions def"\n f" '{partitions_def.name}' is invalid. After dynamic partitions requests are"\n " applied, it does not exist in the set of valid partition keys."\n )\n\n else:\n partitions_def.validate_partition_key(\n partition_key,\n dynamic_partitions_store=dynamic_partitions_store,\n current_time=current_time,\n )\n\n\n@whitelist_for_serdes(\n storage_name="PipelineRunReaction",\n storage_field_names={\n "dagster_run": "pipeline_run",\n },\n)\nclass DagsterRunReaction(\n NamedTuple(\n "_DagsterRunReaction",\n [\n ("dagster_run", Optional[DagsterRun]),\n ("error", Optional[SerializableErrorInfo]),\n ("run_status", Optional[DagsterRunStatus]),\n ],\n )\n):\n """Represents a request that reacts to an existing dagster run. If success, it will report logs\n back to the run.\n\n Attributes:\n dagster_run (Optional[DagsterRun]): The dagster run that originates this reaction.\n error (Optional[SerializableErrorInfo]): user code execution error.\n run_status: (Optional[DagsterRunStatus]): The run status that triggered the reaction.\n """\n\n def __new__(\n cls,\n dagster_run: Optional[DagsterRun],\n error: Optional[SerializableErrorInfo] = None,\n run_status: Optional[DagsterRunStatus] = None,\n ):\n return super(DagsterRunReaction, cls).__new__(\n cls,\n dagster_run=check.opt_inst_param(dagster_run, "dagster_run", DagsterRun),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n run_status=check.opt_inst_param(run_status, "run_status", DagsterRunStatus),\n )\n\n\n
[docs]@experimental_param(\n param="asset_events", additional_warn_text="Runless asset events are experimental"\n)\nclass SensorResult(\n NamedTuple(\n "_SensorResult",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_reason", Optional[SkipReason]),\n ("cursor", Optional[str]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n List[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n """The result of a sensor evaluation.\n\n Attributes:\n run_requests (Optional[Sequence[RunRequest]]): A list\n of run requests to be executed.\n skip_reason (Optional[Union[str, SkipReason]]): A skip message indicating why sensor\n evaluation was skipped.\n cursor (Optional[str]): The cursor value for this sensor, which will be provided on the\n context for the next sensor evaluation.\n dynamic_partitions_requests (Optional[Sequence[Union[DeleteDynamicPartitionsRequest,\n AddDynamicPartitionsRequest]]]): A list of dynamic partition requests to request dynamic\n partition addition and deletion. Run requests will be evaluated using the state of the\n partitions with these changes applied.\n asset_events (Optional[Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]]): (Experimental) A\n list of materializations, observations, and asset check evaluations that the system\n will persist on your behalf at the end of sensor evaluation. 
These events will be not\n be associated with any particular run, but will be queryable and viewable in the asset catalog.\n\n\n """\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_reason: Optional[Union[str, SkipReason]] = None,\n cursor: Optional[str] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]\n ] = None,\n ):\n if skip_reason and len(run_requests if run_requests else []) > 0:\n check.failed(\n "Expected a single skip reason or one or more run requests: received values for "\n "both run_requests and skip_reason"\n )\n\n skip_reason = check.opt_inst_param(skip_reason, "skip_reason", (SkipReason, str))\n if isinstance(skip_reason, str):\n skip_reason = SkipReason(skip_reason)\n\n return super(SensorResult, cls).__new__(\n cls,\n run_requests=check.opt_sequence_param(run_requests, "run_requests", RunRequest),\n skip_reason=skip_reason,\n cursor=check.opt_str_param(cursor, "cursor"),\n dynamic_partitions_requests=check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n ),\n asset_events=list(\n check.opt_sequence_param(\n asset_events,\n "asset_check_evaluations",\n (AssetObservation, AssetMaterialization, AssetCheckEvaluation),\n )\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_request", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_request"}, "run_status_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_status_sensor_definition

\nimport functools\nimport logging\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n    overload,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, public\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvariantViolationError,\n    RunStatusSensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.events import PIPELINE_RUN_STATUS_TO_EVENT_TYPE, DagsterEvent, DagsterEventType\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus, RunsFilter\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .sensor_definition import (\n    DagsterRunReaction,\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    RunRequest,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorResult,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom 
.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.resource_definition import ResourceDefinition\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nRunStatusSensorEvaluationFunction: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\nRunFailureSensorEvaluationFn: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\n\n\n@whitelist_for_serdes(old_storage_names={"PipelineSensorCursor"})\nclass RunStatusSensorCursor(\n    NamedTuple(\n        "_RunStatusSensorCursor",\n        [("record_id", int), ("update_timestamp", str)],\n    )\n):\n    def __new__(cls, record_id, update_timestamp):\n        return super(RunStatusSensorCursor, cls).__new__(\n            cls,\n            record_id=check.int_param(record_id, "record_id"),\n            update_timestamp=check.str_param(update_timestamp, "update_timestamp"),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            obj = deserialize_value(json_str, RunStatusSensorCursor)\n            return isinstance(obj, RunStatusSensorCursor)\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "RunStatusSensorCursor":\n        return deserialize_value(json_str, RunStatusSensorCursor)\n\n\n
[docs]class RunStatusSensorContext:\n """The ``context`` object available to a decorated function of ``run_status_sensor``."""\n\n def __init__(\n self,\n sensor_name,\n dagster_run,\n dagster_event,\n instance,\n context: Optional[\n SensorEvaluationContext\n ] = None, # deprecated arg, but we need to keep it for backcompat\n resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n logger: Optional[logging.Logger] = None,\n partition_key: Optional[str] = None,\n _resources: Optional[Resources] = None,\n _cm_scope_entered: bool = False,\n ) -> None:\n self._exit_stack = ExitStack()\n self._sensor_name = check.str_param(sensor_name, "sensor_name")\n self._dagster_run = check.inst_param(dagster_run, "dagster_run", DagsterRun)\n self._dagster_event = check.inst_param(dagster_event, "dagster_event", DagsterEvent)\n self._instance = check.inst_param(instance, "instance", DagsterInstance)\n self._logger: Optional[logging.Logger] = logger or (context.log if context else None)\n self._partition_key = check.opt_str_param(partition_key, "partition_key")\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resource_defs\n self._resources = _resources\n self._cm_scope_entered = _cm_scope_entered\n\n def for_run_failure(self) -> "RunFailureSensorContext":\n """Converts RunStatusSensorContext to RunFailureSensorContext."""\n return RunFailureSensorContext(\n sensor_name=self._sensor_name,\n dagster_run=self._dagster_run,\n dagster_event=self._dagster_event,\n instance=self._instance,\n logger=self._logger,\n partition_key=self._partition_key,\n resource_defs=self._resource_defs,\n _resources=self._resources,\n _cm_scope_entered=self._cm_scope_entered,\n )\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n @property\n def resources(self) -> Resources:\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from 
dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n instance = self.instance if self._instance else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def sensor_name(self) -> str:\n """The name of the sensor."""\n return self._sensor_name\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """The run of the job."""\n return self._dagster_run\n\n @public\n @property\n def dagster_event(self) -> DagsterEvent:\n """The event associated with the job run status."""\n return self._dagster_event\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """The current instance."""\n return self._instance\n\n @public\n @property\n def log(self) -> logging.Logger:\n """The logger for the current sensor evaluation."""\n if not self._logger:\n self._logger = InstigationLogger()\n\n return self._logger\n\n @public\n @property\n def partition_key(self) -> Optional[str]:\n """Optional[str]: The partition key of the relevant run."""\n return self._partition_key\n\n def __enter__(self) -> "RunStatusSensorContext":\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None
\n\n\n
[docs]class RunFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``run_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n dagster_run (DagsterRun): the failed run.\n """\n\n @public\n @property\n def failure_event(self) -> DagsterEvent:\n """The run failure event.\n\n If the run failed because of an error inside a step, get_step_failure_events will have more\n details on the step failure.\n """\n return self.dagster_event\n\n
[docs] @public\n def get_step_failure_events(self) -> Sequence[DagsterEvent]:\n """The step failure event for each step in the run that failed.\n\n Examples:\n .. code-block:: python\n\n error_strings_by_step_key = {\n # includes the stack trace\n event.step_key: event.event_specific_data.error.to_string()\n for event in context.get_step_failure_events()\n }\n """\n records = self.instance.get_records_for_run(\n run_id=self.dagster_run.run_id, of_type=DagsterEventType.STEP_FAILURE\n ).records\n return [cast(DagsterEvent, record.event_log_entry.dagster_event) for record in records]
\n\n\n
[docs]def build_run_status_sensor_context(\n sensor_name: str,\n dagster_event: DagsterEvent,\n dagster_instance: DagsterInstance,\n dagster_run: DagsterRun,\n context: Optional[SensorEvaluationContext] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n) -> RunStatusSensorContext:\n """Builds run status sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@run_status_sensor` or `@run_failure_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n dagster_event (DagsterEvent): A DagsterEvent with the same event type as the one that\n triggers the run_status_sensor\n dagster_instance (DagsterInstance): The dagster instance configured for the context.\n dagster_run (DagsterRun): DagsterRun object from running a job\n resources (Optional[Mapping[str, object]]): A dictionary of resources to be made available\n to the sensor.\n\n Examples:\n .. code-block:: python\n\n instance = DagsterInstance.ephemeral()\n result = my_job.execute_in_process(instance=instance)\n\n dagster_run = result.dagster_run\n dagster_event = result.get_job_success_event() # or get_job_failure_event()\n\n context = build_run_status_sensor_context(\n sensor_name="run_status_sensor_to_invoke",\n dagster_instance=instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )\n run_status_sensor_to_invoke(context)\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return RunStatusSensorContext(\n sensor_name=sensor_name,\n instance=dagster_instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n resource_defs=wrap_resources_for_execution(resources),\n logger=context.log if context else None,\n partition_key=partition_key,\n )
\n\n\n@overload\ndef run_failure_sensor(\n name: RunFailureSensorEvaluationFn,\n) -> SensorDefinition: ...\n\n\n@overload\ndef run_failure_sensor(\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]: ...\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef run_failure_sensor(\n name: Optional[Union[RunFailureSensorEvaluationFn, str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Union[SensorDefinition, Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]]:\n """Creates a sensor that reacts to job failure events, where the decorated function will be\n run when a run fails.\n\n Takes a :py:class:`~dagster.RunFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the job failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this failure sensor.\n Defaults to None, which means the alert will be sent when any job in the current\n repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. 
If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs in the current repository that will be\n monitored by this failure sensor. Defaults to None, which means the alert will be sent\n when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def inner(\n fn: RunFailureSensorEvaluationFn,\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_status_sensor(\n run_status=DagsterRunStatus.FAILURE,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n request_job=request_job,\n request_jobs=request_jobs,\n )\n @functools.wraps(fn)\n def _run_failure_sensor(*args, **kwargs) -> Any:\n args_modified = [\n arg.for_run_failure() if isinstance(arg, RunStatusSensorContext) else arg\n for arg in args\n ]\n kwargs_modified = {\n k: v.for_run_failure() if isinstance(v, RunStatusSensorContext) else v\n for k, v in kwargs.items()\n }\n return fn(*args_modified, **kwargs_modified)\n\n return _run_failure_sensor\n\n # This case is for when 
decorator is used bare, without arguments\n if callable(name):\n return inner(name)\n\n return inner
\n\n\n
[docs]class RunStatusSensorDefinition(SensorDefinition):\n """Define a sensor that reacts to a given status of job execution, where the decorated\n function will be evaluated when a run is at the given status.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n run_status (DagsterRunStatus): The status of a run which will be\n monitored by the sensor.\n run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, DagsterRunReaction]]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.RunStatusSensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, JobSelector, RepositorySelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this sensor. Defaults to\n None, which means the alert will be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def __init__(\n self,\n name: str,\n run_status: DagsterRunStatus,\n run_status_sensor_fn: RunStatusSensorEvaluationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._core.definitions.selector import (\n CodeLocationSelector,\n JobSelector,\n RepositorySelector,\n )\n from dagster._core.event_api import RunShardedEventsCursor\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n check.str_param(name, "name")\n check.inst_param(run_status, "run_status", DagsterRunStatus)\n check.callable_param(run_status_sensor_fn, "run_status_sensor_fn")\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.opt_list_param(\n monitored_jobs,\n "monitored_jobs",\n (\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n RepositorySelector,\n JobSelector,\n CodeLocationSelector,\n ),\n )\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n resource_arg_names: Set[str] = {arg.name for arg in 
get_resource_args(run_status_sensor_fn)}\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n # coerce CodeLocationSelectors to RepositorySelectors with repo name "__repository__"\n monitored_jobs = [\n job.to_repository_selector() if isinstance(job, CodeLocationSelector) else job\n for job in (monitored_jobs or [])\n ]\n\n self._run_status_sensor_fn = check.callable_param(\n run_status_sensor_fn, "run_status_sensor_fn"\n )\n event_type = PIPELINE_RUN_STATUS_TO_EVENT_TYPE[run_status]\n\n # split monitored_jobs into external repos, external jobs, and jobs in the current repo\n other_repos = (\n [x for x in monitored_jobs if isinstance(x, RepositorySelector)]\n if monitored_jobs\n else []\n )\n\n other_repo_jobs = (\n [x for x in monitored_jobs if isinstance(x, JobSelector)] if monitored_jobs else []\n )\n\n current_repo_jobs = (\n [x for x in monitored_jobs if not isinstance(x, (JobSelector, RepositorySelector))]\n if monitored_jobs\n else []\n )\n\n def _wrapped_fn(\n context: SensorEvaluationContext,\n ) -> Iterator[Union[RunRequest, SkipReason, DagsterRunReaction, SensorResult]]:\n # initiate the cursor to (most recent event id, current timestamp) when:\n # * it's the first time starting the sensor\n # * or, the cursor isn't in valid format (backcompt)\n if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):\n most_recent_event_records = list(\n context.instance.get_event_records(\n EventRecordsFilter(event_type=event_type), ascending=False, limit=1\n )\n )\n most_recent_event_id = (\n most_recent_event_records[0].storage_id\n if len(most_recent_event_records) == 1\n else -1\n )\n\n new_cursor = RunStatusSensorCursor(\n update_timestamp=pendulum.now("UTC").isoformat(),\n record_id=most_recent_event_id,\n )\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initiating {name}. 
Set cursor to {new_cursor}")\n return\n\n record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)\n\n # Fetch events after the cursor id\n # * we move the cursor forward to the latest visited event's id to avoid revisits\n # * when the daemon is down, bc we persist the cursor info, we can go back to where we\n # left and backfill alerts for the qualified events (up to 5 at a time) during the downtime\n # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n after_cursor=RunShardedEventsCursor(\n id=record_id,\n run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),\n ),\n event_type=event_type,\n ),\n ascending=True,\n limit=5,\n )\n\n for event_record in event_records:\n event_log_entry = event_record.event_log_entry\n storage_id = event_record.storage_id\n\n # get run info\n run_records = context.instance.get_run_records(\n filters=RunsFilter(run_ids=[event_log_entry.run_id])\n )\n\n # skip if we couldn't find the right run\n if len(run_records) != 1:\n # bc we couldn't find the run, we use the event timestamp as the approximate\n # run update timestamp\n approximate_update_timestamp = utc_datetime_from_timestamp(\n event_log_entry.timestamp\n )\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=approximate_update_timestamp.isoformat(),\n ).to_json()\n )\n continue\n\n dagster_run = run_records[0].dagster_run\n update_timestamp = run_records[0].update_timestamp\n\n job_match = False\n\n # if monitor_all_repositories is provided, then we want to run the sensor for all jobs in all repositories\n if monitor_all_repositories:\n job_match = True\n\n # check if the run is in the current repository and (if provided) one of jobs specified in monitored_jobs\n if (\n not job_match\n and\n # the job has a repository (not manually executed)\n 
dagster_run.external_job_origin\n and\n # the job belongs to the current repository\n dagster_run.external_job_origin.external_repository_origin.repository_name\n == context.repository_name\n ):\n if monitored_jobs:\n if dagster_run.job_name in map(lambda x: x.name, current_repo_jobs):\n job_match = True\n else:\n job_match = True\n\n if not job_match:\n # check if the run is one of the jobs specified by JobSelector or RepositorySelector (ie in another repo)\n # make a JobSelector for the run in question\n external_repository_origin = check.not_none(\n dagster_run.external_job_origin\n ).external_repository_origin\n run_job_selector = JobSelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n job_name=dagster_run.job_name,\n )\n if run_job_selector in other_repo_jobs:\n job_match = True\n\n # make a RepositorySelector for the run in question\n run_repo_selector = RepositorySelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n )\n if run_repo_selector in other_repos:\n job_match = True\n\n if not job_match:\n # the run in question doesn't match any of the criteria for we advance the cursor and move on\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n continue\n\n serializable_error = None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n try:\n with RunStatusSensorContext(\n sensor_name=name,\n dagster_run=dagster_run,\n dagster_event=event_log_entry.dagster_event,\n instance=context.instance,\n resource_defs=context.resource_defs,\n logger=context.log,\n partition_key=dagster_run.tags.get("dagster/partition"),\n ) as sensor_context, user_code_error_boundary(\n RunStatusSensorExecutionError,\n lambda: f'Error occurred 
during the execution sensor "{name}".',\n ):\n context_param_name = get_context_param_name(run_status_sensor_fn)\n context_param = (\n {context_param_name: sensor_context} if context_param_name else {}\n )\n\n sensor_return = run_status_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n\n if sensor_return is not None:\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=update_timestamp.isoformat(),\n ).to_json()\n )\n\n if isinstance(sensor_return, SensorResult):\n if sensor_return.cursor:\n raise DagsterInvariantViolationError(\n f"Error in run status sensor {name}: Sensor returned a"\n " SensorResult with a cursor value. The cursor is managed"\n " by the sensor and should not be modified by a user."\n )\n yield sensor_return\n elif isinstance(\n sensor_return,\n (RunRequest, SkipReason, DagsterRunReaction),\n ):\n yield sensor_return\n else:\n yield from sensor_return\n return\n except RunStatusSensorExecutionError as run_status_sensor_execution_error:\n # When the user code errors, we report error to the sensor tick not the original run.\n serializable_error = serializable_error_info_from_exc_info(\n run_status_sensor_execution_error.original_exc_info\n )\n\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n\n # Yield DagsterRunReaction to indicate the execution success/failure.\n # The sensor machinery would\n # * report back to the original run if success\n # * update cursor and job state\n yield DagsterRunReaction(\n dagster_run=dagster_run,\n run_status=run_status,\n error=serializable_error,\n )\n\n super(RunStatusSensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n job=request_job,\n jobs=request_jobs,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def 
__call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name = get_context_param_name(self._run_status_sensor_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._run_status_sensor_fn,\n args,\n kwargs,\n context_type=RunStatusSensorContext,\n )\n context_param = {context_param_name: context} if context_param_name and context else {}\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n return self._run_status_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.RUN_STATUS
\n\n\n
@deprecated_param(
    param="job_selection",
    breaking_version="2.0",
    additional_warn_text="Use `monitored_jobs` instead.",
)
def run_status_sensor(
    run_status: DagsterRunStatus,
    name: Optional[str] = None,
    minimum_interval_seconds: Optional[int] = None,
    description: Optional[str] = None,
    monitored_jobs: Optional[
        Sequence[
            Union[
                JobDefinition,
                GraphDefinition,
                UnresolvedAssetJobDefinition,
                "RepositorySelector",
                "JobSelector",
                "CodeLocationSelector",
            ]
        ]
    ] = None,
    job_selection: Optional[
        Sequence[
            Union[
                JobDefinition,
                GraphDefinition,
                UnresolvedAssetJobDefinition,
                "RepositorySelector",
                "JobSelector",
                "CodeLocationSelector",
            ]
        ]
    ] = None,
    monitor_all_repositories: bool = False,
    default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,
    request_job: Optional[ExecutableDefinition] = None,
    request_jobs: Optional[Sequence[ExecutableDefinition]] = None,
) -> Callable[[RunStatusSensorEvaluationFunction], RunStatusSensorDefinition,]:
    """Creates a sensor that reacts to a given status of job execution, where the decorated
    function will be run when a job is at the given status.

    Takes a :py:class:`~dagster.RunStatusSensorContext`.

    Args:
        run_status (DagsterRunStatus): The status of run execution which will be
            monitored by the sensor.
        name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.
        minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse
            between sensor evaluations.
        description (Optional[str]): A human-readable description of the sensor.
        monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):
            Jobs in the current repository that will be monitored by this sensor. Defaults to None, which means the alert will
            be sent when any job in the repository matches the requested run_status. Jobs in external repositories can be monitored by using
            RepositorySelector or JobSelector.
        monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the Dagster instance.
            If set to True, an error will be raised if you also specify monitored_jobs or job_selection.
            Defaults to False.
        job_selection (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):
            (deprecated in favor of monitored_jobs) Jobs in the current repository that will be
            monitored by this sensor. Defaults to None, which means the alert will be sent when
            any job in the repository matches the requested run_status.
        default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default
            status can be overridden from the Dagster UI or via the GraphQL API.
        request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job that should be
            executed if a RunRequest is yielded from the sensor.
        request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]): (experimental)
            A list of jobs to be executed if RunRequests are yielded from the sensor.

    Raises:
        DagsterInvalidDefinitionError: When the decorator is applied, if
            ``monitor_all_repositories`` is True and ``monitored_jobs`` or ``job_selection``
            is also provided.
    """

    def inner(
        fn: RunStatusSensorEvaluationFunction,
    ) -> RunStatusSensorDefinition:
        check.callable_param(fn, "fn")
        sensor_name = name or fn.__name__

        # `monitored_jobs` wins over the deprecated `job_selection` when both are given.
        jobs = monitored_jobs if monitored_jobs else job_selection

        if jobs and monitor_all_repositories:
            # The exception must actually be raised: merely constructing it silently
            # accepted a contradictory configuration.
            raise DagsterInvalidDefinitionError(
                "Cannot specify both monitor_all_repositories and"
                f" {'monitored_jobs' if monitored_jobs else 'job_selection'}."
            )

        return RunStatusSensorDefinition(
            name=sensor_name,
            run_status=run_status,
            run_status_sensor_fn=fn,
            minimum_interval_seconds=minimum_interval_seconds,
            description=description,
            monitored_jobs=jobs,
            monitor_all_repositories=monitor_all_repositories,
            default_status=default_status,
            request_job=request_job,
            request_jobs=request_jobs,
        )

    return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_status_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_status_sensor_definition"}, "schedule_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.schedule_definition

import copy
import logging
from contextlib import ExitStack
from datetime import datetime
from enum import Enum
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterator,
    List,
    Mapping,
    NamedTuple,
    Optional,
    Sequence,
    Set,
    TypeVar,
    Union,
    cast,
)

import pendulum
from typing_extensions import TypeAlias

import dagster._check as check
from dagster._annotations import deprecated, deprecated_param, public
from dagster._core.definitions.instigation_logger import InstigationLogger
from dagster._core.definitions.resource_annotation import get_resource_args
from dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder
from dagster._serdes import whitelist_for_serdes
from dagster._utils import IHasInternalInit, ensure_gen
from dagster._utils.merger import merge_dicts
from dagster._utils.schedules import is_valid_cron_schedule

from ..decorator_utils import has_at_least_one_parameter
from ..errors import (
    DagsterInvalidDefinitionError,
    DagsterInvalidInvocationError,
    DagsterInvariantViolationError,
    ScheduleExecutionError,
    user_code_error_boundary,
)
from ..instance import DagsterInstance
from ..instance.ref import InstanceRef
from ..storage.dagster_run import DagsterRun
from .graph_definition import GraphDefinition
from .job_definition import JobDefinition
from .run_request import RunRequest, SkipReason
from .target import DirectTarget, ExecutableDefinition, RepoRelativeTarget
from .unresolved_asset_job_definition import UnresolvedAssetJobDefinition
from .utils import check_valid_name, validate_tags

if TYPE_CHECKING:
    from dagster import ResourceDefinition
    from dagster._core.definitions.repository_definition import RepositoryDefinition
T = TypeVar("T")

RunConfig: TypeAlias = Mapping[str, Any]
RunRequestIterator: TypeAlias = Iterator[Union[RunRequest, SkipReason]]

ScheduleEvaluationFunctionReturn: TypeAlias = Union[
    RunRequest, SkipReason, RunConfig, RunRequestIterator, Sequence[RunRequest]
]
RawScheduleEvaluationFunction: TypeAlias = Callable[..., ScheduleEvaluationFunctionReturn]

ScheduleRunConfigFunction: TypeAlias = Union[
    Callable[["ScheduleEvaluationContext"], RunConfig],
    Callable[[], RunConfig],
]

ScheduleTagsFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], Mapping[str, str]]
ScheduleShouldExecuteFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], bool]
ScheduleExecutionFunction: TypeAlias = Union[
    Callable[["ScheduleEvaluationContext"], Any],
    "DecoratedScheduleFunction",
]


@whitelist_for_serdes
class DefaultScheduleStatus(Enum):
    RUNNING = "RUNNING"
    STOPPED = "STOPPED"


def get_or_create_schedule_context(
    fn: Callable, *args: Any, **kwargs: Any
) -> "ScheduleEvaluationContext":
    """Resolve the ScheduleEvaluationContext to use for a direct schedule invocation.

    Returns the context the caller supplied (positionally or by keyword), or a freshly
    built one when none was supplied. Keyword arguments whose names match resource
    parameters of ``fn`` are merged into the returned context as resources.

    Raises:
        DagsterInvalidInvocationError: If more than one non-resource argument is passed,
            if a resource is passed positionally, if keyword arguments are given without
            the context parameter that ``fn`` declares, or if ``fn`` declares a context
            parameter and no arguments are passed at all.
    """
    from dagster._config.pythonic_config import is_coercible_to_resource
    from dagster._core.definitions.sensor_definition import get_context_param_name

    ctx_param = get_context_param_name(fn)
    resource_param_names = {param.name for param in get_resource_args(fn)}

    # Anything that is not a resource kwarg counts towards the single allowed context argument.
    non_resource_kwargs = set(kwargs) - resource_param_names
    if len(args) + len(non_resource_kwargs) > 1:
        raise DagsterInvalidInvocationError(
            "Schedule invocation received multiple non-resource arguments. Only a first "
            "positional context parameter should be provided when invoking."
        )

    if any(is_coercible_to_resource(arg) for arg in args):
        raise DagsterInvalidInvocationError(
            "If directly invoking a schedule, you may not provide resources as"
            " positional arguments, only as keyword arguments."
        )

    context: Optional[ScheduleEvaluationContext] = None

    if args:
        context = check.opt_inst(args[0], ScheduleEvaluationContext)
    elif kwargs:
        if ctx_param and ctx_param not in kwargs:
            raise DagsterInvalidInvocationError(
                f"Schedule invocation expected argument '{ctx_param}'."
            )
        supplied = kwargs.get(ctx_param or "context")
        context = check.opt_inst(supplied, ScheduleEvaluationContext)
    elif ctx_param:
        # If the context parameter is present but no value was provided, we error
        raise DagsterInvalidInvocationError(
            "Schedule evaluation function expected context argument, but no context argument "
            "was provided when invoking."
        )

    context = context or build_schedule_context()

    kwarg_resources = {key: kwargs[key] for key in resource_param_names if key in kwargs}
    if not kwarg_resources:
        return context

    return context.merge_resources(kwarg_resources)
class ScheduleEvaluationContext:
    """The context object available as the first argument to various functions defined on a :py:class:`dagster.ScheduleDefinition`.

    A `ScheduleEvaluationContext` object is passed as the first argument to ``run_config_fn``, ``tags_fn``,
    and ``should_execute``.

    Users should not instantiate this object directly. To construct a `ScheduleEvaluationContext` for testing purposes, use :py:func:`dagster.build_schedule_context`.

    Example:
        .. code-block:: python

            from dagster import schedule, ScheduleEvaluationContext

            @schedule
            def the_schedule(context: ScheduleEvaluationContext):
                ...

    """

    __slots__ = [
        "_instance_ref",
        "_scheduled_execution_time",
        "_exit_stack",
        "_instance",
        "_log_key",
        "_logger",
        "_repository_name",
        "_resource_defs",
        "_schedule_name",
        "_resources_cm",
        "_resources",
        "_cm_scope_entered",
        "_repository_def",
    ]

    def __init__(
        self,
        instance_ref: Optional[InstanceRef],
        scheduled_execution_time: Optional[datetime],
        repository_name: Optional[str] = None,
        schedule_name: Optional[str] = None,
        resources: Optional[Mapping[str, "ResourceDefinition"]] = None,
        repository_def: Optional["RepositoryDefinition"] = None,
    ):
        from dagster._core.definitions.repository_definition import RepositoryDefinition

        # The exit stack owns everything opened lazily by this context (instance,
        # resources, logger) and is closed in `__exit__`.
        self._exit_stack = ExitStack()
        self._instance = None

        self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)
        self._scheduled_execution_time = check.opt_inst_param(
            scheduled_execution_time, "scheduled_execution_time", datetime
        )
        # A log key is only formed when all three of its components are known.
        self._log_key = (
            [
                repository_name,
                schedule_name,
                scheduled_execution_time.strftime("%Y%m%d_%H%M%S"),
            ]
            if repository_name and schedule_name and scheduled_execution_time
            else None
        )
        self._logger = None
        self._repository_name = repository_name
        self._schedule_name = schedule_name

        # Wait to set resources unless they're accessed
        self._resource_defs = resources
        self._resources = None
        self._cm_scope_entered = False
        self._repository_def = check.opt_inst_param(
            repository_def, "repository_def", RepositoryDefinition
        )

    def __enter__(self) -> "ScheduleEvaluationContext":
        self._cm_scope_entered = True
        return self

    def __exit__(self, *exc) -> None:
        self._exit_stack.close()
        self._logger = None

    @property
    def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:
        """The resource definitions this context was constructed with, if any."""
        return self._resource_defs

    @public
    @property
    def resources(self) -> Resources:
        """Mapping of resource key to resource definition to be made available
        during schedule execution.
        """
        from dagster._core.definitions.scoped_resources_builder import (
            IContainsGenerator,
        )
        from dagster._core.execution.build_resources import build_resources

        if not self._resources:
            # Early exit if no resources are defined. This skips unnecessary initialization
            # entirely. This allows users to run user code servers in cases where they
            # do not have access to the instance if they use a subset of features
            # that do not require instance access. In this case, if they do not use
            # resources on schedules they do not require the instance, so we do not
            # instantiate it
            #
            # Tracking at https://github.com/dagster-io/dagster/issues/14345
            if not self._resource_defs:
                self._resources = ScopedResourcesBuilder.build_empty()
                return self._resources

            instance = self.instance if self._instance or self._instance_ref else None

            resources_cm = build_resources(resources=self._resource_defs, instance=instance)
            self._resources = self._exit_stack.enter_context(resources_cm)

            if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:
                self._exit_stack.close()
                raise DagsterInvariantViolationError(
                    "At least one provided resource is a generator, but attempting to access"
                    " resources outside of context manager scope. You can use the following syntax"
                    " to open a context manager: `with build_schedule_context(...) as context:`"
                )

        return self._resources

    def merge_resources(self, resources_dict: Mapping[str, Any]) -> "ScheduleEvaluationContext":
        """Merge the specified resources into this context.
        This method is intended to be used by the Dagster framework, and should not be called by user code.

        Args:
            resources_dict (Mapping[str, Any]): The resources to replace in the context.

        Returns:
            ScheduleEvaluationContext: A new context carrying the merged resources; entries
            in ``resources_dict`` override existing definitions with the same key.
        """
        check.invariant(
            self._resources is None, "Cannot merge resources in context that has been initialized."
        )
        from dagster._core.execution.build_resources import wrap_resources_for_execution

        return ScheduleEvaluationContext(
            instance_ref=self._instance_ref,
            scheduled_execution_time=self._scheduled_execution_time,
            repository_name=self._repository_name,
            schedule_name=self._schedule_name,
            resources={
                **(self._resource_defs or {}),
                **wrap_resources_for_execution(resources_dict),
            },
            repository_def=self._repository_def,
        )

    @public
    @property
    def instance(self) -> "DagsterInstance":
        """DagsterInstance: The current DagsterInstance."""
        # self._instance_ref should only ever be None when this ScheduleEvaluationContext was
        # constructed under test.
        if not self._instance_ref:
            raise DagsterInvariantViolationError(
                "Attempted to initialize dagster instance, but no instance reference was provided."
            )
        if not self._instance:
            self._instance = self._exit_stack.enter_context(
                DagsterInstance.from_ref(self._instance_ref)
            )
        return cast(DagsterInstance, self._instance)

    @property
    def instance_ref(self) -> Optional[InstanceRef]:
        """The serialized instance configured to run the schedule."""
        return self._instance_ref

    @public
    @property
    def scheduled_execution_time(self) -> datetime:
        """The time in which the execution was scheduled to happen. May differ slightly
        from both the actual execution time and the time at which the run config is computed.
        """
        if self._scheduled_execution_time is None:
            check.failed(
                "Attempting to access scheduled_execution_time, but no scheduled_execution_time was"
                " set on this context"
            )

        return self._scheduled_execution_time

    @property
    def log(self) -> logging.Logger:
        """Logger for this evaluation, created on first access and reused afterwards."""
        if self._logger:
            return self._logger

        if not self._instance_ref:
            # No instance is available, so build the logger without one. Accessing
            # `self.instance` here would raise, so the two cases must be exclusive.
            logger_cm = InstigationLogger(
                self._log_key,
                repository_name=self._repository_name,
                name=self._schedule_name,
            )
        else:
            logger_cm = InstigationLogger(
                self._log_key,
                self.instance,
                repository_name=self._repository_name,
                name=self._schedule_name,
            )

        self._logger = self._exit_stack.enter_context(logger_cm)
        return cast(InstigationLogger, self._logger)

    def has_captured_logs(self):
        """Return a truthy value if a logger was created and it reports captured logs."""
        return self._logger and self._logger.has_captured_logs()

    @property
    def log_key(self) -> Optional[List[str]]:
        """The log key computed at construction, or None if it could not be formed."""
        return self._log_key

    @property
    def repository_def(self) -> "RepositoryDefinition":
        """The repository definition supplied at construction.

        Raises:
            DagsterInvariantViolationError: If no repository definition was provided.
        """
        if not self._repository_def:
            raise DagsterInvariantViolationError(
                "Attempted to access repository_def, but no repository_def was provided."
            )
        return self._repository_def
class DecoratedScheduleFunction(NamedTuple):
    """Wrapper around the decorated schedule function. Keeps track of both to better support the
    optimal return value for direct invocation of the evaluation function.
    """

    # The raw user function; ScheduleDefinition.__call__ invokes it directly and
    # ScheduleDefinition.__init__ inspects it for resource arguments.
    decorated_fn: RawScheduleEvaluationFunction
    # Callable taking a ScheduleEvaluationContext and returning an iterator of
    # RunRequest / SkipReason objects.
    wrapped_fn: Callable[[ScheduleEvaluationContext], RunRequestIterator]
    # Presumably whether `decorated_fn` declares a context parameter; confirm against
    # the schedule decorator that builds this tuple.
    has_context_arg: bool
def build_schedule_context(
    instance: Optional[DagsterInstance] = None,
    scheduled_execution_time: Optional[datetime] = None,
    resources: Optional[Mapping[str, object]] = None,
    repository_def: Optional["RepositoryDefinition"] = None,
    instance_ref: Optional["InstanceRef"] = None,
) -> ScheduleEvaluationContext:
    """Builds schedule execution context using the provided parameters.

    An explicit ``instance_ref`` takes precedence. Otherwise a ref is taken from
    ``instance``, but only when that instance is persistent; in every other case the
    context is built without an instance ref.

    Args:
        instance (Optional[DagsterInstance]): The dagster instance configured to run the schedule.
        scheduled_execution_time (datetime): The time in which the execution was scheduled to
            happen. May differ slightly from both the actual execution time and the time at which
            the run config is computed.
        resources (Optional[Mapping[str, object]]): Resources to make available on the context;
            they are wrapped for execution before being handed to it.
        repository_def (Optional[RepositoryDefinition]): Passed through to the context.
        instance_ref (Optional[InstanceRef]): Instance ref to use instead of deriving one
            from ``instance``.

    Examples:
        .. code-block:: python

            context = build_schedule_context(instance)

    """
    from dagster._core.execution.build_resources import wrap_resources_for_execution

    check.opt_inst_param(instance, "instance", DagsterInstance)

    if instance_ref:
        resolved_ref = instance_ref
    elif instance and instance.is_persistent:
        resolved_ref = instance.get_ref()
    else:
        resolved_ref = None

    return ScheduleEvaluationContext(
        instance_ref=resolved_ref,
        scheduled_execution_time=check.opt_inst_param(
            scheduled_execution_time, "scheduled_execution_time", datetime
        ),
        resources=wrap_resources_for_execution(resources),
        repository_def=repository_def,
    )
@whitelist_for_serdes
class ScheduleExecutionData(
    NamedTuple(
        "_ScheduleExecutionData",
        [
            ("run_requests", Optional[Sequence[RunRequest]]),
            ("skip_message", Optional[str]),
            ("captured_log_key", Optional[Sequence[str]]),
        ],
    )
):
    """Serializable outcome of one schedule evaluation: run requests or a skip message
    (never both), plus an optional captured log key.
    """

    def __new__(
        cls,
        run_requests: Optional[Sequence[RunRequest]] = None,
        skip_message: Optional[str] = None,
        captured_log_key: Optional[Sequence[str]] = None,
    ):
        # Validate each field, then enforce that the two outcomes are mutually exclusive.
        check.opt_sequence_param(run_requests, "run_requests", RunRequest)
        check.opt_str_param(skip_message, "skip_message")
        check.opt_list_param(captured_log_key, "captured_log_key", str)
        check.invariant(
            not run_requests or not skip_message, "Found both skip data and run request data"
        )
        return super(ScheduleExecutionData, cls).__new__(
            cls, run_requests, skip_message, captured_log_key
        )


def validate_and_get_schedule_resource_dict(
    resources: Resources, schedule_name: str, required_resource_keys: Set[str]
) -> Dict[str, Any]:
    """Check that every required resource key is present on ``resources`` and return the
    matching resource objects keyed by resource key.

    Raises:
        DagsterInvalidDefinitionError: If a required key is not an attribute of ``resources``.
    """
    # First pass: fail on the first absent key before reading any resource value.
    for key in required_resource_keys:
        if hasattr(resources, key):
            continue
        raise DagsterInvalidDefinitionError(
            f"Resource with key '{key}' required by schedule '{schedule_name}' was not provided."
        )

    resolved: Dict[str, Any] = {}
    for key in required_resource_keys:
        resolved[key] = getattr(resources, key)
    return resolved
[docs]@deprecated_param(\n param="environment_vars",\n breaking_version="2.0",\n additional_warn_text=(\n "It is no longer necessary. Schedules will have access to all environment variables set in"\n " the containing environment, and can safely be deleted."\n ),\n)\nclass ScheduleDefinition(IHasInternalInit):\n """Define a schedule that targets a job.\n\n Args:\n name (Optional[str]): The name of the schedule to create. Defaults to the job name plus\n "_schedule".\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n execution_fn (Callable[ScheduleEvaluationContext]): The core evaluation function for the\n schedule, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.ScheduleEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n run_config (Optional[Mapping]): The config that parameterizes this execution,\n as a dict.\n run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Mapping]]]): A function that\n takes a ScheduleEvaluationContext object and returns the run configuration that\n parameterizes this execution, as a dict. You may set only one of ``run_config``,\n ``run_config_fn``, and ``execution_fn``.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]]): A\n function that generates tags to attach to the schedules runs. 
Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags``, ``tags_fn``, and ``execution_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs\n at schedule execution time to determine whether a schedule should execute or skip. Takes\n a :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "ScheduleDefinition":\n """Returns a copy of this schedule with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return ScheduleDefinition.dagster_internal_init(\n name=self.name,\n cron_schedule=self._cron_schedule,\n job_name=self.job_name,\n execution_timezone=self.execution_timezone,\n execution_fn=self._execution_fn,\n description=self.description,\n job=new_job,\n default_status=self.default_status,\n environment_vars=self._environment_vars,\n required_resource_keys=self._raw_required_resource_keys,\n run_config=None, # run_config, tags, should_execute encapsulated in execution_fn\n run_config_fn=None,\n tags=None,\n tags_fn=None,\n should_execute=None,\n )\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n cron_schedule: Optional[Union[str, Sequence[str]]] = None,\n job_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n run_config_fn: Optional[ScheduleRunConfigFunction] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[ScheduleTagsFunction] = None,\n should_execute: Optional[ScheduleShouldExecuteFunction] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n execution_fn: Optional[ScheduleExecutionFunction] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._cron_schedule = check.inst_param(cron_schedule, "cron_schedule", (str, Sequence))\n if not isinstance(self._cron_schedule, str):\n check.sequence_param(self._cron_schedule, "cron_schedule", of_type=str) # 
type: ignore\n\n if not is_valid_cron_schedule(self._cron_schedule): # type: ignore\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{self._cron_schedule}' for schedule '{name}''. "\n "Dagster recognizes standard cron expressions consisting of 5 fields."\n )\n\n if job is not None:\n self._target: Union[DirectTarget, RepoRelativeTarget] = DirectTarget(job)\n else:\n self._target = RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n\n if name:\n self._name = check_valid_name(name)\n elif job_name:\n self._name = job_name + "_schedule"\n elif job:\n self._name = job.name + "_schedule"\n\n self._description = check.opt_str_param(description, "description")\n\n self._environment_vars = check.opt_mapping_param(\n environment_vars, "environment_vars", key_type=str, value_type=str\n )\n\n self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")\n\n if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both execution_fn and individual run_config/tags arguments "\n "to ScheduleDefinition. Must provide only one of the two."\n )\n elif execution_fn:\n self._execution_fn: Optional[Union[Callable[..., Any], DecoratedScheduleFunction]] = (\n None\n )\n if isinstance(execution_fn, DecoratedScheduleFunction):\n self._execution_fn = execution_fn\n else:\n self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")\n self._run_config_fn = None\n else:\n if run_config_fn and run_config:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both run_config_fn and run_config as arguments"\n " to ScheduleDefinition. 
Must provide only one of the two."\n )\n\n def _default_run_config_fn(context: ScheduleEvaluationContext) -> RunConfig:\n return check.opt_dict_param(run_config, "run_config")\n\n self._run_config_fn = check.opt_callable_param(\n run_config_fn, "run_config_fn", default=_default_run_config_fn\n )\n\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n tags = validate_tags(tags, allow_reserved_tags=False)\n tags_fn = lambda _context: tags\n else:\n tags_fn = check.opt_callable_param(\n tags_fn, "tags_fn", default=lambda _context: cast(Mapping[str, str], {})\n )\n self._tags_fn = tags_fn\n self._tags = tags\n\n self._should_execute: ScheduleShouldExecuteFunction = check.opt_callable_param(\n should_execute, "should_execute", default=lambda _context: True\n )\n\n # Several type-ignores are present in this function to work around bugs in mypy\n # inference.\n def _execution_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of should_execute for schedule {name}"\n ),\n ):\n if not self._should_execute(context):\n yield SkipReason(f"should_execute function for {name} returned false.")\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of run_config_fn for schedule {name}"\n ),\n ):\n _run_config_fn = check.not_none(self._run_config_fn)\n evaluated_run_config = copy.deepcopy(\n _run_config_fn(context)\n if has_at_least_one_parameter(_run_config_fn)\n else _run_config_fn() # type: ignore # (strict type guard)\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {name}",\n ):\n evaluated_tags = validate_tags(tags_fn(context), allow_reserved_tags=False)\n\n 
yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n\n self._execution_fn = _execution_fn\n\n if self._execution_timezone:\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(self._execution_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n f"Invalid execution timezone {self._execution_timezone} for {name}"\n ) from e\n\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultScheduleStatus\n )\n\n resource_arg_names: Set[str] = (\n {arg.name for arg in get_resource_args(self._execution_fn.decorated_fn)}\n if isinstance(self._execution_fn, DecoratedScheduleFunction)\n else set()\n )\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @schedule decorator and as arguments to"\n " the decorated function",\n )\n\n self._raw_required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n cron_schedule: Optional[Union[str, Sequence[str]]],\n job_name: Optional[str],\n run_config: Optional[Any],\n run_config_fn: Optional[ScheduleRunConfigFunction],\n tags: Optional[Mapping[str, str]],\n tags_fn: Optional[ScheduleTagsFunction],\n should_execute: Optional[ScheduleShouldExecuteFunction],\n environment_vars: Optional[Mapping[str, str]],\n execution_timezone: Optional[str],\n execution_fn: Optional[ScheduleExecutionFunction],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n default_status: DefaultScheduleStatus,\n required_resource_keys: Optional[Set[str]],\n ) -> "ScheduleDefinition":\n return ScheduleDefinition(\n name=name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n run_config=run_config,\n 
run_config_fn=run_config_fn,\n tags=tags,\n tags_fn=tags_fn,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n execution_fn=execution_fn,\n description=description,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> ScheduleEvaluationFunctionReturn:\n from dagster._core.definitions.sensor_definition import get_context_param_name\n\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n if not isinstance(self._execution_fn, DecoratedScheduleFunction):\n raise DagsterInvalidInvocationError(\n "Schedule invocation is only supported for schedules created via the schedule "\n "decorators."\n )\n\n context_param_name = get_context_param_name(self._execution_fn.decorated_fn)\n context = get_or_create_schedule_context(self._execution_fn.decorated_fn, *args, **kwargs)\n context_param = {context_param_name: context} if context_param_name else {}\n\n resources = validate_and_get_schedule_resource_dict(\n context.resources, self._name, self._required_resource_keys\n )\n result = self._execution_fn.decorated_fn(**context_param, **resources)\n\n if isinstance(result, dict):\n return copy.deepcopy(result)\n else:\n return result\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the schedule."""\n return self._name\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the job targeted by this schedule."""\n return self._target.job_name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this schedule."""\n return self._description\n\n @public\n @property\n def cron_schedule(self) -> Union[str, Sequence[str]]:\n """Union[str, Sequence[str]]: The cron schedule representing when this schedule will be evaluated."""\n return self._cron_schedule # type: ignore\n\n @public\n @deprecated(\n breaking_version="2.0",\n 
additional_warn_text="Setting this property no longer has any effect.",\n )\n @property\n def environment_vars(self) -> Mapping[str, str]:\n """Mapping[str, str]: Environment variables to export to the cron schedule."""\n return self._environment_vars\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this schedule."""\n return self._required_resource_keys\n\n @public\n @property\n def execution_timezone(self) -> Optional[str]:\n """Optional[str]: The timezone in which this schedule will be evaluated."""\n return self._execution_timezone\n\n @public\n @property\n def job(self) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if isinstance(self._target, DirectTarget):\n return self._target.target\n raise DagsterInvalidDefinitionError("No job was provided to ScheduleDefinition.")\n\n def evaluate_tick(self, context: "ScheduleEvaluationContext") -> ScheduleExecutionData:\n """Evaluate schedule using the provided context.\n\n Args:\n context (ScheduleEvaluationContext): The context with which to evaluate this schedule.\n\n Returns:\n ScheduleExecutionData: Contains list of run requests, or skip message if present.\n\n """\n from dagster._core.definitions.partition import CachingDynamicPartitionsLoader\n\n check.inst_param(context, "context", ScheduleEvaluationContext)\n execution_fn: Callable[..., "ScheduleEvaluationFunctionReturn"]\n if isinstance(self._execution_fn, DecoratedScheduleFunction):\n execution_fn = self._execution_fn.wrapped_fn\n else:\n execution_fn = cast(\n Callable[..., "ScheduleEvaluationFunctionReturn"],\n self._execution_fn,\n )\n\n result = list(ensure_gen(execution_fn(context)))\n\n skip_message: Optional[str] = None\n\n run_requests: List[RunRequest] = []\n if not result or result == [None]:\n run_requests = 
[]\n skip_message = "Schedule function returned an empty result"\n elif len(result) == 1:\n item = check.inst(result[0], (SkipReason, RunRequest))\n if isinstance(item, RunRequest):\n run_requests = [item]\n skip_message = None\n elif isinstance(item, SkipReason):\n run_requests = []\n skip_message = item.skip_message\n else:\n # NOTE: mypy is not correctly reading this cast-- not sure why\n # (pyright reads it fine). Hence the type-ignores below.\n result = cast(List[RunRequest], check.is_list(result, of_type=RunRequest))\n check.invariant(\n not any(not request.run_key for request in result),\n "Schedules that return multiple RunRequests must specify a run_key in each"\n " RunRequest",\n )\n run_requests = result\n skip_message = None\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # clone all the run requests with resolved tags and config\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.partition_key and not run_request.has_resolved_partition():\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_schedule_context when yielding"\n " partitioned run requests"\n )\n\n scheduled_target = context.repository_def.get_job(self._target.job_name)\n resolved_request = run_request.with_resolved_tags_and_config(\n target_definition=scheduled_target,\n dynamic_partitions_requests=[],\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n else:\n resolved_request = run_request\n\n resolved_run_requests.append(\n resolved_request.with_replaced_attrs(\n tags=merge_dicts(resolved_request.tags, DagsterRun.tags_for_schedule(self))\n )\n )\n\n return ScheduleExecutionData(\n run_requests=resolved_run_requests,\n skip_message=skip_message,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n )\n\n def has_loadable_target(self):\n 
return isinstance(self._target, DirectTarget)\n\n @property\n def targets_unresolved_asset_job(self) -> bool:\n return self.has_loadable_target() and isinstance(\n self.load_target(), UnresolvedAssetJobDefinition\n )\n\n def load_target(\n self,\n ) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n if isinstance(self._target, DirectTarget):\n return self._target.load()\n\n check.failed("Target is not loadable")\n\n @public\n @property\n def default_status(self) -> DefaultScheduleStatus:\n """DefaultScheduleStatus: The default status for this schedule when it is first loaded in\n a code location.\n """\n return self._default_status
\n
", "current_page_name": "_modules/dagster/_core/definitions/schedule_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.schedule_definition"}, "selector": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.selector

\nfrom typing import AbstractSet, Iterable, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.repository_definition import SINGLETON_REPOSITORY_NAME\nfrom dagster._serdes import create_snapshot_id, whitelist_for_serdes\n\n\nclass JobSubsetSelector(\n    NamedTuple(\n        "_JobSubsetSelector",\n        [\n            ("location_name", str),\n            ("repository_name", str),\n            ("job_name", str),\n            ("op_selection", Optional[Sequence[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    )\n):\n    """The information needed to resolve a job within a host process."""\n\n    def __new__(\n        cls,\n        location_name: str,\n        repository_name: str,\n        job_name: str,\n        op_selection: Optional[Sequence[str]],\n        asset_selection: Optional[Iterable[AssetKey]] = None,\n        asset_check_selection: Optional[Iterable[AssetCheckKey]] = None,\n    ):\n        asset_selection = set(asset_selection) if asset_selection else None\n        asset_check_selection = (\n            set(asset_check_selection) if asset_check_selection is not None else None\n        )\n        return super(JobSubsetSelector, cls).__new__(\n            cls,\n            location_name=check.str_param(location_name, "location_name"),\n            repository_name=check.str_param(repository_name, "repository_name"),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_sequence_param(op_selection, "op_selection", str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            
asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def to_graphql_input(self):\n        return {\n            "repositoryLocationName": self.location_name,\n            "repositoryName": self.repository_name,\n            "pipelineName": self.job_name,\n            "solidSelection": self.op_selection,\n        }\n\n    def with_op_selection(self, op_selection: Optional[Sequence[str]]) -> Self:\n        check.invariant(\n            self.op_selection is None,\n            f"Can not invoke with_op_selection when op_selection={self.op_selection} is"\n            " already set",\n        )\n        return JobSubsetSelector(\n            self.location_name, self.repository_name, self.job_name, op_selection\n        )\n\n\n
[docs]@whitelist_for_serdes\nclass JobSelector(\n NamedTuple(\n "_JobSelector", [("location_name", str), ("repository_name", str), ("job_name", str)]\n )\n):\n def __new__(\n cls,\n location_name: str,\n repository_name: Optional[str] = None,\n job_name: Optional[str] = None,\n ):\n return super(JobSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.opt_str_param(\n repository_name,\n "repository_name",\n default=SINGLETON_REPOSITORY_NAME,\n ),\n job_name=check.str_param(\n job_name,\n "job_name",\n "Must provide job_name argument even though it is marked as optional in the "\n "function signature. repository_name, a truly optional parameter, is before "\n "that argument and actually optional. Use of keyword arguments is "\n "recommended to avoid confusion.",\n ),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "jobName": self.job_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return JobSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n job_name=graphql_data["jobName"],\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RepositorySelector(\n NamedTuple("_RepositorySelector", [("location_name", str), ("repository_name", str)])\n):\n def __new__(cls, location_name: str, repository_name: str):\n return super(RepositorySelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return RepositorySelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n )
\n\n\nclass CodeLocationSelector(NamedTuple("_CodeLocationSelector", [("location_name", str)])):\n def __new__(cls, location_name: str):\n return super(CodeLocationSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n )\n\n def to_repository_selector(self) -> RepositorySelector:\n return RepositorySelector(\n location_name=self.location_name, repository_name=SINGLETON_REPOSITORY_NAME\n )\n\n\nclass ScheduleSelector(\n NamedTuple(\n "_ScheduleSelector",\n [("location_name", str), ("repository_name", str), ("schedule_name", str)],\n )\n):\n def __new__(cls, location_name: str, repository_name: str, schedule_name: str):\n return super(ScheduleSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n schedule_name=check.str_param(schedule_name, "schedule_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "scheduleName": self.schedule_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ScheduleSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n schedule_name=graphql_data["scheduleName"],\n )\n\n\nclass ResourceSelector(NamedTuple):\n location_name: str\n repository_name: str\n resource_name: str\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "resourceName": self.resource_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ResourceSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n resource_name=graphql_data["resourceName"],\n )\n\n\nclass SensorSelector(\n NamedTuple(\n "_SensorSelector", [("location_name", str), ("repository_name", str), ("sensor_name", 
str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, sensor_name: str):\n return super(SensorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "sensorName": self.sensor_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return SensorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n sensor_name=graphql_data["sensorName"],\n )\n\n\n@whitelist_for_serdes\nclass InstigatorSelector(\n NamedTuple(\n "_InstigatorSelector", [("location_name", str), ("repository_name", str), ("name", str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, name: str):\n return super(InstigatorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n name=check.str_param(name, "name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "name": self.name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return InstigatorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n name=graphql_data["name"],\n )\n\n\nclass GraphSelector(\n NamedTuple(\n "_GraphSelector", [("location_name", str), ("repository_name", str), ("graph_name", str)]\n )\n):\n """The information needed to resolve a graph within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, graph_name: str):\n return super(GraphSelector, cls).__new__(\n cls,\n 
location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n graph_name=check.str_param(graph_name, "graph_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "graphName": self.graph_name,\n }\n\n\n@whitelist_for_serdes\nclass PartitionSetSelector(\n NamedTuple(\n "_PartitionSetSelector",\n [("location_name", str), ("repository_name", str), ("partition_set_name", str)],\n )\n):\n """The information needed to resolve a partition set within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, partition_set_name: str):\n return super(PartitionSetSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n partition_set_name=check.str_param(partition_set_name, "partition_set_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "partitionSetName": self.partition_set_name,\n }\n\n\nclass PartitionRangeSelector(\n NamedTuple(\n "_PartitionRangeSelector",\n [("start", str), ("end", str)],\n )\n):\n """The information needed to resolve a partition range."""\n\n def __new__(cls, start: str, end: str):\n return super(PartitionRangeSelector, cls).__new__(\n cls,\n start=check.inst_param(start, "start", str),\n end=check.inst_param(end, "end", str),\n )\n\n def to_graphql_input(self):\n return {\n "start": self.start,\n "end": self.end,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionRangeSelector(\n start=graphql_data["start"],\n end=graphql_data["end"],\n )\n\n\nclass PartitionsSelector(\n NamedTuple(\n "_PartitionsSelector",\n [("partition_range", PartitionRangeSelector)],\n )\n):\n """The information needed to define selection partitions.\n Using 
partition_range as property name to avoid shadowing Python 'range' builtin .\n """\n\n def __new__(cls, partition_range: PartitionRangeSelector):\n return super(PartitionsSelector, cls).__new__(\n cls,\n partition_range=check.inst_param(partition_range, "range", PartitionRangeSelector),\n )\n\n def to_graphql_input(self):\n return {\n "range": self.partition_range.to_graphql_input(),\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionsSelector(\n partition_range=PartitionRangeSelector.from_graphql_input(graphql_data["range"])\n )\n\n\nclass PartitionsByAssetSelector(\n NamedTuple(\n "PartitionsByAssetSelector",\n [\n ("asset_key", AssetKey),\n ("partitions", Optional[PartitionsSelector]),\n ],\n )\n):\n """The information needed to define partitions selection for a given asset key."""\n\n def __new__(cls, asset_key: AssetKey, partitions: Optional[PartitionsSelector] = None):\n return super(PartitionsByAssetSelector, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partitions=check.opt_inst_param(partitions, "partitions", PartitionsSelector),\n )\n\n def to_graphql_input(self):\n return {\n "assetKey": self.asset_key.to_graphql_input(),\n "partitions": self.partitions.to_graphql_input() if self.partitions else None,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n asset_key = graphql_data["assetKey"]\n partitions = graphql_data.get("partitions")\n return PartitionsByAssetSelector(\n asset_key=AssetKey.from_graphql_input(asset_key),\n partitions=PartitionsSelector.from_graphql_input(partitions) if partitions else None,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/selector", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.selector"}, "sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.sensor_definition

\nimport inspect\nimport logging\nfrom collections import defaultdict\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n)\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.partition import (\n    CachingDynamicPartitionsLoader,\n)\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.resource_definition import (\n    Resources,\n)\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import IHasInternalInit, normalize_to_repository\n\nfrom ..decorator_utils import (\n    get_function_params,\n)\nfrom .asset_selection import AssetSelection\nfrom .graph_definition import GraphDefinition\nfrom .run_request import (\n    AddDynamicPartitionsRequest,\n    DagsterRunReaction,\n    DeleteDynamicPartitionsRequest,\n    RunRequest,\n    SensorResult,\n    SkipReason,\n)\nfrom .target import DirectTarget, ExecutableDefinition, 
RepoRelativeTarget\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster import ResourceDefinition\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n\n@whitelist_for_serdes\nclass DefaultSensorStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\n@whitelist_for_serdes\nclass SensorType(Enum):\n    STANDARD = "STANDARD"\n    RUN_STATUS = "RUN_STATUS"\n    ASSET = "ASSET"\n    MULTI_ASSET = "MULTI_ASSET"\n    FRESHNESS_POLICY = "FRESHNESS_POLICY"\n    UNKNOWN = "UNKNOWN"\n\n\nDEFAULT_SENSOR_DAEMON_INTERVAL = 30\n\n\n
[docs]class SensorEvaluationContext:\n """The context object available as the argument to the evaluation function of a :py:class:`dagster.SensorDefinition`.\n\n Users should not instantiate this object directly. To construct a\n `SensorEvaluationContext` for testing purposes, use :py:func:`dagster.\n build_sensor_context`.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest\n last_completion_time (float): DEPRECATED The last time that the sensor was evaluated (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository or that\n the sensor belongs to. If needed by the sensor top-level resource definitions will be\n pulled from this repository. You can provide either this or `definitions`.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n resources (Optional[Dict[str, Any]]): A dict of resource keys to resource\n definitions to be made available during sensor execution.\n\n Example:\n .. 
code-block:: python\n\n from dagster import sensor, SensorEvaluationContext\n\n @sensor\n def the_sensor(context: SensorEvaluationContext):\n ...\n\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n repository_def: Optional["RepositoryDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, "ResourceDefinition"]] = None,\n definitions: Optional["Definitions"] = None,\n ):\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._exit_stack = ExitStack()\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._last_completion_time = check.opt_float_param(\n last_completion_time, "last_completion_time"\n )\n self._last_run_key = check.opt_str_param(last_run_key, "last_run_key")\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._repository_name = check.opt_str_param(repository_name, "repository_name")\n self._repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n self._instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n self._sensor_name = sensor_name\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resources\n self._resources = None\n self._cm_scope_entered = False\n\n self._log_key = (\n [\n repository_name,\n sensor_name,\n pendulum.now("UTC").strftime("%Y%m%d_%H%M%S"),\n ]\n if repository_name and sensor_name\n else None\n )\n self._logger: Optional[InstigationLogger] = None\n self._cursor_updated = False\n\n def __enter__(self) -> "SensorEvaluationContext":\n 
self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n def merge_resources(self, resources_dict: Mapping[str, Any]) -> "SensorEvaluationContext":\n """Merge the specified resources into this context.\n\n This method is intended to be used by the Dagster framework, and should not be called by user code.\n\n Args:\n resources_dict (Mapping[str, Any]): The resources to replace in the context.\n """\n check.invariant(\n self._resources is None, "Cannot merge resources in context that has been initialized."\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return SensorEvaluationContext(\n instance_ref=self._instance_ref,\n last_completion_time=self._last_completion_time,\n last_run_key=self._last_run_key,\n cursor=self._cursor,\n repository_name=self._repository_name,\n repository_def=self._repository_def,\n instance=self._instance,\n sensor_name=self._sensor_name,\n resources={\n **(self._resource_defs or {}),\n **wrap_resources_for_execution(resources_dict),\n },\n )\n\n @public\n @property\n def resources(self) -> Resources:\n """Resources: A mapping from resource key to instantiated resources for this sensor."""\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. 
This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n # Early exit if no resources are defined. This skips unnecessary initialization\n # entirely. This allows users to run user code servers in cases where they\n # do not have access to the instance if they use a subset of features do\n # that do not require instance access. In this case, if they do not use\n # resources on sensors they do not require the instance, so we do not\n # instantiate it\n #\n # Tracking at https://github.com/dagster-io/dagster/issues/14345\n if not self._resource_defs:\n self._resources = ScopedResourcesBuilder.build_empty()\n return self._resources\n\n instance = self.instance if self._instance or self._instance_ref else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current DagsterInstance."""\n # self._instance_ref should only ever be None when this SensorEvaluationContext was\n # constructed under test.\n if not self._instance:\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was"\n " provided."\n )\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def instance_ref(self) -> Optional[InstanceRef]:\n return self._instance_ref\n\n @public\n @property\n def last_completion_time(self) -> Optional[float]:\n """Optional[float]: Timestamp representing the last time this sensor completed an evaluation."""\n return self._last_completion_time\n\n @public\n @property\n def last_run_key(self) -> Optional[str]:\n """Optional[str]: The run key supplied to the most recent RunRequest produced by this sensor."""\n return self._last_run_key\n\n @public\n @property\n def cursor(self) -> Optional[str]:\n """The cursor value for this sensor, which was set in an earlier sensor evaluation."""\n return self._cursor\n\n
[docs] @public\n def update_cursor(self, cursor: Optional[str]) -> None:\n """Updates the cursor value for this sensor, which will be provided on the context for the\n next sensor evaluation.\n\n This can be used to keep track of progress and avoid duplicate work across sensor\n evaluations.\n\n Args:\n cursor (Optional[str]):\n """\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._cursor_updated = True
\n\n @property\n def cursor_updated(self) -> bool:\n return self._cursor_updated\n\n @public\n @property\n def repository_name(self) -> Optional[str]:\n """Optional[str]: The name of the repository that this sensor resides in."""\n return self._repository_name\n\n @public\n @property\n def repository_def(self) -> Optional["RepositoryDefinition"]:\n """Optional[RepositoryDefinition]: The RepositoryDefinition that this sensor resides in."""\n return self._repository_def\n\n @property\n def log(self) -> logging.Logger:\n if self._logger:\n return self._logger\n\n if not self._instance_ref:\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n self.instance,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n def has_captured_logs(self):\n return self._logger and self._logger.has_captured_logs()\n\n @property\n def log_key(self) -> Optional[List[str]]:\n return self._log_key
\n\n\nRawSensorEvaluationFunctionReturn = Union[\n Iterator[Union[SkipReason, RunRequest, DagsterRunReaction, SensorResult]],\n Sequence[RunRequest],\n SkipReason,\n RunRequest,\n DagsterRunReaction,\n SensorResult,\n]\nRawSensorEvaluationFunction: TypeAlias = Callable[..., RawSensorEvaluationFunctionReturn]\n\nSensorEvaluationFunction: TypeAlias = Callable[..., Sequence[Union[SkipReason, RunRequest]]]\n\n\ndef get_context_param_name(fn: Callable) -> Optional[str]:\n """Determines the sensor's context parameter name by excluding all resource parameters."""\n resource_params = {param.name for param in get_resource_args(fn)}\n\n return next(\n (param.name for param in get_function_params(fn) if param.name not in resource_params), None\n )\n\n\ndef validate_and_get_resource_dict(\n resources: Resources, sensor_name: str, required_resource_keys: Set[str]\n) -> Dict[str, Any]:\n """Validates that the context has all the required resources and returns a dictionary of\n resource key to resource object.\n """\n for k in required_resource_keys:\n if not hasattr(resources, k):\n raise DagsterInvalidDefinitionError(\n f"Resource with key '{k}' required by sensor '{sensor_name}' was not provided."\n )\n\n return {k: getattr(resources, k) for k in required_resource_keys}\n\n\ndef _check_dynamic_partitions_requests(\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n) -> None:\n req_keys_to_add_by_partitions_def_name = defaultdict(set)\n req_keys_to_delete_by_partitions_def_name = defaultdict(set)\n\n for req in dynamic_partitions_requests:\n duplicate_req_keys_to_delete = req_keys_to_delete_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n duplicate_req_keys_to_add = req_keys_to_add_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n if isinstance(req, AddDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n 
raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_delete}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_add_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys:"\n f" {req_keys_to_add_by_partitions_def_name}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_delete_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n else:\n check.failed(f"Unexpected dynamic partition request type: {req}")\n\n\n
[docs]class SensorDefinition(IHasInternalInit):\n """Define a sensor that initiates a set of runs based on some external state.\n\n Args:\n evaluation_fn (Callable[[SensorEvaluationContext]]): The core evaluation function for the\n sensor, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n name (Optional[str]): The name of the sensor to create. Defaults to name of evaluation_fn\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[GraphDefinition, JobDefinition, UnresolvedAssetJob]): The job to execute when this sensor fires.\n jobs (Optional[Sequence[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): (experimental) A list of jobs to execute when this sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def with_updated_jobs(self, new_jobs: Sequence[ExecutableDefinition]) -> "SensorDefinition":\n """Returns a copy of this sensor with the jobs replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return SensorDefinition.dagster_internal_init(\n name=self.name,\n evaluation_fn=self._raw_fn,\n minimum_interval_seconds=self.minimum_interval_seconds,\n description=self.description,\n job_name=None, # if original init was passed job name, was resolved to a job\n jobs=new_jobs if len(new_jobs) > 1 else None,\n job=new_jobs[0] if len(new_jobs) == 1 else None,\n default_status=self.default_status,\n asset_selection=self.asset_selection,\n required_resource_keys=self._raw_required_resource_keys,\n )\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "SensorDefinition":\n """Returns a copy of this sensor with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return self.with_updated_jobs([new_job])\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n evaluation_fn: Optional[RawSensorEvaluationFunction] = None,\n job_name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n if evaluation_fn is None:\n raise DagsterInvalidDefinitionError("Must provide evaluation_fn to SensorDefinition.")\n\n if (\n sum(\n [\n int(job is not None),\n int(jobs is not None),\n int(job_name is not None),\n int(asset_selection is not None),\n ]\n )\n > 1\n ):\n raise 
DagsterInvalidDefinitionError(\n "Attempted to provide more than one of 'job', 'jobs', 'job_name', and "\n "'asset_selection' params to SensorDefinition. Must provide only one."\n )\n\n jobs = jobs if jobs else [job] if job else None\n\n targets: Optional[List[Union[RepoRelativeTarget, DirectTarget]]] = None\n if job_name:\n targets = [\n RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n ]\n elif job:\n targets = [DirectTarget(job)]\n elif jobs:\n targets = [DirectTarget(job) for job in jobs]\n elif asset_selection:\n targets = []\n\n if name:\n self._name = check_valid_name(name)\n else:\n self._name = evaluation_fn.__name__\n\n self._raw_fn: RawSensorEvaluationFunction = check.callable_param(\n evaluation_fn, "evaluation_fn"\n )\n self._evaluation_fn: Union[\n SensorEvaluationFunction,\n Callable[\n [SensorEvaluationContext],\n List[Union[SkipReason, RunRequest, DagsterRunReaction]],\n ],\n ] = wrap_sensor_evaluation(self._name, evaluation_fn)\n self._min_interval = check.opt_int_param(\n minimum_interval_seconds, "minimum_interval_seconds", DEFAULT_SENSOR_DAEMON_INTERVAL\n )\n self._description = check.opt_str_param(description, "description")\n self._targets: Sequence[Union[RepoRelativeTarget, DirectTarget]] = check.opt_list_param(\n targets, "targets", (DirectTarget, RepoRelativeTarget)\n )\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultSensorStatus\n )\n self._asset_selection = check.opt_inst_param(\n asset_selection, "asset_selection", AssetSelection\n )\n validate_resource_annotated_function(self._raw_fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(self._raw_fn)}\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @sensor decorator and as arguments to"\n " the decorated function",\n )\n self._raw_required_resource_keys = check.opt_set_param(\n 
required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n evaluation_fn: Optional[RawSensorEvaluationFunction],\n job_name: Optional[str],\n minimum_interval_seconds: Optional[int],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n jobs: Optional[Sequence[ExecutableDefinition]],\n default_status: DefaultSensorStatus,\n asset_selection: Optional[AssetSelection],\n required_resource_keys: Optional[Set[str]],\n ) -> "SensorDefinition":\n return SensorDefinition(\n name=name,\n evaluation_fn=evaluation_fn,\n job_name=job_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name_if_present = get_context_param_name(self._raw_fn)\n context = get_or_create_sensor_context(self._raw_fn, *args, **kwargs)\n\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n\n resources = validate_and_get_resource_dict(\n context.resources, self.name, self._required_resource_keys\n )\n return self._raw_fn(**context_param, **resources)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this sensor."""\n return self._required_resource_keys\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this sensor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this sensor."""\n return self._description\n\n @public\n @property\n def minimum_interval_seconds(self) -> Optional[int]:\n """Optional[int]: The 
minimum number of seconds between sequential evaluations of this sensor."""\n return self._min_interval\n\n @property\n def targets(self) -> Sequence[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets\n\n @public\n @property\n def job(self) -> Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if self._targets:\n if len(self._targets) == 1 and isinstance(self._targets[0], DirectTarget):\n return self._targets[0].target\n elif len(self._targets) > 1:\n raise DagsterInvalidDefinitionError(\n "Job property not available when SensorDefinition has multiple jobs."\n )\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @public\n @property\n def jobs(self) -> List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """List[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]: A list of jobs\n that are targeted by this schedule.\n """\n if self._targets and all(isinstance(target, DirectTarget) for target in self._targets):\n return [target.target for target in self._targets] # type: ignore # (illegible conditional)\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.STANDARD\n\n def evaluate_tick(self, context: "SensorEvaluationContext") -> "SensorExecutionData":\n """Evaluate sensor using the provided context.\n\n Args:\n context (SensorEvaluationContext): The context with which to evaluate this sensor.\n\n Returns:\n SensorExecutionData: Contains list of run requests, or skip message if present.\n\n """\n context = check.inst_param(context, "context", SensorEvaluationContext)\n\n result = self._evaluation_fn(context)\n\n skip_message: Optional[str] = None\n run_requests: List[RunRequest] = []\n dagster_run_reactions: List[DagsterRunReaction] = 
[]\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = []\n updated_cursor = context.cursor\n asset_events = []\n\n if not result or result == [None]:\n skip_message = "Sensor function returned an empty result"\n elif len(result) == 1:\n item = result[0]\n check.inst(item, (SkipReason, RunRequest, DagsterRunReaction, SensorResult))\n\n if isinstance(item, SensorResult):\n run_requests = list(item.run_requests) if item.run_requests else []\n skip_message = (\n item.skip_reason.skip_message\n if item.skip_reason\n else (None if run_requests else "Sensor function returned an empty result")\n )\n\n _check_dynamic_partitions_requests(\n item.dynamic_partitions_requests or [],\n )\n dynamic_partitions_requests = item.dynamic_partitions_requests or []\n\n if item.cursor and context.cursor_updated:\n raise DagsterInvariantViolationError(\n "SensorResult.cursor cannot be set if context.update_cursor() was called."\n )\n updated_cursor = item.cursor\n asset_events = item.asset_events\n\n elif isinstance(item, RunRequest):\n run_requests = [item]\n elif isinstance(item, SkipReason):\n skip_message = item.skip_message if isinstance(item, SkipReason) else None\n elif isinstance(item, DagsterRunReaction):\n dagster_run_reactions = (\n [cast(DagsterRunReaction, item)] if isinstance(item, DagsterRunReaction) else []\n )\n else:\n check.failed(f"Unexpected type {type(item)} in sensor result")\n else:\n if any(isinstance(item, SensorResult) for item in result):\n check.failed(\n "When a SensorResult is returned from a sensor, it must be the only object"\n " returned."\n )\n\n check.is_list(result, (SkipReason, RunRequest, DagsterRunReaction))\n has_skip = any(map(lambda x: isinstance(x, SkipReason), result))\n run_requests = [item for item in result if isinstance(item, RunRequest)]\n dagster_run_reactions = [\n item for item in result if isinstance(item, DagsterRunReaction)\n ]\n\n if has_skip:\n if 
len(run_requests) > 0:\n check.failed(\n "Expected a single SkipReason or one or more RunRequests: received both "\n "RunRequest and SkipReason"\n )\n elif len(dagster_run_reactions) > 0:\n check.failed(\n "Expected a single SkipReason or one or more DagsterRunReaction: "\n "received both DagsterRunReaction and SkipReason"\n )\n else:\n check.failed("Expected a single SkipReason: received multiple SkipReasons")\n\n _check_dynamic_partitions_requests(dynamic_partitions_requests)\n resolved_run_requests = self.resolve_run_requests(\n run_requests, context, self._asset_selection, dynamic_partitions_requests\n )\n\n return SensorExecutionData(\n resolved_run_requests,\n skip_message,\n updated_cursor,\n dagster_run_reactions,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events,\n )\n\n def has_loadable_targets(self) -> bool:\n for target in self._targets:\n if isinstance(target, DirectTarget):\n return True\n return False\n\n def load_targets(\n self,\n ) -> Sequence[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """Returns job/graph definitions that have been directly passed into the sensor definition.\n Any jobs or graphs that are referenced by name will not be loaded.\n """\n targets = []\n for target in self._targets:\n if isinstance(target, DirectTarget):\n targets.append(target.load())\n return targets\n\n def resolve_run_requests(\n self,\n run_requests: Sequence[RunRequest],\n context: SensorEvaluationContext,\n asset_selection: Optional[AssetSelection],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n ) -> Sequence[RunRequest]:\n def _get_repo_job_by_name(context: SensorEvaluationContext, job_name: str) -> JobDefinition:\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_sensor_context when 
yielding partitioned"\n " run requests"\n )\n return context.repository_def.get_job(job_name)\n\n has_multiple_targets = len(self._targets) > 1\n target_names = [target.job_name for target in self._targets]\n\n if run_requests and len(self._targets) == 0 and not self._asset_selection:\n raise Exception(\n f"Error in sensor {self._name}: Sensor evaluation function returned a RunRequest "\n "for a sensor lacking a specified target (job_name, job, or jobs). Targets "\n "can be specified by providing job, jobs, or job_name to the @sensor "\n "decorator."\n )\n\n if asset_selection:\n run_requests = [\n *_run_requests_with_base_asset_jobs(run_requests, context, asset_selection)\n ]\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # Run requests may contain an invalid target, or a partition key that does not exist.\n # We will resolve these run requests, applying the target and partition config/tags.\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.job_name is None and has_multiple_targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest that did not"\n " specify job_name for the requested run. Expected one of:"\n f" {target_names}"\n )\n elif (\n run_request.job_name\n and run_request.job_name not in target_names\n and not asset_selection\n ):\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest with job_name "\n f"{run_request.job_name}. 
Expected one of: {target_names}"\n )\n\n if run_request.partition_key and not run_request.has_resolved_partition():\n selected_job = _get_repo_job_by_name(\n context, run_request.job_name if run_request.job_name else target_names[0]\n )\n resolved_run_requests.append(\n run_request.with_resolved_tags_and_config(\n target_definition=selected_job,\n current_time=None,\n dynamic_partitions_store=dynamic_partitions_store,\n dynamic_partitions_requests=dynamic_partitions_requests,\n )\n )\n else:\n resolved_run_requests.append(run_request)\n\n return resolved_run_requests\n\n @property\n def _target(self) -> Optional[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets[0] if self._targets else None\n\n @public\n @property\n def job_name(self) -> Optional[str]:\n """Optional[str]: The name of the job that is targeted by this sensor."""\n if len(self._targets) > 1:\n raise DagsterInvalidInvocationError(\n f"Cannot use `job_name` property for sensor {self.name}, which targets multiple"\n " jobs."\n )\n return self._targets[0].job_name\n\n @public\n @property\n def default_status(self) -> DefaultSensorStatus:\n """DefaultSensorStatus: The default status for this sensor when it is first loaded in\n a code location.\n """\n return self._default_status\n\n @property\n def asset_selection(self) -> Optional[AssetSelection]:\n return self._asset_selection
\n\n\n@whitelist_for_serdes(\n storage_field_names={"dagster_run_reactions": "pipeline_run_reactions"},\n)\nclass SensorExecutionData(\n NamedTuple(\n "_SensorExecutionData",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_message", Optional[str]),\n ("cursor", Optional[str]),\n ("dagster_run_reactions", Optional[Sequence[DagsterRunReaction]]),\n ("captured_log_key", Optional[Sequence[str]]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]]\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_message: Optional[str] = None,\n cursor: Optional[str] = None,\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]] = None,\n captured_log_key: Optional[Sequence[str]] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]]\n ] = None,\n ):\n check.opt_sequence_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_str_param(cursor, "cursor")\n check.opt_sequence_param(dagster_run_reactions, "dagster_run_reactions", DagsterRunReaction)\n check.opt_list_param(captured_log_key, "captured_log_key", str)\n check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n )\n check.opt_sequence_param(\n asset_events,\n "asset_events",\n (AssetMaterialization, AssetObservation, AssetCheckEvaluation),\n )\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return 
super(SensorExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n cursor=cursor,\n dagster_run_reactions=dagster_run_reactions,\n captured_log_key=captured_log_key,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events or [],\n )\n\n\ndef wrap_sensor_evaluation(\n sensor_name: str,\n fn: RawSensorEvaluationFunction,\n) -> SensorEvaluationFunction:\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: SensorEvaluationContext):\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, sensor_name, resource_arg_names\n )\n\n context_param_name_if_present = get_context_param_name(fn)\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n raw_evaluation_result = fn(**context_param, **resource_args_populated)\n\n def check_returned_scalar(scalar):\n if isinstance(scalar, (SkipReason, RunRequest, SensorResult)):\n return scalar\n elif scalar is not None:\n raise Exception(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{scalar} of type {type(scalar)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n if inspect.isgenerator(raw_evaluation_result):\n result = []\n try:\n while True:\n result.append(next(raw_evaluation_result))\n except StopIteration as e:\n # captures the case where the evaluation function has a yield and also returns a\n # value\n if e.value is not None:\n result.append(check_returned_scalar(e.value))\n\n return result\n elif isinstance(raw_evaluation_result, list):\n return raw_evaluation_result\n else:\n return [check_returned_scalar(raw_evaluation_result)]\n\n return _wrapped_fn\n\n\n
[docs]def build_sensor_context(\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, object]] = None,\n definitions: Optional["Definitions"] = None,\n instance_ref: Optional["InstanceRef"] = None,\n) -> SensorEvaluationContext:\n """Builds sensor execution context using the provided parameters.\n\n This function can be used to provide a context to the invocation of a sensor definition.If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A cursor value to provide to the evaluation of the sensor.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.\n If needed by the sensor top-level resource definitions will be pulled from this repository.\n You can provide either this or `definitions`.\n resources (Optional[Mapping[str, ResourceDefinition]]): A set of resource definitions\n to provide to the sensor. If passed, these will override any resource definitions\n provided by the repository.\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n\n Examples:\n .. 
code-block:: python\n\n context = build_sensor_context()\n my_sensor(context)\n\n """\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n\n return SensorEvaluationContext(\n instance_ref=instance_ref,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n repository_def=repository_def,\n sensor_name=sensor_name,\n resources=wrap_resources_for_execution(resources),\n )
\n\n\nT = TypeVar("T")\n\n\ndef get_sensor_context_from_args_or_kwargs(\n fn: Callable,\n args: Tuple[Any, ...],\n kwargs: Dict[str, Any],\n context_type: Type[T],\n) -> Optional[T]:\n from dagster._config.pythonic_config import is_coercible_to_resource\n\n context_param_name = get_context_param_name(fn)\n\n kwarg_keys_non_resource = set(kwargs.keys()) - {param.name for param in get_resource_args(fn)}\n if len(args) + len(kwarg_keys_non_resource) > 1:\n raise DagsterInvalidInvocationError(\n "Sensor invocation received multiple non-resource arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n if any(is_coercible_to_resource(arg) for arg in args):\n raise DagsterInvalidInvocationError(\n "If directly invoking a sensor, you may not provide resources as"\n " positional"\n " arguments, only as keyword arguments."\n )\n\n context: Optional[T] = None\n\n if len(args) > 0:\n context = check.opt_inst(args[0], context_type)\n elif len(kwargs) > 0:\n if context_param_name and context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst(kwargs.get(context_param_name or "context"), context_type)\n elif context_param_name:\n # If the context parameter is present but no value was provided, we error\n raise DagsterInvalidInvocationError(\n "Sensor evaluation function expected context argument, but no context argument "\n "was provided when invoking."\n )\n\n return context\n\n\ndef get_or_create_sensor_context(\n fn: Callable,\n *args: Any,\n **kwargs: Any,\n) -> SensorEvaluationContext:\n """Based on the passed resource function and the arguments passed to it, returns the\n user-passed SensorEvaluationContext or creates one if it is not passed.\n\n Raises an exception if the user passes more than one argument or if the user-provided\n function requires a context parameter but none is passed.\n """\n context = (\n 
get_sensor_context_from_args_or_kwargs(\n fn,\n args,\n kwargs,\n context_type=SensorEvaluationContext,\n )\n or build_sensor_context()\n )\n resource_args_from_kwargs = {}\n\n resource_args = {param.name for param in get_resource_args(fn)}\n for resource_arg in resource_args:\n if resource_arg in kwargs:\n resource_args_from_kwargs[resource_arg] = kwargs[resource_arg]\n\n if resource_args_from_kwargs:\n return context.merge_resources(resource_args_from_kwargs)\n\n return context\n\n\ndef _run_requests_with_base_asset_jobs(\n run_requests: Iterable[RunRequest],\n context: SensorEvaluationContext,\n outer_asset_selection: AssetSelection,\n) -> Sequence[RunRequest]:\n """For sensors that target asset selections instead of jobs, finds the corresponding base asset\n for a selected set of assets.\n """\n asset_graph = context.repository_def.asset_graph # type: ignore # (possible none)\n result = []\n for run_request in run_requests:\n if run_request.asset_selection:\n asset_keys = run_request.asset_selection\n\n unexpected_asset_keys = (\n AssetSelection.keys(*asset_keys) - outer_asset_selection\n ).resolve(asset_graph)\n if unexpected_asset_keys:\n raise DagsterInvalidSubsetError(\n "RunRequest includes asset keys that are not part of sensor's asset_selection:"\n f" {unexpected_asset_keys}"\n )\n else:\n asset_keys = outer_asset_selection.resolve(asset_graph)\n\n base_job = context.repository_def.get_implicit_job_def_for_assets(asset_keys) # type: ignore # (possible none)\n result.append(\n run_request.with_replaced_attrs(\n job_name=base_job.name, asset_selection=list(asset_keys) # type: ignore # (possible none)\n )\n )\n\n return result\n
", "current_page_name": "_modules/dagster/_core/definitions/sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.sensor_definition"}, "source_asset": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.source_asset

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param, public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    DataVersion,\n    DataVersionsByPartition,\n)\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataMapping,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceAddable,\n    ResourceRequirement,\n    SourceAssetIOManagerRequirement,\n    ensure_requirements_satisfied,\n    get_resource_key_conflicts,\n)\nfrom dagster._core.definitions.utils import (\n    DEFAULT_GROUP_NAME,\n    DEFAULT_IO_MANAGER_KEY,\n    validate_group_name,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidObservationError,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n    )\nfrom dagster._core.storage.io_manager import IOManagerDefinition\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n# Going with this catch-all for the time-being to permit pythonic resources\nSourceAssetObserveFunction: TypeAlias = Callable[..., Any]\n\n\ndef 
wrap_source_asset_observe_fn_in_op_compute_fn(\n    source_asset: "SourceAsset",\n) -> "DecoratedOpFunction":\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n        is_context_provided,\n    )\n    from dagster._core.execution.context.compute import (\n        OpExecutionContext,\n    )\n\n    check.not_none(source_asset.observe_fn, "Must be an observable source asset")\n    assert source_asset.observe_fn  # for type checker\n\n    observe_fn = source_asset.observe_fn\n\n    observe_fn_has_context = is_context_provided(get_function_params(observe_fn))\n\n    def fn(context: OpExecutionContext):\n        resource_kwarg_keys = [param.name for param in get_resource_args(observe_fn)]\n        resource_kwargs = {key: getattr(context.resources, key) for key in resource_kwarg_keys}\n        observe_fn_return_value = (\n            observe_fn(context, **resource_kwargs)\n            if observe_fn_has_context\n            else observe_fn(**resource_kwargs)\n        )\n\n        if isinstance(observe_fn_return_value, DataVersion):\n            if source_asset.partitions_def is not None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is partitioned, so its observe function should return a"\n                    " DataVersionsByPartition, not a DataVersion"\n                )\n\n            context.log_event(\n                AssetObservation(\n                    asset_key=source_asset.key,\n                    tags={DATA_VERSION_TAG: observe_fn_return_value.value},\n                )\n            )\n        elif isinstance(observe_fn_return_value, DataVersionsByPartition):\n            if source_asset.partitions_def is None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is not partitioned, so its observe function should return"\n                    " a DataVersion, not a DataVersionsByPartition"\n                )\n\n        
    for (\n                partition_key,\n                data_version,\n            ) in observe_fn_return_value.data_versions_by_partition.items():\n                context.log_event(\n                    AssetObservation(\n                        asset_key=source_asset.key,\n                        tags={DATA_VERSION_TAG: data_version.value},\n                        partition=partition_key,\n                    )\n                )\n        else:\n            raise DagsterInvalidObservationError(\n                f"Observe function for {source_asset.key} must return a DataVersion or"\n                " DataVersionsByPartition, but returned a value of type"\n                f" {type(observe_fn_return_value)}"\n            )\n\n    return DecoratedOpFunction(fn)\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\nclass SourceAsset(ResourceAddable):\n """A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.\n\n Attributes:\n key (Union[AssetKey, Sequence[str], str]): The key of the asset.\n metadata (Mapping[str, MetadataValue]): Metadata associated with the asset.\n io_manager_key (Optional[str]): The key for the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n io_manager_def (Optional[IOManagerDefinition]): (Experimental) The definition of the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): (Experimental) resource definitions that may be required by the :py:class:`dagster.IOManagerDefinition` provided in the `io_manager_def` argument.\n description (Optional[str]): The description of the asset.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n observe_fn (Optional[SourceAssetObserveFunction]) Observation function for the source asset.\n """\n\n key: PublicAttr[AssetKey]\n metadata: PublicAttr[MetadataMapping]\n raw_metadata: PublicAttr[ArbitraryMetadataMapping]\n io_manager_key: PublicAttr[Optional[str]]\n _io_manager_def: PublicAttr[Optional[IOManagerDefinition]]\n description: PublicAttr[Optional[str]]\n partitions_def: PublicAttr[Optional[PartitionsDefinition]]\n group_name: PublicAttr[str]\n resource_defs: PublicAttr[Dict[str, ResourceDefinition]]\n observe_fn: PublicAttr[Optional[SourceAssetObserveFunction]]\n _node_def: Optional[OpDefinition] # computed lazily\n auto_observe_interval_minutes: Optional[float]\n\n def __init__(\n self,\n key: CoercibleToAssetKey,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: 
Optional[object] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n group_name: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n observe_fn: Optional[SourceAssetObserveFunction] = None,\n *,\n auto_observe_interval_minutes: Optional[float] = None,\n # This is currently private because it is necessary for source asset observation functions,\n # but we have not yet decided on a final API for associated one or more ops with a source\n # asset. If we were to make this public, then we would have a canonical public\n # `required_resource_keys` used for observation that might end up conflicting with a set of\n # required resource keys for a different operation.\n _required_resource_keys: Optional[AbstractSet[str]] = None,\n # Add additional fields to with_resources and with_group below\n ):\n from dagster._core.execution.build_resources import (\n wrap_resources_for_execution,\n )\n\n self.key = AssetKey.from_coercible(key)\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self.raw_metadata = metadata\n self.metadata = normalize_metadata(metadata, allow_invalid=True)\n\n resource_defs_dict = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n if io_manager_def:\n if not io_manager_key:\n io_manager_key = self.key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in resource_defs_dict\n and resource_defs_dict[io_manager_key] != io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = io_manager_def\n\n self.resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n self.io_manager_key = check.opt_str_param(io_manager_key, "io_manager_key")\n self.partitions_def = check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n )\n self.group_name = 
validate_group_name(group_name)\n self.description = check.opt_str_param(description, "description")\n self.observe_fn = check.opt_callable_param(observe_fn, "observe_fn")\n self._required_resource_keys = check.opt_set_param(\n _required_resource_keys, "_required_resource_keys", of_type=str\n )\n self._node_def = None\n self.auto_observe_interval_minutes = check.opt_numeric_param(\n auto_observe_interval_minutes, "auto_observe_interval_minutes"\n )\n\n def get_io_manager_key(self) -> str:\n return self.io_manager_key or DEFAULT_IO_MANAGER_KEY\n\n @property\n def io_manager_def(self) -> Optional[IOManagerDefinition]:\n io_manager_key = self.get_io_manager_key()\n return cast(\n Optional[IOManagerDefinition],\n self.resource_defs.get(io_manager_key) if io_manager_key else None,\n )\n\n @public\n @property\n def op(self) -> OpDefinition:\n """OpDefinition: The OpDefinition associated with the observation function of an observable\n source asset.\n\n Throws an error if the asset is not observable.\n """\n check.invariant(\n isinstance(self.node_def, OpDefinition),\n "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",\n )\n return cast(OpDefinition, self.node_def)\n\n @public\n @property\n def is_observable(self) -> bool:\n """bool: Whether the asset is observable."""\n return self.node_def is not None\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n @property\n def node_def(self) -> Optional[OpDefinition]:\n """Op that generates observation metadata for a source asset."""\n if self.observe_fn is None:\n return None\n\n if self._node_def is None:\n self._node_def = OpDefinition(\n compute_fn=wrap_source_asset_observe_fn_in_op_compute_fn(self),\n name=self.key.to_python_identifier(),\n description=self.description,\n required_resource_keys=self._required_resource_keys,\n )\n return self._node_def\n\n def with_resources(self, resource_defs) -> 
"SourceAsset":\n from dagster._core.execution.resources_init import get_transitive_required_resource_keys\n\n overlapping_keys = get_resource_key_conflicts(self.resource_defs, resource_defs)\n if overlapping_keys:\n raise DagsterInvalidInvocationError(\n f"SourceAsset with key {self.key} has conflicting resource "\n "definitions with provided resources for the following keys: "\n f"{sorted(list(overlapping_keys))}. Either remove the existing "\n "resources from the asset or change the resource keys so that "\n "they don't overlap."\n )\n\n merged_resource_defs = merge_dicts(resource_defs, self.resource_defs)\n\n # Ensure top-level resource requirements are met - except for\n # io_manager, since that is a default it can be resolved later.\n ensure_requirements_satisfied(merged_resource_defs, list(self.get_resource_requirements()))\n\n io_manager_def = merged_resource_defs.get(self.get_io_manager_key())\n if not io_manager_def and self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY:\n raise DagsterInvalidDefinitionError(\n f"SourceAsset with asset key {self.key} requires IO manager with key"\n f" '{self.get_io_manager_key()}', but none was provided."\n )\n relevant_keys = get_transitive_required_resource_keys(\n {*self._required_resource_keys, self.get_io_manager_key()}, merged_resource_defs\n )\n\n relevant_resource_defs = {\n key: resource_def\n for key, resource_def in merged_resource_defs.items()\n if key in relevant_keys\n }\n\n io_manager_key = (\n self.get_io_manager_key()\n if self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY\n else None\n )\n with disable_dagster_warnings():\n return SourceAsset(\n key=self.key,\n io_manager_key=io_manager_key,\n description=self.description,\n partitions_def=self.partitions_def,\n metadata=self.raw_metadata,\n resource_defs=relevant_resource_defs,\n group_name=self.group_name,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n 
_required_resource_keys=self._required_resource_keys,\n )\n\n def with_attributes(\n self, group_name: Optional[str] = None, key: Optional[AssetKey] = None\n ) -> "SourceAsset":\n if group_name is not None and self.group_name != DEFAULT_GROUP_NAME:\n raise DagsterInvalidDefinitionError(\n "A group name has already been provided to source asset"\n f" {self.key.to_user_string()}"\n )\n\n with disable_dagster_warnings():\n return SourceAsset(\n key=key or self.key,\n metadata=self.raw_metadata,\n io_manager_key=self.io_manager_key,\n io_manager_def=self.io_manager_def,\n description=self.description,\n partitions_def=self.partitions_def,\n group_name=group_name,\n resource_defs=self.resource_defs,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n _required_resource_keys=self._required_resource_keys,\n )\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n if self.node_def is not None:\n yield from self.node_def.get_resource_requirements()\n yield SourceAssetIOManagerRequirement(\n key=self.get_io_manager_key(), asset_key=self.key.to_string()\n )\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, SourceAsset):\n return False\n else:\n return (\n self.key == other.key\n and self.raw_metadata == other.raw_metadata\n and self.io_manager_key == other.io_manager_key\n and self.description == other.description\n and self.group_name == other.group_name\n and self.resource_defs == other.resource_defs\n and self.observe_fn == other.observe_fn\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/source_asset", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.source_asset"}, "step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.step_launcher

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.state import KnownExecutionState\n\n\n
[docs]class StepRunRef(\n NamedTuple(\n "_StepRunRef",\n [\n ("run_config", Mapping[str, object]),\n ("dagster_run", DagsterRun),\n ("run_id", str),\n ("retry_mode", RetryMode),\n ("step_key", str),\n ("recon_job", ReconstructableJob),\n ("known_state", Optional["KnownExecutionState"]),\n ],\n )\n):\n """A serializable object that specifies what's needed to hydrate a step so\n that it can be executed in a process outside the plan process.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n run_config: Mapping[str, object],\n dagster_run: DagsterRun,\n run_id: str,\n retry_mode: RetryMode,\n step_key: str,\n recon_job: ReconstructableJob,\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.plan.state import KnownExecutionState\n\n return super(StepRunRef, cls).__new__(\n cls,\n check.mapping_param(run_config, "run_config", key_type=str),\n check.inst_param(dagster_run, "dagster_run", DagsterRun),\n check.str_param(run_id, "run_id"),\n check.inst_param(retry_mode, "retry_mode", RetryMode),\n check.str_param(step_key, "step_key"),\n check.inst_param(recon_job, "recon_job", ReconstructableJob),\n check.opt_inst_param(known_state, "known_state", KnownExecutionState),\n )
\n\n\n
[docs]class StepLauncher(ABC):\n """A StepLauncher is responsible for executing steps, either in-process or in an external process."""\n\n @abstractmethod\n def launch_step(self, step_context: "StepExecutionContext") -> Iterator["DagsterEvent"]:\n """Args:\n step_context (StepExecutionContext): The context that we're executing the step in.\n\n Returns:\n Iterator[DagsterEvent]: The events for the step.\n """
\n
", "current_page_name": "_modules/dagster/_core/definitions/step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.step_launcher"}, "time_window_partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partition_mapping

\nfrom datetime import datetime\nfrom typing import NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_mapping import PartitionMapping, UpstreamPartitionsResult\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    TimeWindowPartitionsSubset,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\n\n\n
[docs]@whitelist_for_serdes\n@experimental_param(param="allow_nonexistent_upstream_partitions")\nclass TimeWindowPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_TimeWindowPartitionMapping",\n [\n ("start_offset", PublicAttr[int]),\n ("end_offset", PublicAttr[int]),\n ("allow_nonexistent_upstream_partitions", PublicAttr[bool]),\n ],\n ),\n):\n """The default mapping between two TimeWindowPartitionsDefinitions.\n\n A partition in the downstream partitions definition is mapped to all partitions in the upstream\n asset whose time windows overlap it.\n\n This means that, if the upstream and downstream partitions definitions share the same time\n period, then this mapping is essentially the identity partition mapping - plus conversion of\n datetime formats.\n\n If the upstream time period is coarser than the downstream time period, then each partition in\n the downstream asset will map to a single (larger) upstream partition. E.g. if the downstream is\n hourly and the upstream is daily, then each hourly partition in the downstream will map to the\n daily partition in the upstream that contains that hour.\n\n If the upstream time period is finer than the downstream time period, then each partition in the\n downstream asset will map to multiple upstream partitions. E.g. if the downstream is daily and\n the upstream is hourly, then each daily partition in the downstream asset will map to the 24\n hourly partitions in the upstream that occur on that day.\n\n Attributes:\n start_offset (int): If not 0, then the starts of the upstream windows are shifted by this\n offset relative to the starts of the downstream windows. For example, if start_offset=-1\n and end_offset=0, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-03" and "2022-07-04". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. 
Defaults to 0.\n end_offset (int): If not 0, then the ends of the upstream windows are shifted by this\n offset relative to the ends of the downstream windows. For example, if start_offset=0\n and end_offset=1, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-04" and "2022-07-05". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. Defaults to 0.\n allow_nonexistent_upstream_partitions (bool): Defaults to false. If true, does not\n raise an error when mapped upstream partitions fall outside the start-end time window of the\n partitions def. For example, if the upstream partitions def starts on "2023-01-01" but\n the downstream starts on "2022-01-01", setting this bool to true would return no\n partition keys when get_upstream_partitions_for_partitions is called with "2022-06-01".\n When set to false, would raise an error.\n\n Examples:\n .. code-block:: python\n\n from dagster import DailyPartitionsDefinition, TimeWindowPartitionMapping, AssetIn, asset\n\n partitions_def = DailyPartitionsDefinition(start_date="2020-01-01")\n\n @asset(partitions_def=partitions_def)\n def asset1():\n ...\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "asset1": AssetIn(\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1)\n )\n }\n )\n def asset2(asset1):\n ...\n """\n\n def __new__(\n cls,\n start_offset: int = 0,\n end_offset: int = 0,\n allow_nonexistent_upstream_partitions: bool = False,\n ):\n return super(TimeWindowPartitionMapping, cls).__new__(\n cls,\n start_offset=check.int_param(start_offset, "start_offset"),\n end_offset=check.int_param(end_offset, "end_offset"),\n allow_nonexistent_upstream_partitions=check.bool_param(\n allow_nonexistent_upstream_partitions,\n "allow_nonexistent_upstream_partitions",\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: 
PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if not isinstance(downstream_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("downstream_partitions_subset must be a TimeWindowPartitionsSubset")\n\n return self._map_partitions(\n downstream_partitions_subset.partitions_def,\n upstream_partitions_def,\n downstream_partitions_subset,\n start_offset=self.start_offset,\n end_offset=self.end_offset,\n current_time=current_time,\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: Optional[PartitionsDefinition],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the partitions in the downstream asset that map to the given upstream partitions.\n\n Filters for partitions that exist at the given current_time, fetching the current time\n if not provided.\n """\n return self._map_partitions(\n upstream_partitions_subset.partitions_def,\n downstream_partitions_def,\n upstream_partitions_subset,\n end_offset=-self.start_offset,\n start_offset=-self.end_offset,\n current_time=current_time,\n ).partitions_subset\n\n def _map_partitions(\n self,\n from_partitions_def: PartitionsDefinition,\n to_partitions_def: Optional[PartitionsDefinition],\n from_partitions_subset: PartitionsSubset,\n start_offset: int,\n end_offset: int,\n current_time: Optional[datetime] = None,\n ) -> UpstreamPartitionsResult:\n """Maps the partitions in from_partitions_subset to partitions in to_partitions_def.\n\n If partitions in from_partitions_subset represent time windows that do not exist in\n to_partitions_def, raises an error if raise_error_on_invalid_mapped_partition is True.\n Otherwise, filters out the partitions that do not exist in to_partitions_def and returns\n the filtered 
subset, also returning a bool indicating whether there were mapped time windows\n that did not exist in to_partitions_def.\n """\n if not isinstance(from_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("from_partitions_subset must be a TimeWindowPartitionsSubset")\n\n if not isinstance(from_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("from_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if not isinstance(to_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("to_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if (start_offset != 0 or end_offset != 0) and (\n from_partitions_def.cron_schedule != to_partitions_def.cron_schedule\n ):\n raise DagsterInvalidDefinitionError(\n "Can't use the start_offset or end_offset parameters of"\n " TimeWindowPartitionMapping when the cron schedule of the upstream"\n " PartitionsDefinition is different than the cron schedule of the downstream"\n f" one. Attempted to map from cron schedule '{from_partitions_def.cron_schedule}' "\n f"to cron schedule '{to_partitions_def.cron_schedule}'."\n )\n\n if to_partitions_def.timezone != from_partitions_def.timezone:\n raise DagsterInvalidDefinitionError("Timezones don't match")\n\n # skip fancy mapping logic in the simple case\n if from_partitions_def == to_partitions_def and start_offset == 0 and end_offset == 0:\n return UpstreamPartitionsResult(from_partitions_subset, [])\n\n time_windows = []\n for from_partition_time_window in from_partitions_subset.included_time_windows:\n from_start_dt, from_end_dt = from_partition_time_window\n offsetted_start_dt = _offsetted_datetime(\n from_partitions_def, from_start_dt, start_offset\n )\n offsetted_end_dt = _offsetted_datetime(from_partitions_def, from_end_dt, end_offset)\n\n to_start_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_start_dt.timestamp(), end_closed=False\n )\n if offsetted_start_dt is not None\n else None\n )\n 
to_end_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_end_dt.timestamp(), end_closed=True\n )\n if offsetted_end_dt is not None\n else None\n )\n\n if to_start_partition_key is not None or to_end_partition_key is not None:\n window_start = (\n to_partitions_def.start_time_for_partition_key(to_start_partition_key)\n if to_start_partition_key\n else cast(TimeWindow, to_partitions_def.get_first_partition_window()).start\n )\n window_end = (\n to_partitions_def.end_time_for_partition_key(to_end_partition_key)\n if to_end_partition_key\n else cast(TimeWindow, to_partitions_def.get_last_partition_window()).end\n )\n\n if window_start < window_end:\n time_windows.append(TimeWindow(window_start, window_end))\n\n first_window = to_partitions_def.get_first_partition_window(current_time=current_time)\n last_window = to_partitions_def.get_last_partition_window(current_time=current_time)\n\n filtered_time_windows = []\n required_but_nonexistent_partition_keys = set()\n\n for time_window in time_windows:\n if (\n first_window\n and last_window\n and time_window.start <= last_window.start\n and time_window.end >= first_window.end\n ):\n window_start = max(time_window.start, first_window.start)\n window_end = min(time_window.end, last_window.end)\n filtered_time_windows.append(TimeWindow(window_start, window_end))\n\n if self.allow_nonexistent_upstream_partitions:\n # If allowed to have nonexistent upstream partitions, do not consider\n # out of range partitions to be invalid\n continue\n else:\n invalid_time_window = None\n if not (first_window and last_window) or (\n time_window.start < first_window.start and time_window.end > last_window.end\n ):\n invalid_time_window = time_window\n elif time_window.start < first_window.start:\n invalid_time_window = TimeWindow(\n time_window.start, min(time_window.end, first_window.start)\n )\n elif time_window.end > last_window.end:\n invalid_time_window = TimeWindow(\n max(time_window.start, 
last_window.end), time_window.end\n )\n\n if invalid_time_window:\n required_but_nonexistent_partition_keys.update(\n set(\n to_partitions_def.get_partition_keys_in_time_window(\n time_window=invalid_time_window\n )\n )\n )\n\n return UpstreamPartitionsResult(\n TimeWindowPartitionsSubset(\n to_partitions_def,\n num_partitions=sum(\n len(to_partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in filtered_time_windows\n ),\n included_time_windows=filtered_time_windows,\n ),\n sorted(list(required_but_nonexistent_partition_keys)),\n )
\n\n\ndef _offsetted_datetime(\n partitions_def: TimeWindowPartitionsDefinition, dt: datetime, offset: int\n) -> Optional[datetime]:\n for _ in range(abs(offset)):\n if offset < 0:\n prev_window = partitions_def.get_prev_partition_window(dt)\n if prev_window is None:\n return None\n\n dt = prev_window.start\n else:\n # TODO: what if we're at the end of the line?\n next_window = partitions_def.get_next_partition_window(dt)\n if next_window is None:\n return None\n\n dt = next_window.end\n\n return dt\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partition_mapping"}, "time_window_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partitions

\nimport functools\nimport hashlib\nimport json\nimport re\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    FrozenSet,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._utils.partitions import DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\nfrom dagster._utils.schedules import (\n    cron_string_iterator,\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n)\nfrom .partition import (\n    DEFAULT_DATE_FORMAT,\n    PartitionedConfig,\n    PartitionsDefinition,\n    PartitionsSubset,\n    ScheduleType,\n    cron_schedule_from_schedule_type_and_offsets,\n)\nfrom .partition_key_range import PartitionKeyRange\n\n\n
[docs]class TimeWindow(NamedTuple):\n """An interval that is closed at the start and open at the end.\n\n Attributes:\n start (datetime): A pendulum datetime that marks the start of the window.\n end (datetime): A pendulum datetime that marks the end of the window.\n """\n\n start: PublicAttr[datetime]\n end: PublicAttr[datetime]
\n\n\n
[docs]class TimeWindowPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_TimeWindowPartitionsDefinition",\n [\n ("start", PublicAttr[datetime]),\n ("timezone", PublicAttr[str]),\n ("end", PublicAttr[Optional[datetime]]),\n ("fmt", PublicAttr[str]),\n ("end_offset", PublicAttr[int]),\n ("cron_schedule", PublicAttr[str]),\n ],\n ),\n):\n r"""A set of partitions where each partitions corresponds to a time window.\n\n The provided cron_schedule determines the bounds of the time windows. E.g. a cron_schedule of\n "0 0 \\\\* \\\\* \\\\*" will result in daily partitions that start at midnight and end at midnight of the\n following day.\n\n The string partition_key associated with each partition corresponds to the start of the\n partition's time window.\n\n The first partition in the set will start on at the first cron_schedule tick that is equal to\n or after the given start datetime. The last partition in the set will end before the current\n time, unless the end_offset argument is set to a positive number.\n\n Args:\n cron_schedule (str): Determines the bounds of the time windows.\n start (datetime): The first partition in the set will start on at the first cron_schedule\n tick that is equal to or after this value.\n timezone (Optional[str]): The timezone in which each time should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end (datetime): The last partition (excluding) in the set.\n fmt (str): The date format to use for partition_keys.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n """\n\n def __new__(\n cls,\n start: Union[datetime, str],\n fmt: str,\n end: Union[datetime, str, None] = None,\n schedule_type: Optional[ScheduleType] = None,\n timezone: Optional[str] = None,\n end_offset: int = 0,\n minute_offset: Optional[int] = None,\n hour_offset: Optional[int] = None,\n day_offset: Optional[int] = None,\n cron_schedule: Optional[str] = None,\n ):\n check.opt_str_param(timezone, "timezone")\n timezone = timezone or "UTC"\n\n if isinstance(start, datetime):\n start_dt = pendulum.instance(start, tz=timezone)\n else:\n start_dt = pendulum.instance(datetime.strptime(start, fmt), tz=timezone)\n\n if not end:\n end_dt = None\n elif isinstance(end, datetime):\n end_dt = pendulum.instance(end, tz=timezone)\n else:\n end_dt = pendulum.instance(datetime.strptime(end, fmt), tz=timezone)\n\n if cron_schedule is not None:\n check.invariant(\n schedule_type is None and not minute_offset and not hour_offset and not day_offset,\n "If cron_schedule argument is provided, then schedule_type, minute_offset, "\n "hour_offset, and day_offset can't also be provided",\n )\n else:\n if schedule_type is None:\n check.failed("One of schedule_type and cron_schedule must be provided")\n\n cron_schedule = cron_schedule_from_schedule_type_and_offsets(\n schedule_type=schedule_type,\n minute_offset=minute_offset or 0,\n hour_offset=hour_offset or 0,\n day_offset=day_offset or 0,\n )\n\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{cron_schedule}' for a"\n " TimeWindowPartitionsDefinition."\n )\n\n return super(TimeWindowPartitionsDefinition, cls).__new__(\n cls, start_dt, timezone, end_dt, fmt, end_offset, cron_schedule\n )\n\n def get_current_timestamp(self, current_time: Optional[datetime] = None) -> float:\n return (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else 
pendulum.now(self.timezone)\n ).timestamp()\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Method added for performance reasons.\n # Fetching partition keys requires significantly more compute time to\n # string format datetimes.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n\n num_partitions = 0\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n num_partitions += 1\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n num_partitions += self.end_offset\n\n return num_partitions\n\n def get_partition_keys_between_indexes(\n self, start_idx: int, end_idx: int, current_time: Optional[datetime] = None\n ) -> List[str]:\n # Fetches the partition keys between the given start and end indices.\n # Start index is inclusive, end index is exclusive.\n # Method added for performance reasons, to only string format\n # partition keys included within the indices.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys = []\n reached_end = False\n\n for idx, time_window in enumerate(self._iterate_time_windows(self.start)):\n if time_window.end.timestamp() >= current_timestamp:\n reached_end = True\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n reached_end = True\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n if idx >= start_idx and idx < end_idx:\n partition_keys.append(time_window.start.strftime(self.fmt))\n if time_window.end.timestamp() > 
current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n if len(partition_keys) >= end_idx - start_idx:\n break\n\n if reached_end and self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys: List[str] = []\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n partition_keys.append(time_window.start.strftime(self.fmt))\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def _get_validated_time_window_for_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n """Returns a TimeWindow for the given partition key if it is valid, otherwise returns None."""\n try:\n time_window = self.time_window_for_partition_key(partition_key)\n except ValueError:\n return None\n\n first_partition_window = self.get_first_partition_window(current_time=current_time)\n last_partition_window = self.get_last_partition_window(current_time=current_time)\n if (\n first_partition_window is None\n or last_partition_window is None\n or time_window.start < first_partition_window.start\n or time_window.start > last_partition_window.start\n or time_window.start.strftime(self.fmt) != partition_key\n ):\n return None\n\n return time_window\n\n def __str__(self) -> str:\n schedule_str = (\n self.schedule_type.value.capitalize() 
if self.schedule_type else self.cron_schedule\n )\n partition_def_str = (\n f"{schedule_str}, starting {self.start.strftime(self.fmt)} {self.timezone}."\n )\n if self.end_offset != 0:\n partition_def_str += (\n " End offsetted by"\n f" {self.end_offset} partition{'' if self.end_offset == 1 else 's'}."\n )\n return partition_def_str\n\n def __repr__(self):\n # Between python 3.8 and 3.9 the repr of a datetime object changed.\n # Replaces start time with timestamp as a workaround to make sure the repr is consistent across versions.\n return (\n f"TimeWindowPartitionsDefinition(start={self.start.timestamp()},"\n f" timezone='{self.timezone}', fmt='{self.fmt}', end_offset={self.end_offset},"\n f" cron_schedule='{self.cron_schedule}')"\n )\n\n def __hash__(self):\n return hash(tuple(self.__repr__()))\n\n @functools.lru_cache(maxsize=100)\n def _time_window_for_partition_key(self, *, partition_key: str) -> TimeWindow:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return next(iter(self._iterate_time_windows(partition_key_dt)))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n return self._time_window_for_partition_key(partition_key=partition_key)\n\n @functools.lru_cache(maxsize=5)\n def time_windows_for_partition_keys(\n self,\n partition_keys: FrozenSet[str],\n validate: bool = True,\n ) -> Sequence[TimeWindow]:\n if len(partition_keys) == 0:\n return []\n\n sorted_pks = sorted(partition_keys, key=lambda pk: datetime.strptime(pk, self.fmt))\n cur_windows_iterator = iter(\n self._iterate_time_windows(\n pendulum.instance(datetime.strptime(sorted_pks[0], self.fmt), tz=self.timezone)\n )\n )\n partition_key_time_windows: List[TimeWindow] = []\n for partition_key in sorted_pks:\n next_window = next(cur_windows_iterator)\n if next_window.start.strftime(self.fmt) == partition_key:\n partition_key_time_windows.append(next_window)\n else:\n cur_windows_iterator = iter(\n 
self._iterate_time_windows(\n pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n )\n )\n partition_key_time_windows.append(next(cur_windows_iterator))\n\n if validate:\n start_time_window = self.get_first_partition_window()\n end_time_window = self.get_last_partition_window()\n\n if start_time_window is None or end_time_window is None:\n check.failed("No partitions in the PartitionsDefinition")\n\n start_timestamp = start_time_window.start.timestamp()\n end_timestamp = end_time_window.end.timestamp()\n\n partition_key_time_windows = [\n tw\n for tw in partition_key_time_windows\n if tw.start.timestamp() >= start_timestamp and tw.end.timestamp() <= end_timestamp\n ]\n return partition_key_time_windows\n\n def start_time_for_partition_key(self, partition_key: str) -> datetime:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n # the datetime format might not include granular components, so we need to recover them\n # we make the assumption that the parsed partition key is <= the start datetime\n return next(iter(self._iterate_time_windows(partition_key_dt))).start\n\n def get_next_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[str]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is None:\n return None\n\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n windows_iter = iter(self._iterate_time_windows(partition_key_dt))\n next(windows_iter)\n start_time = next(windows_iter).start\n if start_time >= last_partition_window.end:\n return None\n else:\n return start_time.strftime(self.fmt)\n\n def get_next_partition_window(\n self, end_dt: datetime, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is 
None:\n return None\n\n windows_iter = iter(self._iterate_time_windows(end_dt))\n next_window = next(windows_iter)\n if next_window.start >= last_partition_window.end:\n return None\n else:\n return next_window\n\n def get_prev_partition_window(self, start_dt: datetime) -> Optional[TimeWindow]:\n windows_iter = iter(self._reverse_iterate_time_windows(start_dt))\n prev_window = next(windows_iter)\n first_partition_window = self.get_first_partition_window()\n if first_partition_window is None or prev_window.start < first_partition_window.start:\n return None\n else:\n return prev_window\n\n @functools.lru_cache(maxsize=5)\n def _get_first_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n current_timestamp = current_time.timestamp()\n\n time_window = next(iter(self._iterate_time_windows(self.start)))\n\n if self.end_offset == 0:\n return time_window if time_window.end.timestamp() <= current_timestamp else None\n elif self.end_offset > 0:\n iterator = iter(self._iterate_time_windows(current_time))\n # first returned time window is time window of current time\n curr_window_plus_offset = next(iterator)\n for _ in range(self.end_offset):\n curr_window_plus_offset = next(iterator)\n return (\n time_window\n if time_window.end.timestamp() <= curr_window_plus_offset.start.timestamp()\n else None\n )\n else:\n # end offset < 0\n end_window = None\n iterator = iter(self._reverse_iterate_time_windows(current_time))\n for _ in range(abs(self.end_offset)):\n end_window = next(iterator)\n\n if end_window is None:\n check.failed("end_window should not be None")\n\n return (\n time_window if time_window.end.timestamp() <= end_window.start.timestamp() else None\n )\n\n def get_first_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return 
self._get_first_partition_window(current_time=current_time)\n\n @functools.lru_cache(maxsize=5)\n def _get_last_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n if self.get_first_partition_window(current_time) is None:\n return None\n\n current_time = (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n )\n\n if self.end and self.end < current_time:\n current_time = self.end\n\n if self.end_offset == 0:\n return next(iter(self._reverse_iterate_time_windows(current_time)))\n else:\n # TODO: make this efficient\n last_partition_key = super().get_last_partition_key(current_time)\n return (\n self.time_window_for_partition_key(last_partition_key)\n if last_partition_key\n else None\n )\n\n def get_last_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return self._get_last_partition_window(current_time=current_time)\n\n def get_first_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n first_window = self.get_first_partition_window(current_time)\n if first_window is None:\n return None\n\n return first_window.start.strftime(self.fmt)\n\n def get_last_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n last_window = self.get_last_partition_window(current_time)\n if last_window is None:\n return None\n\n return last_window.start.strftime(self.fmt)\n\n def end_time_for_partition_key(self, partition_key: str) -> datetime:\n return self.time_window_for_partition_key(partition_key).end\n\n @functools.lru_cache(maxsize=5)\n def get_partition_keys_in_time_window(self, time_window: 
TimeWindow) -> Sequence[str]:\n result: List[str] = []\n for partition_time_window in self._iterate_time_windows(time_window.start):\n if partition_time_window.start < time_window.end:\n result.append(partition_time_window.start.strftime(self.fmt))\n else:\n break\n return result\n\n def get_partition_key_range_for_time_window(self, time_window: TimeWindow) -> PartitionKeyRange:\n start_partition_key = self.get_partition_key_for_timestamp(time_window.start.timestamp())\n end_partition_key = self.get_partition_key_for_timestamp(\n cast(TimeWindow, self.get_prev_partition_window(time_window.end)).start.timestamp()\n )\n\n return PartitionKeyRange(start_partition_key, end_partition_key)\n\n def get_partition_keys_in_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n start_time = self.start_time_for_partition_key(partition_key_range.start)\n end_time = self.end_time_for_partition_key(partition_key_range.end)\n\n return self.get_partition_keys_in_time_window(TimeWindow(start_time, end_time))\n\n @public\n @property\n def schedule_type(self) -> Optional[ScheduleType]:\n """Optional[ScheduleType]: An enum representing the partition cadence (hourly, daily,\n weekly, or monthly).\n """\n if re.fullmatch(r"\\d+ \\* \\* \\* \\*", self.cron_schedule):\n return ScheduleType.HOURLY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\*", self.cron_schedule):\n return ScheduleType.DAILY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\d+", self.cron_schedule):\n return ScheduleType.WEEKLY\n elif re.fullmatch(r"\\d+ \\d+ \\d+ \\* \\*", self.cron_schedule):\n return ScheduleType.MONTHLY\n else:\n return None\n\n @public\n @property\n def minute_offset(self) -> int:\n """int: Number of minutes past the hour to "split" partitions. 
Defaults to 0.\n\n For example, returns 15 if each partition starts at 15 minutes past the hour.\n """\n match = re.fullmatch(r"(\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no minute offset")\n return int(match.groups()[0])\n\n @public\n @property\n def hour_offset(self) -> int:\n """int: Number of hours past 00:00 to "split" partitions. Defaults to 0.\n\n For example, returns 1 if each partition starts at 01:00.\n """\n match = re.fullmatch(r"(\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no hour offset")\n return int(match.groups()[1])\n\n @public\n @property\n def day_offset(self) -> int:\n """int: For a weekly or monthly partitions definition, returns the day to "split" partitions\n by. Each partition will start on this day, and end before this day in the following\n week/month. Returns 0 if the day_offset parameter is unset in the\n WeeklyPartitionsDefinition, MonthlyPartitionsDefinition, or the provided cron schedule.\n\n For weekly partitions, returns a value between 0 (representing Sunday) and 6 (representing\n Saturday). 
Providing a value of 1 means that a partition will exist weekly from Monday to\n the following Sunday.\n\n For monthly partitions, returns a value between 0 (the first day of the month) and 31 (the\n last possible day of the month).\n """\n schedule_type = self.schedule_type\n if schedule_type == ScheduleType.WEEKLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[4])\n elif schedule_type == ScheduleType.MONTHLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[2])\n else:\n check.failed(f"Unsupported schedule type for day_offset: {schedule_type}")\n\n
[docs] @public\n def get_cron_schedule(\n self,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n ) -> str:\n """The schedule executes at the cadence specified by the partitioning, but may overwrite\n the minute/hour/day offset of the partitioning.\n\n This is useful e.g. if you have partitions that span midnight to midnight but you want to\n schedule a job that runs at 2 am.\n """\n if (\n minute_of_hour is None\n and hour_of_day is None\n and day_of_week is None\n and day_of_month is None\n ):\n return self.cron_schedule\n\n schedule_type = self.schedule_type\n if schedule_type is None:\n check.failed(\n f"{self.cron_schedule} does not support"\n " minute_of_hour/hour_of_day/day_of_week/day_of_month arguments"\n )\n\n minute_of_hour = cast(\n int,\n check.opt_int_param(minute_of_hour, "minute_of_hour", default=self.minute_offset),\n )\n\n if schedule_type == ScheduleType.HOURLY:\n check.invariant(\n hour_of_day is None, "Cannot set hour parameter with hourly partitions."\n )\n else:\n hour_of_day = cast(\n int, check.opt_int_param(hour_of_day, "hour_of_day", default=self.hour_offset)\n )\n\n if schedule_type == ScheduleType.DAILY:\n check.invariant(\n day_of_week is None, "Cannot set day of week parameter with daily partitions."\n )\n check.invariant(\n day_of_month is None, "Cannot set day of month parameter with daily partitions."\n )\n\n if schedule_type == ScheduleType.MONTHLY:\n default = self.day_offset or 1\n day_offset = check.opt_int_param(day_of_month, "day_of_month", default=default)\n elif schedule_type == ScheduleType.WEEKLY:\n default = self.day_offset or 0\n day_offset = check.opt_int_param(day_of_week, "day_of_week", default=default)\n else:\n day_offset = 0\n\n return cron_schedule_from_schedule_type_and_offsets(\n schedule_type,\n minute_offset=minute_of_hour,\n hour_offset=hour_of_day or 0,\n day_offset=day_offset,\n )
\n\n def _iterate_time_windows(self, start: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that start after the given start time."""\n start_timestamp = pendulum.instance(start, tz=self.timezone).timestamp()\n iterator = cron_string_iterator(\n start_timestamp=start_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n prev_time = next(iterator)\n while prev_time.timestamp() < start_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(prev_time, next_time)\n prev_time = next_time\n\n def _reverse_iterate_time_windows(self, end: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that end before the given end time."""\n end_timestamp = pendulum.instance(end, tz=self.timezone).timestamp()\n iterator = reverse_cron_string_iterator(\n end_timestamp=end_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n\n prev_time = next(iterator)\n while prev_time.timestamp() > end_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(next_time, prev_time)\n prev_time = next_time\n\n def get_partition_key_for_timestamp(self, timestamp: float, end_closed: bool = False) -> str:\n """Args:\n timestamp (float): Timestamp from the unix epoch, UTC.\n end_closed (bool): Whether the interval is closed at the end or at the beginning.\n """\n iterator = cron_string_iterator(\n timestamp, self.cron_schedule, self.timezone, start_offset=-1\n )\n # prev will be < timestamp\n prev = next(iterator)\n # prev_next will be >= timestamp\n prev_next = next(iterator)\n\n if end_closed or prev_next.timestamp() > timestamp:\n return prev.strftime(self.fmt)\n else:\n return prev_next.strftime(self.fmt)\n\n def less_than(self, partition_key1: str, partition_key2: str) -> bool:\n """Returns true if the partition_key1 is earlier than partition_key2."""\n return 
self.start_time_for_partition_key(\n partition_key1\n ) < self.start_time_for_partition_key(partition_key2)\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return TimeWindowPartitionsSubset\n\n def empty_subset(self) -> "PartitionsSubset":\n return self.partitions_subset_class.empty_subset(self)\n\n def is_valid_partition_key(self, partition_key: str) -> bool:\n try:\n partition_time = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return partition_time >= self.start\n except ValueError:\n return False\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(self.__repr__().encode("utf-8")).hexdigest()\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n return bool(self._get_validated_time_window_for_partition_key(partition_key, current_time))
\n\n\n
[docs]class DailyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of daily partitions.\n\n The first partition in the set will start at the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset and/or hour_offset are used, the start and end times of\n each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n DailyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n DailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(DailyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.DAILY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\ndef wrap_time_window_run_config_fn(\n run_config_fn: Optional[Callable[[datetime, datetime], Mapping[str, Any]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, Any]]:\n def _run_config_wrapper(key: str) -> Mapping[str, Any]:\n if not run_config_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return run_config_fn(time_window.start, time_window.end)\n\n return _run_config_wrapper\n\n\ndef wrap_time_window_tags_fn(\n tags_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, str]]:\n def _tag_wrapper(key: str) -> Mapping[str, str]:\n if not tags_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return tags_fn(time_window.start, time_window.end)\n\n return _tag_wrapper\n\n\n
[docs]def daily_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[DailyPartitionsDefinition],\n]:\n """Defines run config over a set of daily partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the bounds\n of the date partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset and/or hour_offset are used, the start and end times of each partition\n will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @daily_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n @daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[DailyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = DailyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class HourlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of hourly partitions.\n\n The first partition in the set will start on the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset is provided, the start and end times of each partition\n will be minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n\n return super(HourlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.HOURLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def hourly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[HourlyPartitionsDefinition],\n]:\n """Defines run config over a set of hourly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset is provided, the start and end times of each partition will be\n minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[HourlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = HourlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class MonthlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of monthly partitions.\n\n The first partition in the set will start at the soonest first of the month after start_date\n at midnight. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and\n end date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\n the start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n MonthlyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n MonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(MonthlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.MONTHLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def monthly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[MonthlyPartitionsDefinition],\n]:\n """Defines run config over a set of monthly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at midnight on the soonest first of the month after\n start_date. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and end\n date of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\n start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @monthly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n @monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[MonthlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = MonthlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class WeeklyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """Defines a set of weekly partitions.\n\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n WeeklyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n WeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(WeeklyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def weekly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[WeeklyPartitionsDefinition],\n]:\n """Defines run config over a set of weekly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @weekly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n @weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[WeeklyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = WeeklyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\nclass TimeWindowPartitionsSubset(PartitionsSubset):\n # Every time we change the serialization format, we should increment the version number.\n # This will ensure that we can gracefully degrade when deserializing old data.\n SERIALIZATION_VERSION = 1\n\n def __init__(\n self,\n partitions_def: TimeWindowPartitionsDefinition,\n num_partitions: int,\n included_time_windows: Optional[Sequence[TimeWindow]] = None,\n included_partition_keys: Optional[AbstractSet[str]] = None,\n ):\n self._partitions_def = check.inst_param(\n partitions_def, "partitions_def", TimeWindowPartitionsDefinition\n )\n self._included_time_windows = included_time_windows\n self._num_partitions = num_partitions\n\n check.param_invariant(\n not (included_partition_keys and included_time_windows),\n "Cannot specify both included_partition_keys and included_time_windows",\n )\n self._included_time_windows = check.opt_nullable_sequence_param(\n included_time_windows, "included_time_windows", of_type=TimeWindow\n )\n\n self._included_partition_keys = check.opt_nullable_set_param(\n included_partition_keys, "included_partition_keys", of_type=str\n )\n\n @property\n def included_time_windows(self) -> Sequence[TimeWindow]:\n if self._included_time_windows is None:\n result_time_windows, _ = self._add_partitions_to_time_windows(\n initial_windows=[],\n partition_keys=list(check.not_none(self._included_partition_keys)),\n validate=False,\n )\n self._included_time_windows = result_time_windows\n return self._included_time_windows\n\n def _get_partition_time_windows_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n ) -> Sequence[TimeWindow]:\n """Returns a list of partition time windows that are not in the subset.\n Each time window is a single partition.\n """\n first_tw = self._partitions_def.get_first_partition_window(current_time=current_time)\n last_tw = self._partitions_def.get_last_partition_window(current_time=current_time)\n\n if not first_tw or not last_tw:\n 
check.failed("No partitions found")\n\n if len(self.included_time_windows) == 0:\n return [TimeWindow(first_tw.start, last_tw.end)]\n\n time_windows = []\n if first_tw.start < self.included_time_windows[0].start:\n time_windows.append(TimeWindow(first_tw.start, self.included_time_windows[0].start))\n\n for i in range(len(self.included_time_windows) - 1):\n if self.included_time_windows[i].start >= last_tw.end:\n break\n if self.included_time_windows[i].end < last_tw.end:\n if self.included_time_windows[i + 1].start <= last_tw.end:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n self.included_time_windows[i + 1].start,\n )\n )\n else:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n last_tw.end,\n )\n )\n\n if last_tw.end > self.included_time_windows[-1].end:\n time_windows.append(TimeWindow(self.included_time_windows[-1].end, last_tw.end))\n\n return time_windows\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n partition_keys: List[str] = []\n for tw in self._get_partition_time_windows_not_in_subset(current_time):\n partition_keys.extend(self._partitions_def.get_partition_keys_in_time_window(tw))\n return partition_keys\n\n @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._included_partition_keys is None:\n return [\n pk\n for time_window in self.included_time_windows\n for pk in self._partitions_def.get_partition_keys_in_time_window(time_window)\n ]\n return list(self._included_partition_keys) if self._included_partition_keys else []\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [\n self._partitions_def.get_partition_key_range_for_time_window(window)\n for window in 
self.included_time_windows\n ]\n\n def _add_partitions_to_time_windows(\n self,\n initial_windows: Sequence[TimeWindow],\n partition_keys: Sequence[str],\n validate: bool = True,\n ) -> Tuple[Sequence[TimeWindow], int]:\n """Merges a set of partition keys into an existing set of time windows, returning the\n minimized set of time windows and the number of partitions added.\n """\n result_windows = [*initial_windows]\n time_windows = self._partitions_def.time_windows_for_partition_keys(\n frozenset(partition_keys), validate=validate\n )\n num_added_partitions = 0\n for window in sorted(time_windows):\n # go in reverse order because it's more common to add partitions at the end than the\n # beginning\n for i in reversed(range(len(result_windows))):\n included_window = result_windows[i]\n lt_end_of_range = window.start < included_window.end\n gte_start_of_range = window.start >= included_window.start\n\n if lt_end_of_range and gte_start_of_range:\n break\n\n if not lt_end_of_range:\n merge_with_range = included_window.end == window.start\n merge_with_later_range = i + 1 < len(result_windows) and (\n window.end == result_windows[i + 1].start\n )\n\n if merge_with_range and merge_with_later_range:\n result_windows[i] = TimeWindow(\n included_window.start, result_windows[i + 1].end\n )\n del result_windows[i + 1]\n elif merge_with_range:\n result_windows[i] = TimeWindow(included_window.start, window.end)\n elif merge_with_later_range:\n result_windows[i + 1] = TimeWindow(window.start, result_windows[i + 1].end)\n else:\n result_windows.insert(i + 1, window)\n\n num_added_partitions += 1\n break\n else:\n if result_windows and window.start == result_windows[0].start:\n result_windows[0] = TimeWindow(window.start, included_window.end) # type: ignore\n else:\n result_windows.insert(0, window)\n\n num_added_partitions += 1\n\n return result_windows, num_added_partitions\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "TimeWindowPartitionsSubset":\n # if 
we are representing things as a static set of keys, continue doing so\n if self._included_partition_keys is not None:\n new_partitions = {*self._included_partition_keys, *partition_keys}\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=len(new_partitions),\n included_partition_keys=new_partitions,\n )\n\n result_windows, added_partitions = self._add_partitions_to_time_windows(\n self.included_time_windows, list(partition_keys)\n )\n\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=self._num_partitions + added_partitions,\n included_time_windows=result_windows,\n )\n\n @classmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition, serialized: str\n ) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n\n loaded = json.loads(serialized)\n\n def tuples_to_time_windows(tuples):\n return [\n TimeWindow(\n pendulum.from_timestamp(tup[0], tz=partitions_def.timezone),\n pendulum.from_timestamp(tup[1], tz=partitions_def.timezone),\n )\n for tup in tuples\n ]\n\n if isinstance(loaded, list):\n # backwards compatibility\n time_windows = tuples_to_time_windows(loaded)\n num_partitions = sum(\n len(partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in time_windows\n )\n elif isinstance(loaded, dict) and (\n "version" not in loaded or loaded["version"] == cls.SERIALIZATION_VERSION\n ): # version 1\n time_windows = tuples_to_time_windows(loaded["time_windows"])\n num_partitions = loaded["num_partitions"]\n else:\n raise DagsterInvalidDeserializationVersionError(\n f"Attempted to deserialize partition subset with version {loaded.get('version')},"\n f" but only version {cls.SERIALIZATION_VERSION} is supported."\n )\n\n return TimeWindowPartitionsSubset(\n partitions_def, 
num_partitions=num_partitions, included_time_windows=time_windows\n )\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n if (\n serialized_partitions_def_class_name\n and serialized_partitions_def_class_name != partitions_def.__class__.__name__\n ):\n return False\n\n if serialized_partitions_def_unique_id:\n return (\n partitions_def.get_serializable_unique_identifier()\n == serialized_partitions_def_unique_id\n )\n\n data = json.loads(serialized)\n return isinstance(data, list) or (\n isinstance(data, dict)\n and data.get("time_windows") is not None\n and data.get("num_partitions") is not None\n )\n\n @classmethod\n def empty_subset(cls, partitions_def: PartitionsDefinition) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n return cls(partitions_def, 0, [], set())\n\n def serialize(self) -> str:\n return json.dumps(\n {\n "version": self.SERIALIZATION_VERSION,\n "time_windows": [\n (window.start.timestamp(), window.end.timestamp())\n for window in self.included_time_windows\n ],\n "num_partitions": self._num_partitions,\n }\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition:\n return self._partitions_def\n\n def __eq__(self, other):\n return (\n isinstance(other, TimeWindowPartitionsSubset)\n and self._partitions_def == other._partitions_def\n and (\n # faster comparison, but will not catch all cases\n (\n self._included_time_windows == other._included_time_windows\n and self._included_partition_keys == other._included_partition_keys\n )\n # slower comparison, catches all cases\n or self.included_time_windows == other.included_time_windows\n )\n )\n\n def __len__(self) -> 
int:\n return self._num_partitions\n\n def __contains__(self, partition_key: str) -> bool:\n if self._included_partition_keys is not None:\n return partition_key in self._included_partition_keys\n\n time_window = self._partitions_def.time_window_for_partition_key(partition_key)\n\n return any(\n time_window.start >= included_time_window.start\n and time_window.start < included_time_window.end\n for included_time_window in self.included_time_windows\n )\n\n def __repr__(self) -> str:\n return f"TimeWindowPartitionsSubset({self.get_partition_key_ranges()})"\n\n\nclass PartitionRangeStatus(Enum):\n MATERIALIZING = "MATERIALIZING"\n MATERIALIZED = "MATERIALIZED"\n FAILED = "FAILED"\n\n\nPARTITION_RANGE_STATUS_PRIORITY = [\n PartitionRangeStatus.MATERIALIZING,\n PartitionRangeStatus.FAILED,\n PartitionRangeStatus.MATERIALIZED,\n]\n\n\nclass PartitionTimeWindowStatus:\n def __init__(self, time_window: TimeWindow, status: PartitionRangeStatus):\n self.time_window = time_window\n self.status = status\n\n def __repr__(self):\n return f"({self.time_window.start} - {self.time_window.end}): {self.status.value}"\n\n def __eq__(self, other):\n return (\n isinstance(other, PartitionTimeWindowStatus)\n and self.time_window == other.time_window\n and self.status == other.status\n )\n\n\ndef _flatten(\n high_pri_time_windows: List[PartitionTimeWindowStatus],\n low_pri_time_windows: List[PartitionTimeWindowStatus],\n) -> List[PartitionTimeWindowStatus]:\n high_pri_time_windows = sorted(high_pri_time_windows, key=lambda t: t.time_window.start)\n low_pri_time_windows = sorted(low_pri_time_windows, key=lambda t: t.time_window.start)\n\n high_pri_idx = 0\n low_pri_idx = 0\n\n filtered_low_pri: List[PartitionTimeWindowStatus] = []\n\n # slice and dice the low pri time windows so there's no overlap with high pri\n while True:\n if low_pri_idx >= len(low_pri_time_windows):\n # reached end of materialized\n break\n if high_pri_idx >= len(high_pri_time_windows):\n # reached end of failed, add 
all remaining materialized bc there's no overlap\n filtered_low_pri.extend(low_pri_time_windows[low_pri_idx:])\n break\n\n low_pri_tw = low_pri_time_windows[low_pri_idx]\n high_pri_tw = high_pri_time_windows[high_pri_idx]\n\n if low_pri_tw.time_window.start < high_pri_tw.time_window.start:\n if low_pri_tw.time_window.end <= high_pri_tw.time_window.start:\n # low_pri_tw is entirely before high pri\n filtered_low_pri.append(low_pri_tw)\n low_pri_idx += 1\n else:\n # high pri cuts the low pri short\n filtered_low_pri.append(\n PartitionTimeWindowStatus(\n TimeWindow(\n low_pri_tw.time_window.start,\n high_pri_tw.time_window.start,\n ),\n low_pri_tw.status,\n )\n )\n\n if low_pri_tw.time_window.end > high_pri_tw.time_window.end:\n # the low pri time window will continue on the other end of the high pri\n # and get split in two. Modify low_pri[low_pri_idx] to be\n # the second half of the low pri time window. It will be added in the next iteration.\n # (don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n else:\n # the rest of the low pri time window is inside the high pri time window\n low_pri_idx += 1\n else:\n if low_pri_tw.time_window.start >= high_pri_tw.time_window.end:\n # high pri is entirely before low pri. The next high pri may overlap\n high_pri_idx += 1\n elif low_pri_tw.time_window.end <= high_pri_tw.time_window.end:\n # low pri is entirely within high pri, skip it\n low_pri_idx += 1\n else:\n # high pri cuts out the start of the low pri. It will continue on the other end.\n # Modify low_pri[low_pri_idx] to shorten the start. It will be added\n # in the next iteration. 
(don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n\n # combine the high pri windwos with the filtered low pri windows\n flattened_time_windows = high_pri_time_windows\n flattened_time_windows.extend(filtered_low_pri)\n flattened_time_windows.sort(key=lambda t: t.time_window.start)\n return flattened_time_windows\n\n\ndef fetch_flattened_time_window_ranges(\n subsets: Mapping[PartitionRangeStatus, TimeWindowPartitionsSubset]\n) -> Sequence[PartitionTimeWindowStatus]:\n """Given potentially overlapping subsets, return a flattened list of timewindows where the highest priority status wins\n on overlaps.\n """\n prioritized_subsets = sorted(\n [(status, subset) for status, subset in subsets.items()],\n key=lambda t: PARTITION_RANGE_STATUS_PRIORITY.index(t[0]),\n )\n\n # progressively add lower priority time windows to the list of higher priority time windows\n flattened_time_window_statuses = []\n for status, subset in prioritized_subsets:\n subset_time_window_statuses = [\n PartitionTimeWindowStatus(tw, status) for tw in subset.included_time_windows\n ]\n flattened_time_window_statuses = _flatten(\n flattened_time_window_statuses, subset_time_window_statuses\n )\n\n return flattened_time_window_statuses\n\n\ndef has_one_dimension_time_window_partitioning(\n partitions_def: PartitionsDefinition,\n) -> bool:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return True\n\n if isinstance(partitions_def, MultiPartitionsDefinition):\n time_window_dims = [\n dim\n for dim in partitions_def.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_window_dims) == 1:\n return True\n\n return False\n\n\ndef get_time_partitions_def(\n 
partitions_def: Optional[PartitionsDefinition],\n) -> Optional[TimeWindowPartitionsDefinition]:\n """For a given PartitionsDefinition, return the associated TimeWindowPartitionsDefinition if it\n exists.\n """\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None:\n return None\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partitions_def\n elif isinstance(\n partitions_def, MultiPartitionsDefinition\n ) and has_one_dimension_time_window_partitioning(partitions_def):\n return cast(\n TimeWindowPartitionsDefinition, partitions_def.time_window_dimension.partitions_def\n )\n else:\n return None\n\n\ndef get_time_partition_key(\n partitions_def: Optional[PartitionsDefinition], partition_key: Optional[str]\n) -> str:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None or partition_key is None:\n check.failed(\n "Cannot get time partitions key from when partitions def is None or partition key is"\n " None"\n )\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partition_key\n elif isinstance(partitions_def, MultiPartitionsDefinition):\n return partitions_def.get_partition_key_from_str(partition_key).keys_by_dimension[\n partitions_def.time_window_dimension.name\n ]\n else:\n check.failed(f"Cannot get time partition from non-time partitions def {partitions_def}")\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partitions"}, "unresolved_asset_job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.unresolved_asset_job_definition

\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, NamedTuple, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions import AssetKey\nfrom dagster._core.definitions.run_request import RunRequest\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\n\nfrom .asset_layer import build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .metadata import RawMetadataValue\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import (\n        AssetSelection,\n        ExecutorDefinition,\n        HookDefinition,\n        JobDefinition,\n        PartitionedConfig,\n        PartitionsDefinition,\n        ResourceDefinition,\n    )\n    from dagster._core.definitions.asset_graph import InternalAssetGraph\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.run_config import RunConfig\n\n\nclass UnresolvedAssetJobDefinition(\n    NamedTuple(\n        "_UnresolvedAssetJobDefinition",\n        [\n            ("name", str),\n            ("selection", "AssetSelection"),\n            (\n                "config",\n                Optional[Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig"]],\n            ),\n            ("description", Optional[str]),\n            ("tags", Optional[Mapping[str, Any]]),\n            ("metadata", Optional[Mapping[str, RawMetadataValue]]),\n            ("partitions_def", Optional["PartitionsDefinition"]),\n            ("executor_def", Optional["ExecutorDefinition"]),\n            ("hooks", Optional[AbstractSet["HookDefinition"]]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        selection: "AssetSelection",\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], 
"PartitionedConfig", "RunConfig"]\n        ] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet["HookDefinition"]] = None,\n    ):\n        from dagster._core.definitions import (\n            AssetSelection,\n            ExecutorDefinition,\n            HookDefinition,\n            PartitionsDefinition,\n        )\n        from dagster._core.definitions.run_config import convert_config_input\n\n        return super(UnresolvedAssetJobDefinition, cls).__new__(\n            cls,\n            name=check.str_param(name, "name"),\n            selection=check.inst_param(selection, "selection", AssetSelection),\n            config=convert_config_input(config),\n            description=check.opt_str_param(description, "description"),\n            tags=check.opt_mapping_param(tags, "tags"),\n            metadata=check.opt_mapping_param(metadata, "metadata"),\n            partitions_def=check.opt_inst_param(\n                partitions_def, "partitions_def", PartitionsDefinition\n            ),\n            executor_def=check.opt_inst_param(executor_def, "partitions_def", ExecutorDefinition),\n            hooks=check.opt_nullable_set_param(hooks, "hooks", of_type=HookDefinition),\n        )\n\n    @deprecated(\n        breaking_version="2.0.0",\n        additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",\n    )\n    def run_request_for_partition(\n        self,\n        partition_key: str,\n        run_key: Optional[str] = None,\n        tags: Optional[Mapping[str, str]] = None,\n        asset_selection: Optional[Sequence[AssetKey]] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n        current_time: Optional[datetime] = None,\n        
dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n    ) -> RunRequest:\n        """Creates a RunRequest object for a run that processes the given partition.\n\n        Args:\n            partition_key: The key of the partition to request a run for.\n            run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n                only one run is created per run key across all sensor evaluations.  For schedules,\n                ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n                value means that a run will always be launched per evaluation.\n            tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n                to the launched run.\n            run_config (Optional[Mapping[str, Any]]): Configuration for the run. If the job has\n                a :py:class:`PartitionedConfig`, this value will replace the config\n                provided by it.\n            current_time (Optional[datetime]): Used to determine which time-partitions exist.\n                Defaults to now.\n            dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n                object that is responsible for fetching dynamic partitions. Required when the\n                partitions definition is a DynamicPartitionsDefinition with a name defined. 
Users\n                can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n        Returns:\n            RunRequest: an object that requests a run to process the given partition.\n        """\n        from dagster._core.definitions.partition import (\n            DynamicPartitionsDefinition,\n            PartitionedConfig,\n        )\n\n        if not self.partitions_def:\n            check.failed("Called run_request_for_partition on a non-partitioned job")\n\n        partitioned_config = PartitionedConfig.from_flexible_config(\n            self.config, self.partitions_def\n        )\n\n        if (\n            isinstance(self.partitions_def, DynamicPartitionsDefinition)\n            and self.partitions_def.name\n        ):\n            # Do not support using run_request_for_partition with dynamic partitions,\n            # since this requires querying the instance once per run request for the\n            # existent dynamic partitions\n            check.failed(\n                "run_request_for_partition is not supported for dynamic partitions. 
Instead, use"\n                " RunRequest(partition_key=...)"\n            )\n\n        self.partitions_def.validate_partition_key(\n            partition_key,\n            current_time=current_time,\n            dynamic_partitions_store=dynamic_partitions_store,\n        )\n\n        run_config = (\n            run_config\n            if run_config is not None\n            else partitioned_config.get_run_config_for_partition_key(partition_key)\n        )\n        run_request_tags = {\n            **(tags or {}),\n            **partitioned_config.get_tags_for_partition_key(partition_key),\n        }\n\n        return RunRequest(\n            job_name=self.name,\n            run_key=run_key,\n            run_config=run_config,\n            tags=run_request_tags,\n            asset_selection=asset_selection,\n            partition_key=partition_key,\n        )\n\n    def resolve(\n        self,\n        asset_graph: "InternalAssetGraph",\n        default_executor_def: Optional["ExecutorDefinition"] = None,\n        resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n    ) -> "JobDefinition":\n        """Resolve this UnresolvedAssetJobDefinition into a JobDefinition."""\n        assets = asset_graph.assets\n        source_assets = asset_graph.source_assets\n        selected_asset_keys = self.selection.resolve(asset_graph)\n        selected_asset_checks = self.selection.resolve_checks(asset_graph)\n\n        asset_keys_by_partitions_def = defaultdict(set)\n        for asset_key in selected_asset_keys:\n            partitions_def = asset_graph.get_partitions_def(asset_key)\n            if partitions_def is not None:\n                asset_keys_by_partitions_def[partitions_def].add(asset_key)\n\n        if len(asset_keys_by_partitions_def) > 1:\n            keys_by_partitions_def_str = "\\n".join(\n                f"{partitions_def}: {asset_keys}"\n                for partitions_def, asset_keys in asset_keys_by_partitions_def.items()\n            )\n   
         raise DagsterInvalidDefinitionError(\n                f"Multiple partitioned assets exist in assets job '{self.name}'. Selected assets"\n                " must have the same partitions definitions, but the selected assets have"\n                f" different partitions definitions: \\n{keys_by_partitions_def_str}"\n            )\n\n        inferred_partitions_def = (\n            next(iter(asset_keys_by_partitions_def.keys()))\n            if asset_keys_by_partitions_def\n            else None\n        )\n        if (\n            inferred_partitions_def\n            and self.partitions_def != inferred_partitions_def\n            and self.partitions_def is not None\n        ):\n            raise DagsterInvalidDefinitionError(\n                f"Job '{self.name}' received a partitions_def of {self.partitions_def}, but the"\n                f" selected assets {next(iter(asset_keys_by_partitions_def.values()))} have a"\n                f" non-matching partitions_def of {inferred_partitions_def}"\n            )\n\n        return build_asset_selection_job(\n            name=self.name,\n            assets=assets,\n            asset_checks=asset_graph.asset_checks,\n            config=self.config,\n            source_assets=source_assets,\n            description=self.description,\n            tags=self.tags,\n            metadata=self.metadata,\n            asset_selection=selected_asset_keys,\n            asset_check_selection=selected_asset_checks,\n            partitions_def=self.partitions_def if self.partitions_def else inferred_partitions_def,\n            executor_def=self.executor_def or default_executor_def,\n            hooks=self.hooks,\n            resource_defs=resource_defs,\n        )\n\n\n
[docs]def define_asset_job(\n name: str,\n selection: Optional["CoercibleToAssetSelection"] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig", "RunConfig"]\n ] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet["HookDefinition"]] = None,\n) -> UnresolvedAssetJobDefinition:\n """Creates a definition of a job which will either materialize a selection of assets or observe\n a selection of source assets. This will only be resolved to a JobDefinition once placed in a\n code location.\n\n Args:\n name (str):\n The name for the job.\n selection (Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]):\n The assets that will be materialized or observed when the job is run.\n\n The selected assets must all be included in the assets that are passed to the assets\n argument of the Definitions object that this job is included on.\n\n The string "my_asset*" selects my_asset and all downstream assets within the code\n location. A list of strings represents the union of all assets selected by strings\n within the list.\n\n The selection will be resolved to a set of assets when the location is loaded. If the\n selection resolves to all source assets, the created job will perform source asset\n observations. If the selection resolves to all regular assets, the created job will\n materialize assets. 
If the selection resolves to a mixed set of source assets and\n regular assets, an error will be thrown.\n\n config:\n Describes how the Job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, which should return\n configuration in the standard format to configure the job.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]): Arbitrary metadata about the job.\n Keys are displayed string labels, and values are one of the following: string, float,\n int, JSON-serializable dict, JSON-serializable list, and one of the data classes\n returned by a MetadataValue static method.\n description (Optional[str]):\n A description for the Job.\n partitions_def (Optional[PartitionsDefinition]):\n Defines the set of partitions for this job. All AssetDefinitions selected for this job\n must have a matching PartitionsDefinition. If no PartitionsDefinition is provided, the\n PartitionsDefinition will be inferred from the selected AssetDefinitions.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. 
Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n\n\n Returns:\n UnresolvedAssetJobDefinition: The job, which can be placed inside a code location.\n\n Examples:\n .. code-block:: python\n\n # A job that targets all assets in the code location:\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n )\n\n # A job that targets a single asset\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets", selection=[asset1])],\n )\n\n # A job that targets all the assets in a group:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("marketing_job", selection=AssetSelection.groups("marketing"))],\n )\n\n @observable_source_asset\n def source_asset():\n ...\n\n # A job that observes a source asset:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("observation_job", selection=[source_asset])],\n )\n\n # Resources are supplied to the assets, not the job:\n @asset(required_resource_keys={"slack_client"})\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n resources={"slack_client": prod_slack_client},\n )\n\n """\n from dagster._core.definitions import AssetSelection\n\n # convert string-based selections to AssetSelection objects\n if selection is None:\n resolved_selection = AssetSelection.all()\n else:\n resolved_selection = AssetSelection.from_coercible(selection)\n\n return UnresolvedAssetJobDefinition(\n name=name,\n selection=resolved_selection,\n config=config,\n description=description,\n tags=tags,\n metadata=metadata,\n partitions_def=partitions_def,\n executor_def=executor_def,\n hooks=hooks,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/unresolved_asset_job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.unresolved_asset_job_definition"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.utils

\nimport keyword\nimport os\nimport re\nfrom glob import glob\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, cast\n\nimport yaml\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.storage.tags import check_reserved_tags\nfrom dagster._utils.yaml_utils import merge_yaml_strings, merge_yamls\n\nDEFAULT_OUTPUT = "result"\nDEFAULT_GROUP_NAME = "default"  # asset group_name used when none is provided\nDEFAULT_IO_MANAGER_KEY = "io_manager"\n\nDISALLOWED_NAMES = set(\n    [\n        "context",\n        "conf",\n        "config",\n        "meta",\n        "arg_dict",\n        "dict",\n        "input_arg_dict",\n        "output_arg_dict",\n        "int",\n        "str",\n        "float",\n        "bool",\n        "input",\n        "output",\n        "type",\n    ]\n    + list(keyword.kwlist)  # just disallow all python keywords\n)\n\nVALID_NAME_REGEX_STR = r"^[A-Za-z0-9_]+$"\nVALID_NAME_REGEX = re.compile(VALID_NAME_REGEX_STR)\n\n\nclass NoValueSentinel:\n    """Sentinel value to distinguish unset from None."""\n\n\ndef has_valid_name_chars(name: str) -> bool:\n    return bool(VALID_NAME_REGEX.match(name))\n\n\ndef check_valid_name(name: str, allow_list: Optional[List[str]] = None) -> str:\n    check.str_param(name, "name")\n\n    if allow_list and name in allow_list:\n        return name\n\n    if name in DISALLOWED_NAMES:\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. It conflicts with a Dagster or python'\n            " reserved keyword."\n        )\n\n    check_valid_chars(name)\n\n    check.invariant(is_valid_name(name))\n    return name\n\n\ndef check_valid_chars(name: str):\n    if not has_valid_name_chars(name):\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. 
Names must be in regex'\n            f" {VALID_NAME_REGEX_STR}."\n        )\n\n\ndef is_valid_name(name: str) -> bool:\n    check.str_param(name, "name")\n\n    return name not in DISALLOWED_NAMES and has_valid_name_chars(name)\n\n\ndef _kv_str(key: object, value: object) -> str:\n    return f'{key}="{value!r}"'\n\n\ndef struct_to_string(name: str, **kwargs: object) -> str:\n    # Sort the kwargs to ensure consistent representations across Python versions\n    props_str = ", ".join([_kv_str(key, value) for key, value in sorted(kwargs.items())])\n    return f"{name}({props_str})"\n\n\ndef validate_tags(\n    tags: Optional[Mapping[str, Any]], allow_reserved_tags: bool = True\n) -> Mapping[str, str]:\n    valid_tags: Dict[str, str] = {}\n    for key, value in check.opt_mapping_param(tags, "tags", key_type=str).items():\n        if not isinstance(value, str):\n            valid = False\n            err_reason = f'Could not JSON encode value "{value}"'\n            str_val = None\n            try:\n                str_val = seven.json.dumps(value)\n                err_reason = (\n                    'JSON encoding "{json}" of value "{val}" is not equivalent to original value'\n                    .format(json=str_val, val=value)\n                )\n\n                valid = seven.json.loads(str_val) == value\n            except Exception:\n                pass\n\n            if not valid:\n                raise DagsterInvalidDefinitionError(\n                    f'Invalid value for tag "{key}", {err_reason}. 
Tag values must be strings '\n                    "or meet the constraint that json.loads(json.dumps(value)) == value."\n                )\n\n            valid_tags[key] = str_val  # type: ignore  # (possible none)\n        else:\n            valid_tags[key] = value\n\n    if not allow_reserved_tags:\n        check_reserved_tags(valid_tags)\n\n    return valid_tags\n\n\ndef validate_group_name(group_name: Optional[str]) -> str:\n    """Ensures a string name is valid and returns a default if no name provided."""\n    if group_name:\n        check_valid_chars(group_name)\n        return group_name\n    return DEFAULT_GROUP_NAME\n\n\n
[docs]def config_from_files(config_files: Sequence[str]) -> Mapping[str, Any]:\n """Constructs run config from YAML files.\n\n Args:\n config_files (List[str]): List of paths or glob patterns for yaml files\n to load and parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from provided YAML files.\n\n Raises:\n DagsterInvariantViolationError: When a path or glob pattern produces no results, or\n when one of the YAML files is invalid and has a parse error.\n """\n config_files = check.opt_sequence_param(config_files, "config_files")\n\n filenames = []\n for file_glob in config_files or []:\n globbed_files = glob(file_glob)\n if not globbed_files:\n raise DagsterInvariantViolationError(\n f'File or glob pattern "{file_glob}" for "config_files" produced no results.'\n )\n\n filenames += [os.path.realpath(globbed_file) for globbed_file in globbed_files]\n\n try:\n run_config = merge_yamls(filenames)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing files {filenames} "\n f"loaded by file/patterns {config_files}."\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_yaml_strings(yaml_strings: Sequence[str]) -> Mapping[str, Any]:\n """Static constructor for run configs from YAML strings.\n\n Args:\n yaml_strings (List[str]): List of yaml strings to parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from the provided yaml strings.\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n yaml_strings = check.sequence_param(yaml_strings, "yaml_strings", of_type=str)\n\n try:\n run_config = merge_yaml_strings(yaml_strings)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing YAMLs {yaml_strings} "\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_pkg_resources(pkg_resource_defs: Sequence[Tuple[str, str]]) -> Mapping[str, Any]:\n """Load a run config from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n .. code-block:: python\n\n config_from_pkg_resources(\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n pkg_resource_defs (List[(str, str)]): List of pkg_resource modules/files to\n load as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from the provided package resources.\n\n Raises:\n DagsterInvariantViolationError: When a package resource cannot be found or decoded, or\n when one of the YAML documents is invalid and has a parse error.\n """\n import pkg_resources # expensive, import only on use\n\n pkg_resource_defs = check.sequence_param(pkg_resource_defs, "pkg_resource_defs", of_type=tuple)\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs}."\n ) from err\n\n return config_from_yaml_strings(yaml_strings=yaml_strings)
\n
", "current_page_name": "_modules/dagster/_core/definitions/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.utils"}, "version_strategy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.version_strategy

\nimport hashlib\nimport inspect\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Any, NamedTuple, Optional\n\nfrom dagster._annotations import public\n\nif TYPE_CHECKING:\n    from .op_definition import OpDefinition\n    from .resource_definition import ResourceDefinition\n\n\n
[docs]class OpVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for an op.\n\n Attributes:\n op_def (OpDefinition): The definition of the op to compute a version for.\n op_config (Any): The parsed config to be passed to the op during execution.\n """\n\n op_def: "OpDefinition"\n op_config: Any
\n\n\n
[docs]class ResourceVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for a resource.\n\n Attributes:\n resource_def (ResourceDefinition): The definition of the resource whose version will be computed.\n resource_config (Any): The parsed config to be passed to the resource during execution.\n """\n\n resource_def: "ResourceDefinition"\n resource_config: Any
\n\n\n
[docs]class VersionStrategy(ABC):\n """Abstract class for defining a strategy to version ops and resources.\n\n When subclassing, `get_op_version` must be implemented, and\n `get_resource_version` can be optionally implemented.\n\n `get_op_version` should ingest an OpVersionContext, and `get_resource_version` should ingest a\n ResourceVersionContext. From that, each should synthesize a unique string called\n a `version`, which will\n be tagged to outputs of that op in the job. Providing a\n `VersionStrategy` instance to a\n job will enable memoization on that job, such that only steps whose\n outputs do not have an up-to-date version will run.\n """\n\n
[docs] @public\n @abstractmethod\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return None
\n\n\n
[docs]class SourceHashVersionStrategy(VersionStrategy):\n """VersionStrategy that checks for changes to the source code of ops and resources.\n\n Only checks for changes within the immediate body of the op/resource's\n decorated function (or compute function, if the op/resource was\n constructed directly from a definition).\n """\n\n def _get_source_hash(self, fn):\n code_as_str = inspect.getsource(fn)\n return hashlib.sha1(code_as_str.encode("utf-8")).hexdigest()\n\n
[docs] @public\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op by hashing its source code.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n compute_fn = context.op_def.compute_fn\n if callable(compute_fn):\n return self._get_source_hash(compute_fn)\n else:\n return self._get_source_hash(compute_fn.decorated_fn)
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource by hashing its source code.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return self._get_source_hash(context.resource_def.resource_fn)
\n
", "current_page_name": "_modules/dagster/_core/definitions/version_strategy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.version_strategy"}}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.errors

\n"""Core Dagster error classes.\n\nAll errors thrown by the Dagster framework inherit from :py:class:`~dagster.DagsterError`. Users\nshould not subclass this base class for their own exceptions.\n\nThere is another exception base class, :py:class:`~dagster.DagsterUserCodeExecutionError`, which is\nused by the framework in concert with the :py:func:`~dagster._core.errors.user_code_error_boundary`.\n\nDagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\n:py:class:`~dagster.DagsterUserCodeExecutionError`.\n\nThe wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.\n"""\n\nimport sys\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, Type\n\nimport dagster._check as check\nfrom dagster._utils.interrupts import raise_interrupts_as\n\nif TYPE_CHECKING:\n    from dagster._core.log_manager import DagsterLogManager\n\n\nclass DagsterExecutionInterruptedError(BaseException):\n    """Pipeline execution was interrupted during the execution process.\n\n    Just like KeyboardInterrupt this inherits from BaseException\n    as to not be accidentally caught by code that catches Exception\n    and thus prevent the interpreter from exiting.\n    """\n\n\n
[docs]class DagsterError(Exception):\n """Base class for all errors thrown by the Dagster framework.\n\n Users should not subclass this base class for their own exceptions.\n """\n\n @property\n def is_user_code_error(self):\n """Returns true if this error is attributable to user code."""\n return False
\n\n\n
[docs]class DagsterInvalidDefinitionError(DagsterError):\n """Indicates that the rules for a definition have been violated by the user."""
\n\n\nclass DagsterInvalidObservationError(DagsterError):\n """Indicates that an invalid value was returned from a source asset observation function."""\n\n\n
[docs]class DagsterInvalidSubsetError(DagsterError):\n """Indicates that a subset of a job is invalid because either:\n - One or more ops in the specified subset do not exist on the job.\n - The subset produces an invalid job.\n """
\n\n\nclass DagsterInvalidDeserializationVersionError(DagsterError):\n """Indicates that a serialized value has an unsupported version and cannot be deserialized."""\n\n\nPYTHONIC_CONFIG_ERROR_VERBIAGE = """\nThis config type can be a:\n - Python primitive type\n - int, float, bool, str, list\n - A Python Dict or List type containing other valid types\n - Custom data classes extending dagster.Config\n - A Pydantic discriminated union type (https://docs.pydantic.dev/usage/types/#discriminated-unions-aka-tagged-unions)\n"""\n\nPYTHONIC_RESOURCE_ADDITIONAL_TYPES = """\n\nIf this config type represents a resource dependency, its annotation must either:\n - Extend dagster.ConfigurableResource, dagster.ConfigurableIOManager, or\n - Be wrapped in a ResourceDependency annotation, e.g. ResourceDependency[{invalid_type_str}]\n"""\n\n\ndef _generate_pythonic_config_error_message(\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: bool = False,\n) -> str:\n invalid_type_name = getattr(invalid_type, "__name__", "<my type>")\n pythonic_config_error_verbiage = (\n PYTHONIC_CONFIG_ERROR_VERBIAGE + (PYTHONIC_RESOURCE_ADDITIONAL_TYPES if is_resource else "")\n ).format(invalid_type_str=invalid_type_name)\n\n return ("""\nError defining Dagster config class{config_class}{field_name}.\nUnable to resolve config type {invalid_type} to a supported Dagster config type.\n\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""").format(\n config_class=f" {config_class!r}" if config_class else "",\n field_name=f" on field '{field_name}'" if field_name else "",\n invalid_type=repr(invalid_type),\n PYTHONIC_CONFIG_ERROR_VERBIAGE=pythonic_config_error_verbiage,\n )\n\n\nclass DagsterInvalidPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with an invalid value."""\n\n def __init__(\n self,\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: 
bool = False,\n **kwargs,\n ):\n self.invalid_type = invalid_type\n self.field_name = field_name\n self.config_class = config_class\n super(DagsterInvalidPythonicConfigDefinitionError, self).__init__(\n _generate_pythonic_config_error_message(\n config_class=config_class,\n field_name=field_name,\n invalid_type=invalid_type,\n is_resource=is_resource,\n ),\n **kwargs,\n )\n\n\nclass DagsterInvalidDagsterTypeInPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with a DagsterType\n annotated field.\n """\n\n def __init__(\n self,\n config_class_name: str,\n field_name: Optional[str],\n **kwargs,\n ):\n self.field_name = field_name\n super(DagsterInvalidDagsterTypeInPythonicConfigDefinitionError, self).__init__(\n f"""Error defining Dagster config class '{config_class_name}' on field '{field_name}'. DagsterTypes cannot be used to annotate a config type. DagsterType is meant only for type checking and coercion in op and asset inputs and outputs.\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""",\n **kwargs,\n )\n\n\nCONFIG_ERROR_VERBIAGE = """\nThis value can be a:\n - Field\n - Python primitive types that resolve to dagster config types\n - int, float, bool, str, list.\n - A dagster config type: Int, Float, Bool, Array, Optional, Selector, Shape, Permissive, Map\n - A bare python dictionary, which is wrapped in Field(Shape(...)). Any values\n in the dictionary get resolved by the same rules, recursively.\n - A python list with a single entry that can resolve to a type, e.g. [int]\n"""\n\n\n
[docs]class DagsterInvalidConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a config with an invalid value.\n\n Acceptable values for config types are any of:\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type: :py:data:`~dagster.Int`, :py:data:`~dagster.Float`,\n :py:data:`~dagster.Bool`, :py:data:`~dagster.String`,\n :py:data:`~dagster.StringSource`, :py:data:`~dagster.Any`,\n :py:class:`~dagster.Array`, :py:data:`~dagster.Noneable`, :py:data:`~dagster.Enum`,\n :py:class:`~dagster.Selector`, :py:class:`~dagster.Shape`, or\n :py:class:`~dagster.Permissive`.\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n 5. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self, original_root, current_value, stack, reason=None, **kwargs):\n self.original_root = original_root\n self.current_value = current_value\n self.stack = stack\n super(DagsterInvalidConfigDefinitionError, self).__init__(\n (\n "Error defining config. Original value passed: {original_root}. "\n "{stack_str}{current_value} "\n "cannot be resolved.{reason_str}"\n + CONFIG_ERROR_VERBIAGE\n ).format(\n original_root=repr(original_root),\n stack_str="Error at stack path :" + ":".join(stack) + ". " if stack else "",\n current_value=repr(current_value),\n reason_str=f" Reason: {reason}." if reason else "",\n ),\n **kwargs,\n )
\n\n\n
[docs]class DagsterInvariantViolationError(DagsterError):\n """Indicates the user has violated a well-defined invariant that can only be enforced\n at runtime.\n """
\n\n\n
[docs]class DagsterExecutionStepNotFoundError(DagsterError):\n """Thrown when the user specifies execution step keys that do not exist."""\n\n def __init__(self, *args, **kwargs):\n self.step_keys = check.list_param(kwargs.pop("step_keys"), "step_keys", str)\n super(DagsterExecutionStepNotFoundError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterExecutionPlanSnapshotNotFoundError(DagsterError):\n """Thrown when an expected execution plan snapshot could not be found on a PipelineRun."""\n\n\n
[docs]class DagsterRunNotFoundError(DagsterError):\n """Thrown when a run cannot be found in run storage."""\n\n def __init__(self, *args, **kwargs):\n self.invalid_run_id = check.str_param(kwargs.pop("invalid_run_id"), "invalid_run_id")\n super(DagsterRunNotFoundError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterStepOutputNotFoundError(DagsterError):\n """Indicates that previous step outputs required for an execution step to proceed are not\n available.\n """\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterStepOutputNotFoundError, self).__init__(*args, **kwargs)
\n\n\n@contextmanager\ndef raise_execution_interrupts() -> Iterator[None]:\n with raise_interrupts_as(DagsterExecutionInterruptedError):\n yield\n\n\n
[docs]@contextmanager\ndef user_code_error_boundary(\n error_cls: Type["DagsterUserCodeExecutionError"],\n msg_fn: Callable[[], str],\n log_manager: Optional["DagsterLogManager"] = None,\n **kwargs: object,\n) -> Iterator[None]:\n """Wraps the execution of user-space code in an error boundary. This places a uniform\n policy around any user code invoked by the framework. This ensures that all user\n errors are wrapped in an exception derived from DagsterUserCodeExecutionError,\n and that the original stack trace of the user error is preserved, so that it\n can be reported without confusing framework code in the stack trace, if a\n tool author wishes to do so.\n\n Examples:\n .. code-block:: python\n\n with user_code_error_boundary(\n # Pass a class that inherits from DagsterUserCodeExecutionError\n DagsterExecutionStepExecutionError,\n # Pass a function that produces a message\n "Error occurred during step execution"\n ):\n call_user_provided_function()\n\n """\n check.callable_param(msg_fn, "msg_fn")\n check.class_param(error_cls, "error_cls", superclass=DagsterUserCodeExecutionError)\n\n with raise_execution_interrupts():\n if log_manager:\n log_manager.begin_python_log_capture()\n try:\n yield\n except DagsterError as de:\n # The system has thrown an error that is part of the user-framework contract\n raise de\n except Exception as e:\n # An exception has been thrown by user code and computation should cease\n # with the error reported further up the stack\n raise error_cls(\n msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs\n ) from e\n finally:\n if log_manager:\n log_manager.end_python_log_capture()
\n\n\n
[docs]class DagsterUserCodeExecutionError(DagsterError):\n """This is the base class for any exception that is meant to wrap an\n :py:class:`~python:Exception` thrown by user code. It wraps that existing user code.\n The ``original_exc_info`` argument to the constructor is meant to be a tuple of the type\n returned by :py:func:`sys.exc_info <python:sys.exc_info>` at the call site of the constructor.\n\n Users should not subclass this base class for their own exceptions and should instead throw\n freely from user code. User exceptions will be automatically wrapped and rethrown.\n """\n\n def __init__(self, *args, **kwargs):\n # original_exc_info should be gotten from a sys.exc_info() call at the\n # callsite inside of the exception handler. this will allow consuming\n # code to *re-raise* the user error in it's original format\n # for cleaner error reporting that does not have framework code in it\n user_exception = check.inst_param(kwargs.pop("user_exception"), "user_exception", Exception)\n original_exc_info = check.tuple_param(kwargs.pop("original_exc_info"), "original_exc_info")\n\n check.invariant(original_exc_info[0] is not None)\n\n super(DagsterUserCodeExecutionError, self).__init__(args[0], *args[1:], **kwargs)\n\n self.user_exception = check.opt_inst_param(user_exception, "user_exception", Exception)\n self.original_exc_info = original_exc_info\n\n @property\n def is_user_code_error(self) -> bool:\n return True
\n\n\n
[docs]class DagsterTypeCheckError(DagsterUserCodeExecutionError):\n """Indicates an error in the op type system at runtime. E.g. a op receives an\n unexpected input, or produces an output that does not match the type of the output definition.\n """
\n\n\nclass DagsterExecutionLoadInputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.input_name = check.str_param(kwargs.pop("input_name"), "input_name")\n super(DagsterExecutionLoadInputError, self).__init__(*args, **kwargs)\n\n\nclass DagsterExecutionHandleOutputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while handling an output for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterExecutionHandleOutputError, self).__init__(*args, **kwargs)\n\n\n
[docs]class DagsterExecutionStepExecutionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of an execution step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.op_name = check.str_param(kwargs.pop("op_name"), "op_name")\n self.op_def_name = check.str_param(kwargs.pop("op_def_name"), "op_def_name")\n super(DagsterExecutionStepExecutionError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterResourceFunctionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of the ``resource_fn`` in a\n :py:class:`~dagster.ResourceDefinition` during resource initialization.\n """
\n\n\n
[docs]class DagsterConfigMappingFunctionError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of a config mapping\n function defined in a :py:class:`~dagster.JobDefinition` or `~dagster.GraphDefinition` during\n config parsing.\n """
\n\n\nclass DagsterTypeLoadingError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of an type load\n function defined in a :py:class:`~dagster.DagsterTypeLoader` during loading of a custom type.\n """\n\n\n
[docs]class DagsterUnknownResourceError(DagsterError, AttributeError):\n # inherits from AttributeError as it is raised within a __getattr__ call... used to support\n # object hasattr method\n """Indicates that an unknown resource was accessed in the body of an execution step. May often\n happen by accessing a resource in the compute function of an op without first supplying the\n op with the correct `required_resource_keys` argument.\n """\n\n def __init__(self, resource_name, *args, **kwargs):\n self.resource_name = check.str_param(resource_name, "resource_name")\n msg = (\n f"Unknown resource `{resource_name}`. Specify `{resource_name}` as a required resource "\n "on the compute / config function that accessed it."\n )\n super(DagsterUnknownResourceError, self).__init__(msg, *args, **kwargs)
\n\n\nclass DagsterInvalidInvocationError(DagsterError):\n """Indicates that an error has occurred when an op has been invoked, but before the actual\n core compute has been reached.\n """\n\n\n
[docs]class DagsterInvalidConfigError(DagsterError):\n """Thrown when provided config is invalid (does not type check against the relevant config\n schema).\n """\n\n def __init__(self, preamble, errors, config_value, *args, **kwargs):\n from dagster._config import EvaluationError\n\n check.str_param(preamble, "preamble")\n self.errors = check.list_param(errors, "errors", of_type=EvaluationError)\n self.config_value = config_value\n\n error_msg = preamble\n error_messages = []\n\n for i_error, error in enumerate(self.errors):\n error_messages.append(error.message)\n error_msg += f"\\n Error {i_error + 1}: {error.message}"\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterInvalidConfigError, self).__init__(error_msg, *args, **kwargs)
\n\n\n
[docs]class DagsterUnmetExecutorRequirementsError(DagsterError):\n """Indicates the resolved executor is incompatible with the state of other systems\n such as the :py:class:`~dagster._core.instance.DagsterInstance` or system storage configuration.\n """
\n\n\n
[docs]class DagsterSubprocessError(DagsterError):\n """An exception has occurred in one or more of the child processes dagster manages.\n This error forwards the message and stack trace for all of the collected errors.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.subprocess_error_infos = check.list_param(\n kwargs.pop("subprocess_error_infos"), "subprocess_error_infos", SerializableErrorInfo\n )\n super(DagsterSubprocessError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterUserCodeUnreachableError(DagsterError):\n """Dagster was unable to reach a user code server to fetch information about user code."""\n\n\nclass DagsterUserCodeProcessError(DagsterError):\n """An exception has occurred in a user code process that the host process raising this error\n was communicating with.\n """\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterUserCodeProcessError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterUserCodeProcessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterMaxRetriesExceededError(DagsterError):\n """Raised when raise_on_error is true, and retries were exceeded, this error should be raised."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterMaxRetriesExceededError, self).__init__(*args, **kwargs)\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterMaxRetriesExceededError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n\nclass DagsterCodeLocationNotFoundError(DagsterError):\n pass\n\n\nclass DagsterCodeLocationLoadError(DagsterError):\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.load_error_infos = 
check.list_param(\n kwargs.pop("load_error_infos"),\n "load_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterCodeLocationLoadError, self).__init__(*args, **kwargs)\n\n\nclass DagsterLaunchFailedError(DagsterError):\n """Indicates an error while attempting to launch a pipeline run."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterLaunchFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterBackfillFailedError(DagsterError):\n """Indicates an error while attempting to launch a backfill."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterBackfillFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterRunAlreadyExists(DagsterError):\n """Indicates that a pipeline run already exists in a run storage."""\n\n\nclass DagsterSnapshotDoesNotExist(DagsterError):\n """Indicates you attempted to create a pipeline run with a nonexistent snapshot id."""\n\n\nclass DagsterRunConflict(DagsterError):\n """Indicates that a conflicting pipeline run exists in a run storage."""\n\n\n
[docs]class DagsterTypeCheckDidNotPass(DagsterError):\n """Indicates that a type check failed.\n\n This is raised when ``raise_on_error`` is ``True`` in calls to the synchronous job and\n graph execution APIs (e.g. `graph.execute_in_process()`, `job.execute_in_process()` -- typically\n within a test), and a :py:class:`~dagster.DagsterType`'s type check fails by returning either\n ``False`` or an instance of :py:class:`~dagster.TypeCheck` whose ``success`` member is ``False``.\n """\n\n def __init__(self, description=None, metadata=None, dagster_type=None):\n from dagster import DagsterType\n from dagster._core.definitions.metadata import normalize_metadata\n\n super(DagsterTypeCheckDidNotPass, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n )\n self.dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)
\n\n\nclass DagsterAssetCheckFailedError(DagsterError):\n """Indicates than an asset check failed."""\n\n\n
[docs]class DagsterEventLogInvalidForRun(DagsterError):\n """Raised when the event logs for a historical run are malformed or invalid."""\n\n def __init__(self, run_id):\n self.run_id = check.str_param(run_id, "run_id")\n super(DagsterEventLogInvalidForRun, self).__init__(\n f"Event logs invalid for run id {run_id}"\n )
\n\n\nclass ScheduleExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of schedule."""\n\n\nclass SensorExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of a sensor (or its job)."""\n\n\nclass PartitionExecutionError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions of a partition set schedule."""\n\n\nclass DagsterInvalidAssetKey(DagsterError):\n """Error raised by invalid asset key."""\n\n\nclass DagsterInvalidMetadata(DagsterError):\n """Error raised by invalid metadata parameters."""\n\n\nclass HookExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined hook."""\n\n\nclass RunStatusSensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined run status sensor."""\n\n\nclass FreshnessPolicySensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined freshness policy sensor."""\n\n\nclass DagsterImportError(DagsterError):\n """Import error raised while importing user-code."""\n\n\nclass JobError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions for a defined Job."""\n\n\nclass DagsterUnknownStepStateError(DagsterError):\n """When job execution completes with steps in an unknown state."""\n\n\nclass DagsterObjectStoreError(DagsterError):\n """Errors during an object store operation."""\n\n\nclass DagsterInvalidPropertyError(DagsterError):\n """Indicates that an invalid property was accessed. 
May often happen by accessing a property\n that no longer exists after breaking changes.\n """\n\n\nclass DagsterHomeNotSetError(DagsterError):\n """The user has tried to use a command that requires an instance or invoke DagsterInstance.get()\n without setting DAGSTER_HOME env var.\n """\n\n\nclass DagsterUnknownPartitionError(DagsterError):\n """The user has tried to access run config for a partition name that does not exist."""\n\n\nclass DagsterUndefinedDataVersionError(DagsterError):\n """The user attempted to retrieve the most recent logical version for an asset, but no logical version is defined."""\n\n\nclass DagsterAssetBackfillDataLoadError(DagsterError):\n """Indicates that an asset backfill is now unloadable. May happen when (1) a code location containing\n targeted assets is unloadable or (2) and asset or an asset's partitions definition has been removed.\n """\n\n\nclass DagsterDefinitionChangedDeserializationError(DagsterError):\n """Indicates that a stored value can't be deserialized because the definition needed to interpret\n it has changed.\n """\n\n\nclass DagsterExternalExecutionError(DagsterError):\n """Indicates that an error occurred during the execution of an external process."""\n
", "current_page_name": "_modules/dagster/_core/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.errors"}, "event_api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.event_api

\nfrom datetime import datetime\nfrom typing import Callable, Mapping, NamedTuple, Optional, Sequence, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.errors import DagsterInvalidInvocationError\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._serdes import whitelist_for_serdes\n\nEventHandlerFn: TypeAlias = Callable[[EventLogEntry, str], None]\n\n\n
[docs]class RunShardedEventsCursor(NamedTuple):\n """Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\n performance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\n run-sharded storages, the id field is ignored, since they may not be unique across shards.\n """\n\n id: int\n run_updated_after: datetime
\n\n\n
[docs]@whitelist_for_serdes\nclass EventLogRecord(NamedTuple):\n """Internal representation of an event record, as stored in a\n :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not instantiate this class directly.\n """\n\n storage_id: PublicAttr[int]\n event_log_entry: PublicAttr[EventLogEntry]\n\n @property\n def run_id(self) -> str:\n return self.event_log_entry.run_id\n\n @property\n def timestamp(self) -> float:\n return self.event_log_entry.timestamp\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.asset_key\n\n return None\n\n @property\n def partition_key(self) -> Optional[str]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.partition\n\n return None\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n return self.event_log_entry.asset_materialization\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n return self.event_log_entry.asset_observation
\n\n\n
[docs]@whitelist_for_serdes\nclass EventRecordsFilter(\n NamedTuple(\n "_EventRecordsFilter",\n [\n ("event_type", DagsterEventType),\n ("asset_key", Optional[AssetKey]),\n ("asset_partitions", Optional[Sequence[str]]),\n ("after_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("before_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("after_timestamp", Optional[float]),\n ("before_timestamp", Optional[float]),\n ("storage_ids", Optional[Sequence[int]]),\n ("tags", Optional[Mapping[str, Union[str, Sequence[str]]]]),\n ],\n )\n):\n """Defines a set of filter fields for fetching a set of event log entries or event log records.\n\n Args:\n event_type (DagsterEventType): Filter argument for dagster event type\n asset_key (Optional[AssetKey]): Asset key for which to get asset materialization event\n entries / records.\n asset_partitions (Optional[List[str]]): Filter parameter such that only asset\n events with a partition value matching one of the provided values. Only\n valid when the `asset_key` parameter is provided.\n after_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that only\n records with storage_id greater than the provided value are returned. Using a\n run-sharded events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n before_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that\n records with storage_id less than the provided value are returned. 
Using a run-sharded\n events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n after_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp greater than the provided value are returned.\n before_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp less than the provided value are returned.\n """\n\n def __new__(\n cls,\n event_type: DagsterEventType,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[Sequence[str]] = None,\n after_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n before_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n after_timestamp: Optional[float] = None,\n before_timestamp: Optional[float] = None,\n storage_ids: Optional[Sequence[int]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n ):\n check.opt_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.inst_param(event_type, "event_type", DagsterEventType)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n if tags and event_type is not DagsterEventType.ASSET_MATERIALIZATION:\n raise DagsterInvalidInvocationError(\n "Can only filter by tags for asset materialization events"\n )\n\n # type-ignores work around mypy type inference bug\n return super(EventRecordsFilter, cls).__new__(\n cls,\n event_type=event_type,\n asset_key=check.opt_inst_param(asset_key, "asset_key", AssetKey),\n asset_partitions=asset_partitions,\n after_cursor=check.opt_inst_param(\n after_cursor, "after_cursor", (int, RunShardedEventsCursor)\n ),\n before_cursor=check.opt_inst_param(\n before_cursor, "before_cursor", (int, RunShardedEventsCursor)\n ),\n after_timestamp=check.opt_float_param(after_timestamp, "after_timestamp"),\n before_timestamp=check.opt_float_param(before_timestamp, "before_timestamp"),\n 
storage_ids=check.opt_nullable_sequence_param(storage_ids, "storage_ids", of_type=int),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/event_api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.event_api"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events

\n"""Structured representations of system events."""\nimport logging\nimport os\nimport sys\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    HookDefinition,\n    NodeHandle,\n)\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.events import AssetLineageInfo, ObjectStoreOperationType\nfrom dagster._core.definitions.metadata import (\n    MetadataFieldSerializer,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import HookExecutionError\nfrom dagster._core.execution.context.system import IPlanContext, IStepContext, StepExecutionContext\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.inputs import StepInputData\nfrom dagster._core.execution.plan.objects import StepFailureData, StepRetryData, StepSuccessData\nfrom dagster._core.execution.plan.outputs import StepOutputData\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.serdes import UnpackContext\nfrom dagster._utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info\nfrom dagster._utils.timing import format_duration\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import ObjectStoreOperation\n    from dagster._core.execution.plan.plan 
import ExecutionPlan\n    from dagster._core.execution.plan.step import StepKind\n\n\nEventSpecificData = Union[\n    StepOutputData,\n    StepFailureData,\n    StepSuccessData,\n    "StepMaterializationData",\n    "StepExpectationResultData",\n    StepInputData,\n    "EngineEventData",\n    "HookErroredData",\n    StepRetryData,\n    "JobFailureData",\n    "JobCanceledData",\n    "ObjectStoreOperationResultData",\n    "HandledOutputData",\n    "LoadedInputData",\n    "ComputeLogsCaptureData",\n    "AssetObservationData",\n    "AssetMaterializationPlannedData",\n    "AssetCheckEvaluation",\n    "AssetCheckEvaluationPlanned",\n]\n\n\n
[docs]class DagsterEventType(str, Enum):\n """The types of events that may be yielded by op and job execution."""\n\n STEP_OUTPUT = "STEP_OUTPUT"\n STEP_INPUT = "STEP_INPUT"\n STEP_FAILURE = "STEP_FAILURE"\n STEP_START = "STEP_START"\n STEP_SUCCESS = "STEP_SUCCESS"\n STEP_SKIPPED = "STEP_SKIPPED"\n\n # The process carrying out step execution is starting/started. Shown as a\n # marker start/end in the Dagster UI.\n STEP_WORKER_STARTING = "STEP_WORKER_STARTING"\n STEP_WORKER_STARTED = "STEP_WORKER_STARTED"\n\n # Resource initialization for execution has started/succeede/failed. Shown\n # as a marker start/end in the Dagster UI.\n RESOURCE_INIT_STARTED = "RESOURCE_INIT_STARTED"\n RESOURCE_INIT_SUCCESS = "RESOURCE_INIT_SUCCESS"\n RESOURCE_INIT_FAILURE = "RESOURCE_INIT_FAILURE"\n\n STEP_UP_FOR_RETRY = "STEP_UP_FOR_RETRY" # "failed" but want to retry\n STEP_RESTARTED = "STEP_RESTARTED"\n\n ASSET_MATERIALIZATION = "ASSET_MATERIALIZATION"\n ASSET_MATERIALIZATION_PLANNED = "ASSET_MATERIALIZATION_PLANNED"\n ASSET_OBSERVATION = "ASSET_OBSERVATION"\n STEP_EXPECTATION_RESULT = "STEP_EXPECTATION_RESULT"\n ASSET_CHECK_EVALUATION_PLANNED = "ASSET_CHECK_EVALUATION_PLANNED"\n ASSET_CHECK_EVALUATION = "ASSET_CHECK_EVALUATION"\n\n # We want to display RUN_* events in the Dagster UI and in our LogManager output, but in order to\n # support backcompat for our storage layer, we need to keep the persisted value to be strings\n # of the form "PIPELINE_*". 
We may have user code that pass in the DagsterEventType\n # enum values into storage APIs (like get_event_records, which takes in an EventRecordsFilter).\n RUN_ENQUEUED = "PIPELINE_ENQUEUED"\n RUN_DEQUEUED = "PIPELINE_DEQUEUED"\n RUN_STARTING = "PIPELINE_STARTING" # Launch is happening, execution hasn't started yet\n RUN_START = "PIPELINE_START" # Execution has started\n RUN_SUCCESS = "PIPELINE_SUCCESS"\n RUN_FAILURE = "PIPELINE_FAILURE"\n RUN_CANCELING = "PIPELINE_CANCELING"\n RUN_CANCELED = "PIPELINE_CANCELED"\n\n # Keep these legacy enum values around, to keep back-compatability for user code that might be\n # using these constants to filter event records\n PIPELINE_ENQUEUED = RUN_ENQUEUED\n PIPELINE_DEQUEUED = RUN_DEQUEUED\n PIPELINE_STARTING = RUN_STARTING\n PIPELINE_START = RUN_START\n PIPELINE_SUCCESS = RUN_SUCCESS\n PIPELINE_FAILURE = RUN_FAILURE\n PIPELINE_CANCELING = RUN_CANCELING\n PIPELINE_CANCELED = RUN_CANCELED\n\n OBJECT_STORE_OPERATION = "OBJECT_STORE_OPERATION"\n ASSET_STORE_OPERATION = "ASSET_STORE_OPERATION"\n LOADED_INPUT = "LOADED_INPUT"\n HANDLED_OUTPUT = "HANDLED_OUTPUT"\n\n ENGINE_EVENT = "ENGINE_EVENT"\n\n HOOK_COMPLETED = "HOOK_COMPLETED"\n HOOK_ERRORED = "HOOK_ERRORED"\n HOOK_SKIPPED = "HOOK_SKIPPED"\n\n ALERT_START = "ALERT_START"\n ALERT_SUCCESS = "ALERT_SUCCESS"\n ALERT_FAILURE = "ALERT_FAILURE"\n\n LOGS_CAPTURED = "LOGS_CAPTURED"
\n\n\nEVENT_TYPE_VALUE_TO_DISPLAY_STRING = {\n "PIPELINE_ENQUEUED": "RUN_ENQUEUED",\n "PIPELINE_DEQUEUED": "RUN_DEQUEUED",\n "PIPELINE_STARTING": "RUN_STARTING",\n "PIPELINE_START": "RUN_START",\n "PIPELINE_SUCCESS": "RUN_SUCCESS",\n "PIPELINE_FAILURE": "RUN_FAILURE",\n "PIPELINE_CANCELING": "RUN_CANCELING",\n "PIPELINE_CANCELED": "RUN_CANCELED",\n}\n\nSTEP_EVENTS = {\n DagsterEventType.STEP_INPUT,\n DagsterEventType.STEP_START,\n DagsterEventType.STEP_OUTPUT,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.STEP_SUCCESS,\n DagsterEventType.STEP_SKIPPED,\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.STEP_EXPECTATION_RESULT,\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.OBJECT_STORE_OPERATION,\n DagsterEventType.HANDLED_OUTPUT,\n DagsterEventType.LOADED_INPUT,\n DagsterEventType.STEP_RESTARTED,\n DagsterEventType.STEP_UP_FOR_RETRY,\n}\n\nFAILURE_EVENTS = {\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.RUN_CANCELED,\n}\n\nPIPELINE_EVENTS = {\n DagsterEventType.RUN_ENQUEUED,\n DagsterEventType.RUN_DEQUEUED,\n DagsterEventType.RUN_STARTING,\n DagsterEventType.RUN_START,\n DagsterEventType.RUN_SUCCESS,\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.RUN_CANCELING,\n DagsterEventType.RUN_CANCELED,\n}\n\nHOOK_EVENTS = {\n DagsterEventType.HOOK_COMPLETED,\n DagsterEventType.HOOK_ERRORED,\n DagsterEventType.HOOK_SKIPPED,\n}\n\nALERT_EVENTS = {\n DagsterEventType.ALERT_START,\n DagsterEventType.ALERT_SUCCESS,\n DagsterEventType.ALERT_FAILURE,\n}\n\nMARKER_EVENTS = {\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n}\n\n\nEVENT_TYPE_TO_PIPELINE_RUN_STATUS = {\n DagsterEventType.RUN_START: DagsterRunStatus.STARTED,\n DagsterEventType.RUN_SUCCESS: DagsterRunStatus.SUCCESS,\n 
DagsterEventType.RUN_FAILURE: DagsterRunStatus.FAILURE,\n DagsterEventType.RUN_ENQUEUED: DagsterRunStatus.QUEUED,\n DagsterEventType.RUN_STARTING: DagsterRunStatus.STARTING,\n DagsterEventType.RUN_CANCELING: DagsterRunStatus.CANCELING,\n DagsterEventType.RUN_CANCELED: DagsterRunStatus.CANCELED,\n}\n\nPIPELINE_RUN_STATUS_TO_EVENT_TYPE = {v: k for k, v in EVENT_TYPE_TO_PIPELINE_RUN_STATUS.items()}\n\nASSET_EVENTS = {\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n}\n\nASSET_CHECK_EVENTS = {\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n}\n\n\ndef _assert_type(\n method: str,\n expected_type: Union[DagsterEventType, Sequence[DagsterEventType]],\n actual_type: DagsterEventType,\n) -> None:\n _expected_type = (\n [expected_type] if isinstance(expected_type, DagsterEventType) else expected_type\n )\n check.invariant(\n actual_type in _expected_type,\n f"{method} only callable when event_type is"\n f" {','.join([t.value for t in _expected_type])}, called on {actual_type}",\n )\n\n\ndef _validate_event_specific_data(\n event_type: DagsterEventType, event_specific_data: Optional["EventSpecificData"]\n) -> Optional["EventSpecificData"]:\n if event_type == DagsterEventType.STEP_OUTPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepOutputData)\n elif event_type == DagsterEventType.STEP_FAILURE:\n check.inst_param(event_specific_data, "event_specific_data", StepFailureData)\n elif event_type == DagsterEventType.STEP_SUCCESS:\n check.inst_param(event_specific_data, "event_specific_data", StepSuccessData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION:\n check.inst_param(event_specific_data, "event_specific_data", StepMaterializationData)\n elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n check.inst_param(event_specific_data, "event_specific_data", StepExpectationResultData)\n elif event_type == 
DagsterEventType.STEP_INPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepInputData)\n elif event_type in (\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n ):\n check.inst_param(event_specific_data, "event_specific_data", EngineEventData)\n elif event_type == DagsterEventType.HOOK_ERRORED:\n check.inst_param(event_specific_data, "event_specific_data", HookErroredData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n check.inst_param(\n event_specific_data, "event_specific_data", AssetMaterializationPlannedData\n )\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluationPlanned)\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluation)\n\n return event_specific_data\n\n\ndef log_step_event(step_context: IStepContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n step_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for step {step_context.step.key}",\n dagster_event=event,\n )\n\n\ndef log_job_event(job_context: IPlanContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n job_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for pipeline {job_context.job_name}",\n dagster_event=event,\n )\n\n\ndef log_resource_event(log_manager: DagsterLogManager, event: "DagsterEvent") -> None:\n event_specific_data = cast(EngineEventData, 
event.event_specific_data)\n\n log_level = logging.ERROR if event_specific_data.error else logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n\n\nclass DagsterEventSerializer(NamedTupleSerializer["DagsterEvent"]):\n def before_unpack(self, context, unpacked_dict: Any) -> Dict[str, Any]:\n event_type_value, event_specific_data = _handle_back_compat(\n unpacked_dict["event_type_value"], unpacked_dict.get("event_specific_data")\n )\n unpacked_dict["event_type_value"] = event_type_value\n unpacked_dict["event_specific_data"] = event_specific_data\n\n return unpacked_dict\n\n def handle_unpack_error(\n self,\n exc: Exception,\n context: UnpackContext,\n storage_dict: Dict[str, Any],\n ) -> "DagsterEvent":\n event_type_value, _ = _handle_back_compat(\n storage_dict["event_type_value"], storage_dict.get("event_specific_data")\n )\n step_key = storage_dict.get("step_key")\n orig_message = storage_dict.get("message")\n new_message = (\n f"Could not deserialize event of type {event_type_value}. This event may have been"\n " written by a newer version of Dagster."\n + (f' Original message: "{orig_message}"' if orig_message else "")\n )\n return DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=storage_dict["pipeline_name"],\n message=new_message,\n step_key=step_key,\n event_specific_data=EngineEventData(\n error=serializable_error_info_from_exc_info(sys.exc_info())\n ),\n )\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterEventSerializer,\n storage_field_names={\n "node_handle": "solid_handle",\n "job_name": "pipeline_name",\n },\n)\nclass DagsterEvent(\n NamedTuple(\n "_DagsterEvent",\n [\n ("event_type_value", str),\n ("job_name", str),\n ("step_handle", Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]]),\n ("node_handle", Optional[NodeHandle]),\n ("step_kind_value", Optional[str]),\n ("logging_tags", Optional[Mapping[str, str]]),\n ("event_specific_data", Optional["EventSpecificData"]),\n ("message", Optional[str]),\n ("pid", Optional[int]),\n ("step_key", Optional[str]),\n ],\n )\n):\n """Events yielded by op and job execution.\n\n Users should not instantiate this class.\n\n Attributes:\n event_type_value (str): Value for a DagsterEventType.\n job_name (str)\n node_handle (NodeHandle)\n step_kind_value (str): Value for a StepKind.\n logging_tags (Dict[str, str])\n event_specific_data (Any): Type must correspond to event_type_value.\n message (str)\n pid (int)\n step_key (Optional[str]): DEPRECATED\n """\n\n @staticmethod\n def from_step(\n event_type: "DagsterEventType",\n step_context: IStepContext,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n message=check.opt_str_param(message, "message"),\n pid=os.getpid(),\n )\n\n log_step_event(step_context, event)\n\n return event\n\n @staticmethod\n def from_job(\n event_type: DagsterEventType,\n job_context: IPlanContext,\n message: Optional[str] = None,\n event_specific_data: Optional["EventSpecificData"] = 
None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n )\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_context.job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n step_handle=step_handle,\n pid=os.getpid(),\n )\n\n log_job_event(job_context, event)\n\n return event\n\n @staticmethod\n def from_resource(\n event_type: DagsterEventType,\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n message: Optional[str] = None,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(\n DagsterEventType.ENGINE_EVENT, event_specific_data\n ),\n step_handle=execution_plan.step_handle_for_single_step_plans(),\n pid=os.getpid(),\n )\n log_resource_event(log_manager, event)\n return event\n\n def __new__(\n cls,\n event_type_value: str,\n job_name: str,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n node_handle: Optional[NodeHandle] = None,\n step_kind_value: Optional[str] = None,\n logging_tags: Optional[Mapping[str, str]] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n pid: Optional[int] = None,\n # legacy\n step_key: Optional[str] = None,\n ):\n # old events may contain node_handle but not step_handle\n if node_handle is not None and step_handle is None:\n step_handle = StepHandle(node_handle)\n\n # Legacy events may have step_key set directly, preserve those to stay 
in sync\n # with legacy execution plan snapshots.\n if step_handle is not None and step_key is None:\n step_key = step_handle.to_key()\n\n return super(DagsterEvent, cls).__new__(\n cls,\n check.str_param(event_type_value, "event_type_value"),\n check.str_param(job_name, "job_name"),\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n ),\n check.opt_inst_param(node_handle, "node_handle", NodeHandle),\n check.opt_str_param(step_kind_value, "step_kind_value"),\n check.opt_mapping_param(logging_tags, "logging_tags"),\n _validate_event_specific_data(DagsterEventType(event_type_value), event_specific_data),\n check.opt_str_param(message, "message"),\n check.opt_int_param(pid, "pid"),\n check.opt_str_param(step_key, "step_key"),\n )\n\n @property\n def node_name(self) -> str:\n check.invariant(self.node_handle is not None)\n node_handle = cast(NodeHandle, self.node_handle)\n return node_handle.name\n\n @public\n @property\n def event_type(self) -> DagsterEventType:\n """DagsterEventType: The type of this event."""\n return DagsterEventType(self.event_type_value)\n\n @public\n @property\n def is_step_event(self) -> bool:\n """bool: If this event relates to a specific step."""\n return self.event_type in STEP_EVENTS\n\n @public\n @property\n def is_hook_event(self) -> bool:\n """bool: If this event relates to the execution of a hook."""\n return self.event_type in HOOK_EVENTS\n\n @property\n def is_alert_event(self) -> bool:\n return self.event_type in ALERT_EVENTS\n\n @property\n def step_kind(self) -> "StepKind":\n from dagster._core.execution.plan.step import StepKind\n\n return StepKind(self.step_kind_value)\n\n @public\n @property\n def is_step_success(self) -> bool:\n """bool: If this event is of type STEP_SUCCESS."""\n return self.event_type == DagsterEventType.STEP_SUCCESS\n\n @public\n @property\n def is_successful_output(self) -> bool:\n """bool: If this event is of type STEP_OUTPUT."""\n return self.event_type == 
DagsterEventType.STEP_OUTPUT\n\n @public\n @property\n def is_step_start(self) -> bool:\n """bool: If this event is of type STEP_START."""\n return self.event_type == DagsterEventType.STEP_START\n\n @public\n @property\n def is_step_failure(self) -> bool:\n """bool: If this event is of type STEP_FAILURE."""\n return self.event_type == DagsterEventType.STEP_FAILURE\n\n @public\n @property\n def is_resource_init_failure(self) -> bool:\n """bool: If this event is of type RESOURCE_INIT_FAILURE."""\n return self.event_type == DagsterEventType.RESOURCE_INIT_FAILURE\n\n @public\n @property\n def is_step_skipped(self) -> bool:\n """bool: If this event is of type STEP_SKIPPED."""\n return self.event_type == DagsterEventType.STEP_SKIPPED\n\n @public\n @property\n def is_step_up_for_retry(self) -> bool:\n """bool: If this event is of type STEP_UP_FOR_RETRY."""\n return self.event_type == DagsterEventType.STEP_UP_FOR_RETRY\n\n @public\n @property\n def is_step_restarted(self) -> bool:\n """bool: If this event is of type STEP_RESTARTED."""\n return self.event_type == DagsterEventType.STEP_RESTARTED\n\n @property\n def is_job_success(self) -> bool:\n return self.event_type == DagsterEventType.RUN_SUCCESS\n\n @property\n def is_job_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @property\n def is_run_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this event represents the failure of a run or step."""\n return self.event_type in FAILURE_EVENTS\n\n @property\n def is_job_event(self) -> bool:\n return self.event_type in PIPELINE_EVENTS\n\n @public\n @property\n def is_engine_event(self) -> bool:\n """bool: If this event is of type ENGINE_EVENT."""\n return self.event_type == DagsterEventType.ENGINE_EVENT\n\n @public\n @property\n def is_handled_output(self) -> bool:\n """bool: If this event is of type HANDLED_OUTPUT."""\n return self.event_type 
== DagsterEventType.HANDLED_OUTPUT\n\n @public\n @property\n def is_loaded_input(self) -> bool:\n """bool: If this event is of type LOADED_INPUT."""\n return self.event_type == DagsterEventType.LOADED_INPUT\n\n @public\n @property\n def is_step_materialization(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION\n\n @public\n @property\n def is_expectation_result(self) -> bool:\n """bool: If this event is of type STEP_EXPECTATION_RESULT."""\n return self.event_type == DagsterEventType.STEP_EXPECTATION_RESULT\n\n @public\n @property\n def is_asset_observation(self) -> bool:\n """bool: If this event is of type ASSET_OBSERVATION."""\n return self.event_type == DagsterEventType.ASSET_OBSERVATION\n\n @public\n @property\n def is_asset_materialization_planned(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION_PLANNED."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED\n\n @public\n @property\n def asset_key(self) -> Optional[AssetKey]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n asset key. Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.asset_key\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.asset_key\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.asset_key\n else:\n return None\n\n @public\n @property\n def partition(self) -> Optional[str]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n partition. 
Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.partition\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.partition\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.partition\n else:\n return None\n\n @property\n def step_input_data(self) -> "StepInputData":\n _assert_type("step_input_data", DagsterEventType.STEP_INPUT, self.event_type)\n return cast(StepInputData, self.event_specific_data)\n\n @property\n def step_output_data(self) -> StepOutputData:\n _assert_type("step_output_data", DagsterEventType.STEP_OUTPUT, self.event_type)\n return cast(StepOutputData, self.event_specific_data)\n\n @property\n def step_success_data(self) -> "StepSuccessData":\n _assert_type("step_success_data", DagsterEventType.STEP_SUCCESS, self.event_type)\n return cast(StepSuccessData, self.event_specific_data)\n\n @property\n def step_failure_data(self) -> "StepFailureData":\n _assert_type("step_failure_data", DagsterEventType.STEP_FAILURE, self.event_type)\n return cast(StepFailureData, self.event_specific_data)\n\n @property\n def step_retry_data(self) -> "StepRetryData":\n _assert_type("step_retry_data", DagsterEventType.STEP_UP_FOR_RETRY, self.event_type)\n return cast(StepRetryData, self.event_specific_data)\n\n @property\n def step_materialization_data(self) -> "StepMaterializationData":\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data)\n\n @property\n def asset_observation_data(self) -> "AssetObservationData":\n _assert_type("asset_observation_data", DagsterEventType.ASSET_OBSERVATION, self.event_type)\n return cast(AssetObservationData, self.event_specific_data)\n\n @property\n def 
asset_materialization_planned_data(self) -> "AssetMaterializationPlannedData":\n _assert_type(\n "asset_materialization_planned",\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n self.event_type,\n )\n return cast(AssetMaterializationPlannedData, self.event_specific_data)\n\n @property\n def asset_check_planned_data(self) -> "AssetCheckEvaluationPlanned":\n _assert_type(\n "asset_check_planned",\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n self.event_type,\n )\n return cast(AssetCheckEvaluationPlanned, self.event_specific_data)\n\n @property\n def step_expectation_result_data(self) -> "StepExpectationResultData":\n _assert_type(\n "step_expectation_result_data",\n DagsterEventType.STEP_EXPECTATION_RESULT,\n self.event_type,\n )\n return cast(StepExpectationResultData, self.event_specific_data)\n\n @property\n def materialization(self) -> AssetMaterialization:\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data).materialization\n\n @property\n def asset_check_evaluation_data(self) -> AssetCheckEvaluation:\n _assert_type(\n "asset_check_evaluation", DagsterEventType.ASSET_CHECK_EVALUATION, self.event_type\n )\n return cast(AssetCheckEvaluation, self.event_specific_data)\n\n @property\n def job_failure_data(self) -> "JobFailureData":\n _assert_type("job_failure_data", DagsterEventType.RUN_FAILURE, self.event_type)\n return cast(JobFailureData, self.event_specific_data)\n\n @property\n def engine_event_data(self) -> "EngineEventData":\n _assert_type(\n "engine_event_data",\n [\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.STEP_WORKER_STARTING,\n ],\n self.event_type,\n )\n return cast(EngineEventData, self.event_specific_data)\n\n @property\n def hook_completed_data(self) 
-> Optional["EventSpecificData"]:\n _assert_type("hook_completed_data", DagsterEventType.HOOK_COMPLETED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_errored_data(self) -> "HookErroredData":\n _assert_type("hook_errored_data", DagsterEventType.HOOK_ERRORED, self.event_type)\n return cast(HookErroredData, self.event_specific_data)\n\n @property\n def hook_skipped_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_skipped_data", DagsterEventType.HOOK_SKIPPED, self.event_type)\n return self.event_specific_data\n\n @property\n def logs_captured_data(self) -> "ComputeLogsCaptureData":\n _assert_type("logs_captured_data", DagsterEventType.LOGS_CAPTURED, self.event_type)\n return cast(ComputeLogsCaptureData, self.event_specific_data)\n\n @staticmethod\n def step_output_event(\n step_context: StepExecutionContext, step_output_data: StepOutputData\n ) -> "DagsterEvent":\n output_def = step_context.op.output_def_named(\n step_output_data.step_output_handle.output_name\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_OUTPUT,\n step_context=step_context,\n event_specific_data=step_output_data,\n message=(\n 'Yielded output "{output_name}"{mapping_clause} of type'\n ' "{output_type}".{type_check_clause}'.format(\n output_name=step_output_data.step_output_handle.output_name,\n output_type=output_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_output_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_output_data.type_check_data\n else " (No type check)."\n ),\n mapping_clause=(\n f' mapping key "{step_output_data.step_output_handle.mapping_key}"'\n if step_output_data.step_output_handle.mapping_key\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_failure_event(\n step_context: IStepContext,\n step_failure_data: "StepFailureData",\n message=None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_FAILURE,\n step_context=step_context,\n event_specific_data=step_failure_data,\n message=(message or f'Execution of step "{step_context.step.key}" failed.'),\n )\n\n @staticmethod\n def step_retry_event(\n step_context: IStepContext, step_retry_data: "StepRetryData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_UP_FOR_RETRY,\n step_context=step_context,\n event_specific_data=step_retry_data,\n message=(\n 'Execution of step "{step_key}" failed and has requested a retry{wait_str}.'.format(\n step_key=step_context.step.key,\n wait_str=(\n f" in {step_retry_data.seconds_to_wait} seconds"\n if step_retry_data.seconds_to_wait\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_input_event(\n step_context: StepExecutionContext, step_input_data: "StepInputData"\n ) -> "DagsterEvent":\n input_def = step_context.op_def.input_def_named(step_input_data.input_name)\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_INPUT,\n step_context=step_context,\n event_specific_data=step_input_data,\n message='Got input "{input_name}" of type "{input_type}".{type_check_clause}'.format(\n input_name=step_input_data.input_name,\n input_type=input_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_input_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_input_data.type_check_data\n else " (No type check)."\n ),\n ),\n )\n\n @staticmethod\n def step_start_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_START,\n step_context=step_context,\n message=f'Started execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def step_restarted_event(step_context: IStepContext, previous_attempts: int) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_RESTARTED,\n step_context=step_context,\n message='Started re-execution (attempt # {n}) of step "{step_key}".'.format(\n step_key=step_context.step.key, n=previous_attempts + 1\n ),\n )\n\n @staticmethod\n def step_success_event(\n step_context: IStepContext, success: "StepSuccessData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SUCCESS,\n step_context=step_context,\n event_specific_data=success,\n message='Finished execution of step "{step_key}" in {duration}.'.format(\n step_key=step_context.step.key,\n duration=format_duration(success.duration_ms),\n ),\n )\n\n @staticmethod\n def step_skipped_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SKIPPED,\n step_context=step_context,\n message=f'Skipped execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def asset_materialization(\n step_context: IStepContext,\n materialization: AssetMaterialization,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n step_context=step_context,\n event_specific_data=StepMaterializationData(materialization),\n message=(\n materialization.description\n if materialization.description\n else "Materialized value{label_clause}.".format(\n label_clause=f" {materialization.label}" 
if materialization.label else ""\n )\n ),\n )\n\n @staticmethod\n def asset_observation(\n step_context: IStepContext, observation: AssetObservation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n step_context=step_context,\n event_specific_data=AssetObservationData(observation),\n )\n\n @staticmethod\n def asset_check_evaluation(\n step_context: IStepContext, asset_check_evaluation: AssetCheckEvaluation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_CHECK_EVALUATION,\n step_context=step_context,\n event_specific_data=asset_check_evaluation,\n )\n\n @staticmethod\n def step_expectation_result(\n step_context: IStepContext, expectation_result: ExpectationResult\n ) -> "DagsterEvent":\n def _msg():\n if expectation_result.description:\n return expectation_result.description\n\n return "Expectation{label_clause} {result_verb}".format(\n label_clause=" " + expectation_result.label if expectation_result.label else "",\n result_verb="passed" if expectation_result.success else "failed",\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_EXPECTATION_RESULT,\n step_context=step_context,\n event_specific_data=StepExpectationResultData(expectation_result),\n message=_msg(),\n )\n\n @staticmethod\n def job_start(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_START,\n job_context,\n message=f'Started execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_success(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_SUCCESS,\n job_context,\n message=f'Finished execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_failure(\n job_context_or_name: Union[IPlanContext, str],\n context_msg: str,\n error_info: Optional[SerializableErrorInfo] = None,\n ) -> "DagsterEvent":\n check.str_param(context_msg, 
"context_msg")\n if isinstance(job_context_or_name, IPlanContext):\n return DagsterEvent.from_job(\n DagsterEventType.RUN_FAILURE,\n job_context_or_name,\n message=(\n f'Execution of run for "{job_context_or_name.job_name}" failed. {context_msg}'\n ),\n event_specific_data=JobFailureData(error_info),\n )\n else:\n # when the failure happens trying to bring up context, the job_context hasn't been\n # built and so can't use from_pipeline\n check.str_param(job_context_or_name, "pipeline_name")\n event = DagsterEvent(\n event_type_value=DagsterEventType.RUN_FAILURE.value,\n job_name=job_context_or_name,\n event_specific_data=JobFailureData(error_info),\n message=f'Execution of run for "{job_context_or_name}" failed. {context_msg}',\n pid=os.getpid(),\n )\n return event\n\n @staticmethod\n def job_canceled(\n job_context: IPlanContext, error_info: Optional[SerializableErrorInfo] = None\n ) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_CANCELED,\n job_context,\n message=f'Execution of run for "{job_context.job_name}" canceled.',\n event_specific_data=JobCanceledData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def step_worker_starting(\n step_context: IStepContext,\n message: str,\n metadata: Mapping[str, MetadataValue],\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n DagsterEventType.STEP_WORKER_STARTING,\n step_context,\n message=message,\n event_specific_data=EngineEventData(\n metadata=metadata, marker_start="step_process_start"\n ),\n )\n\n @staticmethod\n def step_worker_started(\n log_manager: DagsterLogManager,\n job_name: str,\n message: str,\n metadata: Mapping[str, MetadataValue],\n step_key: Optional[str],\n ) -> "DagsterEvent":\n event = DagsterEvent(\n DagsterEventType.STEP_WORKER_STARTED.value,\n job_name=job_name,\n message=message,\n event_specific_data=EngineEventData(metadata=metadata, marker_end="step_process_start"),\n pid=os.getpid(),\n 
step_key=step_key,\n )\n log_manager.log_dagster_event(\n level=logging.DEBUG,\n msg=message,\n dagster_event=event,\n )\n return event\n\n @staticmethod\n def resource_init_start(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_STARTED,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Starting initialization of resources [{}].".format(\n ", ".join(sorted(resource_keys))\n ),\n event_specific_data=EngineEventData(metadata={}, marker_start="resources"),\n )\n\n @staticmethod\n def resource_init_success(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_instances: Mapping[str, Any],\n resource_init_times: Mapping[str, str],\n ) -> "DagsterEvent":\n metadata = {}\n for key in resource_instances.keys():\n metadata[key] = MetadataValue.python_artifact(resource_instances[key].__class__)\n metadata[f"{key}:init_time"] = resource_init_times[key]\n\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Finished initialization of resources [{}].".format(\n ", ".join(sorted(resource_init_times.keys()))\n ),\n event_specific_data=EngineEventData(\n metadata=metadata,\n marker_end="resources",\n ),\n )\n\n @staticmethod\n def resource_init_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_FAILURE,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Initialization of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n 
metadata={},\n marker_end="resources",\n error=error,\n ),\n )\n\n @staticmethod\n def resource_teardown_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.ENGINE_EVENT,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Teardown of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata={},\n marker_start=None,\n marker_end=None,\n error=error,\n ),\n )\n\n @staticmethod\n def engine_event(\n plan_context: IPlanContext,\n message: str,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n if isinstance(plan_context, IStepContext):\n return DagsterEvent.from_step(\n DagsterEventType.ENGINE_EVENT,\n step_context=plan_context,\n event_specific_data=event_specific_data,\n message=message,\n )\n else:\n return DagsterEvent.from_job(\n DagsterEventType.ENGINE_EVENT,\n plan_context,\n message,\n event_specific_data=event_specific_data,\n )\n\n @staticmethod\n def object_store_operation(\n step_context: IStepContext, object_store_operation_result: "ObjectStoreOperation"\n ) -> "DagsterEvent":\n object_store_name = (\n f"{object_store_operation_result.object_store_name} "\n if object_store_operation_result.object_store_name\n else ""\n )\n\n serialization_strategy_modifier = (\n f" using {object_store_operation_result.serialization_strategy_name}"\n if object_store_operation_result.serialization_strategy_name\n else ""\n )\n\n value_name = object_store_operation_result.value_name\n\n if (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.SET_OBJECT\n ):\n message = (\n f"Stored intermediate object for output {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n 
ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.GET_OBJECT\n ):\n message = (\n f"Retrieved intermediate object for input {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.CP_OBJECT\n ):\n message = (\n "Copied intermediate object for input {value_name} from {key} to {dest_key}"\n ).format(\n value_name=value_name,\n key=object_store_operation_result.key,\n dest_key=object_store_operation_result.dest_key,\n )\n else:\n message = ""\n\n return DagsterEvent.from_step(\n DagsterEventType.OBJECT_STORE_OPERATION,\n step_context,\n event_specific_data=ObjectStoreOperationResultData(\n op=object_store_operation_result.op,\n value_name=value_name,\n address=object_store_operation_result.key,\n metadata={"key": MetadataValue.path(object_store_operation_result.key)},\n version=object_store_operation_result.version,\n mapping_key=object_store_operation_result.mapping_key,\n ),\n message=message,\n )\n\n @staticmethod\n def handled_output(\n step_context: IStepContext,\n output_name: str,\n manager_key: str,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> "DagsterEvent":\n message = f'Handled output "{output_name}" using IO manager "{manager_key}"'\n return DagsterEvent.from_step(\n event_type=DagsterEventType.HANDLED_OUTPUT,\n step_context=step_context,\n event_specific_data=HandledOutputData(\n output_name=output_name,\n manager_key=manager_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def loaded_input(\n step_context: IStepContext,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> 
"DagsterEvent":\n message = f'Loaded input "{input_name}" using input manager "{manager_key}"'\n if upstream_output_name:\n message += f', from output "{upstream_output_name}" of step "{upstream_step_key}"'\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.LOADED_INPUT,\n step_context=step_context,\n event_specific_data=LoadedInputData(\n input_name=input_name,\n manager_key=manager_key,\n upstream_output_name=upstream_output_name,\n upstream_step_key=upstream_step_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def hook_completed(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_COMPLETED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Finished the execution of hook "{hook_def.name}" triggered for'\n f' "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def hook_errored(\n step_context: StepExecutionContext, error: HookExecutionError\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_ERRORED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(\n event_type,\n HookErroredData(\n error=serializable_error_info_from_exc_info(error.original_exc_info)\n ),\n ),\n )\n\n step_context.log.log_dagster_event(level=logging.ERROR, msg=str(error), dagster_event=event)\n\n return 
event\n\n @staticmethod\n def hook_skipped(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_SKIPPED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Skipped the execution of hook "{hook_def.name}". It did not meet its triggering '\n f'condition during the execution of "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def legacy_compute_log_step_event(step_context: StepExecutionContext):\n step_key = step_context.step.key\n return DagsterEvent.from_step(\n DagsterEventType.LOGS_CAPTURED,\n step_context,\n message=f"Started capturing logs for step: {step_key}.",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=[step_key],\n file_key=step_key,\n ),\n )\n\n @staticmethod\n def capture_logs(\n job_context: IPlanContext,\n step_keys: Sequence[str],\n log_key: Sequence[str],\n log_context: CapturedLogContext,\n ):\n file_key = log_key[-1]\n return DagsterEvent.from_job(\n DagsterEventType.LOGS_CAPTURED,\n job_context,\n message=f"Started capturing logs in process (pid: {os.getpid()}).",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n file_key=file_key,\n external_stdout_url=log_context.external_stdout_url,\n external_stderr_url=log_context.external_stderr_url,\n external_url=log_context.external_url,\n ),\n )
\n\n\ndef get_step_output_event(\n events: Sequence[DagsterEvent], step_key: str, output_name: Optional[str] = "result"\n) -> Optional["DagsterEvent"]:\n check.sequence_param(events, "events", of_type=DagsterEvent)\n check.str_param(step_key, "step_key")\n check.str_param(output_name, "output_name")\n for event in events:\n if (\n event.event_type == DagsterEventType.STEP_OUTPUT\n and event.step_key == step_key\n and event.step_output_data.output_name == output_name\n ):\n return event\n return None\n\n\n@whitelist_for_serdes\nclass AssetObservationData(\n NamedTuple("_AssetObservation", [("asset_observation", AssetObservation)])\n):\n def __new__(cls, asset_observation: AssetObservation):\n return super(AssetObservationData, cls).__new__(\n cls,\n asset_observation=check.inst_param(\n asset_observation, "asset_observation", AssetObservation\n ),\n )\n\n\n@whitelist_for_serdes\nclass StepMaterializationData(\n NamedTuple(\n "_StepMaterializationData",\n [\n ("materialization", AssetMaterialization),\n ("asset_lineage", Sequence[AssetLineageInfo]),\n ],\n )\n):\n def __new__(\n cls,\n materialization: AssetMaterialization,\n asset_lineage: Optional[Sequence[AssetLineageInfo]] = None,\n ):\n return super(StepMaterializationData, cls).__new__(\n cls,\n materialization=check.inst_param(\n materialization, "materialization", AssetMaterialization\n ),\n asset_lineage=check.opt_sequence_param(\n asset_lineage, "asset_lineage", of_type=AssetLineageInfo\n ),\n )\n\n\n@whitelist_for_serdes\nclass AssetMaterializationPlannedData(\n NamedTuple(\n "_AssetMaterializationPlannedData",\n [("asset_key", AssetKey), ("partition", Optional[str])],\n )\n):\n def __new__(cls, asset_key: AssetKey, partition: Optional[str] = None):\n return super(AssetMaterializationPlannedData, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partition=check.opt_str_param(partition, "partition"),\n )\n\n\n@whitelist_for_serdes\nclass StepExpectationResultData(\n 
NamedTuple(\n "_StepExpectationResultData",\n [\n ("expectation_result", ExpectationResult),\n ],\n )\n):\n def __new__(cls, expectation_result: ExpectationResult):\n return super(StepExpectationResultData, cls).__new__(\n cls,\n expectation_result=check.inst_param(\n expectation_result, "expectation_result", ExpectationResult\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ObjectStoreOperationResultData(\n NamedTuple(\n "_ObjectStoreOperationResultData",\n [\n ("op", ObjectStoreOperationType),\n ("value_name", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ("address", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n value_name: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n address: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperationResultData, cls).__new__(\n cls,\n op=cast(ObjectStoreOperationType, check.str_param(op, "op")),\n value_name=check.opt_str_param(value_name, "value_name"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n address=check.opt_str_param(address, "address"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass EngineEventData(\n NamedTuple(\n "_EngineEventData",\n [\n ("metadata", Mapping[str, MetadataValue]),\n ("error", Optional[SerializableErrorInfo]),\n ("marker_start", Optional[str]),\n ("marker_end", Optional[str]),\n ],\n )\n):\n # serdes log\n # * added optional error\n # * added marker_start / marker_end\n #\n def __new__(\n 
cls,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n error: Optional[SerializableErrorInfo] = None,\n marker_start: Optional[str] = None,\n marker_end: Optional[str] = None,\n ):\n return super(EngineEventData, cls).__new__(\n cls,\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n marker_start=check.opt_str_param(marker_start, "marker_start"),\n marker_end=check.opt_str_param(marker_end, "marker_end"),\n )\n\n @staticmethod\n def in_process(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def multiprocess(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def interrupted(steps_interrupted: Sequence[str]) -> "EngineEventData":\n return EngineEventData(\n metadata={"steps_interrupted": MetadataValue.text(str(steps_interrupted))}\n )\n\n @staticmethod\n def engine_error(error: SerializableErrorInfo) -> "EngineEventData":\n return EngineEventData(metadata={}, error=error)\n\n\n@whitelist_for_serdes(storage_name="PipelineFailureData")\nclass JobFailureData(\n NamedTuple(\n "_JobFailureData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobFailureData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(storage_name="PipelineCanceledData")\nclass JobCanceledData(\n 
NamedTuple(\n "_JobCanceledData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobCanceledData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HookErroredData(\n NamedTuple(\n "_HookErroredData",\n [\n ("error", SerializableErrorInfo),\n ],\n )\n):\n def __new__(cls, error: SerializableErrorInfo):\n return super(HookErroredData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass HandledOutputData(\n NamedTuple(\n "_HandledOutputData",\n [\n ("output_name", str),\n ("manager_key", str),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n output_name: str,\n manager_key: str,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(HandledOutputData, cls).__new__(\n cls,\n output_name=check.str_param(output_name, "output_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass LoadedInputData(\n NamedTuple(\n "_LoadedInputData",\n [\n ("input_name", str),\n ("manager_key", str),\n ("upstream_output_name", Optional[str]),\n ("upstream_step_key", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(LoadedInputData, cls).__new__(\n cls,\n 
input_name=check.str_param(input_name, "input_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n upstream_output_name=check.opt_str_param(upstream_output_name, "upstream_output_name"),\n upstream_step_key=check.opt_str_param(upstream_step_key, "upstream_step_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(storage_field_names={"file_key": "log_key"})\nclass ComputeLogsCaptureData(\n NamedTuple(\n "_ComputeLogsCaptureData",\n [\n ("file_key", str), # renamed log_key => file_key to avoid confusion\n ("step_keys", Sequence[str]),\n ("external_url", Optional[str]),\n ("external_stdout_url", Optional[str]),\n ("external_stderr_url", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n file_key: str,\n step_keys: Sequence[str],\n external_url: Optional[str] = None,\n external_stdout_url: Optional[str] = None,\n external_stderr_url: Optional[str] = None,\n ):\n return super(ComputeLogsCaptureData, cls).__new__(\n cls,\n file_key=check.str_param(file_key, "file_key"),\n step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n external_url=check.opt_str_param(external_url, "external_url"),\n external_stdout_url=check.opt_str_param(external_stdout_url, "external_stdout_url"),\n external_stderr_url=check.opt_str_param(external_stderr_url, "external_stderr_url"),\n )\n\n\n###################################################################################################\n# THE GRAVEYARD\n#\n# -|- -|- -|-\n# | | |\n# _-'~~~~~`-_ . _-'~~~~~`-_ _-'~~~~~`-_\n# .' '. .' '. .' 
'.\n# | R I P | | R I P | | R I P |\n# | | | | | |\n# | Synthetic | | Asset | | Pipeline |\n# | Process | | Store | | Init |\n# | Events | | Operations | | Failures |\n# | | | | | |\n###################################################################################################\n\n\n# Old data structures referenced below\n# class AssetStoreOperationData(NamedTuple):\n# op: str\n# step_key: str\n# output_name: str\n# asset_store_key: str\n#\n#\n# class AssetStoreOperationType(Enum):\n# SET_ASSET = "SET_ASSET"\n# GET_ASSET = "GET_ASSET"\n#\n#\n# class PipelineInitFailureData(NamedTuple):\n# error: SerializableErrorInfo\n\n\ndef _handle_back_compat(\n event_type_value: str,\n event_specific_data: Optional[Dict[str, Any]],\n) -> Tuple[str, Optional[Dict[str, Any]]]:\n # transform old specific process events in to engine events\n if event_type_value in [\n "PIPELINE_PROCESS_START",\n "PIPELINE_PROCESS_STARTED",\n "PIPELINE_PROCESS_EXITED",\n ]:\n return "ENGINE_EVENT", {"__class__": "EngineEventData"}\n\n # changes asset store ops in to get/set asset\n elif event_type_value == "ASSET_STORE_OPERATION":\n assert (\n event_specific_data is not None\n ), "ASSET_STORE_OPERATION event must have specific data"\n if event_specific_data["op"] in (\n "GET_ASSET",\n '{"__enum__": "AssetStoreOperationType.GET_ASSET"}',\n ):\n return (\n "LOADED_INPUT",\n {\n "__class__": "LoadedInputData",\n "input_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n if event_specific_data["op"] in (\n "SET_ASSET",\n '{"__enum__": "AssetStoreOperationType.SET_ASSET"}',\n ):\n return (\n "HANDLED_OUTPUT",\n {\n "__class__": "HandledOutputData",\n "output_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n\n # previous name for ASSET_MATERIALIZATION was STEP_MATERIALIZATION\n if event_type_value == "STEP_MATERIALIZATION":\n assert event_specific_data is not None, 
"STEP_MATERIALIZATION event must have specific data"\n return "ASSET_MATERIALIZATION", event_specific_data\n\n # transform PIPELINE_INIT_FAILURE to PIPELINE_FAILURE\n if event_type_value == "PIPELINE_INIT_FAILURE":\n assert (\n event_specific_data is not None\n ), "PIPELINE_INIT_FAILURE event must have specific data"\n return "PIPELINE_FAILURE", {\n "__class__": "PipelineFailureData",\n "error": event_specific_data.get("error"),\n }\n\n return event_type_value, event_specific_data\n
", "current_page_name": "_modules/dagster/_core/events", "customsidebar": null, "favicon_url": null, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events.log

\nfrom typing import Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.events import AssetMaterialization, AssetObservation\nfrom dagster._core.events import DagsterEvent, DagsterEventType\nfrom dagster._core.utils import coerce_valid_log_level\nfrom dagster._serdes.serdes import (\n    deserialize_value,\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.error import SerializableErrorInfo\nfrom dagster._utils.log import (\n    JsonEventLoggerHandler,\n    StructuredLoggerHandler,\n    StructuredLoggerMessage,\n    construct_single_handler_logger,\n)\n\n\n
[docs]@whitelist_for_serdes(\n # These were originally distinguished from each other but ended up being empty subclasses\n # of EventLogEntry -- instead of using the subclasses we were relying on\n # EventLogEntry.is_dagster_event to distinguish events that originate in the logging\n # machinery from events that are yielded by user code\n old_storage_names={"DagsterEventRecord", "LogMessageRecord", "EventRecord"},\n old_fields={"message": ""},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass EventLogEntry(\n NamedTuple(\n "_EventLogEntry",\n [\n ("error_info", PublicAttr[Optional[SerializableErrorInfo]]),\n ("level", PublicAttr[Union[str, int]]),\n ("user_message", PublicAttr[str]),\n ("run_id", PublicAttr[str]),\n ("timestamp", PublicAttr[float]),\n ("step_key", PublicAttr[Optional[str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("dagster_event", PublicAttr[Optional[DagsterEvent]]),\n ],\n )\n):\n """Entries in the event log.\n\n Users should not instantiate this object directly. These entries may originate from the logging machinery (DagsterLogManager/context.log), from\n framework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n (e.g. Output).\n\n Args:\n error_info (Optional[SerializableErrorInfo]): Error info for an associated exception, if\n any, as generated by serializable_error_info_from_exc_info and friends.\n level (Union[str, int]): The Python log level at which to log this event. Note that\n framework and user code events are also logged to Python logging. This value may be an\n integer or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.\n user_message (str): For log messages, this is the user-generated message.\n run_id (str): The id of the run which generated this event.\n timestamp (float): The Unix timestamp of this event.\n step_key (Optional[str]): The step key for the step which generated this event. 
Some events\n are generated outside of a step context.\n job_name (Optional[str]): The job which generated this event. Some events are\n generated outside of a job context.\n dagster_event (Optional[DagsterEvent]): For framework and user events, the associated\n structured event.\n """\n\n def __new__(\n cls,\n error_info,\n level,\n user_message,\n run_id,\n timestamp,\n step_key=None,\n job_name=None,\n dagster_event=None,\n ):\n return super(EventLogEntry, cls).__new__(\n cls,\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo),\n coerce_valid_log_level(level),\n check.str_param(user_message, "user_message"),\n check.str_param(run_id, "run_id"),\n check.float_param(timestamp, "timestamp"),\n check.opt_str_param(step_key, "step_key"),\n check.opt_str_param(job_name, "job_name"),\n check.opt_inst_param(dagster_event, "dagster_event", DagsterEvent),\n )\n\n @public\n @property\n def is_dagster_event(self) -> bool:\n """bool: If this entry contains a DagsterEvent."""\n return bool(self.dagster_event)\n\n
[docs] @public\n def get_dagster_event(self) -> DagsterEvent:\n """DagsterEvent: Returns the DagsterEvent contained within this entry. If this entry does not\n contain a DagsterEvent, an error will be raised.\n """\n if not isinstance(self.dagster_event, DagsterEvent):\n check.failed(\n "Not a dagster event, check is_dagster_event before calling get_dagster_event",\n )\n\n return self.dagster_event
\n\n def to_json(self):\n return serialize_value(self)\n\n @staticmethod\n def from_json(json_str: str):\n return deserialize_value(json_str, EventLogEntry)\n\n @public\n @property\n def dagster_event_type(self) -> Optional[DagsterEventType]:\n """Optional[DagsterEventType]: The type of the DagsterEvent contained by this entry, if any."""\n return self.dagster_event.event_type if self.dagster_event else None\n\n @public\n @property\n def message(self) -> str:\n """Return the message from the structured DagsterEvent if present, fallback to user_message."""\n if self.is_dagster_event:\n msg = self.get_dagster_event().message\n if msg is not None:\n return msg\n\n return self.user_message\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION\n ):\n materialization = self.dagster_event.step_materialization_data.materialization\n if isinstance(materialization, AssetMaterialization):\n return materialization\n\n return None\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_OBSERVATION\n ):\n observation = self.dagster_event.asset_observation_data.asset_observation\n if isinstance(observation, AssetObservation):\n return observation\n\n return None\n\n @property\n def tags(self) -> Optional[Mapping[str, str]]:\n materialization = self.asset_materialization\n if materialization:\n return materialization.tags\n\n observation = self.asset_observation\n if observation:\n return observation.tags\n\n return None
\n\n\ndef construct_event_record(logger_message: StructuredLoggerMessage) -> EventLogEntry:\n check.inst_param(logger_message, "logger_message", StructuredLoggerMessage)\n\n return EventLogEntry(\n level=logger_message.level,\n user_message=logger_message.meta["orig_message"],\n run_id=logger_message.meta["run_id"],\n timestamp=logger_message.record.created,\n step_key=logger_message.meta.get("step_key"),\n job_name=logger_message.meta.get("job_name"),\n dagster_event=logger_message.meta.get("dagster_event"),\n error_info=None,\n )\n\n\ndef construct_event_logger(event_record_callback):\n """Callback receives a stream of event_records. Piggybacks on the logging machinery."""\n check.callable_param(event_record_callback, "event_record_callback")\n\n return construct_single_handler_logger(\n "event-logger",\n "debug",\n StructuredLoggerHandler(\n lambda logger_message: event_record_callback(construct_event_record(logger_message))\n ),\n )\n\n\ndef construct_json_event_logger(json_path):\n """Record a stream of event records to json."""\n check.str_param(json_path, "json_path")\n return construct_single_handler_logger(\n "json-event-record-logger",\n "debug",\n JsonEventLoggerHandler(\n json_path,\n lambda record: construct_event_record(\n StructuredLoggerMessage(\n name=record.name,\n message=record.msg,\n level=record.levelno,\n meta=record.dagster_meta,\n record=record,\n )\n ),\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/events/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.events"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events.log"}, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events"}, "execution": {"api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.api

\nimport sys\nfrom contextlib import contextmanager\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions import IJob, JobDefinition\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.repository_definition import RepositoryLoadData\nfrom dagster._core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.execute_plan import inner_plan_execution_iterator\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance, InstanceRef\nfrom dagster._core.selector import parse_step_selection\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.telemetry import log_dagster_event, log_repo_stats, telemetry_wrapper\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.interrupts import capture_interrupts\nfrom dagster._utils.merger import merge_dicts\n\nfrom .context_creation_job import (\n    ExecutionContextManager,\n    PlanExecutionContextManager,\n    PlanOrchestrationContextManager,\n    orchestration_context_event_generator,\n    scoped_job_context,\n)\nfrom .job_execution_result import JobExecutionResult\n\nif TYPE_CHECKING:\n    from dagster._core.execution.plan.outputs 
import StepOutputHandle\n\n## Brief guide to the execution APIs\n# | function name               | operates over      | sync  | supports    | creates new DagsterRun  |\n# |                             |                    |       | reexecution | in instance             |\n# | --------------------------- | ------------------ | ----- | ----------- | ----------------------- |\n# | execute_job                 | ReconstructableJob | sync  | yes         | yes                     |\n# | execute_run_iterator        | DagsterRun         | async | (1)         | no                      |\n# | execute_run                 | DagsterRun         | sync  | (1)         | no                      |\n# | execute_plan_iterator       | ExecutionPlan      | async | (2)         | no                      |\n# | execute_plan                | ExecutionPlan      | sync  | (2)         | no                      |\n#\n# Notes on reexecution support:\n# (1) The appropriate bits must be set on the DagsterRun passed to this function. 
Specifically,\n#     parent_run_id and root_run_id must be set and consistent, and if a resolved_op_selection or\n#     step_keys_to_execute are set they must be consistent with the parent and root runs.\n# (2) As for (1), but the ExecutionPlan passed must also agree in all relevant bits.\n\n\ndef execute_run_iterator(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    resume_from_failure: bool = False,\n) -> Iterator[DagsterEvent]:\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        # This can happen if the run was force-terminated while it was starting\n        def gen_execute_on_cancel():\n            yield instance.report_engine_event(\n                "Not starting execution since the run was canceled before execution could start",\n                dagster_run,\n            )\n\n        return gen_execute_on_cancel()\n\n    if not resume_from_failure:\n        if dagster_run.status not in (DagsterRunStatus.NOT_STARTED, DagsterRunStatus.STARTING):\n            if dagster_run.is_finished:\n\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a run worker that started after the run had already finished.",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            elif instance.run_monitoring_enabled:\n                # This can happen if the pod was unexpectedly restarted by the cluster - ignore it since\n                # the run monitoring daemon will also spin up a new pod\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a duplicate run that was started from somewhere other than"\n        
                " the run monitor daemon",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            else:\n\n                def gen_fail_restarted_run_worker():\n                    yield instance.report_engine_event(\n                        f"{dagster_run.job_name} ({dagster_run.run_id}) started a new"\n                        f" run worker while the run was already in state {dagster_run.status}."\n                        " This most frequently happens when the run worker unexpectedly stops"\n                        " and is restarted by the cluster. Marking the run as failed.",\n                        dagster_run,\n                    )\n                    yield instance.report_run_failed(dagster_run)\n\n                return gen_fail_restarted_run_worker()\n\n    else:\n        check.invariant(\n            dagster_run.status == DagsterRunStatus.STARTED\n            or dagster_run.status == DagsterRunStatus.STARTING,\n            desc=(\n                "Run of {} ({}) in state {}, expected STARTED or STARTING because it's "\n                "resuming from a run worker failure".format(\n                    dagster_run.job_name, dagster_run.run_id, dagster_run.status\n                )\n            ),\n        )\n\n    if dagster_run.resolved_op_selection or dagster_run.asset_selection:\n        # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if 
isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    return iter(\n        ExecuteRunWithPlanIterable(\n            execution_plan=execution_plan,\n            iterator=job_execution_iterator,\n            execution_context_manager=PlanOrchestrationContextManager(\n                context_event_generator=orchestration_context_event_generator,\n                job=job,\n                execution_plan=execution_plan,\n                dagster_run=dagster_run,\n                instance=instance,\n                run_config=dagster_run.run_config,\n                raise_on_error=False,\n                executor_defs=None,\n                output_capture=None,\n                resume_from_failure=resume_from_failure,\n            ),\n        )\n    )\n\n\ndef execute_run(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    raise_on_error: bool = False,\n) -> JobExecutionResult:\n    """Executes an existing job run synchronously.\n\n    Synchronous version of execute_run_iterator.\n\n    Args:\n        job (IJob): The pipeline to execute.\n        dagster_run (DagsterRun): The run to execute\n        instance (DagsterInstance): The instance in which the run has been created.\n        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n            Defaults to ``False``.\n\n    Returns:\n        JobExecutionResult: The result of the execution.\n    """\n    if isinstance(job, JobDefinition):\n        raise DagsterInvariantViolationError(\n            "execute_run requires a reconstructable job but received job definition directly"\n            " instead. To support hand-off to other processes please wrap your definition in a call"\n            " to reconstructable(). 
Learn more about reconstructable here:"\n            " https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n        )\n\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        message = "Not starting execution since the run was canceled before execution could start"\n        instance.report_engine_event(\n            message,\n            dagster_run,\n        )\n        raise DagsterInvariantViolationError(message)\n\n    check.invariant(\n        dagster_run.status == DagsterRunStatus.NOT_STARTED\n        or dagster_run.status == DagsterRunStatus.STARTING,\n        desc="Run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n            dagster_run.job_name, dagster_run.run_id, dagster_run.status\n        ),\n    )\n    if dagster_run.resolved_op_selection or dagster_run.asset_selection:\n        # when `execute_run` is directly called, the sub job hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}\n\n    _execute_run_iterable = ExecuteRunWithPlanIterable(\n        execution_plan=execution_plan,\n        iterator=job_execution_iterator,\n        execution_context_manager=PlanOrchestrationContextManager(\n            
context_event_generator=orchestration_context_event_generator,\n            job=job,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            instance=instance,\n            run_config=dagster_run.run_config,\n            raise_on_error=raise_on_error,\n            executor_defs=None,\n            output_capture=output_capture,\n        ),\n    )\n    event_list = list(_execute_run_iterable)\n\n    # We need to reload the run object after execution for it to be accurate\n    reloaded_dagster_run = check.not_none(instance.get_run_by_id(dagster_run.run_id))\n\n    return JobExecutionResult(\n        job.get_definition(),\n        scoped_job_context(\n            execution_plan,\n            job,\n            reloaded_dagster_run.run_config,\n            reloaded_dagster_run,\n            instance,\n        ),\n        event_list,\n        reloaded_dagster_run,\n    )\n\n\n@contextmanager\ndef ephemeral_instance_if_missing(\n    instance: Optional[DagsterInstance],\n) -> Iterator[DagsterInstance]:\n    if instance:\n        yield instance\n    else:\n        with DagsterInstance.ephemeral() as ephemeral_instance:\n            yield ephemeral_instance\n\n\n
[docs]class ReexecutionOptions(NamedTuple):\n """Reexecution options for python-based execution in Dagster.\n\n Args:\n parent_run_id (str): The run_id of the run to reexecute.\n step_selection (Sequence[str]):\n The list of step selections to reexecute. Must be a subset or match of the\n set of steps executed in the original run. For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n """\n\n parent_run_id: str\n step_selection: Sequence[str] = []\n\n @staticmethod\n def from_failure(run_id: str, instance: DagsterInstance) -> "ReexecutionOptions":\n """Creates reexecution options from a failed run.\n\n Args:\n run_id (str): The run_id of the failed run. Run must fail in order to be reexecuted.\n instance (DagsterInstance): The DagsterInstance that the original run occurred in.\n\n Returns:\n ReexecutionOptions: Reexecution options to pass to a python execution.\n """\n from dagster._core.execution.plan.state import KnownExecutionState\n\n parent_run = check.not_none(instance.get_run_by_id(run_id))\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n # Tried to thread through KnownExecutionState to execution plan creation, but little benefit.\n # It is recalculated later by the re-execution machinery.\n step_keys_to_execute, _ = KnownExecutionState.build_resume_retry_reexecution(\n instance, parent_run=cast(DagsterRun, instance.get_run_by_id(run_id))\n )\n return ReexecutionOptions(parent_run_id=run_id, step_selection=step_keys_to_execute)
\n\n\n
[docs]def execute_job(\n job: ReconstructableJob,\n instance: "DagsterInstance",\n run_config: Any = None,\n tags: Optional[Mapping[str, Any]] = None,\n raise_on_error: bool = False,\n op_selection: Optional[Sequence[str]] = None,\n reexecution_options: Optional[ReexecutionOptions] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n """Execute a job synchronously.\n\n This API represents dagster's python entrypoint for out-of-process\n execution. For most testing purposes, :py:meth:`~dagster.JobDefinition.\n execute_in_process` will be more suitable, but when wanting to run\n execution using an out-of-process executor (such as :py:class:`dagster.\n multiprocess_executor`), then `execute_job` is suitable.\n\n `execute_job` expects a persistent :py:class:`DagsterInstance` for\n execution, meaning the `$DAGSTER_HOME` environment variable must be set.\n It also expects a reconstructable pointer to a :py:class:`JobDefinition` so\n that it can be reconstructed in separate processes. This can be done by\n wrapping the ``JobDefinition`` in a call to :py:func:`dagster.\n reconstructable`.\n\n .. code-block:: python\n\n from dagster import DagsterInstance, execute_job, job, reconstructable\n\n @job\n def the_job():\n ...\n\n instance = DagsterInstance.get()\n result = execute_job(reconstructable(the_job), instance=instance)\n assert result.success\n\n\n If using the :py:meth:`~dagster.GraphDefinition.to_job` method to\n construct the ``JobDefinition``, then the invocation must be wrapped in a\n module-scope function, which can be passed to ``reconstructable``.\n\n .. 
code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def the_graph():\n ...\n\n def define_job():\n return the_graph.to_job(...)\n\n result = execute_job(reconstructable(define_job), ...)\n\n Since `execute_job` is potentially executing outside of the current\n process, output objects need to be retrieved by use of the provided job's\n io managers. Output objects can be retrieved by opening the result of\n `execute_job` as a context manager.\n\n .. code-block:: python\n\n from dagster import execute_job\n\n with execute_job(...) as result:\n output_obj = result.output_for_node("some_op")\n\n ``execute_job`` can also be used to reexecute a run, by providing a :py:class:`ReexecutionOptions` object.\n\n .. code-block:: python\n\n from dagster import ReexecutionOptions, execute_job\n\n instance = DagsterInstance.get()\n\n options = ReexecutionOptions.from_failure(run_id=failed_run_id, instance)\n execute_job(reconstructable(job), instance, reexecution_options=options)\n\n Parameters:\n job (ReconstructableJob): A reconstructable pointer to a :py:class:`JobDefinition`.\n instance (DagsterInstance): The instance to execute against.\n run_config (Optional[dict]): The configuration that parametrizes this run, as a dict.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to run logs.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``False``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single\n op names) to execute. 
For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n reexecution_options (Optional[ReexecutionOptions]):\n Reexecution options to provide to the run, if this run is\n intended to be a reexecution of a previous run. Cannot be used in\n tandem with the ``op_selection`` argument.\n\n Returns:\n :py:class:`JobExecutionResult`: The result of job execution.\n """\n check.inst_param(job, "job", ReconstructableJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.opt_sequence_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # get the repository load data here because we call job.get_definition() later in this fn\n job_def, _ = _job_with_repository_load_data(job)\n\n if reexecution_options is not None and op_selection is not None:\n raise DagsterInvariantViolationError(\n "re-execution and op selection cannot be used together at this time."\n )\n\n if reexecution_options:\n if run_config is None:\n run = check.not_none(instance.get_run_by_id(reexecution_options.parent_run_id))\n run_config = run.run_config\n return _reexecute_job(\n job_arg=job_def,\n parent_run_id=reexecution_options.parent_run_id,\n run_config=run_config,\n step_selection=list(reexecution_options.step_selection),\n tags=tags,\n instance=instance,\n raise_on_error=raise_on_error,\n )\n else:\n return _logged_execute_job(\n job_arg=job_def,\n instance=instance,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n raise_on_error=raise_on_error,\n asset_selection=asset_selection,\n )
\n\n\n@telemetry_wrapper\ndef _logged_execute_job(\n job_arg: Union[IJob, JobDefinition],\n instance: DagsterInstance,\n run_config: Optional[Mapping[str, object]] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n raise_on_error: bool = True,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n check.inst_param(instance, "instance", DagsterInstance)\n\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (\n job_arg,\n run_config,\n tags,\n resolved_op_selection,\n op_selection,\n ) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n )\n\n log_repo_stats(instance=instance, job=job_arg, source="execute_pipeline")\n\n dagster_run = instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n tags=tags,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n asset_selection=frozenset(asset_selection) if asset_selection else None,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n instance,\n raise_on_error=raise_on_error,\n )\n\n\ndef _reexecute_job(\n job_arg: Union[IJob, JobDefinition],\n parent_run_id: str,\n run_config: Optional[Mapping[str, object]] = None,\n step_selection: Optional[Sequence[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> JobExecutionResult:\n """Reexecute an existing job run."""\n check.opt_sequence_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (job_arg, run_config, tags, _, 
_) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n )\n\n parent_dagster_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_dagster_run is None:\n check.failed(\n f"No parent run with id {parent_run_id} found in instance.",\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n job_arg,\n run_config,\n cast(DagsterRun, parent_dagster_run),\n step_selection,\n )\n\n if parent_dagster_run.asset_selection:\n job_arg = job_arg.get_subset(\n op_selection=None, asset_selection=parent_dagster_run.asset_selection\n )\n\n dagster_run = execute_instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n execution_plan=execution_plan,\n run_config=run_config,\n tags=tags,\n op_selection=parent_dagster_run.op_selection,\n asset_selection=parent_dagster_run.asset_selection,\n resolved_op_selection=parent_dagster_run.resolved_op_selection,\n root_run_id=parent_dagster_run.root_run_id or parent_dagster_run.run_id,\n parent_run_id=parent_dagster_run.run_id,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n execute_instance,\n raise_on_error=raise_on_error,\n )\n check.failed("Should not reach here.")\n\n\ndef execute_plan_iterator(\n execution_plan: ExecutionPlan,\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n retry_mode: Optional[RetryMode] = None,\n run_config: Optional[Mapping[str, object]] = None,\n) -> Iterator[DagsterEvent]:\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.inst_param(instance, "instance", DagsterInstance)\n retry_mode = 
check.opt_inst_param(retry_mode, "retry_mode", RetryMode, RetryMode.DISABLED)\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n if isinstance(job, ReconstructableJob):\n job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n return iter(\n ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=inner_plan_execution_iterator,\n execution_context_manager=PlanExecutionContextManager(\n job=job,\n retry_mode=retry_mode,\n execution_plan=execution_plan,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n ),\n )\n )\n\n\ndef execute_plan(\n execution_plan: ExecutionPlan,\n job: IJob,\n instance: DagsterInstance,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]] = None,\n retry_mode: Optional[RetryMode] = None,\n) -> Sequence[DagsterEvent]:\n """This is the entry point of dagster-graphql executions. For the dagster CLI entry point, see\n execute_job() above.\n """\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n run_config = check.opt_mapping_param(run_config, "run_config")\n check.opt_inst_param(retry_mode, "retry_mode", RetryMode)\n\n return list(\n execute_plan_iterator(\n execution_plan=execution_plan,\n job=job,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n retry_mode=retry_mode,\n )\n )\n\n\ndef _get_execution_plan_from_run(\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n) -> ExecutionPlan:\n execution_plan_snapshot = (\n instance.get_execution_plan_snapshot(dagster_run.execution_plan_snapshot_id)\n if dagster_run.execution_plan_snapshot_id\n else None\n )\n\n # Rebuild from snapshot if able and selection has not changed\n if (\n execution_plan_snapshot is not None\n and execution_plan_snapshot.can_reconstruct_plan\n and 
job.resolved_op_selection == dagster_run.resolved_op_selection\n and job.asset_selection == dagster_run.asset_selection\n ):\n return ExecutionPlan.rebuild_from_snapshot(\n dagster_run.job_name,\n execution_plan_snapshot,\n )\n\n return create_execution_plan(\n job,\n run_config=dagster_run.run_config,\n step_keys_to_execute=dagster_run.step_keys_to_execute,\n instance_ref=instance.get_ref() if instance.is_persistent else None,\n repository_load_data=(\n execution_plan_snapshot.repository_load_data if execution_plan_snapshot else None\n ),\n known_state=(\n execution_plan_snapshot.initial_known_state if execution_plan_snapshot else None\n ),\n )\n\n\ndef create_execution_plan(\n job: Union[IJob, JobDefinition],\n run_config: Optional[Mapping[str, object]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n known_state: Optional[KnownExecutionState] = None,\n instance_ref: Optional[InstanceRef] = None,\n tags: Optional[Mapping[str, str]] = None,\n repository_load_data: Optional[RepositoryLoadData] = None,\n) -> ExecutionPlan:\n if isinstance(job, IJob):\n # If you have repository_load_data, make sure to use it when building plan\n if isinstance(job, ReconstructableJob) and repository_load_data is not None:\n job = job.with_repository_load_data(repository_load_data)\n job_def = job.get_definition()\n else:\n job_def = job\n\n run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n known_state = check.opt_inst_param(\n known_state,\n "known_state",\n KnownExecutionState,\n default=KnownExecutionState(),\n )\n repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n resolved_run_config = ResolvedRunConfig.build(job_def, run_config)\n\n 
return ExecutionPlan.build(\n job_def,\n resolved_run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance_ref=instance_ref,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n\ndef job_execution_iterator(\n job_context: PlanOrchestrationContext, execution_plan: ExecutionPlan\n) -> Iterator[DagsterEvent]:\n """A complete execution of a pipeline. Yields pipeline start, success,\n and failure events.\n\n Args:\n pipeline_context (PlanOrchestrationContext):\n execution_plan (ExecutionPlan):\n """\n # TODO: restart event?\n if not job_context.resume_from_failure:\n yield DagsterEvent.job_start(job_context)\n\n job_exception_info = None\n job_canceled_info = None\n failed_steps = []\n generator_closed = False\n try:\n for event in job_context.executor.execute(job_context, execution_plan):\n if event.is_step_failure:\n failed_steps.append(event.step_key)\n elif event.is_resource_init_failure and event.step_key:\n failed_steps.append(event.step_key)\n\n # Telemetry\n log_dagster_event(event, job_context)\n\n yield event\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except (KeyboardInterrupt, DagsterExecutionInterruptedError):\n job_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except BaseException:\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise # finally block will run before this is re-raised\n finally:\n if job_canceled_info:\n reloaded_run = job_context.instance.get_run_by_id(job_context.run_id)\n if reloaded_run and reloaded_run.status == DagsterRunStatus.CANCELING:\n event = 
DagsterEvent.job_canceled(job_context, job_canceled_info)\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.CANCELED:\n # This happens if the run was force-terminated but was still able to send\n # a cancellation request\n event = DagsterEvent.engine_event(\n job_context,\n "Computational resources were cleaned up after the run was forcibly marked"\n " as canceled.",\n EngineEventData(),\n )\n elif job_context.instance.run_will_resume(job_context.run_id):\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted unexpectedly. No user initiated termination"\n " request was found, not treating as failure because run will be resumed.",\n EngineEventData(),\n )\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.FAILURE:\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted for a run that was already in a failure state.",\n EngineEventData(),\n )\n else:\n event = DagsterEvent.job_failure(\n job_context,\n "Execution was interrupted unexpectedly. "\n "No user initiated termination request was found, treating as failure.",\n job_canceled_info,\n )\n elif job_exception_info:\n event = DagsterEvent.job_failure(\n job_context,\n "An exception was thrown during execution.",\n job_exception_info,\n )\n elif failed_steps:\n event = DagsterEvent.job_failure(\n job_context,\n f"Steps failed: {failed_steps}.",\n )\n else:\n event = DagsterEvent.job_success(job_context)\n if not generator_closed:\n yield event\n\n\nclass ExecuteRunWithPlanIterable:\n """Utility class to consolidate execution logic.\n\n This is a class and not a function because, e.g., in constructing a `scoped_pipeline_context`\n for `JobExecutionResult`, we need to pull out the `pipeline_context` after we're done\n yielding events. This broadly follows a pattern we make use of in other places,\n cf. 
`dagster._utils.EventGenerationManager`.\n """\n\n def __init__(\n self,\n execution_plan: ExecutionPlan,\n iterator: Callable[..., Iterator[DagsterEvent]],\n execution_context_manager: ExecutionContextManager[Any],\n ):\n self.execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n self.iterator = check.callable_param(iterator, "iterator")\n self.execution_context_manager = check.inst_param(\n execution_context_manager, "execution_context_manager", ExecutionContextManager\n )\n\n self.job_context = None\n\n def __iter__(self) -> Iterator[DagsterEvent]:\n # Since interrupts can't be raised at arbitrary points safely, delay them until designated\n # checkpoints during the execution.\n # To be maximally certain that interrupts are always caught during an execution process,\n # you can safely add an additional `with capture_interrupts()` at the very beginning of the\n # process that performs the execution.\n with capture_interrupts():\n yield from self.execution_context_manager.prepare_context()\n self.job_context = self.execution_context_manager.get_context()\n generator_closed = False\n try:\n if self.job_context: # False if we had a pipeline init failure\n yield from self.iterator(\n execution_plan=self.execution_plan,\n job_context=self.job_context,\n )\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n raise\n finally:\n for event in self.execution_context_manager.shutdown_context():\n if not generator_closed:\n yield event\n\n\ndef _check_execute_job_args(\n job_arg: Union[JobDefinition, IJob],\n run_config: Optional[Mapping[str, object]],\n tags: Optional[Mapping[str, str]],\n op_selection: Optional[Sequence[str]] = None,\n) -> Tuple[\n IJob,\n Optional[Mapping],\n Mapping[str, str],\n Optional[AbstractSet[str]],\n Optional[Sequence[str]],\n]:\n ijob = InMemoryJob(job_arg) if 
isinstance(job_arg, JobDefinition) else job_arg\n job_def = job_arg if isinstance(job_arg, JobDefinition) else job_arg.get_definition()\n\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n\n tags = merge_dicts(job_def.tags, tags)\n\n # generate job subset from the given op_selection\n if op_selection:\n ijob = ijob.get_subset(op_selection=op_selection)\n\n return (\n ijob,\n run_config,\n tags,\n ijob.resolved_op_selection,\n op_selection,\n )\n\n\ndef _resolve_reexecute_step_selection(\n instance: DagsterInstance,\n job: IJob,\n run_config: Optional[Mapping],\n parent_dagster_run: DagsterRun,\n step_selection: Sequence[str],\n) -> ExecutionPlan:\n if parent_dagster_run.op_selection:\n job = job.get_subset(op_selection=parent_dagster_run.op_selection)\n\n state = KnownExecutionState.build_for_reexecution(instance, parent_dagster_run)\n\n parent_plan = create_execution_plan(\n job,\n parent_dagster_run.run_config,\n known_state=state,\n )\n step_keys_to_execute = parse_step_selection(parent_plan.get_all_step_deps(), step_selection)\n execution_plan = create_execution_plan(\n job,\n run_config,\n step_keys_to_execute=list(step_keys_to_execute),\n known_state=state.update_for_step_selection(step_keys_to_execute),\n tags=parent_dagster_run.tags,\n )\n return execution_plan\n\n\ndef _job_with_repository_load_data(\n job_arg: Union[JobDefinition, IJob],\n) -> Tuple[Union[JobDefinition, IJob], Optional[RepositoryLoadData]]:\n """For ReconstructableJob, generate and return any required RepositoryLoadData, alongside\n a ReconstructableJob with this repository load data baked in.\n """\n if isinstance(job_arg, ReconstructableJob):\n # Unless this ReconstructableJob alread has repository_load_data attached, this will\n # force the repository_load_data to be computed from scratch.\n repository_load_data = 
job_arg.repository.get_definition().repository_load_data\n return job_arg.with_repository_load_data(repository_load_data), repository_load_data\n return job_arg, None\n
", "current_page_name": "_modules/dagster/_core/execution/api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.api"}, "build_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.build_resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Generator, Mapping, Optional, cast\n\nimport dagster._check as check\nfrom dagster._config import process_config\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.run_config import define_resource_dictionary_cls\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.resources_init import resource_initialization_manager\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._core.system_config.objects import ResourceConfig, config_map_resources\n\nfrom .api import ephemeral_instance_if_missing\nfrom .context_creation_job import initialize_console_manager\n\n\ndef get_mapped_resource_config(\n    resource_defs: Mapping[str, ResourceDefinition], resource_config: Mapping[str, Any]\n) -> Mapping[str, ResourceConfig]:\n    resource_config_schema = define_resource_dictionary_cls(\n        resource_defs, set(resource_defs.keys())\n    )\n    config_evr = process_config(resource_config_schema, resource_config)\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            "Error in config for resources ",\n            config_evr.errors,\n            resource_config,\n        )\n    config_value = cast(Dict[str, Any], config_evr.value)\n    return config_map_resources(resource_defs, config_value)\n\n\n
[docs]@contextmanager\ndef build_resources(\n resources: Mapping[str, Any],\n instance: Optional[DagsterInstance] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n) -> Generator[Resources, None, None]:\n """Context manager that yields resources using provided resource definitions and run config.\n\n This API allows for using resources in an independent context. Resources will be initialized\n with the provided run config, and optionally, dagster_run. The resulting resources will be\n yielded on a dictionary keyed identically to that provided for `resource_defs`. Upon exiting the\n context, resources will also be torn down safely.\n\n Args:\n resources (Mapping[str, Any]): Resource instances or definitions to build. All\n required resource dependencies to a given resource must be contained within this\n dictionary, or the resource build will fail.\n instance (Optional[DagsterInstance]): The dagster instance configured to instantiate\n resources on.\n resource_config (Optional[Mapping[str, Any]]): A dict representing the config to be\n provided to each resource during initialization and teardown.\n dagster_run (Optional[PipelineRun]): The pipeline run to provide during resource\n initialization and teardown. If the provided resources require either the `dagster_run`\n or `run_id` attributes of the provided context during resource initialization and/or\n teardown, this must be provided, or initialization will fail.\n log_manager (Optional[DagsterLogManager]): Log Manager to use during resource\n initialization. Defaults to system log manager.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import resource, build_resources\n\n @resource\n def the_resource():\n return "foo"\n\n with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n assert resources.from_def == "foo"\n assert resources.from_val == "bar"\n\n """\n resources = check.mapping_param(resources, "resource_defs", key_type=str)\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n log_manager = check.opt_inst_param(log_manager, "log_manager", DagsterLogManager)\n resource_defs = wrap_resources_for_execution(resources)\n mapped_resource_config = get_mapped_resource_config(resource_defs, resource_config)\n\n with ephemeral_instance_if_missing(instance) as dagster_instance:\n resources_manager = resource_initialization_manager(\n resource_defs=resource_defs,\n resource_configs=mapped_resource_config,\n log_manager=log_manager if log_manager else initialize_console_manager(dagster_run),\n execution_plan=None,\n dagster_run=dagster_run,\n resource_keys_to_init=set(resource_defs.keys()),\n instance=dagster_instance,\n emit_persistent_events=False,\n )\n try:\n list(resources_manager.generate_setup_events())\n instantiated_resources = check.inst(\n resources_manager.get_object(), ScopedResourcesBuilder\n )\n yield instantiated_resources.build(\n set(instantiated_resources.resource_instance_dict.keys())\n )\n finally:\n list(resources_manager.generate_teardown_events())
\n\n\ndef wrap_resources_for_execution(\n resources: Optional[Mapping[str, Any]] = None\n) -> Dict[str, ResourceDefinition]:\n return (\n {\n resource_key: wrap_resource_for_execution(resource)\n for resource_key, resource in resources.items()\n }\n if resources\n else {}\n )\n\n\ndef wrap_resource_for_execution(resource: Any) -> ResourceDefinition:\n from dagster._config.pythonic_config import ConfigurableResourceFactory, PartialResource\n\n # Wrap instantiated resource values in a resource definition.\n # If an instantiated IO manager is provided, wrap it in an IO manager definition.\n if isinstance(resource, (ConfigurableResourceFactory, PartialResource)):\n return resource.get_resource_definition()\n elif isinstance(resource, ResourceDefinition):\n return resource\n elif isinstance(resource, IOManager):\n return IOManagerDefinition.hardcoded_io_manager(resource)\n else:\n return ResourceDefinition.hardcoded_resource(resource)\n
", "current_page_name": "_modules/dagster/_core/execution/build_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.build_resources"}, "context": {"compute": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.compute

\nfrom abc import ABC, ABCMeta, abstractmethod\nfrom inspect import _empty as EmptyAnnotation\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.data_version import (\n    DataProvenance,\n    DataVersion,\n    extract_data_provenance_from_entry,\n)\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n)\n\nfrom .system import StepExecutionContext\n\n\n# This metaclass has to 
exist for OpExecutionContext to have a metaclass\nclass AbstractComputeMetaclass(ABCMeta):\n    pass\n\n\nclass AbstractComputeExecutionContext(ABC, metaclass=AbstractComputeMetaclass):\n    """Base class for op context implemented by OpExecutionContext and DagstermillExecutionContext."""\n\n    @abstractmethod\n    def has_tag(self, key: str) -> bool:\n        """Implement this method to check if a logging tag is set."""\n\n    @abstractmethod\n    def get_tag(self, key: str) -> Optional[str]:\n        """Implement this method to get a logging tag."""\n\n    @property\n    @abstractmethod\n    def run_id(self) -> str:\n        """The run id for the context."""\n\n    @property\n    @abstractmethod\n    def op_def(self) -> OpDefinition:\n        """The op definition corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def job_def(self) -> JobDefinition:\n        """The job being executed."""\n\n    @property\n    @abstractmethod\n    def run(self) -> DagsterRun:\n        """The DagsterRun object corresponding to the execution."""\n\n    @property\n    @abstractmethod\n    def resources(self) -> Any:\n        """Resources available in the execution context."""\n\n    @property\n    @abstractmethod\n    def log(self) -> DagsterLogManager:\n        """The log manager available in the execution context."""\n\n    @property\n    @abstractmethod\n    def op_config(self) -> Any:\n        """The parsed config specific to this op."""\n\n\nclass OpExecutionContextMetaClass(AbstractComputeMetaclass):\n    def __instancecheck__(cls, instance) -> bool:\n        # This makes isinstance(context, OpExecutionContext) throw a deprecation warning when\n        # context is an AssetExecutionContext. 
This metaclass can be deleted once AssetExecutionContext\n        # has been split into it's own class in 1.7.0\n        if type(instance) is AssetExecutionContext and cls is not AssetExecutionContext:\n            deprecation_warning(\n                subject="AssetExecutionContext",\n                additional_warn_text=(\n                    "Starting in version 1.7.0 AssetExecutionContext will no longer be a subclass"\n                    " of OpExecutionContext."\n                ),\n                breaking_version="1.7.0",\n                stacklevel=1,\n            )\n        return super().__instancecheck__(instance)\n\n\n
[docs]class OpExecutionContext(AbstractComputeExecutionContext, metaclass=OpExecutionContextMetaClass):\n """The ``context`` object that can be made available as the first argument to the function\n used for computing an op or asset.\n\n This context object provides system information such as resources, config, and logging.\n\n To construct an execution context for testing purposes, use :py:func:`dagster.build_op_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import op, OpExecutionContext\n\n @op\n def hello_world(context: OpExecutionContext):\n context.log.info("Hello, world!")\n """\n\n __slots__ = ["_step_execution_context"]\n\n def __init__(self, step_execution_context: StepExecutionContext):\n self._step_execution_context = check.inst_param(\n step_execution_context,\n "step_execution_context",\n StepExecutionContext,\n )\n self._pdb: Optional[ForkedPdb] = None\n self._events: List[DagsterEvent] = []\n self._output_metadata: Dict[str, Any] = {}\n\n @public\n @property\n def op_config(self) -> Any:\n """Any: The parsed config specific to this op."""\n return self._step_execution_context.op_config\n\n @property\n def dagster_run(self) -> DagsterRun:\n """PipelineRun: The current pipeline run."""\n return self._step_execution_context.dagster_run\n\n @property\n def run(self) -> DagsterRun:\n """DagsterRun: The current run."""\n return self.dagster_run\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current Dagster instance."""\n return self._step_execution_context.instance\n\n @public\n @property\n def pdb(self) -> ForkedPdb:\n """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the op.\n\n Example:\n .. 
code-block:: python\n\n @op\n def debug(context):\n context.pdb.set_trace()\n """\n if self._pdb is None:\n self._pdb = ForkedPdb()\n\n return self._pdb\n\n @property\n def file_manager(self):\n """Deprecated access to the file manager.\n\n :meta private:\n """\n raise DagsterInvalidPropertyError(\n "You have attempted to access the file manager which has been moved to resources in"\n " 0.10.0. Please access it via `context.resources.file_manager` instead."\n )\n\n @public\n @property\n def resources(self) -> Any:\n """Resources: The currently available resources."""\n return self._step_execution_context.resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n """Optional[StepLauncher]: The current step launcher, if any."""\n return self._step_execution_context.step_launcher\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the current execution's run."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, object]:\n """dict: The run config for the current execution."""\n return self._step_execution_context.run_config\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The currently executing pipeline."""\n return self._step_execution_context.job_def\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the currently executing pipeline."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """DagsterLogManager: The log manager available in the execution context."""\n return self._step_execution_context.log\n\n @property\n def node_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self._step_execution_context.node_handle\n\n @property\n def op_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self.node_handle\n\n @property\n def op(self) -> Node:\n 
"""Node: The object representing the invoked op within the graph.\n\n :meta private:\n\n """\n return self._step_execution_context.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """OpDefinition: The current op definition."""\n return cast(OpDefinition, self.op.definition)\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._step_execution_context.has_partition_key\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run. Or if the current run is operating\n over a range of partitions (ie. a backfill of several partitions executed in a single run).\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n """\n return self._step_execution_context.partition_key\n\n @deprecated(breaking_version="2.0", additional_warn_text="Use `partition_key_range` instead.")\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n """\n return self.partition_key_range\n\n @public\n @property\n def partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, returns a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n\n Examples:\n .. 
code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key_range)\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n """\n return self._step_execution_context.asset_partition_key_range\n\n @public\n @property\n def partition_time_window(self) -> TimeWindow:\n """The partition time window for the current run.\n\n Raises an error if the current run is not a partitioned run, or if the job's partition\n definition is not a TimeWindowPartitionsDefinition.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_time_window)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n """\n return self._step_execution_context.partition_time_window\n\n
[docs] @public\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is set.\n\n Args:\n key (str): The tag to check.\n\n Returns:\n bool: Whether the tag is set.\n """\n return self._step_execution_context.has_tag(key)
\n\n
[docs] @public\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag.\n\n Args:\n key (tag): The tag to get.\n\n Returns:\n Optional[str]: The value of the tag, if present.\n """\n return self._step_execution_context.get_tag(key)
\n\n @property\n def run_tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for the current run."""\n return self._step_execution_context.run_tags\n\n def has_events(self) -> bool:\n return bool(self._events)\n\n def consume_events(self) -> Iterator[DagsterEvent]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the beginning of the op's computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n
[docs] @public\n def log_event(self, event: UserEvent) -> None:\n """Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.\n\n Events logged with this method will appear in the list of DagsterEvents, as well as the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation, ExpectationResult]): The event to log.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import op, AssetMaterialization\n\n @op\n def log_materialization(context):\n context.log_event(AssetMaterialization("foo"))\n """\n if isinstance(event, AssetMaterialization):\n self._events.append(\n DagsterEvent.asset_materialization(self._step_execution_context, event)\n )\n elif isinstance(event, AssetObservation):\n self._events.append(DagsterEvent.asset_observation(self._step_execution_context, event))\n elif isinstance(event, ExpectationResult):\n self._events.append(\n DagsterEvent.step_expectation_result(self._step_execution_context, event)\n )\n else:\n check.failed(f"Unexpected event {event}")
\n\n
[docs] @public\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n """Add metadata to one of the outputs of an op.\n\n This can be invoked multiple times per output in the body of an op. If the same key is\n passed multiple times, the value associated with the last call will be used.\n\n Args:\n metadata (Mapping[str, Any]): The metadata to attach to the output\n output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n mapping_key (Optional[str]): The mapping key of the output to attach metadata to. If the\n output is not dynamic, this argument does not need to be provided.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import Out, op\n from typing import Tuple\n\n @op\n def add_metadata(context):\n context.add_output_metadata({"foo", "bar"})\n return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n @op(out={"a": Out(), "b": Out()})\n def add_metadata_two_outputs(context) -> Tuple[str, int]:\n context.add_output_metadata({"foo": "bar"}, output_name="b")\n context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n return ("dog", 5)\n\n """\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n output_name = check.opt_str_param(output_name, "output_name")\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n self._step_execution_context.add_output_metadata(\n metadata=metadata, output_name=output_name, mapping_key=mapping_key\n )
\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n return self._step_execution_context.get_output_metadata(\n output_name=output_name, mapping_key=mapping_key\n )\n\n def get_step_execution_context(self) -> StepExecutionContext:\n """Allows advanced users (e.g. framework authors) to punch through to the underlying\n step execution context.\n\n :meta private:\n\n Returns:\n StepExecutionContext: The underlying system context.\n """\n return self._step_execution_context\n\n @public\n @property\n def retry_number(self) -> int:\n """Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc."""\n return self._step_execution_context.previous_attempt_count\n\n def describe_op(self):\n return self._step_execution_context.describe_op()\n\n
[docs] @public\n def get_mapping_key(self) -> Optional[str]:\n """Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None."""\n return self._step_execution_context.step.get_mapping_key()
\n\n #############################################################################################\n # asset related methods\n #############################################################################################\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The AssetKey for the current asset. In a multi_asset, use asset_key_for_output instead."""\n if self.has_assets_def and len(self.assets_def.keys_by_output_name.keys()) > 1:\n raise DagsterInvariantViolationError(\n "Cannot call `context.asset_key` in a multi_asset with more than one asset. Use"\n " `context.asset_key_for_output` instead."\n )\n # pass in the output name to handle the case when a multi_asset has a single AssetOut\n return self.asset_key_for_output(\n output_name=next(iter(self.assets_def.keys_by_output_name.keys()))\n )\n\n @public\n @property\n def has_assets_def(self) -> bool:\n """If there is a backing AssetsDefinition for what is currently executing."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n return assets_def is not None\n\n @public\n @property\n def assets_def(self) -> AssetsDefinition:\n """The backing AssetsDefinition for what is currently executing, errors if not available."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n if assets_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an assets definition."\n )\n return assets_def\n\n @public\n @property\n def selected_asset_keys(self) -> AbstractSet[AssetKey]:\n """Get the set of AssetKeys this execution is expected to materialize."""\n if not self.has_assets_def:\n return set()\n return self.assets_def.keys\n\n @public\n @property\n def has_asset_checks_def(self) -> bool:\n """Return a boolean indicating the presence of a backing AssetChecksDefinition\n for the current execution.\n\n Returns:\n bool: True if there is a backing AssetChecksDefinition for the current execution, otherwise False.\n """\n 
return self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle) is not None\n\n @public\n @property\n def asset_checks_def(self) -> AssetChecksDefinition:\n """The backing AssetChecksDefinition for what is currently executing, errors if not\n available.\n\n Returns:\n AssetChecksDefinition.\n """\n asset_checks_def = self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle)\n if asset_checks_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an asset checks definition."\n )\n\n return asset_checks_def\n\n @public\n @property\n def selected_asset_check_keys(self) -> AbstractSet[AssetCheckKey]:\n if self.has_assets_def:\n return self.assets_def.check_keys\n\n if self.has_asset_checks_def:\n check.failed("Subset selection is not yet supported within an AssetChecksDefinition")\n\n return set()\n\n @public\n @property\n def selected_output_names(self) -> AbstractSet[str]:\n """Get the output names that correspond to the current selection of assets this execution is expected to materialize."""\n # map selected asset keys to the output names they correspond to\n selected_asset_keys = self.selected_asset_keys\n selected_outputs: Set[str] = set()\n for output_name in self.op.output_dict.keys():\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output_name\n )\n if any( # For graph-backed assets, check if a downstream asset is selected\n [\n asset_key in selected_asset_keys\n for asset_key in self.job_def.asset_layer.downstream_dep_assets(\n self.node_handle, output_name\n )\n ]\n ) or (asset_info and asset_info.key in selected_asset_keys):\n selected_outputs.add(output_name)\n\n return selected_outputs\n\n
[docs] @public\n def asset_key_for_output(self, output_name: str = "result") -> AssetKey:\n """Return the AssetKey for the corresponding output."""\n asset_output_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.op_handle, output_name=output_name\n )\n if asset_output_info is None:\n check.failed(f"Output '{output_name}' has no asset")\n else:\n return asset_output_info.key
\n\n
[docs] @public\n def output_for_asset_key(self, asset_key: AssetKey) -> str:\n """Return the output name for the corresponding asset key."""\n node_output_handle = self.job_def.asset_layer.node_output_handle_for_asset(asset_key)\n if node_output_handle is None:\n check.failed(f"Asset key '{asset_key}' has no output")\n else:\n return node_output_handle.output_name
\n\n
[docs] @public\n def asset_key_for_input(self, input_name: str) -> AssetKey:\n """Return the AssetKey for the corresponding input."""\n key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.op_handle, input_name=input_name\n )\n if key is None:\n check.failed(f"Input '{input_name}' has no asset")\n else:\n return key
\n\n
[docs] @public\n def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n """Returns the asset partition key for the given output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output("first_asset"))\n context.log.info(context.asset_partition_key_for_output("second_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n """\n return self._step_execution_context.asset_partition_key_for_output(output_name)
\n\n
[docs] @public\n def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_output`` to get the TimeWindow of all of the partitions\n being materialized by the backfill.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output("first_asset"))\n context.log.info(context.asset_partitions_time_window_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # 
TimeWindow("2023-08-21", "2023-08-22")\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_output(\n self, output_name: str = "result"\n ) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding output. Errors if the run is not partitioned.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key range for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output("first_asset"))\n context.log.info(context.asset_partition_key_range_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def 
self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n """\n return self._step_execution_context.asset_partition_key_range_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding input. Errors if the asset depends on a\n non-contiguous chunk of the input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_input`` to get the range of partition keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the partition key range for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n },\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this 
asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n """\n return self._step_execution_context.asset_partition_key_range_for_input(input_name)
\n\n
[docs] @public\n def asset_partition_key_for_input(self, input_name: str) -> str:\n """Returns the partition key of the upstream asset corresponding to the given input.\n\n Args:\n input_name (str): The name of the input to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-20"\n\n """\n return self._step_execution_context.asset_partition_key_for_input(input_name)
\n\n
[docs] @public\n def asset_partitions_def_for_output(self, output_name: str = "result") -> PartitionsDefinition:\n """The PartitionsDefinition on the asset corresponding to this output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n },\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output("first_asset"))\n context.log.info(context.asset_partitions_def_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_output(output_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partitions_def_for_input(self, input_name: str) -> PartitionsDefinition:\n """The PartitionsDefinition on the upstream asset corresponding to this input.\n\n Args:\n input_name (str): The name of the input to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_def_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_input(input_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partition_keys_for_output(self, output_name: str = "result") -> Sequence[str]:\n """Returns a list of the partition keys for the given output.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition keys for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output("first_asset"))\n context.log.info(context.asset_partition_keys_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: 
AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n """\n return self.asset_partitions_def_for_output(output_name).get_partition_keys_in_range(\n self._step_execution_context.asset_partition_key_range_for_output(output_name),\n dynamic_partitions_store=self.instance,\n )
\n\n
[docs] @public\n def asset_partition_keys_for_input(self, input_name: str) -> Sequence[str]:\n """Returns a list of the partition keys of the upstream asset corresponding to the\n given input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_input`` to get all of the partition keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the partition keys for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n },\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", 
"2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n """\n return list(\n self._step_execution_context.asset_partitions_subset_for_input(\n input_name\n ).get_partition_keys()\n )
\n\n
[docs] @public\n def asset_partitions_time_window_for_input(self, input_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_input`` to get the time window of the input that\n is relevant to that backfill.\n\n Raises an error if either of the following is true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n input_name (str): The name of the input to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n },\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-25")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n 
"self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-25")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_input(input_name)
\n\n
[docs] @public\n @experimental\n def get_asset_provenance(self, asset_key: AssetKey) -> Optional[DataProvenance]:\n """Return the provenance information for the most recent materialization of an asset.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to retrieve provenance.\n\n Returns:\n Optional[DataProvenance]: Provenance information for the most recent\n materialization of the asset. Returns `None` if the asset was never materialized or\n the materialization record is too old to contain provenance information.\n """\n record = self.instance.get_latest_data_version_record(asset_key)\n\n return (\n None if record is None else extract_data_provenance_from_entry(record.event_log_entry)\n )
\n\n def set_data_version(self, asset_key: AssetKey, data_version: DataVersion) -> None:\n """Set the data version for an asset being materialized by the currently executing step.\n This is useful for external execution situations where it is not possible to return\n an `Output`.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to set the data version.\n data_version (DataVersion): The data version to set.\n """\n self._step_execution_context.set_data_version(asset_key, data_version)\n\n @property\n def asset_check_spec(self) -> AssetCheckSpec:\n asset_checks_def = check.not_none(\n self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle),\n "This context does not correspond to an AssetChecksDefinition",\n )\n return asset_checks_def.spec\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._step_execution_context.requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._step_execution_context.typed_event_stream_error_message\n\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None) -> None:\n self._step_execution_context.set_requires_typed_event_stream(error_message=error_message)
\n\n\n
[docs]class AssetExecutionContext(OpExecutionContext):\n def __init__(self, step_execution_context: StepExecutionContext):\n super().__init__(step_execution_context=step_execution_context)
\n\n\ndef build_execution_context(\n step_context: StepExecutionContext,\n) -> Union[OpExecutionContext, AssetExecutionContext]:\n """Get the correct context based on the type of step (op or asset) and the user provided context\n type annotation. Follows these rules.\n\n step type annotation result\n asset AssetExecutionContext AssetExecutionContext\n asset OpExecutionContext OpExecutionContext\n asset None AssetExecutionContext\n op AssetExecutionContext Error - we cannot init an AssetExecutionContext w/o an AssetsDefinition\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n For ops in graph-backed assets\n step type annotation result\n op AssetExecutionContext AssetExecutionContext\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n """\n is_sda_step = step_context.is_sda_step\n is_op_in_graph_asset = is_sda_step and step_context.is_op_in_graph\n context_annotation = EmptyAnnotation\n compute_fn = step_context.op_def._compute_fn # noqa: SLF001\n compute_fn = (\n compute_fn\n if isinstance(compute_fn, DecoratedOpFunction)\n else DecoratedOpFunction(compute_fn)\n )\n if compute_fn.has_context_arg():\n context_param = compute_fn.get_context_arg()\n context_annotation = context_param.annotation\n\n # It would be nice to do this check at definition time, rather than at run time, but we don't\n # know if the op is part of an op job or a graph-backed asset until we have the step execution context\n if context_annotation is AssetExecutionContext and not is_sda_step:\n # AssetExecutionContext requires an AssetsDefinition during init, so an op in an op job\n # cannot be annotated with AssetExecutionContext\n raise DagsterInvalidDefinitionError(\n "Cannot annotate @op `context` parameter with type AssetExecutionContext unless the"\n " op is part of a graph-backed asset. 
`context` must be annotated with"\n " OpExecutionContext, or left blank."\n )\n\n if context_annotation is EmptyAnnotation:\n # if no type hint has been given, default to:\n # * AssetExecutionContext for sda steps, not in graph-backed assets\n # * OpExecutionContext for non sda steps\n # * OpExecutionContext for ops in graph-backed assets\n if is_op_in_graph_asset or not is_sda_step:\n return OpExecutionContext(step_context)\n return AssetExecutionContext(step_context)\n if context_annotation is AssetExecutionContext:\n return AssetExecutionContext(step_context)\n return OpExecutionContext(step_context)\n
", "current_page_name": "_modules/dagster/_core/execution/context/compute", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.compute"}, "hook": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.hook

\nimport warnings\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, Mapping, Optional, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom ...definitions.composition import PendingNodeInvocation\nfrom ...definitions.decorators.graph_decorator import graph\nfrom ...definitions.dependency import Node\nfrom ...definitions.hook_definition import HookDefinition\nfrom ...definitions.op_definition import OpDefinition\nfrom ...definitions.resource_definition import IContainsGenerator, Resources\nfrom ...errors import DagsterInvalidPropertyError, DagsterInvariantViolationError\nfrom ...log_manager import DagsterLogManager\nfrom ..plan.step import ExecutionStep\nfrom ..plan.utils import RetryRequestedFromPolicy\nfrom .system import StepExecutionContext\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set when a `HookContext` is constructed from "\n        "`build_hook_context`."\n    )\n\n\ndef _check_property_on_test_context(\n    context: "HookContext", attr_str: str, user_facing_name: str, param_on_builder: str\n):\n    """Check if attribute is not None on context. If none, error, and point user in direction of\n    how to specify the parameter on the context object.\n    """\n    value = getattr(context, attr_str)\n    if value is None:\n        raise DagsterInvalidPropertyError(\n            f"Attribute '{user_facing_name}' was not provided when "\n            f"constructing context. Provide a value for the '{param_on_builder}' parameter on "\n            "'build_hook_context'. To learn more, check out the testing hooks section of Dagster's "\n            "concepts docs: https://docs.dagster.io/concepts/ops-jobs-graphs/op-hooks#testing-hooks"\n        )\n    else:\n        return value\n\n\n
[docs]class HookContext:\n """The ``context`` object available to a hook function on a DagsterEvent."""\n\n def __init__(\n self,\n step_execution_context: StepExecutionContext,\n hook_def: HookDefinition,\n ):\n self._step_execution_context = step_execution_context\n self._hook_def = check.inst_param(hook_def, "hook_def", HookDefinition)\n self._required_resource_keys = hook_def.required_resource_keys\n self._resources = step_execution_context.scoped_resources_builder.build(\n self._required_resource_keys\n )\n\n @public\n @property\n def job_name(self) -> str:\n """The name of the job where this hook is being triggered."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run where this hook is being triggered."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def hook_def(self) -> HookDefinition:\n """The hook that the context object belongs to."""\n return self._hook_def\n\n @public\n @property\n def instance(self) -> "DagsterInstance":\n """The instance configured to run the current job."""\n return self._step_execution_context.instance\n\n @property\n def op(self) -> Node:\n """The op instance associated with the hook."""\n return self._step_execution_context.op\n\n @property\n def step(self) -> ExecutionStep:\n warnings.warn(\n "The step property of HookContext has been deprecated, and will be removed "\n "in a future release."\n )\n return self._step_execution_context.step\n\n @public\n @property\n def step_key(self) -> str:\n """The key for the step where this hook is being triggered."""\n return self._step_execution_context.step.key\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """Resources required by this hook."""\n return self._required_resource_keys\n\n @public\n @property\n def resources(self) -> "Resources":\n """Resources available in the hook context."""\n return self._resources\n\n @property\n def solid_config(self) -> 
Any:\n solid_config = self._step_execution_context.resolved_run_config.ops.get(\n str(self._step_execution_context.step.node_handle)\n )\n return solid_config.config if solid_config else None\n\n @public\n @property\n def op_config(self) -> Any:\n """The parsed config specific to this op."""\n return self.solid_config\n\n # Because of the fact that we directly use the log manager of the step, if a user calls\n # hook_context.log.with_tags, then they will end up mutating the step's logging tags as well.\n # This is not problematic because the hook only runs after the step has been completed.\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._step_execution_context.log\n\n @property\n def solid_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed solid.\n\n Returns:\n Optional[BaseException]: the exception object, None if the solid execution succeeds.\n """\n return self.op_exception\n\n @public\n @property\n def op_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed op."""\n exc = self._step_execution_context.step_exception\n\n if isinstance(exc, RetryRequestedFromPolicy):\n return exc.__cause__\n\n return exc\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n results: Dict[str, Union[Any, Dict[str, Any]]] = {}\n captured = self._step_execution_context.step_output_capture\n\n if captured is None:\n check.failed("Outputs were unexpectedly not captured for hook")\n\n # make the returned values more user-friendly\n for step_output_handle, value in captured.items():\n if step_output_handle.mapping_key:\n if results.get(step_output_handle.output_name) is None:\n 
results[step_output_handle.output_name] = {\n step_output_handle.mapping_key: value\n }\n else:\n results[step_output_handle.output_name][step_output_handle.mapping_key] = value\n else:\n results[step_output_handle.output_name] = value\n\n return results\n\n @public\n @property\n def op_output_values(self):\n """Computed output values in an op."""\n return self.solid_output_values
\n\n\nclass UnboundHookContext(HookContext):\n def __init__(\n self,\n resources: Mapping[str, Any],\n op: Optional[Union[OpDefinition, PendingNodeInvocation]],\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n instance: Optional["DagsterInstance"],\n ):\n from ..build_resources import build_resources, wrap_resources_for_execution\n from ..context_creation_job import initialize_console_manager\n\n self._op = None\n if op is not None:\n\n @graph(name="hook_context_container")\n def temp_graph():\n op()\n\n self._op = temp_graph.nodes[0]\n\n # Open resource context manager\n self._resource_defs = wrap_resources_for_execution(resources)\n self._resources_cm = build_resources(self._resource_defs)\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n self._log = initialize_console_manager(None)\n\n self._cm_scope_entered = False\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc: Any):\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def 
step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> Set[str]:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def resources(self) -> "Resources":\n if self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_hook_context(...) as context:`"\n )\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log\n\n @property\n def op_exception(self) -> Optional[BaseException]:\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> "DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\nclass BoundHookContext(HookContext):\n def __init__(\n self,\n hook_def: HookDefinition,\n resources: Resources,\n op: Optional[Node],\n log_manager: DagsterLogManager,\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n 
instance: Optional["DagsterInstance"],\n ):\n self._hook_def = hook_def\n self._resources = resources\n self._op = op\n self._log_manager = log_manager\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._hook_def.required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n @property\n def op_exception(self):\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> 
"DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\n
[docs]def build_hook_context(\n resources: Optional[Mapping[str, Any]] = None,\n op: Optional[Union[OpDefinition, PendingNodeInvocation]] = None,\n run_id: Optional[str] = None,\n job_name: Optional[str] = None,\n op_exception: Optional[Exception] = None,\n instance: Optional["DagsterInstance"] = None,\n) -> UnboundHookContext:\n """Builds hook context from provided parameters.\n\n ``build_hook_context`` can be used as either a function or a context manager. If there is a\n provided resource to ``build_hook_context`` that is a context manager, then it must be used as a\n context manager. This function can be used to provide the context argument to the invocation of\n a hook definition.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can\n either be values or resource definitions.\n op (Optional[OpDefinition, PendingNodeInvocation]): The op definition which the\n hook may be associated with.\n run_id (Optional[str]): The id of the run in which the hook is invoked (provided for mocking purposes).\n job_name (Optional[str]): The name of the job in which the hook is used (provided for mocking purposes).\n op_exception (Optional[Exception]): The exception that caused the hook to be triggered.\n instance (Optional[DagsterInstance]): The Dagster instance configured to run the hook.\n\n Examples:\n .. 
code-block:: python\n\n context = build_hook_context()\n hook_to_invoke(context)\n\n with build_hook_context(resources={"foo": context_manager_resource}) as context:\n hook_to_invoke(context)\n """\n op = check.opt_inst_param(op, "op", (OpDefinition, PendingNodeInvocation))\n\n from dagster._core.instance import DagsterInstance\n\n return UnboundHookContext(\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n op=op,\n run_id=check.opt_str_param(run_id, "run_id"),\n job_name=check.opt_str_param(job_name, "job_name"),\n op_exception=check.opt_inst_param(op_exception, "op_exception", Exception),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/hook", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.hook"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.init

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\n\n
[docs]class InitResourceContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.ResourceDefinition`.\n\n Users should not instantiate this object directly. To construct an `InitResourceContext` for testing purposes, use :py:func:`dagster.build_init_resource_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import resource, InitResourceContext\n\n @resource\n def the_resource(init_context: InitResourceContext):\n init_context.log.info("Hello, world!")\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Resources,\n resource_def: Optional[ResourceDefinition] = None,\n instance: Optional[DagsterInstance] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n ):\n self._resource_config = resource_config\n self._resource_def = resource_def\n self._log_manager = log_manager\n self._instance = instance\n self._resources = resources\n self._dagster_run = dagster_run\n\n @public\n @property\n def resource_config(self) -> Any:\n """The configuration data provided by the run config. The schema\n for this data is defined by the ``config_field`` argument to\n :py:class:`ResourceDefinition`.\n """\n return self._resource_config\n\n @public\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n """The definition of the resource currently being constructed."""\n return self._resource_def\n\n @public\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n return self._resources\n\n @public\n @property\n def instance(self) -> Optional[DagsterInstance]:\n """The Dagster instance configured for the current execution context."""\n return self._instance\n\n @property\n def dagster_run(self) -> Optional[DagsterRun]:\n """The dagster run to use. 
When initializing resources outside of execution context, this will be None."""\n return self._dagster_run\n\n @public\n @property\n def log(self) -> Optional[DagsterLogManager]:\n """The Dagster log manager configured for the current execution context."""\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @public\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n """The log manager for this run of the job."""\n return self._log_manager\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The id for this run of the job or pipeline. When initializing resources outside of\n execution context, this will be None.\n """\n return self.dagster_run.run_id if self.dagster_run else None\n\n def replace_config(self, config: Any) -> "InitResourceContext":\n return InitResourceContext(\n resource_config=config,\n resources=self.resources,\n instance=self.instance,\n resource_def=self.resource_def,\n dagster_run=self.dagster_run,\n log_manager=self.log,\n )
\n\n\nclass UnboundInitResourceContext(InitResourceContext):\n """Resource initialization context outputted by ``build_init_resource_context``.\n\n Represents a context whose config has not yet been validated against a resource definition,\n hence the inability to access the `resource_def` attribute. When an instance of\n ``UnboundInitResourceContext`` is passed to a resource invocation, config is validated,\n and it is subsumed into an `InitResourceContext`, which contains the resource_def validated\n against.\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Optional[Union[Resources, Mapping[str, Any]]],\n instance: Optional[DagsterInstance],\n ):\n from dagster._core.execution.api import ephemeral_instance_if_missing\n from dagster._core.execution.build_resources import (\n build_resources,\n wrap_resources_for_execution,\n )\n from dagster._core.execution.context_creation_job import initialize_console_manager\n\n self._instance_provided = (\n check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n )\n # Construct ephemeral instance if missing\n self._instance_cm = ephemeral_instance_if_missing(instance)\n # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n # so ignore lint error\n instance = self._instance_cm.__enter__()\n\n if isinstance(resources, Resources):\n check.failed("Should not have a Resources object directly from this initialization")\n\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resources, "resources")\n )\n\n self._resources_cm = build_resources(self._resource_defs, instance=instance)\n resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(resources, IContainsGenerator)\n\n self._cm_scope_entered = False\n super(UnboundInitResourceContext, self).__init__(\n resource_config=resource_config,\n resources=resources,\n resource_def=None,\n instance=instance,\n dagster_run=None,\n 
log_manager=initialize_console_manager(None),\n )\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n self._resources_cm.__exit__(*exc)\n if self._instance_provided:\n self._instance_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n if self._instance_provided and not self._cm_scope_entered:\n self._instance_cm.__exit__(None, None, None)\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_init_resource_context(...) as context:`"\n )\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return None\n\n\n
[docs]def build_init_resource_context(\n config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> InitResourceContext:\n """Builds resource initialization context from provided parameters.\n\n ``build_init_resource_context`` can be used as either a function or context manager. If there is a\n provided resource to ``build_init_resource_context`` that is a context manager, then it must be\n used as a context manager. This function can be used to provide the context argument to the\n invocation of a resource.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n config (Optional[Any]): The resource config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_init_resource_context()\n resource_to_init(context)\n\n with build_init_resource_context(\n resources={"foo": context_manager_resource}\n ) as context:\n resource_to_init(context)\n\n """\n return UnboundInitResourceContext(\n resource_config=check.opt_mapping_param(config, "config", key_type=str),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.init"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.input

\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n)\nfrom dagster._core.definitions.partition import PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow, TimeWindowPartitionsSubset\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.types.dagster_type import DagsterType\n\n    from .output import OutputContext\n\n\n
[docs]class InputContext:\n """The ``context`` object available to the load_input method of :py:class:`InputManager`.\n\n Users should not instantiate this object directly. In order to construct\n an `InputContext` for testing an IO Manager's `load_input` method, use\n :py:func:`dagster.build_input_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, InputContext\n\n class MyIOManager(IOManager):\n def load_input(self, context: InputContext):\n ...\n """\n\n def __init__(\n self,\n *,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n op_def: Optional["OpDefinition"] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Union["Resources", Mapping[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[AssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partitions_subset: Optional[PartitionsSubset] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._name = name\n self._job_name = job_name\n self._op_def = op_def\n self._config = config\n self._metadata = metadata or {}\n self._upstream_output = upstream_output\n self._dagster_type = dagster_type\n self._log = log_manager\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_key = asset_key\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n 
self._asset_partitions_subset = asset_partitions_subset\n self._asset_partitions_def = asset_partitions_def\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._observations: List[AssetObservation] = []\n self._instance = instance\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def instance(self) -> DagsterInstance:\n if self._instance is None:\n raise DagsterInvariantViolationError(\n "Attempting to access instance, "\n "but it was not provided when constructing the InputContext"\n )\n return self._instance\n\n @public\n @property\n def has_input_name(self) -> bool:\n """If we're the InputContext is being used to load the result of a run from outside the run,\n then it won't have an input name.\n """\n return self._name is not None\n\n @public\n @property\n def name(self) -> str:\n """The name of the input that we're loading."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access job_name, "\n "but it was not provided when constructing the InputContext"\n )\n return self._job_name\n\n @public\n @property\n def op_def(self) 
-> "OpDefinition":\n """The definition of the op that's loading the input."""\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._op_def\n\n @public\n @property\n def config(self) -> Any:\n """The config attached to the input that we're loading."""\n return self._config\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of metadata that is assigned to the InputDefinition that we're loading for.\n This property only contains metadata passed in explicitly with :py:class:`AssetIn`\n or :py:class:`In`. To access metadata of an upstream asset or operation definition,\n use the metadata in :py:attr:`.InputContext.upstream_output`.\n """\n return self._metadata\n\n @public\n @property\n def upstream_output(self) -> Optional["OutputContext"]:\n """Info about the output that produced the object we're loading."""\n return self._upstream_output\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this input.\n Dagster types do not propagate from an upstream output to downstream inputs,\n and this property only captures type information for the input that is either\n passed in explicitly with :py:class:`AssetIn` or :py:class:`In`, or can be\n infered from type hints. 
For an asset input, the Dagster type from the upstream\n asset definition is ignored.\n """\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this input."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._log\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, Any]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the resource that initializes the\n input manager. If using the :py:func:`@input_manager` decorator, these resources\n correspond to those requested with the `required_resource_keys` parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the InputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_input_context(...) as context:`"\n )\n return self._resources\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being loaded as input, otherwise returns False. 
A return value of False\n indicates that an output from an op is being loaded as the input.\n """\n return self._asset_key is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being loaded as an input."""\n if self._asset_key is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, but no asset is associated with this input"\n )\n\n return self._asset_key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the upstream asset corresponding to this input."""\n if self._asset_partitions_def is None:\n if self.asset_key:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {self.asset_key}, but it is not"\n " partitioned"\n )\n else:\n raise DagsterInvariantViolationError(\n "Attempting to access partitions def for asset, but input does not correspond"\n " to an asset"\n )\n\n return self._asset_partitions_def\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being loaded as input is partitioned."""\n return self._asset_partitions_subset is not None\n\n @public\n @property\n def 
asset_partition_key(self) -> str:\n """The partition key for input asset.\n\n Raises an error if the input asset has no partitioning, or if the run covers a partition\n range for the input asset.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed("The input does not correspond to a partitioned asset.")\n\n partition_keys = list(subset.get_partition_keys())\n if len(partition_keys) == 1:\n return partition_keys[0]\n else:\n check.failed(\n f"Tried to access partition key for asset '{self.asset_key}', "\n f"but the number of input partitions != 1: '{subset}'."\n )\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partition_key_range, but the asset is not partitioned.",\n )\n\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset_partition_key_range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n if self._asset_partitions_subset is None:\n check.failed(\n "Tried to access asset_partition_keys, but the asset is not partitioned.",\n )\n\n return list(self._asset_partitions_subset.get_partition_keys())\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a 
TimeWindowPartitionsDefinition.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned.",\n )\n\n if not isinstance(subset, TimeWindowPartitionsSubset):\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned"\n " with time windows.",\n )\n\n time_windows = subset.included_time_windows\n if len(time_windows) != 1:\n check.failed(\n "Tried to access asset_partitions_time_window, but there are "\n f"({len(time_windows)}) time windows associated with this input.",\n )\n\n return time_windows[0]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step input.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the input.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the input is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n if self.upstream_output is None:\n raise DagsterInvariantViolationError(\n "InputContext.upstream_output not defined. Cannot compute an identifier"\n )\n\n return self.upstream_output.get_identifier()
\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being loaded as an input.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])``, materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset identifier for an input with no asset key")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_input`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def add_input_metadata(\n self,\n metadata: Mapping[str, Any],\n description: Optional[str] = None,\n ) -> None:\n """Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\n If the input is an asset, metadata will be attached to an asset observation.\n\n The asset observation will be yielded from the run and appear in the event log.\n Only valid if the context has an asset key.\n """\n from dagster._core.definitions.metadata import normalize_metadata\n from dagster._core.events import DagsterEvent\n\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n self._metadata = {**self._metadata, **normalize_metadata(metadata)}\n if self.has_asset_key:\n check.opt_str_param(description, "description")\n\n observation = AssetObservation(\n asset_key=self.asset_key,\n description=description,\n partition=self.asset_partition_key if self.has_asset_partitions else None,\n metadata=metadata,\n )\n self._observations.append(observation)\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, observation))\n\n def get_observations(\n self,\n ) -> Sequence[AssetObservation]:\n """Retrieve the list of user-generated asset observations that were observed via the context.\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import IOManager, build_input_context, AssetObservation\n\n class MyIOManager(IOManager):\n def load_input(self, context, obj):\n ...\n\n def test_load_input():\n mgr = MyIOManager()\n context = build_input_context()\n mgr.load_input(context)\n observations = context.get_observations()\n ...\n """\n return self._observations\n\n def consume_metadata(self) -> Mapping[str, MetadataValue]:\n result = self._metadata\n self._metadata = {}\n return result
\n\n\n
[docs]def build_input_context(\n name: Optional[str] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n op_def: Optional["OpDefinition"] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partition_key_range: Optional[PartitionKeyRange] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n) -> "InputContext":\n """Builds input context from provided parameters.\n\n ``build_input_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_input_context`` must be used as a\n context manager.\n\n Args:\n name (Optional[str]): The name of the input that we're loading.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the input manager.\n resources (Optional[Dict[str, Any]]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n asset_key (Optional[Union[AssetKey, Sequence[str], str]]): The asset key attached to the InputDefinition.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n step_context (Optional[StepExecutionContext]): For internal use.\n partition_key (Optional[str]): String value representing partition key to execute with.\n asset_partition_key_range (Optional[str]): The range of asset partition keys to load.\n asset_partitions_def: Optional[PartitionsDefinition]: The PartitionsDefinition of the asset\n being loaded.\n\n Examples:\n .. code-block:: python\n\n build_input_context()\n\n with build_input_context(resources={"foo": context_manager_resource}) as context:\n do_something\n """\n from dagster._core.definitions import OpDefinition, PartitionsDefinition\n from dagster._core.execution.context.output import OutputContext\n from dagster._core.execution.context.system import StepExecutionContext\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n upstream_output = check.opt_inst_param(upstream_output, "upstream_output", OutputContext)\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n step_context = check.opt_inst_param(step_context, "step_context", StepExecutionContext)\n asset_key = 
AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n asset_partition_key_range = check.opt_inst_param(\n asset_partition_key_range, "asset_partition_key_range", PartitionKeyRange\n )\n asset_partitions_def = check.opt_inst_param(\n asset_partitions_def, "asset_partitions_def", PartitionsDefinition\n )\n if asset_partitions_def and asset_partition_key_range:\n asset_partitions_subset = asset_partitions_def.empty_subset().with_partition_key_range(\n asset_partition_key_range, dynamic_partitions_store=instance\n )\n elif asset_partition_key_range:\n asset_partitions_subset = KeyRangeNoPartitionsDefPartitionsSubset(asset_partition_key_range)\n else:\n asset_partitions_subset = None\n\n return InputContext(\n name=name,\n job_name=None,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n resource_config=resource_config,\n resources=resources,\n step_context=step_context,\n op_def=op_def,\n asset_key=asset_key,\n partition_key=partition_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=instance,\n )
\n\n\nclass KeyRangeNoPartitionsDefPartitionsSubset(PartitionsSubset):\n """For build_input_context when no PartitionsDefinition has been provided."""\n\n def __init__(self, key_range: PartitionKeyRange):\n self._key_range = key_range\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n raise NotImplementedError()\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._key_range.start == self._key_range.end:\n return self._key_range.start\n else:\n raise NotImplementedError()\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [self._key_range]\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def with_partition_key_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def serialize(self) -> str:\n raise NotImplementedError()\n\n @property\n def partitions_def(self) -> "PartitionsDefinition":\n raise NotImplementedError()\n\n def __len__(self) -> int:\n raise NotImplementedError()\n\n def __contains__(self, value) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def from_serialized(\n cls, partitions_def: "PartitionsDefinition", serialized: str\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: "PartitionsDefinition",\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def empty_subset(cls, partitions_def: "PartitionsDefinition") 
-> "PartitionsSubset":\n raise NotImplementedError()\n
", "current_page_name": "_modules/dagster/_core/execution/context/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.input"}, "invocation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.invocation

\nfrom contextlib import ExitStack\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.composition import PendingNodeInvocation\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.resource_requirement import ensure_requirements_satisfied\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidInvocationError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.execution.build_resources import build_resources, wrap_resources_for_execution\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.types.dagster_type import 
DagsterType\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.merger import merge_dicts\n\nfrom .compute import OpExecutionContext\nfrom .system import StepExecutionContext, TypeCheckContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set on the context when a solid is directly invoked."\n    )\n\n\nclass UnboundOpExecutionContext(OpExecutionContext):\n    """The ``context`` object available as the first argument to a solid's compute function when\n    being invoked directly. Can also be used as a context manager.\n    """\n\n    def __init__(\n        self,\n        op_config: Any,\n        resources_dict: Mapping[str, Any],\n        resources_config: Mapping[str, Any],\n        instance: Optional[DagsterInstance],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        mapping_key: Optional[str],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        from dagster._core.execution.api import ephemeral_instance_if_missing\n        from dagster._core.execution.context_creation_job import initialize_console_manager\n\n        self._op_config = op_config\n        self._mapping_key = mapping_key\n\n        self._exit_stack = ExitStack()\n\n        # Construct ephemeral instance if missing\n        self._instance = self._exit_stack.enter_context(ephemeral_instance_if_missing(instance))\n\n        self._resources_config = resources_config\n        # Open resource context manager\n        self._resources_contain_cm = False\n        self._resource_defs = wrap_resources_for_execution(resources_dict)\n        self._resources = self._exit_stack.enter_context(\n            build_resources(\n                resources=self._resource_defs,\n                instance=self._instance,\n                resource_config=resources_config,\n            )\n        )\n        self._resources_contain_cm = 
isinstance(self._resources, IContainsGenerator)\n\n        self._log = initialize_console_manager(None)\n        self._pdb: Optional[ForkedPdb] = None\n        self._cm_scope_entered = False\n        check.invariant(\n            not (partition_key and partition_key_range),\n            "Must supply at most one of partition_key or partition_key_range",\n        )\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._user_events: List[UserEvent] = []\n        self._output_metadata: Dict[str, Any] = {}\n\n        self._assets_def = check.opt_inst_param(assets_def, "assets_def", AssetsDefinition)\n\n    def __enter__(self):\n        self._cm_scope_entered = True\n        return self\n\n    def __exit__(self, *exc):\n        self._exit_stack.close()\n\n    def __del__(self):\n        self._exit_stack.close()\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resource_keys(self) -> AbstractSet[str]:\n        return self._resource_defs.keys()\n\n    @property\n    def resources(self) -> Resources:\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            raise DagsterInvariantViolationError(\n                "At least one provided resource is a generator, but attempting to access "\n                "resources outside of context manager scope. You can use the following syntax to "\n                "open a context manager: `with build_op_context(...) as context:`"\n            )\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n        .. 
code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        raise DagsterInvalidPropertyError(_property_msg("run_config", "property"))\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def op(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op_def", "property"))\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("assets_def", "property"))\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    @property\n    def partition_key(self) -> str:\n        if 
self._partition_key:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned run")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def has_tag(self, key: str) -> bool:\n        raise DagsterInvalidPropertyError(_property_msg("has_tag", "method"))\n\n    def get_tag(self, key: str) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("get_tag", "method"))\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def bind(\n        self,\n        op_def: OpDefinition,\n        pending_invocation: Optional[PendingNodeInvocation[OpDefinition]],\n        assets_def: Optional[AssetsDefinition],\n        config_from_args: Optional[Mapping[str, Any]],\n        resources_from_args: Optional[Mapping[str, Any]],\n    ) -> "BoundOpExecutionContext":\n        from dagster._core.definitions.resource_invocation import resolve_bound_config\n\n        if resources_from_args:\n            if self._resource_defs:\n                raise DagsterInvalidInvocationError(\n                    "Cannot provide resources in both context and kwargs"\n                )\n            resource_defs = 
wrap_resources_for_execution(resources_from_args)\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance)\n            )\n        elif assets_def and assets_def.resource_defs:\n            for key in sorted(list(assets_def.resource_defs.keys())):\n                if key in self._resource_defs:\n                    raise DagsterInvalidInvocationError(\n                        f"Error when invoking {assets_def!s} resource '{key}' "\n                        "provided on both the definition and invocation context. Please "\n                        "provide on only one or the other."\n                    )\n            resource_defs = wrap_resources_for_execution(\n                {**self._resource_defs, **assets_def.resource_defs}\n            )\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance, self._resources_config)\n            )\n        else:\n            resources = self.resources\n            resource_defs = self._resource_defs\n\n        _validate_resource_requirements(resource_defs, op_def)\n\n        if self.op_config and config_from_args:\n            raise DagsterInvalidInvocationError("Cannot provide config in both context and kwargs")\n        op_config = resolve_bound_config(config_from_args or self.op_config, op_def)\n\n        return BoundOpExecutionContext(\n            op_def=op_def,\n            op_config=op_config,\n            resources=resources,\n            resources_config=self._resources_config,\n            instance=self.instance,\n            log_manager=self.log,\n            pdb=self.pdb,\n            tags=(\n                pending_invocation.tags\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n          
  ),\n            hook_defs=(\n                pending_invocation.hook_defs\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            alias=(\n                pending_invocation.given_alias\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            user_events=self._user_events,\n            output_metadata=self._output_metadata,\n            mapping_key=self._mapping_key,\n            partition_key=self._partition_key,\n            partition_key_range=self._partition_key_range,\n            assets_def=assets_def,\n        )\n\n    def get_events(self) -> Sequence[UserEvent]:\n        """Retrieve the list of user-generated events that were logged via the context.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import op, build_op_context, AssetMaterialization, ExpectationResult\n\n            @op\n            def my_op(context):\n                ...\n\n            def test_my_op():\n                context = build_op_context()\n                my_op(context)\n                all_user_events = context.get_events()\n                materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n                expectation_results = [event for event in all_user_events if isinstance(event, ExpectationResult)]\n                ...\n        """\n        return self._user_events\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        """Retrieve metadata that was logged for an output and mapping_key, if it exists.\n\n        If metadata cannot be found for the particular output_name/mapping_key combination, None will be returned.\n\n        Args:\n            output_name (str): The name of the output to retrieve logged metadata for.\n            mapping_key 
(Optional[str]): The mapping key to retrieve metadata for (only applies when using dynamic outputs).\n\n        Returns:\n            Optional[Mapping[str, Any]]: The metadata values present for the output_name/mapping_key combination, if present.\n        """\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n\ndef _validate_resource_requirements(\n    resource_defs: Mapping[str, ResourceDefinition], op_def: OpDefinition\n) -> None:\n    """Validate correctness of resources against required resource keys."""\n    if cast(DecoratedOpFunction, op_def.compute_fn).has_context_arg():\n        for requirement in op_def.get_resource_requirements():\n            if not requirement.is_io_manager_requirement:\n                ensure_requirements_satisfied(resource_defs, [requirement])\n\n\nclass BoundOpExecutionContext(OpExecutionContext):\n    """The op execution context that is passed to the compute function during invocation.\n\n    This context is bound to a specific op definition, for which the resources and config have\n    been validated.\n    """\n\n    _op_def: OpDefinition\n    _op_config: Any\n    _resources: "Resources"\n    _resources_config: Mapping[str, Any]\n    _instance: DagsterInstance\n    _log_manager: DagsterLogManager\n    _pdb: Optional[ForkedPdb]\n    _tags: Mapping[str, str]\n    _hook_defs: Optional[AbstractSet[HookDefinition]]\n    _alias: str\n    _user_events: List[UserEvent]\n    _seen_outputs: Dict[str, Union[str, Set[str]]]\n    _output_metadata: Dict[str, Any]\n    _mapping_key: Optional[str]\n    _partition_key: Optional[str]\n    _partition_key_range: Optional[PartitionKeyRange]\n    _assets_def: Optional[AssetsDefinition]\n\n    def __init__(\n        self,\n        op_def: OpDefinition,\n        op_config: Any,\n        resources: 
"Resources",\n        resources_config: Mapping[str, Any],\n        instance: DagsterInstance,\n        log_manager: DagsterLogManager,\n        pdb: Optional[ForkedPdb],\n        tags: Optional[Mapping[str, str]],\n        hook_defs: Optional[AbstractSet[HookDefinition]],\n        alias: Optional[str],\n        user_events: List[UserEvent],\n        output_metadata: Dict[str, Any],\n        mapping_key: Optional[str],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        self._op_def = op_def\n        self._op_config = op_config\n        self._resources = resources\n        self._instance = instance\n        self._log = log_manager\n        self._pdb = pdb\n        self._tags = merge_dicts(self._op_def.tags, tags) if tags else self._op_def.tags\n        self._hook_defs = hook_defs\n        self._alias = alias if alias else self._op_def.name\n        self._resources_config = resources_config\n        self._user_events = user_events\n        self._seen_outputs = {}\n        self._output_metadata = output_metadata\n        self._mapping_key = mapping_key\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._assets_def = assets_def\n        self._requires_typed_event_stream = False\n        self._typed_event_stream_error_message = None\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resources(self) -> Resources:\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        
Example:\n        .. code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        run_config: Dict[str, object] = {}\n        if self._op_config:\n            run_config["ops"] = {self._op_def.name: {"config": self._op_config}}\n        run_config["resources"] = self._resources_config\n        return run_config\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("node_handle", "property"))\n\n    @property\n    def op(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        return self._op_def\n\n    @property\n    def has_assets_def(self) -> bool:\n        return self._assets_def is not None\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        if self._assets_def is None:\n            raise DagsterInvalidPropertyError(\n                f"Op {self.op_def.name} does not have an assets definition."\n  
          )\n        return self._assets_def\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    def has_tag(self, key: str) -> bool:\n        return key in self._tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        return self._tags.get(key)\n\n    @property\n    def alias(self) -> str:\n        return self._alias\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def for_type(self, dagster_type: DagsterType) -> TypeCheckContext:\n        resources = cast(NamedTuple, self.resources)\n        return TypeCheckContext(\n            self.run_id,\n            self.log,\n            ScopedResourcesBuilder(resources._asdict()),\n            dagster_type,\n        )\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n    def describe_op(self) -> str:\n        if isinstance(self.op_def, OpDefinition):\n            return f'op "{self.op_def.name}"'\n\n        return f'solid "{self.op_def.name}"'\n\n    def log_event(self, event: UserEvent) -> None:\n        check.inst_param(\n            event,\n            "event",\n            (AssetMaterialization, AssetObservation, ExpectationResult),\n        )\n        self._user_events.append(event)\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                output_name in self._seen_outputs and mapping_key in 
self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    @property\n    def partition_key(self) -> str:\n        if self._partition_key is not None:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned asset")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n        partitions_def = self.assets_def.partitions_def\n        if partitions_def is None:\n            check.failed("Tried to access partition_key for a non-partitioned asset")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        return cast(\n            Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n        ).time_window_for_partition_key(self.partition_key)\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = 
None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n        """Add metadata to one of the outputs of an op.\n\n        This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n        Args:\n            metadata (Mapping[str, Any]): The metadata to attach to the output\n            output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import Out, op\n            from typing import Tuple\n\n            @op\n            def add_metadata(context):\n                context.add_output_metadata({"foo", "bar"})\n                return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n            @op(out={"a": Out(), "b": Out()})\n            def add_metadata_two_outputs(context) -> Tuple[str, int]:\n                context.add_output_metadata({"foo": "bar"}, output_name="b")\n                context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n                return ("dog", 5)\n\n        """\n        metadata = check.mapping_param(metadata, "metadata", key_type=str)\n        output_name = check.opt_str_param(output_name, "output_name")\n        mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n        if output_name is None and len(self.op_def.output_defs) == 1:\n            output_def = self.op_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs"\n                " exist. 
Please provide an output_name to the invocation of"\n                " `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.op_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log output"\n                f" metadata for {output_desc} which has already been yielded. Metadata must be"\n                " logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log metadata"\n                f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n                " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        output_name = output_def.name\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log"\n                    f" metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if output_name not in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n\n        else:\n            self._output_metadata[output_name] = metadata\n\n    # In this mode no conversion is done on returned values and missing but expected outputs are not\n    # allowed.\n    @property\n    def requires_typed_event_stream(self) -> bool:\n        return self._requires_typed_event_stream\n\n    @property\n    def typed_event_stream_error_message(self) -> Optional[str]:\n        return self._typed_event_stream_error_message\n\n    def set_requires_typed_event_stream(self, *, error_message: Optional[str]) -> None:\n        self._requires_typed_event_stream = True\n        self._typed_event_stream_error_message = error_message\n\n\n
[docs]def build_op_context(\n resources: Optional[Mapping[str, Any]] = None,\n op_config: Any = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n mapping_key: Optional[str] = None,\n _assets_def: Optional[AssetsDefinition] = None,\n) -> UnboundOpExecutionContext:\n """Builds op execution context from provided parameters.\n\n ``build_op_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_op_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking a op.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n op_config (Optional[Mapping[str, Any]]): The config to provide to the op.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n mapping_key (Optional[str]): A key representing the mapping key from an upstream dynamic\n output. Can be accessed using ``context.get_mapping_key()``.\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n _assets_def (Optional[AssetsDefinition]): Internal argument that populates the op's assets\n definition, not meant to be populated by users.\n\n Examples:\n .. 
code-block:: python\n\n context = build_op_context()\n op_to_invoke(context)\n\n with build_op_context(resources={"foo": context_manager_resource}) as context:\n op_to_invoke(context)\n """\n if op_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_op_context`` with both ``op_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n op_config = op_config if op_config else config\n return UnboundOpExecutionContext(\n resources_dict=check.opt_mapping_param(resources, "resources", key_type=str),\n resources_config=check.opt_mapping_param(\n resources_config, "resources_config", key_type=str\n ),\n op_config=op_config,\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n partition_key_range=check.opt_inst_param(\n partition_key_range, "partition_key_range", PartitionKeyRange\n ),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n assets_def=check.opt_inst_param(_assets_def, "_assets_def", AssetsDefinition),\n )
\n\n\n
[docs]def build_asset_context(\n resources: Optional[Mapping[str, Any]] = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n asset_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n):\n """Builds asset execution context from provided parameters.\n\n ``build_asset_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_asset_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking an asset.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n asset_config (Optional[Mapping[str, Any]]): The config to provide to the asset.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n\n Examples:\n .. code-block:: python\n\n context = build_asset_context()\n asset_to_invoke(context)\n\n with build_asset_context(resources={"foo": context_manager_resource}) as context:\n asset_to_invoke(context)\n """\n return build_op_context(\n op_config=asset_config,\n resources=resources,\n resources_config=resources_config,\n partition_key=partition_key,\n partition_key_range=partition_key_range,\n instance=instance,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/invocation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.invocation"}, "logger": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.logger

\nfrom typing import Any, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .output import RUN_ID_PLACEHOLDER\n\n\n
[docs]class InitLoggerContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.LoggerDefinition`.\n\n Users should not instantiate this object directly. To construct an\n `InitLoggerContext` for testing purposes, use :py:func:`dagster.\n build_init_logger_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import logger, InitLoggerContext\n\n @logger\n def hello_world(init_context: InitLoggerContext):\n ...\n\n """\n\n def __init__(\n self,\n logger_config: Any,\n logger_def: Optional[LoggerDefinition] = None,\n job_def: Optional[JobDefinition] = None,\n run_id: Optional[str] = None,\n ):\n self._logger_config = logger_config\n self._job_def = check.opt_inst_param(job_def, "job_def", JobDefinition)\n self._logger_def = check.opt_inst_param(logger_def, "logger_def", LoggerDefinition)\n self._run_id = check.opt_str_param(run_id, "run_id")\n\n @public\n @property\n def logger_config(self) -> Any:\n """The configuration data provided by the run config. The\n schema for this data is defined by ``config_schema`` on the :py:class:`LoggerDefinition`.\n """\n return self._logger_config\n\n @property\n def job_def(self) -> Optional[JobDefinition]:\n """The job definition currently being executed."""\n return self._job_def\n\n @public\n @property\n def logger_def(self) -> Optional[LoggerDefinition]:\n """The logger definition for the logger being constructed."""\n return self._logger_def\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The ID for this run of the job."""\n return self._run_id
\n\n\nclass UnboundInitLoggerContext(InitLoggerContext):\n """Logger initialization context outputted by ``build_init_logger_context``.\n\n Represents a context whose config has not yet been validated against a logger definition, hence\n the inability to access the `logger_def` attribute. When an instance of\n ``UnboundInitLoggerContext`` is passed to ``LoggerDefinition.initialize``, config is validated,\n and it is subsumed into an `InitLoggerContext`, which contains the logger_def validated against.\n """\n\n def __init__(self, logger_config: Any, job_def: Optional[JobDefinition]):\n super(UnboundInitLoggerContext, self).__init__(\n logger_config, logger_def=None, job_def=job_def, run_id=None\n )\n\n @property\n def logger_def(self) -> LoggerDefinition:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def run_id(self) -> Optional[str]:\n return RUN_ID_PLACEHOLDER\n
", "current_page_name": "_modules/dagster/_core/execution/context/logger", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.logger"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.output

\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_layer import AssetOutputInfo\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKey,\n)\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import DagsterInvalidMetadata, DagsterInvariantViolationError\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import JobDefinition, PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.outputs import StepOutputHandle\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.system_config.objects import ResolvedRunConfig\n    from dagster._core.types.dagster_type import DagsterType\n\nRUN_ID_PLACEHOLDER = "__EPHEMERAL_RUN_ID"\n\n\n
[docs]class OutputContext:\n """The context object that is available to the `handle_output` method of an :py:class:`IOManager`.\n\n Users should not instantiate this object directly. To construct an\n `OutputContext` for testing an IO Manager's `handle_output` method, use\n :py:func:`dagster.build_output_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, OutputContext\n\n class MyIOManager(IOManager):\n def handle_output(self, context: OutputContext, obj):\n ...\n """\n\n _step_key: Optional[str]\n _name: Optional[str]\n _job_name: Optional[str]\n _run_id: Optional[str]\n _metadata: ArbitraryMetadataMapping\n _user_generated_metadata: Mapping[str, MetadataValue]\n _mapping_key: Optional[str]\n _config: object\n _op_def: Optional["OpDefinition"]\n _dagster_type: Optional["DagsterType"]\n _log: Optional["DagsterLogManager"]\n _version: Optional[str]\n _resource_config: Optional[Mapping[str, object]]\n _step_context: Optional["StepExecutionContext"]\n _asset_info: Optional[AssetOutputInfo]\n _warn_on_step_context_use: bool\n _resources: Optional["Resources"]\n _resources_cm: Optional[ContextManager["Resources"]]\n _resources_contain_cm: Optional[bool]\n _cm_scope_entered: Optional[bool]\n _events: List["DagsterEvent"]\n _user_events: List[Union[AssetMaterialization, AssetObservation]]\n\n def __init__(\n self,\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n mapping_key: Optional[str] = None,\n config: object = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Union["Resources", Mapping[str, object]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_info: 
Optional[AssetOutputInfo] = None,\n warn_on_step_context_use: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._step_key = step_key\n self._name = name\n self._job_name = job_name\n self._run_id = run_id\n self._metadata = metadata or {}\n self._mapping_key = mapping_key\n self._config = config\n self._op_def = op_def\n self._dagster_type = dagster_type\n self._log = log_manager\n self._version = version\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_info = asset_info\n self._warn_on_step_context_use = warn_on_step_context_use\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events = []\n self._user_events = []\n self._user_generated_metadata = {}\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if (\n hasattr(self, "_resources_cm")\n and self._resources_cm\n and self._resources_contain_cm\n and not self._cm_scope_entered\n ):\n self._resources_cm.__exit__(None, None, None)\n\n @public\n @property\n def step_key(self) -> str:\n """The step_key for the compute step that produced the output."""\n if self._step_key is None:\n raise DagsterInvariantViolationError(\n "Attempting 
to access step_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_key\n\n @public\n @property\n def name(self) -> str:\n """The name of the output that produced the output."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run that produced the output."""\n if self._run_id is None:\n raise DagsterInvariantViolationError(\n "Attempting to access run_id, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._run_id\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of the metadata that is assigned to the OutputDefinition that produced\n the output.\n """\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> Optional[str]:\n """The key that identifies a unique mapped output. 
None for regular outputs."""\n return self._mapping_key\n\n @public\n @property\n def config(self) -> Any:\n """The configuration for the output."""\n return self._config\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The definition of the op that produced the output."""\n from dagster._core.definitions import OpDefinition\n\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return cast(OpDefinition, self._op_def)\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this output."""\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this output."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._log\n\n @public\n @property\n def version(self) -> Optional[str]:\n """(Experimental) The version of the output."""\n return self._version\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, object]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the output manager, specified by the `required_resource_keys`\n parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the OutputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided 
resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_output_context(...) as context:`"\n )\n return self._resources\n\n @property\n def asset_info(self) -> Optional[AssetOutputInfo]:\n """(Experimental) Asset info corresponding to the output."""\n return self._asset_info\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being stored, otherwise returns False. A return value of False\n indicates that an output from an op is being stored.\n """\n return self._asset_info is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being stored as an output."""\n if self._asset_info is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._asset_info.key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the asset corresponding to this output."""\n asset_key = self.asset_key\n result = self.step_context.job_def.asset_layer.partitions_def_for_asset(asset_key)\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.step_context"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the 
OutputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being stored is partitioned."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_asset_partitions"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_output(self.name)\n else:\n return False\n\n @public\n @property\n def asset_partition_key(self) -> str:\n """The partition key for output asset.\n\n Raises an error if the output asset has no partitioning, or if the run covers a partition\n range for the output 
asset.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_for_output(self.name)\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key_range"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_range_for_output(self.name)\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for the output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_keys"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.asset_partitions_def.get_partition_keys_in_range(\n self.step_context.asset_partition_key_range_for_output(self.name),\n dynamic_partitions_store=self.step_context.instance,\n )\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset 
is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partitions_time_window"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partitions_time_window_for_output(self.name)\n\n def get_run_scoped_output_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n The unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. run id, step key, and output name\n """\n warnings.warn(\n "`OutputContext.get_run_scoped_output_identifier` is deprecated. 
Use "\n "`OutputContext.get_identifier` instead."\n )\n # if run_id is None and this is a re-execution, it means we failed to find its source run id\n check.invariant(\n self.run_id is not None,\n "Unable to find the run scoped output identifier: run_id is None on OutputContext.",\n )\n check.invariant(\n self.step_key is not None,\n "Unable to find the run scoped output identifier: step_key is None on OutputContext.",\n )\n check.invariant(\n self.name is not None,\n "Unable to find the run scoped output identifier: name is None on OutputContext.",\n )\n run_id = cast(str, self.run_id)\n step_key = cast(str, self.step_key)\n name = cast(str, self.name)\n\n if self.mapping_key:\n return [run_id, step_key, name, self.mapping_key]\n\n return [run_id, step_key, name]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n version = self.version\n step_key = self.step_key\n name = self.name\n if version is not None:\n check.invariant(\n self.mapping_key is None,\n f"Mapping key and version both provided for output '{name}' of step"\n f" '{step_key}'. Dynamic mapping is not supported when using versioning.",\n )\n identifier = ["versioned_outputs", version, step_key, name]\n else:\n run_id = self.run_id\n identifier = [run_id, step_key, name]\n if self.mapping_key:\n identifier.append(self.mapping_key)\n\n return identifier
\n\n def get_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_output_identifier` is deprecated. Use "\n "`OutputContext.get_identifier` instead."\n )\n\n return self.get_identifier()\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being stored as an output.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])`` materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset output identifier for an output with no asset key")
\n\n def get_asset_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_asset_output_identifier` is deprecated. Use "\n "`OutputContext.get_asset_identifier` instead."\n )\n\n return self.get_asset_identifier()\n\n
[docs] @public\n def log_event(self, event: Union[AssetObservation, AssetMaterialization]) -> None:\n """Log an AssetMaterialization or AssetObservation from within the body of an io manager's `handle_output` method.\n\n Events logged with this method will appear in the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation]): The event to log.\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.log_event(AssetMaterialization("foo"))\n """\n from dagster._core.events import DagsterEvent\n\n if isinstance(event, (AssetMaterialization)):\n if self._step_context:\n self._events.append(DagsterEvent.asset_materialization(self._step_context, event))\n self._user_events.append(event)\n elif isinstance(event, AssetObservation):\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, event))\n self._user_events.append(event)\n else:\n check.failed(f"Unexpected event {event}")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_output`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def get_logged_events(\n self,\n ) -> Sequence[Union[AssetMaterialization, AssetObservation]]:\n """Retrieve the list of user-generated events that were logged via the context.\n\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_output_context, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n ...\n\n def test_handle_output():\n mgr = MyIOManager()\n context = build_output_context()\n mgr.handle_output(context)\n all_user_events = context.get_logged_events()\n materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n ...\n """\n return self._user_events\n\n
[docs] @public\n def add_output_metadata(self, metadata: Mapping[str, RawMetadataValue]) -> None:\n """Add a dictionary of metadata to the handled output.\n\n Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.\n\n Args:\n metadata (Mapping[str, RawMetadataValue]): A metadata dictionary to log\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.add_output_metadata({"foo": "bar"})\n """\n from dagster._core.definitions.metadata import normalize_metadata\n\n overlapping_labels = set(self._user_generated_metadata.keys()) & metadata.keys()\n if overlapping_labels:\n raise DagsterInvalidMetadata(\n f"Tried to add metadata for key(s) that already have metadata: {overlapping_labels}"\n )\n\n self._user_generated_metadata = {\n **self._user_generated_metadata,\n **normalize_metadata(metadata),\n }
\n\n def get_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Get the mapping of metadata entries that have been logged for use with this output."""\n return self._user_generated_metadata\n\n def consume_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Pops and yields all user-generated metadata entries that have been recorded from this context.\n\n If consume_logged_metadata has not yet been called, this will yield all logged events since\n the call to `handle_output`. If consume_logged_metadata has been called, it will yield all\n events since the last time consume_logged_metadata_entries was called. Designed for internal\n use. Users should never need to invoke this method.\n """\n result = self._user_generated_metadata\n self._user_generated_metadata = {}\n return result or {}
\n\n\ndef get_output_context(\n execution_plan: "ExecutionPlan",\n job_def: "JobDefinition",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n run_id: Optional[str],\n log_manager: Optional["DagsterLogManager"],\n step_context: Optional["StepExecutionContext"],\n resources: Optional["Resources"],\n version: Optional[str],\n warn_on_step_context_use: bool = False,\n) -> "OutputContext":\n """Args:\n run_id (str): The run ID of the run that produced the output, not necessarily the run that\n the context will be used in.\n """\n step = execution_plan.get_step_by_key(step_output_handle.step_key)\n # get config\n op_config = resolved_run_config.ops[step.node_handle.to_string()]\n outputs_config = op_config.outputs\n\n if outputs_config:\n output_config = outputs_config.get_output_manager_config(step_output_handle.output_name)\n else:\n output_config = None\n\n step_output = execution_plan.get_step_output(step_output_handle)\n output_def = job_def.get_node(step_output.node_handle).output_def_named(step_output.name)\n\n io_manager_key = output_def.io_manager_key\n resource_config = resolved_run_config.resources[io_manager_key].config\n\n node_handle = execution_plan.get_step_by_key(step.key).node_handle\n asset_info = job_def.asset_layer.asset_info_for_output(\n node_handle=node_handle, output_name=step_output.name\n )\n if asset_info is not None:\n metadata = job_def.asset_layer.metadata_for_asset(asset_info.key) or output_def.metadata\n else:\n metadata = output_def.metadata\n\n if step_context:\n check.invariant(\n not resources,\n "Expected either resources or step context to be set, but "\n "received both. 
If step context is provided, resources for IO manager will be "\n "retrieved off of that.",\n )\n resources = build_resources_for_manager(io_manager_key, step_context)\n\n return OutputContext(\n step_key=step_output_handle.step_key,\n name=step_output_handle.output_name,\n job_name=job_def.name,\n run_id=run_id,\n metadata=metadata,\n mapping_key=step_output_handle.mapping_key,\n config=output_config,\n op_def=job_def.get_node(step.node_handle).definition, # type: ignore # (should be OpDefinition not NodeDefinition)\n dagster_type=output_def.dagster_type,\n log_manager=log_manager,\n version=version,\n step_context=step_context,\n resource_config=resource_config,\n resources=resources,\n asset_info=asset_info,\n warn_on_step_context_use=warn_on_step_context_use,\n )\n\n\ndef step_output_version(\n job_def: "JobDefinition",\n execution_plan: "ExecutionPlan",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n) -> Optional[str]:\n from dagster._core.execution.resolve_versions import resolve_step_output_versions\n\n step_output_versions = resolve_step_output_versions(\n job_def, execution_plan, resolved_run_config\n )\n return (\n step_output_versions[step_output_handle]\n if step_output_handle in step_output_versions\n else None\n )\n\n\n
[docs]def build_output_context(\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n run_id: Optional[str] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n dagster_type: Optional["DagsterType"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Mapping[str, object]] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n) -> "OutputContext":\n """Builds output context from provided parameters.\n\n ``build_output_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_output_context`` must be used as a\n context manager.\n\n Args:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n metadata (Optional[Mapping[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Mapping[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the output manager.\n resources (Optional[Resources]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n op_def (Optional[OpDefinition]): The definition of the op that produced the output.\n asset_key: Optional[Union[AssetKey, Sequence[str], str]]: The asset key corresponding to the\n output.\n partition_key: Optional[str]: String value representing partition key to execute with.\n\n Examples:\n .. code-block:: python\n\n build_output_context()\n\n with build_output_context(resources={"foo": context_manager_resource}) as context:\n do_something\n\n """\n from dagster._core.definitions import OpDefinition\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n step_key = check.opt_str_param(step_key, "step_key")\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n run_id = check.opt_str_param(run_id, "run_id", default=RUN_ID_PLACEHOLDER)\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n version = check.opt_str_param(version, "version")\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n\n return OutputContext(\n step_key=step_key,\n name=name,\n job_name=None,\n run_id=run_id,\n metadata=metadata,\n mapping_key=mapping_key,\n config=config,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n 
version=version,\n resource_config=resource_config,\n resources=resources,\n step_context=None,\n op_def=op_def,\n asset_info=AssetOutputInfo(key=asset_key) if asset_key else None,\n partition_key=partition_key,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.output"}, "system": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.system

\n"""This module contains the execution context objects that are internal to the system.\nNot every property on these should be exposed to random Jane or Joe dagster user\nso we have a different layer of objects that encode the explicit public API\nin the user_context module.\n"""\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom hashlib import sha256\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD,\n    extract_data_version_from_entry,\n)\nfrom dagster._core.definitions.dependency import OpNode\nfrom dagster._core.definitions.events import AssetKey, AssetLineageInfo\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.partition_mapping import (\n    PartitionMapping,\n    infer_partition_mapping,\n)\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    
has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    PARTITION_NAME_TAG,\n)\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.types.dagster_type import DagsterType\n\nfrom .input import InputContext\nfrom .output import OutputContext, get_output_context\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.data_version import (\n        DataVersion,\n    )\n    from dagster._core.definitions.dependency import NodeHandle\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.event_api import EventLogRecord\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.state import KnownExecutionState\n    from dagster._core.instance import DagsterInstance\n\n    from .hook import HookContext\n\n\ndef is_iterable(obj: Any) -> bool:\n    try:\n        iter(obj)\n    except:\n        return False\n    return True\n\n\nclass IPlanContext(ABC):\n    """Context interface to represent run information that does not require access to user code.\n\n    The information available via this interface is accessible to the system throughout a run.\n    """\n\n    @property\n    @abstractmethod\n    def plan_data(self) -> "PlanData":\n        raise 
NotImplementedError()\n\n    @property\n    def job(self) -> IJob:\n        return self.plan_data.job\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        return self.plan_data.dagster_run\n\n    @property\n    def run_id(self) -> str:\n        return self.dagster_run.run_id\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        return self.dagster_run.run_config\n\n    @property\n    def job_name(self) -> str:\n        return self.dagster_run.job_name\n\n    @property\n    def instance(self) -> "DagsterInstance":\n        return self.plan_data.instance\n\n    @property\n    def raise_on_error(self) -> bool:\n        return self.plan_data.raise_on_error\n\n    @property\n    def retry_mode(self) -> RetryMode:\n        return self.plan_data.retry_mode\n\n    @property\n    def execution_plan(self) -> "ExecutionPlan":\n        return self.plan_data.execution_plan\n\n    @property\n    @abstractmethod\n    def output_capture(self) -> Optional[Mapping[StepOutputHandle, Any]]:\n        raise NotImplementedError()\n\n    @property\n    def log(self) -> DagsterLogManager:\n        raise NotImplementedError()\n\n    @property\n    def logging_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.all_tags()\n\n    @property\n    def event_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.event_tags()\n\n    def has_tag(self, key: str) -> bool:\n        check.str_param(key, "key")\n        return key in self.dagster_run.tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        check.str_param(key, "key")\n        return self.dagster_run.tags.get(key)\n\n    @property\n    def run_tags(self) -> Mapping[str, str]:\n        return self.dagster_run.tags\n\n\nclass PlanData(NamedTuple):\n    """The data about a run that is available during both orchestration and execution.\n\n    This object does not contain any information that requires access to user code, such as the\n    pipeline 
definition and resources.\n    """\n\n    job: IJob\n    dagster_run: DagsterRun\n    instance: "DagsterInstance"\n    execution_plan: "ExecutionPlan"\n    raise_on_error: bool = False\n    retry_mode: RetryMode = RetryMode.DISABLED\n\n\nclass ExecutionData(NamedTuple):\n    """The data that is available to the system during execution.\n\n    This object contains information that requires access to user code, such as the pipeline\n    definition and resources.\n    """\n\n    scoped_resources_builder: ScopedResourcesBuilder\n    resolved_run_config: ResolvedRunConfig\n    job_def: JobDefinition\n\n\nclass IStepContext(IPlanContext):\n    """Interface to represent data to be available during either step orchestration or execution."""\n\n    @property\n    @abstractmethod\n    def step(self) -> ExecutionStep:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def node_handle(self) -> "NodeHandle":\n        raise NotImplementedError()\n\n\nclass PlanOrchestrationContext(IPlanContext):\n    """Context for the orchestration of a run.\n\n    This context assumes inability to run user code directly.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        resume_from_failure: bool = False,\n    ):\n        self._plan_data = plan_data\n        self._log_manager = log_manager\n        self._executor = executor\n        self._output_capture = output_capture\n        self._resume_from_failure = resume_from_failure\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def reconstructable_job(self) -> ReconstructableJob:\n        if not isinstance(self.job, ReconstructableJob):\n            raise DagsterInvariantViolationError(\n                "reconstructable_pipeline property must be a ReconstructableJob"\n            )\n        
return self.job\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def executor(self) -> Executor:\n        return self._executor\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(self, step: ExecutionStep) -> "IStepContext":\n        return StepOrchestrationContext(\n            plan_data=self.plan_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            executor=self.executor,\n            step=step,\n            output_capture=self.output_capture,\n        )\n\n    @property\n    def resume_from_failure(self) -> bool:\n        return self._resume_from_failure\n\n\nclass StepOrchestrationContext(PlanOrchestrationContext, IStepContext):\n    """Context for the orchestration of a step.\n\n    This context assumes inability to run user code directly. Thus, it does not include any resource\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        step: ExecutionStep,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n    ):\n        super(StepOrchestrationContext, self).__init__(\n            plan_data, log_manager, executor, output_capture\n        )\n        self._step = step\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def node_handle(self) -> "NodeHandle":\n        return self.step.node_handle\n\n\nclass PlanExecutionContext(IPlanContext):\n    """Context for the execution of a plan.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        output_capture: 
Optional[Dict[StepOutputHandle, Any]] = None,\n    ):\n        self._plan_data = plan_data\n        self._execution_data = execution_data\n        self._log_manager = log_manager\n        self._output_capture = output_capture\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(\n        self,\n        step: ExecutionStep,\n        known_state: Optional["KnownExecutionState"] = None,\n    ) -> IStepContext:\n        return StepExecutionContext(\n            plan_data=self.plan_data,\n            execution_data=self._execution_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            step=step,\n            output_capture=self.output_capture,\n            known_state=known_state,\n        )\n\n    @property\n    def job_def(self) -> JobDefinition:\n        return self._execution_data.job_def\n\n    @property\n    def resolved_run_config(self) -> ResolvedRunConfig:\n        return self._execution_data.resolved_run_config\n\n    @property\n    def scoped_resources_builder(self) -> ScopedResourcesBuilder:\n        return self._execution_data.scoped_resources_builder\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def partitions_def(self) -> Optional[PartitionsDefinition]:\n        from dagster._core.definitions.job_definition import JobDefinition\n\n        job_def = self._execution_data.job_def\n        if not isinstance(job_def, JobDefinition):\n            check.failed(\n                "Can only call 'partitions_def', when using jobs, not legacy pipelines",\n            )\n        partitions_def = job_def.partitions_def\n        return partitions_def\n\n    @property\n    def has_partitions(self) -> bool:\n        tags = self._plan_data.dagster_run.tags\n        return bool(\n            
PARTITION_NAME_TAG in tags\n            or any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()])\n            or (\n                tags.get(ASSET_PARTITION_RANGE_START_TAG)\n                and tags.get(ASSET_PARTITION_RANGE_END_TAG)\n            )\n        )\n\n    @property\n    def partition_key(self) -> str:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n                "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            return get_multipartition_key_from_tags(tags)\n        elif PARTITION_NAME_TAG in tags:\n            return tags[PARTITION_NAME_TAG]\n        else:\n            range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            range_end = tags[ASSET_PARTITION_RANGE_END_TAG]\n\n            if range_start != range_end:\n                raise DagsterInvariantViolationError(\n                    "Cannot access partition_key for a partitioned run with a range of partitions."\n                    " Call partition_key_range instead."\n                )\n            else:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return self.partitions_def.get_partition_key_from_str(cast(str, range_start))\n                return cast(str, range_start)\n\n    @property\n    def asset_partition_key_range(self) -> PartitionKeyRange:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n     
           "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            multipartition_key = get_multipartition_key_from_tags(tags)\n            return PartitionKeyRange(multipartition_key, multipartition_key)\n        elif PARTITION_NAME_TAG in tags:\n            partition_key = tags[PARTITION_NAME_TAG]\n            return PartitionKeyRange(partition_key, partition_key)\n        else:\n            partition_key_range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            if partition_key_range_start is not None:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return PartitionKeyRange(\n                        self.partitions_def.get_partition_key_from_str(partition_key_range_start),\n                        self.partitions_def.get_partition_key_from_str(\n                            tags[ASSET_PARTITION_RANGE_END_TAG]\n                        ),\n                    )\n            return PartitionKeyRange(partition_key_range_start, tags[ASSET_PARTITION_RANGE_END_TAG])\n\n    @property\n    def partition_time_window(self) -> TimeWindow:\n        partitions_def = self.partitions_def\n\n        if partitions_def is None:\n            raise DagsterInvariantViolationError("Partitions definition is not defined")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        if self.has_partition_key:\n            return cast(\n                Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n            
).time_window_for_partition_key(self.partition_key)\n        elif self.has_partition_key_range:\n            partition_key_range = self.asset_partition_key_range\n            partitions_def = cast(\n                Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n            )\n            return TimeWindow(\n                partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n                partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n            )\n\n        else:\n            check.failed(\n                "Has a PartitionsDefinition, so should either have a partition key or a partition"\n                " key range"\n            )\n\n    @property\n    def has_partition_key(self) -> bool:\n        return PARTITION_NAME_TAG in self._plan_data.dagster_run.tags\n\n    @property\n    def has_partition_key_range(self) -> bool:\n        return ASSET_PARTITION_RANGE_START_TAG in self._plan_data.dagster_run.tags\n\n    def for_type(self, dagster_type: DagsterType) -> "TypeCheckContext":\n        return TypeCheckContext(\n            self.run_id, self.log, self._execution_data.scoped_resources_builder, dagster_type\n        )\n\n\n@dataclass\nclass InputAssetVersionInfo:\n    # This is the storage id of the last materialization of any partition of an asset. Thus it is\n    # computed the same way for both partitioned and non-partitioned assets.\n    storage_id: int\n\n    # If the input asset is partitioned, this is a hash of the sorted data versions of each dependency\n    # partition. If the input asset is not partitioned, this is the data version of the asset. It\n    # can be none if we are sourcing a materialization from before data versions.\n    data_version: Optional["DataVersion"]\n\n    # This is the run_id on the event that the storage_id references\n    run_id: str\n\n    # This is the timestamp on the event that the storage_id references\n    timestamp: float\n\n\n
[docs]class StepExecutionContext(PlanExecutionContext, IStepContext):\n """Context for the execution of a step. Users should not instantiate this class directly.\n\n This context assumes that user code can be run directly, and thus includes resource and information.\n """\n\n def __init__(\n self,\n plan_data: PlanData,\n execution_data: ExecutionData,\n log_manager: DagsterLogManager,\n step: ExecutionStep,\n output_capture: Optional[Dict[StepOutputHandle, Any]],\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.resources_init import get_required_resource_keys_for_step\n\n super(StepExecutionContext, self).__init__(\n plan_data=plan_data,\n execution_data=execution_data,\n log_manager=log_manager,\n output_capture=output_capture,\n )\n self._step = step\n self._required_resource_keys = get_required_resource_keys_for_step(\n plan_data.job.get_definition(),\n step,\n plan_data.execution_plan,\n )\n self._resources = execution_data.scoped_resources_builder.build(\n self._required_resource_keys\n )\n self._known_state = known_state\n self._input_lineage: List[AssetLineageInfo] = []\n\n resources_iter = cast(Iterable, self._resources)\n\n step_launcher_resources = [\n resource for resource in resources_iter if isinstance(resource, StepLauncher)\n ]\n\n self._step_launcher: Optional[StepLauncher] = None\n if len(step_launcher_resources) > 1:\n raise DagsterInvariantViolationError(\n "Multiple required resources for {described_op} have inherited StepLauncher"\n "There should be at most one step launcher resource per {node_type}.".format(\n described_op=self.describe_op(), node_type=self.op_def.node_type_str\n )\n )\n elif len(step_launcher_resources) == 1:\n self._step_launcher = step_launcher_resources[0]\n\n self._step_exception: Optional[BaseException] = None\n\n self._step_output_capture: Optional[Dict[StepOutputHandle, Any]] = None\n # Enable step output capture if there are any hooks which will receive them.\n # Expect in the 
future that hooks may control whether or not they get outputs,\n # but for now presence of any will cause output capture.\n if self.job_def.get_all_hooks_for_handle(self.node_handle):\n self._step_output_capture = {}\n\n self._output_metadata: Dict[str, Any] = {}\n self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n\n self._input_asset_version_info: Dict[AssetKey, Optional["InputAssetVersionInfo"]] = {}\n self._is_external_input_asset_version_info_loaded = False\n self._data_version_cache: Dict[AssetKey, "DataVersion"] = {}\n\n self._requires_typed_event_stream = False\n self._typed_event_stream_error_message = None\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._typed_event_stream_error_message\n\n # Error message will be appended to the default error message.\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None):\n self._requires_typed_event_stream = True\n self._typed_event_stream_error_message = error_message\n\n @property\n def step(self) -> ExecutionStep:\n return self._step\n\n @property\n def node_handle(self) -> "NodeHandle":\n return self.step.node_handle\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n return self._step_launcher\n\n @property\n def op_def(self) -> OpDefinition:\n return self.op.definition\n\n @property\n def job_def(self) -> "JobDefinition":\n return self._execution_data.job_def\n\n @property\n def op(self) -> OpNode:\n return self.job_def.get_op(self._step.node_handle)\n\n @property\n def op_retry_policy(self) -> Optional[RetryPolicy]:\n 
return self.job_def.get_retry_policy_for_handle(self.node_handle)\n\n def describe_op(self) -> str:\n return f'op "{self.node_handle}"'\n\n def get_io_manager(self, step_output_handle: StepOutputHandle) -> IOManager:\n step_output = self.execution_plan.get_step_output(step_output_handle)\n io_manager_key = (\n self.job_def.get_node(step_output.node_handle)\n .output_def_named(step_output.name)\n .io_manager_key\n )\n\n output_manager = getattr(self.resources, io_manager_key)\n return check.inst(output_manager, IOManager)\n\n def get_output_context(self, step_output_handle: StepOutputHandle) -> OutputContext:\n return get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n step_output_handle,\n self._get_source_run_id(step_output_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=self.execution_plan.get_version_for_step_output_handle(step_output_handle),\n )\n\n def for_input_manager(\n self,\n name: str,\n config: Any,\n metadata: Any,\n dagster_type: DagsterType,\n source_handle: Optional[StepOutputHandle] = None,\n resource_config: Any = None,\n resources: Optional["Resources"] = None,\n artificial_output_context: Optional["OutputContext"] = None,\n ) -> InputContext:\n if source_handle and artificial_output_context:\n check.failed("Cannot specify both source_handle and artificial_output_context.")\n\n upstream_output: Optional[OutputContext] = None\n\n if source_handle is not None:\n version = self.execution_plan.get_version_for_step_output_handle(source_handle)\n\n # NOTE: this is using downstream step_context for upstream OutputContext. 
step_context\n # will be set to None for 0.15 release.\n upstream_output = get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n source_handle,\n self._get_source_run_id(source_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=version,\n warn_on_step_context_use=True,\n )\n else:\n upstream_output = artificial_output_context\n\n asset_key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.node_handle, input_name=name\n )\n asset_partitions_subset = (\n self.asset_partitions_subset_for_input(name)\n if self.has_asset_partitions_for_input(name)\n else None\n )\n\n asset_partitions_def = (\n self.job_def.asset_layer.partitions_def_for_asset(asset_key) if asset_key else None\n )\n return InputContext(\n job_name=self.job_def.name,\n name=name,\n op_def=self.op_def,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=self.log,\n step_context=self,\n resource_config=resource_config,\n resources=resources,\n asset_key=asset_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=self.instance,\n )\n\n def for_hook(self, hook_def: HookDefinition) -> "HookContext":\n from .hook import HookContext\n\n return HookContext(self, hook_def)\n\n def get_known_state(self) -> "KnownExecutionState":\n if not self._known_state:\n check.failed(\n "Attempted to access KnownExecutionState but it was not provided at context"\n " creation"\n )\n return self._known_state\n\n def can_load(\n self,\n step_output_handle: StepOutputHandle,\n ) -> bool:\n # can load from upstream in the same run\n if step_output_handle in self.get_known_state().ready_outputs:\n return True\n\n if (\n self._should_load_from_previous_runs(step_output_handle)\n # should and can load from a previous run\n and self._get_source_run_id_from_logs(step_output_handle)\n ):\n return True\n\n return False\n\n def 
observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n if mapping_key:\n if output_name not in self._seen_outputs:\n self._seen_outputs[output_name] = set()\n cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n else:\n self._seen_outputs[output_name] = "seen"\n\n def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n if mapping_key:\n return (\n output_name in self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n )\n return output_name in self._seen_outputs\n\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n if output_name is None and len(self.op_def.output_defs) == 1:\n output_def = self.op_def.output_defs[0]\n output_name = output_def.name\n elif output_name is None:\n raise DagsterInvariantViolationError(\n "Attempted to log metadata without providing output_name, but multiple outputs"\n " exist. Please provide an output_name to the invocation of"\n " `context.add_output_metadata`."\n )\n else:\n output_def = self.op_def.output_def_named(output_name)\n\n if self.has_seen_output(output_name, mapping_key):\n output_desc = (\n f"output '{output_def.name}'"\n if not mapping_key\n else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n )\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log output"\n f" metadata for {output_desc} which has already been yielded. Metadata must be"\n " logged before the output is yielded."\n )\n if output_def.is_dynamic and not mapping_key:\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log metadata"\n f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n )\n\n if mapping_key:\n if output_name not in self._output_metadata:\n self._output_metadata[output_name] = {}\n if mapping_key in self._output_metadata[output_name]:\n self._output_metadata[output_name][mapping_key].update(metadata)\n else:\n self._output_metadata[output_name][mapping_key] = metadata\n else:\n if output_name in self._output_metadata:\n self._output_metadata[output_name].update(metadata)\n else:\n self._output_metadata[output_name] = metadata\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n metadata = self._output_metadata.get(output_name)\n if mapping_key and metadata:\n return metadata.get(mapping_key)\n return metadata\n\n def _get_source_run_id_from_logs(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n # walk through event logs to find the right run_id based on the run lineage\n\n parent_state = self.get_known_state().parent_state\n while parent_state:\n # if the parent run has yielded an StepOutput event for the given step output,\n # we find the source run id\n if step_output_handle in parent_state.produced_outputs:\n return parent_state.run_id\n\n # else, keep looking backwards\n parent_state = parent_state.get_parent_state()\n\n # When a fixed path is provided via io manager, it's able to run step subset using an execution\n # plan when the ascendant outputs were not previously created by dagster-controlled\n # computations. for example, in backfills, with fixed path io manager, we allow users to\n # "re-execute" runs with steps where the outputs weren't previously stored by dagster.\n\n # Warn about this special case because it will also reach here when all previous runs have\n # skipped yielding this output. 
From the logs, we have no easy way to differentiate the fixed\n # path case and the skipping case, until we record the skipping info in KnownExecutionState,\n # i.e. resolve https://github.com/dagster-io/dagster/issues/3511\n self.log.warning(\n f"No previously stored outputs found for source {step_output_handle}. "\n "This is either because you are using an IO Manager that does not depend on run ID, "\n "or because all the previous runs have skipped the output in conditional execution."\n )\n return None\n\n def _should_load_from_previous_runs(self, step_output_handle: StepOutputHandle) -> bool:\n # should not load if not a re-execution\n if self.dagster_run.parent_run_id is None:\n return False\n # should not load if re-executing the entire pipeline\n if self.dagster_run.step_keys_to_execute is None:\n return False\n\n # should not load if the entire dynamic step is being executed in the current run\n handle = StepHandle.parse_from_key(step_output_handle.step_key)\n if (\n isinstance(handle, ResolvedFromDynamicStepHandle)\n and handle.unresolved_form.to_key() in self.dagster_run.step_keys_to_execute\n ):\n return False\n\n # should not load if this step is being executed in the current run\n return step_output_handle.step_key not in self.dagster_run.step_keys_to_execute\n\n def _get_source_run_id(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n if self._should_load_from_previous_runs(step_output_handle):\n return self._get_source_run_id_from_logs(step_output_handle)\n else:\n return self.dagster_run.run_id\n\n def capture_step_exception(self, exception: BaseException):\n self._step_exception = check.inst_param(exception, "exception", BaseException)\n\n @property\n def step_exception(self) -> Optional[BaseException]:\n return self._step_exception\n\n @property\n def step_output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n return self._step_output_capture\n\n @property\n def previous_attempt_count(self) -> int:\n return 
self.get_known_state().get_retry_state().get_attempt_count(self._step.key)\n\n @property\n def op_config(self) -> Any:\n op_config = self.resolved_run_config.ops.get(str(self.node_handle))\n return op_config.config if op_config else None\n\n @property\n def is_op_in_graph(self) -> bool:\n """Whether this step corresponds to an op within a graph (either @graph, or @graph_asset)."""\n return self.step.node_handle.parent is not None\n\n @property\n def is_sda_step(self) -> bool:\n """Whether this step corresponds to a software define asset, inferred by presence of asset info on outputs.\n\n note: ops can materialize assets as well.\n """\n for output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output.name\n )\n if asset_info is not None:\n return True\n return False\n\n def set_data_version(self, asset_key: AssetKey, data_version: "DataVersion") -> None:\n self._data_version_cache[asset_key] = data_version\n\n def has_data_version(self, asset_key: AssetKey) -> bool:\n return asset_key in self._data_version_cache\n\n def get_data_version(self, asset_key: AssetKey) -> "DataVersion":\n return self._data_version_cache[asset_key]\n\n @property\n def input_asset_records(self) -> Optional[Mapping[AssetKey, Optional["InputAssetVersionInfo"]]]:\n return self._input_asset_version_info\n\n @property\n def is_external_input_asset_version_info_loaded(self) -> bool:\n return self._is_external_input_asset_version_info_loaded\n\n def get_input_asset_version_info(self, key: AssetKey) -> Optional["InputAssetVersionInfo"]:\n if key not in self._input_asset_version_info:\n self._fetch_input_asset_version_info(key)\n return self._input_asset_version_info[key]\n\n # "external" refers to records for inputs generated outside of this step\n def fetch_external_input_asset_version_info(self) -> None:\n output_keys = self.get_output_asset_keys()\n\n all_dep_keys: List[AssetKey] = []\n for output_key in output_keys:\n if output_key 
not in self.job_def.asset_layer.asset_deps:\n continue\n dep_keys = self.job_def.asset_layer.upstream_assets_for_asset(output_key)\n for key in dep_keys:\n if key not in all_dep_keys and key not in output_keys:\n all_dep_keys.append(key)\n\n self._input_asset_version_info = {}\n for key in all_dep_keys:\n self._fetch_input_asset_version_info(key)\n self._is_external_input_asset_version_info_loaded = True\n\n def _fetch_input_asset_version_info(self, key: AssetKey) -> None:\n from dagster._core.definitions.data_version import (\n extract_data_version_from_entry,\n )\n\n event = self._get_input_asset_event(key)\n if event is None:\n self._input_asset_version_info[key] = None\n else:\n storage_id = event.storage_id\n # Input name will be none if this is an internal dep\n input_name = self.job_def.asset_layer.input_for_asset_key(self.node_handle, key)\n # Exclude AllPartitionMapping for now to avoid huge queries\n if input_name and self.has_asset_partitions_for_input(input_name):\n subset = self.asset_partitions_subset_for_input(\n input_name, require_valid_partitions=False\n )\n input_keys = list(subset.get_partition_keys())\n\n # This check represents a temporary constraint that prevents huge query results for upstream\n # partition data versions from timing out runs. If a partitioned dependency (a) uses an\n # AllPartitionMapping; and (b) has greater than or equal to\n # SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD dependency partitions, then we\n # process it as a non-partitioned dependency (note that this was the behavior for\n # all partition dependencies prior to 2023-08). This means that stale status\n # results cannot be accurately computed for the dependency, and there is thus\n # corresponding logic in the CachingStaleStatusResolver to account for this. 
This\n # constraint should be removed when we have thoroughly examined the performance of\n # the data version retrieval query and can guarantee decent performance.\n if len(input_keys) < SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD:\n data_version = self._get_partitions_data_version_from_keys(key, input_keys)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n self._input_asset_version_info[key] = InputAssetVersionInfo(\n storage_id, data_version, event.run_id, event.timestamp\n )\n\n def partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n if upstream_asset_key:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n partitions_def = assets_def.partitions_def if assets_def else None\n explicit_partition_mapping = self.job_def.asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n )\n return infer_partition_mapping(\n explicit_partition_mapping,\n partitions_def,\n upstream_asset_partitions_def,\n )\n else:\n return None\n\n def _get_input_asset_event(self, key: AssetKey) -> Optional["EventLogRecord"]:\n event = self.instance.get_latest_data_version_record(key)\n if event:\n self._check_input_asset_event(key, event)\n return event\n\n def _check_input_asset_event(self, key: AssetKey, event: "EventLogRecord") -> None:\n assert event.event_log_entry\n event_data_version = extract_data_version_from_entry(event.event_log_entry)\n if key in self._data_version_cache and self._data_version_cache[key] != event_data_version:\n self.log.warning(\n f"Data version mismatch for asset {key}. 
Data version from materialization within"\n f" current step is `{self._data_version_cache[key]}`. Data version from most recent"\n f" materialization is `{event_data_version}`. Most recent materialization will be"\n " used for provenance tracking."\n )\n\n def _get_partitions_data_version_from_keys(\n self, key: AssetKey, partition_keys: Sequence[str]\n ) -> "DataVersion":\n from dagster._core.definitions.data_version import (\n DataVersion,\n )\n from dagster._core.events import DagsterEventType\n\n # TODO: this needs to account for observations also\n event_type = DagsterEventType.ASSET_MATERIALIZATION\n tags_by_partition = (\n self.instance._event_storage.get_latest_tags_by_partition( # noqa: SLF001\n key, event_type, [DATA_VERSION_TAG], asset_partitions=list(partition_keys)\n )\n )\n partition_data_versions = [\n pair[1][DATA_VERSION_TAG]\n for pair in sorted(tags_by_partition.items(), key=lambda x: x[0])\n ]\n hash_sig = sha256()\n hash_sig.update(bytearray("".join(partition_data_versions), "utf8"))\n return DataVersion(hash_sig.hexdigest())\n\n # Call this to clear the cache for an input asset record. This is necessary when an old\n # materialization for an asset was loaded during `fetch_external_input_asset_records` because an\n # intrastep asset is not required, but then that asset is materialized during the step. 
If we\n # don't clear the cache for this asset, then we won't use the most up-to-date asset record.\n def wipe_input_asset_version_info(self, key: AssetKey) -> None:\n if key in self._input_asset_version_info:\n del self._input_asset_version_info[key]\n\n def get_output_asset_keys(self) -> AbstractSet[AssetKey]:\n output_keys: Set[AssetKey] = set()\n for step_output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, step_output.name\n )\n if asset_info is None or not asset_info.is_required:\n continue\n output_keys.add(asset_info.key)\n return output_keys\n\n def has_asset_partitions_for_input(self, input_name: str) -> bool:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n return (\n upstream_asset_key is not None\n and asset_layer.partitions_def_for_asset(upstream_asset_key) is not None\n )\n\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n subset = self.asset_partitions_subset_for_input(input_name)\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset partition key range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n def asset_partitions_subset_for_input(\n self, input_name: str, *, require_valid_partitions: bool = True\n ) -> PartitionsSubset:\n asset_layer = self.job_def.asset_layer\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is not None:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if upstream_asset_partitions_def is not None:\n partitions_def = assets_def.partitions_def if assets_def else 
None\n partitions_subset = (\n partitions_def.empty_subset().with_partition_key_range(\n self.asset_partition_key_range, dynamic_partitions_store=self.instance\n )\n if partitions_def\n else None\n )\n partition_mapping = infer_partition_mapping(\n asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n ),\n partitions_def,\n upstream_asset_partitions_def,\n )\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n partitions_subset,\n upstream_asset_partitions_def,\n dynamic_partitions_store=self.instance,\n )\n )\n\n if (\n require_valid_partitions\n and mapped_partitions_result.required_but_nonexistent_partition_keys\n ):\n raise DagsterInvariantViolationError(\n f"Partition key range {self.asset_partition_key_range} in"\n f" {self.node_handle.name} depends on invalid partition keys"\n f" {mapped_partitions_result.required_but_nonexistent_partition_keys} in"\n f" upstream asset {upstream_asset_key}"\n )\n\n return mapped_partitions_result.partitions_subset\n\n check.failed("The input has no asset partitions")\n\n def asset_partition_key_for_input(self, input_name: str) -> str:\n start, end = self.asset_partition_key_range_for_input(input_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for input '{input_name}' of step '{self.step.key}',"\n f" but the step input has a partition range: '{start}' to '{end}'."\n )\n\n def _partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.node_handle, output_name=output_name\n )\n if asset_info:\n return asset_info.partitions_def\n else:\n return None\n\n def partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n return self._partitions_def_for_output(output_name)\n\n def has_asset_partitions_for_output(self, output_name: str) -> bool:\n return 
self._partitions_def_for_output(output_name) is not None\n\n def asset_partition_key_range_for_output(self, output_name: str) -> PartitionKeyRange:\n if self._partitions_def_for_output(output_name) is not None:\n return self.asset_partition_key_range\n\n check.failed("The output has no asset partitions")\n\n def asset_partition_key_for_output(self, output_name: str) -> str:\n start, end = self.asset_partition_key_range_for_output(output_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for output '{output_name}' of step"\n f" '{self.step.key}', but the step output has a partition range: '{start}' to"\n f" '{end}'."\n )\n\n def asset_partitions_time_window_for_output(self, output_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given output.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n partitions_def = self._partitions_def_for_output(output_name)\n\n if not partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an output that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an output that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n )\n partition_key_range = self.asset_partition_key_range_for_output(output_name)\n return TimeWindow(\n # mypy thinks partitions_def is <nothing> here because ????\n partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n )\n\n def 
asset_partitions_time_window_for_input(self, input_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given input.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is None:\n raise ValueError("The input has no corresponding asset")\n\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if not upstream_asset_partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an input that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(upstream_asset_partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an input that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n upstream_asset_partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition],\n upstream_asset_partitions_def,\n )\n partition_key_range = self.asset_partition_key_range_for_input(input_name)\n\n return TimeWindow(\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.start\n ).start,\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.end\n ).end,\n )\n\n def get_type_loader_context(self) -> "DagsterTypeLoaderContext":\n return DagsterTypeLoaderContext(\n plan_data=self.plan_data,\n execution_data=self._execution_data,\n log_manager=self._log_manager,\n step=self.step,\n output_capture=self._output_capture,\n known_state=self._known_state,\n )
\n\n\n
[docs]class TypeCheckContext:\n """The ``context`` object available to a type check function on a DagsterType."""\n\n def __init__(\n self,\n run_id: str,\n log_manager: DagsterLogManager,\n scoped_resources_builder: ScopedResourcesBuilder,\n dagster_type: DagsterType,\n ):\n self._run_id = run_id\n self._log = log_manager\n self._resources = scoped_resources_builder.build(dagster_type.required_resource_keys)\n\n @public\n @property\n def resources(self) -> "Resources":\n """An object whose attributes contain the resources available to this op."""\n return self._resources\n\n @public\n @property\n def run_id(self) -> str:\n """The id of this job run."""\n return self._run_id\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._log
\n\n\n
[docs]class DagsterTypeLoaderContext(StepExecutionContext):\n """The context object provided to a :py:class:`@dagster_type_loader <dagster_type_loader>`-decorated function during execution.\n\n Users should not construct this object directly.\n """\n\n @public\n @property\n def resources(self) -> "Resources":\n """The resources available to the type loader, specified by the `required_resource_keys` argument of the decorator."""\n return super(DagsterTypeLoaderContext, self).resources\n\n @public\n @property\n def job_def(self) -> "JobDefinition":\n """The underlying job definition being executed."""\n return super(DagsterTypeLoaderContext, self).job_def\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The op for which type loading is occurring."""\n return super(DagsterTypeLoaderContext, self).op_def
\n
", "current_page_name": "_modules/dagster/_core/execution/context/system", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.system"}}, "execute_in_process_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.execute_in_process_result

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class ExecuteInProcessResult(ExecutionResult):\n """Result object returned by in-process testing APIs.\n\n Users should not instantiate this object directly. Used for retrieving run success, events, and outputs from execution methods that return this object.\n\n This object is returned by:\n - :py:meth:`dagster.GraphDefinition.execute_in_process`\n - :py:meth:`dagster.JobDefinition.execute_in_process`\n - :py:meth:`dagster.materialize_to_memory`\n - :py:meth:`dagster.materialize`\n """\n\n _handle: NodeHandle\n _event_list: Sequence[DagsterEvent]\n _dagster_run: DagsterRun\n _output_capture: Mapping[StepOutputHandle, Any]\n _job_def: JobDefinition\n\n def __init__(\n self,\n event_list: Sequence[DagsterEvent],\n dagster_run: DagsterRun,\n output_capture: Optional[Mapping[StepOutputHandle, Any]],\n job_def: JobDefinition,\n ):\n self._job_def = job_def\n\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n self._output_capture = check.opt_mapping_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """List[DagsterEvent]: All dagster events emitted during execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run ID of the executed :py:class:`DagsterRun`."""\n return self.dagster_run.run_id\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n mapped_outputs = {}\n step_key = str(handle)\n output_found = False\n for step_output_handle, value in self._output_capture.items():\n # For the mapped output case, where step keys are in the format\n # "step_key[upstream_mapped_output_name]" 
within the step output handle.\n if (\n step_output_handle.step_key.startswith(f"{step_key}[")\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n key_start = step_output_handle.step_key.find("[")\n key_end = step_output_handle.step_key.find("]")\n upstream_mapped_output_name = step_output_handle.step_key[key_start + 1 : key_end]\n mapped_outputs[upstream_mapped_output_name] = value\n\n # For all other cases, search for exact match.\n elif (\n step_key == step_output_handle.step_key\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n if not step_output_handle.mapping_key:\n return self._output_capture[step_output_handle]\n mapped_outputs[step_output_handle.mapping_key] = value\n\n if not output_found:\n raise DagsterInvariantViolationError(\n f"No outputs found for output '{output_name}' from node '{handle}'."\n )\n return mapped_outputs\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the in-process run of the job.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_for_node(\n node_str, output_name=output_name\n )
\n\n
[docs] @public\n def asset_value(self, asset_key: CoercibleToAssetKey) -> Any:\n """Retrieves the value of an asset that was materialized during the execution of the job.\n\n Args:\n asset_key (CoercibleToAssetKey): The key of the asset to retrieve.\n\n Returns:\n Any: The value of the retrieved asset.\n """\n node_output_handle = self._job_def.asset_layer.node_output_handle_for_asset(\n AssetKey.from_coercible(asset_key)\n )\n return self.output_for_node(\n node_str=str(node_output_handle.node_handle), output_name=node_output_handle.output_name\n )
\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_value(output_name=output_name)
\n
", "current_page_name": "_modules/dagster/_core/execution/execute_in_process_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.execute_in_process_result"}, "job_execution_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.job_execution_result

\nfrom typing import Any, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class JobExecutionResult(ExecutionResult):\n """Result object returned by :py:func:`dagster.execute_job`.\n\n Used for retrieving run success, events, and outputs from `execute_job`.\n Users should not directly instantiate this class.\n\n Events and run information can be retrieved off of the object directly. In\n order to access outputs, the `ExecuteJobResult` object needs to be opened\n as a context manager, which will re-initialize the resources from\n execution.\n """\n\n def __init__(self, job_def, reconstruct_context, event_list, dagster_run):\n self._job_def = job_def\n self._reconstruct_context = reconstruct_context\n self._context = None\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n def __enter__(self) -> "JobExecutionResult":\n context = self._reconstruct_context.__enter__()\n self._context = context\n return self\n\n def __exit__(self, *exc):\n exit_result = self._reconstruct_context.__exit__(*exc)\n self._context = None\n return exit_result\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """Sequence[DagsterEvent]: List of all events yielded by the job execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the Dagster run that was executed."""\n return self.dagster_run.run_id\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`. If the top-level job has no output, calling this method will also result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_value(output_name=output_name)
\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the run of the job.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_for_node(node_str, output_name=output_name)
\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n if not self._context:\n raise DagsterInvariantViolationError(\n "In order to access output objects, the result of `execute_job` must be opened as a"\n " context manager: 'with execute_job(...) as result:"\n )\n found = False\n result = None\n for compute_step_event in self.compute_events_for_handle(handle):\n if (\n compute_step_event.is_successful_output\n and compute_step_event.step_output_data.output_name == output_name\n ):\n found = True\n output = compute_step_event.step_output_data\n step = self._context.execution_plan.get_step_by_key(compute_step_event.step_key)\n dagster_type = (\n self.job_def.get_node(handle).output_def_named(output_name).dagster_type\n )\n value = self._get_value(self._context.for_step(step), output, dagster_type)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if result is None:\n result = {mapping_key: value}\n else:\n result[mapping_key] = (\n value # pylint:disable=unsupported-assignment-operation\n )\n else:\n result = value\n\n if found:\n return result\n\n node = self.job_def.get_node(handle)\n raise DagsterInvariantViolationError(\n f"Did not find result {output_name} in {node.describe_node()}"\n )\n\n def _get_value(self, context, step_output_data, dagster_type):\n step_output_handle = step_output_data.step_output_handle\n manager = context.get_io_manager(step_output_handle)\n manager_key = context.execution_plan.get_manager_key(step_output_handle, self.job_def)\n res = manager.load_input(\n context.for_input_manager(\n name=None,\n config=None,\n metadata=None,\n dagster_type=dagster_type,\n source_handle=step_output_handle,\n resource_config=context.resolved_run_config.resources[manager_key].config,\n resources=build_resources_for_manager(manager_key, context),\n 
)\n )\n return res
\n
", "current_page_name": "_modules/dagster/_core/execution/job_execution_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.job_execution_result"}, "validate_run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.validate_run_config

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions import JobDefinition\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]def validate_run_config(\n job_def: JobDefinition,\n run_config: Optional[Union[Mapping[str, Any], RunConfig]] = None,\n) -> Mapping[str, Any]:\n """Function to validate a provided run config blob against a given job.\n\n If validation is successful, this function will return a dictionary representation of the\n validated config actually used during execution.\n\n Args:\n job_def (JobDefinition): The job definition to validate run\n config against\n run_config (Optional[Dict[str, Any]]): The run config to validate\n\n Returns:\n Dict[str, Any]: A dictionary representation of the validated config.\n """\n check.inst_param(job_def, "job_def", JobDefinition)\n run_config = check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n )\n\n return ResolvedRunConfig.build(job_def, run_config).to_dict()
\n
", "current_page_name": "_modules/dagster/_core/execution/validate_run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.validate_run_config"}, "with_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.with_resources

\nfrom typing import Any, Iterable, List, Mapping, Optional, Sequence, TypeVar, cast\n\nfrom dagster import _check as check\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..._config import Shape\nfrom ..definitions.resource_requirement import ResourceAddable\nfrom ..definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom ..errors import DagsterInvalidConfigError, DagsterInvalidInvocationError\n\nT = TypeVar("T", bound=ResourceAddable)\n\n\n
[docs]def with_resources(\n definitions: Iterable[T],\n resource_defs: Mapping[str, object],\n resource_config_by_key: Optional[Mapping[str, Any]] = None,\n) -> Sequence[T]:\n """Adds dagster resources to copies of resource-requiring dagster definitions.\n\n An error will be thrown if any provided definitions have a conflicting\n resource definition provided for a key provided to resource_defs. Resource\n config can be provided, with keys in the config dictionary corresponding to\n the keys for each resource definition. If any definition has unsatisfied\n resource keys after applying with_resources, an error will be thrown.\n\n Args:\n definitions (Iterable[ResourceAddable]): Dagster definitions to provide resources to.\n resource_defs (Mapping[str, object]):\n Mapping of resource keys to objects to satisfy\n resource requirements of provided dagster definitions.\n resource_config_by_key (Optional[Mapping[str, Any]]):\n Specifies config for provided resources. The key in this dictionary\n corresponds to configuring the same key in the resource_defs\n dictionary.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset, resource, with_resources\n\n @resource(config_schema={"bar": str})\n def foo_resource():\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset1(context):\n foo = context.resources.foo\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset2(context):\n foo = context.resources.foo\n ...\n\n asset1_with_foo, asset2_with_foo = with_resources(\n [the_asset, other_asset],\n resource_config_by_key={\n "foo": {\n "config": {"bar": ...}\n }\n }\n )\n """\n from dagster._config import validate_config\n from dagster._core.definitions.job_definition import (\n default_job_io_manager_with_fs_io_manager_schema,\n )\n\n check.mapping_param(resource_defs, "resource_defs")\n resource_config_by_key = check.opt_mapping_param(\n resource_config_by_key, "resource_config_by_key"\n )\n\n resource_defs = wrap_resources_for_execution(\n merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n resource_defs,\n )\n )\n\n for key, resource_def in resource_defs.items():\n if key in resource_config_by_key:\n resource_config = resource_config_by_key[key]\n if not isinstance(resource_config, dict) or "config" not in resource_config:\n raise DagsterInvalidInvocationError(\n f"Error with config for resource key '{key}': Expected a "\n "dictionary of the form {'config': ...}, but received "\n f"{resource_config}"\n )\n\n outer_config_shape = Shape({"config": resource_def.get_config_field()})\n config_evr = validate_config(outer_config_shape, resource_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error when applying config for resource with key '{key}' ",\n config_evr.errors,\n resource_config,\n )\n resource_defs[key] = resource_defs[key].configured(resource_config["config"])\n\n transformed_defs: List[T] = []\n for definition in definitions:\n transformed_defs.append(cast(T, definition.with_resources(resource_defs)))\n\n return transformed_defs
\n
", "current_page_name": "_modules/dagster/_core/execution/with_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.with_resources"}}, "executor": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator\n\nfrom dagster._annotations import public\nfrom dagster._core.execution.retries import RetryMode\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import PlanOrchestrationContext\n    from dagster._core.execution.plan.plan import ExecutionPlan\n\n\n
[docs]class Executor(ABC):\n
[docs] @public\n @abstractmethod\n def execute(\n self, plan_context: "PlanOrchestrationContext", execution_plan: "ExecutionPlan"\n ) -> Iterator["DagsterEvent"]:\n """For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.\n\n Args:\n plan_context (PlanOrchestrationContext): The plan's orchestration context.\n execution_plan (ExecutionPlan): The plan to execute.\n\n Returns:\n A stream of dagster events.\n """
\n\n @public\n @property\n @abstractmethod\n def retries(self) -> RetryMode:\n """Whether retries are enabled or disabled for this instance of the executor.\n\n Executors should allow this to be controlled via configuration if possible.\n\n Returns: RetryMode\n """
\n
", "current_page_name": "_modules/dagster/_core/executor/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.base"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.init

\nfrom typing import Mapping, NamedTuple\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions import ExecutorDefinition, IJob\nfrom dagster._core.instance import DagsterInstance\n\n\n
[docs]class InitExecutorContext(\n NamedTuple(\n "InitExecutorContext",\n [\n ("job", PublicAttr[IJob]),\n ("executor_def", PublicAttr[ExecutorDefinition]),\n ("executor_config", PublicAttr[Mapping[str, object]]),\n ("instance", PublicAttr[DagsterInstance]),\n ],\n )\n):\n """Executor-specific initialization context.\n\n Attributes:\n job (IJob): The job to be executed.\n executor_def (ExecutorDefinition): The definition of the executor currently being\n constructed.\n executor_config (dict): The parsed config passed to the executor.\n instance (DagsterInstance): The current instance.\n """\n\n def __new__(\n cls,\n job: IJob,\n executor_def: ExecutorDefinition,\n executor_config: Mapping[str, object],\n instance: DagsterInstance,\n ):\n return super(InitExecutorContext, cls).__new__(\n cls,\n job=check.inst_param(job, "job", IJob),\n executor_def=check.inst_param(executor_def, "executor_def", ExecutorDefinition),\n executor_config=check.mapping_param(executor_config, "executor_config", key_type=str),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/executor/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.init"}}, "instance": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance

\nimport logging\nimport logging.config\nimport os\nimport sys\nimport time\nimport weakref\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom tempfile import TemporaryDirectory\nfrom types import TracebackType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom typing_extensions import Protocol, Self, TypeAlias, TypeVar, runtime_checkable\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.data_version import extract_data_provenance_from_entry\nfrom dagster._core.definitions.events import AssetKey, AssetObservation\nfrom dagster._core.errors import (\n    DagsterHomeNotSetError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunConflict,\n)\nfrom dagster._core.log_manager import DagsterLogRecord\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import (\n    IN_PROGRESS_RUN_STATUSES,\n    DagsterRun,\n    DagsterRunStatsSnapshot,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    PARENT_RUN_ID_TAG,\n    PARTITION_NAME_TAG,\n    RESUME_RETRY_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import PrintFn, traced\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import 
merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    experimental_warning,\n)\n\nfrom .config import (\n    DAGSTER_CONFIG_YAML_FILENAME,\n    DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT,\n    get_default_tick_retention_settings,\n    get_tick_retention_settings,\n)\nfrom .ref import InstanceRef\n\n# 'airflow_execution_date' and 'is_airflow_ingest_pipeline' are hardcoded tags used in the\n# airflow ingestion logic (see: dagster_pipeline_factory.py). 'airflow_execution_date' stores the\n# 'execution_date' used in Airflow operator execution and 'is_airflow_ingest_pipeline' determines\n# whether 'airflow_execution_date' is needed.\n# https://github.com/dagster-io/dagster/issues/2403\nAIRFLOW_EXECUTION_DATE_STR = "airflow_execution_date"\nIS_AIRFLOW_INGEST_PIPELINE_STR = "is_airflow_ingest_pipeline"\n\n# Our internal guts can handle empty strings for job name and run id\n# However making these named constants for documentation, to encode where we are making the assumption,\n# and to allow us to change this more easily in the future, provided we are disciplined about\n# actually using this constants.\nRUNLESS_RUN_ID = ""\nRUNLESS_JOB_NAME = ""\n\nif TYPE_CHECKING:\n    from dagster._core.debug import DebugRunPayload\n    from dagster._core.definitions.asset_check_spec import AssetCheckKey\n    from dagster._core.definitions.job_definition import (\n        JobDefinition,\n    )\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.run_request import InstigatorType\n    from dagster._core.event_api import EventHandlerFn\n    from dagster._core.events import (\n        AssetMaterialization,\n        DagsterEvent,\n        DagsterEventType,\n        EngineEventData,\n    )\n    from dagster._core.events.log import EventLogEntry\n    from 
dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.resume_retry import ReexecutionStrategy\n    from dagster._core.execution.stats import RunStepKeyStatsSnapshot\n    from dagster._core.host_representation import (\n        CodeLocation,\n        ExternalJob,\n        ExternalJobOrigin,\n        ExternalSensor,\n        HistoricalJob,\n    )\n    from dagster._core.host_representation.external import ExternalSchedule\n    from dagster._core.launcher import RunLauncher\n    from dagster._core.run_coordinator import RunCoordinator\n    from dagster._core.scheduler import Scheduler, SchedulerDebugInfo\n    from dagster._core.scheduler.instigation import (\n        InstigatorState,\n        InstigatorStatus,\n        InstigatorTick,\n        TickData,\n        TickStatus,\n    )\n    from dagster._core.secrets import SecretsLoader\n    from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.daemon_cursor import DaemonCursorStorage\n    from dagster._core.storage.event_log import EventLogStorage\n    from dagster._core.storage.event_log.base import (\n        AssetRecord,\n        EventLogConnection,\n        EventLogRecord,\n        EventRecordsFilter,\n    )\n    from dagster._core.storage.partition_status_cache import (\n        AssetPartitionStatus,\n        AssetStatusCacheValue,\n    )\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs import RunStorage\n    from dagster._core.storage.schedules import ScheduleStorage\n    from dagster._core.storage.sql import AlembicVersion\n    from dagster._core.workspace.workspace import IWorkspace\n    from dagster._daemon.types import DaemonHeartbeat, DaemonStatus\n\nDagsterInstanceOverrides: TypeAlias = Mapping[str, Any]\n\n\ndef 
_check_run_equality(\n    pipeline_run: DagsterRun, candidate_run: DagsterRun\n) -> Mapping[str, Tuple[Any, Any]]:\n    field_diff: Dict[str, Tuple[Any, Any]] = {}\n    for field in pipeline_run._fields:\n        expected_value = getattr(pipeline_run, field)\n        candidate_value = getattr(candidate_run, field)\n        if expected_value != candidate_value:\n            field_diff[field] = (expected_value, candidate_value)\n\n    return field_diff\n\n\ndef _format_field_diff(field_diff: Mapping[str, Tuple[Any, Any]]) -> str:\n    return "\\n".join(\n        [\n            (\n                "    {field_name}:\\n"\n                + "        Expected: {expected_value}\\n"\n                + "        Received: {candidate_value}"\n            ).format(\n                field_name=field_name,\n                expected_value=expected_value,\n                candidate_value=candidate_value,\n            )\n            for field_name, (\n                expected_value,\n                candidate_value,\n            ) in field_diff.items()\n        ]\n    )\n\n\nclass _EventListenerLogHandler(logging.Handler):\n    def __init__(self, instance: "DagsterInstance"):\n        self._instance = instance\n        super(_EventListenerLogHandler, self).__init__()\n\n    def emit(self, record: DagsterLogRecord) -> None:\n        from dagster._core.events import EngineEventData\n        from dagster._core.events.log import StructuredLoggerMessage, construct_event_record\n\n        event = construct_event_record(\n            StructuredLoggerMessage(\n                name=record.name,\n                message=record.msg,\n                level=record.levelno,\n                meta=record.dagster_meta,  # type: ignore\n                record=record,\n            )\n        )\n\n        try:\n            self._instance.handle_new_event(event)\n        except Exception as e:\n            sys.stderr.write(f"Exception while writing logger call to event log: {e}\\n")\n            if 
event.dagster_event:\n                # Swallow user-generated log failures so that the entire step/run doesn't fail, but\n                # raise failures writing system-generated log events since they are the source of\n                # truth for the state of the run\n                raise\n            elif event.run_id:\n                self._instance.report_engine_event(\n                    "Exception while writing logger call to event log",\n                    job_name=event.job_name,\n                    run_id=event.run_id,\n                    step_key=event.step_key,\n                    engine_event_data=EngineEventData(\n                        error=serializable_error_info_from_exc_info(sys.exc_info()),\n                    ),\n                )\n\n\nclass InstanceType(Enum):\n    PERSISTENT = "PERSISTENT"\n    EPHEMERAL = "EPHEMERAL"\n\n\nT_DagsterInstance = TypeVar("T_DagsterInstance", bound="DagsterInstance", default="DagsterInstance")\n\n\nclass MayHaveInstanceWeakref(Generic[T_DagsterInstance]):\n    """Mixin for classes that can have a weakref back to a Dagster instance."""\n\n    _instance_weakref: "Optional[weakref.ReferenceType[T_DagsterInstance]]"\n\n    def __init__(self):\n        self._instance_weakref = None\n\n    @property\n    def has_instance(self) -> bool:\n        return hasattr(self, "_instance_weakref") and (self._instance_weakref is not None)\n\n    @property\n    def _instance(self) -> T_DagsterInstance:\n        instance = (\n            self._instance_weakref()\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            if (hasattr(self, "_instance_weakref") and self._instance_weakref is not None)\n            else None\n        )\n        if instance is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to resolve undefined DagsterInstance weakref."\n            )\n        else:\n            return 
instance\n\n    def register_instance(self, instance: T_DagsterInstance) -> None:\n        check.invariant(\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            (not hasattr(self, "_instance_weakref") or self._instance_weakref is None),\n            "Must only call initialize once",\n        )\n\n        # Store a weakref to avoid a circular reference / enable GC\n        self._instance_weakref = weakref.ref(instance)\n\n\n@runtime_checkable\nclass DynamicPartitionsStore(Protocol):\n    @abstractmethod\n    def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]: ...\n\n    @abstractmethod\n    def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool: ...\n\n\n
[docs]class DagsterInstance(DynamicPartitionsStore):\n """Core abstraction for managing Dagster's access to storage and other resources.\n\n Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\n the values in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Alternatively, DagsterInstance.ephemeral() can use used which provides a set of\n transient in-memory components.\n\n Configuration of this class should be done by setting values in ``$DAGSTER_HOME/dagster.yaml``.\n For example, to use Postgres for dagster storage, you can write a ``dagster.yaml`` such as the\n following:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :language: YAML\n\n Args:\n instance_type (InstanceType): Indicates whether the instance is ephemeral or persistent.\n Users should not attempt to set this value directly or in their ``dagster.yaml`` files.\n local_artifact_storage (LocalArtifactStorage): The local artifact storage is used to\n configure storage for any artifacts that require a local disk, such as schedules, or\n when using the filesystem system storage to manage files and intermediates. By default,\n this will be a :py:class:`dagster._core.storage.root.LocalArtifactStorage`. Configurable\n in ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass`\n machinery.\n run_storage (RunStorage): The run storage is used to store metadata about ongoing and past\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.runs.SqliteRunStorage`. Configurable in ``dagster.yaml``\n using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n event_storage (EventLogStorage): Used to store the structured event logs generated by\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.event_log.SqliteEventLogStorage`. 
Configurable in\n ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n compute_log_manager (Optional[ComputeLogManager]): The compute log manager handles stdout\n and stderr logging for op compute functions. By default, this will be a\n :py:class:`dagster._core.storage.local_compute_log_manager.LocalComputeLogManager`.\n Configurable in ``dagster.yaml`` using the\n :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n run_coordinator (Optional[RunCoordinator]): A runs coordinator may be used to manage the execution\n of pipeline runs.\n run_launcher (Optional[RunLauncher]): Optionally, a run launcher may be used to enable\n a Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\n addition to running them locally.\n settings (Optional[Dict]): Specifies certain per-instance settings,\n such as feature flags. These are set in the ``dagster.yaml`` under a set of whitelisted\n keys.\n ref (Optional[InstanceRef]): Used by internal machinery to pass instances across process\n boundaries.\n """\n\n # Stores TemporaryDirectory instances that were created for DagsterInstance.local_temp() calls\n # to be removed once the instance is garbage collected.\n _TEMP_DIRS: "weakref.WeakKeyDictionary[DagsterInstance, TemporaryDirectory]" = (\n weakref.WeakKeyDictionary()\n )\n\n def __init__(\n self,\n instance_type: InstanceType,\n local_artifact_storage: "LocalArtifactStorage",\n run_storage: "RunStorage",\n event_storage: "EventLogStorage",\n run_coordinator: Optional["RunCoordinator"],\n compute_log_manager: Optional["ComputeLogManager"],\n run_launcher: Optional["RunLauncher"],\n scheduler: Optional["Scheduler"] = None,\n schedule_storage: Optional["ScheduleStorage"] = None,\n settings: Optional[Mapping[str, Any]] = None,\n secrets_loader: Optional["SecretsLoader"] = None,\n ref: Optional[InstanceRef] = None,\n **_kwargs: Any, # we accept kwargs for forward-compat of custom instances\n ):\n from 
dagster._core.launcher import RunLauncher\n from dagster._core.run_coordinator import RunCoordinator\n from dagster._core.scheduler import Scheduler\n from dagster._core.secrets import SecretsLoader\n from dagster._core.storage.captured_log_manager import CapturedLogManager\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n from dagster._core.storage.event_log import EventLogStorage\n from dagster._core.storage.root import LocalArtifactStorage\n from dagster._core.storage.runs import RunStorage\n from dagster._core.storage.schedules import ScheduleStorage\n\n self._instance_type = check.inst_param(instance_type, "instance_type", InstanceType)\n self._local_artifact_storage = check.inst_param(\n local_artifact_storage, "local_artifact_storage", LocalArtifactStorage\n )\n self._event_storage = check.inst_param(event_storage, "event_storage", EventLogStorage)\n self._event_storage.register_instance(self)\n\n self._run_storage = check.inst_param(run_storage, "run_storage", RunStorage)\n self._run_storage.register_instance(self)\n\n if compute_log_manager:\n self._compute_log_manager = check.inst_param(\n compute_log_manager, "compute_log_manager", ComputeLogManager\n )\n if not isinstance(self._compute_log_manager, CapturedLogManager):\n deprecation_warning(\n "ComputeLogManager",\n "1.2.0",\n "Implement the CapturedLogManager interface instead.",\n )\n self._compute_log_manager.register_instance(self)\n else:\n check.invariant(\n ref, "Compute log manager must be provided if instance is not from a ref"\n )\n self._compute_log_manager = None\n\n self._scheduler = check.opt_inst_param(scheduler, "scheduler", Scheduler)\n\n self._schedule_storage = check.opt_inst_param(\n schedule_storage, "schedule_storage", ScheduleStorage\n )\n if self._schedule_storage:\n self._schedule_storage.register_instance(self)\n\n if run_coordinator:\n self._run_coordinator = check.inst_param(\n run_coordinator, "run_coordinator", RunCoordinator\n )\n 
self._run_coordinator.register_instance(self)\n else:\n check.invariant(ref, "Run coordinator must be provided if instance is not from a ref")\n self._run_coordinator = None\n\n if run_launcher:\n self._run_launcher: Optional[RunLauncher] = check.inst_param(\n run_launcher, "run_launcher", RunLauncher\n )\n run_launcher.register_instance(self)\n else:\n check.invariant(ref, "Run launcher must be provided if instance is not from a ref")\n self._run_launcher = None\n\n self._settings = check.opt_mapping_param(settings, "settings")\n\n self._secrets_loader = check.opt_inst_param(secrets_loader, "secrets_loader", SecretsLoader)\n\n if self._secrets_loader:\n self._secrets_loader.register_instance(self)\n\n self._ref = check.opt_inst_param(ref, "ref", InstanceRef)\n\n self._subscribers: Dict[str, List[Callable]] = defaultdict(list)\n\n run_monitoring_enabled = self.run_monitoring_settings.get("enabled", False)\n self._run_monitoring_enabled = run_monitoring_enabled\n if self.run_monitoring_enabled and self.run_monitoring_max_resume_run_attempts:\n check.invariant(\n self.run_launcher.supports_resume_run,\n "The configured run launcher does not support resuming runs. Set"\n " max_resume_run_attempts to 0 to use run monitoring. Any runs with a failed"\n " run worker will be marked as failed, but will not be resumed.",\n )\n\n if self.run_retries_enabled:\n check.invariant(\n self.event_log_storage.supports_event_consumer_queries(),\n "Run retries are enabled, but the configured event log storage does not support"\n " them. Consider switching to Postgres or Mysql.",\n )\n\n # ctors\n\n
[docs] @public\n @staticmethod\n def ephemeral(\n tempdir: Optional[str] = None,\n preload: Optional[Sequence["DebugRunPayload"]] = None,\n settings: Optional[Dict] = None,\n ) -> "DagsterInstance":\n """Create a `DagsterInstance` suitable for ephemeral execution, useful in test contexts. An\n ephemeral instance uses mostly in-memory components. Use `local_temp` to create a test\n instance that is fully persistent.\n\n Args:\n tempdir (Optional[str]): The path of a directory to be used for local artifact storage.\n preload (Optional[Sequence[DebugRunPayload]]): A sequence of payloads to load into the\n instance's run storage. Useful for debugging.\n settings (Optional[Dict]): Settings for the instance.\n\n Returns:\n DagsterInstance: An ephemeral DagsterInstance.\n """\n from dagster._core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher\n from dagster._core.run_coordinator import DefaultRunCoordinator\n from dagster._core.storage.event_log import InMemoryEventLogStorage\n from dagster._core.storage.noop_compute_log_manager import NoOpComputeLogManager\n from dagster._core.storage.root import LocalArtifactStorage, TemporaryLocalArtifactStorage\n from dagster._core.storage.runs import InMemoryRunStorage\n\n if tempdir is not None:\n local_storage = LocalArtifactStorage(tempdir)\n else:\n local_storage = TemporaryLocalArtifactStorage()\n\n return DagsterInstance(\n instance_type=InstanceType.EPHEMERAL,\n local_artifact_storage=local_storage,\n run_storage=InMemoryRunStorage(preload=preload),\n event_storage=InMemoryEventLogStorage(preload=preload),\n compute_log_manager=NoOpComputeLogManager(),\n run_coordinator=DefaultRunCoordinator(),\n run_launcher=SyncInMemoryRunLauncher(),\n settings=settings,\n )
\n\n
[docs] @public\n @staticmethod\n def get() -> "DagsterInstance":\n """Get the current `DagsterInstance` as specified by the ``DAGSTER_HOME`` environment variable.\n\n Returns:\n DagsterInstance: The current DagsterInstance.\n """\n dagster_home_path = os.getenv("DAGSTER_HOME")\n\n if not dagster_home_path:\n raise DagsterHomeNotSetError(\n "The environment variable $DAGSTER_HOME is not set. \\nDagster requires this"\n " environment variable to be set to an existing directory in your filesystem. This"\n " directory is used to store metadata across sessions, or load the dagster.yaml"\n " file which can configure storing metadata in an external database.\\nYou can"\n " resolve this error by exporting the environment variable. For example, you can"\n " run the following command in your shell or include it in your shell configuration"\n ' file:\\n\\texport DAGSTER_HOME=~"/dagster_home"\\nor PowerShell\\n$env:DAGSTER_HOME'\n " = ($home + '\\\\dagster_home')or batchset"\n " DAGSTER_HOME=%UserProfile%/dagster_homeAlternatively, DagsterInstance.ephemeral()"\n " can be used for a transient instance.\\n"\n )\n\n dagster_home_path = os.path.expanduser(dagster_home_path)\n\n if not os.path.isabs(dagster_home_path):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" must be an absolute path. Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem."\n ).format(dagster_home_path)\n )\n\n if not (os.path.exists(dagster_home_path) and os.path.isdir(dagster_home_path)):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" is not a directory or does not exist. Dagster requires this'\n " environment variable to be set to an existing directory in your filesystem"\n ).format(dagster_home_path)\n )\n\n return DagsterInstance.from_config(dagster_home_path)
\n\n
[docs] @public\n @staticmethod\n def local_temp(\n tempdir: Optional[str] = None,\n overrides: Optional[DagsterInstanceOverrides] = None,\n ) -> "DagsterInstance":\n """Create a DagsterInstance that uses a temporary directory for local storage. This is a\n regular, fully persistent instance. Use `ephemeral` to get an ephemeral instance with\n in-memory components.\n\n Args:\n tempdir (Optional[str]): The path of a directory to be used for local artifact storage.\n overrides (Optional[DagsterInstanceOverrides]): Override settings for the instance.\n\n Returns:\n DagsterInstance\n """\n if tempdir is None:\n created_dir = TemporaryDirectory()\n i = DagsterInstance.from_ref(\n InstanceRef.from_dir(created_dir.name, overrides=overrides)\n )\n DagsterInstance._TEMP_DIRS[i] = created_dir\n return i\n\n return DagsterInstance.from_ref(InstanceRef.from_dir(tempdir, overrides=overrides))
\n\n @staticmethod\n def from_config(\n config_dir: str,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n ) -> "DagsterInstance":\n instance_ref = InstanceRef.from_dir(config_dir, config_filename=config_filename)\n return DagsterInstance.from_ref(instance_ref)\n\n @staticmethod\n def from_ref(instance_ref: InstanceRef) -> "DagsterInstance":\n check.inst_param(instance_ref, "instance_ref", InstanceRef)\n\n # DagsterInstance doesn't implement ConfigurableClass, but we may still sometimes want to\n # have custom subclasses of DagsterInstance. This machinery allows for those custom\n # subclasses to receive additional keyword arguments passed through the config YAML.\n klass = instance_ref.custom_instance_class or DagsterInstance\n kwargs = instance_ref.custom_instance_class_config\n\n unified_storage = instance_ref.storage\n run_storage = unified_storage.run_storage if unified_storage else instance_ref.run_storage\n event_storage = (\n unified_storage.event_log_storage if unified_storage else instance_ref.event_storage\n )\n schedule_storage = (\n unified_storage.schedule_storage if unified_storage else instance_ref.schedule_storage\n )\n\n return klass(\n instance_type=InstanceType.PERSISTENT,\n local_artifact_storage=instance_ref.local_artifact_storage,\n run_storage=run_storage, # type: ignore # (possible none)\n event_storage=event_storage, # type: ignore # (possible none)\n schedule_storage=schedule_storage,\n compute_log_manager=None, # lazy load\n scheduler=instance_ref.scheduler,\n run_coordinator=None, # lazy load\n run_launcher=None, # lazy load\n settings=instance_ref.settings,\n secrets_loader=instance_ref.secrets_loader,\n ref=instance_ref,\n **kwargs,\n )\n\n # flags\n\n @property\n def is_persistent(self) -> bool:\n return self._instance_type == InstanceType.PERSISTENT\n\n @property\n def is_ephemeral(self) -> bool:\n return self._instance_type == InstanceType.EPHEMERAL\n\n def get_ref(self) -> InstanceRef:\n if self._ref:\n return self._ref\n\n 
check.failed(\n "Attempted to prepare an ineligible DagsterInstance ({inst_type}) for cross "\n "process communication.{dagster_home_msg}".format(\n inst_type=self._instance_type,\n dagster_home_msg=(\n "\\nDAGSTER_HOME environment variable is not set, set it to "\n "a directory on the filesystem for dagster to use for storage and cross "\n "process coordination."\n if os.getenv("DAGSTER_HOME") is None\n else ""\n ),\n )\n )\n\n @property\n def root_directory(self) -> str:\n return self._local_artifact_storage.base_dir\n\n def _info(self, component: object) -> Union[str, Mapping[Any, Any]]:\n # ConfigurableClass may not have inst_data if it's a direct instantiation\n # which happens for ephemeral instances\n if isinstance(component, ConfigurableClass) and component.inst_data:\n return component.inst_data.info_dict()\n if type(component) is dict:\n return component\n return component.__class__.__name__\n\n def _info_str_for_component(self, component_name: str, component: object) -> str:\n return yaml.dump(\n {component_name: self._info(component)}, default_flow_style=False, sort_keys=False\n )\n\n def info_dict(self) -> Mapping[str, object]:\n settings: Mapping[str, object] = self._settings if self._settings else {}\n\n ret = {\n "local_artifact_storage": self._info(self._local_artifact_storage),\n "run_storage": self._info(self._run_storage),\n "event_log_storage": self._info(self._event_storage),\n "compute_logs": self._info(self._compute_log_manager),\n "schedule_storage": self._info(self._schedule_storage),\n "scheduler": self._info(self._scheduler),\n "run_coordinator": self._info(self._run_coordinator),\n "run_launcher": self._info(self.run_launcher),\n }\n ret.update(\n {\n settings_key: self._info(settings_value)\n for settings_key, settings_value in settings.items()\n }\n )\n\n return ret\n\n def info_str(self) -> str:\n return yaml.dump(self.info_dict(), default_flow_style=False, sort_keys=False)\n\n def schema_str(self) -> str:\n def 
_schema_dict(alembic_version: "AlembicVersion") -> Optional[Mapping[str, object]]:\n if not alembic_version:\n return None\n db_revision, head_revision = alembic_version\n return {\n "current": db_revision,\n "latest": head_revision,\n }\n\n return yaml.dump(\n {\n "schema": {\n "event_log_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n "run_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n "schedule_storage": _schema_dict(self._event_storage.alembic_version()), # type: ignore # (possible none)\n }\n },\n default_flow_style=False,\n sort_keys=False,\n )\n\n @property\n def run_storage(self) -> "RunStorage":\n return self._run_storage\n\n @property\n def event_log_storage(self) -> "EventLogStorage":\n return self._event_storage\n\n @property\n def daemon_cursor_storage(self) -> "DaemonCursorStorage":\n return self._run_storage\n\n # schedule storage\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n return self._schedule_storage\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n return self._scheduler\n\n @property\n def scheduler_class(self) -> Optional[str]:\n return self.scheduler.__class__.__name__ if self.scheduler else None\n\n # run coordinator\n\n @property\n def run_coordinator(self) -> "RunCoordinator":\n # Lazily load in case the run coordinator requires dependencies that are not available\n # everywhere that loads the instance\n if not self._run_coordinator:\n check.invariant(\n self._ref, "Run coordinator not provided, and no instance ref available"\n )\n run_coordinator = cast(InstanceRef, self._ref).run_coordinator\n check.invariant(run_coordinator, "Run coordinator not configured in instance ref")\n self._run_coordinator = cast("RunCoordinator", run_coordinator)\n self._run_coordinator.register_instance(self)\n return self._run_coordinator\n\n # run launcher\n\n @property\n def run_launcher(self) -> "RunLauncher":\n 
# Lazily load in case the launcher requires dependencies that are not available everywhere\n # that loads the instance (e.g. The EcsRunLauncher requires boto3)\n if not self._run_launcher:\n check.invariant(self._ref, "Run launcher not provided, and no instance ref available")\n launcher = cast(InstanceRef, self._ref).run_launcher\n check.invariant(launcher, "Run launcher not configured in instance ref")\n self._run_launcher = cast("RunLauncher", launcher)\n self._run_launcher.register_instance(self)\n return self._run_launcher\n\n # compute logs\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n if not self._compute_log_manager:\n check.invariant(\n self._ref, "Compute log manager not provided, and no instance ref available"\n )\n compute_log_manager = cast(InstanceRef, self._ref).compute_log_manager\n check.invariant(\n compute_log_manager, "Compute log manager not configured in instance ref"\n )\n self._compute_log_manager = cast("ComputeLogManager", compute_log_manager)\n self._compute_log_manager.register_instance(self)\n return self._compute_log_manager\n\n def get_settings(self, settings_key: str) -> Any:\n check.str_param(settings_key, "settings_key")\n if self._settings and settings_key in self._settings:\n return self._settings.get(settings_key)\n return {}\n\n @property\n def telemetry_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n dagster_telemetry_enabled_default = True\n\n telemetry_settings = self.get_settings("telemetry")\n\n if not telemetry_settings:\n return dagster_telemetry_enabled_default\n\n if "enabled" in telemetry_settings:\n return telemetry_settings["enabled"]\n else:\n return dagster_telemetry_enabled_default\n\n @property\n def nux_enabled(self) -> bool:\n if self.is_ephemeral:\n return False\n\n nux_enabled_by_default = True\n\n nux_settings = self.get_settings("nux")\n if not nux_settings:\n return nux_enabled_by_default\n\n if "enabled" in nux_settings:\n return nux_settings["enabled"]\n 
else:\n return nux_enabled_by_default\n\n # run monitoring\n\n @property\n def run_monitoring_enabled(self) -> bool:\n return self._run_monitoring_enabled\n\n @property\n def run_monitoring_settings(self) -> Any:\n return self.get_settings("run_monitoring")\n\n @property\n def run_monitoring_start_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("start_timeout_seconds", 180)\n\n @property\n def run_monitoring_cancel_timeout_seconds(self) -> int:\n return self.run_monitoring_settings.get("cancel_timeout_seconds", 180)\n\n @property\n def code_server_settings(self) -> Any:\n return self.get_settings("code_servers")\n\n @property\n def code_server_process_startup_timeout(self) -> int:\n return self.code_server_settings.get(\n "local_startup_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT\n )\n\n @property\n def code_server_reload_timeout(self) -> int:\n return self.code_server_settings.get(\n "reload_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT\n )\n\n @property\n def wait_for_local_code_server_processes_on_shutdown(self) -> bool:\n return self.code_server_settings.get("wait_for_local_processes_on_shutdown", False)\n\n @property\n def run_monitoring_max_resume_run_attempts(self) -> int:\n return self.run_monitoring_settings.get("max_resume_run_attempts", 0)\n\n @property\n def run_monitoring_poll_interval_seconds(self) -> int:\n return self.run_monitoring_settings.get("poll_interval_seconds", 120)\n\n @property\n def cancellation_thread_poll_interval_seconds(self) -> int:\n return self.get_settings("run_monitoring").get(\n "cancellation_thread_poll_interval_seconds", 10\n )\n\n @property\n def run_retries_enabled(self) -> bool:\n return self.get_settings("run_retries").get("enabled", False)\n\n @property\n def run_retries_max_retries(self) -> int:\n return self.get_settings("run_retries").get("max_retries")\n\n @property\n def auto_materialize_enabled(self) -> bool:\n return self.get_settings("auto_materialize").get("enabled", 
True)\n\n @property\n def auto_materialize_minimum_interval_seconds(self) -> int:\n return self.get_settings("auto_materialize").get("minimum_interval_seconds")\n\n @property\n def auto_materialize_run_tags(self) -> Dict[str, str]:\n return self.get_settings("auto_materialize").get("run_tags", {})\n\n @property\n def auto_materialize_respect_materialization_data_versions(self) -> bool:\n return self.get_settings("auto_materialize").get(\n "respect_materialization_data_versions", False\n )\n\n # python logs\n\n @property\n def managed_python_loggers(self) -> Sequence[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n loggers: Sequence[str] = python_log_settings.get("managed_python_loggers", [])\n return loggers\n\n @property\n def python_log_level(self) -> Optional[str]:\n python_log_settings = self.get_settings("python_logs") or {}\n return python_log_settings.get("python_log_level")\n\n def upgrade(self, print_fn: Optional[PrintFn] = None) -> None:\n from dagster._core.storage.migration.utils import upgrading_instance\n\n with upgrading_instance(self):\n if print_fn:\n print_fn("Updating run storage...")\n self._run_storage.upgrade() # type: ignore # (unknown method on run storage)\n self._run_storage.migrate(print_fn)\n\n if print_fn:\n print_fn("Updating event storage...")\n self._event_storage.upgrade()\n self._event_storage.reindex_assets(print_fn=print_fn)\n\n if print_fn:\n print_fn("Updating schedule storage...")\n self._schedule_storage.upgrade() # type: ignore # (possible none)\n self._schedule_storage.migrate(print_fn) # type: ignore # (possible none)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n if self._schedule_storage:\n self._schedule_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n self._run_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n 
self._event_storage.optimize_for_webserver(\n statement_timeout=statement_timeout, pool_recycle=pool_recycle\n )\n\n def reindex(self, print_fn: PrintFn = lambda _: None) -> None:\n print_fn("Checking for reindexing...")\n self._event_storage.reindex_events(print_fn)\n self._event_storage.reindex_assets(print_fn)\n self._run_storage.optimize(print_fn)\n self._schedule_storage.optimize(print_fn) # type: ignore # (possible none)\n print_fn("Done.")\n\n def dispose(self) -> None:\n self._local_artifact_storage.dispose()\n self._run_storage.dispose()\n if self._run_coordinator:\n self._run_coordinator.dispose()\n if self._run_launcher:\n self._run_launcher.dispose()\n self._event_storage.dispose()\n if self._compute_log_manager:\n self._compute_log_manager.dispose()\n if self._secrets_loader:\n self._secrets_loader.dispose()\n\n if self in DagsterInstance._TEMP_DIRS:\n DagsterInstance._TEMP_DIRS[self].cleanup()\n del DagsterInstance._TEMP_DIRS[self]\n\n # run storage\n
[docs] @public\n def get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:\n """Get a :py:class:`DagsterRun` matching the provided `run_id`.\n\n Args:\n run_id (str): The id of the run to retrieve.\n\n Returns:\n Optional[DagsterRun]: The run corresponding to the given id. If no run matching the id\n is found, return `None`.\n """\n record = self.get_run_record_by_id(run_id)\n if record is None:\n return None\n return record.dagster_run
\n\n
[docs] @public\n @traced\n def get_run_record_by_id(self, run_id: str) -> Optional[RunRecord]:\n """Get a :py:class:`RunRecord` matching the provided `run_id`.\n\n Args:\n run_id (str): The id of the run record to retrieve.\n\n Returns:\n Optional[RunRecord]: The run record corresponding to the given id. If no run matching\n the id is found, return `None`.\n """\n records = self._run_storage.get_run_records(RunsFilter(run_ids=[run_id]))\n if not records:\n return None\n return records[0]
\n\n @traced\n def get_job_snapshot(self, snapshot_id: str) -> "JobSnapshot":\n return self._run_storage.get_job_snapshot(snapshot_id)\n\n @traced\n def has_job_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def has_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_snapshot(snapshot_id)\n\n @traced\n def get_historical_job(self, snapshot_id: str) -> "HistoricalJob":\n from dagster._core.host_representation import HistoricalJob\n\n snapshot = self._run_storage.get_job_snapshot(snapshot_id)\n parent_snapshot = (\n self._run_storage.get_job_snapshot(snapshot.lineage_snapshot.parent_snapshot_id)\n if snapshot.lineage_snapshot\n else None\n )\n return HistoricalJob(snapshot, snapshot_id, parent_snapshot)\n\n @traced\n def has_historical_job(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def get_execution_plan_snapshot(self, snapshot_id: str) -> "ExecutionPlanSnapshot":\n return self._run_storage.get_execution_plan_snapshot(snapshot_id)\n\n @traced\n def get_run_stats(self, run_id: str) -> DagsterRunStatsSnapshot:\n return self._event_storage.get_stats_for_run(run_id)\n\n @traced\n def get_run_step_stats(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence["RunStepKeyStatsSnapshot"]:\n return self._event_storage.get_step_stats_for_run(run_id, step_keys)\n\n @traced\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n return self._run_storage.get_run_tags(\n tag_keys=tag_keys, value_prefix=value_prefix, limit=limit\n )\n\n @traced\n def get_run_tag_keys(self) -> Sequence[str]:\n return self._run_storage.get_run_tag_keys()\n\n @traced\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n return self._run_storage.get_run_group(run_id)\n\n def 
create_run_for_job(\n self,\n job_def: "JobDefinition",\n execution_plan: Optional["ExecutionPlan"] = None,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n status: Optional[Union[DagsterRunStatus, str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n op_selection: Optional[Sequence[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n repository_load_data: Optional["RepositoryLoadData"] = None,\n ) -> DagsterRun:\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.execution.api import create_execution_plan\n from dagster._core.execution.plan.plan import ExecutionPlan\n from dagster._core.snap import snapshot_from_execution_plan\n\n check.inst_param(job_def, "pipeline_def", JobDefinition)\n check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n # note that op_selection is required to execute the solid subset, which is the\n # frozenset version of the previous solid_subset.\n # op_selection is not required and will not be converted to op_selection here.\n # i.e. 
this function doesn't handle solid queries.\n # op_selection is only used to pass the user queries further down.\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_list_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # op_selection never provided\n if asset_selection or op_selection:\n # for cases when `create_run_for_pipeline` is directly called\n job_def = job_def.get_subset(\n asset_selection=asset_selection,\n op_selection=op_selection,\n )\n step_keys_to_execute = None\n\n if execution_plan:\n step_keys_to_execute = execution_plan.step_keys_to_execute\n\n else:\n execution_plan = create_execution_plan(\n job=job_def,\n run_config=run_config,\n instance_ref=self.get_ref() if self.is_persistent else None,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n return self.create_run(\n job_name=job_def.name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=None,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus(status) if status else None,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_def.get_job_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan,\n job_def.get_job_snapshot_id(),\n ),\n parent_job_snapshot=job_def.get_parent_job_snapshot(),\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n def _construct_run_with_snapshots(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n status: Optional[DagsterRunStatus],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: 
Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]] = None,\n op_selection: Optional[Sequence[str]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # https://github.com/dagster-io/dagster/issues/2403\n if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:\n if AIRFLOW_EXECUTION_DATE_STR not in tags:\n tags = {\n **tags,\n AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat(),\n }\n\n check.invariant(\n not (not job_snapshot and execution_plan_snapshot),\n "It is illegal to have an execution plan snapshot and not have a pipeline snapshot."\n " It is possible to have no execution plan snapshot since we persist runs that do"\n " not successfully compile execution plans in the scheduled case.",\n )\n\n job_snapshot_id = (\n self._ensure_persisted_job_snapshot(job_snapshot, parent_job_snapshot)\n if job_snapshot\n else None\n )\n\n execution_plan_snapshot_id = (\n self._ensure_persisted_execution_plan_snapshot(\n execution_plan_snapshot, job_snapshot_id, step_keys_to_execute\n )\n if execution_plan_snapshot and job_snapshot_id\n else None\n )\n\n return DagsterRun(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot_id=job_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n has_repository_load_data=execution_plan_snapshot is not None\n and 
execution_plan_snapshot.repository_load_data is not None,\n )\n\n def _ensure_persisted_job_snapshot(\n self,\n job_snapshot: "JobSnapshot",\n parent_job_snapshot: "Optional[JobSnapshot]",\n ) -> str:\n from dagster._core.snap import JobSnapshot, create_job_snapshot_id\n\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if job_snapshot.lineage_snapshot:\n if not self._run_storage.has_job_snapshot(\n job_snapshot.lineage_snapshot.parent_snapshot_id\n ):\n check.invariant(\n create_job_snapshot_id(parent_job_snapshot) # type: ignore # (possible none)\n == job_snapshot.lineage_snapshot.parent_snapshot_id,\n "Parent pipeline snapshot id out of sync with passed parent pipeline snapshot",\n )\n\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(\n parent_job_snapshot # type: ignore # (possible none)\n )\n check.invariant(\n job_snapshot.lineage_snapshot.parent_snapshot_id == returned_job_snapshot_id\n )\n\n job_snapshot_id = create_job_snapshot_id(job_snapshot)\n if not self._run_storage.has_job_snapshot(job_snapshot_id):\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(job_snapshot)\n check.invariant(job_snapshot_id == returned_job_snapshot_id)\n\n return job_snapshot_id\n\n def _ensure_persisted_execution_plan_snapshot(\n self,\n execution_plan_snapshot: "ExecutionPlanSnapshot",\n job_snapshot_id: str,\n step_keys_to_execute: Optional[Sequence[str]],\n ) -> str:\n from dagster._core.snap.execution_plan_snapshot import (\n ExecutionPlanSnapshot,\n create_execution_plan_snapshot_id,\n )\n\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.str_param(job_snapshot_id, "job_snapshot_id")\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n check.invariant(\n execution_plan_snapshot.job_snapshot_id == job_snapshot_id,\n "Snapshot mismatch: Snapshot ID in 
execution plan snapshot is "\n f'"{execution_plan_snapshot.job_snapshot_id}" and snapshot_id created in memory is '\n f'"{job_snapshot_id}"',\n )\n\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):\n returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(\n execution_plan_snapshot\n )\n\n check.invariant(execution_plan_snapshot_id == returned_execution_plan_snapshot_id)\n\n return execution_plan_snapshot_id\n\n def _log_asset_planned_events(\n self, dagster_run: DagsterRun, execution_plan_snapshot: "ExecutionPlanSnapshot"\n ) -> None:\n from dagster._core.events import (\n AssetMaterializationPlannedData,\n DagsterEvent,\n DagsterEventType,\n )\n\n job_name = dagster_run.job_name\n\n for step in execution_plan_snapshot.steps:\n if step.key in execution_plan_snapshot.step_keys_to_execute:\n for output in step.outputs:\n asset_key = check.not_none(output.properties).asset_key\n if asset_key:\n # Logs and stores asset_materialization_planned event\n partition_tag = dagster_run.tags.get(PARTITION_NAME_TAG)\n partition_range_start, partition_range_end = dagster_run.tags.get(\n ASSET_PARTITION_RANGE_START_TAG\n ), dagster_run.tags.get(ASSET_PARTITION_RANGE_END_TAG)\n\n if partition_tag and (partition_range_start or partition_range_end):\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set along with"\n f" {PARTITION_NAME_TAG}"\n )\n\n if partition_range_start or partition_range_end:\n if not partition_range_start or not partition_range_end:\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set without the other"\n )\n\n # TODO: resolve which partitions are in the range, and emit an event for each\n\n partition = (\n partition_tag\n if 
check.not_none(output.properties).is_asset_partitioned\n else None\n )\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to materialize asset {asset_key.to_string()}"\n ),\n event_specific_data=AssetMaterializationPlannedData(\n asset_key, partition=partition\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n if check.not_none(output.properties).asset_check_key:\n asset_check_key = check.not_none(\n check.not_none(output.properties).asset_check_key\n )\n target_asset_key = asset_check_key.asset_key\n check_name = asset_check_key.name\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to execute asset check {check_name} on"\n f" asset {target_asset_key.to_string()}"\n ),\n event_specific_data=AssetCheckEvaluationPlanned(\n target_asset_key,\n check_name=check_name,\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n def create_run(\n self,\n *,\n job_name: str,\n run_id: Optional[str],\n run_config: Optional[Mapping[str, object]],\n status: Optional[DagsterRunStatus],\n tags: Optional[Mapping[str, Any]],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n step_keys_to_execute: Optional[Sequence[str]],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n job_snapshot: Optional["JobSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]],\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]],\n resolved_op_selection: Optional[AbstractSet[str]],\n op_selection: Optional[Sequence[str]],\n external_job_origin: Optional["ExternalJobOrigin"],\n job_code_origin: Optional[JobPythonOrigin],\n ) -> DagsterRun:\n from dagster._core.definitions.asset_check_spec import AssetCheckKey\n 
from dagster._core.definitions.utils import validate_tags\n from dagster._core.host_representation.origin import ExternalJobOrigin\n from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n\n check.str_param(job_name, "job_name")\n check.opt_str_param(\n run_id, "run_id"\n ) # will be assigned to make_new_run_id() lower in callstack\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.opt_inst_param(status, "status", DagsterRunStatus)\n check.opt_mapping_param(tags, "tags", key_type=str)\n\n validated_tags = validate_tags(tags)\n\n check.opt_str_param(root_run_id, "root_run_id")\n check.opt_str_param(parent_run_id, "parent_run_id")\n\n # If step_keys_to_execute is None, then everything is executed. In some cases callers\n # are still exploding and sending the full list of step keys even though that is\n # unnecessary.\n\n check.opt_sequence_param(step_keys_to_execute, "step_keys_to_execute")\n check.opt_inst_param(\n execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot\n )\n\n if root_run_id or parent_run_id:\n check.invariant(\n root_run_id and parent_run_id,\n "If root_run_id or parent_run_id is passed, this is a re-execution scenario and"\n " root_run_id and parent_run_id must both be passed.",\n )\n\n # The job_snapshot should always be set in production scenarios. In tests\n # we have sometimes omitted it out of convenience.\n\n check.opt_inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if parent_job_snapshot:\n check.invariant(\n job_snapshot,\n "If parent_job_snapshot is set, job_snapshot should also be.",\n )\n\n # op_selection is a sequence of selection queries assigned by the user.\n # *Most* callers expand the op_selection into an explicit set of\n # resolved_op_selection via accessing external_job.resolved_op_selection\n # but not all do. 
Some (launch execution mutation in graphql and backfill run\n # creation, for example) actually pass the solid *selection* into the\n # resolved_op_selection parameter, but just as a frozen set, rather than\n # fully resolving the selection, as the daemon launchers do. Given the\n # state of callers we just check to ensure that the arguments are well-formed.\n #\n # asset_selection adds another dimension to this lovely dance. op_selection\n # and asset_selection are mutually exclusive and should never both be set.\n # This invariant is checked in a sporadic fashion around\n # the codebase, but is never enforced in a typed fashion.\n #\n # Additionally, the way that callsites currently behave *if* asset selection\n # is set (i.e., not None) then *neither* op_selection *nor*\n # resolved_op_selection is passed. In the asset selection case resolving\n # the set of assets into the canonical resolved_op_selection is done in\n # the user process, and the exact resolution is never persisted in the run.\n # We are asserting that invariant here to maintain that behavior.\n #\n # Finally, asset_check_selection can be passed along with asset_selection. It\n # is mutually exclusive with op_selection and resolved_op_selection. A `None`\n # value will include any asset checks that target selected assets. 
An empty set\n # will include no asset checks.\n\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n check.opt_set_param(asset_check_selection, "asset_check_selection", of_type=AssetCheckKey)\n\n if asset_selection is not None or asset_check_selection is not None:\n check.invariant(\n op_selection is None,\n "Cannot pass op_selection with either of asset_selection or asset_check_selection",\n )\n\n check.invariant(\n resolved_op_selection is None,\n "Cannot pass resolved_op_selection with either of asset_selection or"\n " asset_check_selection",\n )\n\n # The "python origin" arguments exist so a job can be reconstructed in memory\n # after a DagsterRun has been fetched from the database.\n #\n # There are cases (notably in _logged_execute_job with Reconstructable jobs)\n # where job_code_origin and is not. In some cloud test cases only\n # external_job_origin is passed But they are almost always passed together.\n # If these are not set the created run will never be able to be relaunched from\n # the information just in the run or in another process.\n\n check.opt_inst_param(external_job_origin, "external_job_origin", ExternalJobOrigin)\n check.opt_inst_param(job_code_origin, "job_code_origin", JobPythonOrigin)\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id, # type: ignore # (possible none)\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=validated_tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n 
external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n dagster_run = self._run_storage.add_run(dagster_run)\n\n if execution_plan_snapshot:\n self._log_asset_planned_events(dagster_run, execution_plan_snapshot)\n\n return dagster_run\n\n def create_reexecuted_run(\n self,\n *,\n parent_run: DagsterRun,\n code_location: "CodeLocation",\n external_job: "ExternalJob",\n strategy: "ReexecutionStrategy",\n extra_tags: Optional[Mapping[str, Any]] = None,\n run_config: Optional[Mapping[str, Any]] = None,\n use_parent_run_tags: bool = False,\n ) -> DagsterRun:\n from dagster._core.execution.plan.resume_retry import (\n ReexecutionStrategy,\n )\n from dagster._core.execution.plan.state import KnownExecutionState\n from dagster._core.host_representation import CodeLocation, ExternalJob\n\n check.inst_param(parent_run, "parent_run", DagsterRun)\n check.inst_param(code_location, "code_location", CodeLocation)\n check.inst_param(external_job, "external_job", ExternalJob)\n check.inst_param(strategy, "strategy", ReexecutionStrategy)\n check.opt_mapping_param(extra_tags, "extra_tags", key_type=str)\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.bool_param(use_parent_run_tags, "use_parent_run_tags")\n\n root_run_id = parent_run.root_run_id or parent_run.run_id\n parent_run_id = parent_run.run_id\n\n tags = merge_dicts(\n external_job.tags,\n (\n # these can differ from external_job.tags if tags were added at launch time\n parent_run.tags\n if use_parent_run_tags\n else {}\n ),\n extra_tags or {},\n {\n PARENT_RUN_ID_TAG: parent_run_id,\n ROOT_RUN_ID_TAG: root_run_id,\n },\n )\n\n run_config = run_config if run_config is not None else parent_run.run_config\n\n if strategy == ReexecutionStrategy.FROM_FAILURE:\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n\n (\n step_keys_to_execute,\n known_state,\n ) = 
KnownExecutionState.build_resume_retry_reexecution(\n self,\n parent_run=parent_run,\n )\n tags[RESUME_RETRY_TAG] = "true"\n elif strategy == ReexecutionStrategy.ALL_STEPS:\n step_keys_to_execute = None\n known_state = None\n else:\n raise DagsterInvariantViolationError(f"Unknown reexecution strategy: {strategy}")\n\n external_execution_plan = code_location.get_external_execution_plan(\n external_job,\n run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance=self,\n )\n\n return self.create_run(\n job_name=parent_run.job_name,\n run_id=None,\n run_config=run_config,\n resolved_op_selection=parent_run.resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.NOT_STARTED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=external_job.job_snapshot,\n execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,\n parent_job_snapshot=external_job.parent_job_snapshot,\n op_selection=parent_run.op_selection,\n asset_selection=parent_run.asset_selection,\n asset_check_selection=parent_run.asset_check_selection,\n external_job_origin=external_job.get_external_origin(),\n job_code_origin=external_job.get_python_origin(),\n )\n\n def register_managed_run(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n op_selection: Optional[Sequence[str]] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # The usage of this method is limited to dagster-airflow, specifically in Dagster\n # Operators that are executed in Airflow. 
Because a common workflow in Airflow is to\n # retry dags from arbitrary tasks, we need any node to be capable of creating a\n # DagsterRun.\n #\n # The try-except DagsterRunAlreadyExists block handles the race when multiple "root" tasks\n # simultaneously execute self._run_storage.add_run(dagster_run). When this happens, only\n # one task succeeds in creating the run, while the others get DagsterRunAlreadyExists\n # error; at this point, the failed tasks try again to fetch the existing run.\n # https://github.com/dagster-io/dagster/issues/2412\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.MANAGED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n job_code_origin=job_code_origin,\n )\n\n def get_run() -> DagsterRun:\n candidate_run = self.get_run_by_id(dagster_run.run_id)\n\n field_diff = _check_run_equality(dagster_run, candidate_run) # type: ignore # (possible none)\n\n if field_diff:\n raise DagsterRunConflict(\n "Found conflicting existing run with same id {run_id}. 
Runs differ in:"\n "\\n{field_diff}".format(\n run_id=dagster_run.run_id,\n field_diff=_format_field_diff(field_diff),\n ),\n )\n return candidate_run # type: ignore # (possible none)\n\n if self.has_run(dagster_run.run_id):\n return get_run()\n\n try:\n return self._run_storage.add_run(dagster_run)\n except DagsterRunAlreadyExists:\n return get_run()\n\n @traced\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n return self._run_storage.add_run(dagster_run)\n\n @traced\n def add_snapshot(\n self,\n snapshot: Union["JobSnapshot", "ExecutionPlanSnapshot"],\n snapshot_id: Optional[str] = None,\n ) -> None:\n return self._run_storage.add_snapshot(snapshot, snapshot_id)\n\n @traced\n def handle_run_event(self, run_id: str, event: "DagsterEvent") -> None:\n return self._run_storage.handle_run_event(run_id, event)\n\n @traced\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n return self._run_storage.add_run_tags(run_id, new_tags)\n\n @traced\n def has_run(self, run_id: str) -> bool:\n return self._run_storage.has_run(run_id)\n\n @traced\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n return self._run_storage.get_runs(filters, cursor, limit, bucket_by)\n\n @traced\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n return self._run_storage.get_run_ids(filters, cursor=cursor, limit=limit)\n\n @traced\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n return self._run_storage.get_runs_count(filters)\n\n
[docs] @public\n @traced\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n return self._run_storage.get_run_records(\n filters, limit, order_by, ascending, cursor, bucket_by\n )
\n\n @traced\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n return self._run_storage.get_run_partition_data(runs_filter)\n\n def wipe(self) -> None:\n self._run_storage.wipe()\n self._event_storage.wipe()\n\n
[docs] @public\n @traced\n def delete_run(self, run_id: str) -> None:\n """Delete a run and all events generated by that from storage.\n\n Args:\n run_id (str): The id of the run to delete.\n """\n self._run_storage.delete_run(run_id)\n self._event_storage.delete_events(run_id)
\n\n # event storage\n @traced\n def logs_after(\n self,\n run_id: str,\n cursor: Optional[int] = None,\n of_type: Optional["DagsterEventType"] = None,\n limit: Optional[int] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(\n run_id,\n cursor=cursor,\n of_type=of_type,\n limit=limit,\n )\n\n @traced\n def all_logs(\n self,\n run_id: str,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(run_id, of_type=of_type)\n\n @traced\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> "EventLogConnection":\n return self._event_storage.get_records_for_run(run_id, cursor, of_type, limit, ascending)\n\n def watch_event_logs(self, run_id: str, cursor: Optional[str], cb: "EventHandlerFn") -> None:\n return self._event_storage.watch(run_id, cursor, cb)\n\n def end_watch_event_logs(self, run_id: str, cb: "EventHandlerFn") -> None:\n return self._event_storage.end_watch(run_id, cb)\n\n # asset storage\n\n @traced\n def can_cache_asset_status_data(self) -> bool:\n return self._event_storage.can_cache_asset_status_data()\n\n @traced\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n self._event_storage.update_asset_cached_status_data(asset_key, cache_values)\n\n @traced\n def wipe_asset_cached_status(self, asset_keys: Sequence[AssetKey]) -> None:\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset_cached_status(asset_key)\n\n @traced\n def all_asset_keys(self) -> Sequence[AssetKey]:\n return self._event_storage.all_asset_keys()\n\n
[docs] @public\n @traced\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n """Return a filtered subset of asset keys managed by this instance.\n\n Args:\n prefix (Optional[Sequence[str]]): Return only assets having this key prefix.\n limit (Optional[int]): Maximum number of keys to return.\n cursor (Optional[str]): Cursor to use for pagination.\n\n Returns:\n Sequence[AssetKey]: List of asset keys.\n """\n return self._event_storage.get_asset_keys(prefix=prefix, limit=limit, cursor=cursor)
\n\n
[docs] @public\n @traced\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n """Return true if this instance manages the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to check.\n """\n return self._event_storage.has_asset_key(asset_key)
\n\n @traced\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n return self._event_storage.get_latest_materialization_events(asset_keys)\n\n
[docs] @public\n @traced\n def get_latest_materialization_event(self, asset_key: AssetKey) -> Optional["EventLogEntry"]:\n """Fetch the latest materialization event for the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to return materialization for.\n\n Returns:\n Optional[EventLogEntry]: The latest materialization event for the given asset\n key, or `None` if the asset has not been materialized.\n """\n return self._event_storage.get_latest_materialization_events([asset_key]).get(asset_key)
\n\n
[docs] @public\n @traced\n def get_event_records(\n self,\n event_records_filter: "EventRecordsFilter",\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence["EventLogRecord"]:\n """Return a list of event records stored in the event log storage.\n\n Args:\n event_records_filter (EventRecordsFilter): the filter by which to filter event\n records.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[EventLogRecord]: List of event log records stored in the event log storage.\n """\n return self._event_storage.get_event_records(event_records_filter, limit, ascending)
\n\n
[docs] @public\n @traced\n def get_status_by_partition(\n self,\n asset_key: AssetKey,\n partition_keys: Sequence[str],\n partitions_def: "PartitionsDefinition",\n ) -> Optional[Mapping[str, "AssetPartitionStatus"]]:\n """Get the current status of provided partition_keys for the provided asset.\n\n Args:\n asset_key (AssetKey): The asset to get per-partition status for.\n partition_keys (Sequence[str]): The partitions to get status for.\n partitions_def (PartitionsDefinition): The PartitionsDefinition of the asset to get\n per-partition status for.\n\n Returns:\n Optional[Mapping[str, AssetPartitionStatus]]: status for each partition key\n\n """\n from dagster._core.storage.partition_status_cache import (\n AssetPartitionStatus,\n AssetStatusCacheValue,\n get_and_update_asset_status_cache_value,\n )\n\n cached_value = get_and_update_asset_status_cache_value(self, asset_key, partitions_def)\n\n if isinstance(cached_value, AssetStatusCacheValue):\n materialized_partitions = cached_value.deserialize_materialized_partition_subsets(\n partitions_def\n )\n failed_partitions = cached_value.deserialize_failed_partition_subsets(partitions_def)\n in_progress_partitions = cached_value.deserialize_in_progress_partition_subsets(\n partitions_def\n )\n\n status_by_partition = {}\n\n for partition_key in partition_keys:\n if partition_key in in_progress_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.IN_PROGRESS\n elif partition_key in failed_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.FAILED\n elif partition_key in materialized_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.MATERIALIZED\n else:\n status_by_partition[partition_key] = None\n\n return status_by_partition
\n\n
[docs] @public\n @traced\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence["AssetRecord"]:\n """Return an `AssetRecord` for each of the given asset keys.\n\n Args:\n asset_keys (Optional[Sequence[AssetKey]]): List of asset keys to retrieve records for.\n\n Returns:\n Sequence[AssetRecord]: List of asset records.\n """\n return self._event_storage.get_asset_records(asset_keys)
\n\n @traced\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, searches for the event with the provided event_id.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n return self._event_storage.get_event_tags_for_asset(asset_key, filter_tags, filter_event_id)\n\n
[docs] @public\n @traced\n def wipe_assets(self, asset_keys: Sequence[AssetKey]) -> None:\n """Wipes asset event history from the event log for the given asset keys.\n\n Args:\n asset_keys (Sequence[AssetKey]): Asset keys to wipe.\n """\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset(asset_key)
\n\n @traced\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n return self._event_storage.get_materialization_count_by_partition(asset_keys, after_cursor)\n\n @traced\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n return self._event_storage.get_materialized_partitions(\n asset_key, before_cursor=before_cursor, after_cursor=after_cursor\n )\n\n @traced\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: "DagsterEventType"\n ) -> Mapping[str, int]:\n """Fetch the latest materialization storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n return self._event_storage.get_latest_storage_id_by_partition(asset_key, event_type)\n\n
[docs] @public\n @traced\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the set of partition keys for the specified :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n return self._event_storage.get_dynamic_partitions(partitions_def_name)
\n\n
[docs] @public\n @traced\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add partitions to the specified :py:class:`DynamicPartitionsDefinition` idempotently.\n Does not add any partitions that already exist.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_keys (Sequence[str]): Partition keys to add.\n """\n from dagster._core.definitions.partition import (\n raise_error_on_invalid_partition_key_substring,\n )\n\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n if isinstance(partition_keys, str):\n # Guard against a single string being passed in `partition_keys`\n raise DagsterInvalidInvocationError("partition_keys must be a sequence of strings")\n raise_error_on_invalid_partition_key_substring(partition_keys)\n return self._event_storage.add_dynamic_partitions(partitions_def_name, partition_keys)
\n\n
[docs] @public\n @traced\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified :py:class:`DynamicPartitionsDefinition`.\n If the partition does not exist, exits silently.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (str): Partition key to delete.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_key, "partition_key", of_type=str)\n self._event_storage.delete_dynamic_partition(partitions_def_name, partition_key)
\n\n
[docs] @public\n @traced\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a partition key exists for the :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (str): Partition key to check.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.str_param(partition_key, "partition_key")\n return self._event_storage.has_dynamic_partition(partitions_def_name, partition_key)
\n\n # event subscriptions\n\n def _get_yaml_python_handlers(self) -> Sequence[logging.Handler]:\n if self._settings:\n logging_config = self.get_settings("python_logs").get("dagster_handler_config", {})\n\n if logging_config:\n experimental_warning("Handling yaml-defined logging configuration")\n\n # Handlers can only be retrieved from dictConfig configuration if they are attached\n # to a logger. We add a dummy logger to the configuration that allows us to access user\n # defined handlers.\n handler_names = logging_config.get("handlers", {}).keys()\n\n dagster_dummy_logger_name = "dagster_dummy_logger"\n\n processed_dict_conf = {\n "version": 1,\n "disable_existing_loggers": False,\n "loggers": {dagster_dummy_logger_name: {"handlers": handler_names}},\n }\n processed_dict_conf.update(logging_config)\n\n logging.config.dictConfig(processed_dict_conf)\n\n dummy_logger = logging.getLogger(dagster_dummy_logger_name)\n return dummy_logger.handlers\n return []\n\n def _get_event_log_handler(self) -> _EventListenerLogHandler:\n event_log_handler = _EventListenerLogHandler(self)\n event_log_handler.setLevel(10)\n return event_log_handler\n\n def get_handlers(self) -> Sequence[logging.Handler]:\n handlers: List[logging.Handler] = [self._get_event_log_handler()]\n handlers.extend(self._get_yaml_python_handlers())\n return handlers\n\n def store_event(self, event: "EventLogEntry") -> None:\n self._event_storage.store_event(event)\n\n def handle_new_event(self, event: "EventLogEntry") -> None:\n run_id = event.run_id\n\n self._event_storage.store_event(event)\n\n if event.is_dagster_event and event.get_dagster_event().is_job_event:\n self._run_storage.handle_run_event(run_id, event.get_dagster_event())\n\n for sub in self._subscribers[run_id]:\n sub(event)\n\n def add_event_listener(self, run_id: str, cb) -> None:\n self._subscribers[run_id].append(cb)\n\n def report_engine_event(\n self,\n message: str,\n dagster_run: Optional[DagsterRun] = None,\n engine_event_data: 
Optional["EngineEventData"] = None,\n cls: Optional[Type[object]] = None,\n step_key: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n ) -> "DagsterEvent":\n """Report a EngineEvent that occurred outside of a job execution context."""\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n\n check.opt_class_param(cls, "cls")\n check.str_param(message, "message")\n check.opt_inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(run_id, "run_id")\n check.opt_str_param(job_name, "job_name")\n\n check.invariant(\n dagster_run or (job_name and run_id),\n "Must include either dagster_run or job_name and run_id",\n )\n\n run_id = run_id if run_id else dagster_run.run_id # type: ignore\n job_name = job_name if job_name else dagster_run.job_name # type: ignore\n\n engine_event_data = check.opt_inst_param(\n engine_event_data,\n "engine_event_data",\n EngineEventData,\n EngineEventData({}),\n )\n\n if cls:\n message = f"[{cls.__name__}] {message}"\n\n log_level = logging.INFO\n if engine_event_data and engine_event_data.error:\n log_level = logging.ERROR\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=job_name,\n message=message,\n event_specific_data=engine_event_data,\n step_key=step_key,\n )\n self.report_dagster_event(dagster_event, run_id=run_id, log_level=log_level)\n return dagster_event\n\n def report_dagster_event(\n self,\n dagster_event: "DagsterEvent",\n run_id: str,\n log_level: Union[str, int] = logging.INFO,\n ) -> None:\n """Takes a DagsterEvent and stores it in persistent storage for the corresponding DagsterRun."""\n from dagster._core.events.log import EventLogEntry\n\n event_record = EventLogEntry(\n user_message="",\n level=log_level,\n job_name=dagster_event.job_name,\n run_id=run_id,\n error_info=None,\n timestamp=time.time(),\n step_key=dagster_event.step_key,\n dagster_event=dagster_event,\n )\n 
self.handle_new_event(event_record)\n\n def report_run_canceling(self, run: DagsterRun, message: Optional[str] = None):\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(run, "run", DagsterRun)\n message = check.opt_str_param(\n message,\n "message",\n "Sending run termination request.",\n )\n canceling_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELING.value,\n job_name=run.job_name,\n message=message,\n )\n self.report_dagster_event(canceling_event, run_id=run.run_id)\n\n def report_run_canceled(\n self,\n dagster_run: DagsterRun,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "mesage",\n "This run has been marked as canceled from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELED.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n def report_run_failed(\n self, dagster_run: DagsterRun, message: Optional[str] = None\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "message",\n "This run has been marked as failed from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_FAILURE.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n # directories\n\n def file_manager_directory(self, run_id: str) -> str:\n return self._local_artifact_storage.file_manager_dir(run_id)\n\n def 
storage_directory(self) -> str:\n return self._local_artifact_storage.storage_dir\n\n def schedules_directory(self) -> str:\n return self._local_artifact_storage.schedules_dir\n\n # Runs coordinator\n\n def submit_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Submit a pipeline run to the coordinator.\n\n This method delegates to the ``RunCoordinator``, configured on the instance, and will\n call its implementation of ``RunCoordinator.submit_run()`` to send the run to the\n coordinator for execution. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. They also must have a non-null\n ExternalPipelineOrigin.\n\n Args:\n run_id (str): The id of the run.\n """\n from dagster._core.host_representation import ExternalJobOrigin\n from dagster._core.run_coordinator import SubmitRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to submit_run"\n )\n\n check.inst(\n run.external_job_origin,\n ExternalJobOrigin,\n "External pipeline origin must be set for submitted runs",\n )\n check.inst(\n run.job_code_origin,\n JobPythonOrigin,\n "Python origin must be set for submitted runs",\n )\n\n try:\n submitted_run = self.run_coordinator.submit_run(\n SubmitRunContext(run, workspace=workspace)\n )\n except:\n from dagster._core.events import EngineEventData\n\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return submitted_run\n\n # Run launcher\n\n def launch_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Launch a pipeline run.\n\n This method is typically called using `instance.submit_run` rather than being invoked\n directly. 
This method delegates to the ``RunLauncher``, if any, configured on the instance,\n and will call its implementation of ``RunLauncher.launch_run()`` to begin the execution of\n the specified run. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and should be in the\n ``PipelineRunStatus.NOT_STARTED`` state.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n from dagster._core.launcher import LaunchRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to launch_run"\n )\n\n launch_started_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_STARTING.value,\n job_name=run.job_name,\n )\n self.report_dagster_event(launch_started_event, run_id=run.run_id)\n\n run = self.get_run_by_id(run_id)\n if run is None:\n check.failed(f"Failed to reload run {run_id}")\n\n try:\n self.run_launcher.launch_run(LaunchRunContext(dagster_run=run, workspace=workspace))\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def resume_run(self, run_id: str, workspace: "IWorkspace", attempt_number: int) -> DagsterRun:\n """Resume a pipeline run.\n\n This method should be called on runs which have already been launched, but whose run workers\n have died.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import EngineEventData\n from dagster._core.launcher import ResumeRunContext\n from dagster._daemon.monitoring import RESUME_RUN_LOG_MESSAGE\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to resume_run"\n )\n 
if run.status not in IN_PROGRESS_RUN_STATUSES:\n raise DagsterInvariantViolationError(\n f"Run {run_id} is not in a state that can be resumed"\n )\n\n self.report_engine_event(\n RESUME_RUN_LOG_MESSAGE,\n run,\n )\n\n try:\n self.run_launcher.resume_run(\n ResumeRunContext(\n dagster_run=run,\n workspace=workspace,\n resume_attempt_number=attempt_number,\n )\n )\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def count_resume_run_attempts(self, run_id: str) -> int:\n from dagster._daemon.monitoring import count_resume_run_attempts\n\n return count_resume_run_attempts(self, run_id)\n\n def run_will_resume(self, run_id: str) -> bool:\n if not self.run_monitoring_enabled:\n return False\n return self.count_resume_run_attempts(run_id) < self.run_monitoring_max_resume_run_attempts\n\n # Scheduler\n\n def start_schedule(self, external_schedule: "ExternalSchedule") -> "InstigatorState":\n return self._scheduler.start_schedule(self, external_schedule) # type: ignore\n\n def stop_schedule(\n self,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional["ExternalSchedule"],\n ) -> "InstigatorState":\n return self._scheduler.stop_schedule( # type: ignore\n self, schedule_origin_id, schedule_selector_id, external_schedule\n )\n\n def scheduler_debug_info(self) -> "SchedulerDebugInfo":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler import SchedulerDebugInfo\n\n errors = []\n\n schedules: List[str] = []\n for schedule_state in self.all_instigator_state(instigator_type=InstigatorType.SCHEDULE):\n schedule_info: Mapping[str, Mapping[str, object]] = {\n schedule_state.instigator_name: {\n "status": schedule_state.status.value,\n "cron_schedule": schedule_state.instigator_data.cron_schedule,\n "schedule_origin_id": 
schedule_state.instigator_origin_id,\n "repository_origin_id": schedule_state.repository_origin_id,\n }\n }\n\n schedules.append(yaml.safe_dump(schedule_info, default_flow_style=False))\n\n return SchedulerDebugInfo(\n scheduler_config_info=self._info_str_for_component("Scheduler", self.scheduler),\n scheduler_info=self.scheduler.debug_info(), # type: ignore\n schedule_storage=schedules,\n errors=errors,\n )\n\n # Schedule / Sensor Storage\n\n def start_sensor(self, external_sensor: "ExternalSensor") -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(\n external_sensor.get_external_origin_id(), external_sensor.selector_id\n )\n\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n if not stored_state:\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.RUNNING,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.RUNNING))\n\n def stop_sensor(\n self,\n instigator_origin_id: str,\n selector_id: str,\n external_sensor: Optional["ExternalSensor"],\n ) -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(instigator_origin_id, selector_id)\n computed_state: InstigatorState\n if external_sensor:\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n else:\n computed_state = check.not_none(stored_state)\n\n if not computed_state.is_running:\n return computed_state\n\n 
if not stored_state:\n assert external_sensor\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.STOPPED,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.STOPPED))\n\n @traced\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional["InstigatorType"] = None,\n instigator_statuses: Optional[Set["InstigatorStatus"]] = None,\n ):\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.all_instigator_state(\n repository_origin_id, repository_selector_id, instigator_type, instigator_statuses\n )\n\n @traced\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional["InstigatorState"]:\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.get_instigator_state(origin_id, selector_id)\n\n def add_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.add_instigator_state(state)\n\n def update_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.update_instigator_state(state)\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n return self._schedule_storage.delete_instigator_state(origin_id, selector_id) # type: ignore # (possible none)\n\n @property\n def supports_batch_tick_queries(self) -> bool:\n return self._schedule_storage and self._schedule_storage.supports_batch_queries # type: ignore # (possible none)\n\n @traced\n def 
get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Mapping[str, Sequence["InstigatorTick"]]:\n if not self._schedule_storage:\n return {}\n return self._schedule_storage.get_batch_ticks(selector_ids, limit, statuses)\n\n @traced\n def get_tick(\n self, origin_id: str, selector_id: str, timestamp: float\n ) -> Optional["InstigatorTick"]:\n matches = self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=timestamp + 1, after=timestamp - 1, limit=1\n )\n return matches[0] if len(matches) else None\n\n @traced\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Sequence["InstigatorTick"]:\n return self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=before, after=after, limit=limit, statuses=statuses\n )\n\n def create_tick(self, tick_data: "TickData") -> "InstigatorTick":\n return check.not_none(self._schedule_storage).create_tick(tick_data)\n\n def update_tick(self, tick: "InstigatorTick"):\n return check.not_none(self._schedule_storage).update_tick(tick)\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> None:\n self._schedule_storage.purge_ticks(origin_id, selector_id, before, tick_statuses) # type: ignore # (possible none)\n\n def wipe_all_schedules(self) -> None:\n if self._scheduler:\n self._scheduler.wipe(self) # type: ignore # (possible none)\n\n self._schedule_storage.wipe() # type: ignore # (possible none)\n\n def logs_path_for_schedule(self, schedule_origin_id: str) -> str:\n return self._scheduler.get_logs_path(self, schedule_origin_id) # type: ignore # (possible none)\n\n def __enter__(self) -> Self:\n return 
self\n\n def __exit__(\n self,\n exception_type: Optional[Type[BaseException]],\n exception_value: Optional[BaseException],\n traceback: Optional[TracebackType],\n ) -> None:\n self.dispose()\n\n # dagster daemon\n def add_daemon_heartbeat(self, daemon_heartbeat: "DaemonHeartbeat") -> None:\n """Called on a regular interval by the daemon."""\n self._run_storage.add_daemon_heartbeat(daemon_heartbeat)\n\n def get_daemon_heartbeats(self) -> Mapping[str, "DaemonHeartbeat"]:\n """Latest heartbeats of all daemon types."""\n return self._run_storage.get_daemon_heartbeats()\n\n def wipe_daemon_heartbeats(self) -> None:\n self._run_storage.wipe_daemon_heartbeats()\n\n def get_required_daemon_types(self) -> Sequence[str]:\n from dagster._core.run_coordinator import QueuedRunCoordinator\n from dagster._core.scheduler import DagsterDaemonScheduler\n from dagster._daemon.asset_daemon import AssetDaemon\n from dagster._daemon.auto_run_reexecution.event_log_consumer import EventLogConsumerDaemon\n from dagster._daemon.daemon import (\n BackfillDaemon,\n MonitoringDaemon,\n SchedulerDaemon,\n SensorDaemon,\n )\n from dagster._daemon.run_coordinator.queued_run_coordinator_daemon import (\n QueuedRunCoordinatorDaemon,\n )\n\n if self.is_ephemeral:\n return []\n\n daemons = [SensorDaemon.daemon_type(), BackfillDaemon.daemon_type()]\n if isinstance(self.scheduler, DagsterDaemonScheduler):\n daemons.append(SchedulerDaemon.daemon_type())\n if isinstance(self.run_coordinator, QueuedRunCoordinator):\n daemons.append(QueuedRunCoordinatorDaemon.daemon_type())\n if self.run_monitoring_enabled:\n daemons.append(MonitoringDaemon.daemon_type())\n if self.run_retries_enabled:\n daemons.append(EventLogConsumerDaemon.daemon_type())\n if self.auto_materialize_enabled:\n daemons.append(AssetDaemon.daemon_type())\n return daemons\n\n def get_daemon_statuses(\n self, daemon_types: Optional[Sequence[str]] = None\n ) -> Mapping[str, "DaemonStatus"]:\n """Get the current status of the daemons. 
If daemon_types aren't provided, defaults to all\n required types. Returns a dict of daemon type to status.\n """\n from dagster._daemon.controller import get_daemon_statuses\n\n check.opt_sequence_param(daemon_types, "daemon_types", of_type=str)\n return get_daemon_statuses(\n self, daemon_types=daemon_types or self.get_required_daemon_types(), ignore_errors=True\n )\n\n @property\n def daemon_skip_heartbeats_without_errors(self) -> bool:\n # If enabled, daemon threads won't write heartbeats unless they encounter an error. This is\n # enabled in cloud, where we don't need to use heartbeats to check if daemons are running, but\n # do need to surface errors to users. This is an optimization to reduce DB writes.\n return False\n\n # backfill\n def get_backfills(\n self,\n status: Optional["BulkActionStatus"] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence["PartitionBackfill"]:\n return self._run_storage.get_backfills(status=status, cursor=cursor, limit=limit)\n\n def get_backfill(self, backfill_id: str) -> Optional["PartitionBackfill"]:\n return self._run_storage.get_backfill(backfill_id)\n\n def add_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.add_backfill(partition_backfill)\n\n def update_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.update_backfill(partition_backfill)\n\n @property\n def should_start_background_run_thread(self) -> bool:\n """Gate on an experimental feature to start a thread that monitors for if the run should be canceled."""\n return False\n\n def get_tick_retention_settings(\n self, instigator_type: "InstigatorType"\n ) -> Mapping["TickStatus", int]:\n from dagster._core.definitions.run_request import InstigatorType\n\n retention_settings = self.get_settings("retention")\n tick_settings = (\n retention_settings.get("schedule")\n if instigator_type == InstigatorType.SCHEDULE\n else retention_settings.get("sensor")\n )\n 
default_tick_settings = get_default_tick_retention_settings(instigator_type)\n return get_tick_retention_settings(tick_settings, default_tick_settings)\n\n def inject_env_vars(self, location_name: Optional[str]) -> None:\n if not self._secrets_loader:\n return\n\n new_env = self._secrets_loader.get_secrets_for_environment(location_name)\n for k, v in new_env.items():\n os.environ[k] = v\n\n def get_latest_data_version_record(\n self,\n key: AssetKey,\n is_source: Optional[bool] = None,\n partition_key: Optional[str] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Optional["EventLogRecord"]:\n from dagster._core.event_api import EventRecordsFilter\n from dagster._core.events import DagsterEventType\n\n # When we cant don't know whether the requested key corresponds to a source or regular\n # asset, we need to retrieve both the latest observation and materialization for all assets.\n # If there is a materialization, it's a regular asset and we can ignore the observation.\n\n observation: Optional[EventLogRecord] = None\n if is_source or is_source is None:\n observations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n ),\n limit=1,\n )\n observation = next(iter(observations), None)\n\n materialization: Optional[EventLogRecord] = None\n if not is_source:\n materializations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n ),\n limit=1,\n )\n materialization = next(iter(materializations), None)\n\n return materialization or observation\n\n
[docs] @public\n def get_latest_materialization_code_versions(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[str]]:\n """Returns the code version used for the latest materialization of each of the provided\n assets.\n\n Args:\n asset_keys (Iterable[AssetKey]): The asset keys to find latest materialization code\n versions for.\n\n Returns:\n Mapping[AssetKey, Optional[str]]: A dictionary with a key for each of the provided asset\n keys. The values will be None if the asset has no materializations. If an asset does\n not have a code version explicitly assigned to its definitions, but was\n materialized, Dagster assigns the run ID as its code version.\n """\n result: Dict[AssetKey, Optional[str]] = {}\n latest_materialization_events = self.get_latest_materialization_events(asset_keys)\n for asset_key in asset_keys:\n event_log_entry = latest_materialization_events.get(asset_key)\n if event_log_entry is None:\n result[asset_key] = None\n else:\n data_provenance = extract_data_provenance_from_entry(event_log_entry)\n result[asset_key] = data_provenance.code_version if data_provenance else None\n\n return result
\n\n @experimental\n def report_runless_asset_event(\n self,\n asset_event: Union["AssetMaterialization", "AssetObservation", "AssetCheckEvaluation"],\n ):\n """Record an event log entry related to assets that does not belong to a Dagster run."""\n from dagster._core.events import (\n AssetMaterialization,\n AssetObservationData,\n DagsterEvent,\n DagsterEventType,\n StepMaterializationData,\n )\n\n if isinstance(asset_event, AssetMaterialization):\n event_type_value = DagsterEventType.ASSET_MATERIALIZATION.value\n data_payload = StepMaterializationData(asset_event)\n elif isinstance(asset_event, AssetCheckEvaluation):\n event_type_value = DagsterEventType.ASSET_CHECK_EVALUATION.value\n data_payload = asset_event\n elif isinstance(asset_event, AssetObservation):\n event_type_value = DagsterEventType.ASSET_OBSERVATION.value\n data_payload = AssetObservationData(asset_event)\n else:\n raise DagsterInvariantViolationError(\n f"Received unexpected asset event type {asset_event}, expected"\n " AssetMaterialization, AssetObservation or AssetCheckEvaluation"\n )\n\n return self.report_dagster_event(\n run_id=RUNLESS_RUN_ID,\n dagster_event=DagsterEvent(\n event_type_value=event_type_value,\n event_specific_data=data_payload,\n job_name=RUNLESS_JOB_NAME,\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/instance", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "ref": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance.ref

\nimport os\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Type\n\nimport yaml\n\nimport dagster._check as check\nfrom dagster._serdes import ConfigurableClassData, class_from_code_pointer, whitelist_for_serdes\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, dagster_instance_config\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance, DagsterInstanceOverrides\n    from dagster._core.launcher.base import RunLauncher\n    from dagster._core.run_coordinator.base import RunCoordinator\n    from dagster._core.scheduler.scheduler import Scheduler\n    from dagster._core.secrets.loader import SecretsLoader\n    from dagster._core.storage.base_storage import DagsterStorage\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.event_log.base import EventLogStorage\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs.base import RunStorage\n    from dagster._core.storage.schedules.base import ScheduleStorage\n\n\ndef compute_logs_directory(base: str) -> str:\n    return os.path.join(base, "storage")\n\n\ndef _runs_directory(base: str) -> str:\n    return os.path.join(base, "history", "")\n\n\ndef _event_logs_directory(base: str) -> str:\n    return os.path.join(base, "history", "runs", "")\n\n\ndef _schedule_directory(base: str) -> str:\n    return os.path.join(base, "schedules")\n\n\ndef configurable_class_data(config_field: Mapping[str, Any]) -> ConfigurableClassData:\n    return ConfigurableClassData(\n        check.str_elem(config_field, "module"),\n        check.str_elem(config_field, "class"),\n        yaml.dump(check.opt_dict_elem(config_field, "config"), default_flow_style=False),\n    )\n\n\ndef configurable_class_data_or_default(\n    config_value: Mapping[str, Any], field_name: str, default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    return (\n        
configurable_class_data(config_value[field_name])\n        if config_value.get(field_name)\n        else default\n    )\n\n\ndef configurable_secrets_loader_data(\n    config_field: Mapping[str, Any], default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    if not config_field:\n        return default\n    elif "custom" in config_field:\n        return configurable_class_data(config_field["custom"])\n    else:\n        return None\n\n\ndef configurable_storage_data(\n    config_field: Mapping[str, Any], defaults: Mapping[str, Optional[ConfigurableClassData]]\n) -> Sequence[Optional[ConfigurableClassData]]:\n    storage_data: ConfigurableClassData\n    run_storage_data: Optional[ConfigurableClassData]\n    event_storage_data: Optional[ConfigurableClassData]\n    schedule_storage_data: Optional[ConfigurableClassData]\n\n    if not config_field:\n        storage_data = check.not_none(defaults.get("storage"))\n        run_storage_data = check.not_none(defaults.get("run_storage"))\n        event_storage_data = check.not_none(defaults.get("event_log_storage"))\n        schedule_storage_data = check.not_none(defaults.get("schedule_storage"))\n    elif "postgres" in config_field:\n        config_yaml = yaml.dump(config_field["postgres"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="DagsterPostgresStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            
module_name="dagster_postgres",\n            class_name="PostgresScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "mysql" in config_field:\n        config_yaml = yaml.dump(config_field["mysql"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="DagsterMySQLStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "sqlite" in config_field:\n        base_dir = config_field["sqlite"]["base_dir"]\n        storage_data = ConfigurableClassData(\n            "dagster._core.storage.sqlite_storage",\n            "DagsterSqliteStorage",\n            yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n        )\n\n        # Back-compat fo the legacy storage field only works if the base_dir is a string\n        # (env var doesn't work since each storage has a different value for the base_dir field)\n        if isinstance(base_dir, str):\n            run_storage_data = ConfigurableClassData(\n                "dagster._core.storage.runs",\n                "SqliteRunStorage",\n                yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            event_storage_data = ConfigurableClassData(\n                "dagster._core.storage.event_log",\n                "SqliteEventLogStorage",\n      
          yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            schedule_storage_data = ConfigurableClassData(\n                "dagster._core.storage.schedules",\n                "SqliteScheduleStorage",\n                yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n            )\n        else:\n            run_storage_data = None\n            event_storage_data = None\n            schedule_storage_data = None\n    else:\n        storage_data = configurable_class_data(config_field["custom"])\n        storage_config_yaml = yaml.dump(\n            {\n                "module_name": storage_data.module_name,\n                "class_name": storage_data.class_name,\n                "config_yaml": storage_data.config_yaml,\n            },\n            default_flow_style=False,\n        )\n        run_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyRunStorage", storage_config_yaml\n        )\n        event_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyEventLogStorage", storage_config_yaml\n        )\n        schedule_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyScheduleStorage", storage_config_yaml\n        )\n\n    return [storage_data, run_storage_data, event_storage_data, schedule_storage_data]\n\n\n
[docs]@whitelist_for_serdes\nclass InstanceRef(\n NamedTuple(\n "_InstanceRef",\n [\n ("local_artifact_storage_data", ConfigurableClassData),\n ("compute_logs_data", ConfigurableClassData),\n ("scheduler_data", Optional[ConfigurableClassData]),\n ("run_coordinator_data", Optional[ConfigurableClassData]),\n ("run_launcher_data", Optional[ConfigurableClassData]),\n ("settings", Mapping[str, object]),\n # Required for backwards compatibility, but going forward will be unused by new versions\n # of DagsterInstance, which instead will instead grab the constituent storages from the\n # unified `storage_data`, if it is populated.\n ("run_storage_data", Optional[ConfigurableClassData]),\n ("event_storage_data", Optional[ConfigurableClassData]),\n ("schedule_storage_data", Optional[ConfigurableClassData]),\n ("custom_instance_class_data", Optional[ConfigurableClassData]),\n # unified storage field\n ("storage_data", Optional[ConfigurableClassData]),\n ("secrets_loader_data", Optional[ConfigurableClassData]),\n ],\n )\n):\n """Serializable representation of a :py:class:`DagsterInstance`.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n local_artifact_storage_data: ConfigurableClassData,\n compute_logs_data: ConfigurableClassData,\n scheduler_data: Optional[ConfigurableClassData],\n run_coordinator_data: Optional[ConfigurableClassData],\n run_launcher_data: Optional[ConfigurableClassData],\n settings: Mapping[str, object],\n run_storage_data: Optional[ConfigurableClassData],\n event_storage_data: Optional[ConfigurableClassData],\n schedule_storage_data: Optional[ConfigurableClassData],\n custom_instance_class_data: Optional[ConfigurableClassData] = None,\n storage_data: Optional[ConfigurableClassData] = None,\n secrets_loader_data: Optional[ConfigurableClassData] = None,\n ):\n return super(cls, InstanceRef).__new__(\n cls,\n local_artifact_storage_data=check.inst_param(\n local_artifact_storage_data, "local_artifact_storage_data", 
ConfigurableClassData\n ),\n compute_logs_data=check.inst_param(\n compute_logs_data, "compute_logs_data", ConfigurableClassData\n ),\n scheduler_data=check.opt_inst_param(\n scheduler_data, "scheduler_data", ConfigurableClassData\n ),\n run_coordinator_data=check.opt_inst_param(\n run_coordinator_data, "run_coordinator_data", ConfigurableClassData\n ),\n run_launcher_data=check.opt_inst_param(\n run_launcher_data, "run_launcher_data", ConfigurableClassData\n ),\n settings=check.opt_mapping_param(settings, "settings", key_type=str),\n run_storage_data=check.opt_inst_param(\n run_storage_data, "run_storage_data", ConfigurableClassData\n ),\n event_storage_data=check.opt_inst_param(\n event_storage_data, "event_storage_data", ConfigurableClassData\n ),\n schedule_storage_data=check.opt_inst_param(\n schedule_storage_data, "schedule_storage_data", ConfigurableClassData\n ),\n custom_instance_class_data=check.opt_inst_param(\n custom_instance_class_data,\n "instance_class",\n ConfigurableClassData,\n ),\n storage_data=check.opt_inst_param(storage_data, "storage_data", ConfigurableClassData),\n secrets_loader_data=check.opt_inst_param(\n secrets_loader_data, "secrets_loader_data", ConfigurableClassData\n ),\n )\n\n @staticmethod\n def config_defaults(base_dir: str) -> Mapping[str, Optional[ConfigurableClassData]]:\n default_run_storage_data = ConfigurableClassData(\n "dagster._core.storage.runs",\n "SqliteRunStorage",\n yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n )\n default_event_log_storage_data = ConfigurableClassData(\n "dagster._core.storage.event_log",\n "SqliteEventLogStorage",\n yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n )\n default_schedule_storage_data = ConfigurableClassData(\n "dagster._core.storage.schedules",\n "SqliteScheduleStorage",\n yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n )\n\n return {\n "local_artifact_storage": 
ConfigurableClassData(\n "dagster._core.storage.root",\n "LocalArtifactStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "storage": ConfigurableClassData(\n "dagster._core.storage.sqlite_storage",\n "DagsterSqliteStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "compute_logs": ConfigurableClassData(\n "dagster._core.storage.local_compute_log_manager",\n "LocalComputeLogManager",\n yaml.dump({"base_dir": compute_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "scheduler": ConfigurableClassData(\n "dagster._core.scheduler",\n "DagsterDaemonScheduler",\n yaml.dump({}),\n ),\n "run_coordinator": ConfigurableClassData(\n "dagster._core.run_coordinator", "DefaultRunCoordinator", yaml.dump({})\n ),\n "run_launcher": ConfigurableClassData(\n "dagster",\n "DefaultRunLauncher",\n yaml.dump({}),\n ),\n # For back-compat, the default is actually set in the secrets_loader property above,\n # so that old clients loading new config don't try to load a class that they\n # don't recognize\n "secrets": None,\n # LEGACY DEFAULTS\n "run_storage": default_run_storage_data,\n "event_log_storage": default_event_log_storage_data,\n "schedule_storage": default_schedule_storage_data,\n }\n\n @staticmethod\n def from_dir(\n base_dir: str,\n *,\n config_dir: Optional[str] = None,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n overrides: Optional["DagsterInstanceOverrides"] = None,\n ) -> "InstanceRef":\n if config_dir is None:\n config_dir = base_dir\n\n overrides = check.opt_mapping_param(overrides, "overrides")\n config_value, custom_instance_class = dagster_instance_config(\n config_dir, config_filename=config_filename, overrides=overrides\n )\n\n if custom_instance_class:\n config_keys = set(custom_instance_class.config_schema().keys()) # type: ignore # (undefined method)\n custom_instance_class_config = {\n key: val for key, val in config_value.items() if key in config_keys\n }\n 
custom_instance_class_data = ConfigurableClassData(\n config_value["instance_class"]["module"],\n config_value["instance_class"]["class"],\n yaml.dump(custom_instance_class_config, default_flow_style=False),\n )\n defaults = custom_instance_class.config_defaults(base_dir) # type: ignore # (undefined method)\n else:\n custom_instance_class_data = None\n defaults = InstanceRef.config_defaults(base_dir)\n\n local_artifact_storage_data = configurable_class_data_or_default(\n config_value, "local_artifact_storage", defaults["local_artifact_storage"]\n )\n\n compute_logs_data = configurable_class_data_or_default(\n config_value,\n "compute_logs",\n defaults["compute_logs"],\n )\n\n if (\n config_value.get("run_storage")\n or config_value.get("event_log_storage")\n or config_value.get("schedule_storage")\n ):\n # using legacy config, specifying config for each of the constituent storages, make sure\n # to create a composite storage\n run_storage_data = configurable_class_data_or_default(\n config_value, "run_storage", defaults["run_storage"]\n )\n event_storage_data = configurable_class_data_or_default(\n config_value, "event_log_storage", defaults["event_log_storage"]\n )\n schedule_storage_data = configurable_class_data_or_default(\n config_value, "schedule_storage", defaults["schedule_storage"]\n )\n storage_data = ConfigurableClassData(\n module_name="dagster._core.storage.legacy_storage",\n class_name="CompositeStorage",\n config_yaml=yaml.dump(\n {\n "run_storage": {\n "module_name": run_storage_data.module_name, # type: ignore # (possible none)\n "class_name": run_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": run_storage_data.config_yaml, # type: ignore # (possible none)\n },\n "event_log_storage": {\n "module_name": event_storage_data.module_name, # type: ignore # (possible none)\n "class_name": event_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": event_storage_data.config_yaml, # type: ignore # (possible 
none)\n },\n "schedule_storage": {\n "module_name": schedule_storage_data.module_name, # type: ignore # (possible none)\n "class_name": schedule_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": schedule_storage_data.config_yaml, # type: ignore # (possible none)\n },\n },\n default_flow_style=False,\n ),\n )\n\n else:\n [\n storage_data,\n run_storage_data,\n event_storage_data,\n schedule_storage_data,\n ] = configurable_storage_data(\n config_value.get("storage"), defaults # type: ignore # (possible none)\n )\n\n scheduler_data = configurable_class_data_or_default(\n config_value, "scheduler", defaults["scheduler"]\n )\n\n if config_value.get("run_queue"):\n run_coordinator_data = configurable_class_data(\n {\n "module": "dagster.core.run_coordinator",\n "class": "QueuedRunCoordinator",\n "config": config_value["run_queue"],\n }\n )\n else:\n run_coordinator_data = configurable_class_data_or_default(\n config_value,\n "run_coordinator",\n defaults["run_coordinator"],\n )\n\n run_launcher_data = configurable_class_data_or_default(\n config_value,\n "run_launcher",\n defaults["run_launcher"],\n )\n\n secrets_loader_data = configurable_secrets_loader_data(\n config_value.get("secrets"), defaults["secrets"] # type: ignore # (possible none)\n )\n\n settings_keys = {\n "telemetry",\n "python_logs",\n "run_monitoring",\n "run_retries",\n "code_servers",\n "retention",\n "sensors",\n "schedules",\n "nux",\n "auto_materialize",\n }\n settings = {key: config_value.get(key) for key in settings_keys if config_value.get(key)}\n\n return InstanceRef(\n local_artifact_storage_data=local_artifact_storage_data, # type: ignore # (possible none)\n run_storage_data=run_storage_data,\n event_storage_data=event_storage_data,\n compute_logs_data=compute_logs_data, # type: ignore # (possible none)\n schedule_storage_data=schedule_storage_data,\n scheduler_data=scheduler_data,\n run_coordinator_data=run_coordinator_data,\n run_launcher_data=run_launcher_data,\n 
settings=settings,\n custom_instance_class_data=custom_instance_class_data,\n storage_data=storage_data,\n secrets_loader_data=secrets_loader_data,\n )\n\n @staticmethod\n def from_dict(instance_ref_dict):\n def value_for_ref_item(k, v):\n if v is None:\n return None\n if k == "settings":\n return v\n return ConfigurableClassData(*v)\n\n return InstanceRef(**{k: value_for_ref_item(k, v) for k, v in instance_ref_dict.items()})\n\n @property\n def local_artifact_storage(self) -> "LocalArtifactStorage":\n from dagster._core.storage.root import LocalArtifactStorage\n\n return self.local_artifact_storage_data.rehydrate(as_type=LocalArtifactStorage)\n\n @property\n def storage(self) -> Optional["DagsterStorage"]:\n from dagster._core.storage.base_storage import DagsterStorage\n\n return self.storage_data.rehydrate(as_type=DagsterStorage) if self.storage_data else None\n\n @property\n def run_storage(self) -> Optional["RunStorage"]:\n from dagster._core.storage.runs.base import RunStorage\n\n return (\n self.run_storage_data.rehydrate(as_type=RunStorage) if self.run_storage_data else None\n )\n\n @property\n def event_storage(self) -> Optional["EventLogStorage"]:\n from dagster._core.storage.event_log.base import EventLogStorage\n\n return (\n self.event_storage_data.rehydrate(as_type=EventLogStorage)\n if self.event_storage_data\n else None\n )\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n from dagster._core.storage.schedules.base import ScheduleStorage\n\n return (\n self.schedule_storage_data.rehydrate(as_type=ScheduleStorage)\n if self.schedule_storage_data\n else None\n )\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n\n return self.compute_logs_data.rehydrate(as_type=ComputeLogManager)\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n from dagster._core.scheduler.scheduler import Scheduler\n\n return 
self.scheduler_data.rehydrate(as_type=Scheduler) if self.scheduler_data else None\n\n @property\n def run_coordinator(self) -> Optional["RunCoordinator"]:\n from dagster._core.run_coordinator.base import RunCoordinator\n\n return (\n self.run_coordinator_data.rehydrate(as_type=RunCoordinator)\n if self.run_coordinator_data\n else None\n )\n\n @property\n def run_launcher(self) -> Optional["RunLauncher"]:\n from dagster._core.launcher.base import RunLauncher\n\n return (\n self.run_launcher_data.rehydrate(as_type=RunLauncher)\n if self.run_launcher_data\n else None\n )\n\n @property\n def secrets_loader(self) -> Optional["SecretsLoader"]:\n from dagster._core.secrets.loader import SecretsLoader\n\n # Defining a default here rather than in stored config to avoid\n # back-compat issues when loading the config on older versions where\n # EnvFileLoader was not defined\n return (\n self.secrets_loader_data.rehydrate(as_type=SecretsLoader)\n if self.secrets_loader_data\n else None\n )\n\n @property\n def custom_instance_class(self) -> Type["DagsterInstance"]:\n return ( # type: ignore # (ambiguous return type)\n class_from_code_pointer(\n self.custom_instance_class_data.module_name,\n self.custom_instance_class_data.class_name,\n )\n if self.custom_instance_class_data\n else None\n )\n\n @property\n def custom_instance_class_config(self) -> Mapping[str, Any]:\n return (\n self.custom_instance_class_data.config_dict if self.custom_instance_class_data else {}\n )\n\n def to_dict(self) -> Mapping[str, Any]:\n return self._asdict()
\n
", "current_page_name": "_modules/dagster/_core/instance/ref", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.instance"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance.ref"}, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance"}, "instance_for_test": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance_for_test

\nimport os\nimport sys\nimport tempfile\nfrom contextlib import ExitStack, contextmanager\nfrom typing import Any, Iterator, Mapping, Optional\n\nimport yaml\n\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .._utils.env import environ\nfrom .._utils.merger import merge_dicts\nfrom .instance import DagsterInstance\n\n\n
[docs]@contextmanager\ndef instance_for_test(\n overrides: Optional[Mapping[str, Any]] = None,\n set_dagster_home: bool = True,\n temp_dir: Optional[str] = None,\n) -> Iterator[DagsterInstance]:\n """Creates a persistent :py:class:`~dagster.DagsterInstance` available within a context manager.\n\n When a context manager is opened, if no `temp_dir` parameter is set, a new\n temporary directory will be created for the duration of the context\n manager's opening. If the `set_dagster_home` parameter is set to True\n (True by default), the `$DAGSTER_HOME` environment variable will be\n overridden to be this directory (or the directory passed in by `temp_dir`)\n for the duration of the context manager being open.\n\n Args:\n overrides (Optional[Mapping[str, Any]]):\n Config to provide to instance (config format follows that typically found in an `instance.yaml` file).\n set_dagster_home (Optional[bool]):\n If set to True, the `$DAGSTER_HOME` environment variable will be\n overridden to be the directory used by this instance for the\n duration that the context manager is open. Upon the context\n manager closing, the `$DAGSTER_HOME` variable will be re-set to the original value. (Defaults to True).\n temp_dir (Optional[str]):\n The directory to use for storing local artifacts produced by the\n instance. 
If not set, a temporary directory will be created for\n the duration of the context manager being open, and all artifacts\n will be torn down afterward.\n """\n with ExitStack() as stack:\n if not temp_dir:\n temp_dir = stack.enter_context(tempfile.TemporaryDirectory())\n\n # wait for any grpc processes that created runs during test disposal to finish,\n # since they might also be using this instance's tempdir (and to keep each test\n # isolated / avoid race conditions in newer versions of grpcio when servers are\n # shutting down and spinning up at the same time)\n instance_overrides = merge_dicts(\n {\n "telemetry": {"enabled": False},\n "code_servers": {"wait_for_local_processes_on_shutdown": True},\n },\n (overrides if overrides else {}),\n )\n\n if set_dagster_home:\n stack.enter_context(\n environ({"DAGSTER_HOME": temp_dir, "DAGSTER_DISABLE_TELEMETRY": "yes"})\n )\n\n with open(os.path.join(temp_dir, "dagster.yaml"), "w", encoding="utf8") as fd:\n yaml.dump(instance_overrides, fd, default_flow_style=False)\n\n with DagsterInstance.from_config(temp_dir) as instance:\n try:\n yield instance\n except:\n sys.stderr.write(\n "Test raised an exception, attempting to clean up instance:"\n + serializable_error_info_from_exc_info(sys.exc_info()).to_string()\n + "\\n"\n )\n raise\n finally:\n cleanup_test_instance(instance)
\n\n\ndef cleanup_test_instance(instance: DagsterInstance) -> None:\n # To avoid filesystem contention when we close the temporary directory, wait for\n # all runs to reach a terminal state, and close any subprocesses or threads\n # that might be accessing the run history DB.\n\n # Since launcher is lazy loaded, we don't need to do anyting if it's None\n if instance._run_launcher: # noqa: SLF001\n instance._run_launcher.join() # noqa: SLF001\n
", "current_page_name": "_modules/dagster/_core/instance_for_test", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance_for_test"}, "launcher": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.base

\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.workspace.workspace import IWorkspace\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass LaunchRunContext(NamedTuple):\n    """Context available within a run launcher's launch_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\nclass ResumeRunContext(NamedTuple):\n    """Context available within a run launcher's resume_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n    resume_attempt_number: Optional[int] = None\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\n@whitelist_for_serdes\nclass WorkerStatus(Enum):\n    RUNNING = "RUNNING"\n    NOT_FOUND = "NOT_FOUND"\n    FAILED = "FAILED"\n    SUCCESS = "SUCCESS"\n    UNKNOWN = "UNKNOWN"\n\n\nclass CheckRunHealthResult(NamedTuple):\n    """Result of a check_run_worker_health call."""\n\n    status: WorkerStatus\n    msg: Optional[str] = None\n    transient: Optional[bool] = None\n    run_worker_id: Optional[str] = None  # Identifier for a particular run worker\n\n    def __str__(self) -> str:\n        return f"{self.status.value}: '{self.msg}'"\n\n\n
[docs]class RunLauncher(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n @abstractmethod\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run.\n\n This method should begin the execution of the specified run, and may emit engine events.\n Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.STARTING`` state. Typically, this method will\n not be invoked directly, but should be invoked through ``DagsterInstance.launch_run()``.\n\n Args:\n context (LaunchRunContext): information about the launch - every run launcher\n will need the PipelineRun, and some run launchers may need information from the\n IWorkspace from which the run was launched.\n """\n\n @abstractmethod\n def terminate(self, run_id: str) -> bool:\n """Terminates a process.\n\n Returns False is the process was already terminated. Returns true if\n the process was alive and was successfully terminated\n """\n\n def dispose(self) -> None:\n """Do any resource cleanup that should happen when the DagsterInstance is\n cleaning itself up.\n """\n\n def join(self, timeout: int = 30) -> None:\n pass\n\n @property\n def supports_check_run_worker_health(self) -> bool:\n """Whether the run launcher supports check_run_worker_health."""\n return False\n\n def check_run_worker_health(self, run: DagsterRun) -> CheckRunHealthResult:\n raise NotImplementedError(\n "This run launcher does not support run monitoring. Please disable it on your instance."\n )\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n return None\n\n @property\n def supports_resume_run(self) -> bool:\n """Whether the run launcher supports resume_run."""\n return False\n\n def resume_run(self, context: ResumeRunContext) -> None:\n raise NotImplementedError(\n "This run launcher does not support resuming runs. If using "\n "run monitoring, set max_resume_run_attempts to 0."\n )
\n
", "current_page_name": "_modules/dagster/_core/launcher/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.base"}, "default_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.default_run_launcher

\nimport time\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, cast\n\nfrom typing_extensions import Self\n\nimport dagster._seven as seven\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterLaunchFailedError,\n    DagsterUserCodeProcessError,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import GRPC_INFO_TAG\nfrom dagster._serdes import (\n    ConfigurableClass,\n    deserialize_value,\n)\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.merger import merge_dicts\n\nfrom .base import LaunchRunContext, RunLauncher\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n    from dagster._grpc.client import DagsterGrpcClient\n\n\n# note: this class is a top level export, so we defer many imports til use for performance\n
[docs]class DefaultRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs against running GRPC servers."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = inst_data\n\n self._run_ids = set()\n\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DefaultRunLauncher(inst_data=inst_data)\n\n @staticmethod\n def launch_run_from_grpc_client(\n instance: "DagsterInstance", run: DagsterRun, grpc_client: "DagsterGrpcClient"\n ):\n # defer for perf\n from dagster._grpc.types import ExecuteExternalJobArgs, StartRunResult\n\n instance.add_run_tags(\n run.run_id,\n {\n GRPC_INFO_TAG: seven.json.dumps(\n merge_dicts(\n {"host": grpc_client.host},\n (\n {"port": grpc_client.port}\n if grpc_client.port\n else {"socket": grpc_client.socket}\n ),\n ({"use_ssl": True} if grpc_client.use_ssl else {}),\n )\n )\n },\n )\n\n res = deserialize_value(\n grpc_client.start_run(\n ExecuteExternalJobArgs(\n job_origin=run.external_job_origin, # type: ignore # (possible none)\n run_id=run.run_id,\n instance_ref=instance.get_ref(),\n )\n ),\n StartRunResult,\n )\n if not res.success:\n raise (\n DagsterLaunchFailedError(\n res.message, serializable_error_info=res.serializable_error_info\n )\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n # defer for perf\n from dagster._core.host_representation.code_location import (\n GrpcServerCodeLocation,\n )\n\n run = context.dagster_run\n\n check.inst_param(run, "run", DagsterRun)\n\n if not context.workspace:\n raise DagsterInvariantViolationError(\n "DefaultRunLauncher requires a workspace to be included in its LaunchRunContext"\n )\n\n external_job_origin = check.not_none(run.external_job_origin)\n 
code_location = context.workspace.get_code_location(\n external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n check.inst(\n code_location,\n GrpcServerCodeLocation,\n "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",\n )\n\n DefaultRunLauncher.launch_run_from_grpc_client(\n self._instance, run, cast(GrpcServerCodeLocation, code_location).client\n )\n\n self._run_ids.add(run.run_id)\n\n def _get_grpc_client_for_termination(self, run_id):\n # defer for perf\n from dagster._grpc.client import DagsterGrpcClient\n\n if not self.has_instance:\n return None\n\n run = self._instance.get_run_by_id(run_id)\n if not run or run.is_finished:\n return None\n\n tags = run.tags\n\n if GRPC_INFO_TAG not in tags:\n return None\n\n grpc_info = seven.json.loads(tags.get(GRPC_INFO_TAG))\n\n return DagsterGrpcClient(\n port=grpc_info.get("port"),\n socket=grpc_info.get("socket"),\n host=grpc_info.get("host"),\n use_ssl=bool(grpc_info.get("use_ssl", False)),\n )\n\n def terminate(self, run_id):\n # defer for perf\n from dagster._grpc.types import CancelExecutionRequest, CancelExecutionResult\n\n check.str_param(run_id, "run_id")\n if not self.has_instance:\n return False\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n client = self._get_grpc_client_for_termination(run_id)\n\n if not client:\n self._instance.report_engine_event(\n message="Unable to get grpc client to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n res = deserialize_value(\n client.cancel_execution(CancelExecutionRequest(run_id=run_id)), CancelExecutionResult\n )\n\n if res.serializable_error_info:\n raise DagsterUserCodeProcessError.from_error_info(res.serializable_error_info)\n\n return res.success\n\n def join(self, timeout=30):\n # If this hasn't been initialized at all, we can just do a noop\n if not self.has_instance:\n 
return\n\n total_time = 0\n interval = 0.01\n\n while True:\n active_run_ids = [\n run_id\n for run_id in self._run_ids\n if (\n self._instance.get_run_by_id(run_id)\n and not self._instance.get_run_by_id(run_id).is_finished\n )\n ]\n\n if len(active_run_ids) == 0:\n return\n\n if total_time >= timeout:\n raise Exception(f"Timed out waiting for these runs to finish: {active_run_ids!r}")\n\n total_time += interval\n time.sleep(interval)\n interval = interval * 2
\n
", "current_page_name": "_modules/dagster/_core/launcher/default_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.default_run_launcher"}}, "log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.log_manager

\nimport datetime\nimport logging\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nfrom typing_extensions import Protocol\n\nimport dagster._check as check\nfrom dagster._core.utils import coerce_valid_log_level, make_new_run_id\nfrom dagster._utils.log import get_dagster_logger\n\nif TYPE_CHECKING:\n    from dagster import DagsterInstance\n    from dagster._core.events import DagsterEvent\n    from dagster._core.storage.dagster_run import DagsterRun\n\nDAGSTER_META_KEY = "dagster_meta"\n\n\nclass IDagsterMeta(Protocol):\n    @property\n    def dagster_meta(self) -> "DagsterLoggingMetadata": ...\n\n\n# The type-checker complains here that DagsterLogRecord does not implement the `dagster_meta`\n# property of `IDagsterMeta`. We ignore this error because we don't need to implement this method--\n# `DagsterLogRecord` is a stub class that is never instantiated. We only ever cast\n# `logging.LogRecord` objects to `DagsterLogRecord`, because it gives us typed access to the\n# `dagster_meta` property. 
`dagster_meta` itself is set on these `logging.LogRecord` objects via the\n# `extra` argument to `logging.Logger.log` (see `DagsterLogManager.log_dagster_event`), but\n# `logging.LogRecord` has no way of exposing to the type-checker the attributes that are dynamically\n# defined via `extra`.\nclass DagsterLogRecord(logging.LogRecord, IDagsterMeta):  # type: ignore\n    pass\n\n\nclass DagsterMessageProps(\n    NamedTuple(\n        "_DagsterMessageProps",\n        [\n            ("orig_message", Optional[str]),\n            ("log_message_id", Optional[str]),\n            ("log_timestamp", Optional[str]),\n            ("dagster_event", Optional[Any]),\n        ],\n    )\n):\n    """Internal class used to represent specific attributes about a logged message."""\n\n    def __new__(\n        cls,\n        orig_message: str,\n        log_message_id: Optional[str] = None,\n        log_timestamp: Optional[str] = None,\n        dagster_event: Optional["DagsterEvent"] = None,\n    ):\n        return super().__new__(\n            cls,\n            orig_message=check.str_param(orig_message, "orig_message"),\n            log_message_id=check.opt_str_param(\n                log_message_id, "log_message_id", default=make_new_run_id()\n            ),\n            log_timestamp=check.opt_str_param(\n                log_timestamp,\n                "log_timestamp",\n                default=datetime.datetime.utcnow().isoformat(),\n            ),\n            dagster_event=dagster_event,\n        )\n\n    @property\n    def error_str(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n\n        event_specific_data = self.dagster_event.event_specific_data\n        if not event_specific_data:\n            return None\n\n        error = getattr(event_specific_data, "error", None)\n        if error:\n            return f'\\n\\n{getattr(event_specific_data, "error_display_string", error.to_string())}'\n        return None\n\n    @property\n    def 
pid(self) -> Optional[str]:\n        if self.dagster_event is None or self.dagster_event.pid is None:\n            return None\n        return str(self.dagster_event.pid)\n\n    @property\n    def step_key(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.step_key\n\n    @property\n    def event_type_value(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.event_type_value\n\n\nclass DagsterLoggingMetadata(\n    NamedTuple(\n        "_DagsterLoggingMetadata",\n        [\n            ("run_id", Optional[str]),\n            ("job_name", Optional[str]),\n            ("job_tags", Mapping[str, str]),\n            ("step_key", Optional[str]),\n            ("op_name", Optional[str]),\n            ("resource_name", Optional[str]),\n            ("resource_fn_name", Optional[str]),\n        ],\n    )\n):\n    """Internal class used to represent the context in which a given message was logged (i.e. 
the\n    step, pipeline run, resource, etc.).\n    """\n\n    def __new__(\n        cls,\n        run_id: Optional[str] = None,\n        job_name: Optional[str] = None,\n        job_tags: Optional[Mapping[str, str]] = None,\n        step_key: Optional[str] = None,\n        op_name: Optional[str] = None,\n        resource_name: Optional[str] = None,\n        resource_fn_name: Optional[str] = None,\n    ):\n        return super().__new__(\n            cls,\n            run_id=run_id,\n            job_name=job_name,\n            job_tags=job_tags or {},\n            step_key=step_key,\n            op_name=op_name,\n            resource_name=resource_name,\n            resource_fn_name=resource_fn_name,\n        )\n\n    @property\n    def log_source(self) -> str:\n        if self.resource_name is None:\n            return self.job_name or "system"\n        return f"resource:{self.resource_name}"\n\n    def all_tags(self) -> Mapping[str, str]:\n        # converts all values into strings\n        return {k: str(v) for k, v in self._asdict().items()}\n\n    def event_tags(self) -> Mapping[str, str]:\n        # Exclude pipeline_tags since it can be quite large and can be found on the run\n        return {k: str(v) for k, v in self._asdict().items() if k != "job_tags"}\n\n\ndef construct_log_string(\n    logging_metadata: DagsterLoggingMetadata, message_props: DagsterMessageProps\n) -> str:\n    from dagster._core.events import EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n\n    event_type_str = (\n        EVENT_TYPE_VALUE_TO_DISPLAY_STRING[message_props.event_type_value]\n        if message_props.event_type_value in EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n        else message_props.event_type_value\n    )\n    return " - ".join(\n        filter(\n            None,\n            (\n                logging_metadata.log_source,\n                logging_metadata.run_id,\n                message_props.pid,\n                logging_metadata.step_key,\n                event_type_str,\n         
       message_props.orig_message,\n            ),\n        )\n    ) + (message_props.error_str or "")\n\n\ndef get_dagster_meta_dict(\n    logging_metadata: DagsterLoggingMetadata, dagster_message_props: DagsterMessageProps\n) -> Mapping[str, object]:\n    # combine all dagster meta information into a single dictionary\n    meta_dict = {\n        **logging_metadata._asdict(),\n        **dagster_message_props._asdict(),\n    }\n    # step-level events can be logged from a pipeline context. for these cases, pull the step\n    # key from the underlying DagsterEvent\n    if meta_dict["step_key"] is None:\n        meta_dict["step_key"] = dagster_message_props.step_key\n\n    return meta_dict\n\n\nclass DagsterLogHandler(logging.Handler):\n    """Internal class used to turn regular logs into Dagster logs by adding Dagster-specific\n    metadata (such as pipeline_name or step_key), as well as reformatting the underlying message.\n\n    Note: The `loggers` argument will be populated with the set of @loggers supplied to the current\n    pipeline run. 
These essentially work as handlers (they do not create their own log messages,\n    they simply re-log messages that are created from context.log.x() calls), which is why they are\n    referenced from within this handler class.\n    """\n\n    def __init__(\n        self,\n        logging_metadata: DagsterLoggingMetadata,\n        loggers: Sequence[logging.Logger],\n        handlers: Sequence[logging.Handler],\n    ):\n        self._logging_metadata = logging_metadata\n        self._loggers = loggers\n        self._handlers = handlers\n        self._should_capture = True\n        super().__init__()\n\n    @property\n    def logging_metadata(self) -> DagsterLoggingMetadata:\n        return self._logging_metadata\n\n    def with_tags(self, **new_tags: str) -> "DagsterLogHandler":\n        return DagsterLogHandler(\n            logging_metadata=self.logging_metadata._replace(**new_tags),\n            loggers=self._loggers,\n            handlers=self._handlers,\n        )\n\n    def _extract_extra(self, record: logging.LogRecord) -> Mapping[str, Any]:\n        """In the logging.Logger log() implementation, the elements of the `extra` dictionary\n        argument are smashed into the __dict__ of the underlying logging.LogRecord.\n        This function figures out what the original `extra` values of the log call were by\n        comparing the set of attributes in the received record to those of a default record.\n        """\n        ref_attrs = list(logging.makeLogRecord({}).__dict__.keys()) + [\n            "message",\n            "asctime",\n        ]\n        return {k: v for k, v in record.__dict__.items() if k not in ref_attrs}\n\n    def _convert_record(self, record: logging.LogRecord) -> DagsterLogRecord:\n        # we store the originating DagsterEvent in the DAGSTER_META_KEY field, if applicable\n        dagster_meta = getattr(record, DAGSTER_META_KEY, None)\n\n        # generate some properties for this specific record\n        dagster_message_props = 
DagsterMessageProps(\n            orig_message=record.getMessage(), dagster_event=dagster_meta\n        )\n\n        # set the dagster meta info for the record\n        setattr(\n            record,\n            DAGSTER_META_KEY,\n            get_dagster_meta_dict(self._logging_metadata, dagster_message_props),\n        )\n\n        # update the message to be formatted like other dagster logs\n        record.msg = construct_log_string(self._logging_metadata, dagster_message_props)\n        record.args = ()\n\n        # DagsterLogRecord is a LogRecord with a `dagster_meta` field\n        return cast(DagsterLogRecord, record)\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        """If you list multiple levels of a python logging hierarchy as managed loggers, and do not\n        set the propagate attribute to False, this will result in that record getting logged\n        multiple times, as the DagsterLogHandler will be invoked at each level of the hierarchy as\n        the message is propagated. 
This filter prevents this from happening.\n        """\n        return self._should_capture and not isinstance(\n            getattr(record, DAGSTER_META_KEY, None), dict\n        )\n\n    def emit(self, record: logging.LogRecord) -> None:\n        """For any received record, add Dagster metadata, and have handlers handle it."""\n        try:\n            # to prevent the potential for infinite loops in which a handler produces log messages\n            # which are then captured and then handled by that same handler (etc.), do not capture\n            # any log messages while one is currently being emitted\n            self._should_capture = False\n            dagster_record = self._convert_record(record)\n            # built-in handlers\n            for handler in self._handlers:\n                if dagster_record.levelno >= handler.level:\n                    handler.handle(dagster_record)\n            # user-defined @loggers\n            for logger in self._loggers:\n                logger.log(\n                    dagster_record.levelno,\n                    dagster_record.msg,\n                    exc_info=dagster_record.exc_info,\n                    extra=self._extract_extra(record),\n                )\n        finally:\n            self._should_capture = True\n\n\n
[docs]class DagsterLogManager(logging.Logger):\n """Centralized dispatch for logging from user code.\n\n Handles the construction of uniform structured log messages and passes them through to the\n underlying loggers/handlers.\n\n An instance of the log manager is made available to ops as ``context.log``. Users should not\n initialize instances of the log manager directly. To configure custom loggers, set the\n ``logger_defs`` argument in an `@job` decorator or when calling the `to_job()` method on a\n :py:class:`GraphDefinition`.\n\n The log manager inherits standard convenience methods like those exposed by the Python standard\n library :py:mod:`python:logging` module (i.e., within the body of an op,\n ``context.log.{debug, info, warning, warn, error, critical, fatal}``).\n\n The underlying integer API can also be called directly using, e.g.\n ``context.log.log(5, msg)``, and the log manager will delegate to the ``log`` method\n defined on each of the loggers it manages.\n\n User-defined custom log levels are not supported, and calls to, e.g.,\n ``context.log.trace`` or ``context.log.notice`` will result in hard exceptions **at runtime**.\n """\n\n def __init__(\n self,\n dagster_handler: DagsterLogHandler,\n level: int = logging.NOTSET,\n managed_loggers: Optional[Sequence[logging.Logger]] = None,\n ):\n super().__init__(name="dagster", level=coerce_valid_log_level(level))\n self._managed_loggers = check.opt_sequence_param(\n managed_loggers, "managed_loggers", of_type=logging.Logger\n )\n self._dagster_handler = dagster_handler\n self.addHandler(dagster_handler)\n\n @classmethod\n def create(\n cls,\n loggers: Sequence[logging.Logger],\n handlers: Optional[Sequence[logging.Handler]] = None,\n instance: Optional["DagsterInstance"] = None,\n dagster_run: Optional["DagsterRun"] = None,\n ) -> "DagsterLogManager":\n """Create a DagsterLogManager with a set of subservient loggers."""\n handlers = check.opt_sequence_param(handlers, "handlers", 
of_type=logging.Handler)\n\n managed_loggers = [get_dagster_logger()]\n python_log_level = logging.NOTSET\n\n if instance:\n handlers = [*handlers, *instance.get_handlers()]\n managed_loggers += [\n logging.getLogger(lname) if lname != "root" else logging.getLogger()\n for lname in instance.managed_python_loggers\n ]\n if instance.python_log_level is not None:\n python_log_level = coerce_valid_log_level(instance.python_log_level)\n\n # set all loggers to the declared logging level\n for logger in managed_loggers:\n logger.setLevel(python_log_level)\n\n if dagster_run:\n logging_metadata = DagsterLoggingMetadata(\n run_id=dagster_run.run_id,\n job_name=dagster_run.job_name,\n job_tags=dagster_run.tags,\n )\n else:\n logging_metadata = DagsterLoggingMetadata()\n\n return cls(\n dagster_handler=DagsterLogHandler(\n logging_metadata=logging_metadata,\n loggers=loggers,\n handlers=handlers,\n ),\n level=python_log_level,\n managed_loggers=managed_loggers,\n )\n\n @property\n def logging_metadata(self) -> DagsterLoggingMetadata:\n return self._dagster_handler.logging_metadata\n\n def begin_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.addHandler(self._dagster_handler)\n\n def end_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.removeHandler(self._dagster_handler)\n\n def log_dagster_event(\n self, level: Union[str, int], msg: str, dagster_event: "DagsterEvent"\n ) -> None:\n """Log a DagsterEvent at the given level. 
Attributes about the context it was logged in\n (such as the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): message describing the event\n dagster_event (DagsterEvent): DagsterEvent that will be logged\n """\n self.log(level=level, msg=msg, extra={DAGSTER_META_KEY: dagster_event})\n\n def log(self, level: Union[str, int], msg: object, *args: Any, **kwargs: Any) -> None:\n """Log a message at the given level. Attributes about the context it was logged in (such as\n the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): the message to be logged\n *args: the logged message will be msg % args\n """\n level = coerce_valid_log_level(level)\n # log DagsterEvents regardless of level\n if self.isEnabledFor(level) or ("extra" in kwargs and DAGSTER_META_KEY in kwargs["extra"]):\n self._log(level, msg, args, **kwargs)\n\n def with_tags(self, **new_tags: str) -> "DagsterLogManager":\n """Add new tags in "new_tags" to the set of tags attached to this log manager instance, and\n return a new DagsterLogManager with the merged set of tags.\n\n Args:\n new_tags (Dict[str,str]): Dictionary of tags\n\n Returns:\n DagsterLogManager: a new DagsterLogManager namedtuple with updated tags for the same\n run ID and loggers.\n """\n return DagsterLogManager(\n dagster_handler=self._dagster_handler.with_tags(**new_tags),\n managed_loggers=self._managed_loggers,\n level=self.level,\n )
\n
", "current_page_name": "_modules/dagster/_core/log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.log_manager"}, "run_coordinator": {"default_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.default_run_coordinator

\nimport logging\nfrom typing import Mapping, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\n
[docs]class DefaultRunCoordinator(RunCoordinator, ConfigurableClass):\n """Immediately send runs to the run launcher."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._logger = logging.getLogger("dagster.run_coordinator.default_run_coordinator")\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, object]\n ) -> Self:\n return cls(inst_data=inst_data, **config_value)\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n self._instance.launch_run(dagster_run.run_id, context.workspace)\n else:\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping launch."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/default_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.default_run_coordinator"}, "queued_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.queued_run_coordinator

\nimport logging\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    IntSource,\n    String,\n    _check as check,\n)\nfrom dagster._builtins import Bool\nfrom dagster._config import Array, Field, Noneable, ScalarUnion, Shape\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\nclass RunQueueConfig(\n    NamedTuple(\n        "_RunQueueConfig",\n        [\n            ("max_concurrent_runs", int),\n            ("tag_concurrency_limits", Sequence[Mapping[str, Any]]),\n            ("max_user_code_failure_retries", int),\n            ("user_code_failure_retry_delay", int),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        max_concurrent_runs: int,\n        tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]],\n        max_user_code_failure_retries: int = 0,\n        user_code_failure_retry_delay: int = 60,\n    ):\n        return super(RunQueueConfig, cls).__new__(\n            cls,\n            check.int_param(max_concurrent_runs, "max_concurrent_runs"),\n            check.opt_sequence_param(tag_concurrency_limits, "tag_concurrency_limits"),\n            check.int_param(max_user_code_failure_retries, "max_user_code_failure_retries"),\n            check.int_param(user_code_failure_retry_delay, "user_code_failure_retry_delay"),\n        )\n\n\n
[docs]class QueuedRunCoordinator(RunCoordinator[T_DagsterInstance], ConfigurableClass):\n """Enqueues runs via the run storage, to be deqeueued by the Dagster Daemon process. Requires\n the Dagster Daemon process to be alive in order for runs to be launched.\n """\n\n def __init__(\n self,\n max_concurrent_runs: Optional[int] = None,\n tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]] = None,\n dequeue_interval_seconds: Optional[int] = None,\n dequeue_use_threads: Optional[bool] = None,\n dequeue_num_workers: Optional[int] = None,\n max_user_code_failure_retries: Optional[int] = None,\n user_code_failure_retry_delay: Optional[int] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data: Optional[ConfigurableClassData] = check.opt_inst_param(\n inst_data, "inst_data", ConfigurableClassData\n )\n self._max_concurrent_runs: int = check.opt_int_param(\n max_concurrent_runs, "max_concurrent_runs", 10\n )\n check.invariant(\n self._max_concurrent_runs >= -1,\n "Negative values other than -1 (which disables the limit) for max_concurrent_runs"\n " are disallowed.",\n )\n self._tag_concurrency_limits: Sequence[Mapping[str, Any]] = check.opt_list_param(\n tag_concurrency_limits,\n "tag_concurrency_limits",\n )\n self._dequeue_interval_seconds: int = check.opt_int_param(\n dequeue_interval_seconds, "dequeue_interval_seconds", 5\n )\n self._dequeue_use_threads: bool = check.opt_bool_param(\n dequeue_use_threads, "dequeue_use_threads", False\n )\n self._dequeue_num_workers: Optional[int] = check.opt_int_param(\n dequeue_num_workers, "dequeue_num_workers"\n )\n self._max_user_code_failure_retries: int = check.opt_int_param(\n max_user_code_failure_retries, "max_user_code_failure_retries", 0\n )\n self._user_code_failure_retry_delay: int = check.opt_int_param(\n user_code_failure_retry_delay, "user_code_failure_retry_delay", 60\n )\n self._logger = logging.getLogger("dagster.run_coordinator.queued_run_coordinator")\n super().__init__()\n\n 
@property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_run_queue_config(self) -> RunQueueConfig:\n return RunQueueConfig(\n max_concurrent_runs=self._max_concurrent_runs,\n tag_concurrency_limits=self._tag_concurrency_limits,\n max_user_code_failure_retries=self._max_user_code_failure_retries,\n user_code_failure_retry_delay=self._user_code_failure_retry_delay,\n )\n\n @property\n def dequeue_interval_seconds(self) -> int:\n return self._dequeue_interval_seconds\n\n @property\n def dequeue_use_threads(self) -> bool:\n return self._dequeue_use_threads\n\n @property\n def dequeue_num_workers(self) -> Optional[int]:\n return self._dequeue_num_workers\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "max_concurrent_runs": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The maximum number of runs that are allowed to be in progress at once."\n " Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs"\n " from launching. Any other negative values are disallowed."\n ),\n ),\n "tag_concurrency_limits": Field(\n config=Noneable(\n Array(\n Shape(\n {\n "key": String,\n "value": Field(\n ScalarUnion(\n scalar_type=String,\n non_scalar_schema=Shape({"applyLimitPerUniqueValue": Bool}),\n ),\n is_required=False,\n ),\n "limit": Field(int),\n }\n )\n )\n ),\n is_required=False,\n description=(\n "A set of limits that are applied to runs with particular tags. If a value is"\n " set, the limit is applied to only that key-value pair. If no value is set,"\n " the limit is applied across all values of that key. 
If the value is set to a"\n " dict with `applyLimitPerUniqueValue: true`, the limit will apply to the"\n " number of unique values for that key."\n ),\n ),\n "dequeue_interval_seconds": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The interval in seconds at which the Dagster Daemon "\n "should periodically check the run queue for new runs to launch."\n ),\n ),\n "dequeue_use_threads": Field(\n config=bool,\n is_required=False,\n description=(\n "Whether or not to use threads for concurrency when launching dequeued runs."\n ),\n ),\n "dequeue_num_workers": Field(\n config=IntSource,\n is_required=False,\n description=(\n "If dequeue_use_threads is true, limit the number of concurrent worker threads."\n ),\n ),\n "max_user_code_failure_retries": Field(\n config=IntSource,\n is_required=False,\n default_value=0,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how many times to retry the dequeue before failing it. The only run launcher"\n " that requires the gRPC server to be running is the DefaultRunLauncher, so"\n " setting this will have no effect unless that run launcher is being used."\n ),\n ),\n "user_code_failure_retry_delay": Field(\n config=IntSource,\n is_required=False,\n default_value=60,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how long to wait before retrying any runs from that same code location. 
The"\n " only run launcher that requires the gRPC server to be running is the"\n " DefaultRunLauncher, so setting this will have no effect unless that run"\n " launcher is being used."\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return cls(\n inst_data=inst_data,\n max_concurrent_runs=config_value.get("max_concurrent_runs"),\n tag_concurrency_limits=config_value.get("tag_concurrency_limits"),\n dequeue_interval_seconds=config_value.get("dequeue_interval_seconds"),\n dequeue_use_threads=config_value.get("dequeue_use_threads"),\n dequeue_num_workers=config_value.get("dequeue_num_workers"),\n max_user_code_failure_retries=config_value.get("max_user_code_failure_retries"),\n user_code_failure_retry_delay=config_value.get("user_code_failure_retry_delay"),\n )\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n enqueued_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_ENQUEUED.value,\n job_name=dagster_run.job_name,\n )\n self._instance.report_dagster_event(enqueued_event, run_id=dagster_run.run_id)\n else:\n # the run was already submitted, this is a no-op\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping enqueue."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n # NOTE: possible race condition if the dequeuer acts on this run at the same time\n # https://github.com/dagster-io/dagster/issues/3323\n if run.status == DagsterRunStatus.QUEUED:\n self._instance.report_run_canceling(\n run,\n message="Canceling run from the queue.",\n )\n 
self._instance.report_run_canceled(run)\n return True\n else:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/queued_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.queued_run_coordinator"}}, "scheduler": {"scheduler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.scheduler.scheduler

\nimport abc\nimport os\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config import Field, IntSource\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterError\nfrom dagster._core.host_representation import ExternalSchedule\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    InstigatorState,\n    InstigatorStatus,\n    ScheduleInstigatorData,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import mkdir_p\n\n\nclass DagsterSchedulerError(DagsterError):\n    """Base class for all Dagster Scheduler errors."""\n\n\nclass DagsterScheduleDoesNotExist(DagsterSchedulerError):\n    """Errors raised when fetching a schedule."""\n\n\nclass SchedulerDebugInfo(\n    NamedTuple(\n        "SchedulerDebugInfo",\n        [\n            ("errors", Sequence[str]),\n            ("scheduler_config_info", str),\n            ("scheduler_info", str),\n            ("schedule_storage", Sequence[str]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        errors: Sequence[str],\n        scheduler_config_info: str,\n        scheduler_info: str,\n        schedule_storage: Sequence[str],\n    ):\n        return super(SchedulerDebugInfo, cls).__new__(\n            cls,\n            errors=check.sequence_param(errors, "errors", of_type=str),\n            scheduler_config_info=check.str_param(scheduler_config_info, "scheduler_config_info"),\n            scheduler_info=check.str_param(scheduler_info, "scheduler_info"),\n            schedule_storage=check.sequence_param(\n                schedule_storage, "schedule_storage", of_type=str\n            ),\n        )\n\n\n
[docs]class Scheduler(abc.ABC):\n """Abstract base class for a scheduler. This component is responsible for interfacing with\n an external system such as cron to ensure scheduled repeated execution according.\n """\n\n def start_schedule(\n self, instance: DagsterInstance, external_schedule: ExternalSchedule\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.RUNNING` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start\n\n """\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(\n external_schedule.get_external_origin_id(), external_schedule.selector_id\n )\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n new_instigator_data = ScheduleInstigatorData(\n external_schedule.cron_schedule,\n get_current_datetime_in_utc().timestamp(),\n )\n\n if not stored_state:\n started_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.RUNNING,\n new_instigator_data,\n )\n instance.add_instigator_state(started_state)\n else:\n started_state = stored_state.with_status(InstigatorStatus.RUNNING).with_data(\n new_instigator_data\n )\n instance.update_instigator_state(started_state)\n return started_state\n\n def stop_schedule(\n self,\n instance: DagsterInstance,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional[ExternalSchedule],\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.STOPPED` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n 
check.str_param(schedule_origin_id, "schedule_origin_id")\n check.opt_inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(schedule_origin_id, schedule_selector_id)\n\n if not external_schedule:\n computed_state = stored_state\n else:\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n\n if computed_state and not computed_state.is_running:\n return computed_state\n\n if not stored_state:\n assert external_schedule\n stopped_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.STOPPED,\n ScheduleInstigatorData(\n external_schedule.cron_schedule,\n ),\n )\n instance.add_instigator_state(stopped_state)\n else:\n stopped_state = stored_state.with_status(InstigatorStatus.STOPPED).with_data(\n ScheduleInstigatorData(\n cron_schedule=computed_state.instigator_data.cron_schedule, # type: ignore\n )\n )\n instance.update_instigator_state(stopped_state)\n\n return stopped_state\n\n @abc.abstractmethod\n def debug_info(self) -> str:\n """Returns debug information about the scheduler."""\n\n @abc.abstractmethod\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n """Get path to store logs for schedule.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to retrieve the log path for\n """
\n\n\nDEFAULT_MAX_CATCHUP_RUNS = 5\n\n\n
[docs]class DagsterDaemonScheduler(Scheduler, ConfigurableClass):\n """Default scheduler implementation that submits runs from the `dagster-daemon`\n long-lived process. Periodically checks each running schedule for execution times that don't\n have runs yet and launches them.\n """\n\n def __init__(\n self,\n max_catchup_runs: int = DEFAULT_MAX_CATCHUP_RUNS,\n max_tick_retries: int = 0,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self.max_catchup_runs = check.opt_int_param(\n max_catchup_runs, "max_catchup_runs", DEFAULT_MAX_CATCHUP_RUNS\n )\n self.max_tick_retries = check.opt_int_param(max_tick_retries, "max_tick_retries", 0)\n self._inst_data = inst_data\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "max_catchup_runs": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_MAX_CATCHUP_RUNS,\n description="""For partitioned schedules, controls the maximum number of past\n partitions for each schedule that will be considered when looking for missing\n runs . Generally this parameter will only come into play if the scheduler\n falls behind or launches after experiencing downtime. 
This parameter will not be checked for\n schedules without partition sets (for example, schedules created using the @schedule\n decorator) - only the most recent execution time will be considered for those schedules.\n\n Note that no matter what this value is, the scheduler will never launch a run from a time\n before the schedule was turned on (even if the start_date on the schedule is earlier) - if\n you want to launch runs for earlier partitions, launch a backfill.\n """,\n ),\n "max_tick_retries": Field(\n IntSource,\n default_value=0,\n is_required=False,\n description=(\n "For each schedule tick that raises an error, how many times to retry that tick"\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DagsterDaemonScheduler(inst_data=inst_data, **config_value)\n\n def debug_info(self) -> str:\n return ""\n\n def wipe(self, instance: DagsterInstance) -> None:\n pass\n\n def _get_or_create_logs_directory(\n self, instance: DagsterInstance, schedule_origin_id: str\n ) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")
\n
", "current_page_name": "_modules/dagster/_core/scheduler/scheduler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.scheduler.scheduler"}}, "storage": {"asset_value_loader": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.asset_value_loader

\nfrom contextlib import ExitStack\nfrom typing import Any, Dict, Mapping, Optional, Type, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.job_definition import (\n    default_job_io_manager_with_fs_io_manager_schema,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.execution.build_resources import build_resources, get_mapped_resource_config\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.output import build_output_context\nfrom dagster._core.execution.resources_init import get_transitive_required_resource_keys\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.config import is_dagster_home_set\nfrom dagster._core.types.dagster_type import resolve_dagster_type\nfrom dagster._utils.merger import merge_dicts\n\nfrom .io_manager import IOManager\n\n\n
[docs]class AssetValueLoader:\n """Caches resource definitions that are used to load asset values across multiple load\n invocations.\n\n Should not be instantiated directly. Instead, use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`.\n """\n\n def __init__(\n self,\n assets_defs_by_key: Mapping[AssetKey, AssetsDefinition],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n instance: Optional[DagsterInstance] = None,\n ):\n self._assets_defs_by_key = assets_defs_by_key\n self._source_assets_by_key = source_assets_by_key\n self._resource_instance_cache: Dict[str, object] = {}\n self._exit_stack: ExitStack = ExitStack().__enter__()\n if not instance and is_dagster_home_set():\n self._instance = self._exit_stack.enter_context(DagsterInstance.get())\n else:\n self._instance = instance\n\n def _ensure_resource_instances_in_cache(\n self,\n resource_defs: Mapping[str, ResourceDefinition],\n resource_config: Optional[Mapping[str, Any]] = None,\n ):\n for built_resource_key, built_resource in (\n self._exit_stack.enter_context(\n build_resources(\n resources={\n resource_key: self._resource_instance_cache.get(resource_key, resource_def)\n for resource_key, resource_def in resource_defs.items()\n },\n instance=self._instance,\n resource_config=resource_config,\n )\n )\n ._asdict()\n .items()\n ):\n self._resource_instance_cache[built_resource_key] = built_resource\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type[object]] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n ) -> object:\n """Loads the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n asset_key = AssetKey.from_coercible(asset_key)\n resource_config = resource_config or {}\n output_metadata = {}\n\n if asset_key in self._assets_defs_by_key:\n assets_def = self._assets_defs_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n assets_def.resource_defs,\n )\n io_manager_key = assets_def.get_io_manager_key_for_asset_key(asset_key)\n io_manager_def = resource_defs[io_manager_key]\n name = assets_def.get_output_name_for_asset_key(asset_key)\n output_metadata = assets_def.metadata_by_key[asset_key]\n op_def = assets_def.get_op_def_for_asset_key(asset_key)\n asset_partitions_def = assets_def.partitions_def\n elif asset_key in self._source_assets_by_key:\n source_asset = self._source_assets_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: 
default_job_io_manager_with_fs_io_manager_schema},\n source_asset.resource_defs,\n )\n io_manager_key = source_asset.get_io_manager_key()\n io_manager_def = resource_defs[io_manager_key]\n name = asset_key.path[-1]\n output_metadata = source_asset.raw_metadata\n op_def = None\n asset_partitions_def = source_asset.partitions_def\n else:\n check.failed(f"Asset key {asset_key} not found")\n\n required_resource_keys = get_transitive_required_resource_keys(\n io_manager_def.required_resource_keys, resource_defs\n ) | {io_manager_key}\n\n self._ensure_resource_instances_in_cache(\n {k: v for k, v in resource_defs.items() if k in required_resource_keys},\n resource_config=resource_config,\n )\n io_manager = cast(IOManager, self._resource_instance_cache[io_manager_key])\n\n io_config = resource_config.get(io_manager_key)\n io_resource_config = {io_manager_key: io_config} if io_config else {}\n\n io_manager_config = get_mapped_resource_config(\n {io_manager_key: io_manager_def}, io_resource_config\n )\n\n input_context = build_input_context(\n name=None,\n asset_key=asset_key,\n dagster_type=resolve_dagster_type(python_type),\n upstream_output=build_output_context(\n name=name,\n metadata=output_metadata,\n asset_key=asset_key,\n op_def=op_def,\n resource_config=resource_config,\n ),\n resources=self._resource_instance_cache,\n resource_config=io_manager_config[io_manager_key].config,\n partition_key=partition_key,\n asset_partition_key_range=(\n PartitionKeyRange(partition_key, partition_key)\n if partition_key is not None\n else None\n ),\n asset_partitions_def=asset_partitions_def,\n instance=self._instance,\n metadata=metadata,\n )\n\n return io_manager.load_input(input_context)
\n\n def __enter__(self):\n return self\n\n def __exit__(self, *exc):\n self._exit_stack.close()
\n
", "current_page_name": "_modules/dagster/_core/storage/asset_value_loader", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.asset_value_loader"}, "base_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.base_storage

\nfrom abc import ABC, abstractmethod\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\n\nfrom .event_log.base import EventLogStorage\nfrom .runs.base import RunStorage\nfrom .schedules.base import ScheduleStorage\n\n\n
[docs]class DagsterStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for Dagster persistent storage, for reading and writing data for runs,\n events, and schedule/sensor state.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-daemon`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @property\n @abstractmethod\n def event_log_storage(self) -> EventLogStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def run_storage(self) -> RunStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def schedule_storage(self) -> ScheduleStorage[T_DagsterInstance]:\n raise NotImplementedError()
\n
", "current_page_name": "_modules/dagster/_core/storage/base_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.base_storage"}, "captured_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.captured_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import IO, Callable, Generator, Iterator, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Final, Self\n\nimport dagster._check as check\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\n\nMAX_BYTES_CHUNK_READ: Final = 4194304  # 4 MB\n\n\nclass CapturedLogContext(\n    NamedTuple(\n        "_CapturedLogContext",\n        [\n            ("log_key", Sequence[str]),\n            ("external_url", Optional[str]),\n            ("external_stdout_url", Optional[str]),\n            ("external_stderr_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing the context in which logs are captured.  Can be used by external logging\n    sidecar implementations to point the Dagster UI to an external url to view compute logs instead of a\n    Dagster-managed location.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        external_stdout_url: Optional[str] = None,\n        external_stderr_url: Optional[str] = None,\n        external_url: Optional[str] = None,\n    ):\n        if external_url and (external_stdout_url or external_stderr_url):\n            check.failed(\n                "Cannot specify both `external_url` and one of"\n                " `external_stdout_url`/`external_stderr_url`"\n            )\n\n        return super(CapturedLogContext, cls).__new__(\n            cls,\n            log_key,\n            external_stdout_url=external_stdout_url,\n            external_stderr_url=external_stderr_url,\n            external_url=external_url,\n        )\n\n\nclass CapturedLogData(\n    NamedTuple(\n        "_CapturedLogData",\n        [\n            ("log_key", Sequence[str]),\n            ("stdout", Optional[bytes]),\n            ("stderr", Optional[bytes]),\n            ("cursor", Optional[str]),\n        ],\n    )\n):\n    """Object representing captured log data, either a partial chunk of the log data 
or the full\n    capture.  Contains the raw bytes and optionally the cursor offset for the partial chunk.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        stdout: Optional[bytes] = None,\n        stderr: Optional[bytes] = None,\n        cursor: Optional[str] = None,\n    ):\n        return super(CapturedLogData, cls).__new__(cls, log_key, stdout, stderr, cursor)\n\n\nclass CapturedLogMetadata(\n    NamedTuple(\n        "_CapturedLogMetadata",\n        [\n            ("stdout_location", Optional[str]),\n            ("stderr_location", Optional[str]),\n            ("stdout_download_url", Optional[str]),\n            ("stderr_download_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing metadata info for the captured log data, containing a display string for\n    the location of the log data and a URL for direct download of the captured log data.\n    """\n\n    def __new__(\n        cls,\n        stdout_location: Optional[str] = None,\n        stderr_location: Optional[str] = None,\n        stdout_download_url: Optional[str] = None,\n        stderr_download_url: Optional[str] = None,\n    ):\n        return super(CapturedLogMetadata, cls).__new__(\n            cls,\n            stdout_location=stdout_location,\n            stderr_location=stderr_location,\n            stdout_download_url=stdout_download_url,\n            stderr_download_url=stderr_download_url,\n        )\n\n\nclass CapturedLogSubscription:\n    def __init__(\n        self, manager: "CapturedLogManager", log_key: Sequence[str], cursor: Optional[str]\n    ):\n        self._manager = manager\n        self._log_key = log_key\n        self._cursor = cursor\n        self._observer: Optional[Callable[[CapturedLogData], None]] = None\n        self.is_complete = False\n\n    def __call__(self, observer: Optional[Callable[[CapturedLogData], None]]) -> Self:\n        self._observer = observer\n        self.fetch()\n        if 
self._manager.is_capture_complete(self._log_key):\n            self.complete()\n        return self\n\n    @property\n    def log_key(self) -> Sequence[str]:\n        return self._log_key\n\n    def dispose(self) -> None:\n        self._observer = None\n        self._manager.unsubscribe(self)\n\n    def fetch(self) -> None:\n        if not self._observer:\n            return\n\n        should_fetch = True\n        while should_fetch:\n            log_data = self._manager.get_log_data(\n                self._log_key,\n                self._cursor,\n                max_bytes=MAX_BYTES_CHUNK_READ,\n            )\n            if not self._cursor or log_data.cursor != self._cursor:\n                self._observer(log_data)\n                self._cursor = log_data.cursor\n            should_fetch = _has_max_data(log_data.stdout) or _has_max_data(log_data.stderr)\n\n    def complete(self) -> None:\n        self.is_complete = True\n\n\ndef _has_max_data(chunk: Optional[bytes]) -> bool:\n    # function is used as predicate but does not actually return a boolean\n    return chunk and len(chunk) >= MAX_BYTES_CHUNK_READ  # type: ignore\n\n\n
[docs]class CapturedLogManager(ABC):\n """Abstract base class for capturing the unstructured logs (stdout/stderr) in the current\n process, stored / retrieved with a provided log_key.\n """\n\n @abstractmethod\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n """Context manager for capturing the stdout/stderr within the current process, and persisting\n it under the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO[bytes]]]:\n """Context manager for providing an IO stream that enables the caller to write to a log stream\n managed by the captured log manager, to be read later using the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n """Flag indicating when the log capture for a given log key has completed.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n """Returns a chunk of the captured stdout logs for a given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[str]): A cursor representing the position of the log chunk to fetch\n max_bytes (Optional[int]): A limit on the size of the log chunk to fetch\n\n Returns:\n CapturedLogData\n """\n\n @abstractmethod\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n """Returns the metadata of the captured logs for a given log key, including\n displayable information on where the logs are persisted.\n\n Args:\n log_key (List[String]): 
The log key identifying the captured logs\n\n Returns:\n CapturedLogMetadata\n """\n\n @abstractmethod\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ) -> None:\n """Deletes the captured logs for a given log key.\n\n Args:\n log_key(Optional[List[String]]): The log key of the logs to delete\n prefix(Optional[List[String]]): The prefix of the log keys to delete\n """\n\n @abstractmethod\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n """Registers an observable object for log data.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[String]): The string cursor marking the position within the log stream\n Returns:\n ComputeLogSubscription\n """\n\n @abstractmethod\n def unsubscribe(self, subscription: CapturedLogSubscription) -> None:\n """Deregisters an observable object from receiving log updates.\n\n Args:\n subscription (CapturedLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def build_log_key_for_run(self, run_id: str, step_key: str) -> Sequence[str]:\n """Legacy adapter to translate run_id/key to captured log manager-based log_key."""\n return [run_id, "compute_logs", step_key]
\n
", "current_page_name": "_modules/dagster/_core/storage/captured_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.captured_log_manager"}, "compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.compute_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom enum import Enum\nfrom typing import Callable, Iterator, NamedTuple, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nMAX_BYTES_FILE_READ = 33554432  # 32 MB\nMAX_BYTES_CHUNK_READ = 4194304  # 4 MB\n\n\nclass ComputeIOType(Enum):\n    STDOUT = "stdout"\n    STDERR = "stderr"\n\n\nclass ComputeLogFileData(\n    NamedTuple(\n        "ComputeLogFileData",\n        [\n            ("path", str),\n            ("data", Optional[str]),\n            ("cursor", int),\n            ("size", int),\n            ("download_url", Optional[str]),\n        ],\n    )\n):\n    """Representation of a chunk of compute execution log data."""\n\n    def __new__(\n        cls, path: str, data: Optional[str], cursor: int, size: int, download_url: Optional[str]\n    ):\n        return super(ComputeLogFileData, cls).__new__(\n            cls,\n            path=check.str_param(path, "path"),\n            data=check.opt_str_param(data, "data"),\n            cursor=check.int_param(cursor, "cursor"),\n            size=check.int_param(size, "size"),\n            download_url=check.opt_str_param(download_url, "download_url"),\n        )\n\n\n
[docs]class ComputeLogManager(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\n steps of pipeline solids.\n """\n\n @contextmanager\n def watch(self, dagster_run: DagsterRun, step_key: Optional[str] = None) -> Iterator[None]:\n """Watch the stdout/stderr for a given execution for a given run_id / step_key and persist it.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n if not self.enabled(dagster_run, step_key):\n yield\n return\n\n self.on_watch_start(dagster_run, step_key)\n with self._watch_logs(dagster_run, step_key):\n yield\n self.on_watch_finish(dagster_run, step_key)\n\n @contextmanager\n @abstractmethod\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n """Method to watch the stdout/stderr logs for a given run_id / step_key. Kept separate from\n blessed `watch` method, which triggers all the start/finish hooks that are necessary to\n implement the different remote implementations.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get the local path of the logfile for a given execution step. This determines the\n location on the local filesystem to which stdout/stderr will be rerouted.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either ComputeIOType.STDOUT or\n ComputeIOType.STDERR\n\n Returns:\n str\n """\n ...\n\n @abstractmethod\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n """Flag indicating when computation for a given execution step has completed.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when starting to watch compute logs.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when computation for a given execution step is finished.\n\n Args:\n pipeline_run (PipelineRun): The pipeline run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get a URL where the logs can be downloaded.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n\n Returns:\n String\n """\n\n @abstractmethod\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n """Get compute log data for a given compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n max_bytes (Optional[Int]): Maximum number of bytes to be read and returned\n\n Returns:\n ComputeLogFileData\n """\n\n def enabled(self, _dagster_run: DagsterRun, _step_key: Optional[str]) -> bool:\n """Hook for disabling compute log capture.\n\n Args:\n _step_key (Optional[String]): The step_key for a compute step\n\n Returns:\n Boolean\n """\n return True\n\n @abstractmethod\n def on_subscribe(self, subscription: "ComputeLogSubscription") -> None:\n """Hook for managing streaming subscriptions for log data from `dagster-webserver`.\n\n Args:\n subscription (ComputeLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def on_unsubscribe(self, subscription: "ComputeLogSubscription") -> None:\n pass\n\n def observable(\n self, run_id: str, key: str, io_type: ComputeIOType, cursor: Optional[str] = None\n ) -> "ComputeLogSubscription":\n """Return a ComputeLogSubscription which streams back log data from the execution logs for a given\n compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n\n Returns:\n Observable\n """\n check.str_param(run_id, "run_id")\n check.str_param(key, "key")\n check.inst_param(io_type, "io_type", ComputeIOType)\n check.opt_str_param(cursor, "cursor")\n\n if cursor:\n cursor = int(cursor) # type: ignore # (var reassigned diff type)\n else:\n cursor = 0 # type: ignore # (var reassigned diff type)\n\n subscription = ComputeLogSubscription(self, run_id, key, io_type, cursor) # type: ignore # (var reassigned diff type)\n self.on_subscribe(subscription)\n return subscription\n\n def dispose(self):\n pass
\n\n\nclass ComputeLogSubscription:\n """Observable object that generates ComputeLogFileData objects as compute step execution logs\n are written.\n """\n\n def __init__(\n self,\n manager: ComputeLogManager,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int,\n ):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.io_type = io_type\n self.cursor = cursor\n self.observer: Optional[Callable[[ComputeLogFileData], None]] = None\n self.is_complete = False\n\n def __call__(self, observer: Callable[[ComputeLogFileData], None]) -> Self:\n self.observer = observer\n self.fetch()\n if self.manager.is_watch_completed(self.run_id, self.key):\n self.complete()\n return self\n\n def dispose(self) -> None:\n # called when the connection gets closed, allowing the observer to get GC'ed\n self.observer = None\n self.manager.on_unsubscribe(self)\n\n def fetch(self) -> None:\n if not self.observer:\n return\n\n should_fetch = True\n while should_fetch:\n update = self.manager.read_logs_file(\n self.run_id,\n self.key,\n self.io_type,\n self.cursor,\n max_bytes=MAX_BYTES_CHUNK_READ,\n )\n if not self.cursor or update.cursor != self.cursor:\n self.observer(update)\n self.cursor = update.cursor\n should_fetch = update.data and len(update.data.encode("utf-8")) >= MAX_BYTES_CHUNK_READ\n\n def complete(self) -> None:\n self.is_complete = True\n if not self.observer:\n return\n
", "current_page_name": "_modules/dagster/_core/storage/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.compute_log_manager"}, "dagster_run": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.dagster_run

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.tags import PARENT_RUN_ID_TAG, ROOT_RUN_ID_TAG\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nfrom .tags import (\n    BACKFILL_ID_TAG,\n    REPOSITORY_LABEL_TAG,\n    RESUME_RETRY_TAG,\n    SCHEDULE_NAME_TAG,\n    SENSOR_NAME_TAG,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.external import ExternalSchedule, ExternalSensor\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\n
[docs]@whitelist_for_serdes(storage_name="PipelineRunStatus")\nclass DagsterRunStatus(Enum):\n """The status of run execution."""\n\n # Runs waiting to be launched by the Dagster Daemon.\n QUEUED = "QUEUED"\n\n # Runs that have been launched, but execution has not yet started."""\n NOT_STARTED = "NOT_STARTED"\n\n # Runs that are managed outside of the Dagster control plane.\n MANAGED = "MANAGED"\n\n # Runs that have been launched, but execution has not yet started.\n STARTING = "STARTING"\n\n # Runs that have been launched and execution has started.\n STARTED = "STARTED"\n\n # Runs that have successfully completed.\n SUCCESS = "SUCCESS"\n\n # Runs that have failed to complete.\n FAILURE = "FAILURE"\n\n # Runs that are in-progress and pending to be canceled.\n CANCELING = "CANCELING"\n\n # Runs that have been canceled before completion.\n CANCELED = "CANCELED"
\n\n\n# These statuses that indicate a run may be using compute resources\nIN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.STARTING,\n DagsterRunStatus.STARTED,\n DagsterRunStatus.CANCELING,\n]\n\n# This serves as an explicit list of run statuses that indicate that the run is not using compute\n# resources. This and the enum above should cover all run statuses.\nNON_IN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.QUEUED,\n DagsterRunStatus.NOT_STARTED,\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.MANAGED,\n DagsterRunStatus.CANCELED,\n]\n\nFINISHED_STATUSES = [\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.CANCELED,\n]\n\n# Run statuses for runs that can be safely canceled.\n# Does not include the other unfinished statuses for the following reasons:\n# STARTING: Control has been ceded to the run worker, which will eventually move the run to a STARTED.\n# NOT_STARTED: Mostly replaced with STARTING. Runs are only here in the the brief window between\n# creating the run and launching or enqueueing it.\nCANCELABLE_RUN_STATUSES = [DagsterRunStatus.STARTED, DagsterRunStatus.QUEUED]\n\n\n@whitelist_for_serdes(storage_name="PipelineRunStatsSnapshot")\nclass DagsterRunStatsSnapshot(\n NamedTuple(\n "_DagsterRunStatsSnapshot",\n [\n ("run_id", str),\n ("steps_succeeded", int),\n ("steps_failed", int),\n ("materializations", int),\n ("expectations", int),\n ("enqueued_time", Optional[float]),\n ("launch_time", Optional[float]),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n steps_succeeded: int,\n steps_failed: int,\n materializations: int,\n expectations: int,\n enqueued_time: Optional[float],\n launch_time: Optional[float],\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(DagsterRunStatsSnapshot, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n steps_succeeded=check.int_param(steps_succeeded, 
"steps_succeeded"),\n steps_failed=check.int_param(steps_failed, "steps_failed"),\n materializations=check.int_param(materializations, "materializations"),\n expectations=check.int_param(expectations, "expectations"),\n enqueued_time=check.opt_float_param(enqueued_time, "enqueued_time"),\n launch_time=check.opt_float_param(launch_time, "launch_time"),\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\nclass DagsterRunSerializer(NamedTupleSerializer["DagsterRun"]):\n # serdes log\n # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve\n # * added pipeline_snapshot_id\n # * renamed previous_run_id -> parent_run_id, added root_run_id\n # * added execution_plan_snapshot_id\n # * removed selector\n # * added solid_subset\n # * renamed solid_subset -> solid_selection, added solids_to_execute\n # * renamed environment_dict -> run_config\n # * added asset_selection\n # * added has_repository_load_data\n def before_unpack(self, context, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n # back compat for environment dict => run_config\n if "environment_dict" in unpacked_dict:\n check.invariant(\n unpacked_dict.get("run_config") is None,\n "Cannot set both run_config and environment_dict. 
Use run_config parameter.",\n )\n unpacked_dict["run_config"] = unpacked_dict["environment_dict"]\n del unpacked_dict["environment_dict"]\n\n # back compat for previous_run_id => parent_run_id, root_run_id\n if "previous_run_id" in unpacked_dict and not (\n "parent_run_id" in unpacked_dict and "root_run_id" in unpacked_dict\n ):\n unpacked_dict["parent_run_id"] = unpacked_dict["previous_run_id"]\n unpacked_dict["root_run_id"] = unpacked_dict["previous_run_id"]\n del unpacked_dict["previous_run_id"]\n\n # back compat for selector => pipeline_name, solids_to_execute\n if "selector" in unpacked_dict:\n selector = unpacked_dict["selector"]\n\n if not isinstance(selector, ExecutionSelector):\n check.failed(f"unexpected entry for 'select', {selector}")\n selector_name = selector.name\n selector_subset = selector.solid_subset\n\n job_name = unpacked_dict.get("pipeline_name")\n check.invariant(\n job_name is None or selector_name == job_name,\n f"Conflicting pipeline name {job_name} in arguments to PipelineRun: "\n f"selector was passed with pipeline {selector_name}",\n )\n if job_name is None:\n unpacked_dict["pipeline_name"] = selector_name\n\n solids_to_execute = unpacked_dict.get("solids_to_execute")\n check.invariant(\n solids_to_execute is None\n or (selector_subset and set(selector_subset) == solids_to_execute),\n f"Conflicting solids_to_execute {solids_to_execute} in arguments to"\n f" PipelineRun: selector was passed with subset {selector_subset}",\n )\n # for old runs that only have selector but no solids_to_execute\n if solids_to_execute is None:\n solids_to_execute = frozenset(selector_subset) if selector_subset else None\n\n # back compat for solid_subset => solids_to_execute\n if "solid_subset" in unpacked_dict:\n unpacked_dict["solids_to_execute"] = unpacked_dict["solid_subset"]\n del unpacked_dict["solid_subset"]\n\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterRunSerializer,\n # DagsterRun is serialized as PipelineRun so that it can be read by older (pre 0.13.x) version\n # of Dagster, but is read back in as a DagsterRun.\n storage_name="PipelineRun",\n old_fields={"mode": None},\n storage_field_names={\n "job_name": "pipeline_name",\n "job_snapshot_id": "pipeline_snapshot_id",\n "external_job_origin": "external_pipeline_origin",\n "job_code_origin": "pipeline_code_origin",\n "op_selection": "solid_selection",\n "resolved_op_selection": "solids_to_execute",\n },\n)\nclass DagsterRun(\n NamedTuple(\n "_DagsterRun",\n [\n ("job_name", PublicAttr[str]),\n ("run_id", str),\n ("run_config", Mapping[str, object]),\n ("asset_selection", Optional[AbstractSet[AssetKey]]),\n ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n ("op_selection", Optional[Sequence[str]]),\n ("resolved_op_selection", Optional[AbstractSet[str]]),\n ("step_keys_to_execute", Optional[Sequence[str]]),\n ("status", DagsterRunStatus),\n ("tags", Mapping[str, str]),\n ("root_run_id", Optional[str]),\n ("parent_run_id", Optional[str]),\n ("job_snapshot_id", Optional[str]),\n ("execution_plan_snapshot_id", Optional[str]),\n ("external_job_origin", Optional["ExternalJobOrigin"]),\n ("job_code_origin", Optional[JobPythonOrigin]),\n ("has_repository_load_data", bool),\n ],\n )\n):\n """Serializable internal representation of a dagster run, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n job_name: str,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n op_selection: Optional[Sequence[str]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n status: Optional[DagsterRunStatus] = None,\n tags: Optional[Mapping[str, 
str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n job_snapshot_id: Optional[str] = None,\n execution_plan_snapshot_id: Optional[str] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n has_repository_load_data: Optional[bool] = None,\n ):\n check.invariant(\n (root_run_id is not None and parent_run_id is not None)\n or (root_run_id is None and parent_run_id is None),\n "Must set both root_run_id and parent_run_id when creating a PipelineRun that "\n "belongs to a run group",\n )\n # a set which contains the names of the ops to execute\n resolved_op_selection = check.opt_nullable_set_param(\n resolved_op_selection, "resolved_op_selection", of_type=str\n )\n # a list of op queries provided by the user\n # possible to be None when resolved_op_selection is set by the user directly\n op_selection = check.opt_nullable_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n asset_selection = check.opt_nullable_set_param(\n asset_selection, "asset_selection", of_type=AssetKey\n )\n asset_check_selection = check.opt_nullable_set_param(\n asset_check_selection, "asset_check_selection", of_type=AssetCheckKey\n )\n\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n if status == DagsterRunStatus.QUEUED:\n check.inst_param(\n external_job_origin,\n "external_job_origin",\n ExternalJobOrigin,\n "external_job_origin is required for queued runs",\n )\n\n if run_id is None:\n run_id = make_new_run_id()\n\n return super(DagsterRun, cls).__new__(\n cls,\n job_name=check.str_param(job_name, "job_name"),\n run_id=check.str_param(run_id, "run_id"),\n run_config=check.opt_mapping_param(run_config, "run_config", key_type=str),\n 
op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=check.opt_inst_param(\n status, "status", DagsterRunStatus, DagsterRunStatus.NOT_STARTED\n ),\n tags=check.opt_mapping_param(tags, "tags", key_type=str, value_type=str),\n root_run_id=check.opt_str_param(root_run_id, "root_run_id"),\n parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),\n job_snapshot_id=check.opt_str_param(job_snapshot_id, "job_snapshot_id"),\n execution_plan_snapshot_id=check.opt_str_param(\n execution_plan_snapshot_id, "execution_plan_snapshot_id"\n ),\n external_job_origin=check.opt_inst_param(\n external_job_origin, "external_job_origin", ExternalJobOrigin\n ),\n job_code_origin=check.opt_inst_param(\n job_code_origin, "job_code_origin", JobPythonOrigin\n ),\n has_repository_load_data=check.opt_bool_param(\n has_repository_load_data, "has_repository_load_data", default=False\n ),\n )\n\n def with_status(self, status: DagsterRunStatus) -> Self:\n if status == DagsterRunStatus.QUEUED:\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst(\n self.external_job_origin,\n ExternalJobOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return self._replace(status=status)\n\n def with_job_origin(self, origin: "ExternalJobOrigin") -> Self:\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst_param(origin, "origin", ExternalJobOrigin)\n return self._replace(external_job_origin=origin)\n\n def with_tags(self, tags: Mapping[str, str]) -> Self:\n return self._replace(tags=tags)\n\n def get_root_run_id(self) -> Optional[str]:\n return self.tags.get(ROOT_RUN_ID_TAG)\n\n def get_parent_run_id(self) -> Optional[str]:\n return 
self.tags.get(PARENT_RUN_ID_TAG)\n\n def tags_for_storage(self) -> Mapping[str, str]:\n repository_tags = {}\n if self.external_job_origin:\n # tag the run with a label containing the repository name / location name, to allow for\n # per-repository filtering of runs from the Dagster UI.\n repository_tags[REPOSITORY_LABEL_TAG] = (\n self.external_job_origin.external_repository_origin.get_label()\n )\n\n if not self.tags:\n return repository_tags\n\n return {**repository_tags, **self.tags}\n\n @public\n @property\n def is_finished(self) -> bool:\n """bool: If this run has completely finished execution."""\n return self.status in FINISHED_STATUSES\n\n @public\n @property\n def is_success(self) -> bool:\n """bool: If this run has successfully finished executing."""\n return self.status == DagsterRunStatus.SUCCESS\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this run has failed."""\n return self.status == DagsterRunStatus.FAILURE\n\n @public\n @property\n def is_failure_or_canceled(self) -> bool:\n """bool: If this run has either failed or was canceled."""\n return self.status == DagsterRunStatus.FAILURE or self.status == DagsterRunStatus.CANCELED\n\n @public\n @property\n def is_resume_retry(self) -> bool:\n """bool: If this run was created from retrying another run from the point of failure."""\n return self.tags.get(RESUME_RETRY_TAG) == "true"\n\n @property\n def previous_run_id(self) -> Optional[str]:\n # Compat\n return self.parent_run_id\n\n @staticmethod\n def tags_for_schedule(schedule) -> Mapping[str, str]:\n return {SCHEDULE_NAME_TAG: schedule.name}\n\n @staticmethod\n def tags_for_sensor(sensor) -> Mapping[str, str]:\n return {SENSOR_NAME_TAG: sensor.name}\n\n @staticmethod\n def tags_for_backfill_id(backfill_id: str) -> Mapping[str, str]:\n return {BACKFILL_ID_TAG: backfill_id}
\n\n\nclass RunsFilterSerializer(NamedTupleSerializer["RunsFilter"]):\n def before_unpack(\n self,\n context,\n unpacked_dict: Dict[str, Any],\n ) -> Dict[str, Any]:\n # We store empty run ids as [] but only accept None\n if "run_ids" in unpacked_dict and unpacked_dict["run_ids"] == []:\n unpacked_dict["run_ids"] = None\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=RunsFilterSerializer,\n old_storage_names={"PipelineRunsFilter"},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass RunsFilter(\n NamedTuple(\n "_RunsFilter",\n [\n ("run_ids", Sequence[str]),\n ("job_name", Optional[str]),\n ("statuses", Sequence[DagsterRunStatus]),\n ("tags", Mapping[str, Union[str, Sequence[str]]]),\n ("snapshot_id", Optional[str]),\n ("updated_after", Optional[datetime]),\n ("updated_before", Optional[datetime]),\n ("created_after", Optional[datetime]),\n ("created_before", Optional[datetime]),\n ],\n )\n):\n """Defines a filter across job runs, for use when querying storage directly.\n\n Each field of the RunsFilter represents a logical AND with each other. For\n example, if you specify job_name and tags, then you will receive only runs\n with the specified job_name AND the specified tags. If left blank, then\n all values will be permitted for that field.\n\n Args:\n run_ids (Optional[List[str]]): A list of job run_id values.\n job_name (Optional[str]):\n Name of the job to query for. If blank, all job_names will be accepted.\n statuses (Optional[List[DagsterRunStatus]]):\n A list of run statuses to filter by. If blank, all run statuses will be allowed.\n tags (Optional[Dict[str, Union[str, List[str]]]]):\n A dictionary of run tags to query by. All tags specified here must be present for a given run to pass the filter.\n snapshot_id (Optional[str]): The ID of the job snapshot to query for. 
Intended for internal use.\n updated_after (Optional[DateTime]): Filter by runs that were last updated before this datetime.\n created_before (Optional[DateTime]): Filter by runs that were created before this datetime.\n\n """\n\n def __new__(\n cls,\n run_ids: Optional[Sequence[str]] = None,\n job_name: Optional[str] = None,\n statuses: Optional[Sequence[DagsterRunStatus]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n snapshot_id: Optional[str] = None,\n updated_after: Optional[datetime] = None,\n updated_before: Optional[datetime] = None,\n created_after: Optional[datetime] = None,\n created_before: Optional[datetime] = None,\n ):\n check.invariant(run_ids != [], "When filtering on run ids, a non-empty list must be used.")\n\n return super(RunsFilter, cls).__new__(\n cls,\n run_ids=check.opt_sequence_param(run_ids, "run_ids", of_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n statuses=check.opt_sequence_param(statuses, "statuses", of_type=DagsterRunStatus),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n snapshot_id=check.opt_str_param(snapshot_id, "snapshot_id"),\n updated_after=check.opt_inst_param(updated_after, "updated_after", datetime),\n updated_before=check.opt_inst_param(updated_before, "updated_before", datetime),\n created_after=check.opt_inst_param(created_after, "created_after", datetime),\n created_before=check.opt_inst_param(created_before, "created_before", datetime),\n )\n\n @staticmethod\n def for_schedule(schedule: "ExternalSchedule") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_schedule(schedule))\n\n @staticmethod\n def for_sensor(sensor: "ExternalSensor") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_sensor(sensor))\n\n @staticmethod\n def for_backfill(backfill_id: str) -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_backfill_id(backfill_id))
\n\n\nclass JobBucket(NamedTuple):\n job_names: List[str]\n bucket_limit: Optional[int]\n\n\nclass TagBucket(NamedTuple):\n tag_key: str\n tag_values: List[str]\n bucket_limit: Optional[int]\n\n\n
[docs]class RunRecord(\n NamedTuple(\n "_RunRecord",\n [\n ("storage_id", int),\n ("dagster_run", DagsterRun),\n ("create_timestamp", datetime),\n ("update_timestamp", datetime),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n """Internal representation of a run record, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n\n Users should not invoke this class directly.\n """\n\n def __new__(\n cls,\n storage_id: int,\n dagster_run: DagsterRun,\n create_timestamp: datetime,\n update_timestamp: datetime,\n start_time: Optional[float] = None,\n end_time: Optional[float] = None,\n ):\n return super(RunRecord, cls).__new__(\n cls,\n storage_id=check.int_param(storage_id, "storage_id"),\n dagster_run=check.inst_param(dagster_run, "dagster_run", DagsterRun),\n create_timestamp=check.inst_param(create_timestamp, "create_timestamp", datetime),\n update_timestamp=check.inst_param(update_timestamp, "update_timestamp", datetime),\n # start_time and end_time fields will be populated once the run has started and ended, respectively, but will be None beforehand.\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )
\n\n\n@whitelist_for_serdes\nclass RunPartitionData(\n NamedTuple(\n "_RunPartitionData",\n [\n ("run_id", str),\n ("partition", str),\n ("status", DagsterRunStatus),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n partition: str,\n status: DagsterRunStatus,\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(RunPartitionData, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n partition=check.str_param(partition, "partition"),\n status=check.inst_param(status, "status", DagsterRunStatus),\n start_time=check.opt_inst(start_time, float),\n end_time=check.opt_inst(end_time, float),\n )\n\n\n###################################################################################################\n# GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' '.\n# | R I P |\n# | |\n# | Execution |\n# | Selector |\n# | |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass ExecutionSelector(\n NamedTuple("_ExecutionSelector", [("name", str), ("solid_subset", Optional[Sequence[str]])])\n):\n """Kept here to maintain loading of PipelineRuns from when it was still alive."""\n\n def __new__(cls, name: str, solid_subset: Optional[Sequence[str]] = None):\n return super(ExecutionSelector, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n solid_subset=(\n None\n if solid_subset is None\n else check.sequence_param(solid_subset, "solid_subset", of_type=str)\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/storage/dagster_run", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.dagster_run"}, "event_log": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.base

\nimport base64\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.event_api import EventHandlerFn, EventLogRecord, EventRecordsFilter\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.execution.stats import (\n    RunStepKeyStatsSnapshot,\n    build_run_stats_from_events,\n    build_run_step_stats_from_events,\n)\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.asset_check_execution_record import AssetCheckExecutionRecord\nfrom dagster._core.storage.dagster_run import DagsterRunStatsSnapshot\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._seven import json\nfrom dagster._utils import PrintFn\nfrom dagster._utils.concurrency import ConcurrencyClaimStatus, ConcurrencyKeyInfo\n\nif TYPE_CHECKING:\n    from dagster._core.events.log import EventLogEntry\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n\nclass EventLogConnection(NamedTuple):\n    records: Sequence[EventLogRecord]\n    cursor: str\n    has_more: bool\n\n\nclass EventLogCursorType(Enum):\n    OFFSET = "OFFSET"\n    STORAGE_ID = "STORAGE_ID"\n\n\nclass EventLogCursor(NamedTuple):\n    """Representation of an event record cursor, keeping track of the log query state."""\n\n    cursor_type: EventLogCursorType\n    value: int\n\n    def is_offset_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.OFFSET\n\n    def is_id_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.STORAGE_ID\n\n    def offset(self) -> int:\n        check.invariant(self.cursor_type == EventLogCursorType.OFFSET)\n        return max(0, 
int(self.value))\n\n    def storage_id(self) -> int:\n        check.invariant(self.cursor_type == EventLogCursorType.STORAGE_ID)\n        return int(self.value)\n\n    def __str__(self) -> str:\n        return self.to_string()\n\n    def to_string(self) -> str:\n        raw = json.dumps({"type": self.cursor_type.value, "value": self.value})\n        return base64.b64encode(bytes(raw, encoding="utf-8")).decode("utf-8")\n\n    @staticmethod\n    def parse(cursor_str: str) -> "EventLogCursor":\n        raw = json.loads(base64.b64decode(cursor_str).decode("utf-8"))\n        return EventLogCursor(EventLogCursorType(raw["type"]), raw["value"])\n\n    @staticmethod\n    def from_offset(offset: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.OFFSET, offset)\n\n    @staticmethod\n    def from_storage_id(storage_id: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.STORAGE_ID, storage_id)\n\n\nclass AssetEntry(\n    NamedTuple(\n        "_AssetEntry",\n        [\n            ("asset_key", AssetKey),\n            ("last_materialization_record", Optional[EventLogRecord]),\n            ("last_run_id", Optional[str]),\n            ("asset_details", Optional[AssetDetails]),\n            ("cached_status", Optional["AssetStatusCacheValue"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        asset_key: AssetKey,\n        last_materialization_record: Optional[EventLogRecord] = None,\n        last_run_id: Optional[str] = None,\n        asset_details: Optional[AssetDetails] = None,\n        cached_status: Optional["AssetStatusCacheValue"] = None,\n    ):\n        from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n        return super(AssetEntry, cls).__new__(\n            cls,\n            asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n            last_materialization_record=check.opt_inst_param(\n                last_materialization_record, "last_materialization_record", 
EventLogRecord\n            ),\n            last_run_id=check.opt_str_param(last_run_id, "last_run_id"),\n            asset_details=check.opt_inst_param(asset_details, "asset_details", AssetDetails),\n            cached_status=check.opt_inst_param(\n                cached_status, "cached_status", AssetStatusCacheValue\n            ),\n        )\n\n    @property\n    def last_materialization(self) -> Optional["EventLogEntry"]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.event_log_entry\n\n    @property\n    def last_materialization_storage_id(self) -> Optional[int]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.storage_id\n\n\n
[docs]class AssetRecord(NamedTuple):\n """Internal representation of an asset record, as stored in a :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not invoke this class directly.\n """\n\n storage_id: int\n asset_entry: AssetEntry
\n\n\n
[docs]class EventLogStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing structured event logs from pipeline runs.\n\n Note that event log storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.event_log.SqlEventLogStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n def get_logs_for_run(\n self,\n run_id: str,\n cursor: Optional[Union[str, int]] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> Sequence["EventLogEntry"]:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[Union[str, int]]): Cursor value to track paginated queries. 
Legacy\n support for integer offset cursors.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n if isinstance(cursor, int):\n cursor = EventLogCursor.from_offset(cursor + 1).to_string()\n records = self.get_records_for_run(\n run_id, cursor, of_type, limit, ascending=ascending\n ).records\n return [record.event_log_entry for record in records]\n\n @abstractmethod\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the event log records corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[str]): Cursor value to track paginated queries.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n """Get a summary of events that have ocurred in a run."""\n return build_run_stats_from_events(run_id, self.get_logs_for_run(run_id))\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n """Get per-step stats for a pipeline run."""\n logs = self.get_logs_for_run(run_id)\n if step_keys:\n logs = [\n event\n for event in logs\n if event.is_dagster_event and event.get_dagster_event().step_key in step_keys\n ]\n\n return build_run_step_stats_from_events(run_id, logs)\n\n @abstractmethod\n def store_event(self, event: "EventLogEntry") -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n\n @abstractmethod\n def delete_events(self, run_id: str) -> None:\n """Remove events for a given run id."""\n\n 
@abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the event_log tables."""\n\n @abstractmethod\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset tables."""\n\n @abstractmethod\n def wipe(self) -> None:\n """Clear the log storage."""\n\n @abstractmethod\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n """Call this method to start watching."""\n\n @abstractmethod\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n """Call this method to stop watching."""\n\n @property\n @abstractmethod\n def is_persistent(self) -> bool:\n """bool: Whether the storage is persistent."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n @abstractmethod\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n pass\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, "EventLogEntry"]:\n """Get event records across all runs. 
Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n def get_maximum_record_id(self) -> Optional[int]:\n """Get the current greatest record id in the event log. Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n @abstractmethod\n def can_cache_asset_status_data(self) -> bool:\n pass\n\n @abstractmethod\n def wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n pass\n\n @abstractmethod\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n pass\n\n @abstractmethod\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n pass\n\n @abstractmethod\n def all_asset_keys(self) -> Sequence[AssetKey]:\n pass\n\n @abstractmethod\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n pass\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n # base implementation of get_asset_keys, using the existing `all_asset_keys` and doing the\n # filtering in-memory\n asset_keys = sorted(self.all_asset_keys(), key=str)\n if prefix:\n asset_keys = [\n asset_key for asset_key in asset_keys if asset_key.path[: len(prefix)] == prefix\n ]\n if cursor:\n cursor_asset = AssetKey.from_db_string(cursor)\n if cursor_asset and cursor_asset in asset_keys:\n idx = asset_keys.index(cursor_asset)\n asset_keys = asset_keys[idx + 1 :]\n if limit:\n asset_keys = asset_keys[:limit]\n return asset_keys\n\n @abstractmethod\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n pass\n\n def supports_add_asset_event_tags(self) -> bool:\n return False\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n raise NotImplementedError()\n\n 
@abstractmethod\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n pass\n\n @abstractmethod\n def wipe_asset(self, asset_key: AssetKey) -> None:\n """Remove asset index history from event log for given asset_key."""\n\n @abstractmethod\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n pass\n\n @abstractmethod\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n pass\n\n @abstractmethod\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n pass\n\n @abstractmethod\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n pass\n\n @abstractmethod\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n pass\n\n @abstractmethod\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a dynamic partitions definition."""\n raise NotImplementedError()\n\n @abstractmethod\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a dynamic partition exists."""\n raise NotImplementedError()\n\n @abstractmethod\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add a partition for the specified dynamic partitions definition."""\n raise 
NotImplementedError()\n\n @abstractmethod\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified dynamic partitions definition."""\n raise NotImplementedError()\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @property\n def is_run_sharded(self) -> bool:\n """Indicates that the EventLogStoarge is sharded."""\n return False\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n """Indicates that the EventLogStorage supports global concurrency limits."""\n return False\n\n @abstractmethod\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate concurrency slots for the given concurrency key."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get concurrency info for key."""\n raise NotImplementedError()\n\n @abstractmethod\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] = None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slots for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slots for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_run_ids(self) -> Set[str]:\n """Get a list of run_ids that are occupying or waiting for a concurrency key slot."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n """Frees concurrency slots for a given run."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> 
None:\n """Frees concurrency slots for a given run/step."""\n raise NotImplementedError()\n\n @property\n def supports_asset_checks(self):\n return True\n\n def get_asset_check_executions(\n self,\n asset_key: AssetKey,\n check_name: str,\n limit: int,\n cursor: Optional[int] = None,\n materialization_event_storage_id: Optional[int] = None,\n include_planned: bool = True,\n ) -> Sequence[AssetCheckExecutionRecord]:\n """Get the executions for an asset check, sorted by recency. If materialization_event_storage_id\n is set and include_planned is True, the returned Sequence will include executions that are planned\n but do not have a target materialization yet (since we don't set the target until the check is executed).\n """\n raise NotImplementedError()
\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.base"}, "sql_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sql_event_log

\nimport logging\nfrom abc import abstractmethod\nfrom collections import OrderedDict, defaultdict\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.errors import (\n    DagsterEventLogInvalidForRun,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.event_api import RunShardedEventsCursor\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS, MARKER_EVENTS, DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.stats import RunStepKeyStatsSnapshot, build_run_step_stats_from_events\nfrom dagster._core.storage.asset_check_execution_record import (\n    AssetCheckExecutionRecord,\n    AssetCheckExecutionRecordStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_case,\n    db_fetch_mappings,\n    db_select,\n    db_subquery,\n)\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._utils import (\n    PrintFn,\n    datetime_as_float,\n    utc_datetime_from_naive,\n    utc_datetime_from_timestamp,\n)\nfrom dagster._utils.concurrency import (\n    ConcurrencyClaimStatus,\n    ConcurrencyKeyInfo,\n    
ConcurrencySlotStatus,\n)\n\nfrom ..dagster_run import DagsterRunStatsSnapshot\nfrom .base import (\n    AssetEntry,\n    AssetRecord,\n    EventLogConnection,\n    EventLogCursor,\n    EventLogRecord,\n    EventLogStorage,\n    EventRecordsFilter,\n)\nfrom .migration import ASSET_DATA_MIGRATIONS, ASSET_KEY_INDEX_COLS, EVENT_LOG_DATA_MIGRATIONS\nfrom .schema import (\n    AssetCheckExecutionsTable,\n    AssetEventTagsTable,\n    AssetKeyTable,\n    ConcurrencySlotsTable,\n    DynamicPartitionsTable,\n    PendingStepsTable,\n    SecondaryIndexMigrationTable,\n    SqlEventLogStorageTable,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\nMAX_CONCURRENCY_SLOTS = 1000\nMIN_ASSET_ROWS = 25\n\n# We are using third-party library objects for DB connections-- at this time, these libraries are\n# untyped. When/if we upgrade to typed variants, the `Any` here can be replaced or the alias as a\n# whole can be dropped.\nSqlDbConnection: TypeAlias = Any\n\n\n
[docs]class SqlEventLogStorage(EventLogStorage):\n """Base class for SQL backed event log storages.\n\n Distinguishes between run-based connections and index connections in order to support run-level\n sharding, while maintaining the ability to do cross-run queries\n """\n\n @abstractmethod\n def run_connection(self, run_id: Optional[str]) -> ContextManager[Connection]:\n """Context manager yielding a connection to access the event logs for a specific run.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def index_connection(self) -> ContextManager[Connection]:\n """Context manager yielding a connection to access cross-run indexed tables."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def has_table(self, table_name: str) -> bool:\n """This method checks if a table exists in the database."""\n\n def prepare_insert_event(self, event):\n """Helper method for preparing the event log SQL insertion statement. Abstracted away to\n have a single place for the logical table representation of the event, while having a way\n for SQL backends to implement different execution implementations for `store_event`. 
See\n the `dagster-postgres` implementation which overrides the generic SQL implementation of\n `store_event`.\n """\n dagster_event_type = None\n asset_key_str = None\n partition = None\n step_key = event.step_key\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n step_key = event.dagster_event.step_key\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n if event.dagster_event.partition:\n partition = event.dagster_event.partition\n\n # https://stackoverflow.com/a/54386260/324449\n return SqlEventLogStorageTable.insert().values(\n run_id=event.run_id,\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n # Postgres requires a datetime that is in UTC but has no timezone info set\n # in order to be stored correctly\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=step_key,\n asset_key=asset_key_str,\n partition=partition,\n )\n\n def has_asset_key_col(self, column_name: str) -> bool:\n with self.index_connection() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(AssetKeyTable.name)]\n return column_name in column_names\n\n def has_asset_key_index_cols(self) -> bool:\n return self.has_asset_key_col("last_materialization_timestamp")\n\n def store_asset_event(self, event: EventLogEntry, event_id: int):\n check.inst_param(event, "event", EventLogEntry)\n\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming 
`last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n values = self._get_asset_entry_values(event, event_id, self.has_asset_key_index_cols())\n insert_statement = AssetKeyTable.insert().values(\n asset_key=event.dagster_event.asset_key.to_string(), **values\n )\n update_statement = (\n AssetKeyTable.update()\n .values(**values)\n .where(\n AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),\n )\n )\n\n with self.index_connection() as conn:\n try:\n conn.execute(insert_statement)\n except db_exc.IntegrityError:\n conn.execute(update_statement)\n\n def _get_asset_entry_values(\n self, event: EventLogEntry, event_id: int, has_asset_key_index_cols: bool\n ) -> Dict[str, Any]:\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n entry_values: Dict[str, Any] = {}\n dagster_event = check.not_none(event.dagster_event)\n if dagster_event.is_step_materialization:\n entry_values.update(\n {\n "last_materialization": serialize_value(\n EventLogRecord(\n storage_id=event_id,\n event_log_entry=event,\n )\n ),\n "last_run_id": event.run_id,\n }\n )\n if has_asset_key_index_cols:\n 
entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_materialization_planned:\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n entry_values.update({"last_run_id": event.run_id})\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_observation:\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n\n return entry_values\n\n def supports_add_asset_event_tags(self) -> bool:\n return self.has_table(AssetEventTagsTable.name)\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n check.int_param(event_id, "event_id")\n check.float_param(event_timestamp, "event_timestamp")\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n if not self.supports_add_asset_event_tags():\n raise DagsterInvalidInvocationError(\n "In order to add asset event tags, you must run `dagster instance migrate` to "\n "create the AssetEventTags table."\n )\n\n current_tags_list = self.get_event_tags_for_asset(asset_key, filter_event_id=event_id)\n\n asset_key_str = asset_key.to_string()\n\n if len(current_tags_list) == 0:\n current_tags: Mapping[str, str] = {}\n else:\n current_tags = current_tags_list[0]\n\n with self.index_connection() as conn:\n current_tags_set = 
set(current_tags.keys())\n new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n AssetEventTagsTable.update()\n .where(\n db.and_(\n AssetEventTagsTable.c.event_id == event_id,\n AssetEventTagsTable.c.asset_key == asset_key_str,\n AssetEventTagsTable.c.key == tag,\n )\n )\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=tag,\n value=new_tags[tag],\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event_timestamp),\n )\n for tag in added_tags\n ],\n )\n\n def store_asset_event_tags(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.int_param(event_id, "event_id")\n\n if event.dagster_event and event.dagster_event.asset_key:\n if event.dagster_event.is_step_materialization:\n tags = event.dagster_event.step_materialization_data.materialization.tags\n elif event.dagster_event.is_asset_observation:\n tags = event.dagster_event.asset_observation_data.asset_observation.tags\n else:\n tags = None\n\n if not tags or not self.has_table(AssetEventTagsTable.name):\n # If tags table does not exist, silently exit. 
This is to support OSS\n # users who have not yet run the migration to create the table.\n # On read, we will throw an error if the table does not exist.\n return\n\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=key,\n value=value,\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n )\n for key, value in tags.items()\n ],\n )\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n event_id = None\n\n with self.run_connection(run_id) as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def get_records_for_run(\n self,\n run_id,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n 
cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,\n i.e., if cursor is -1, all logs will be returned. (default: -1)\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): the maximum number of events to fetch\n """\n check.str_param(run_id, "run_id")\n check.opt_str_param(cursor, "cursor")\n\n check.invariant(not of_type or isinstance(of_type, (DagsterEventType, frozenset, set)))\n\n dagster_event_types = (\n {of_type}\n if isinstance(of_type, DagsterEventType)\n else check.opt_set_param(of_type, "dagster_event_type", of_type=DagsterEventType)\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .order_by(\n SqlEventLogStorageTable.c.id.asc()\n if ascending\n else SqlEventLogStorageTable.c.id.desc()\n )\n )\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n # adjust 0 based index cursor to SQL offset\n if cursor is not None:\n cursor_obj = EventLogCursor.parse(cursor)\n if cursor_obj.is_offset_cursor():\n query = query.offset(cursor_obj.offset())\n elif cursor_obj.is_id_cursor():\n if ascending:\n query = query.where(SqlEventLogStorageTable.c.id > cursor_obj.storage_id())\n else:\n query = query.where(SqlEventLogStorageTable.c.id < cursor_obj.storage_id())\n\n if limit:\n query = query.limit(limit)\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n last_record_id = None\n try:\n records = []\n for (\n record_id,\n json_str,\n ) in results:\n records.append(\n EventLogRecord(\n storage_id=record_id,\n event_log_entry=deserialize_value(json_str, EventLogEntry),\n )\n )\n last_record_id = record_id\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) 
from err\n\n if last_record_id is not None:\n next_cursor = EventLogCursor.from_storage_id(last_record_id).to_string()\n elif cursor:\n # record fetch returned no new logs, return the same cursor\n next_cursor = cursor\n else:\n # rely on the fact that all storage ids will be positive integers\n next_cursor = EventLogCursor.from_storage_id(-1).to_string()\n\n return EventLogConnection(\n records=records,\n cursor=next_cursor,\n has_more=bool(limit and len(results) == limit),\n )\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n check.str_param(run_id, "run_id")\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.count().label("n_events_of_type"),\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("last_event_timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.run_id == run_id,\n SqlEventLogStorageTable.c.dagster_event_type != None, # noqa: E711\n )\n )\n .group_by("dagster_event_type")\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n try:\n counts = {}\n times = {}\n for result in results:\n (dagster_event_type, n_events_of_type, last_event_timestamp) = result\n check.invariant(dagster_event_type is not None)\n counts[dagster_event_type] = n_events_of_type\n times[dagster_event_type] = last_event_timestamp\n\n enqueued_time = times.get(DagsterEventType.PIPELINE_ENQUEUED.value, None)\n launch_time = times.get(DagsterEventType.PIPELINE_STARTING.value, None)\n start_time = times.get(DagsterEventType.PIPELINE_START.value, None)\n end_time = times.get(\n DagsterEventType.PIPELINE_SUCCESS.value,\n times.get(\n DagsterEventType.PIPELINE_FAILURE.value,\n times.get(DagsterEventType.PIPELINE_CANCELED.value, None),\n ),\n )\n\n return DagsterRunStatsSnapshot(\n run_id=run_id,\n steps_succeeded=counts.get(DagsterEventType.STEP_SUCCESS.value, 0),\n steps_failed=counts.get(DagsterEventType.STEP_FAILURE.value, 0),\n 
materializations=counts.get(DagsterEventType.ASSET_MATERIALIZATION.value, 0),\n expectations=counts.get(DagsterEventType.STEP_EXPECTATION_RESULT.value, 0),\n enqueued_time=datetime_as_float(enqueued_time) if enqueued_time else None,\n launch_time=datetime_as_float(launch_time) if launch_time else None,\n start_time=datetime_as_float(start_time) if start_time else None,\n end_time=datetime_as_float(end_time) if end_time else None,\n )\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n check.str_param(run_id, "run_id")\n check.opt_list_param(step_keys, "step_keys", of_type=str)\n\n # Originally, this was two different queries:\n # 1) one query which aggregated top-level step stats by grouping by event type / step_key in\n # a single query, using pure SQL (e.g. start_time, end_time, status, attempt counts).\n # 2) one query which fetched all the raw events for a specific event type and then inspected\n # the deserialized event object to aggregate stats derived from sequences of events.\n # (e.g. marker events, materializations, expectations resuls, attempts timing, etc.)\n #\n # For simplicity, we now just do the second type of query and derive the stats in Python\n # from the raw events. This has the benefit of being easier to read and also the benefit of\n # being able to share code with the in-memory event log storage implementation. 
We may\n # choose to revisit this in the future, especially if we are able to do JSON-column queries\n # in SQL as a way of bypassing the serdes layer in all cases.\n raw_event_query = (\n db_select([SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None) # noqa: E711\n .where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [\n DagsterEventType.STEP_START.value,\n DagsterEventType.STEP_SUCCESS.value,\n DagsterEventType.STEP_SKIPPED.value,\n DagsterEventType.STEP_FAILURE.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.ASSET_MATERIALIZATION.value,\n DagsterEventType.STEP_EXPECTATION_RESULT.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.STEP_UP_FOR_RETRY.value,\n ]\n + [marker_event.value for marker_event in MARKER_EVENTS]\n )\n )\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if step_keys:\n raw_event_query = raw_event_query.where(\n SqlEventLogStorageTable.c.step_key.in_(step_keys)\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(raw_event_query).fetchall()\n\n try:\n records = [deserialize_value(json_str, EventLogEntry) for (json_str,) in results]\n return build_run_step_stats_from_events(run_id, records)\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def _apply_migration(self, migration_name, migration_fn, print_fn, force):\n if self.has_secondary_index(migration_name):\n if not force:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n return\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.enable_secondary_index(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any 
data migrations across the event_log table."""\n for migration_name, migration_fn in EVENT_LOG_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset_keys table."""\n for migration_name, migration_fn in ASSET_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def wipe(self) -> None:\n """Clears the event log storage."""\n # Should be overridden by SqliteEventLogStorage and other storages that shard based on\n # run_id\n\n # https://stackoverflow.com/a/54386260/324449\n with self.run_connection(run_id=None) as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n self._wipe_index()\n\n def _wipe_index(self):\n with self.index_connection() as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n def delete_events(self, run_id: str) -> None:\n 
with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n self.free_concurrency_slots_for_run(run_id)\n\n def delete_events_for_run(self, conn: Connection, run_id: str) -> None:\n check.str_param(run_id, "run_id")\n conn.execute(\n SqlEventLogStorageTable.delete().where(SqlEventLogStorageTable.c.run_id == run_id)\n )\n\n @property\n def is_persistent(self) -> bool:\n return True\n\n def update_event_log_record(self, record_id: int, event: EventLogEntry) -> None:\n """Utility method for migration scripts to update SQL representation of event records."""\n check.int_param(record_id, "record_id")\n check.inst_param(event, "event", EventLogEntry)\n dagster_event_type = None\n asset_key_str = None\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value # type: ignore\n if event.dagster_event.asset_key: # type: ignore\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey) # type: ignore\n asset_key_str = event.dagster_event.asset_key.to_string() # type: ignore\n\n with self.run_connection(run_id=event.run_id) as conn:\n conn.execute(\n SqlEventLogStorageTable.update()\n .where(SqlEventLogStorageTable.c.id == record_id)\n .values(\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=event.step_key,\n asset_key=asset_key_str,\n )\n )\n\n def get_event_log_table_data(self, run_id: str, record_id: int) -> Optional[SqlAlchemyRow]:\n """Utility method to test representation of the record in the SQL table. 
Returns all of\n the columns stored in the event log storage (as opposed to the deserialized `EventLogEntry`).\n This allows checking that certain fields are extracted to support performant lookups (e.g.\n extracting `step_key` for fast filtering).\n """\n with self.run_connection(run_id=run_id) as conn:\n query = (\n db_select([SqlEventLogStorageTable])\n .where(SqlEventLogStorageTable.c.id == record_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n return conn.execute(query).fetchone()\n\n def has_secondary_index(self, name: str) -> bool:\n """This method uses a checkpoint migration table to see if summary data has been constructed\n in a secondary index table. Can be used to checkpoint event_log data migrations.\n """\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def enable_secondary_index(self, name: str) -> None:\n """This method marks an event_log data migration as complete, to indicate that a summary\n data migration is complete.\n """\n query = SecondaryIndexMigrationTable.insert().values(\n name=name,\n migration_completed=datetime.now(),\n )\n with self.index_connection() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == name)\n .values(migration_completed=datetime.now())\n )\n\n def _apply_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n event_records_filter: EventRecordsFilter,\n asset_details: Optional[AssetDetails] = None,\n apply_cursor_filters: bool = True,\n ) -> SqlAlchemyQuery:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type == event_records_filter.event_type.value\n )\n\n if event_records_filter.asset_key:\n query = query.where(\n 
SqlEventLogStorageTable.c.asset_key == event_records_filter.asset_key.to_string(),\n )\n\n if event_records_filter.asset_partitions:\n query = query.where(\n SqlEventLogStorageTable.c.partition.in_(event_records_filter.asset_partitions)\n )\n\n if asset_details and asset_details.last_wipe_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if apply_cursor_filters:\n # allow the run-sharded sqlite implementation to disable this cursor filtering so that\n # it can implement its own custom cursor logic, as cursor ids are not unique across run\n # shards\n if event_records_filter.before_cursor is not None:\n before_cursor_id = (\n event_records_filter.before_cursor.id\n if isinstance(event_records_filter.before_cursor, RunShardedEventsCursor)\n else event_records_filter.before_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor_id)\n\n if event_records_filter.after_cursor is not None:\n after_cursor_id = (\n event_records_filter.after_cursor.id\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else event_records_filter.after_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor_id)\n\n if event_records_filter.before_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n < datetime.utcfromtimestamp(event_records_filter.before_timestamp)\n )\n\n if event_records_filter.after_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(event_records_filter.after_timestamp)\n )\n\n if event_records_filter.storage_ids:\n query = query.where(SqlEventLogStorageTable.c.id.in_(event_records_filter.storage_ids))\n\n if event_records_filter.tags and self.has_table(AssetEventTagsTable.name):\n # If we don't have the tags table, we'll filter the results after the query\n check.invariant(\n isinstance(event_records_filter.asset_key, AssetKey),\n "Asset key 
must be set in event records filter to filter by tags.",\n )\n if self.supports_intersect:\n intersections = [\n db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key\n == event_records_filter.asset_key.to_string(), # type: ignore # (bad sig?)\n AssetEventTagsTable.c.key == key,\n (\n AssetEventTagsTable.c.value == value\n if isinstance(value, str)\n else AssetEventTagsTable.c.value.in_(value)\n ),\n )\n )\n for key, value in event_records_filter.tags.items()\n ]\n query = query.where(SqlEventLogStorageTable.c.id.in_(db.intersect(*intersections)))\n\n return query\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n asset_key: Optional[AssetKey],\n ) -> db.Table:\n event_id_col = table.c.id if table == SqlEventLogStorageTable else table.c.event_id\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = db_subquery(\n db_select([AssetEventTagsTable]), f"asset_event_tags_subquery_{i}"\n )\n table = table.join(\n tags_table,\n db.and_(\n event_id_col == tags_table.c.event_id,\n not asset_key or tags_table.c.asset_key == asset_key.to_string(),\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n """Returns a list of (record_id, record)."""\n check.inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if (\n event_records_filter.tags\n and not self.supports_intersect\n and self.has_table(AssetEventTagsTable.name)\n ):\n table = 
self._apply_tags_table_joins(\n SqlEventLogStorageTable, event_records_filter.tags, event_records_filter.asset_key\n )\n else:\n table = SqlEventLogStorageTable\n\n query = db_select(\n [SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event]\n ).select_from(table)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n )\n if limit:\n query = query.limit(limit)\n\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.id.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.id.desc())\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n event_records = []\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, NamedTuple)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n continue\n\n if event_records_filter.tags and not self.has_table(AssetEventTagsTable.name):\n # If we can't filter tags via the tags table, filter the returned records\n if limit is not None:\n raise DagsterInvalidInvocationError(\n "Cannot filter events on tags with a limit, without the asset event "\n "tags table. 
To fix, run `dagster instance migrate`."\n )\n\n event_record_tags = event_record.tags\n if not event_record_tags or any(\n event_record_tags.get(k) != v for k, v in event_records_filter.tags.items()\n ):\n continue\n\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n return event_records\n\n def supports_event_consumer_queries(self) -> bool:\n return True\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, EventLogEntry]:\n check.int_param(after_cursor, "after_cursor")\n check.invariant(\n after_cursor >= -1,\n f"Don't know what to do with negative cursor {after_cursor}",\n )\n dagster_event_types = (\n {dagster_event_type}\n if isinstance(dagster_event_type, DagsterEventType)\n else check.opt_set_param(\n dagster_event_type, "dagster_event_type", of_type=DagsterEventType\n )\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.id > after_cursor)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n if limit:\n query = query.limit(limit)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n events = {}\n record_id = None\n try:\n for (\n record_id,\n json_str,\n ) in results:\n events[record_id] = deserialize_value(json_str, EventLogEntry)\n except (seven.JSONDecodeError, DeserializationError):\n logging.warning("Could not parse event record id `%s`.", record_id)\n\n return events\n\n def 
get_maximum_record_id(self) -> Optional[int]:\n with self.index_connection() as conn:\n result = conn.execute(db_select([db.func.max(SqlEventLogStorageTable.c.id)])).fetchone()\n return result[0] # type: ignore\n\n def _construct_asset_record_from_row(\n self,\n row,\n last_materialization_record: Optional[EventLogRecord],\n can_cache_asset_status_data: bool,\n ) -> AssetRecord:\n from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n return AssetRecord(\n storage_id=row["id"],\n asset_entry=AssetEntry(\n asset_key=asset_key,\n last_materialization_record=last_materialization_record,\n last_run_id=row["last_run_id"],\n asset_details=AssetDetails.from_db_string(row["asset_details"]),\n cached_status=(\n AssetStatusCacheValue.from_db_string(row["cached_status_data"])\n if can_cache_asset_status_data\n else None\n ),\n ),\n )\n else:\n check.failed("Row did not contain asset key.")\n\n def _get_latest_materialization_records(\n self, raw_asset_rows\n ) -> Mapping[AssetKey, Optional[EventLogRecord]]:\n # Given a list of raw asset rows, returns a mapping of asset key to latest asset materialization\n # event log entry. 
Fetches backcompat EventLogEntry records when the last_materialization\n # in the raw asset row is an AssetMaterialization.\n to_backcompat_fetch = set()\n results: Dict[AssetKey, Optional[EventLogRecord]] = {}\n for row in raw_asset_rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if not asset_key:\n continue\n event_or_materialization = (\n deserialize_value(row["last_materialization"], NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(event_or_materialization, EventLogRecord):\n results[asset_key] = event_or_materialization\n else:\n to_backcompat_fetch.add(asset_key)\n\n latest_event_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in to_backcompat_fetch]\n ),\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key),\n "latest_event_subquery",\n )\n backcompat_query = db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.id,\n SqlEventLogStorageTable.c.event,\n ]\n ).select_from(\n latest_event_subquery.join(\n SqlEventLogStorageTable,\n db.and_(\n SqlEventLogStorageTable.c.asset_key == latest_event_subquery.c.asset_key,\n SqlEventLogStorageTable.c.id == latest_event_subquery.c.id,\n ),\n )\n )\n with self.index_connection() as conn:\n event_rows = db_fetch_mappings(conn, backcompat_query)\n\n for row in event_rows:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row["asset_key"]))\n if asset_key:\n results[asset_key] = EventLogRecord(\n storage_id=cast(int, row["id"]),\n event_log_entry=deserialize_value(cast(str, row["event"]), EventLogEntry),\n )\n return results\n\n def can_cache_asset_status_data(self) -> bool:\n return self.has_asset_key_col("cached_status_data")\n\n def 
wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n if self.can_cache_asset_status_data():\n check.inst_param(asset_key, "asset_key", AssetKey)\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(dict(cached_status_data=None))\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n latest_materialization_records = self._get_latest_materialization_records(rows)\n can_cache_asset_status_data = self.can_cache_asset_status_data()\n\n asset_records: List[AssetRecord] = []\n for row in rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n asset_records.append(\n self._construct_asset_record_from_row(\n row,\n latest_materialization_records.get(asset_key),\n can_cache_asset_status_data,\n )\n )\n\n return asset_records\n\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n check.inst_param(asset_key, "asset_key", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=[asset_key])\n return bool(rows)\n\n def all_asset_keys(self):\n rows = self._fetch_asset_rows()\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n rows = self._fetch_asset_rows(prefix=prefix, limit=limit, cursor=cursor)\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n 
check.iterable_param(asset_keys, "asset_keys", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n return {\n asset_key: event_log_record.event_log_entry if event_log_record is not None else None\n for asset_key, event_log_record in self._get_latest_materialization_records(\n rows\n ).items()\n }\n\n def _fetch_asset_rows(\n self,\n asset_keys=None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[SqlAlchemyRow]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments.\n #\n # Differs from _fetch_raw_asset_rows, in that it loops through to make sure enough rows are\n # returned to satisfy the limit.\n #\n # returns a list of rows where each row is a tuple of serialized asset_key, materialization,\n # and asset_details\n should_query = True\n current_cursor = cursor\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # if we have migrated, we can limit using SQL\n fetch_limit = limit\n else:\n # if we haven't migrated, overfetch in case the first N results are wiped\n fetch_limit = max(limit, MIN_ASSET_ROWS) if limit else None\n result = []\n\n while should_query:\n rows, has_more, current_cursor = self._fetch_raw_asset_rows(\n asset_keys=asset_keys, prefix=prefix, limit=fetch_limit, cursor=current_cursor\n )\n result.extend(rows)\n should_query = bool(has_more) and bool(limit) and len(result) < cast(int, limit)\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if not is_partial_query and self._can_mark_assets_as_migrated(rows): # type: ignore\n self.enable_secondary_index(ASSET_KEY_INDEX_COLS)\n\n return result[:limit] if limit else result\n\n def _fetch_raw_asset_rows(\n self,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor=None,\n ) -> 
Tuple[Iterable[SqlAlchemyRow], bool, Optional[str]]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments. Does not guarantee that the number of\n # rows returned will match the limit specified. This helper function is used to fetch a\n # chunk of asset key rows, which may or may not be wiped.\n #\n # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized\n # asset_key, materialization, and asset_details\n # TODO update comment\n\n columns = [\n AssetKeyTable.c.id,\n AssetKeyTable.c.asset_key,\n AssetKeyTable.c.last_materialization,\n AssetKeyTable.c.last_run_id,\n AssetKeyTable.c.asset_details,\n ]\n if self.can_cache_asset_status_data():\n columns.extend([AssetKeyTable.c.cached_status_data])\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if self.has_asset_key_index_cols() and not is_partial_query:\n # if the schema has been migrated, fetch the last_materialization_timestamp to see if\n # we can lazily migrate the data table\n columns.append(AssetKeyTable.c.last_materialization_timestamp)\n columns.append(AssetKeyTable.c.wipe_timestamp)\n\n query = db_select(columns).order_by(AssetKeyTable.c.asset_key.asc())\n query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n query = query.where(\n db.or_(\n AssetKeyTable.c.wipe_timestamp.is_(None),\n AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,\n )\n )\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n return rows, False, None\n\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n wiped_timestamps_by_asset_key: Dict[AssetKey, float] = {}\n row_by_asset_key: Dict[AssetKey, SqlAlchemyRow] = OrderedDict()\n\n for row in rows:\n asset_key = AssetKey.from_db_string(cast(str, 
row["asset_key"]))\n if not asset_key:\n continue\n asset_details = AssetDetails.from_db_string(row["asset_details"])\n if not asset_details or not asset_details.last_wipe_timestamp:\n row_by_asset_key[asset_key] = row\n continue\n materialization_or_event_or_record = (\n deserialize_value(cast(str, row["last_materialization"]), NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(materialization_or_event_or_record, (EventLogRecord, EventLogEntry)):\n if isinstance(materialization_or_event_or_record, EventLogRecord):\n event_timestamp = materialization_or_event_or_record.event_log_entry.timestamp\n else:\n event_timestamp = materialization_or_event_or_record.timestamp\n\n if asset_details.last_wipe_timestamp > event_timestamp:\n # this asset has not been materialized since being wiped, skip\n continue\n else:\n # add the key\n row_by_asset_key[asset_key] = row\n else:\n row_by_asset_key[asset_key] = row\n wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp\n\n if wiped_timestamps_by_asset_key:\n materialization_times = self._fetch_backcompat_materialization_times(\n wiped_timestamps_by_asset_key.keys() # type: ignore\n )\n for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():\n materialization_time = materialization_times.get(asset_key)\n if not materialization_time or utc_datetime_from_naive(\n materialization_time\n ) < utc_datetime_from_timestamp(wiped_timestamp):\n # remove rows that have not been materialized since being wiped\n row_by_asset_key.pop(asset_key)\n\n has_more = limit and len(rows) == limit\n new_cursor = rows[-1]["id"] if rows else None\n\n return row_by_asset_key.values(), has_more, new_cursor # type: ignore\n\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n if self.can_cache_asset_status_data():\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .where(\n 
AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n .values(cached_status_data=serialize_value(cache_values))\n )\n\n def _fetch_backcompat_materialization_times(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, datetime]:\n # fetches the latest materialization timestamp for the given asset_keys. Uses the (slower)\n # raw event log table.\n backcompat_query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("timestamp"),\n ]\n )\n .where(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())\n )\n with self.index_connection() as conn:\n backcompat_rows = db_fetch_mappings(conn, backcompat_query)\n return {AssetKey.from_db_string(row["asset_key"]): row["timestamp"] for row in backcompat_rows} # type: ignore\n\n def _can_mark_assets_as_migrated(self, rows):\n if not self.has_asset_key_index_cols():\n return False\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # we have already migrated\n return False\n\n for row in rows:\n if not _get_from_row(row, "last_materialization_timestamp"):\n return False\n\n if _get_from_row(row, "asset_details") and not _get_from_row(row, "wipe_timestamp"):\n return False\n\n return True\n\n def _apply_asset_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix=None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> SqlAlchemyQuery:\n if asset_keys is not None:\n query = query.where(\n AssetKeyTable.c.asset_key.in_([asset_key.to_string() for asset_key in asset_keys])\n )\n\n if prefix:\n prefix_str = seven.dumps(prefix)[:-1]\n query = query.where(AssetKeyTable.c.asset_key.startswith(prefix_str))\n\n if cursor:\n query = query.where(AssetKeyTable.c.asset_key > cursor)\n\n if limit:\n query = 
query.limit(limit)\n return query\n\n def _get_assets_details(\n self, asset_keys: Sequence[AssetKey]\n ) -> Sequence[Optional[AssetDetails]]:\n check.sequence_param(asset_keys, "asset_key", AssetKey)\n rows = None\n with self.index_connection() as conn:\n rows = db_fetch_mappings(\n conn,\n db_select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details]).where(\n AssetKeyTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n ),\n )\n\n asset_key_to_details = {\n cast(str, row["asset_key"]): (\n deserialize_value(cast(str, row["asset_details"]), AssetDetails)\n if row["asset_details"]\n else None\n )\n for row in rows\n }\n\n # returns a list of the corresponding asset_details to provided asset_keys\n return [\n asset_key_to_details.get(asset_key.to_string(), None) for asset_key in asset_keys\n ]\n\n def _add_assets_wipe_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n assets_details: Sequence[Optional[AssetDetails]],\n asset_keys: Sequence[AssetKey],\n ) -> SqlAlchemyQuery:\n check.invariant(\n len(assets_details) == len(asset_keys),\n "asset_details and asset_keys must be the same length",\n )\n for i in range(len(assets_details)):\n asset_key, asset_details = asset_keys[i], assets_details[i]\n if asset_details and asset_details.last_wipe_timestamp:\n asset_key_in_row = SqlEventLogStorageTable.c.asset_key == asset_key.to_string()\n # If asset key is in row, keep the row if the timestamp > wipe timestamp, else remove the row.\n # If asset key is not in row, keep the row.\n query = query.where(\n db.or_(\n db.and_(\n asset_key_in_row,\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp),\n ),\n db.not_(asset_key_in_row),\n )\n )\n\n return query\n\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given 
asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, fetches only tags applied to the given event.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n filter_tags = check.opt_mapping_param(\n filter_tags, "filter_tags", key_type=str, value_type=str\n )\n filter_event_id = check.opt_int_param(filter_event_id, "filter_event_id")\n\n if not self.has_table(AssetEventTagsTable.name):\n raise DagsterInvalidInvocationError(\n "In order to search for asset event tags, you must run "\n "`dagster instance migrate` to create the AssetEventTags table."\n )\n\n asset_details = self._get_assets_details([asset_key])[0]\n if not filter_tags:\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(AssetEventTagsTable.c.asset_key == asset_key.to_string())\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n elif self.supports_intersect:\n\n def get_tag_filter_query(tag_key, tag_value):\n filter_query = db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key == asset_key.to_string(),\n AssetEventTagsTable.c.key == tag_key,\n AssetEventTagsTable.c.value == tag_value,\n )\n )\n if asset_details and asset_details.last_wipe_timestamp:\n filter_query = filter_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n return filter_query\n\n intersections = [\n 
get_tag_filter_query(tag_key, tag_value)\n for tag_key, tag_value in filter_tags.items()\n ]\n\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(\n db.and_(\n AssetEventTagsTable.c.event_id.in_(db.intersect(*intersections)),\n )\n )\n else:\n table = self._apply_tags_table_joins(AssetEventTagsTable, filter_tags, asset_key)\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).select_from(table)\n\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if filter_event_id is not None:\n tags_query = tags_query.where(AssetEventTagsTable.c.event_id == filter_event_id)\n\n with self.index_connection() as conn:\n results = conn.execute(tags_query).fetchall()\n\n tags_by_event_id: Dict[int, Dict[str, str]] = defaultdict(dict)\n for row in results:\n key, value, event_id = row\n tags_by_event_id[event_id][key] = value\n\n return list(tags_by_event_id.values())\n\n def _asset_materialization_from_json_column(\n self, json_str: str\n ) -> Optional[AssetMaterialization]:\n if not json_str:\n return None\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. 
For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n event_or_materialization = deserialize_value(json_str, NamedTuple)\n if isinstance(event_or_materialization, AssetMaterialization):\n return event_or_materialization\n\n if (\n not isinstance(event_or_materialization, EventLogEntry)\n or not event_or_materialization.is_dagster_event\n or not event_or_materialization.dagster_event.asset_key # type: ignore\n ):\n return None\n\n return event_or_materialization.dagster_event.step_materialization_data.materialization # type: ignore\n\n def _get_asset_key_values_on_wipe(self) -> Mapping[str, Any]:\n wipe_timestamp = pendulum.now("UTC").timestamp()\n values = {\n "asset_details": serialize_value(AssetDetails(last_wipe_timestamp=wipe_timestamp)),\n "last_run_id": None,\n }\n if self.has_asset_key_index_cols():\n values.update(\n dict(\n wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp),\n )\n )\n if self.can_cache_asset_status_data():\n values.update(dict(cached_status_data=None))\n return values\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n check.inst_param(asset_key, "asset_key", AssetKey)\n wiped_values = self._get_asset_key_values_on_wipe()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(**wiped_values)\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.partition)\n )\n\n 
assets_details = self._get_assets_details([asset_key])\n query = self._add_assets_wipe_filter_to_query(query, assets_details, [asset_key])\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n if before_cursor:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return set([cast(str, row[0]) for row in results])\n\n def get_materialization_count_by_partition(\n self,\n asset_keys: Sequence[AssetKey],\n after_cursor: Optional[int] = None,\n before_cursor: Optional[int] = None,\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n check.sequence_param(asset_keys, "asset_keys", AssetKey)\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.partition,\n db.func.count(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.partition)\n )\n\n assets_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, assets_details, asset_keys)\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n materialization_count_by_partition: Dict[AssetKey, Dict[str, int]] = {\n asset_key: {} for asset_key in asset_keys\n }\n for row in results:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row[0]))\n if asset_key:\n materialization_count_by_partition[asset_key][cast(str, row[1])] = cast(int, row[2])\n\n return materialization_count_by_partition\n\n def _latest_event_ids_by_partition_subquery(\n 
self,\n asset_key: AssetKey,\n event_types: Sequence[DagsterEventType],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ):\n """Subquery for locating the latest event ids by partition for a given asset key and set\n of event types.\n """\n query = db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n ).where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [event_type.value for event_type in event_types]\n ),\n )\n )\n if asset_partitions is not None:\n query = query.where(SqlEventLogStorageTable.c.partition.in_(asset_partitions))\n if before_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n if after_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n latest_event_ids_subquery = query.group_by(\n SqlEventLogStorageTable.c.dagster_event_type, SqlEventLogStorageTable.c.partition\n )\n\n assets_details = self._get_assets_details([asset_key])\n return db_subquery(\n self._add_assets_wipe_filter_to_query(\n latest_event_ids_subquery, assets_details, [asset_key]\n ),\n "latest_event_ids_by_partition_subquery",\n )\n\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n """Fetch the latest materialzation storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_by_partition_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key, [event_type]\n )\n latest_event_ids_by_partition = db_select(\n [\n latest_event_ids_by_partition_subquery.c.partition,\n 
latest_event_ids_by_partition_subquery.c.id,\n ]\n )\n\n with self.index_connection() as conn:\n rows = conn.execute(latest_event_ids_by_partition).fetchall()\n\n latest_materialization_storage_id_by_partition: Dict[str, int] = {}\n for row in rows:\n latest_materialization_storage_id_by_partition[cast(str, row[0])] = cast(int, row[1])\n return latest_materialization_storage_id_by_partition\n\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.inst_param(event_type, "event_type", DagsterEventType)\n check.sequence_param(tag_keys, "tag_keys", of_type=str)\n check.opt_nullable_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.opt_int_param(before_cursor, "before_cursor")\n check.opt_int_param(after_cursor, "after_cursor")\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key=asset_key,\n event_types=[event_type],\n asset_partitions=asset_partitions,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n )\n\n latest_tags_by_partition_query = (\n db_select(\n [\n latest_event_ids_subquery.c.partition,\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n ]\n )\n .select_from(\n latest_event_ids_subquery.join(\n AssetEventTagsTable,\n AssetEventTagsTable.c.event_id == latest_event_ids_subquery.c.id,\n )\n )\n .where(AssetEventTagsTable.c.key.in_(tag_keys))\n )\n\n latest_tags_by_partition: Dict[str, Dict[str, str]] = defaultdict(dict)\n with self.index_connection() as conn:\n rows = conn.execute(latest_tags_by_partition_query).fetchall()\n\n for row in rows:\n latest_tags_by_partition[cast(str, row[0])][cast(str, row[1])] = cast(str, row[2])\n\n # convert defaultdict to dict\n return 
dict(latest_tags_by_partition)\n\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n """Fetch the latest materialzation and materialization planned events for each partition of the given asset.\n Return the partitions that have a materialization planned event but no matching (same run) materialization event.\n These materializations could be in progress, or they could have failed. A separate query checking the run status\n is required to know.\n\n Returns a mapping of partition to [run id, event id].\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key,\n [\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n ],\n )\n\n latest_events_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n SqlEventLogStorageTable.c.run_id,\n SqlEventLogStorageTable.c.id,\n ]\n ).select_from(\n latest_event_ids_subquery.join(\n SqlEventLogStorageTable,\n SqlEventLogStorageTable.c.id == latest_event_ids_subquery.c.id,\n ),\n ),\n "latest_events_subquery",\n )\n\n materialization_planned_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n latest_events_subquery.c.id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value\n )\n\n materialization_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value\n )\n\n with self.index_connection() as conn:\n materialization_planned_rows = db_fetch_mappings(conn, 
materialization_planned_events)\n materialization_rows = db_fetch_mappings(conn, materialization_events)\n\n materialization_planned_rows_by_partition = {\n cast(str, row["partition"]): (cast(str, row["run_id"]), cast(int, row["id"]))\n for row in materialization_planned_rows\n }\n for row in materialization_rows:\n if (\n row["partition"] in materialization_planned_rows_by_partition\n and materialization_planned_rows_by_partition[cast(str, row["partition"])][0]\n == row["run_id"]\n ):\n materialization_planned_rows_by_partition.pop(cast(str, row["partition"]))\n\n return materialization_planned_rows_by_partition\n\n def _check_partitions_table(self) -> None:\n # Guards against cases where the user is not running the latest migration for\n # partitions storage. Should be updated when the partitions storage schema changes.\n if not self.has_table("dynamic_partitions"):\n raise DagsterInvalidInvocationError(\n "Using dynamic partitions definitions requires the dynamic partitions table, which"\n " currently does not exist. 
Add this table by running `dagster"\n " instance migrate`."\n )\n\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a partition definition."""\n self._check_partitions_table()\n columns = [\n DynamicPartitionsTable.c.partitions_def_name,\n DynamicPartitionsTable.c.partition,\n ]\n query = (\n db_select(columns)\n .where(DynamicPartitionsTable.c.partitions_def_name == partitions_def_name)\n .order_by(DynamicPartitionsTable.c.id)\n )\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return [cast(str, row[1]) for row in rows]\n\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n self._check_partitions_table()\n query = (\n db_select([DynamicPartitionsTable.c.partition])\n .where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n self._check_partitions_table()\n with self.index_connection() as conn:\n existing_rows = conn.execute(\n db_select([DynamicPartitionsTable.c.partition]).where(\n db.and_(\n DynamicPartitionsTable.c.partition.in_(partition_keys),\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n )\n )\n ).fetchall()\n existing_keys = set([row[0] for row in existing_rows])\n new_keys = [\n partition_key\n for partition_key in partition_keys\n if partition_key not in existing_keys\n ]\n\n if new_keys:\n conn.execute(\n DynamicPartitionsTable.insert(),\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in new_keys\n ],\n )\n\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n 
self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n DynamicPartitionsTable.delete().where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n )\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return self.has_table(ConcurrencySlotsTable.name)\n\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate a set of concurrency slots.\n\n Args:\n concurrency_key (str): The key to allocate the slots for.\n num (int): The number of slots to allocate.\n """\n if num > MAX_CONCURRENCY_SLOTS:\n raise DagsterInvalidInvocationError(\n f"Cannot have more than {MAX_CONCURRENCY_SLOTS} slots per concurrency key."\n )\n if num < 0:\n raise DagsterInvalidInvocationError("Cannot have a negative number of slots.")\n\n keys_to_assign = None\n with self.index_connection() as conn:\n count_row = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n existing = cast(int, count_row[0]) if count_row else 0\n\n if existing > num:\n # need to delete some slots, favoring ones where the slot is unallocated\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .order_by(\n db_case([(ConcurrencySlotsTable.c.run_id.is_(None), 1)], else_=0).desc(),\n ConcurrencySlotsTable.c.id.desc(),\n )\n .limit(existing - num)\n ).fetchall()\n\n if rows:\n # mark rows as deleted\n conn.execute(\n ConcurrencySlotsTable.update()\n .values(deleted=True)\n .where(ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]))\n )\n\n # actually delete 
rows that are marked as deleted and are not claimed... the rest\n # will be deleted when the slots are released by the free_concurrency_slots\n conn.execute(\n ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n ConcurrencySlotsTable.c.run_id == None, # noqa: E711\n )\n )\n )\n elif num > existing:\n # need to add some slots\n rows = [\n {\n "concurrency_key": concurrency_key,\n "run_id": None,\n "step_key": None,\n "deleted": False,\n }\n for _ in range(existing, num)\n ]\n conn.execute(ConcurrencySlotsTable.insert().values(rows))\n keys_to_assign = [concurrency_key for _ in range(existing, num)]\n\n if keys_to_assign:\n # we've added some slots... if there are any pending steps, we can assign them now or\n # they will be unutilized until free_concurrency_slots is called\n self.assign_pending_steps(keys_to_assign)\n\n def has_unassigned_slots(self, concurrency_key: str) -> bool:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.assigned_timestamp != None, # noqa: E711\n )\n )\n ).fetchone()\n slots = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n pending_count = cast(int, pending_row[0]) if pending_row else 0\n slots_count = cast(int, slots[0]) if slots else 0\n return slots_count > pending_count\n\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select(\n [\n PendingStepsTable.c.assigned_timestamp,\n PendingStepsTable.c.priority,\n PendingStepsTable.c.create_timestamp,\n ]\n ).where(\n 
db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n\n if not pending_row:\n # no pending step pending_row exists, the slot is blocked and the enqueued timestamp is None\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=None,\n assigned_timestamp=None,\n enqueued_timestamp=None,\n )\n\n priority = cast(int, pending_row[1]) if pending_row[1] else None\n assigned_timestamp = cast(datetime, pending_row[0]) if pending_row[0] else None\n create_timestamp = cast(datetime, pending_row[2]) if pending_row[2] else None\n if assigned_timestamp is None:\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=priority,\n assigned_timestamp=None,\n enqueued_timestamp=create_timestamp,\n )\n\n # pending step is assigned, check to see if it's been claimed\n slot_row = conn.execute(\n db_select([db.func.count()]).where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=(\n ConcurrencySlotStatus.CLAIMED\n if slot_row and slot_row[0]\n else ConcurrencySlotStatus.BLOCKED\n ),\n priority=priority,\n assigned_timestamp=assigned_timestamp,\n enqueued_timestamp=create_timestamp,\n )\n\n def can_claim_from_pending(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([PendingStepsTable.c.assigned_timestamp]).where(\n db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n return row and row[0] is not None\n\n def 
has_pending_step(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n return row and cast(int, row[0]) > 0\n\n def assign_pending_steps(self, concurrency_keys: Sequence[str]):\n if not concurrency_keys:\n return\n\n with self.index_connection() as conn:\n for key in concurrency_keys:\n row = conn.execute(\n db_select([PendingStepsTable.c.id])\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == key,\n PendingStepsTable.c.assigned_timestamp == None, # noqa: E711\n )\n )\n .order_by(\n PendingStepsTable.c.priority.desc(),\n PendingStepsTable.c.create_timestamp.asc(),\n )\n .limit(1)\n ).fetchone()\n if row:\n conn.execute(\n PendingStepsTable.update()\n .where(PendingStepsTable.c.id == row[0])\n .values(assigned_timestamp=db.func.now())\n )\n\n def add_pending_step(\n self,\n concurrency_key: str,\n run_id: str,\n step_key: str,\n priority: Optional[int] = None,\n should_assign: bool = False,\n ):\n with self.index_connection() as conn:\n try:\n conn.execute(\n PendingStepsTable.insert().values(\n [\n dict(\n run_id=run_id,\n step_key=step_key,\n concurrency_key=concurrency_key,\n priority=priority or 0,\n assigned_timestamp=db.func.now() if should_assign else None,\n )\n ]\n )\n )\n except db_exc.IntegrityError:\n # do nothing\n pass\n\n def _remove_pending_steps(self, run_id: str, step_key: Optional[str] = None):\n query = PendingStepsTable.delete().where(PendingStepsTable.c.run_id == run_id)\n if step_key:\n query = query.where(PendingStepsTable.c.step_key == step_key)\n with self.index_connection() as conn:\n conn.execute(query)\n\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] 
= None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slot for step.\n\n Args:\n concurrency_keys (str): The concurrency key to claim.\n run_id (str): The run id to claim for.\n step_key (str): The step key to claim for.\n """\n # first, register the step by adding to pending queue\n if not self.has_pending_step(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n ):\n has_unassigned_slots = self.has_unassigned_slots(concurrency_key)\n self.add_pending_step(\n concurrency_key=concurrency_key,\n run_id=run_id,\n step_key=step_key,\n priority=priority,\n should_assign=has_unassigned_slots,\n )\n\n # if the step is not assigned (i.e. has not been popped from queue), block the claim\n claim_status = self.check_concurrency_claim(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n if claim_status.is_claimed or not claim_status.is_assigned:\n return claim_status\n\n # attempt to claim a concurrency slot... this should generally work because we only assign\n # based on the number of unclaimed slots, but this should act as a safeguard, using the slot\n # rows as a semaphore\n slot_status = self._claim_concurrency_slot(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n return claim_status.with_slot_status(slot_status)\n\n def _claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencySlotStatus:\n """Claim a concurrency slot for the step. 
Helper method that is called for steps that are\n popped off the priority queue.\n\n Args:\n concurrency_key (str): The concurrency key to claim.\n run_id (str): The run id to claim a slot for.\n step_key (str): The step key to claim a slot for.\n """\n with self.index_connection() as conn:\n result = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.step_key == None, # noqa: E711\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .with_for_update(skip_locked=True)\n .limit(1)\n ).fetchone()\n if not result or not result[0]:\n return ConcurrencySlotStatus.BLOCKED\n if not conn.execute(\n ConcurrencySlotsTable.update()\n .values(run_id=run_id, step_key=step_key)\n .where(ConcurrencySlotsTable.c.id == result[0])\n ).rowcount:\n return ConcurrencySlotStatus.BLOCKED\n\n return ConcurrencySlotStatus.CLAIMED\n\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n with self.index_connection() as conn:\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.deleted == False) # noqa: E712\n .distinct()\n ).fetchall()\n return {cast(str, row[0]) for row in rows}\n\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get the list of concurrency slots for a given concurrency key.\n\n Args:\n concurrency_key (str): The concurrency key to get the slots for.\n\n Returns:\n List[Tuple[str, int]]: A list of tuples of run_id and the number of slots it is\n occupying for the given concurrency key.\n """\n with self.index_connection() as conn:\n slot_query = (\n db_select(\n [\n ConcurrencySlotsTable.c.run_id,\n ConcurrencySlotsTable.c.deleted,\n db.func.count().label("count"),\n ]\n )\n .select_from(ConcurrencySlotsTable)\n 
.where(ConcurrencySlotsTable.c.concurrency_key == concurrency_key)\n .group_by(ConcurrencySlotsTable.c.run_id, ConcurrencySlotsTable.c.deleted)\n )\n slot_rows = db_fetch_mappings(conn, slot_query)\n pending_query = (\n db_select(\n [\n PendingStepsTable.c.run_id,\n db_case(\n [(PendingStepsTable.c.assigned_timestamp.is_(None), False)],\n else_=True,\n ).label("is_assigned"),\n db.func.count().label("count"),\n ]\n )\n .select_from(PendingStepsTable)\n .where(PendingStepsTable.c.concurrency_key == concurrency_key)\n .group_by(PendingStepsTable.c.run_id, "is_assigned")\n )\n pending_rows = db_fetch_mappings(conn, pending_query)\n\n return ConcurrencyKeyInfo(\n concurrency_key=concurrency_key,\n slot_count=sum(\n [\n cast(int, slot_row["count"])\n for slot_row in slot_rows\n if not slot_row["deleted"]\n ]\n ),\n active_slot_count=sum(\n [cast(int, slot_row["count"]) for slot_row in slot_rows if slot_row["run_id"]]\n ),\n active_run_ids={\n cast(str, slot_row["run_id"]) for slot_row in slot_rows if slot_row["run_id"]\n },\n pending_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if not row["is_assigned"]]\n ),\n pending_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if not row["is_assigned"]\n },\n assigned_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if row["is_assigned"]]\n ),\n assigned_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if row["is_assigned"]\n },\n )\n\n def get_concurrency_run_ids(self) -> Set[str]:\n with self.index_connection() as conn:\n rows = conn.execute(db_select([PendingStepsTable.c.run_id]).distinct()).fetchall()\n return set([cast(str, row[0]) for row in rows])\n\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id)\n self._remove_pending_steps(run_id=run_id)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n 
self.assign_pending_steps(freed_concurrency_keys)\n\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id, step_key=step_key)\n self._remove_pending_steps(run_id=run_id, step_key=step_key)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n self.assign_pending_steps(freed_concurrency_keys)\n\n def _free_concurrency_slots(self, run_id: str, step_key: Optional[str] = None) -> Sequence[str]:\n """Frees concurrency slots for a given run/step.\n\n Args:\n run_id (str): The run id to free the slots for.\n step_key (Optional[str]): The step key to free the slots for. If not provided, all the\n slots for all the steps of the run will be freed.\n """\n with self.index_connection() as conn:\n # first delete any rows that apply and are marked as deleted. This happens when the\n # configured number of slots has been reduced, and some of the pruned slots included\n # ones that were already allocated to the run/step\n delete_query = ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n )\n )\n if step_key:\n delete_query = delete_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n conn.execute(delete_query)\n\n # next, fetch the slots to free up, while grabbing the concurrency keys so that we can\n # allocate any pending steps from the queue for the freed slots, if necessary\n select_query = (\n db_select([ConcurrencySlotsTable.c.id, ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.run_id == run_id)\n .with_for_update(skip_locked=True)\n )\n if step_key:\n select_query = select_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n rows = conn.execute(select_query).fetchall()\n if not rows:\n return []\n\n # now, actually free the slots\n conn.execute(\n 
ConcurrencySlotsTable.update()\n .values(run_id=None, step_key=None)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]),\n )\n )\n )\n\n # return the concurrency keys for the freed slots\n return [cast(str, row[1]) for row in rows]\n\n def store_asset_check_event(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.opt_int_param(event_id, "event_id")\n\n check.invariant(\n self.supports_asset_checks,\n "Asset checks require a database schema migration. Run `dagster instance migrate`.",\n )\n\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n self._store_asset_check_evaluation_planned(event, event_id)\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n if event.run_id == "" or event.run_id is None:\n self._store_runless_asset_check_evaluation(event, event_id)\n else:\n self._update_asset_check_evaluation(event, event_id)\n\n def _store_asset_check_evaluation_planned(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n planned = cast(\n AssetCheckEvaluationPlanned, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=planned.asset_key.to_string(),\n check_name=planned.check_name,\n run_id=event.run_id,\n execution_status=AssetCheckExecutionRecordStatus.PLANNED.value,\n )\n )\n\n def _store_runless_asset_check_evaluation(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=evaluation.asset_key.to_string(),\n check_name=evaluation.check_name,\n run_id=event.run_id,\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if 
evaluation.success\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n )\n\n def _update_asset_check_evaluation(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n rows_updated = conn.execute(\n AssetCheckExecutionsTable.update()\n .where(\n # (asset_key, check_name, run_id) uniquely identifies the row created for the planned event\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == evaluation.asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == evaluation.check_name,\n AssetCheckExecutionsTable.c.run_id == event.run_id,\n )\n )\n .values(\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if evaluation.success\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n ).rowcount\n if rows_updated != 1:\n raise DagsterInvariantViolationError(\n "Expected to update one row for asset check evaluation, but updated"\n f" {rows_updated}."\n )\n\n def get_asset_check_executions(\n self,\n asset_key: AssetKey,\n check_name: str,\n limit: int,\n cursor: Optional[int] = None,\n materialization_event_storage_id: Optional[int] = None,\n include_planned: bool = True,\n ) -> Sequence[AssetCheckExecutionRecord]:\n query = (\n db_select(\n [\n 
AssetCheckExecutionsTable.c.id,\n AssetCheckExecutionsTable.c.run_id,\n AssetCheckExecutionsTable.c.execution_status,\n AssetCheckExecutionsTable.c.evaluation_event,\n AssetCheckExecutionsTable.c.create_timestamp,\n ]\n )\n .where(\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == check_name,\n )\n )\n .order_by(AssetCheckExecutionsTable.c.id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetCheckExecutionsTable.c.id < cursor)\n if not include_planned:\n query = query.where(\n AssetCheckExecutionsTable.c.execution_status\n != AssetCheckExecutionRecordStatus.PLANNED.value\n )\n if materialization_event_storage_id:\n if include_planned:\n # rows in PLANNED status are not associated with a materialization event yet\n query = query.where(\n db.or_(\n AssetCheckExecutionsTable.c.materialization_event_storage_id\n == materialization_event_storage_id,\n AssetCheckExecutionsTable.c.execution_status\n == AssetCheckExecutionRecordStatus.PLANNED.value,\n )\n )\n else:\n query = query.where(\n AssetCheckExecutionsTable.c.materialization_event_storage_id\n == materialization_event_storage_id\n )\n\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return [\n AssetCheckExecutionRecord(\n id=cast(int, row[0]),\n run_id=cast(str, row[1]),\n status=AssetCheckExecutionRecordStatus(row[2]),\n evaluation_event=(\n deserialize_value(cast(str, row[3]), EventLogEntry) if row[3] else None\n ),\n create_timestamp=datetime_as_float(cast(datetime, row[4])),\n )\n for row in rows\n ]\n\n @property\n def supports_asset_checks(self):\n return self.has_table(AssetCheckExecutionsTable.name)
\n\n\ndef _get_from_row(row: SqlAlchemyRow, column: str) -> object:\n """Utility function for extracting a column from a sqlalchemy row proxy, since '_asdict' is not\n supported in sqlalchemy 1.3.\n """\n if column not in row.keys():\n return None\n return row[column]\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sql_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sql_event_log"}, "sqlite": {"consolidated_sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log

\nimport logging\nimport os\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nfrom dagster._config import StringSource\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata\nfrom ..sql_event_log import SqlDbConnection, SqlEventLogStorage\n\nSQLITE_EVENT_LOG_FILENAME = "event_log"\n\n\n
[docs]class ConsolidatedSqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed consolidated event log storage intended for test cases only.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\n the following to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.event_log\n class: ConsolidatedSqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the database.\n """\n\n def __init__(self, base_dir, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = check.str_param(base_dir, "base_dir")\n self._conn_string = create_db_conn_string(base_dir, SQLITE_EVENT_LOG_FILENAME)\n self._secondary_index_cache = {}\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._watchers = defaultdict(dict)\n self._obs = None\n\n if not os.path.exists(self.get_db_path()):\n self._init_db()\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return ConsolidatedSqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def _init_db(self):\n mkdir_p(self._base_dir)\n engine = create_engine(self._conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n if should_mark_indexes:\n # mark all secondary indexes\n self.reindex_events()\n self.reindex_assets()\n\n @contextmanager\n def _connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def run_connection(self, run_id: Optional[str]) -> SqlDbConnection:\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n return bool(engine.dialect.has_table(engine.connect(), table_name))\n\n def get_db_path(self):\n return os.path.join(self._base_dir, f"{SQLITE_EVENT_LOG_FILENAME}.db")\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n ConsolidatedSqliteEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(ConsolidatedSqliteEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, cursor, callback):\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n self._obs.schedule(\n ConsolidatedSqliteEventLogStorageWatchdog(self), self._base_dir, True\n )\n\n self._watchers[run_id][callback] = cursor\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False\n\n def on_modified(self):\n keys = [\n (run_id, callback)\n for 
run_id, callback_dict in self._watchers.items()\n for callback, _ in callback_dict.items()\n ]\n for run_id, callback in keys:\n cursor = self._watchers[run_id][callback]\n\n # fetch events\n connection = self.get_records_for_run(run_id, cursor)\n\n # update cursor\n if connection.cursor:\n self._watchers[run_id][callback] = connection.cursor\n\n for record in connection.records:\n status = None\n try:\n status = callback(\n record.event_log_entry,\n str(EventLogCursor.from_storage_id(record.storage_id)),\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self.end_watch(run_id, callback)\n\n def end_watch(self, run_id, handler):\n if run_id in self._watchers and handler in self._watchers[run_id]:\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)
\n\n\nclass ConsolidatedSqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", ConsolidatedSqliteEventLogStorage\n )\n self._log_path = event_log_storage.get_db_path()\n super(ConsolidatedSqliteEventLogStorageWatchdog, self).__init__(\n patterns=[self._log_path], **kwargs\n )\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._event_log_storage.on_modified()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/consolidated_sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log"}, "sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.sqlite_event_log

\nimport contextlib\nimport glob\nimport logging\nimport os\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, ContextManager, Iterable, Iterator, Optional, Sequence\n\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection, Engine\nfrom sqlalchemy.pool import NullPool\nfrom tqdm import tqdm\nfrom watchdog.events import FileSystemEvent, PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._config import StringSource\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.dagster_run import DagsterRunStatus, RunsFilter\nfrom dagster._core.storage.event_log.base import EventLogCursor, EventLogRecord, EventRecordsFilter\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import (\n    ConfigurableClass,\n    ConfigurableClassData,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata, SqlEventLogStorageTable\nfrom ..sql_event_log import RunShardedEventsCursor, SqlEventLogStorage\n\nif TYPE_CHECKING:\n    from dagster._core.storage.sqlite_storage import 
SqliteStorageConfig\nINDEX_SHARD_NAME = "index"\n\n\n
[docs]class SqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file insqliteve\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default event log storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for event log storage, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n event_log_storage:\n module: dagster._core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the databases. To\n improve concurrent performance, event logs are stored in a separate SQLite database for each\n run.\n """\n\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n """Note that idempotent initialization of the SQLite database is done on a per-run_id\n basis in the body of connect, since each run is stored in a separate database.\n """\n self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))\n mkdir_p(self._base_dir)\n\n self._obs = None\n\n self._watchers = defaultdict(dict)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n # Used to ensure that each run ID attempts to initialize its DB the first time it connects,\n # ensuring that the database will be created if it doesn't exist\n self._initialized_dbs = set()\n\n # Ensure that multiple threads (like the event log watcher) interact safely with each other\n self._db_lock = threading.Lock()\n\n if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = 
create_engine(conn_string, poolclass=NullPool)\n self._initdb(engine)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def upgrade(self) -> None:\n all_run_ids = self.get_all_run_ids()\n print(f"Updating event log storage for {len(all_run_ids)} runs on disk...") # noqa: T201\n alembic_config = get_alembic_config(__file__)\n if all_run_ids:\n for run_id in tqdm(all_run_ids):\n with self.run_connection(run_id) as conn:\n run_alembic_upgrade(alembic_config, conn, run_id)\n\n print("Updating event log storage for index db on disk...") # noqa: T201\n with self.index_connection() as conn:\n run_alembic_upgrade(alembic_config, conn, "index")\n\n self._initialized_dbs = set()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteEventLogStorage":\n return SqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def get_all_run_ids(self) -> Sequence[str]:\n all_filenames = glob.glob(os.path.join(self._base_dir, "*.db"))\n return [\n os.path.splitext(os.path.basename(filename))[0]\n for filename in all_filenames\n if os.path.splitext(os.path.basename(filename))[0] != INDEX_SHARD_NAME\n ]\n\n def has_table(self, table_name: str) -> bool:\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = create_engine(conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n return bool(engine.dialect.has_table(conn, table_name))\n\n def path_for_shard(self, run_id: str) -> str:\n return os.path.join(self._base_dir, f"{run_id}.db")\n\n def conn_string_for_shard(self, shard_name: str) -> str:\n check.str_param(shard_name, "shard_name")\n return create_db_conn_string(self._base_dir, shard_name)\n\n def _initdb(self, engine: Engine) -> None:\n 
alembic_config = get_alembic_config(__file__)\n\n retry_limit = 10\n\n while True:\n try:\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n\n break\n except (db_exc.DatabaseError, sqlite3.DatabaseError, sqlite3.OperationalError) as exc:\n # This is SQLite-specific handling for concurrency issues that can arise when\n # multiple processes (e.g. the dagster-webserver process and user code process) contend with\n # each other to init the db. When we hit the following errors, we know that another\n # process is on the case and we should retry.\n err_msg = str(exc)\n\n if not (\n re.search(r"table [A-Za-z_]* already exists", err_msg)\n or "database is locked" in err_msg\n or "UNIQUE constraint failed: alembic_version.version_num" in err_msg\n ):\n raise\n\n if retry_limit == 0:\n raise\n else:\n logging.info(\n "SqliteEventLogStorage._initdb: Encountered apparent concurrent init, "\n "retrying (%s retries left). 
Exception: %s",\n retry_limit,\n err_msg,\n )\n time.sleep(0.2)\n retry_limit -= 1\n\n @contextmanager\n def _connect(self, shard: str) -> Iterator[Connection]:\n with self._db_lock:\n check.str_param(shard, "shard")\n\n conn_string = self.conn_string_for_shard(shard)\n engine = create_engine(conn_string, poolclass=NullPool)\n\n if shard not in self._initialized_dbs:\n self._initdb(engine)\n self._initialized_dbs.add(shard)\n\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n engine.dispose()\n\n def run_connection(self, run_id: Optional[str] = None) -> Any:\n return self._connect(run_id) # type: ignore # bad sig\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect(INDEX_SHARD_NAME)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Overridden method to replicate asset events in a central assets.db sqlite shard, enabling\n cross-run asset queries.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n conn.execute(insert_event_statement)\n\n if event.is_dagster_event and event.dagster_event.asset_key: # type: ignore\n check.invariant(\n event.dagster_event_type in ASSET_EVENTS,\n "Can only store asset materializations, materialization_planned, and"\n " observations in index database",\n )\n\n event_id = None\n\n # mirror the event in the cross-run index database\n with self.index_connection() as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, 
None)\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Overridden method to enable cross-run event queries in sqlite.\n\n The record id in sqlite does not auto increment cross runs, so instead of fetching events\n after record id, we only fetch events whose runs updated after update_timestamp.\n """\n check.opt_inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n is_asset_query = event_records_filter and event_records_filter.event_type in ASSET_EVENTS\n if is_asset_query:\n # asset materializations, observations and materialization planned events\n # get mirrored into the index shard, so no custom run shard-aware cursor logic needed\n return super(SqliteEventLogStorage, self).get_event_records(\n event_records_filter=event_records_filter, limit=limit, ascending=ascending\n )\n\n query = db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if event_records_filter.after_cursor is not None and not isinstance(\n event_records_filter.after_cursor, RunShardedEventsCursor\n ):\n raise Exception("""\n Called `get_event_records` on a run-sharded event log storage with a cursor that\n is not run-aware. Add a RunShardedEventsCursor to your query filter\n or switch your instance configuration to use a non-run-sharded event log storage\n (e.g. 
PostgresEventLogStorage, ConsolidatedSqliteEventLogStorage)\n """)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n apply_cursor_filters=False, # run-sharded cursor filters don't really make sense\n )\n if limit:\n query = query.limit(limit)\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())\n\n # workaround for the run-shard sqlite to enable cross-run queries: get a list of run_ids\n # whose events may qualify the query, and then open run_connection per run_id at a time.\n run_updated_after = (\n event_records_filter.after_cursor.run_updated_after\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else None\n )\n run_records = self._instance.get_run_records(\n filters=RunsFilter(updated_after=run_updated_after),\n order_by="update_timestamp",\n ascending=ascending,\n )\n\n event_records = []\n for run_record in run_records:\n run_id = run_record.dagster_run.run_id\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, EventLogEntry)\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n if limit and len(event_records) >= limit:\n break\n except DeserializationError:\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n if limit and len(event_records) >= limit:\n break\n\n return event_records[:limit]\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def delete_events(self, run_id: str) -> None:\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n # delete the mirrored event in the 
cross-run index database\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n\n def wipe(self) -> None:\n # should delete all the run-sharded db files and drop the contents of the index\n for filename in (\n glob.glob(os.path.join(self._base_dir, "*.db"))\n + glob.glob(os.path.join(self._base_dir, "*.db-wal"))\n + glob.glob(os.path.join(self._base_dir, "*.db-shm"))\n ):\n if (\n not filename.endswith(f"{INDEX_SHARD_NAME}.db")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-wal")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-shm")\n ):\n with contextlib.suppress(FileNotFoundError):\n os.unlink(filename)\n\n self._initialized_dbs = set()\n self._wipe_index()\n\n def _delete_mirrored_events_for_asset_key(self, asset_key: AssetKey) -> None:\n with self.index_connection() as conn:\n conn.execute(\n SqlEventLogStorageTable.delete().where(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n # default implementation will update the event_logs in the sharded dbs, and the asset_key\n # table in the asset shard, but will not remove the mirrored event_log events in the asset\n # shard\n super(SqliteEventLogStorage, self).wipe_asset(asset_key)\n self._delete_mirrored_events_for_asset_key(asset_key)\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n\n watchdog = SqliteEventLogStorageWatchdog(self, run_id, callback, cursor)\n self._watchers[run_id][callback] = (\n watchdog,\n self._obs.schedule(watchdog, self._base_dir, True),\n )\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n if handler in self._watchers[run_id]:\n event_handler, watch = self._watchers[run_id][handler]\n self._obs.remove_handler_for_watch(event_handler, watch) # type: ignore # (possible none)\n del self._watchers[run_id][handler]\n\n def dispose(self) -> None:\n if 
self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.index_connection() as conn:\n return check_alembic_revision(alembic_config, conn)\n\n @property\n def is_run_sharded(self) -> bool:\n return True\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False
\n\n\nclass SqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(\n self,\n event_log_storage: SqliteEventLogStorage,\n run_id: str,\n callback: EventHandlerFn,\n cursor: Optional[str],\n **kwargs: Any,\n ):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", SqliteEventLogStorage\n )\n self._run_id = check.str_param(run_id, "run_id")\n self._cb = check.callable_param(callback, "callback")\n self._log_path = event_log_storage.path_for_shard(run_id)\n self._cursor = cursor\n super(SqliteEventLogStorageWatchdog, self).__init__(patterns=[self._log_path], **kwargs)\n\n def _process_log(self) -> None:\n connection = self._event_log_storage.get_records_for_run(self._run_id, self._cursor)\n if connection.cursor:\n self._cursor = connection.cursor\n for record in connection.records:\n status = None\n try:\n status = self._cb(\n record.event_log_entry, str(EventLogCursor.from_storage_id(record.storage_id))\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", self._run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self._event_log_storage.end_watch(self._run_id, self._cb)\n\n def on_modified(self, event: FileSystemEvent) -> None:\n check.invariant(event.src_path == self._log_path)\n self._process_log()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.sqlite_event_log"}}}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.file_manager

\nimport io\nimport os\nimport shutil\nimport uuid\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import BinaryIO, ContextManager, Iterator, Optional, TextIO, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource, resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils import mkdir_p\n\nfrom .temp_file_manager import TempfileManager\n\nIOStream: TypeAlias = Union[TextIO, BinaryIO]\n\n\n
[docs]class FileHandle(ABC):\n """A reference to a file as manipulated by a FileManager.\n\n Subclasses may handle files that are resident on the local file system, in an object store, or\n in any arbitrary place where a file can be stored.\n\n This exists to handle the very common case where you wish to write a computation that reads,\n transforms, and writes files, but where you also want the same code to work in local development\n as well as on a cluster where the files will be stored in a globally available object store\n such as S3.\n """\n\n @public\n @property\n @abstractmethod\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n raise NotImplementedError()
\n\n\n
[docs]class LocalFileHandle(FileHandle):\n """A reference to a file on a local filesystem."""\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @public\n @property\n def path(self) -> str:\n """The file's path."""\n return self._path\n\n @public\n @property\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n return self._path
\n\n\n
[docs]class FileManager(ABC):\n """Base class for all file managers in dagster.\n\n The file manager is an interface that can be implemented by resources to provide abstract\n access to a file system such as local disk, S3, or other cloud storage.\n\n For examples of usage, see the documentation of the concrete file manager implementations.\n """\n\n
[docs] @public\n @abstractmethod\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n """Copy a file represented by a file handle to a temp file.\n\n In an implementation built around an object store such as S3, this method would be expected\n to download the file from S3 to local filesystem in a location assigned by the standard\n library's :py:mod:`python:tempfile` module.\n\n Temp files returned by this method are *not* guaranteed to be reusable across solid\n boundaries. For files that must be available across solid boundaries, use the\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read_data`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write`, and\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write_data` methods.\n\n Args:\n file_handle (FileHandle): The handle to the file to make available as a local temp file.\n\n Returns:\n str: Path to the local temp file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def delete_local_temp(self) -> None:\n """Delete all local temporary files created by previous calls to\n :py:meth:`~dagster._core.storage.file_manager.FileManager.copy_handle_to_local_temp`.\n\n Should typically only be called by framework implementors.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read(self, file_handle: FileHandle, mode: str = "rb") -> ContextManager[IOStream]:\n """Return a file-like stream for the file handle.\n\n This may incur an expensive network call for file managers backed by object stores\n such as S3.\n\n Args:\n file_handle (FileHandle): The file handle to make available as a stream.\n mode (str): The mode in which to open the file. Default: ``"rb"``.\n\n Returns:\n Union[TextIO, BinaryIO]: A file-like stream.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read_data(self, file_handle: FileHandle) -> bytes:\n """Return the bytes for a given file handle. This may incur an expensive network\n call for file managers backed by object stores such as s3.\n\n Args:\n file_handle (FileHandle): The file handle for which to return bytes.\n\n Returns:\n bytes: Bytes for a given file handle.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write(self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None) -> FileHandle:\n """Write the bytes contained within the given file object into the file manager.\n\n Args:\n file_obj (Union[TextIO, StringIO]): A file-like object.\n mode (Optional[str]): The mode in which to write the file into the file manager.\n Default: ``"wb"``.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write_data(self, data: bytes, ext: Optional[str] = None) -> FileHandle:\n """Write raw bytes into the file manager.\n\n Args:\n data (bytes): The bytes to write into the file manager.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef local_file_manager(init_context: InitResourceContext) -> "LocalFileManager":\n """FileManager that provides abstract access to a local filesystem.\n\n By default, files will be stored in `<local_artifact_storage>/storage/file_manager` where\n `<local_artifact_storage>` can be configured the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n\n Examples:\n .. code-block:: python\n\n import tempfile\n\n from dagster import job, local_file_manager, op\n\n\n @op(required_resource_keys={"file_manager"})\n def write_files(context):\n fh_1 = context.resources.file_manager.write_data(b"foo")\n\n with tempfile.NamedTemporaryFile("w+") as fd:\n fd.write("bar")\n fd.seek(0)\n fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n return (fh_1, fh_2)\n\n\n @op(required_resource_keys={"file_manager"})\n def read_files(context, file_handles):\n fh_1, fh_2 = file_handles\n assert context.resources.file_manager.read_data(fh_2) == b"bar"\n fd = context.resources.file_manager.read(fh_2, mode="r")\n assert fd.read() == "foo"\n fd.close()\n\n\n @job(resource_defs={"file_manager": local_file_manager})\n def files_pipeline():\n read_files(write_files())\n\n Or to specify the file directory:\n\n .. code-block:: python\n\n @job(\n resource_defs={\n "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n }\n )\n def files_pipeline():\n read_files(write_files())\n """\n return LocalFileManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "file_manager") # type: ignore # (possible none)\n )\n )
\n\n\ndef check_file_like_obj(obj: object) -> None:\n check.invariant(obj and hasattr(obj, "read") and hasattr(obj, "write"))\n\n\nclass LocalFileManager(FileManager):\n def __init__(self, base_dir: str):\n self.base_dir = base_dir\n self._base_dir_ensured = False\n self._temp_file_manager = TempfileManager()\n\n @staticmethod\n def for_instance(instance: DagsterInstance, run_id: str) -> "LocalFileManager":\n check.inst_param(instance, "instance", DagsterInstance)\n return LocalFileManager(instance.file_manager_directory(run_id))\n\n def ensure_base_dir_exists(self) -> None:\n if self._base_dir_ensured:\n return\n\n mkdir_p(self.base_dir)\n\n self._base_dir_ensured = True\n\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n check.inst_param(file_handle, "file_handle", FileHandle)\n with self.read(file_handle, "rb") as handle_obj: # type: ignore # (??)\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_file_obj.write(handle_obj.read())\n temp_name = temp_file_obj.name\n temp_file_obj.close()\n return temp_name\n\n @contextmanager\n def read(self, file_handle: LocalFileHandle, mode: str = "rb") -> Iterator[IOStream]:\n check.inst_param(file_handle, "file_handle", LocalFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n encoding = None if mode == "rb" else "utf8"\n with open(file_handle.path, mode, encoding=encoding) as file_obj:\n yield file_obj # type: ignore # (??)\n\n def read_data(self, file_handle: LocalFileHandle) -> bytes:\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read() # type: ignore # (??)\n\n def write_data(self, data: bytes, ext: Optional[str] = None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(\n self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None\n ) -> LocalFileHandle:\n check_file_like_obj(file_obj)\n check.opt_str_param(ext, "ext")\n\n 
self.ensure_base_dir_exists()\n\n dest_file_path = os.path.join(\n self.base_dir, str(uuid.uuid4()) + (("." + ext) if ext is not None else "")\n )\n\n encoding = None if "b" in mode else "utf8"\n with open(dest_file_path, mode, encoding=encoding) as dest_file_obj:\n shutil.copyfileobj(file_obj, dest_file_obj) # type: ignore # (??)\n return LocalFileHandle(dest_file_path)\n\n def delete_local_temp(self) -> None:\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster/_core/storage/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.file_manager"}, "fs_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.fs_io_manager

\nimport os\nimport pickle\nfrom typing import TYPE_CHECKING, Any, Optional\n\nfrom pydantic import Field\n\nimport dagster._check as check\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Field as DagsterField,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import StringSource\nfrom dagster._config.pythonic_config import ConfigurableIOManagerFactory\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\nif TYPE_CHECKING:\n    from typing_extensions import Literal\n    from upath import UPath\n\n\n
[docs]class FilesystemIOManager(ConfigurableIOManagerFactory["PickledObjectFilesystemIOManager"]):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, FilesystemIOManager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n },\n )\n\n\n 2. Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. 
code-block:: python\n\n from dagster import FilesystemIOManager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import FilesystemIOManager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": FilesystemIOManager()})\n def job():\n op_b(op_a())\n\n """\n\n base_dir: Optional[str] = Field(default=None, description="Base directory for storing files.")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n base_dir = self.base_dir or check.not_none(context.instance).storage_directory()\n return PickledObjectFilesystemIOManager(base_dir=base_dir)
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=FilesystemIOManager.to_config_schema(),\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n)\ndef fs_io_manager(init_context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, fs_io_manager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n },\n )\n\n\n 2. 
Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": fs_io_manager})\n def job():\n op_b(op_a())\n\n """\n return FilesystemIOManager.from_resource_context(init_context)
\n\n\nclass PickledObjectFilesystemIOManager(UPathIOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n Is compatible with local and remote filesystems via `universal-pathlib` and `fsspec`.\n Learn more about how to use remote filesystems here: https://github.com/fsspec/universal_pathlib.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n **kwargs: additional keyword arguments for `universal_pathlib.UPath`.\n """\n\n extension: str = "" # TODO: maybe change this to .pickle? Leaving blank for compatibility.\n\n def __init__(self, base_dir=None, **kwargs):\n from upath import UPath\n\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n\n super().__init__(base_path=UPath(base_dir, **kwargs))\n\n def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):\n try:\n with path.open("wb") as file:\n pickle.dump(obj, file, PICKLE_PROTOCOL)\n except (AttributeError, RecursionError, ImportError, pickle.PicklingError) as e:\n executor = context.step_context.job_def.executor_def\n\n if isinstance(e, RecursionError):\n # if obj can't be pickled because of RecursionError then __str__() will also\n # throw a RecursionError\n obj_repr = f"{obj.__class__} exceeds recursion limit and"\n else:\n obj_repr = obj.__str__()\n\n raise DagsterInvariantViolationError(\n f"Object {obj_repr} is not picklable. You are currently using the "\n f"fs_io_manager and the {executor.name}. You will need to use a different "\n "io manager to continue using this output. 
For example, you can use the "\n "mem_io_manager with the in_process_executor.\\n"\n "For more information on io managers, visit "\n "https://docs.dagster.io/concepts/io-management/io-managers \\n"\n "For more information on executors, vist "\n "https://docs.dagster.io/deployment/executors#overview"\n ) from e\n\n def load_from_path(self, context: InputContext, path: "UPath") -> Any:\n with path.open("rb") as file:\n return pickle.load(file)\n\n\nclass CustomPathPickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO managerthat stores and retrieves values using pickling and\n allow users to specify file path for outputs.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir: Optional[str] = None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode: Literal["wb"] = "wb"\n self.read_mode: Literal["rb"] = "rb"\n\n def _get_path(self, path: str) -> str:\n return os.path.join(self.base_dir, path) # type: ignore # (possible none)\n\n def handle_output(self, context: OutputContext, obj: object):\n """Pickle the data and store the object to a custom file path.\n\n This method emits an AssetMaterialization event so the assets will be tracked by the\n Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n metadata = context.metadata\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n\n filepath = self._get_path(path)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n context.log.debug(f"Writing file at: {filepath}")\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n return AssetMaterialization(\n asset_key=AssetKey([context.job_name, context.step_key, context.name]),\n metadata={"path": MetadataValue.path(os.path.abspath(filepath))},\n )\n\n def load_input(self, context: 
InputContext) -> object:\n """Unpickle the file from a given file path and Load it to a data object."""\n check.inst_param(context, "context", InputContext)\n metadata = context.upstream_output.metadata # type: ignore # (possible none)\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n filepath = self._get_path(path)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": DagsterField(StringSource, is_required=True)})\n@experimental\ndef custom_path_fs_io_manager(\n init_context: InitResourceContext,\n) -> CustomPathPickledObjectFilesystemIOManager:\n """Built-in IO manager that allows users to custom output file path per output definition.\n\n It requires users to specify a base directory where all the step output will be stored in. It\n serializes and deserializes output values (assets) using pickling and stores the pickled object\n in the user-provided file paths.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import custom_path_fs_io_manager, job, op\n\n @op(out=Out(metadata={"path": "path/to/sample_output"}))\n def sample_data(df):\n return df[:5]\n\n my_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n {"base_dir": "path/to/basedir"}\n )\n\n @job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\n def my_job():\n sample_data()\n\n """\n return CustomPathPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get("base_dir")\n )\n
", "current_page_name": "_modules/dagster/_core/storage/fs_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.fs_io_manager"}, "input_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.input_manager

\nfrom abc import ABC, abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Callable, Optional, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import has_at_least_one_parameter\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition, ResourceFunction\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nInputLoadFn: TypeAlias = Union[\n    Callable[["InputContext"], object],\n    Callable[[], object],\n]\n\n\n
[docs]class InputManager(ABC):\n """Base interface for classes that are responsible for loading solid inputs."""\n\n @abstractmethod\n def load_input(self, context: "InputContext") -> object:\n """The user-defined read method that loads an input to a solid.\n\n Args:\n context (InputContext): The input context.\n\n Returns:\n Any: The data object.\n """
\n\n\nclass IInputManagerDefinition:\n @property\n @abstractmethod\n def input_config_schema(self) -> IDefinitionConfigSchema:\n """The schema for per-input configuration for inputs that are managed by this\n input manager.\n """\n\n\n
[docs]class InputManagerDefinition(ResourceDefinition, IInputManagerDefinition):\n """Definition of an input manager resource.\n\n Input managers load op inputs.\n\n An InputManagerDefinition is a :py:class:`ResourceDefinition` whose resource_fn returns an\n :py:class:`InputManager`.\n\n The easiest way to create an InputManagerDefinition is with the\n :py:func:`@input_manager <input_manager>` decorator.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n super(InputManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "InputManagerDefinition":\n return InputManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n )
\n\n\n@overload\ndef input_manager(\n config_schema: InputLoadFn,\n) -> InputManagerDefinition: ...\n\n\n@overload\ndef input_manager(\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[InputLoadFn], InputManagerDefinition]: ...\n\n\n
[docs]def input_manager(\n config_schema: Union[InputLoadFn, Optional[CoercableToConfigSchema]] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[InputManagerDefinition, Callable[[InputLoadFn], InputManagerDefinition]]:\n """Define an input manager.\n\n Input managers load op inputs, either from upstream outputs or by providing default values.\n\n The decorated function should accept a :py:class:`InputContext` and resource config, and return\n a loaded object that will be passed into one of the inputs of an op.\n\n The decorator produces an :py:class:`InputManagerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource-level config. If not\n set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the resource.\n input_config_schema (Optional[ConfigSchema]): A schema for the input-level config. Each\n input that uses this input manager can be configured separately using this config.\n If not set, Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the input\n manager.\n version (Optional[str]): (Experimental) the version of the input manager definition.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import input_manager, op, job, In\n\n @input_manager\n def csv_loader(_):\n return read_csv("some/path")\n\n @op(ins={"input1": In(input_manager_key="csv_loader_key")})\n def my_op(_, input1):\n do_stuff(input1)\n\n @job(resource_defs={"csv_loader_key": csv_loader})\n def my_job():\n my_op()\n\n @input_manager(config_schema={"base_dir": str})\n def csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n @input_manager(input_config_schema={"path": str})\n def csv_loader(context):\n return read_csv(context.config["path"])\n """\n if _is_input_load_fn(config_schema):\n return _InputManagerDecoratorCallable()(config_schema)\n\n def _wrap(load_fn: InputLoadFn) -> InputManagerDefinition:\n return _InputManagerDecoratorCallable(\n config_schema=cast(CoercableToConfigSchema, config_schema),\n description=description,\n version=version,\n input_config_schema=input_config_schema,\n required_resource_keys=required_resource_keys,\n )(load_fn)\n\n return _wrap
\n\n\ndef _is_input_load_fn(obj: Union[InputLoadFn, CoercableToConfigSchema]) -> TypeGuard[InputLoadFn]:\n return callable(obj) and not is_callable_valid_config_arg(obj)\n\n\nclass InputManagerWrapper(InputManager):\n def __init__(self, load_fn: InputLoadFn):\n self._load_fn = load_fn\n\n def load_input(self, context: "InputContext") -> object:\n # the @input_manager decorated function (self._load_fn) may return a direct value that\n # should be used or an instance of an InputManager. So we call self._load_fn and see if the\n # result is an InputManager. If so we call it's load_input method\n intermediate = (\n # type-ignore because function being used as attribute\n self._load_fn(context)\n if has_at_least_one_parameter(self._load_fn)\n else self._load_fn() # type: ignore # (strict type guard)\n )\n\n if isinstance(intermediate, InputManager):\n return intermediate.load_input(context)\n return intermediate\n\n\nclass _InputManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n ):\n self.config_schema = config_schema\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.input_config_schema = input_config_schema\n self.required_resource_keys = required_resource_keys\n\n def __call__(self, load_fn: InputLoadFn) -> InputManagerDefinition:\n check.callable_param(load_fn, "load_fn")\n\n def _resource_fn(_):\n return InputManagerWrapper(load_fn)\n\n input_manager_def = InputManagerDefinition(\n resource_fn=_resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n input_config_schema=self.input_config_schema,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently 
handle a Union of Callables correctly\n update_wrapper(input_manager_def, wrapped=load_fn) # type: ignore\n\n return input_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/input_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.input_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.io_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Optional, Set, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.storage.input_manager import IInputManagerDefinition, InputManager\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition, OutputManager\n\nfrom ..decorator_utils import get_function_params\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.init import InitResourceContext\n    from dagster._core.execution.context.input import InputContext\n    from dagster._core.execution.context.output import OutputContext\n\nIOManagerFunctionWithContext = Callable[["InitResourceContext"], "IOManager"]\nIOManagerFunction: TypeAlias = Union[\n    IOManagerFunctionWithContext,\n    Callable[[], "IOManager"],\n]\n\n\ndef is_io_manager_context_provided(\n    fn: IOManagerFunction,\n) -> TypeGuard[IOManagerFunctionWithContext]:\n    return len(get_function_params(fn)) >= 1\n\n\n
[docs]class IOManagerDefinition(ResourceDefinition, IInputManagerDefinition, IOutputManagerDefinition):\n """Definition of an IO manager resource.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n An IOManagerDefinition is a :py:class:`ResourceDefinition` whose `resource_fn` returns an\n :py:class:`IOManager`.\n\n The easiest way to create an IOManagerDefnition is with the :py:func:`@io_manager <io_manager>`\n decorator.\n """\n\n def __init__(\n self,\n resource_fn: IOManagerFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n output_config_schema: CoercableToConfigSchema = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n # Unlike other configurable objects, whose config schemas default to Any,\n # output_config_schema defaults to None. 
This the because IOManager input / output config\n # shares config namespace with dagster type loaders.\n self._output_config_schema = (\n convert_user_facing_definition_config_schema(output_config_schema)\n if output_config_schema is not None\n else None\n )\n super(IOManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n @property\n def output_config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self._output_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "IOManagerDefinition":\n io_def = IOManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n output_config_schema=self.output_config_schema,\n )\n\n io_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return io_def\n\n
[docs] @public\n @staticmethod\n def hardcoded_io_manager(\n value: "IOManager", description: Optional[str] = None\n ) -> "IOManagerDefinition":\n """A helper function that creates an ``IOManagerDefinition`` with a hardcoded IOManager.\n\n Args:\n value (IOManager): A hardcoded IO Manager which helps mock the definition.\n description ([Optional[str]]): The description of the IO Manager. Defaults to None.\n\n Returns:\n [IOManagerDefinition]: A hardcoded resource.\n """\n check.inst_param(value, "value", IOManager)\n return IOManagerDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n\n
[docs]class IOManager(InputManager, OutputManager):\n """Base class for user-provided IO managers.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n Extend this class to handle how objects are loaded and stored. Users should implement\n ``handle_output`` to store an object and ``load_input`` to retrieve an object.\n """\n\n
[docs] @public\n @abstractmethod\n def load_input(self, context: "InputContext") -> Any:\n """User-defined method that loads an input to an op.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n\n Returns:\n Any: The data object.\n """
\n\n
[docs] @public\n @abstractmethod\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n """User-defined method that stores an output of an op.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n obj (Any): The object, returned by the op, to be stored.\n """
\n\n\n@overload\ndef io_manager(config_schema: IOManagerFunction) -> IOManagerDefinition: ...\n\n\n@overload\ndef io_manager(\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[IOManagerFunction], IOManagerDefinition]: ...\n\n\n
[docs]def io_manager(\n config_schema: Union[IOManagerFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Union[IOManagerDefinition, Callable[[IOManagerFunction], IOManagerDefinition],]:\n """Define an IO manager.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an\n :py:class:`IOManager`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource config. Configuration\n data available in `init_context.resource_config`. If not set, Dagster will accept any\n config provided.\n description(Optional[str]): A human-readable description of the resource.\n output_config_schema (Optional[ConfigSchema]): The schema for per-output config. If not set,\n no per-output configuration will be allowed.\n input_config_schema (Optional[ConfigSchema]): The schema for per-input config. If not set,\n Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the object\n manager.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n\n **Examples:**\n\n .. 
code-block:: python\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n @io_manager\n def my_io_manager(init_context):\n return MyIOManager()\n\n @op(out=Out(io_manager_key="my_io_manager_key"))\n def my_op(_):\n return do_stuff()\n\n @job(resource_defs={"my_io_manager_key": my_io_manager})\n def my_job():\n my_op()\n\n """\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n config_schema = cast(IOManagerFunction, config_schema)\n return _IOManagerDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: IOManagerFunction) -> IOManagerDefinition:\n return _IOManagerDecoratorCallable(\n config_schema=cast(Optional[UserConfigSchema], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n output_config_schema=output_config_schema,\n input_config_schema=input_config_schema,\n )(resource_fn)\n\n return _wrap
\n\n\ndef dagster_maintained_io_manager(io_manager_def: IOManagerDefinition) -> IOManagerDefinition:\n io_manager_def._dagster_maintained = True # noqa: SLF001\n return io_manager_def\n\n\nclass _IOManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n ):\n # type validation happens in IOManagerDefinition\n self.config_schema = config_schema\n self.description = description\n self.required_resource_keys = required_resource_keys\n self.version = version\n self.output_config_schema = output_config_schema\n self.input_config_schema = input_config_schema\n\n def __call__(self, fn: IOManagerFunction) -> IOManagerDefinition:\n check.callable_param(fn, "fn")\n\n io_manager_def = IOManagerDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n output_config_schema=self.output_config_schema,\n input_config_schema=self.input_config_schema,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(io_manager_def, wrapped=fn) # type: ignore\n\n return io_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.io_manager"}, "local_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.local_compute_log_manager

\nimport hashlib\nimport os\nimport shutil\nimport sys\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import IO, TYPE_CHECKING, Generator, Iterator, Mapping, Optional, Sequence, Tuple\n\nfrom typing_extensions import Final\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers.polling import PollingObserver\n\nfrom dagster import (\n    Field,\n    Float,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.execution.compute_logs import mirror_stream_to_file\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._seven import json\nfrom dagster._utils import ensure_dir, ensure_file, touch_file\n\nfrom .captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n    ComputeLogSubscription,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.cloud_storage_compute_log_manager import LogSubscription\n\nDEFAULT_WATCHDOG_POLLING_TIMEOUT: Final = 2.5\n\nIO_TYPE_EXTENSION: Final[Mapping[ComputeIOType, str]] = {\n    ComputeIOType.STDOUT: "out",\n    ComputeIOType.STDERR: "err",\n}\n\nMAX_FILENAME_LENGTH: Final = 255\n\n\n
[docs]class LocalComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """Stores copies of stdout & stderr for each compute step locally on disk."""\n\n def __init__(\n self,\n base_dir: str,\n polling_timeout: Optional[float] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._base_dir = base_dir\n self._polling_timeout = check.opt_float_param(\n polling_timeout, "polling_timeout", DEFAULT_WATCHDOG_POLLING_TIMEOUT\n )\n self._subscription_manager = LocalComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def polling_timeout(self) -> float:\n return self._polling_timeout\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "base_dir": StringSource,\n "polling_timeout": Field(Float, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value\n ) -> "LocalComputeLogManager":\n return LocalComputeLogManager(inst_data=inst_data, **config_value)\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n outpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT])\n errpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR])\n with mirror_stream_to_file(sys.stdout, outpath), mirror_stream_to_file(sys.stderr, errpath):\n yield CapturedLogContext(log_key)\n\n # leave artifact on filesystem so that we know the capture is completed\n touch_file(self.complete_artifact_path(log_key))\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO]]:\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n with open(path, "+a", encoding="utf-8") as 
f:\n yield f\n\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n return os.path.exists(self.complete_artifact_path(log_key))\n\n def get_log_data(\n self, log_key: Sequence[str], cursor: Optional[str] = None, max_bytes: Optional[int] = None\n ) -> CapturedLogData:\n stdout_cursor, stderr_cursor = self.parse_cursor(cursor)\n stdout, stdout_offset = self._read_bytes(\n log_key, ComputeIOType.STDOUT, offset=stdout_cursor, max_bytes=max_bytes\n )\n stderr, stderr_offset = self._read_bytes(\n log_key, ComputeIOType.STDERR, offset=stderr_cursor, max_bytes=max_bytes\n )\n return CapturedLogData(\n log_key=log_key,\n stdout=stdout,\n stderr=stderr,\n cursor=self.build_cursor(stdout_offset, stderr_offset),\n )\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata(\n stdout_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]\n ),\n stderr_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]\n ),\n stdout_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDOUT),\n stderr_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDERR),\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n if log_key:\n paths = [\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n self.get_captured_local_path(log_key, "complete"),\n ]\n for path in paths:\n if os.path.exists(path) and os.path.isfile(path):\n os.remove(path)\n elif prefix:\n dir_to_delete = os.path.join(self._base_dir, *prefix)\n if os.path.exists(dir_to_delete) and 
os.path.isdir(dir_to_delete):\n # recursively delete all files in dir\n shutil.rmtree(dir_to_delete)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def _read_bytes(\n self,\n log_key: Sequence[str],\n io_type: ComputeIOType,\n offset: Optional[int] = 0,\n max_bytes: Optional[int] = None,\n ):\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n return self.read_path(path, offset or 0, max_bytes)\n\n def parse_cursor(self, cursor: Optional[str] = None) -> Tuple[int, int]:\n # Translates a string cursor into a set of byte offsets for stdout, stderr\n if not cursor:\n return 0, 0\n\n parts = cursor.split(":")\n if not parts or len(parts) != 2:\n return 0, 0\n\n stdout, stderr = [int(_) for _ in parts]\n return stdout, stderr\n\n def build_cursor(self, stdout_offset: int, stderr_offset: int) -> str:\n return f"{stdout_offset}:{stderr_offset}"\n\n def complete_artifact_path(self, log_key):\n return self.get_captured_local_path(log_key, "complete")\n\n def read_path(\n self,\n path: str,\n offset: int = 0,\n max_bytes: Optional[int] = None,\n ):\n if not os.path.exists(path) or not os.path.isfile(path):\n return None, offset\n\n with open(path, "rb") as f:\n f.seek(offset, os.SEEK_SET)\n if max_bytes is None:\n data = f.read()\n else:\n data = f.read(max_bytes)\n new_offset = f.tell()\n return data, new_offset\n\n def get_captured_log_download_url(self, log_key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n url = "/logs"\n for part in log_key:\n url = f"{url}/{part}"\n\n return f"{url}/{IO_TYPE_EXTENSION[io_type]}"\n\n def get_captured_local_path(self, log_key: Sequence[str], extension: str, partial=False):\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n if len(filename) > MAX_FILENAME_LENGTH:\n filename = "{}.{}".format(hashlib.md5(filebase.encode("utf-8")).hexdigest(), extension)\n return 
os.path.join(self._base_dir, *namespace, filename)\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n subscription = CapturedLogSubscription(self, log_key, cursor)\n self.on_subscribe(subscription)\n return subscription\n\n def unsubscribe(self, subscription):\n self.on_unsubscribe(subscription)\n\n ###############################################\n #\n # Methods for the ComputeLogManager interface\n #\n ###############################################\n @contextmanager\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n with self.capture_logs(log_key):\n yield\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Legacy adapter from compute log manager to more generic captured log manager API."""\n check.inst_param(io_type, "io_type", ComputeIOType)\n log_key = self.build_log_key_for_run(run_id, key)\n return self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n path = self.get_local_path(run_id, key, io_type)\n\n if not os.path.exists(path) or not os.path.isfile(path):\n return ComputeLogFileData(path=path, data=None, cursor=0, size=0, download_url=None)\n\n # See: https://docs.python.org/2/library/stdtypes.html#file.tell for Windows behavior\n with open(path, "rb") as f:\n f.seek(cursor, os.SEEK_SET)\n data = f.read(max_bytes)\n cursor = f.tell()\n stats = os.fstat(f.fileno())\n\n # local download path\n download_url = self.download_url(run_id, key, io_type)\n return ComputeLogFileData(\n path=path,\n data=data.decode("utf-8"),\n 
cursor=cursor,\n size=stats.st_size,\n download_url=download_url,\n )\n\n def get_key(self, dagster_run: DagsterRun, step_key: Optional[str]):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n return step_key or dagster_run.job_name\n\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n log_key = self.build_log_key_for_run(run_id, key)\n return self.is_capture_complete(log_key)\n\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]):\n pass\n\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str] = None):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n touchpath = self.complete_artifact_path(log_key)\n touch_file(touchpath)\n\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return f"/download/{run_id}/{key}/{io_type.value}"\n\n def on_subscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self) -> None:\n self._subscription_manager.dispose()
\n\n\nclass LocalComputeLogSubscriptionManager:\n def __init__(self, manager):\n self._manager = manager\n self._subscriptions = defaultdict(list)\n self._watchers = {}\n self._observer = None\n\n def add_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if self.is_complete(subscription):\n subscription.fetch()\n subscription.complete()\n else:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n self._subscriptions[watch_key].append(subscription)\n self.watch(subscription)\n\n def is_complete(self, subscription: "LogSubscription") -> bool:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.is_watch_completed(subscription.run_id, subscription.key)\n return self._manager.is_capture_complete(subscription.log_key)\n\n def remove_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if subscription in self._subscriptions[watch_key]:\n self._subscriptions[watch_key].remove(subscription)\n subscription.complete()\n\n def _log_key(self, subscription: "LogSubscription") -> Sequence[str]:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.build_log_key_for_run(subscription.run_id, subscription.key)\n return subscription.log_key\n\n def _watch_key(self, log_key: Sequence[str]) -> str:\n return json.dumps(log_key)\n\n def remove_all_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in 
self._subscriptions.pop(watch_key, []):\n subscription.complete()\n\n def watch(self, subscription: "LogSubscription") -> None:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n return\n\n update_paths = [\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n ]\n complete_paths = [self._manager.complete_artifact_path(log_key)]\n directory = os.path.dirname(\n self._manager.get_captured_local_path(log_key, ComputeIOType.STDERR),\n )\n\n if not self._observer:\n self._observer = PollingObserver(self._manager.polling_timeout)\n self._observer.start()\n\n ensure_dir(directory)\n\n self._watchers[watch_key] = self._observer.schedule(\n LocalComputeLogFilesystemEventHandler(self, log_key, update_paths, complete_paths),\n str(directory),\n )\n\n def notify_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in self._subscriptions[watch_key]:\n subscription.fetch()\n\n def unwatch(self, log_key: Sequence[str], handler) -> None:\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n self._observer.remove_handler_for_watch(handler, self._watchers[watch_key]) # type: ignore\n del self._watchers[watch_key]\n\n def dispose(self) -> None:\n if self._observer:\n self._observer.stop()\n self._observer.join(15)\n\n\nclass LocalComputeLogFilesystemEventHandler(PatternMatchingEventHandler):\n def __init__(self, manager, log_key, update_paths, complete_paths):\n self.manager = manager\n self.log_key = log_key\n self.update_paths = update_paths\n self.complete_paths = complete_paths\n patterns = 
update_paths + complete_paths\n super(LocalComputeLogFilesystemEventHandler, self).__init__(patterns=patterns)\n\n def on_created(self, event):\n if event.src_path in self.complete_paths:\n self.manager.remove_all_subscriptions(self.log_key)\n self.manager.unwatch(self.log_key, self)\n\n def on_modified(self, event):\n if event.src_path in self.update_paths:\n self.manager.notify_subscriptions(self.log_key)\n
", "current_page_name": "_modules/dagster/_core/storage/local_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.local_compute_log_manager"}, "mem_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.mem_io_manager

\nfrom typing import Dict, Tuple\n\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\n\n\n
[docs]class InMemoryIOManager(IOManager):\n """I/O manager that stores and retrieves values in memory. After execution is complete, the values will\n be garbage-collected. Note that this means that each run will not have access to values from previous runs.\n """\n\n def __init__(self):\n self.values: Dict[Tuple[object, ...], object] = {}\n\n def handle_output(self, context: OutputContext, obj: object):\n keys = tuple(context.get_identifier())\n self.values[keys] = obj\n\n def load_input(self, context: InputContext) -> object:\n keys = tuple(context.get_identifier())\n return self.values[keys]
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(description="Built-in IO manager that stores and retrieves values in memory.")\ndef mem_io_manager(_) -> InMemoryIOManager:\n """Built-in IO manager that stores and retrieves values in memory."""\n return InMemoryIOManager()
\n
", "current_page_name": "_modules/dagster/_core/storage/mem_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.mem_io_manager"}, "memoizable_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.memoizable_io_manager

\nimport os\nimport pickle\nfrom abc import abstractmethod\nfrom typing import Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\n\n
[docs]class MemoizableIOManager(IOManager):\n """Base class for IO manager enabled to work with memoized execution. Users should implement\n the ``load_input`` and ``handle_output`` methods described in the ``IOManager`` API, and the\n ``has_output`` method, which returns a boolean representing whether a data object can be found.\n """\n\n
[docs] @public\n @abstractmethod\n def has_output(self, context: OutputContext) -> bool:\n """The user-defined method that returns whether data exists given the metadata.\n\n Args:\n context (OutputContext): The context of the step performing this check.\n\n Returns:\n bool: True if there is data present that matches the provided context. False otherwise.\n """
\n\n\nclass VersionedPickledObjectFilesystemIOManager(MemoizableIOManager):\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n output_context: OutputContext\n\n if isinstance(context, OutputContext):\n output_context = context\n else:\n if context.upstream_output is None:\n raise DagsterInvariantViolationError(\n "Missing value of InputContext.upstream_output. Cannot compute the input path."\n )\n\n output_context = context.upstream_output\n\n # automatically construct filepath\n step_key = check.str_param(output_context.step_key, "context.step_key")\n output_name = check.str_param(output_context.name, "context.name")\n version = check.str_param(output_context.version, "context.version")\n\n return os.path.join(self.base_dir, step_key, output_name, version)\n\n def handle_output(self, context, obj):\n """Pickle the data with the associated version, and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n filepath = self._get_path(context)\n\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n def has_output(self, context):\n """Returns true if data object exists with the associated version, False otherwise."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Checking for file at: {filepath}")\n\n return os.path.exists(filepath) and not 
os.path.isdir(filepath)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\n@experimental\ndef versioned_filesystem_io_manager(init_context):\n """Filesystem IO manager that utilizes versioning of stored objects.\n\n It requires users to specify a base directory where all the step outputs will be stored in. It\n serializes and deserializes output values (assets) using pickling and automatically constructs\n the filepaths for the assets using the provided directory, and the version for a provided step\n output.\n """\n return VersionedPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "versioned_outputs")\n )\n )\n
", "current_page_name": "_modules/dagster/_core/storage/memoizable_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.memoizable_io_manager"}, "noop_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.noop_compute_log_manager

\nfrom contextlib import contextmanager\nfrom typing import IO, Any, Generator, Mapping, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.storage.captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\n\n\n
[docs]class NoOpComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """When enabled for a Dagster instance, stdout and stderr will not be available for any step."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return NoOpComputeLogManager(inst_data=inst_data, **config_value)\n\n def enabled(self, _dagster_run, _step_key):\n return False\n\n def _watch_logs(self, dagster_run, step_key=None):\n pass\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n raise NotImplementedError()\n\n def is_watch_completed(self, run_id, key):\n return True\n\n def on_watch_start(self, dagster_run, step_key):\n pass\n\n def on_watch_finish(self, dagster_run, step_key):\n pass\n\n def download_url(self, run_id, key, io_type):\n return None\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n return ComputeLogFileData(\n path=f"{key}.{io_type}", data=None, cursor=0, size=0, download_url=None\n )\n\n def on_subscribe(self, subscription):\n pass\n\n def on_unsubscribe(self, subscription):\n pass\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n yield CapturedLogContext(log_key=log_key)\n\n def is_capture_complete(self, log_key: Sequence[str]):\n return True\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Generator[Optional[IO], None, None]:\n yield None\n\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n return 
CapturedLogData(log_key=log_key)\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata()\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n pass\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n return CapturedLogSubscription(self, log_key, cursor)\n\n def unsubscribe(self, subscription: CapturedLogSubscription):\n pass
\n
", "current_page_name": "_modules/dagster/_core/storage/noop_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.noop_compute_log_manager"}, "root": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.root

\nimport os\nfrom tempfile import TemporaryDirectory\nfrom typing import Optional\n\nfrom typing_extensions import TypedDict\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\n\nclass LocalArtifactStorageConfig(TypedDict):\n    base_dir: str\n\n\n
[docs]class LocalArtifactStorage(ConfigurableClass):\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = base_dir\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def base_dir(self) -> str:\n return self._base_dir\n\n def file_manager_dir(self, run_id: str) -> str:\n check.str_param(run_id, "run_id")\n return os.path.join(self.base_dir, "storage", run_id, "files")\n\n @property\n def storage_dir(self) -> str:\n return os.path.join(self.base_dir, "storage")\n\n @property\n def schedules_dir(self) -> str:\n return os.path.join(self.base_dir, "schedules")\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: LocalArtifactStorageConfig\n ) -> "LocalArtifactStorage":\n return LocalArtifactStorage(inst_data=inst_data, **config_value)\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n def dispose(self):\n pass
\n\n\nclass TemporaryLocalArtifactStorage(LocalArtifactStorage):\n """Used by ephemeral DagsterInstances, defers directory creation til\n access since many uses of ephemeral instance do not require artifact directory.\n """\n\n def __init__(self):\n self._temp_dir = None\n\n @property\n def base_dir(self):\n if self._temp_dir is None:\n self._temp_dir = TemporaryDirectory()\n return self._temp_dir.name\n\n def dispose(self):\n if self._temp_dir:\n self._temp_dir.cleanup()\n
", "current_page_name": "_modules/dagster/_core/storage/root", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.root"}, "runs": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Mapping, Optional, Sequence, Set, Tuple, Union\n\nfrom typing_extensions import TypedDict\n\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\nfrom dagster._core.storage.dagster_run import (\n    DagsterRun,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._utils import PrintFn\n\nfrom ..daemon_cursor import DaemonCursorStorage\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\nclass RunGroupInfo(TypedDict):\n    count: int\n    runs: Sequence[DagsterRun]\n\n\n
[docs]class RunStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance], DaemonCursorStorage):\n """Abstract base class for storing pipeline run history.\n\n Note that run storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.runs.SqlRunStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n """Add a run to storage.\n\n If a run already exists with the same ID, raise DagsterRunAlreadyExists\n If the run's snapshot ID does not exist raise DagsterSnapshotDoesNotExist\n\n Args:\n dagster_run (DagsterRun): The run to add.\n """\n\n @abstractmethod\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n """Update run storage in accordance to a pipeline run related DagsterEvent.\n\n Args:\n run_id (str)\n event (DagsterEvent)\n """\n\n @abstractmethod\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n """Return all the runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. 
Defaults to infinite.\n\n Returns:\n List[PipelineRun]\n """\n\n @abstractmethod\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n """Return all the run IDs for runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n Sequence[str]\n """\n\n @abstractmethod\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n """Return the number of runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.PipelineRunFilter` by which to filter\n runs\n\n Returns:\n int: The number of runs that match the given filters.\n """\n\n @abstractmethod\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n """Get the run group to which a given run belongs.\n\n Args:\n run_id (str): If the corresponding run is the descendant of some root run (i.e., there\n is a root_run_id on the :py:class:`PipelineRun`), that root run and all of its\n descendants are returned; otherwise, the group will consist only of the given run\n (a run that does not descend from any root is its own root).\n\n Returns:\n Optional[Tuple[string, List[PipelineRun]]]: If there is a corresponding run group, tuple\n whose first element is the root_run_id and whose second element is a list of all the\n descendent runs. 
Otherwise `None`.\n """\n\n @abstractmethod\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n\n @abstractmethod\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n """Get a list of tag keys and the values that have been associated with them.\n\n Args:\n tag_keys (Optional[Sequence[str]]): tag keys to filter by.\n\n Returns:\n List[Tuple[str, Set[str]]]\n """\n\n @abstractmethod\n def get_run_tag_keys(self) -> Sequence[str]:\n """Get a list of tag keys.\n\n Returns:\n List[str]\n """\n\n @abstractmethod\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n """Add additional tags for a pipeline run.\n\n Args:\n run_id (str)\n new_tags (Dict[string, string])\n """\n\n @abstractmethod\n def has_run(self, run_id: str) -> bool:\n """Check if the storage contains a run.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n bool\n """\n\n def add_snapshot(\n self,\n snapshot: Union[JobSnapshot, ExecutionPlanSnapshot],\n snapshot_id: Optional[str] = None,\n ) -> None:\n """Add a snapshot to the storage.\n\n Args:\n snapshot (Union[PipelineSnapshot, 
ExecutionPlanSnapshot])\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n """\n if isinstance(snapshot, JobSnapshot):\n self.add_job_snapshot(snapshot, snapshot_id)\n else:\n self.add_execution_plan_snapshot(snapshot, snapshot_id)\n\n def has_snapshot(self, snapshot_id: str):\n return self.has_job_snapshot(snapshot_id) or self.has_execution_plan_snapshot(snapshot_id)\n\n @abstractmethod\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n """Check to see if storage contains a pipeline snapshot.\n\n Args:\n pipeline_snapshot_id (str): The id of the run.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n """Add a pipeline snapshot to the run store.\n\n Pipeline snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n job_snapshot (PipelineSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The job_snapshot_id\n """\n\n @abstractmethod\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n job_snapshot_id (str)\n\n Returns:\n PipelineSnapshot\n """\n\n @abstractmethod\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n """Check to see if storage contains an execution plan snapshot.\n\n Args:\n execution_plan_snapshot_id (str): The id of the execution plan.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add an execution plan snapshot to the run store.\n\n Execution plan snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n execution_plan_snapshot (ExecutionPlanSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The execution_plan_snapshot_id\n """\n\n @abstractmethod\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n execution_plan_snapshot_id (str)\n\n Returns:\n ExecutionPlanSnapshot\n """\n\n @abstractmethod\n def wipe(self) -> None:\n """Clears the run storage."""\n\n @abstractmethod\n def delete_run(self, run_id: str) -> None:\n """Remove a run from storage."""\n\n @property\n def supports_bucket_queries(self) -> bool:\n return False\n\n @abstractmethod\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n # Daemon Heartbeat Storage\n #\n # Holds heartbeats from the Dagster Daemon so that other system components can alert when it's not\n # alive.\n # This is temporarily placed along with run storage to avoid adding a new instance concept. 
It\n # should be split out once all metadata storages are configured together.\n\n @abstractmethod\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n """Called on a regular interval by the daemon."""\n\n @abstractmethod\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n """Latest heartbeats of all daemon types."""\n\n @abstractmethod\n def wipe_daemon_heartbeats(self) -> None:\n """Wipe all daemon heartbeats."""\n\n # Backfill storage\n @abstractmethod\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n """Get a list of partition backfills."""\n\n @abstractmethod\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n """Get the partition backfill of the given backfill id."""\n\n @abstractmethod\n def add_backfill(self, partition_backfill: PartitionBackfill):\n """Add partition backfill to run storage."""\n\n @abstractmethod\n def update_backfill(self, partition_backfill: PartitionBackfill):\n """Update a partition backfill in run storage."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @abstractmethod\n def replace_job_origin(self, run: "DagsterRun", job_origin: "ExternalJobOrigin") -> None: ...
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.base"}, "sql_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sql_run_storage

\nimport logging\nimport uuid\nimport zlib\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunNotFoundError,\n    DagsterSnapshotDoesNotExist,\n)\nfrom dagster._core.events import EVENT_TYPE_TO_PIPELINE_RUN_STATUS, DagsterEvent, DagsterEventType\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.host_representation.origin import ExternalJobOrigin\nfrom dagster._core.snap import (\n    ExecutionPlanSnapshot,\n    JobSnapshot,\n    create_execution_plan_snapshot_id,\n    create_job_snapshot_id,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_fetch_mappings,\n    db_scalar_subquery,\n    db_select,\n    db_subquery,\n)\nfrom dagster._core.storage.tags import (\n    PARTITION_NAME_TAG,\n    PARTITION_SET_TAG,\n    REPOSITORY_LABEL_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..dagster_run import (\n    DagsterRun,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom .base import RunStorage\nfrom .migration import (\n    OPTIONAL_DATA_MIGRATIONS,\n    
REQUIRED_DATA_MIGRATIONS,\n    RUN_PARTITIONS,\n    MigrationFn,\n)\nfrom .schema import (\n    BulkActionsTable,\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    KeyValueStoreTable,\n    RunsTable,\n    RunTagsTable,\n    SecondaryIndexMigrationTable,\n    SnapshotsTable,\n)\n\n\nclass SnapshotType(Enum):\n    PIPELINE = "PIPELINE"\n    EXECUTION_PLAN = "EXECUTION_PLAN"\n\n\n
[docs]class SqlRunStorage(RunStorage):\n """Base class for SQL based run storages."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema or data migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def fetchall(self, query: SqlAlchemyQuery) -> Sequence[Any]:\n with self.connect() as conn:\n return db_fetch_mappings(conn, query)\n\n def fetchone(self, query: SqlAlchemyQuery) -> Optional[Any]:\n with self.connect() as conn:\n if db.__version__.startswith("2."):\n return conn.execute(query).mappings().first()\n else:\n return conn.execute(query).fetchone()\n\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n if dagster_run.job_snapshot_id and not self.has_job_snapshot(dagster_run.job_snapshot_id):\n raise DagsterSnapshotDoesNotExist(\n f"Snapshot {dagster_run.job_snapshot_id} does not exist in run storage"\n )\n\n has_tags = dagster_run.tags and len(dagster_run.tags) > 0\n partition = dagster_run.tags.get(PARTITION_NAME_TAG) if has_tags else None\n partition_set = dagster_run.tags.get(PARTITION_SET_TAG) if has_tags else None\n\n runs_insert = RunsTable.insert().values(\n run_id=dagster_run.run_id,\n pipeline_name=dagster_run.job_name,\n status=dagster_run.status.value,\n run_body=serialize_value(dagster_run),\n snapshot_id=dagster_run.job_snapshot_id,\n partition=partition,\n partition_set=partition_set,\n )\n with self.connect() as conn:\n try:\n conn.execute(runs_insert)\n except db_exc.IntegrityError as exc:\n raise DagsterRunAlreadyExists from exc\n\n tags_to_insert = dagster_run.tags_for_storage()\n if tags_to_insert:\n conn.execute(\n RunTagsTable.insert(),\n [\n dict(run_id=dagster_run.run_id, key=k, value=v)\n for k, v in tags_to_insert.items()\n ],\n )\n\n return 
dagster_run\n\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n check.str_param(run_id, "run_id")\n check.inst_param(event, "event", DagsterEvent)\n\n if event.event_type not in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n return\n\n run = self._get_run_by_id(run_id)\n if not run:\n # TODO log?\n return\n\n new_job_status = EVENT_TYPE_TO_PIPELINE_RUN_STATUS[event.event_type]\n\n run_stats_cols_in_index = self.has_run_stats_index_cols()\n\n kwargs = {}\n\n # consider changing the `handle_run_event` signature to get timestamp off of the\n # EventLogEntry instead of the DagsterEvent, for consistency\n now = pendulum.now("UTC")\n\n if run_stats_cols_in_index and event.event_type == DagsterEventType.PIPELINE_START:\n kwargs["start_time"] = now.timestamp()\n\n if run_stats_cols_in_index and event.event_type in {\n DagsterEventType.PIPELINE_CANCELED,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.PIPELINE_SUCCESS,\n }:\n kwargs["end_time"] = now.timestamp()\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_status(new_job_status)),\n status=new_job_status.value,\n update_timestamp=now,\n **kwargs,\n )\n )\n\n def _row_to_run(self, row: Dict) -> DagsterRun:\n run = deserialize_value(row["run_body"], DagsterRun)\n status = DagsterRunStatus(row["status"])\n # NOTE: the status column is more trustworthy than the status in the run body, since concurrent\n # writes (e.g. 
handle_run_event and add_tags) can cause the status in the body to be out of\n # overriden with an old value.\n return run.with_status(status)\n\n def _rows_to_runs(self, rows: Iterable[Dict]) -> Sequence[DagsterRun]:\n return list(map(self._row_to_run, rows))\n\n def _add_cursor_limit_to_query(\n self,\n query: SqlAlchemyQuery,\n cursor: Optional[str],\n limit: Optional[int],\n order_by: Optional[str],\n ascending: Optional[bool],\n ) -> SqlAlchemyQuery:\n """Helper function to deal with cursor/limit pagination args."""\n if cursor:\n cursor_query = db_select([RunsTable.c.id]).where(RunsTable.c.run_id == cursor)\n query = query.where(RunsTable.c.id < db_scalar_subquery(cursor_query))\n\n if limit:\n query = query.limit(limit)\n\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else db.desc\n query = query.order_by(direction(sorting_column))\n\n return query\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def _add_filters_to_query(self, query: SqlAlchemyQuery, filters: RunsFilter) -> SqlAlchemyQuery:\n check.inst_param(filters, "filters", RunsFilter)\n\n if filters.run_ids:\n query = query.where(RunsTable.c.run_id.in_(filters.run_ids))\n\n if filters.job_name:\n query = query.where(RunsTable.c.pipeline_name == filters.job_name)\n\n if filters.statuses:\n query = query.where(\n RunsTable.c.status.in_([status.value for status in filters.statuses])\n )\n\n if filters.snapshot_id:\n query = query.where(RunsTable.c.snapshot_id == filters.snapshot_id)\n\n if filters.updated_after:\n query = query.where(RunsTable.c.update_timestamp > filters.updated_after)\n\n if filters.updated_before:\n query = query.where(RunsTable.c.update_timestamp < filters.updated_before)\n\n if filters.created_after:\n query = query.where(RunsTable.c.create_timestamp > filters.created_after)\n\n if filters.created_before:\n query = query.where(RunsTable.c.create_timestamp < filters.created_before)\n\n return 
query\n\n def _runs_query(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n columns: Optional[Sequence[str]] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> SqlAlchemyQuery:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_str_param(cursor, "cursor")\n check.opt_int_param(limit, "limit")\n check.opt_sequence_param(columns, "columns")\n check.opt_str_param(order_by, "order_by")\n check.opt_bool_param(ascending, "ascending")\n\n if columns is None:\n columns = ["run_body", "status"]\n\n if filters.tags:\n table = self._apply_tags_table_joins(RunsTable, filters.tags)\n else:\n table = RunsTable\n\n base_query = db_select([getattr(RunsTable.c, column) for column in columns]).select_from(\n table\n )\n base_query = self._add_filters_to_query(base_query, filters)\n return self._add_cursor_limit_to_query(base_query, cursor, limit, order_by, ascending)\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n ) -> db.Table:\n multi_join = len(tags) > 1\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = (\n db_subquery(db_select([RunTagsTable]), f"run_tags_subquery_{i}")\n if multi_join\n else RunTagsTable\n )\n table = table.join(\n tags_table,\n db.and_(\n RunsTable.c.run_id == tags_table.c.run_id,\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n query = self._runs_query(filters, cursor, limit, bucket_by=bucket_by)\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n def 
get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n query = self._runs_query(filters=filters, cursor=cursor, limit=limit, columns=["run_id"])\n rows = self.fetchall(query)\n return [row["run_id"] for row in rows]\n\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n subquery = db_subquery(self._runs_query(filters=filters))\n query = db_select([db.func.count().label("count")]).select_from(subquery)\n row = self.fetchone(query)\n count = row["count"] if row else 0\n return count\n\n def _get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:\n check.str_param(run_id, "run_id")\n\n query = db_select([RunsTable.c.run_body, RunsTable.c.status]).where(\n RunsTable.c.run_id == run_id\n )\n rows = self.fetchall(query)\n return self._row_to_run(rows[0]) if rows else None\n\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_int_param(limit, "limit")\n\n columns = ["id", "run_body", "status", "create_timestamp", "update_timestamp"]\n\n if self.has_run_stats_index_cols():\n columns += ["start_time", "end_time"]\n # only fetch columns we use to build RunRecord\n query = self._runs_query(\n filters=filters,\n limit=limit,\n columns=columns,\n order_by=order_by,\n ascending=ascending,\n cursor=cursor,\n bucket_by=bucket_by,\n )\n\n rows = self.fetchall(query)\n return [\n RunRecord(\n storage_id=check.int_param(row["id"], "id"),\n dagster_run=self._row_to_run(row),\n create_timestamp=check.inst(row["create_timestamp"], datetime),\n update_timestamp=check.inst(row["update_timestamp"], datetime),\n start_time=(\n 
check.opt_inst(row["start_time"], float) if "start_time" in row else None\n ),\n end_time=check.opt_inst(row["end_time"], float) if "end_time" in row else None,\n )\n for row in rows\n ]\n\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n result = defaultdict(set)\n query = (\n db_select([RunTagsTable.c.key, RunTagsTable.c.value])\n .distinct()\n .order_by(RunTagsTable.c.key, RunTagsTable.c.value)\n )\n if tag_keys:\n query = query.where(RunTagsTable.c.key.in_(tag_keys))\n if value_prefix:\n query = query.where(RunTagsTable.c.value.startswith(value_prefix))\n if limit:\n query = query.limit(limit)\n rows = self.fetchall(query)\n for r in rows:\n result[r["key"]].add(r["value"])\n return sorted(list([(k, v) for k, v in result.items()]), key=lambda x: x[0])\n\n def get_run_tag_keys(self) -> Sequence[str]:\n query = db_select([RunTagsTable.c.key]).distinct().order_by(RunTagsTable.c.key)\n rows = self.fetchall(query)\n return sorted([r["key"] for r in rows])\n\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n check.str_param(run_id, "run_id")\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n run = self._get_run_by_id(run_id)\n if not run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n current_tags = run.tags if run.tags else {}\n\n all_tags = merge_dicts(current_tags, new_tags)\n partition = all_tags.get(PARTITION_NAME_TAG)\n partition_set = all_tags.get(PARTITION_SET_TAG)\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_tags(merge_dicts(current_tags, new_tags))),\n partition=partition,\n partition_set=partition_set,\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n current_tags_set = set(current_tags.keys())\n 
new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n RunTagsTable.update()\n .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == tag))\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n RunTagsTable.insert(),\n [dict(run_id=run_id, key=tag, value=new_tags[tag]) for tag in added_tags],\n )\n\n def get_run_group(self, run_id: str) -> Tuple[str, Sequence[DagsterRun]]:\n check.str_param(run_id, "run_id")\n dagster_run = self._get_run_by_id(run_id)\n if not dagster_run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n\n # find root_run\n root_run_id = dagster_run.root_run_id if dagster_run.root_run_id else dagster_run.run_id\n root_run = self._get_run_by_id(root_run_id)\n if not root_run:\n raise DagsterRunNotFoundError(\n f"Run id {root_run_id} set as root run id for run {run_id} was not found in"\n " instance.",\n invalid_run_id=root_run_id,\n )\n\n # root_run_id to run_id 1:1 mapping\n # https://github.com/dagster-io/dagster/issues/2495\n # Note: we currently use tags to persist the run group info\n root_to_run = db_subquery(\n db_select(\n [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]\n ).where(\n db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)\n ),\n "root_to_run",\n )\n # get run group\n run_group_query = db_select([RunsTable.c.run_body, RunsTable.c.status]).select_from(\n root_to_run.join(\n RunsTable,\n root_to_run.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n\n res = self.fetchall(run_group_query)\n run_group = self._rows_to_runs(res)\n\n return (root_run_id, [root_run, *run_group])\n\n def has_run(self, run_id: str) -> bool:\n check.str_param(run_id, "run_id")\n return bool(self._get_run_by_id(run_id))\n\n def delete_run(self, run_id: str) -> 
None:\n check.str_param(run_id, "run_id")\n query = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(query)\n\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._has_snapshot_id(job_snapshot_id)\n\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_job_snapshot_id(job_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=job_snapshot,\n snapshot_type=SnapshotType.PIPELINE,\n )\n\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._get_snapshot(job_snapshot_id) # type: ignore # (allowed to return None?)\n\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return bool(self.get_execution_plan_snapshot(execution_plan_snapshot_id))\n\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=execution_plan_snapshot,\n snapshot_type=SnapshotType.EXECUTION_PLAN,\n )\n\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return self._get_snapshot(execution_plan_snapshot_id) # type: ignore # (allowed to return 
None?)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n check.str_param(snapshot_id, "snapshot_id")\n check.not_none_param(snapshot_obj, "snapshot_obj")\n check.inst_param(snapshot_type, "snapshot_type", SnapshotType)\n\n with self.connect() as conn:\n snapshot_insert = SnapshotsTable.insert().values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n try:\n conn.execute(snapshot_insert)\n except db_exc.IntegrityError:\n # on_conflict_do_nothing equivalent\n pass\n\n return snapshot_id\n\n def get_run_storage_id(self) -> str:\n query = db_select([InstanceInfo.c.run_storage_id])\n row = self.fetchone(query)\n if not row:\n run_storage_id = str(uuid.uuid4())\n with self.connect() as conn:\n conn.execute(InstanceInfo.insert().values(run_storage_id=run_storage_id))\n return run_storage_id\n else:\n return row["run_storage_id"]\n\n def _has_snapshot_id(self, snapshot_id: str) -> bool:\n query = db_select([SnapshotsTable.c.snapshot_id]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return bool(row)\n\n def _get_snapshot(self, snapshot_id: str) -> Optional[JobSnapshot]:\n query = db_select([SnapshotsTable.c.snapshot_body]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return defensively_unpack_execution_plan_snapshot_query(logging, [row["snapshot_body"]]) if row else None # type: ignore\n\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n if self.has_built_index(RUN_PARTITIONS) and self.has_run_stats_index_cols():\n query = self._runs_query(\n filters=runs_filter,\n columns=["run_id", "status", "start_time", "end_time", "partition"],\n )\n rows = self.fetchall(query)\n\n # dedup by partition\n _partition_data_by_partition = {}\n for row in rows:\n if not row["partition"] or row["partition"] in 
_partition_data_by_partition:\n continue\n\n _partition_data_by_partition[row["partition"]] = RunPartitionData(\n run_id=row["run_id"],\n partition=row["partition"],\n status=DagsterRunStatus[row["status"]],\n start_time=row["start_time"],\n end_time=row["end_time"],\n )\n\n return list(_partition_data_by_partition.values())\n else:\n query = self._runs_query(filters=runs_filter)\n rows = self.fetchall(query)\n _partition_data_by_partition = {}\n for row in rows:\n run = self._row_to_run(row)\n partition = run.tags.get(PARTITION_NAME_TAG)\n if not partition or partition in _partition_data_by_partition:\n continue\n\n _partition_data_by_partition[partition] = RunPartitionData(\n run_id=run.run_id,\n partition=partition,\n status=run.status,\n start_time=None,\n end_time=None,\n )\n\n return list(_partition_data_by_partition.values())\n\n def _get_partition_runs(\n self, partition_set_name: str, partition_name: str\n ) -> Sequence[DagsterRun]:\n # utility method to help test reads off of the partition column\n if not self.has_built_index(RUN_PARTITIONS):\n # query by tags\n return self.get_runs(\n filters=RunsFilter(\n tags={\n PARTITION_SET_TAG: partition_set_name,\n PARTITION_NAME_TAG: partition_name,\n }\n )\n )\n else:\n query = (\n self._runs_query()\n .where(RunsTable.c.partition == partition_name)\n .where(RunsTable.c.partition_set == partition_set_name)\n )\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n # Tracking data migrations over secondary indexes\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[[], MigrationFn]],\n print_fn: Optional[PrintFn] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n 
migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(REQUIRED_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(OPTIONAL_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def has_built_index(self, migration_name: str) -> bool:\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n results = self.fetchall(query)\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n # Checking for migrations\n\n def has_run_stats_index_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(RunsTable.name)]\n return "start_time" in column_names and "end_time" in column_names\n\n def has_bulk_actions_selector_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [\n x.get("name") for x in db.inspect(conn).get_columns(BulkActionsTable.name)\n ]\n return "selector_id" in column_names\n\n # Daemon heartbeats\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert, or update if already present\n try:\n conn.execute(\n 
DaemonHeartbeatsTable.insert().values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n DaemonHeartbeatsTable.update()\n .where(DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n rows = self.fetchall(db_select([DaemonHeartbeatsTable.c.body]))\n heartbeats = []\n for row in rows:\n heartbeats.append(deserialize_value(row["body"], DaemonHeartbeat))\n return {heartbeat.daemon_type: heartbeat for heartbeat in heartbeats}\n\n def wipe(self) -> None:\n """Clears the run storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(RunsTable.delete())\n conn.execute(RunTagsTable.delete())\n conn.execute(SnapshotsTable.delete())\n conn.execute(DaemonHeartbeatsTable.delete())\n conn.execute(BulkActionsTable.delete())\n\n def wipe_daemon_heartbeats(self) -> None:\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(DaemonHeartbeatsTable.delete())\n\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n check.opt_inst_param(status, "status", BulkActionStatus)\n query = db_select([BulkActionsTable.c.body])\n if status:\n query = query.where(BulkActionsTable.c.status == status.value)\n if cursor:\n cursor_query = db_select([BulkActionsTable.c.id]).where(\n BulkActionsTable.c.key == cursor\n )\n query = query.where(BulkActionsTable.c.id < cursor_query)\n if limit:\n query = query.limit(limit)\n query = 
query.order_by(BulkActionsTable.c.id.desc())\n rows = self.fetchall(query)\n return [deserialize_value(row["body"], PartitionBackfill) for row in rows]\n\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n check.str_param(backfill_id, "backfill_id")\n query = db_select([BulkActionsTable.c.body]).where(BulkActionsTable.c.key == backfill_id)\n row = self.fetchone(query)\n return deserialize_value(row["body"], PartitionBackfill) if row else None\n\n def add_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n values: Dict[str, Any] = dict(\n key=partition_backfill.backfill_id,\n status=partition_backfill.status.value,\n timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),\n body=serialize_value(cast(NamedTuple, partition_backfill)),\n )\n\n if self.has_bulk_actions_selector_cols():\n values["selector_id"] = partition_backfill.selector_id\n values["action_type"] = partition_backfill.bulk_action_type.value\n\n with self.connect() as conn:\n conn.execute(BulkActionsTable.insert().values(**values))\n\n def update_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n backfill_id = partition_backfill.backfill_id\n if not self.get_backfill(backfill_id):\n raise DagsterInvariantViolationError(\n f"Backfill {backfill_id} is not present in storage"\n )\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.update()\n .where(BulkActionsTable.c.key == backfill_id)\n .values(\n status=partition_backfill.status.value,\n body=serialize_value(partition_backfill),\n )\n )\n\n def get_cursor_values(self, keys: Set[str]) -> Mapping[str, str]:\n check.set_param(keys, "keys", of_type=str)\n\n rows = self.fetchall(\n db_select([KeyValueStoreTable.c.key, KeyValueStoreTable.c.value]).where(\n KeyValueStoreTable.c.key.in_(keys)\n ),\n )\n return 
{row["key"]: row["value"] for row in rows}\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n try:\n conn.execute(KeyValueStoreTable.insert().values(db_values))\n except db_exc.IntegrityError:\n conn.execute(\n KeyValueStoreTable.update()\n .where(KeyValueStoreTable.c.key.in_(pairs.keys()))\n .values(value=db.sql.case(pairs, value=KeyValueStoreTable.c.key))\n )\n\n # Migrating run history\n def replace_job_origin(self, run: DagsterRun, job_origin: ExternalJobOrigin) -> None:\n new_label = job_origin.external_repository_origin.get_label()\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run.run_id)\n .values(\n run_body=serialize_value(run.with_job_origin(job_origin)),\n )\n )\n conn.execute(\n RunTagsTable.update()\n .where(RunTagsTable.c.run_id == run.run_id)\n .where(RunTagsTable.c.key == REPOSITORY_LABEL_TAG)\n .values(value=new_label)\n )
\n\n\nGET_PIPELINE_SNAPSHOT_QUERY_ID = "get-pipeline-snapshot"\n\n\ndef defensively_unpack_execution_plan_snapshot_query(\n logger: logging.Logger, row: Sequence[Any]\n) -> Optional[Union[ExecutionPlanSnapshot, JobSnapshot]]:\n # minimal checking here because sqlalchemy returns a different type based on what version of\n # SqlAlchemy you are using\n\n def _warn(msg: str) -> None:\n logger.warning(f"get-pipeline-snapshot: {msg}")\n\n if not isinstance(row[0], bytes):\n _warn("First entry in row is not a binary type.")\n return None\n\n try:\n uncompressed_bytes = zlib.decompress(row[0])\n except zlib.error:\n _warn("Could not decompress bytes stored in snapshot table.")\n return None\n\n try:\n decoded_str = uncompressed_bytes.decode("utf-8")\n except UnicodeDecodeError:\n _warn("Could not unicode decode decompressed bytes stored in snapshot table.")\n return None\n\n try:\n return deserialize_value(decoded_str, (ExecutionPlanSnapshot, JobSnapshot))\n except JSONDecodeError:\n _warn("Could not parse json in snapshot table.")\n return None\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sql_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sql_run_storage"}, "sqlite": {"sqlite_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sqlite.sqlite_run_storage

\nimport os\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Iterator, Optional\nfrom urllib.parse import urljoin, urlparse\n\nimport sqlalchemy as db\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_downgrade,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import InstanceInfo, RunsTable, RunStorageSqlMetadata, RunTagsTable\nfrom ..sql_run_storage import SqlRunStorage\n\nif TYPE_CHECKING:\n    from dagster._core.storage.sqlite_storage import SqliteStorageConfig\nMINIMUM_SQLITE_BUCKET_VERSION = [3, 25, 0]\n\n\n
[docs]class SqliteRunStorage(SqlRunStorage, ConfigurableClass):\n """SQLite-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default run storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the run storage where on disk to store the database.\n """\n\n def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteRunStorage":\n return SqliteRunStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None) -> Self:\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "runs")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n RunStorageSqlMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n table_names = db.inspect(engine).get_table_names()\n if "instance_info" not in table_names:\n InstanceInfo.create(engine)\n\n run_storage = cls(conn_string, inst_data)\n\n if should_mark_indexes:\n run_storage.migrate()\n run_storage.optimize()\n\n return run_storage\n\n @contextmanager\n def connect(self) -> Iterator[Connection]:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def _alembic_upgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn, rev=rev)\n\n def _alembic_downgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_downgrade(alembic_config, conn, rev=rev)\n\n def upgrade(self) -> None:\n self._check_for_version_066_migration_and_perform()\n self._alembic_upgrade()\n\n # In version 0.6.6, we changed the layout of the of the sqllite dbs on disk\n # to move from the root of DAGSTER_HOME/runs.db to DAGSTER_HOME/history/runs.bd\n # This function checks for that condition and does the move\n def _check_for_version_066_migration_and_perform(self) -> None:\n old_conn_string = "sqlite://" + urljoin(urlparse(self._conn_string).path, "../runs.db")\n path_to_old_db = urlparse(old_conn_string).path\n # sqlite URLs look like `sqlite:///foo/bar/baz on Unix/Mac` but on Windows they look like\n # `sqlite:///D:/foo/bar/baz` (or `sqlite:///D:\\foo\\bar\\baz`)\n if os.name == "nt":\n path_to_old_db = path_to_old_db.lstrip("/")\n if os.path.exists(path_to_old_db):\n old_storage = SqliteRunStorage(old_conn_string)\n old_runs = 
old_storage.get_runs()\n for run in old_runs:\n self.add_run(run)\n os.unlink(path_to_old_db)\n\n def delete_run(self, run_id: str) -> None:\n """Override the default sql delete run implementation until we can get full\n support on cascading deletes.\n """\n check.str_param(run_id, "run_id")\n remove_tags = db.delete(RunTagsTable).where(RunTagsTable.c.run_id == run_id)\n remove_run = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(remove_tags)\n conn.execute(remove_run)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sqlite/sqlite_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sqlite.sqlite_run_storage"}}}, "schedules": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.base

\nimport abc\nfrom typing import Mapping, Optional, Sequence, Set\n\nfrom dagster import AssetKey\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._utils import PrintFn\n\n\n
[docs]class ScheduleStorage(abc.ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract class for managing persistance of scheduler artifacts."""\n\n @abc.abstractmethod\n def wipe(self) -> None:\n """Delete all schedules from storage."""\n\n @abc.abstractmethod\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n """Return all InstigationStates present in storage.\n\n Args:\n repository_origin_id (Optional[str]): The ExternalRepository target id to scope results to\n repository_selector_id (Optional[str]): The repository selector id to scope results to\n instigator_type (Optional[InstigatorType]): The InstigatorType to scope results to\n instigator_statuses (Optional[Set[InstigatorStatus]]): The InstigatorStatuses to scope results to\n """\n\n @abc.abstractmethod\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n """Return the instigator state for the given id.\n\n Args:\n origin_id (str): The unique instigator identifier\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Add an instigator state to storage.\n\n Args:\n state (InstigatorState): The state to add\n """\n\n @abc.abstractmethod\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Update an instigator state in storage.\n\n Args:\n state (InstigatorState): The state to update\n """\n\n @abc.abstractmethod\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n """Delete a state in storage.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n """\n\n @property\n def 
supports_batch_queries(self) -> bool:\n return False\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n raise NotImplementedError()\n\n @abc.abstractmethod\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n """Get the ticks for a given instigator.\n\n Args:\n origin_id (str): The id of the instigator target\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n """Add a tick to storage.\n\n Args:\n tick_data (TickData): The tick to add\n """\n\n @abc.abstractmethod\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n """Update a tick already in storage.\n\n Args:\n tick (InstigatorTick): The tick to update\n """\n\n @abc.abstractmethod\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n """Wipe ticks for an instigator for a certain status and timestamp.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n before (datetime): All ticks before this datetime will get purged\n tick_statuses (Optional[List[TickStatus]]): The tick statuses to wipe\n """\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n return True\n\n @abc.abstractmethod\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ) -> None:\n """Add asset policy evaluations to storage."""\n\n @abc.abstractmethod\n def get_auto_materialize_asset_evaluations(\n self, asset_key: 
AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n """Get the policy evaluations for a given asset.\n\n Args:\n asset_key (AssetKey): The asset key to query\n limit (Optional[int]): The maximum number of evaluations to return\n cursor (Optional[int]): The cursor to paginate from\n """\n\n @abc.abstractmethod\n def purge_asset_evaluations(self, before: float) -> None:\n """Wipe evaluations before a certain timestamp.\n\n Args:\n before (datetime): All evaluations before this datetime will get purged\n """\n\n @abc.abstractmethod\n def upgrade(self) -> None:\n """Perform any needed migrations."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.base"}, "sql_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sql_schedule_storage

\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    TypeVar,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import db_fetch_mappings, db_select, db_subquery\nfrom dagster._serdes import serialize_value\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\n\nfrom .base import ScheduleStorage\nfrom .migration import (\n    OPTIONAL_SCHEDULE_DATA_MIGRATIONS,\n    REQUIRED_SCHEDULE_DATA_MIGRATIONS,\n    SCHEDULE_JOBS_SELECTOR_ID,\n    SCHEDULE_TICKS_SELECTOR_ID,\n)\nfrom .schema import (\n    AssetDaemonAssetEvaluationsTable,\n    InstigatorsTable,\n    JobTable,\n    JobTickTable,\n    SecondaryIndexMigrationTable,\n)\n\nT_NamedTuple = TypeVar("T_NamedTuple", bound=NamedTuple)\n\n\n
[docs]class SqlScheduleStorage(ScheduleStorage):\n """Base class for SQL backed schedule storage."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n def execute(self, query: SqlAlchemyQuery) -> Sequence[SqlAlchemyRow]:\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def _deserialize_rows(\n self, rows: Sequence[SqlAlchemyRow], as_type: Type[T_NamedTuple]\n ) -> Sequence[T_NamedTuple]:\n return list(map(lambda r: deserialize_value(r[0], as_type), rows))\n\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n check.opt_inst_param(instigator_type, "instigator_type", InstigatorType)\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = db_select([InstigatorsTable.c.instigator_body]).select_from(InstigatorsTable)\n if repository_selector_id:\n query = query.where(\n InstigatorsTable.c.repository_selector_id == repository_selector_id\n )\n if instigator_type:\n query = query.where(InstigatorsTable.c.instigator_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n InstigatorsTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n else:\n query = db_select([JobTable.c.job_body]).select_from(JobTable)\n if repository_origin_id:\n query = query.where(JobTable.c.repository_origin_id == repository_origin_id)\n if instigator_type:\n query = query.where(JobTable.c.job_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n JobTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n rows = self.execute(query)\n return 
self._deserialize_rows(rows, InstigatorState)\n\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = (\n db_select([InstigatorsTable.c.instigator_body])\n .select_from(InstigatorsTable)\n .where(InstigatorsTable.c.selector_id == selector_id)\n )\n else:\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.job_origin_id == origin_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1], InstigatorState)[0] if len(rows) else None\n\n def _has_instigator_state_by_selector(self, selector_id: str) -> bool:\n check.str_param(selector_id, "selector_id")\n\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None # type: ignore\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n try:\n conn.execute(\n InstigatorsTable.insert().values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n InstigatorsTable.update()\n .where(InstigatorsTable.c.selector_id == selector_id)\n .values(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n with self.connect() as conn:\n try:\n conn.execute(\n 
JobTable.insert().values(\n job_origin_id=state.instigator_origin_id,\n repository_origin_id=state.repository_origin_id,\n status=state.status.value,\n job_type=state.instigator_type.value,\n job_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is already present in storage"\n ) from exc\n\n # try writing to the instigators table\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n if not self.get_instigator_state(state.instigator_origin_id, state.selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is not present in storage"\n )\n\n values = {\n "status": state.status.value,\n "job_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n }\n if self.has_instigators_table():\n values["selector_id"] = state.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTable.update()\n .where(JobTable.c.job_origin_id == state.instigator_origin_id)\n .values(**values)\n )\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if not self.get_instigator_state(origin_id, selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {origin_id} is not present in storage"\n )\n\n with self.connect() as conn:\n conn.execute(JobTable.delete().where(JobTable.c.job_origin_id == origin_id))\n\n if self._has_instigators_table(conn):\n if not self._jobs_has_selector_state(conn, selector_id):\n conn.execute(\n InstigatorsTable.delete().where(\n 
InstigatorsTable.c.selector_id == selector_id\n )\n )\n\n def _jobs_has_selector_state(self, conn: Connection, selector_id: str) -> bool:\n query = (\n db_select([db.func.count()])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n result = conn.execute(query)\n row = result.fetchone()\n result.close()\n return row[0] > 0 # type: ignore # (possible none)\n\n def _add_filter_limit(\n self,\n query: SqlAlchemyQuery,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses=None,\n ) -> SqlAlchemyQuery:\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n if before:\n query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))\n if after:\n query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))\n if limit:\n query = query.limit(limit)\n if statuses:\n query = query.where(JobTickTable.c.status.in_([status.value for status in statuses]))\n return query\n\n @property\n def supports_batch_queries(self) -> bool:\n return self.has_instigators_table() and self.has_built_index(SCHEDULE_TICKS_SELECTOR_ID)\n\n def has_instigators_table(self) -> bool:\n with self.connect() as conn:\n return self._has_instigators_table(conn)\n\n def _has_instigators_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "instigators" in table_names\n\n def _has_asset_daemon_asset_evaluations_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "asset_daemon_asset_evaluations" in table_names\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n check.sequence_param(selector_ids, "selector_ids", of_type=str)\n 
check.opt_int_param(limit, "limit")\n check.opt_sequence_param(statuses, "statuses", of_type=TickStatus)\n\n bucket_rank_column = (\n db.func.rank()\n .over(\n order_by=db.desc(JobTickTable.c.timestamp),\n partition_by=JobTickTable.c.selector_id,\n )\n .label("rank")\n )\n subquery = db_subquery(\n db_select(\n [\n JobTickTable.c.id,\n JobTickTable.c.selector_id,\n JobTickTable.c.tick_body,\n bucket_rank_column,\n ]\n )\n .select_from(JobTickTable)\n .where(JobTickTable.c.selector_id.in_(selector_ids))\n )\n if statuses:\n subquery = subquery.where(\n JobTickTable.c.status.in_([status.value for status in statuses])\n )\n\n query = (\n db_select([subquery.c.id, subquery.c.selector_id, subquery.c.tick_body])\n .order_by(subquery.c.rank.asc())\n .where(subquery.c.rank <= limit)\n )\n\n rows = self.execute(query)\n results = defaultdict(list)\n for row in rows:\n tick_id = row[0]\n selector_id = row[1]\n tick_data = deserialize_value(row[2], TickData)\n results[selector_id].append(InstigatorTick(tick_id, tick_data))\n return results\n\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n check.str_param(origin_id, "origin_id")\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n base_query = (\n db_select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .order_by(JobTickTable.c.timestamp.desc())\n )\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n query = 
self._add_filter_limit(\n query, before=before, after=after, limit=limit, statuses=statuses\n )\n\n rows = self.execute(query)\n return list(map(lambda r: InstigatorTick(r[0], deserialize_value(r[1], TickData)), rows))\n\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n check.inst_param(tick_data, "tick_data", TickData)\n\n values = {\n "job_origin_id": tick_data.instigator_origin_id,\n "status": tick_data.status.value,\n "type": tick_data.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),\n "tick_body": serialize_value(tick_data),\n }\n if self.has_instigators_table() and tick_data.selector_id:\n values["selector_id"] = tick_data.selector_id\n\n with self.connect() as conn:\n try:\n tick_insert = JobTickTable.insert().values(**values)\n result = conn.execute(tick_insert)\n tick_id = result.inserted_primary_key[0]\n return InstigatorTick(tick_id, tick_data)\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in"\n " storage"\n ) from exc\n\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n check.inst_param(tick, "tick", InstigatorTick)\n\n values = {\n "status": tick.status.value,\n "type": tick.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick.timestamp),\n "tick_body": serialize_value(tick.tick_data),\n }\n if self.has_instigators_table() and tick.selector_id:\n values["selector_id"] = tick.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.update().where(JobTickTable.c.id == tick.tick_id).values(**values)\n )\n\n return tick\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n check.str_param(origin_id, "origin_id")\n check.float_param(before, "before")\n check.opt_list_param(tick_statuses, "tick_statuses", of_type=TickStatus)\n\n utc_before = 
utc_datetime_from_timestamp(before)\n\n query = JobTickTable.delete().where(JobTickTable.c.timestamp < utc_before)\n if tick_statuses:\n query = query.where(\n JobTickTable.c.status.in_([tick_status.value for tick_status in tick_statuses])\n )\n\n if self.has_instigators_table():\n query = query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = query.where(JobTickTable.c.job_origin_id == origin_id)\n\n with self.connect() as conn:\n conn.execute(query)\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n with self.connect() as conn:\n return self._has_asset_daemon_asset_evaluations_table(conn)\n\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ):\n if not asset_evaluations:\n return\n\n with self.connect() as conn:\n bulk_insert = AssetDaemonAssetEvaluationsTable.insert().values(\n [\n {\n "evaluation_id": evaluation_id,\n "asset_key": evaluation.asset_key.to_string(),\n "asset_evaluation_body": serialize_value(evaluation),\n "num_requested": evaluation.num_requested,\n "num_skipped": evaluation.num_skipped,\n "num_discarded": evaluation.num_discarded,\n }\n for evaluation in asset_evaluations\n ]\n )\n conn.execute(bulk_insert)\n\n def get_auto_materialize_asset_evaluations(\n self, asset_key: AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n with self.connect() as conn:\n query = (\n db_select(\n [\n AssetDaemonAssetEvaluationsTable.c.id,\n AssetDaemonAssetEvaluationsTable.c.asset_evaluation_body,\n AssetDaemonAssetEvaluationsTable.c.evaluation_id,\n AssetDaemonAssetEvaluationsTable.c.create_timestamp,\n ]\n )\n .where(AssetDaemonAssetEvaluationsTable.c.asset_key == asset_key.to_string())\n 
.order_by(AssetDaemonAssetEvaluationsTable.c.evaluation_id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetDaemonAssetEvaluationsTable.c.evaluation_id < cursor)\n\n rows = db_fetch_mappings(conn, query)\n return [AutoMaterializeAssetEvaluationRecord.from_db_row(row) for row in rows]\n\n def purge_asset_evaluations(self, before: float):\n check.float_param(before, "before")\n\n utc_before = utc_datetime_from_timestamp(before)\n query = AssetDaemonAssetEvaluationsTable.delete().where(\n AssetDaemonAssetEvaluationsTable.c.create_timestamp < utc_before\n )\n\n with self.connect() as conn:\n conn.execute(query)\n\n def wipe(self) -> None:\n """Clears the schedule storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(JobTable.delete())\n conn.execute(JobTickTable.delete())\n if self._has_instigators_table(conn):\n conn.execute(InstigatorsTable.delete())\n if self._has_asset_daemon_asset_evaluations_table(conn):\n conn.execute(AssetDaemonAssetEvaluationsTable.delete())\n\n # MIGRATIONS\n\n def has_secondary_index_table(self) -> bool:\n with self.connect() as conn:\n return "secondary_indexes" in db.inspect(conn).get_table_names()\n\n def has_built_index(self, migration_name: str) -> bool:\n if not self.has_secondary_index_table():\n return False\n\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name 
== migration_name)\n .values(migration_completed=datetime.now())\n )\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[..., Any]],\n print_fn: Optional[Callable] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n REQUIRED_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n OPTIONAL_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sql_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sql_schedule_storage"}, "sqlite": {"sqlite_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sqlite.sqlite_schedule_storage

\nfrom contextlib import contextmanager\nfrom typing import Iterator, Optional\n\nimport sqlalchemy as db\nfrom packaging.version import parse\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import ScheduleStorageSqlMetadata\nfrom ..sql_schedule_storage import SqlScheduleStorage\n\nMINIMUM_SQLITE_BATCH_VERSION = "3.25.0"\n\n\n
[docs]class SqliteScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Local SQLite backed schedule storage."""\n\n def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value\n ) -> "SqliteScheduleStorage":\n return SqliteScheduleStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(\n cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None\n ) -> "SqliteScheduleStorage":\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "schedules")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_migrate_data = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n ScheduleStorageSqlMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_migrate_data = True\n\n schedule_storage = cls(conn_string, inst_data)\n if should_migrate_data:\n schedule_storage.migrate()\n schedule_storage.optimize()\n\n return schedule_storage\n\n @contextmanager\n def connect(self) -> Iterator[Connection]:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n @property\n def supports_batch_queries(self) -> bool:\n if not 
super().supports_batch_queries:\n return False\n\n return super().supports_batch_queries and parse(get_sqlite_version()) >= parse(\n MINIMUM_SQLITE_BATCH_VERSION\n )\n\n def upgrade(self) -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sqlite/sqlite_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sqlite.sqlite_schedule_storage"}}}, "upath_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.upath_io_manager

\nimport asyncio\nimport inspect\nfrom abc import abstractmethod\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union\n\nfrom fsspec import AbstractFileSystem\nfrom fsspec.implementations.local import LocalFileSystem\n\nfrom dagster import (\n    InputContext,\n    MetadataValue,\n    MultiPartitionKey,\n    OutputContext,\n    _check as check,\n)\nfrom dagster._core.storage.memoizable_io_manager import MemoizableIOManager\n\nif TYPE_CHECKING:\n    from upath import UPath\n\n\n
[docs]class UPathIOManager(MemoizableIOManager):\n """Abstract IOManager base class compatible with local and cloud storage via `universal-pathlib` and `fsspec`.\n\n Features:\n - handles partitioned assets\n - handles loading a single upstream partition\n - handles loading multiple upstream partitions (with respect to :py:class:`PartitionMapping`)\n - supports loading multiple partitions concurrently with async `load_from_path` method\n - the `get_metadata` method can be customized to add additional metadata to the output\n - the `allow_missing_partitions` metadata value can be set to `True` to skip missing partitions\n (the default behavior is to raise an error)\n\n """\n\n extension: Optional[str] = None # override in child class\n\n def __init__(\n self,\n base_path: Optional["UPath"] = None,\n ):\n from upath import UPath\n\n assert not self.extension or "." in self.extension\n self._base_path = base_path or UPath(".")\n\n @abstractmethod\n def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):\n """Child classes should override this method to write the object to the filesystem."""\n\n @abstractmethod\n def load_from_path(self, context: InputContext, path: "UPath") -> Any:\n """Child classes should override this method to load the object from the filesystem."""\n\n @property\n def fs(self) -> AbstractFileSystem:\n """Utility function to get the IOManager filesystem.\n\n Returns:\n AbstractFileSystem: fsspec filesystem.\n\n """\n from upath import UPath\n\n if isinstance(self._base_path, UPath):\n return self._base_path.fs\n elif isinstance(self._base_path, Path):\n return LocalFileSystem()\n else:\n raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")\n\n @property\n def storage_options(self) -> Dict[str, Any]:\n """Utility function to get the fsspec storage_options which are often consumed by various I/O functions.\n\n Returns:\n Dict[str, Any]: fsspec storage_options.\n """\n from upath import UPath\n\n if 
isinstance(self._base_path, UPath):\n return self._base_path._kwargs.copy() # noqa\n elif isinstance(self._base_path, Path):\n return {}\n else:\n raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")\n\n def get_metadata(\n self,\n context: OutputContext,\n obj: Any,\n ) -> Dict[str, MetadataValue]:\n """Child classes should override this method to add custom metadata to the outputs."""\n return {}\n\n # Read/write operations on paths can generally be handled by methods on the\n # UPath class, but when the backend requires credentials, this isn't\n # always possible. Override these path_* methods to provide custom\n # implementations for targeting backends that require authentication.\n\n def unlink(self, path: "UPath") -> None:\n """Remove the file or object at the provided path."""\n path.unlink()\n\n def path_exists(self, path: "UPath") -> bool:\n """Check if a file or object exists at the provided path."""\n return path.exists()\n\n def make_directory(self, path: "UPath"):\n """Create a directory at the provided path.\n\n Override as a no-op if the target backend doesn't use directories.\n """\n path.mkdir(parents=True, exist_ok=True)\n\n def has_output(self, context: OutputContext) -> bool:\n return self.path_exists(self._get_path(context))\n\n def _with_extension(self, path: "UPath") -> "UPath":\n return path.with_suffix(path.suffix + self.extension) if self.extension else path\n\n def _get_path_without_extension(self, context: Union[InputContext, OutputContext]) -> "UPath":\n if context.has_asset_key:\n context_path = self.get_asset_relative_path(context)\n else:\n # we are dealing with an op output\n context_path = self.get_op_output_relative_path(context)\n\n return self._base_path.joinpath(context_path)\n\n def get_asset_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n from upath import UPath\n\n # we are not using context.get_asset_identifier() because it already includes the partition_key\n return 
UPath(*context.asset_key.path)\n\n def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n from upath import UPath\n\n return UPath(*context.get_identifier())\n\n def get_loading_input_log_message(self, path: "UPath") -> str:\n return f"Loading file from: {path} using {self.__class__.__name__}..."\n\n def get_writing_output_log_message(self, path: "UPath") -> str:\n return f"Writing file at: {path} using {self.__class__.__name__}..."\n\n def get_loading_input_partition_log_message(self, path: "UPath", partition_key: str) -> str:\n return f"Loading partition {partition_key} from {path} using {self.__class__.__name__}..."\n\n def get_missing_partition_log_message(self, partition_key: str) -> str:\n return (\n f"Couldn't load partition {partition_key} and skipped it "\n "because the input metadata includes allow_missing_partitions=True"\n )\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n """Returns the I/O path for a given context.\n Should not be used with partitions (use `_get_paths_for_partitions` instead).\n """\n path = self._get_path_without_extension(context)\n return self._with_extension(path)\n\n def get_path_for_partition(\n self, context: Union[InputContext, OutputContext], path: "UPath", partition: str\n ) -> "UPath":\n """Override this method if you want to use a different partitioning scheme\n (for example, if the saving function handles partitioning instead).\n The extension will be added later.\n\n Args:\n context (Union[InputContext, OutputContext]): The context for the I/O operation.\n path (UPath): The path to the file or object.\n partition (str): Formatted partition/multipartition key\n\n Returns:\n UPath: The path to the file with the partition key appended.\n """\n return path / partition\n\n def _get_paths_for_partitions(\n self, context: Union[InputContext, OutputContext]\n ) -> Dict[str, "UPath"]:\n """Returns a dict of partition_keys into I/O paths for a given 
context."""\n if not context.has_asset_partitions:\n raise TypeError(\n f"Detected {context.dagster_type.typing_type} input type "\n "but the asset is not partitioned"\n )\n\n def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str:\n ordered_dimension_keys = [\n key[1]\n for key in sorted(partition_key.keys_by_dimension.items(), key=lambda x: x[0])\n ]\n return "/".join(ordered_dimension_keys)\n\n formatted_partition_keys = {\n partition_key: (\n _formatted_multipartitioned_path(partition_key)\n if isinstance(partition_key, MultiPartitionKey)\n else partition_key\n )\n for partition_key in context.asset_partition_keys\n }\n\n asset_path = self._get_path_without_extension(context)\n return {\n partition_key: self._with_extension(\n self.get_path_for_partition(context, asset_path, partition)\n )\n for partition_key, partition in formatted_partition_keys.items()\n }\n\n def _get_multipartition_backcompat_paths(\n self, context: Union[InputContext, OutputContext]\n ) -> Mapping[str, "UPath"]:\n if not context.has_asset_partitions:\n raise TypeError(\n f"Detected {context.dagster_type.typing_type} input type "\n "but the asset is not partitioned"\n )\n\n partition_keys = context.asset_partition_keys\n\n asset_path = self._get_path_without_extension(context)\n return {\n partition_key: self._with_extension(asset_path / partition_key)\n for partition_key in partition_keys\n if isinstance(partition_key, MultiPartitionKey)\n }\n\n def _load_single_input(\n self, path: "UPath", context: InputContext, backcompat_path: Optional["UPath"] = None\n ) -> Any:\n context.log.debug(self.get_loading_input_log_message(path))\n try:\n obj = self.load_from_path(context=context, path=path)\n if asyncio.iscoroutine(obj):\n obj = asyncio.run(obj)\n except FileNotFoundError as e:\n if backcompat_path is not None:\n try:\n obj = self.load_from_path(context=context, path=backcompat_path)\n if asyncio.iscoroutine(obj):\n obj = asyncio.run(obj)\n\n context.log.debug(\n 
f"File not found at {path}. Loaded instead from backcompat path:"\n f" {backcompat_path}"\n )\n except FileNotFoundError:\n raise e\n else:\n raise e\n\n context.add_input_metadata({"path": MetadataValue.path(str(path))})\n return obj\n\n def _load_partition_from_path(\n self,\n context: InputContext,\n partition_key: str,\n path: "UPath",\n backcompat_path: Optional["UPath"] = None,\n ) -> Any:\n """1. Try to load the partition from the normal path.\n 2. If it was not found, try to load it from the backcompat path.\n 3. If allow_missing_partitions metadata is True, skip the partition if it was not found in any of the paths.\n Otherwise, raise an error.\n\n Args:\n context (InputContext): IOManager Input context\n partition_key (str): the partition key corresponding to the partition being loaded\n path (UPath): The path to the partition.\n backcompat_path (Optional[UPath]): The path to the partition in the backcompat location.\n\n Returns:\n Any: The object loaded from the partition.\n """\n allow_missing_partitions = (\n context.metadata.get("allow_missing_partitions", False)\n if context.metadata is not None\n else False\n )\n\n try:\n context.log.debug(self.get_loading_input_partition_log_message(path, partition_key))\n obj = self.load_from_path(context=context, path=path)\n return obj\n except FileNotFoundError as e:\n if backcompat_path is not None:\n try:\n obj = self.load_from_path(context=context, path=path)\n context.log.debug(\n f"File not found at {path}. 
Loaded instead from backcompat path:"\n f" {backcompat_path}"\n )\n return obj\n except FileNotFoundError as e:\n if allow_missing_partitions:\n context.log.warning(self.get_missing_partition_log_message(partition_key))\n return None\n else:\n raise e\n if allow_missing_partitions:\n context.log.warning(self.get_missing_partition_log_message(partition_key))\n return None\n else:\n raise e\n\n def _load_multiple_inputs(self, context: InputContext) -> Dict[str, Any]:\n # load multiple partitions\n paths = self._get_paths_for_partitions(context) # paths for normal partitions\n backcompat_paths = self._get_multipartition_backcompat_paths(\n context\n ) # paths for multipartitions\n\n context.log.debug(f"Loading {len(paths)} partitions...")\n\n objs = {}\n\n if not inspect.iscoroutinefunction(self.load_from_path):\n for partition_key in context.asset_partition_keys:\n obj = self._load_partition_from_path(\n context,\n partition_key,\n paths[partition_key],\n backcompat_paths.get(partition_key),\n )\n if obj is not None: # in case some partitions were skipped\n objs[partition_key] = obj\n return objs\n else:\n # load_from_path returns a coroutine, so we need to await the results\n\n async def collect():\n loop = asyncio.get_running_loop()\n\n tasks = []\n\n for partition_key in context.asset_partition_keys:\n tasks.append(\n loop.create_task(\n self._load_partition_from_path(\n context,\n partition_key,\n paths[partition_key],\n backcompat_paths.get(partition_key),\n )\n )\n )\n\n results = await asyncio.gather(*tasks, return_exceptions=True)\n\n # need to handle missing partitions here because exceptions don't get propagated from async calls\n allow_missing_partitions = (\n context.metadata.get("allow_missing_partitions", False)\n if context.metadata is not None\n else False\n )\n\n results_without_errors = []\n found_errors = False\n for partition_key, result in zip(context.asset_partition_keys, results):\n if isinstance(result, FileNotFoundError):\n if 
allow_missing_partitions:\n context.log.warning(\n self.get_missing_partition_log_message(partition_key)\n )\n else:\n context.log.error(str(result))\n found_errors = True\n elif isinstance(result, Exception):\n context.log.error(str(result))\n found_errors = True\n else:\n results_without_errors.append(result)\n\n if found_errors:\n raise RuntimeError(\n f"{len(paths) - len(results_without_errors)} partitions could not be loaded"\n )\n\n return results_without_errors\n\n awaited_objects = asyncio.get_event_loop().run_until_complete(collect())\n\n return {\n partition_key: awaited_object\n for partition_key, awaited_object in zip(\n context.asset_partition_keys, awaited_objects\n )\n if awaited_object is not None\n }\n\n def load_input(self, context: InputContext) -> Union[Any, Dict[str, Any]]:\n # If no asset key, we are dealing with an op output which is always non-partitioned\n if not context.has_asset_key or not context.has_asset_partitions:\n path = self._get_path(context)\n return self._load_single_input(path, context)\n else:\n asset_partition_keys = context.asset_partition_keys\n if len(asset_partition_keys) == 0:\n return None\n elif len(asset_partition_keys) == 1:\n paths = self._get_paths_for_partitions(context)\n check.invariant(len(paths) == 1, f"Expected 1 path, but got {len(paths)}")\n path = next(iter(paths.values()))\n backcompat_paths = self._get_multipartition_backcompat_paths(context)\n backcompat_path = (\n None if not backcompat_paths else next(iter(backcompat_paths.values()))\n )\n\n return self._load_single_input(path, context, backcompat_path)\n else: # we are dealing with multiple partitions of an asset\n type_annotation = context.dagster_type.typing_type\n if type_annotation != Any and not is_dict_type(type_annotation):\n check.failed(\n "Loading an input that corresponds to multiple partitions, but the"\n " type annotation on the op input is not a dict, Dict, Mapping, or"\n f" Any: is '{type_annotation}'."\n )\n\n return 
self._load_multiple_inputs(context)\n\n def handle_output(self, context: OutputContext, obj: Any):\n if context.dagster_type.typing_type == type(None):\n check.invariant(\n obj is None,\n "Output had Nothing type or 'None' annotation, but handle_output received"\n f" value that was not None and was of type {type(obj)}.",\n )\n return None\n\n if context.has_asset_partitions:\n paths = self._get_paths_for_partitions(context)\n\n check.invariant(\n len(paths) == 1,\n f"The current IO manager {type(self)} does not support persisting an output"\n " associated with multiple partitions. This error is likely occurring because a"\n " backfill was launched using the 'single run' option. Instead, launch the"\n " backfill with the 'multiple runs' option.",\n )\n\n path = next(iter(paths.values()))\n else:\n path = self._get_path(context)\n self.make_directory(path.parent)\n context.log.debug(self.get_writing_output_log_message(path))\n self.dump_to_path(context=context, obj=obj, path=path)\n\n metadata = {"path": MetadataValue.path(str(path))}\n custom_metadata = self.get_metadata(context=context, obj=obj)\n metadata.update(custom_metadata) # type: ignore\n\n context.add_output_metadata(metadata)
\n\n\ndef is_dict_type(type_obj) -> bool:\n if type_obj == dict:\n return True\n\n if hasattr(type_obj, "__origin__") and type_obj.__origin__ in (dict, Dict, Mapping):\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/storage/upath_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.upath_io_manager"}}, "types": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.config_schema

\nimport hashlib\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Iterator, Optional, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param\nfrom dagster._config import ConfigType\nfrom dagster._core.decorator_utils import get_function_params, validate_expected_params\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..definitions.resource_requirement import (\n    ResourceRequirement,\n    TypeLoaderResourceRequirement,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.system import (\n        DagsterTypeLoaderContext,\n    )\n\n\n
[docs]class DagsterTypeLoader(ABC):\n """Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\n to.\n\n The recommended way to define a type loader is with the\n :py:func:`@dagster_type_loader <dagster_type_loader>` decorator.\n """\n\n @property\n @abstractmethod\n def schema_type(self) -> ConfigType:\n pass\n\n @property\n def loader_version(self) -> Optional[str]:\n return None\n\n def compute_loaded_input_version(self, _config_value: object) -> Optional[str]:\n return None\n\n def construct_from_config_value(\n self, _context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n """How to create a runtime value from config data."""\n return config_value\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset()\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n type_display_name = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys())):\n yield TypeLoaderResourceRequirement(\n key=resource_key, type_display_name=type_display_name\n )
\n\n\n@experimental_param(param="loader_version")\n@experimental_param(param="external_version_fn")\nclass DagsterTypeLoaderFromDecorator(DagsterTypeLoader):\n def __init__(\n self,\n config_type,\n func,\n required_resource_keys,\n loader_version=None,\n external_version_fn=None,\n ):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._loader_version = check.opt_str_param(loader_version, "loader_version")\n self._external_version_fn = check.opt_callable_param(\n external_version_fn, "external_version_fn"\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n @property\n def loader_version(self) -> Optional[str]:\n return self._loader_version\n\n def compute_loaded_input_version(self, config_value: object) -> Optional[str]:\n """Compute the type-loaded input from a given config_value.\n\n Args:\n config_value (object): Config value to be ingested by the external version\n loading function.\n\n Returns:\n Optional[str]: Hash of concatenated loader version and external input version if both\n are provided, else None.\n """\n version = ""\n if self.loader_version:\n version += str(self.loader_version)\n if self._external_version_fn:\n ext_version = self._external_version_fn(config_value)\n version += str(ext_version)\n\n if version == "":\n return None # Sentinel value for no version provided.\n else:\n return hashlib.sha1(version.encode("utf-8")).hexdigest()\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ):\n return self._func(context, config_value)\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_type_loader_for_decorator(\n config_type: ConfigType,\n func,\n required_resource_keys: Optional[AbstractSet[str]],\n 
loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n):\n return DagsterTypeLoaderFromDecorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n\nDagsterTypeLoaderFn: TypeAlias = Callable[["DagsterTypeLoaderContext", Any], Any]\n\n\n
[docs]def dagster_type_loader(\n config_schema: object,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n) -> Callable[[DagsterTypeLoaderFn], DagsterTypeLoaderFromDecorator]:\n """Create an dagster type loader that maps config data to a runtime value.\n\n The decorated function should take the execution context and parsed config value and return the\n appropriate runtime value.\n\n Args:\n config_schema (ConfigSchema): The schema for the config that's passed to the decorated\n function.\n loader_version (str): (Experimental) The version of the decorated compute function. Two\n loading functions should have the same version if and only if they deterministically\n produce the same outputs when provided the same inputs.\n external_version_fn (Callable): (Experimental) A function that takes in the same parameters as the loader\n function (config_value) and returns a representation of the version of the external\n asset (str). Two external assets with identical versions are treated as identical to one\n another.\n\n Examples:\n .. code-block:: python\n\n @dagster_type_loader(Permissive())\n def load_dict(_context, value):\n return value\n """\n from dagster._config import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n assert isinstance(\n config_type, ConfigType\n ), f"{config_schema} could not be resolved to config type"\n EXPECTED_POSITIONALS = ["context", "*"]\n\n def wrapper(func: DagsterTypeLoaderFn) -> DagsterTypeLoaderFromDecorator:\n params = get_function_params(func)\n missing_positional = validate_expected_params(params, EXPECTED_POSITIONALS)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@dagster_type_loader '{func.__name__}' decorated function does not have required"\n f" positional parameter '{missing_positional}'. 
@dagster_type_loader decorated"\n " functions should only have keyword arguments that match input names and a first"\n " positional parameter named 'context'."\n )\n\n return _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/types/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.config_schema"}, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.dagster_type

\nimport typing as t\nfrom abc import abstractmethod\nfrom enum import Enum as PythonEnum\nfrom functools import partial\nfrom typing import (\n    AbstractSet as TypingAbstractSet,\n    AnyStr,\n    Iterator as TypingIterator,\n    Mapping,\n    Optional as TypingOptional,\n    Sequence,\n    Type as TypingType,\n    cast,\n)\n\nfrom typing_extensions import get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    Array,\n    ConfigType,\n    Noneable as ConfigNoneable,\n)\nfrom dagster._core.definitions.events import DynamicOutput, Output, TypeCheck\nfrom dagster._core.definitions.metadata import (\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._seven import is_subclass\n\nfrom ..definitions.resource_requirement import (\n    RequiresResources,\n    ResourceRequirement,\n    TypeResourceRequirement,\n)\nfrom .builtin_config_schemas import BuiltinSchemas\nfrom .config_schema import DagsterTypeLoader\n\nif t.TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n    from dagster._core.execution.context.system import DagsterTypeLoaderContext, TypeCheckContext\n\nTypeCheckFn = t.Callable[["TypeCheckContext", AnyStr], t.Union[TypeCheck, bool]]\n\n\n@whitelist_for_serdes\nclass DagsterTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    LIST = "LIST"\n    NOTHING = "NOTHING"\n    NULLABLE = "NULLABLE"\n    REGULAR = "REGULAR"\n\n\n
[docs]class DagsterType(RequiresResources):\n """Define a type in dagster. These can be used in the inputs and outputs of ops.\n\n Args:\n type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]):\n The function that defines the type check. It takes the value flowing\n through the input or output of the op. If it passes, return either\n ``True`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``True``. If it fails,\n return either ``False`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``False``.\n The first argument must be named ``context`` (or, if unused, ``_``, ``_context``, or ``context_``).\n Use ``required_resource_keys`` for access to resources.\n key (Optional[str]): The unique key to identify types programmatically.\n The key property always has a value. If you omit key to the argument\n to the init function, it instead receives the value of ``name``. If\n neither ``key`` nor ``name`` is provided, a ``CheckError`` is thrown.\n\n In the case of a generic type such as ``List`` or ``Optional``, this is\n generated programmatically based on the type parameters.\n\n For most use cases, name should be set and the key argument should\n not be specified.\n name (Optional[str]): A unique name given by a user. If ``key`` is ``None``, ``key``\n becomes this value. Name is not given in a case where the user does\n not specify a unique name for this type, such as a generic class.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n required_resource_keys (Optional[Set[str]]): Resource keys required by the ``type_check_fn``.\n is_builtin (bool): Defaults to False. This is used by tools to display or\n filter built-in types (such as :py:class:`~dagster.String`, :py:class:`~dagster.Int`) to visually distinguish\n them from user-defined types. Meant for internal use.\n kind (DagsterTypeKind): Defaults to None. This is used to determine the kind of runtime type\n for InputDefinition and OutputDefinition type checking.\n typing_type: Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\n value contained within the DagsterType. Meant for internal use.\n """\n\n def __init__(\n self,\n type_check_fn: TypeCheckFn,\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n loader: t.Optional[DagsterTypeLoader] = None,\n required_resource_keys: t.Optional[t.Set[str]] = None,\n kind: DagsterTypeKind = DagsterTypeKind.REGULAR,\n typing_type: t.Any = t.Any,\n metadata: t.Optional[t.Mapping[str, RawMetadataValue]] = None,\n ):\n check.opt_str_param(key, "key")\n check.opt_str_param(name, "name")\n\n check.invariant(not (name is None and key is None), "Must set key or name")\n if name is None:\n key = check.not_none(\n key,\n "If name is not provided, must provide key.",\n )\n self.key, self._name = key, None\n elif key is None:\n name = check.not_none(\n name,\n "If key is not provided, must provide name.",\n )\n self.key, self._name = name, name\n else:\n check.invariant(key and name)\n self.key, self._name = key, name\n\n self._description = check.opt_str_param(description, "description")\n self._loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)\n\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys,\n "required_resource_keys",\n 
)\n\n self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")\n _validate_type_check_fn(self._type_check_fn, self._name)\n\n self.is_builtin = check.bool_param(is_builtin, "is_builtin")\n check.invariant(\n self.display_name is not None,\n f"All types must have a valid display name, got None for key {key}",\n )\n\n self.kind = check.inst_param(kind, "kind", DagsterTypeKind)\n\n self._typing_type = typing_type\n\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n
[docs] @public\n def type_check(self, context: "TypeCheckContext", value: object) -> TypeCheck:\n """Type check the value against the type.\n\n Args:\n context (TypeCheckContext): The context of the type check.\n value (Any): The value to check.\n\n Returns:\n TypeCheck: The result of the type check.\n """\n retval = self._type_check_fn(context, value)\n\n if not isinstance(retval, (bool, TypeCheck)):\n raise DagsterInvariantViolationError(\n f"You have returned {retval!r} of type {type(retval)} from the type "\n f'check function of type "{self.key}". Return value must be instance '\n "of TypeCheck or a bool."\n )\n\n return TypeCheck(success=retval) if isinstance(retval, bool) else retval
\n\n def __eq__(self, other):\n return isinstance(other, DagsterType) and self.key == other.key\n\n def __ne__(self, other):\n return not self.__eq__(other)\n\n def __hash__(self):\n return hash(self.key)\n\n @staticmethod\n def from_builtin_enum(builtin_enum) -> "DagsterType":\n check.invariant(BuiltinEnum.contains(builtin_enum), "must be member of BuiltinEnum")\n return _RUNTIME_MAP[builtin_enum]\n\n @property\n def metadata(self) -> t.Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def required_resource_keys(self) -> TypingAbstractSet[str]:\n """AbstractSet[str]: Set of resource keys required by the type check function."""\n return self._required_resource_keys\n\n @public\n @property\n def display_name(self) -> str:\n """Either the name or key (if name is `None`) of the type, overridden in many subclasses."""\n return cast(str, self._name or self.key)\n\n @public\n @property\n def unique_name(self) -> t.Optional[str]:\n """The unique name of this type. Can be None if the type is not unique, such as container types."""\n # TODO: docstring and body inconsistent-- can this be None or not?\n check.invariant(\n self._name is not None,\n f"unique_name requested but is None for type {self.display_name}",\n )\n return self._name\n\n @public\n @property\n def has_unique_name(self) -> bool:\n """bool: Whether the type has a unique name."""\n return self._name is not None\n\n @public\n @property\n def typing_type(self) -> t.Any:\n """Any: The python typing type for this type."""\n return self._typing_type\n\n @public\n @property\n def loader(self) -> t.Optional[DagsterTypeLoader]:\n """Optional[DagsterTypeLoader]: Loader for this type, if any."""\n return self._loader\n\n @public\n @property\n def description(self) -> t.Optional[str]:\n """Optional[str]: Description of the type, or None if not provided."""\n return self._description\n\n @property\n def inner_types(self) -> t.Sequence["DagsterType"]:\n return []\n\n @property\n def 
loader_schema_key(self) -> t.Optional[str]:\n return self.loader.schema_type.key if self.loader else None\n\n @property\n def type_param_keys(self) -> t.Sequence[str]:\n return []\n\n @property\n def is_nothing(self) -> bool:\n return self.kind == DagsterTypeKind.NOTHING\n\n @property\n def supports_fan_in(self) -> bool:\n return False\n\n def get_inner_type_for_fan_in(self) -> "DagsterType":\n check.failed(\n "DagsterType {name} does not support fan-in, should have checked supports_fan_in before"\n " calling getter.".format(name=self.display_name)\n )\n\n def get_resource_requirements(\n self, _outer_context: TypingOptional[object] = None\n ) -> TypingIterator[ResourceRequirement]:\n for resource_key in sorted(list(self.required_resource_keys)):\n yield TypeResourceRequirement(key=resource_key, type_display_name=self.display_name)\n if self.loader:\n yield from self.loader.get_resource_requirements(outer_context=self.display_name)
\n\n\ndef _validate_type_check_fn(fn: t.Callable, name: t.Optional[str]) -> bool:\n from dagster._seven import get_arg_names\n\n args = get_arg_names(fn)\n\n # py2 doesn't filter out self\n if len(args) >= 1 and args[0] == "self":\n args = args[1:]\n\n if len(args) == 2:\n possible_names = {\n "_",\n "context",\n "_context",\n "context_",\n }\n if args[0] not in possible_names:\n DagsterInvalidDefinitionError(\n f'type_check function on type "{name}" must have first '\n 'argument named "context" (or _, _context, context_).'\n )\n return True\n\n raise DagsterInvalidDefinitionError(\n f'type_check_fn argument on type "{name}" must take 2 arguments, received {len(args)}.'\n )\n\n\nclass BuiltinScalarDagsterType(DagsterType):\n def __init__(self, name: str, type_check_fn: TypeCheckFn, typing_type: t.Type, **kwargs):\n super(BuiltinScalarDagsterType, self).__init__(\n key=name,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=type_check_fn,\n is_builtin=True,\n typing_type=typing_type,\n **kwargs,\n )\n\n # This is passed to the constructor of subclasses as the argument `type_check_fn`-- that's why\n # it exists together with the `type_check_fn` arg.\n def type_check_fn(self, _context, value) -> TypeCheck:\n return self.type_check_scalar_value(value)\n\n @abstractmethod\n def type_check_scalar_value(self, _value) -> TypeCheck:\n raise NotImplementedError()\n\n\ndef _typemismatch_error_str(value: object, expected_type_desc: str) -> str:\n return 'Value "{value}" of python type "{python_type}" must be a {type_desc}.'.format(\n value=value, python_type=type(value).__name__, type_desc=expected_type_desc\n )\n\n\ndef _fail_if_not_of_type(\n value: object, value_type: t.Type[t.Any], value_type_desc: str\n) -> TypeCheck:\n if not isinstance(value, value_type):\n return TypeCheck(success=False, description=_typemismatch_error_str(value, value_type_desc))\n\n return TypeCheck(success=True)\n\n\nclass _Int(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Int, 
self).__init__(\n name="Int",\n loader=BuiltinSchemas.INT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=int,\n )\n\n def type_check_scalar_value(self, value) -> TypeCheck:\n return _fail_if_not_of_type(value, int, "int")\n\n\nclass _String(BuiltinScalarDagsterType):\n def __init__(self):\n super(_String, self).__init__(\n name="String",\n loader=BuiltinSchemas.STRING_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=str,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\nclass _Float(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Float, self).__init__(\n name="Float",\n loader=BuiltinSchemas.FLOAT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=float,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, float, "float")\n\n\nclass _Bool(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Bool, self).__init__(\n name="Bool",\n loader=BuiltinSchemas.BOOL_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=bool,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, bool, "bool")\n\n\nclass Anyish(DagsterType):\n def __init__(\n self,\n key: t.Optional[str],\n name: t.Optional[str],\n loader: t.Optional[DagsterTypeLoader] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n ):\n super(Anyish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.ANY,\n loader=loader,\n is_builtin=is_builtin,\n type_check_fn=self.type_check_method,\n description=description,\n typing_type=t.Any,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", _value: object) -> TypeCheck:\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n # Anyish all the way down\n return self\n\n\nclass _Any(Anyish):\n def 
__init__(self):\n super(_Any, self).__init__(\n key="Any",\n name="Any",\n loader=BuiltinSchemas.ANY_INPUT,\n is_builtin=True,\n )\n\n\ndef create_any_type(\n name: str,\n loader: t.Optional[DagsterTypeLoader] = None,\n description: t.Optional[str] = None,\n) -> Anyish:\n return Anyish(\n key=name,\n name=name,\n description=description,\n loader=loader,\n )\n\n\nclass _Nothing(DagsterType):\n def __init__(self):\n super(_Nothing, self).__init__(\n key="Nothing",\n name="Nothing",\n kind=DagsterTypeKind.NOTHING,\n loader=None,\n type_check_fn=self.type_check_method,\n is_builtin=True,\n typing_type=type(None),\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n if value is not None:\n return TypeCheck(\n success=False,\n description=f"Value must be None, got a {type(value)}",\n )\n\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n return self\n\n\ndef isinstance_type_check_fn(\n expected_python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n dagster_type_name: str,\n expected_python_type_str: str,\n) -> TypeCheckFn:\n def type_check(_context: "TypeCheckContext", value: object) -> TypeCheck:\n if not isinstance(value, expected_python_type):\n return TypeCheck(\n success=False,\n description=(\n f"Value of type {type(value)} failed type check for Dagster type"\n f" {dagster_type_name}, expected value to be of Python type"\n f" {expected_python_type_str}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check\n\n\n
[docs]class PythonObjectDagsterType(DagsterType):\n """Define a type in dagster whose typecheck is an isinstance check.\n\n Specifically, the type can either be a single python type (e.g. int),\n or a tuple of types (e.g. (int, float)) which is treated as a union.\n\n Examples:\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=int)\n assert ntype.name == 'int'\n assert_success(ntype, 1)\n assert_failure(ntype, 'a')\n\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=(int, float))\n assert ntype.name == 'Union[int, float]'\n assert_success(ntype, 1)\n assert_success(ntype, 1.5)\n assert_failure(ntype, 'a')\n\n\n Args:\n python_type (Union[Type, Tuple[Type, ...]): The dagster typecheck function calls instanceof on\n this type.\n name (Optional[str]): Name the type. Defaults to the name of ``python_type``.\n key (Optional[str]): Key of the type. Defaults to name.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n """\n\n def __init__(\n self,\n python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n **kwargs,\n ):\n if isinstance(python_type, tuple):\n self.python_type = check.tuple_param(\n python_type, "python_type", of_shape=tuple(type for item in python_type)\n )\n self.type_str = "Union[{}]".format(\n ", ".join(python_type.__name__ for python_type in python_type)\n )\n typing_type = t.Union[python_type] # type: ignore\n\n else:\n self.python_type = check.class_param(python_type, "python_type")\n self.type_str = cast(str, python_type.__name__)\n typing_type = self.python_type\n name = check.opt_str_param(name, "name", self.type_str)\n key = check.opt_str_param(key, "key", name)\n super(PythonObjectDagsterType, self).__init__(\n key=key,\n name=name,\n type_check_fn=isinstance_type_check_fn(python_type, name, self.type_str),\n typing_type=typing_type,\n **kwargs,\n )
\n\n\nclass NoneableInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type: DagsterType):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n self._inner_loader = check.not_none_param(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = ConfigNoneable(self._inner_loader.schema_type)\n\n @property\n def schema_type(self) -> ConfigType:\n return self._schema_type\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n if config_value is None:\n return None\n return self._inner_loader.construct_from_config_value(context, config_value)\n\n\ndef _create_nullable_input_schema(inner_type: DagsterType) -> t.Optional[DagsterTypeLoader]:\n if not inner_type.loader:\n return None\n\n return NoneableInputSchema(inner_type)\n\n\nclass OptionalType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n inner_type = resolve_dagster_type(inner_type)\n\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError(\n "Type Nothing can not be wrapped in List or Optional"\n )\n\n key = "Optional." 
+ cast(str, inner_type.key)\n self.inner_type = inner_type\n super(OptionalType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.NULLABLE,\n type_check_fn=self.type_check_method,\n loader=_create_nullable_input_schema(inner_type),\n # This throws a type error with Py\n typing_type=t.Optional[inner_type.typing_type],\n )\n\n @property\n def display_name(self) -> str:\n return self.inner_type.display_name + "?"\n\n def type_check_method(self, context, value):\n return (\n TypeCheck(success=True) if value is None else self.inner_type.type_check(context, value)\n )\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return self.inner_type.supports_fan_in\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type.get_inner_type_for_fan_in()\n\n\nclass ListInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = Array(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n convert_item = partial(self._inner_dagster_type.loader.construct_from_config_value, context)\n return list(map(convert_item, config_value))\n\n\ndef _create_list_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return ListInputSchema(inner_type)\n\n\nclass ListType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n key = "List." 
+ inner_type.key\n self.inner_type = inner_type\n super(ListType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.LIST,\n type_check_fn=self.type_check_method,\n loader=_create_list_input_schema(inner_type),\n typing_type=t.List[inner_type.typing_type],\n )\n\n @property\n def display_name(self):\n return "[" + self.inner_type.display_name + "]"\n\n def type_check_method(self, context, value):\n value_check = _fail_if_not_of_type(value, list, "list")\n if not value_check.success:\n return value_check\n\n for item in value:\n item_check = self.inner_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type\n\n\nclass DagsterListApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(resolve_dagster_type(inner_type))\n\n def __call__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(inner_type)\n\n\nList: DagsterListApi = DagsterListApi()\n\n\ndef _List(inner_type):\n check.inst_param(inner_type, "inner_type", DagsterType)\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError("Type Nothing can not be wrapped in List or Optional")\n return ListType(inner_type)\n\n\nclass Stringish(DagsterType):\n def __init__(self, key: t.Optional[str] = None, name: t.Optional[str] = None, **kwargs):\n name = check.opt_str_param(name, "name", type(self).__name__)\n key = check.opt_str_param(key, "key", name)\n super(Stringish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=self.type_check_method,\n loader=BuiltinSchemas.STRING_INPUT,\n typing_type=str,\n **kwargs,\n )\n\n def 
type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\ndef create_string_type(name, description=None):\n return Stringish(name=name, key=name, description=description)\n\n\nAny = _Any()\nBool = _Bool()\nFloat = _Float()\nInt = _Int()\nString = _String()\nNothing = _Nothing()\n\n_RUNTIME_MAP = {\n BuiltinEnum.ANY: Any,\n BuiltinEnum.BOOL: Bool,\n BuiltinEnum.FLOAT: Float,\n BuiltinEnum.INT: Int,\n BuiltinEnum.STRING: String,\n BuiltinEnum.NOTHING: Nothing,\n}\n\n_PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY: t.Dict[type, DagsterType] = {}\n"""Python types corresponding to user-defined RunTime types created using @map_to_dagster_type or\nas_dagster_type are registered here so that we can remap the Python types to runtime types."""\n\n\n
[docs]def make_python_type_usable_as_dagster_type(\n python_type: TypingType[t.Any], dagster_type: DagsterType\n) -> None:\n """Take any existing python type and map it to a dagster type (generally created with\n :py:class:`DagsterType <dagster.DagsterType>`) This can only be called once\n on a given python type.\n """\n check.inst_param(python_type, "python_type", type)\n check.inst_param(dagster_type, "dagster_type", DagsterType)\n registered_dagster_type = _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY.get(python_type)\n\n if registered_dagster_type is None:\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n elif registered_dagster_type is not dagster_type:\n # This would be just a great place to insert a short URL pointing to the type system\n # documentation into the error message\n # https://github.com/dagster-io/dagster/issues/1831\n if isinstance(registered_dagster_type, TypeHintInferredDagsterType):\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f'{python_type}. The Dagster type was "auto-registered" - i.e. a solid definition '\n "used the Python type as an annotation for one of its arguments or for its return "\n "value before make_python_type_usable_as_dagster_type was called, and we "\n "generated a Dagster type to correspond to it. To override the auto-generated "\n "Dagster type, call make_python_type_usable_as_dagster_type before any solid "\n "definitions refer to the Python type."\n )\n else:\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f"{python_type}. make_python_type_usable_as_dagster_type can only "\n "be called once on a python type as it is registering a 1:1 mapping "\n "between that python type and a dagster type."\n )
\n\n\nDAGSTER_INVALID_TYPE_ERROR_MESSAGE = (\n "Invalid type: dagster_type must be an instance of DagsterType or a Python type: "\n "got {dagster_type}{additional_msg}"\n)\n\n\nclass TypeHintInferredDagsterType(DagsterType):\n def __init__(self, python_type: t.Type):\n qualified_name = f"{python_type.__module__}.{python_type.__name__}"\n self.python_type = python_type\n super(TypeHintInferredDagsterType, self).__init__(\n key=f"_TypeHintInferred[{qualified_name}]",\n description=(\n f"DagsterType created from a type hint for the Python type {qualified_name}"\n ),\n type_check_fn=isinstance_type_check_fn(\n python_type, python_type.__name__, qualified_name\n ),\n typing_type=python_type,\n )\n\n @property\n def display_name(self) -> str:\n return self.python_type.__name__\n\n\ndef resolve_dagster_type(dagster_type: object) -> DagsterType:\n # circular dep\n from dagster._utils.typing_api import is_typing_type\n\n from .primitive_mapping import (\n is_supported_runtime_python_builtin,\n remap_python_builtin_for_runtime,\n )\n from .python_dict import (\n Dict as DDict,\n PythonDict,\n )\n from .python_set import DagsterSetApi, PythonSet\n from .python_tuple import DagsterTupleApi, PythonTuple\n from .transform_typing import transform_typing_type\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, DagsterType)),\n f"Do not pass runtime type classes. 
Got {dagster_type}",\n )\n\n # First, check to see if we're using Dagster's generic output type to do the type catching.\n if is_generic_output_annotation(dagster_type):\n type_args = get_args(dagster_type)\n # If no inner type was provided, forward Any type.\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif is_dynamic_output_annotation(dagster_type):\n dynamic_out_annotation = get_args(dagster_type)[0]\n type_args = get_args(dynamic_out_annotation)\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n\n # Then, check to see if it is part of python's typing library\n if is_typing_type(dagster_type):\n dagster_type = transform_typing_type(dagster_type)\n if isinstance(dagster_type, DagsterType):\n return dagster_type\n\n # Test for unhashable objects -- this is if, for instance, someone has passed us an instance of\n # a dict where they meant to pass dict or Dict, etc.\n try:\n hash(dagster_type)\n except TypeError as e:\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n additional_msg=(\n ", which isn't hashable. 
Did you pass an instance of a type instead of "\n "the type?"\n ),\n dagster_type=str(dagster_type),\n )\n ) from e\n\n if BuiltinEnum.contains(dagster_type):\n return DagsterType.from_builtin_enum(dagster_type)\n\n if is_supported_runtime_python_builtin(dagster_type):\n return remap_python_builtin_for_runtime(dagster_type)\n\n if dagster_type is None:\n return Any\n\n if dagster_type is DDict:\n return PythonDict\n if isinstance(dagster_type, DagsterTupleApi):\n return PythonTuple\n if isinstance(dagster_type, DagsterSetApi):\n return PythonSet\n if isinstance(dagster_type, DagsterListApi):\n return List(Any)\n\n if isinstance(dagster_type, type):\n return resolve_python_type_to_dagster_type(dagster_type)\n\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n dagster_type=str(dagster_type), additional_msg="."\n )\n )\n\n\ndef is_dynamic_output_annotation(dagster_type: object) -> bool:\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n f"Do not pass runtime type classes. Got {dagster_type}",\n )\n\n if dagster_type == DynamicOutput or get_origin(dagster_type) == DynamicOutput:\n raise DagsterInvariantViolationError(\n "Op annotated with return type DynamicOutput. DynamicOutputs can only be returned in"\n " the context of a List. 
If only one output is needed, use the Output API."\n )\n\n if get_origin(dagster_type) == list and len(get_args(dagster_type)) == 1:\n list_inner_type = get_args(dagster_type)[0]\n return list_inner_type == DynamicOutput or get_origin(list_inner_type) == DynamicOutput\n return False\n\n\ndef is_generic_output_annotation(dagster_type: object) -> bool:\n return dagster_type == Output or get_origin(dagster_type) == Output\n\n\ndef resolve_python_type_to_dagster_type(python_type: t.Type) -> DagsterType:\n """Resolves a Python type to a Dagster type."""\n check.inst_param(python_type, "python_type", type)\n\n if python_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:\n return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type]\n else:\n dagster_type = TypeHintInferredDagsterType(python_type)\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n return dagster_type\n\n\nALL_RUNTIME_BUILTINS = list(_RUNTIME_MAP.values())\n\n\ndef construct_dagster_type_dictionary(\n node_defs: Sequence["NodeDefinition"],\n) -> Mapping[str, DagsterType]:\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n type_dict_by_name = {t.unique_name: t for t in ALL_RUNTIME_BUILTINS}\n type_dict_by_key = {t.key: t for t in ALL_RUNTIME_BUILTINS}\n\n def process_node_def(node_def: "NodeDefinition"):\n input_output_types = list(node_def.all_input_output_types())\n for dagster_type in input_output_types:\n # We don't do uniqueness check on key because with classes\n # like Array, Noneable, etc, those are ephemeral objects\n # and it is perfectly fine to have many of them.\n type_dict_by_key[dagster_type.key] = dagster_type\n\n if not dagster_type.has_unique_name:\n continue\n\n if dagster_type.unique_name not in type_dict_by_name:\n type_dict_by_name[dagster_type.unique_name] = dagster_type\n continue\n\n if type_dict_by_name[dagster_type.unique_name] is not dagster_type:\n raise DagsterInvalidDefinitionError(\n (\n 'You have created two 
dagster types with the same name "{type_name}". '\n "Dagster types have must have unique names."\n ).format(type_name=dagster_type.display_name)\n )\n\n if isinstance(node_def, GraphDefinition):\n for child_node_def in node_def.node_defs:\n process_node_def(child_node_def)\n\n for node_def in node_defs:\n process_node_def(node_def)\n\n return type_dict_by_key\n\n\nclass DagsterOptionalApi:\n def __getitem__(self, inner_type: t.Union[t.Type, DagsterType]) -> OptionalType:\n inner_type = resolve_dagster_type(check.not_none_param(inner_type, "inner_type"))\n return OptionalType(inner_type)\n\n\nOptional: DagsterOptionalApi = DagsterOptionalApi()\n
", "current_page_name": "_modules/dagster/_core/types/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.dagster_type"}, "decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.decorator

\nfrom typing import TYPE_CHECKING, Callable, Optional, Type, TypeVar, Union, overload\n\nimport dagster._check as check\n\nfrom .dagster_type import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\nif TYPE_CHECKING:\n    from dagster._core.types.config_schema import DagsterTypeLoader\n\nT_Type = TypeVar("T_Type", bound=Type[object])\n\n\n@overload\ndef usable_as_dagster_type(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    loader: Optional["DagsterTypeLoader"] = ...,\n) -> Callable[[T_Type], T_Type]: ...\n\n\n@overload\ndef usable_as_dagster_type(\n    name: T_Type,\n) -> T_Type: ...\n\n\n
[docs]def usable_as_dagster_type(\n name: Optional[Union[str, T_Type]] = None,\n description: Optional[str] = None,\n loader: Optional["DagsterTypeLoader"] = None,\n) -> Union[T_Type, Callable[[T_Type], T_Type]]:\n """Decorate a Python class to make it usable as a Dagster Type.\n\n This is intended to make it straightforward to annotate existing business logic classes to\n make them dagster types whose typecheck is an isinstance check against that python class.\n\n Args:\n python_type (cls): The python type to make usable as python type.\n name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of\n the ``python_type`` will be used.\n description (Optional[str]): A user-readable description of the type.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n\n Examples:\n .. 
code-block:: python\n\n # dagster_aws.s3.file_manager.S3FileHandle\n @usable_as_dagster_type\n class S3FileHandle(FileHandle):\n def __init__(self, s3_bucket, s3_key):\n self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n self._s3_key = check.str_param(s3_key, 's3_key')\n\n @property\n def s3_bucket(self):\n return self._s3_bucket\n\n @property\n def s3_key(self):\n return self._s3_key\n\n @property\n def path_desc(self):\n return self.s3_path\n\n @property\n def s3_path(self):\n return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n """\n # check for no args, no parens case\n if isinstance(name, type):\n bare_cls = name # with no parens, name is actually the decorated class\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(python_type=bare_cls, name=bare_cls.__name__, description=None),\n )\n return bare_cls\n\n def _with_args(bare_cls: T_Type) -> T_Type:\n check.class_param(bare_cls, "bare_cls")\n new_name = check.opt_str_param(name, "name") if name else bare_cls.__name__\n\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(\n name=new_name,\n description=description,\n python_type=bare_cls,\n loader=loader,\n ),\n )\n return bare_cls\n\n return _with_args
\n
", "current_page_name": "_modules/dagster/_core/types/decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.decorator"}}}, "_serdes": {"config_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._serdes.config_class

\nimport importlib\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._utils import convert_dagster_submodule_name\nfrom dagster._utils.yaml_utils import load_run_config_yaml\n\nfrom .serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._config.config_schema import UserConfigSchema\n\nT_ConfigurableClass = TypeVar("T_ConfigurableClass")\n\n\nclass ConfigurableClassDataSerializer(NamedTupleSerializer["ConfigurableClassData"]):\n    def after_pack(self, **packed: Any) -> Dict[str, Any]:\n        packed["module_name"] = convert_dagster_submodule_name(packed["module_name"], "public")\n        return packed\n\n\n
[docs]@whitelist_for_serdes(serializer=ConfigurableClassDataSerializer)\nclass ConfigurableClassData(\n NamedTuple(\n "_ConfigurableClassData",\n [\n ("module_name", str),\n ("class_name", str),\n ("config_yaml", str),\n ],\n )\n):\n """Serializable tuple describing where to find a class and the config fragment that should\n be used to instantiate it.\n\n Users should not instantiate this class directly.\n\n Classes intended to be serialized in this way should implement the\n :py:class:`dagster.serdes.ConfigurableClass` mixin.\n """\n\n def __new__(cls, module_name: str, class_name: str, config_yaml: str):\n return super(ConfigurableClassData, cls).__new__(\n cls,\n convert_dagster_submodule_name(check.str_param(module_name, "module_name"), "private"),\n check.str_param(class_name, "class_name"),\n check.str_param(config_yaml, "config_yaml"),\n )\n\n @property\n def config_dict(self) -> Mapping[str, Any]:\n return check.is_dict(load_run_config_yaml(self.config_yaml), key_type=str)\n\n def info_dict(self) -> Mapping[str, Any]:\n return {\n "module": self.module_name,\n "class": self.class_name,\n "config": self.config_dict,\n }\n\n @overload\n def rehydrate(self, as_type: Type[T_ConfigurableClass]) -> T_ConfigurableClass: ...\n\n @overload\n def rehydrate(self, as_type: None = ...) 
-> "ConfigurableClass": ...\n\n def rehydrate(\n self, as_type: Optional[Type[T_ConfigurableClass]] = None\n ) -> Union["ConfigurableClass", T_ConfigurableClass]:\n from dagster._config import process_config, resolve_to_config_type\n from dagster._core.errors import DagsterInvalidConfigError\n\n try:\n module = importlib.import_module(self.module_name)\n except ModuleNotFoundError:\n check.failed(\n f"Couldn't import module {self.module_name} when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n try:\n klass = getattr(module, self.class_name)\n except AttributeError:\n check.failed(\n f"Couldn't find class {self.class_name} in module when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n\n if not issubclass(klass, as_type or ConfigurableClass):\n raise check.CheckError(\n klass,\n f"class {self.class_name} in module {self.module_name}",\n ConfigurableClass,\n )\n\n config_dict = self.config_dict\n result = process_config(resolve_to_config_type(klass.config_type()), config_dict)\n if not result.success:\n raise DagsterInvalidConfigError(\n f"Errors whilst loading configuration for {klass.config_type()}.",\n result.errors,\n config_dict,\n )\n return klass.from_config_value(self, check.not_none(result.value))
\n\n\n
[docs]class ConfigurableClass(ABC):\n """Abstract mixin for classes that can be loaded from config.\n\n This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\n of conditional imports / optional extras_requires in dagster core and b) a magic directory or\n file in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\n run storage, pluggable with a config chunk like:\n\n .. code-block:: yaml\n\n run_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n\n This same pattern should eventually be viable for other system components, e.g. engines.\n\n The ``ConfigurableClass`` mixin provides the necessary hooks for classes to be instantiated from\n an instance of ``ConfigurableClassData``.\n\n Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\n type such as:\n\n .. code-block:: python\n\n {'module': str, 'class': str, 'config': Field(Permissive())}\n\n """\n\n @property\n @abstractmethod\n def inst_data(self) -> Optional[ConfigurableClassData]:\n """Subclass must be able to return the inst_data as a property if it has been constructed\n through the from_config_value code path.\n """\n\n @classmethod\n @abstractmethod\n def config_type(cls) -> "UserConfigSchema":\n """Get the config type against which to validate a config yaml fragment.\n\n The only place config values matching this type are used is inside `from_config_value`. This\n is an alternative constructor for a class. 
It is a common pattern for the config type to\n match constructor arguments, so `from_config_value`\n\n The config type against which to validate a config yaml fragment\n serialized in an instance of ``ConfigurableClassData``.\n """\n ...\n # We need to raise `NotImplementedError` here because nothing prevents abstract class\n # methods from being called.\n raise NotImplementedError(f"{cls.__name__} must implement the config_type classmethod")\n\n @classmethod\n @abstractmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n """Create an instance of the ConfigurableClass from a validated config value.\n\n The config value used here should be derived from the accompanying `inst_data` argument.\n `inst_data` contains the yaml-serialized config-- this must be parsed and\n validated/normalized, then passed to this method for object instantiation. This is done in\n ConfigurableClassData.rehydrate.\n\n Args:\n config_value (dict): The validated config value to use. Typically this should be the\n ``value`` attribute of a\n :py:class:`~dagster._core.types.evaluator.evaluation.EvaluateValueResult`.\n\n\n A common pattern is for the implementation to align the config_value with the signature\n of the ConfigurableClass's constructor:\n\n .. code-block:: python\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n\n """
\n\n\ndef class_from_code_pointer(module_name: str, class_name: str) -> Type[object]:\n try:\n module = importlib.import_module(module_name)\n except ModuleNotFoundError:\n check.failed(\n "Couldn't import module {module_name} when attempting to load the class {klass}".format(\n module_name=module_name,\n klass=module_name + "." + class_name,\n )\n )\n try:\n return getattr(module, class_name)\n except AttributeError:\n check.failed(\n "Couldn't find class {class_name} in module when attempting to load the "\n "class {klass}".format(\n class_name=class_name,\n klass=module_name + "." + class_name,\n )\n )\n
", "current_page_name": "_modules/dagster/_serdes/config_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._serdes.config_class"}}, "_utils": {"alabaster_version": "0.7.13", "alert": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.alert

\nimport datetime\nimport smtplib\nimport ssl\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.sensor_definition import DefaultSensorStatus, SensorDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.graph_definition import GraphDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.run_status_sensor_definition import RunFailureSensorContext\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\ndef _default_failure_email_body(context: "RunFailureSensorContext") -> str:\n    from dagster._core.host_representation.external_data import DEFAULT_MODE_NAME\n\n    return "<br>".join(\n        [\n            f"Pipeline {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Mode: {DEFAULT_MODE_NAME}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\ndef _default_failure_email_subject(context) -> str:\n    return f"Dagster Run Failed: {context.pipeline_run.job_name}"\n\n\nEMAIL_MESSAGE = """From: {email_from}\nTo: {email_to}\nMIME-Version: 1.0\nContent-type: text/html\nSubject: {email_subject}\n\n{email_body}\n\n<!-- this ensures Gmail doesn't trim the email -->\n<span style="opacity: 0"> {randomness} </span>\n"""\n\n\ndef send_email_via_ssl(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP_SSL(smtp_host, smtp_port, context=context) as server:\n        server.login(email_from, email_password)\n        server.sendmail(email_from, 
email_to, message)\n\n\ndef send_email_via_starttls(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP(smtp_host, smtp_port) as server:\n        server.starttls(context=context)\n        server.login(email_from, email_password)\n        server.sendmail(email_from, email_to, message)\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_email_on_run_failure_sensor(\n email_from: str,\n email_password: str,\n email_to: Sequence[str],\n email_body_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_body,\n email_subject_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_subject,\n smtp_host: str = "smtp.gmail.com",\n smtp_type: str = "SSL",\n smtp_port: Optional[int] = None,\n name: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> SensorDefinition:\n """Create a job failure sensor that sends email via the SMTP protocol.\n\n Args:\n email_from (str): The sender email address to send the message from.\n email_password (str): The password of the sender.\n email_to (List[str]): The receipt email addresses to send the message to.\n email_body_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email body you want to send.\n Defaults to the plain text that contains error message, job name, and run ID.\n email_subject_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email subject you want to send.\n Defaults to "Dagster Run Failed: <job_name>".\n smtp_host (str): The hostname of the SMTP server. 
Defaults to "smtp.gmail.com".\n smtp_type (str): The protocol; either "SSL" or "STARTTLS". Defaults to SSL.\n smtp_port (Optional[int]): The SMTP port. Defaults to 465 for SSL, 587 for STARTTLS.\n name: (Optional[str]): The name of the sensor. Defaults to "email_on_job_failure".\n webserver_base_url: (Optional[str]): The base url of your dagster-webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n The jobs that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails. To monitor jobs in external repositories,\n use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs that will be monitored by this failure\n sensor. Defaults to None, which means the alert will be sent when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n Examples:\n .. code-block:: python\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n )\n\n @repository\n def my_repo():\n return [my_job + email_on_run_failure]\n\n .. 
code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.pipeline_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n email_body_fn=my_message_fn,\n email_subject_fn=lambda _: "Dagster Alert",\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n from dagster._core.definitions.run_status_sensor_definition import (\n RunFailureSensorContext,\n run_failure_sensor,\n )\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n monitored_jobs=jobs,\n default_status=default_status,\n monitor_all_repositories=monitor_all_repositories,\n )\n def email_on_run_failure(context: RunFailureSensorContext):\n email_body = email_body_fn(context)\n if webserver_base_url:\n email_body += (\n f'<p><a href="{webserver_base_url}/runs/{context.dagster_run.run_id}">View in'\n " the Dagster UI</a></p>"\n )\n\n message = EMAIL_MESSAGE.format(\n email_to=",".join(email_to),\n email_from=email_from,\n email_subject=email_subject_fn(context),\n email_body=email_body,\n randomness=datetime.datetime.now(),\n )\n\n if smtp_type == "SSL":\n send_email_via_ssl(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 465\n )\n elif smtp_type == "STARTTLS":\n send_email_via_starttls(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 587\n )\n else:\n raise DagsterInvalidDefinitionError(f'smtp_type "{smtp_type}" is not supported.')\n\n return email_on_run_failure
\n
", "current_page_name": "_modules/dagster/_utils/alert", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.alert"}, "body": "

Source code for dagster._utils

\nimport _thread as thread\nimport contextlib\nimport contextvars\nimport datetime\nimport errno\nimport functools\nimport inspect\nimport multiprocessing\nimport os\nimport re\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport tempfile\nimport threading\nimport time\nfrom collections import OrderedDict\nfrom datetime import timezone\nfrom enum import Enum\nfrom signal import Signals\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Generator,\n    Generic,\n    Hashable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n    overload,\n)\n\nimport packaging.version\nfrom typing_extensions import Literal, TypeAlias, TypeGuard\n\nimport dagster._check as check\nimport dagster._seven as seven\n\nfrom .internal_init import IHasInternalInit as IHasInternalInit\n\nif sys.version_info > (3,):\n    from pathlib import Path\nelse:\n    from pathlib2 import Path\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryDefinition,\n    )\n    from dagster._core.events import DagsterEvent\n\nK = TypeVar("K")\nT = TypeVar("T")\nU = TypeVar("U")\nV = TypeVar("V")\n\nEPOCH = datetime.datetime.utcfromtimestamp(0)\n\nPICKLE_PROTOCOL = 4\n\n\nDEFAULT_WORKSPACE_YAML_FILENAME = "workspace.yaml"\n\nPrintFn: TypeAlias = Callable[[Any], None]\n\nSingleInstigatorDebugCrashFlags: TypeAlias = Mapping[str, int]\nDebugCrashFlags: TypeAlias = Mapping[str, SingleInstigatorDebugCrashFlags]\n\n\n# Use this to get the "library version" (pre-1.0 version) from the "core version" (post 1.0\n# version). 
16 is from the 0.16.0 that library versions stayed on when core went to 1.0.0.\ndef library_version_from_core_version(core_version: str) -> str:\n    parsed_version = parse_package_version(core_version)\n\n    release = parsed_version.release\n    if release[0] >= 1:\n        library_version = ".".join(["0", str(16 + release[1]), str(release[2])])\n\n        if parsed_version.is_prerelease:\n            library_version = library_version + "".join(\n                [str(pre) for pre in check.not_none(parsed_version.pre)]\n            )\n\n        if parsed_version.is_postrelease:\n            library_version = library_version + "post" + str(parsed_version.post)\n\n        return library_version\n    else:\n        return core_version\n\n\ndef parse_package_version(version_str: str) -> packaging.version.Version:\n    parsed_version = packaging.version.parse(version_str)\n    assert isinstance(parsed_version, packaging.version.Version)\n    return parsed_version\n\n\ndef convert_dagster_submodule_name(name: str, mode: Literal["private", "public"]) -> str:\n    """This function was introduced when all Dagster submodules were marked private by\n    underscore-prefixing the root submodules (e.g. `dagster._core`). The function provides\n    backcompatibility by converting modules between the old and new (i.e. public and private) forms.\n    This is needed when reading older data or communicating with older versions of Dagster.\n    """\n    if mode == "private":\n        return re.sub(r"^dagster\\.([^_])", r"dagster._\\1", name)\n    elif mode == "public":\n        return re.sub(r"^dagster._", "dagster.", name)\n    else:\n        check.failed("`mode` must be 'private' or 'public'")\n\n\n
[docs]def file_relative_path(dunderfile: str, relative_path: str) -> str:\n """Get a path relative to the currently executing Python file.\n\n This function is useful when one needs to load a file that is relative to the position of\n the current file. (Such as when you encode a configuration file path in source file and want\n in runnable in any current working directory)\n\n Args:\n dunderfile (str): Should always be ``__file__``.\n relative_path (str): Path to get relative to the currently executing file.\n\n **Examples**:\n\n .. code-block:: python\n\n file_relative_path(__file__, 'path/relative/to/file')\n\n """\n check.str_param(dunderfile, "dunderfile")\n check.str_param(relative_path, "relative_path")\n\n return os.path.join(os.path.dirname(dunderfile), relative_path)
\n\n\ndef script_relative_path(file_path: str) -> str:\n """Useful for testing with local files. Use a path relative to where the\n test resides and this function will return the absolute path\n of that file. Otherwise it will be relative to script that\n ran the test.\n\n Note: this is function is very, very expensive (on the order of 1\n millisecond per invocation) so this should only be used in performance\n insensitive contexts. Prefer file_relative_path for anything with\n performance constraints.\n\n """\n # from http://bit.ly/2snyC6s\n\n check.str_param(file_path, "file_path")\n scriptdir = inspect.stack()[1][1]\n return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path))\n\n\n# Adapted from https://github.com/okunishinishi/python-stringcase/blob/master/stringcase.py\ndef camelcase(string: str) -> str:\n check.str_param(string, "string")\n\n string = re.sub(r"^[\\-_\\.]", "", str(string))\n if not string:\n return string\n return str(string[0]).upper() + re.sub(\n r"[\\-_\\.\\s]([a-z])", lambda matched: str(matched.group(1)).upper(), string[1:]\n )\n\n\ndef ensure_single_item(ddict: Mapping[T, U]) -> Tuple[T, U]:\n check.mapping_param(ddict, "ddict")\n check.param_invariant(len(ddict) == 1, "ddict", "Expected dict with single item")\n return next(iter(ddict.items()))\n\n\n@contextlib.contextmanager\ndef pushd(path: str) -> Iterator[str]:\n old_cwd = os.getcwd()\n os.chdir(path)\n try:\n yield path\n finally:\n os.chdir(old_cwd)\n\n\ndef safe_isfile(path: str) -> bool:\n """Backport of Python 3.8 os.path.isfile behavior.\n\n This is intended to backport https://docs.python.org/dev/whatsnew/3.8.html#os-path. I'm not\n sure that there are other ways to provoke this behavior on Unix other than the null byte,\n but there are certainly other ways to do it on Windows. 
Afaict, we won't mask other\n ValueErrors, and the behavior in the status quo ante is rough because we risk throwing an\n unexpected, uncaught ValueError from very deep in our logic.\n """\n try:\n return os.path.isfile(path)\n except ValueError:\n return False\n\n\ndef mkdir_p(path: str) -> str:\n try:\n os.makedirs(path)\n return path\n except OSError as exc: # Python >2.5\n if exc.errno == errno.EEXIST and os.path.isdir(path):\n return path\n else:\n raise\n\n\ndef hash_collection(\n collection: Union[\n Mapping[Hashable, Any], Sequence[Any], AbstractSet[Any], Tuple[Any, ...], NamedTuple\n ]\n) -> int:\n """Hash a mutable collection or immutable collection containing mutable elements.\n\n This is useful for hashing Dagster-specific NamedTuples that contain mutable lists or dicts.\n The default NamedTuple __hash__ function assumes the contents of the NamedTuple are themselves\n hashable, and will throw an error if they are not. This can occur when trying to e.g. compute a\n cache key for the tuple for use with `lru_cache`.\n\n This alternative implementation will recursively process collection elements to convert basic\n lists and dicts to tuples prior to hashing. It is recommended to cache the result:\n\n Example:\n .. 
code-block:: python\n\n def __hash__(self):\n if not hasattr(self, '_hash'):\n self._hash = hash_named_tuple(self)\n return self._hash\n """\n assert isinstance(\n collection, (list, dict, set, tuple)\n ), f"Cannot hash collection of type {type(collection)}"\n return hash(make_hashable(collection))\n\n\n@overload\ndef make_hashable(value: Union[List[Any], Set[Any]]) -> Tuple[Any, ...]: ...\n\n\n@overload\ndef make_hashable(value: Dict[Any, Any]) -> Tuple[Tuple[Any, Any]]: ...\n\n\n@overload\ndef make_hashable(value: Any) -> Any: ...\n\n\ndef make_hashable(value: Any) -> Any:\n if isinstance(value, dict):\n return tuple(sorted((key, make_hashable(value)) for key, value in value.items()))\n elif isinstance(value, (list, tuple, set)):\n return tuple([make_hashable(x) for x in value])\n else:\n return value\n\n\ndef get_prop_or_key(elem, key):\n if isinstance(elem, Mapping):\n return elem.get(key)\n else:\n return getattr(elem, key)\n\n\ndef list_pull(alist, key):\n return list(map(lambda elem: get_prop_or_key(elem, key), alist))\n\n\ndef all_none(kwargs):\n for value in kwargs.values():\n if value is not None:\n return False\n return True\n\n\ndef check_script(path, return_code=0):\n try:\n subprocess.check_output([sys.executable, path])\n except subprocess.CalledProcessError as exc:\n if return_code != 0:\n if exc.returncode == return_code:\n return\n raise\n\n\ndef check_cli_execute_file_job(path, pipeline_fn_name, env_file=None):\n from dagster._core.test_utils import instance_for_test\n\n with instance_for_test():\n cli_cmd = [\n sys.executable,\n "-m",\n "dagster",\n "pipeline",\n "execute",\n "-f",\n path,\n "-a",\n pipeline_fn_name,\n ]\n\n if env_file:\n cli_cmd.append("-c")\n cli_cmd.append(env_file)\n\n try:\n subprocess.check_output(cli_cmd)\n except subprocess.CalledProcessError as cpe:\n print(cpe) # noqa: T201\n raise cpe\n\n\ndef safe_tempfile_path_unmanaged() -> str:\n # This gets a valid temporary file path in the safest possible way, although there 
is still no\n # guarantee that another process will not create a file at this path. The NamedTemporaryFile is\n # deleted when the context manager exits and the file object is closed.\n #\n # This is preferable to using NamedTemporaryFile as a context manager and passing the name\n # attribute of the file object around because NamedTemporaryFiles cannot be opened a second time\n # if already open on Windows NT or later:\n # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile\n # https://github.com/dagster-io/dagster/issues/1582\n with tempfile.NamedTemporaryFile() as fd:\n path = fd.name\n return Path(path).as_posix()\n\n\n@contextlib.contextmanager\ndef safe_tempfile_path() -> Iterator[str]:\n path = None\n try:\n path = safe_tempfile_path_unmanaged()\n yield path\n finally:\n if path is not None and os.path.exists(path):\n os.unlink(path)\n\n\n@overload\ndef ensure_gen(thing_or_gen: Generator[T, Any, Any]) -> Generator[T, Any, Any]:\n pass\n\n\n@overload\ndef ensure_gen(thing_or_gen: T) -> Generator[T, Any, Any]:\n pass\n\n\ndef ensure_gen(\n thing_or_gen: Union[T, Iterator[T], Generator[T, Any, Any]]\n) -> Generator[T, Any, Any]:\n if not inspect.isgenerator(thing_or_gen):\n thing_or_gen = cast(T, thing_or_gen)\n\n def _gen_thing():\n yield thing_or_gen\n\n return _gen_thing()\n\n return thing_or_gen\n\n\ndef ensure_dir(file_path: str) -> str:\n try:\n os.makedirs(file_path)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise\n return file_path\n\n\ndef ensure_file(path: str) -> str:\n ensure_dir(os.path.dirname(path))\n if not os.path.exists(path):\n touch_file(path)\n return path\n\n\ndef touch_file(path):\n ensure_dir(os.path.dirname(path))\n with open(path, "a", encoding="utf8"):\n os.utime(path, None)\n\n\ndef _kill_on_event(termination_event):\n termination_event.wait()\n send_interrupt()\n\n\ndef send_interrupt():\n if seven.IS_WINDOWS:\n # This will raise a KeyboardInterrupt in python land - meaning this wont be able 
to\n # interrupt things like sleep()\n thread.interrupt_main()\n else:\n # If on unix send an os level signal to interrupt any situation we may be stuck in\n os.kill(os.getpid(), signal.SIGINT)\n\n\n# Function to be invoked by daemon thread in processes which seek to be cancellable.\n# The motivation for this approach is to be able to exit cleanly on Windows. An alternative\n# path is to change how the processes are opened and send CTRL_BREAK signals, which at\n# the time of authoring seemed a more costly approach.\n#\n# Reading for the curious:\n# * https://stackoverflow.com/questions/35772001/how-to-handle-the-signal-in-python-on-windows-machine\n# * https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/\ndef start_termination_thread(termination_event):\n check.inst_param(termination_event, "termination_event", ttype=type(multiprocessing.Event()))\n\n int_thread = threading.Thread(\n target=_kill_on_event, args=(termination_event,), name="kill-on-event"\n )\n int_thread.daemon = True\n int_thread.start()\n\n\n# Executes the next() function within an instance of the supplied context manager class\n# (leaving the context before yielding each result)\ndef iterate_with_context(\n context_fn: Callable[[], ContextManager[Any]], iterator: Iterator[T]\n) -> Iterator[T]:\n while True:\n # Allow interrupts during user code so that we can terminate slow/hanging steps\n with context_fn():\n try:\n next_output = next(iterator)\n except StopIteration:\n return\n\n yield next_output\n\n\ndef datetime_as_float(dt: datetime.datetime) -> float:\n check.inst_param(dt, "dt", datetime.datetime)\n return float((dt - EPOCH).total_seconds())\n\n\nT_GeneratedContext = TypeVar("T_GeneratedContext")\n\n\nclass EventGenerationManager(Generic[T_GeneratedContext]):\n """Utility class that wraps an event generator function, that also yields a single instance of\n a typed object. 
All events yielded before the typed object are yielded through the method\n `generate_setup_events` and all events yielded after the typed object are yielded through the\n method `generate_teardown_events`.\n\n This is used to help replace the context managers used in pipeline initialization with\n generators so that we can begin emitting initialization events AND construct a pipeline context\n object, while managing explicit setup/teardown.\n\n This does require calling `generate_setup_events` AND `generate_teardown_events` in order to\n get the typed object.\n """\n\n def __init__(\n self,\n generator: Iterator[Union["DagsterEvent", T_GeneratedContext]],\n object_cls: Type[T_GeneratedContext],\n require_object: Optional[bool] = True,\n ):\n self.generator = check.generator(generator)\n self.object_cls: Type[T_GeneratedContext] = check.class_param(object_cls, "object_cls")\n self.require_object = check.bool_param(require_object, "require_object")\n self.object: Optional[T_GeneratedContext] = None\n self.did_setup = False\n self.did_teardown = False\n\n def generate_setup_events(self) -> Iterator["DagsterEvent"]:\n self.did_setup = True\n try:\n while self.object is None:\n obj = next(self.generator)\n if isinstance(obj, self.object_cls):\n self.object = obj\n else:\n yield obj\n except StopIteration:\n if self.require_object:\n check.inst_param(\n self.object,\n "self.object",\n self.object_cls,\n f"generator never yielded object of type {self.object_cls.__name__}",\n )\n\n def get_object(self) -> T_GeneratedContext:\n if not self.did_setup:\n check.failed("Called `get_object` before `generate_setup_events`")\n return cast(T_GeneratedContext, self.object)\n\n def generate_teardown_events(self) -> Iterator["DagsterEvent"]:\n self.did_teardown = True\n if self.object:\n yield from self.generator\n\n\ndef utc_datetime_from_timestamp(timestamp: float) -> datetime.datetime:\n tz = timezone.utc\n return datetime.datetime.fromtimestamp(timestamp, tz=tz)\n\n\ndef 
utc_datetime_from_naive(dt: datetime.datetime) -> datetime.datetime:\n tz = timezone.utc\n return dt.replace(tzinfo=tz)\n\n\ndef is_enum_value(value: object) -> bool:\n return False if value is None else issubclass(value.__class__, Enum)\n\n\ndef git_repository_root() -> str:\n return subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()\n\n\ndef segfault() -> None:\n """Reliable cross-Python version segfault.\n\n https://bugs.python.org/issue1215#msg143236\n """\n import ctypes\n\n ctypes.string_at(0)\n\n\ndef find_free_port() -> int:\n with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:\n s.bind(("", 0))\n s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n return s.getsockname()[1]\n\n\ndef is_port_in_use(host, port) -> bool:\n # Similar to the socket options that uvicorn uses to bind ports:\n # https://github.com/encode/uvicorn/blob/62f19c1c39929c84968712c371c9b7b96a041dec/uvicorn/config.py#L565-L566\n sock = socket.socket(family=socket.AF_INET)\n sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n try:\n sock.bind((host, port))\n return False\n except socket.error as e:\n return e.errno == errno.EADDRINUSE\n finally:\n sock.close()\n\n\n@contextlib.contextmanager\ndef alter_sys_path(to_add: Sequence[str], to_remove: Sequence[str]) -> Iterator[None]:\n to_restore = [path for path in sys.path]\n\n # remove paths\n for path in to_remove:\n if path in sys.path:\n sys.path.remove(path)\n\n # add paths\n for path in to_add:\n sys.path.insert(0, path)\n\n try:\n yield\n finally:\n sys.path = to_restore\n\n\n@contextlib.contextmanager\ndef restore_sys_modules() -> Iterator[None]:\n sys_modules = {k: v for k, v in sys.modules.items()}\n try:\n yield\n finally:\n to_delete = set(sys.modules) - set(sys_modules)\n for key in to_delete:\n del sys.modules[key]\n\n\ndef process_is_alive(pid: int) -> bool:\n if seven.IS_WINDOWS:\n import psutil\n\n return psutil.pid_exists(pid=pid)\n else:\n try:\n 
subprocess.check_output(["ps", str(pid)])\n except subprocess.CalledProcessError as exc:\n assert exc.returncode == 1\n return False\n return True\n\n\ndef compose(*args):\n """Compose python functions args such that compose(f, g)(x) is equivalent to f(g(x)).""" # noqa: D402\n # reduce using functional composition over all the arguments, with the identity function as\n # initializer\n return functools.reduce(lambda f, g: lambda x: f(g(x)), args, lambda x: x)\n\n\ndef dict_without_keys(ddict, *keys):\n return {key: value for key, value in ddict.items() if key not in set(keys)}\n\n\nclass Counter:\n def __init__(self):\n self._lock = threading.Lock()\n self._counts = OrderedDict()\n super(Counter, self).__init__()\n\n def increment(self, key: str):\n with self._lock:\n self._counts[key] = self._counts.get(key, 0) + 1\n\n def counts(self) -> Mapping[str, int]:\n with self._lock:\n copy = {k: v for k, v in self._counts.items()}\n return copy\n\n\ntraced_counter = contextvars.ContextVar("traced_counts", default=Counter())\n\nT_Callable = TypeVar("T_Callable", bound=Callable)\n\n\ndef traced(func: T_Callable) -> T_Callable:\n """A decorator that keeps track of how many times a function is called."""\n\n @functools.wraps(func)\n def inner(*args, **kwargs):\n counter = traced_counter.get()\n if counter and isinstance(counter, Counter):\n counter.increment(func.__qualname__)\n\n return func(*args, **kwargs)\n\n return cast(T_Callable, inner)\n\n\ndef get_terminate_signal():\n if sys.platform == "win32":\n return signal.SIGTERM\n return signal.SIGKILL\n\n\ndef get_run_crash_explanation(prefix: str, exit_code: int):\n # As per https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess.returncode\n # negative exit code means a posix signal\n if exit_code < 0 and -exit_code in [signal.value for signal in Signals]:\n posix_signal = -exit_code\n signal_str = Signals(posix_signal).name\n exit_clause = f"was terminated by signal {posix_signal} ({signal_str})."\n 
if posix_signal == get_terminate_signal():\n exit_clause = (\n exit_clause\n + " This usually indicates that the process was"\n " killed by the operating system due to running out of"\n " memory. Possible solutions include increasing the"\n " amount of memory available to the run, reducing"\n " the amount of memory used by the ops in the run, or"\n " configuring the executor to run fewer ops concurrently."\n )\n else:\n exit_clause = f"unexpectedly exited with code {exit_code}."\n\n return prefix + " " + exit_clause\n\n\ndef last_file_comp(path: str) -> str:\n return os.path.basename(os.path.normpath(path))\n\n\ndef is_named_tuple_instance(obj: object) -> TypeGuard[NamedTuple]:\n return isinstance(obj, tuple) and hasattr(obj, "_fields")\n\n\ndef is_named_tuple_subclass(klass: Type[object]) -> TypeGuard[Type[NamedTuple]]:\n return isinstance(klass, type) and issubclass(klass, tuple) and hasattr(klass, "_fields")\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[True] = ...,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[False] = ...,\n) -> Optional["RepositoryDefinition"]: ...\n\n\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = None,\n repository: Optional["RepositoryDefinition"] = None,\n error_on_none: bool = True,\n) -> Optional["RepositoryDefinition"]:\n """Normalizes the arguments that take a RepositoryDefinition or Definitions object to a\n RepositoryDefinition.\n\n This is intended to handle both the case where a single argument takes a\n `Union[RepositoryDefinition, Definitions]` or separate keyword arguments accept\n 
`RepositoryDefinition` or `Definitions`.\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n if (definitions_or_repository and repository) or (\n error_on_none and not (definitions_or_repository or repository)\n ):\n check.failed("Exactly one of `definitions` or `repository_def` must be provided.")\n elif isinstance(definitions_or_repository, Definitions):\n return definitions_or_repository.get_repository_def()\n elif definitions_or_repository:\n return definitions_or_repository\n elif repository:\n return repository\n else:\n return None\n\n\ndef xor(a, b):\n return bool(a) != bool(b)\n\n\ndef tail_file(path_or_fd: Union[str, int], should_stop: Callable[[], bool]) -> Iterator[str]:\n with open(path_or_fd, "r") as output_stream:\n while True:\n line = output_stream.readline()\n if line:\n yield line\n elif should_stop():\n break\n else:\n time.sleep(0.01)\n
", "current_page_name": "_modules/dagster/_utils", "customsidebar": null, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.dagster_type

\nfrom typing import Any\n\nfrom dagster._core.definitions.events import Failure, TypeCheck\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.api import create_execution_plan\nfrom dagster._core.execution.context_creation_job import scoped_job_context\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.types.dagster_type import resolve_dagster_type\n\nfrom .typing_api import is_typing_type\n\n\n
[docs]def check_dagster_type(dagster_type: Any, value: Any) -> TypeCheck:\n """Test a custom Dagster type.\n\n Args:\n dagster_type (Any): The Dagster type to test. Should be one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or\n :py:func:`PythonObjectDagsterType`, or a Python type.\n value (Any): The runtime value to test.\n\n Returns:\n TypeCheck: The result of the type check.\n\n\n Examples:\n .. code-block:: python\n\n assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n """\n if is_typing_type(dagster_type):\n raise DagsterInvariantViolationError(\n f"Must pass in a type from dagster module. You passed {dagster_type} "\n "which is part of python's typing module."\n )\n\n dagster_type = resolve_dagster_type(dagster_type)\n\n job = InMemoryJob(GraphDefinition(node_defs=[], name="empty").to_job())\n job_def = job.get_definition()\n\n instance = DagsterInstance.ephemeral()\n execution_plan = create_execution_plan(job)\n dagster_run = instance.create_run_for_job(job_def)\n with scoped_job_context(execution_plan, job, {}, dagster_run, instance) as context:\n type_check_context = context.for_type(dagster_type)\n try:\n type_check = dagster_type.type_check(type_check_context, value)\n except Failure as failure:\n return TypeCheck(success=False, description=failure.description)\n\n if not isinstance(type_check, TypeCheck):\n raise DagsterInvariantViolationError(\n "Type checks can only return TypeCheck. Type {type_name} returned {value}.".format(\n type_name=dagster_type.display_name, value=repr(type_check)\n )\n )\n return type_check
\n
", "current_page_name": "_modules/dagster/_utils/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.dagster_type"}, "favicon_url": null, "forked_pdb": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.forked_pdb

\nimport pdb\nimport sys\n\n\n# From https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess\n
[docs]class ForkedPdb(pdb.Pdb):\n """A pdb subclass that may be used from a forked multiprocessing child.\n\n **Examples**:\n\n .. code-block:: python\n\n from dagster._utils.forked_pdb import ForkedPdb\n\n @solid\n def complex_solid(_):\n # some complicated stuff\n\n ForkedPdb().set_trace()\n\n # some other complicated stuff\n\n You can initiate pipeline execution via the webserver and use the pdb debugger to examine/step through\n execution at the breakpoint.\n """\n\n def interaction(self, frame, traceback):\n _stdin = sys.stdin\n try:\n sys.stdin = open("/dev/stdin", encoding="utf8")\n pdb.Pdb.interaction(self, frame, traceback)\n finally:\n sys.stdin = _stdin
\n
", "current_page_name": "_modules/dagster/_utils/forked_pdb", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.forked_pdb"}, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.log

\nimport copy\nimport logging\nimport sys\nimport traceback\nfrom typing import Mapping, NamedTuple, Optional\n\nimport coloredlogs\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import deprecated\nfrom dagster._config import Enum, EnumValue\nfrom dagster._core.definitions.logger_definition import logger\nfrom dagster._core.utils import PYTHON_LOGGING_LEVELS_MAPPING, coerce_valid_log_level\n\nLogLevelEnum = Enum("log_level", list(map(EnumValue, PYTHON_LOGGING_LEVELS_MAPPING.keys())))\n\n\nclass JsonFileHandler(logging.Handler):\n    def __init__(self, json_path: str):\n        super(JsonFileHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            log_dict = copy.copy(record.__dict__)\n\n            # This horrific monstrosity is to maintain backwards compatability\n            # with the old behavior of the JsonFileHandler, which the clarify\n            # project has a dependency on. 
It relied on the dagster-defined\n            # properties smashing all the properties of the LogRecord object\n            # and uploads all of those properties to a redshift table for\n            # in order to do analytics on the log\n\n            if "dagster_meta" in log_dict:\n                dagster_meta_dict = log_dict["dagster_meta"]\n                del log_dict["dagster_meta"]\n            else:\n                dagster_meta_dict = {}\n\n            log_dict.update(dagster_meta_dict)\n\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(log_dict)\n                ff.write(text_line + "\\n")\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerMessage(\n    NamedTuple(\n        "_StructuredLoggerMessage",\n        [\n            ("name", str),\n            ("message", str),\n            ("level", int),\n            ("meta", Mapping[object, object]),\n            ("record", logging.LogRecord),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        message: str,\n        level: int,\n        meta: Mapping[object, object],\n        record: logging.LogRecord,\n    ):\n        return super(StructuredLoggerMessage, cls).__new__(\n            cls,\n            check.str_param(name, "name"),\n            check.str_param(message, "message"),\n            coerce_valid_log_level(level),\n            check.mapping_param(meta, "meta"),\n            check.inst_param(record, "record", logging.LogRecord),\n        )\n\n\nclass JsonEventLoggerHandler(logging.Handler):\n    def __init__(self, json_path: str, construct_event_record):\n        super(JsonEventLoggerHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n        self.construct_event_record = 
construct_event_record\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            event_record = self.construct_event_record(record)\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(event_record.to_dict())\n                ff.write(text_line + "\\n")\n\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerHandler(logging.Handler):\n    def __init__(self, callback):\n        super(StructuredLoggerHandler, self).__init__()\n        self.callback = check.is_callable(callback, "callback")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            self.callback(\n                StructuredLoggerMessage(\n                    name=record.name,\n                    message=record.msg,\n                    level=record.levelno,\n                    meta=record.dagster_meta,  # type: ignore\n                    record=record,\n                )\n            )\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\ndef construct_single_handler_logger(name, level, handler):\n    check.str_param(name, "name")\n    check.inst_param(handler, "handler", logging.Handler)\n\n    level = coerce_valid_log_level(level)\n\n    @logger\n    def single_handler_logger(_init_context):\n        klass = logging.getLoggerClass()\n        logger_ = klass(name, level=level)\n        logger_.addHandler(handler)\n        handler.setLevel(level)\n        return logger_\n\n    return single_handler_logger\n\n\n# Base python logger whose messages will be captured as structured Dagster log messages.\nBASE_DAGSTER_LOGGER = 
logging.getLogger(name="dagster")\n\n\n
[docs]def get_dagster_logger(name: Optional[str] = None) -> logging.Logger:\n """Creates a python logger whose output messages will be captured and converted into Dagster log\n messages. This means they will have structured information such as the step_key, run_id, etc.\n embedded into them, and will show up in the Dagster event log.\n\n This can be used as a more convenient alternative to `context.log` in most cases. If log level\n is not set explicitly, defaults to DEBUG.\n\n Args:\n name (Optional[str]): If supplied, will create a logger with the name "dagster.builtin.{name}",\n with properties inherited from the base Dagster logger. If omitted, the returned logger\n will be named "dagster.builtin".\n\n Returns:\n :class:`logging.Logger`: A logger whose output will be captured by Dagster.\n\n Example:\n .. code-block:: python\n\n from dagster import get_dagster_logger, op\n\n @op\n def hello_op():\n log = get_dagster_logger()\n for i in range(5):\n # do something\n log.info(f"Did {i+1} things!")\n\n """\n # enforce that the parent logger will always have a DEBUG log level\n BASE_DAGSTER_LOGGER.setLevel(logging.DEBUG)\n base_builtin = BASE_DAGSTER_LOGGER.getChild("builtin")\n if name:\n return base_builtin.getChild(name)\n return base_builtin
\n\n\ndef define_structured_logger(name, callback, level):\n check.str_param(name, "name")\n check.callable_param(callback, "callback")\n level = coerce_valid_log_level(level)\n\n return construct_single_handler_logger(name, level, StructuredLoggerHandler(callback))\n\n\ndef define_json_file_logger(name, json_path, level):\n check.str_param(name, "name")\n check.str_param(json_path, "json_path")\n level = coerce_valid_log_level(level)\n\n stream_handler = JsonFileHandler(json_path)\n stream_handler.setFormatter(define_default_formatter())\n return construct_single_handler_logger(name, level, stream_handler)\n\n\ndef get_stack_trace_array(exception):\n check.inst_param(exception, "exception", Exception)\n if hasattr(exception, "__traceback__"):\n tb = exception.__traceback__\n else:\n _exc_type, _exc_value, tb = sys.exc_info()\n return traceback.format_tb(tb)\n\n\ndef default_format_string():\n return "%(asctime)s - %(name)s - %(levelname)s - %(message)s"\n\n\ndef default_date_format_string():\n return "%Y-%m-%d %H:%M:%S %z"\n\n\ndef define_default_formatter():\n return logging.Formatter(default_format_string(), default_date_format_string())\n\n\n@deprecated(\n breaking_version="2.0",\n subject="loggers.dagit",\n emit_runtime_warning=False,\n)\ndef configure_loggers(handler="default", log_level="INFO"):\n LOGGING_CONFIG = {\n "version": 1,\n "disable_existing_loggers": False,\n "formatters": {\n "colored": {\n "()": coloredlogs.ColoredFormatter,\n "fmt": default_format_string(),\n "datefmt": default_date_format_string(),\n "field_styles": {"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n "level_styles": {"debug": {}, "error": {"color": "red"}},\n },\n },\n "handlers": {\n "default": {\n "formatter": "colored",\n "class": "logging.StreamHandler",\n "stream": sys.stdout,\n "level": log_level,\n },\n "null": {\n "class": "logging.NullHandler",\n },\n },\n "loggers": {\n "dagster": {\n "handlers": [handler],\n "level": log_level,\n },\n # Only one of 
dagster or dagster-webserver will be used at a time. We configure them\n # both here to avoid a dependency on the dagster-webserver package.\n "dagit": {\n "handlers": [handler],\n "level": log_level,\n },\n "dagster-webserver": {\n "handlers": [handler],\n "level": log_level,\n },\n },\n }\n\n logging.config.dictConfig(LOGGING_CONFIG)\n\n\ndef create_console_logger(name, level):\n klass = logging.getLoggerClass()\n handler = klass(name, level=level)\n coloredlogs.install(\n logger=handler,\n level=level,\n fmt=default_format_string(),\n datefmt=default_date_format_string(),\n field_styles={"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n level_styles={"debug": {}, "error": {"color": "red"}},\n )\n return handler\n
", "current_page_name": "_modules/dagster/_utils/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.log"}, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils", "warnings": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.warnings

\nimport warnings\nfrom contextlib import contextmanager\nfrom typing import Callable, Iterator, Optional, TypeVar\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import (\n    Decoratable,\n    apply_context_manager_decorator,\n)\n\nT = TypeVar("T")\n\n# ########################\n# ##### DEPRECATED\n# ########################\n\n\ndef normalize_renamed_param(\n    new_val: T,\n    new_arg: str,\n    old_val: T,\n    old_arg: str,\n    coerce_old_to_new: Optional[Callable[[T], T]] = None,\n) -> T:\n    """Utility for managing backwards compatibility of a renamed parameter.\n\n    .. code-block::\n\n       # The name of param `old_flag` is being updated to `new_flag`, but we are temporarily\n       # accepting either param.\n       def is_new(old_flag=None, new_flag=None):\n           return canonicalize_backcompat_args(\n               new_val=new_flag,\n               new_arg='new_flag',\n               old_val=old_flag,\n               old_arg='old_flag',\n               breaking_version='0.9.0',\n               coerce_old_to_new=lambda val: not val,\n           )\n\n    In the above example, if the caller sets both new_flag and old_flag, it will fail by throwing\n    a CheckError. If the caller sets the new_flag, it's returned unaltered. 
If the caller sets\n    old_flag, it will return the old_flag run through the coercion function.\n    """\n    check.str_param(new_arg, "new_arg")\n    check.str_param(old_arg, "old_arg")\n    check.opt_callable_param(coerce_old_to_new, "coerce_old_to_new")\n    if new_val is not None and old_val is not None:\n        check.failed(f'Do not use deprecated "{old_arg}" now that you are using "{new_arg}".')\n    elif old_val is not None:\n        return coerce_old_to_new(old_val) if coerce_old_to_new else old_val\n    else:\n        return new_val\n\n\ndef deprecation_warning(\n    subject: str,\n    breaking_version: str,\n    additional_warn_text: Optional[str] = None,\n    stacklevel: int = 3,\n):\n    warnings.warn(\n        f"{subject} is deprecated and will be removed in {breaking_version}."\n        + ((" " + additional_warn_text) if additional_warn_text else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\n# ########################\n# ##### EXPERIMENTAL\n# ########################\n\nEXPERIMENTAL_WARNING_HELP = (\n    "To mute warnings for experimental functionality, invoke"\n    ' warnings.filterwarnings("ignore", category=dagster.ExperimentalWarning) or use'\n    " one of the other methods described at"\n    " https://docs.python.org/3/library/warnings.html#describing-warning-filters."\n)\n\n\n
[docs]class ExperimentalWarning(Warning):\n pass
\n\n\ndef experimental_warning(\n subject: str, additional_warn_text: Optional[str] = None, stacklevel: int = 3\n) -> None:\n extra_text = f" {additional_warn_text}" if additional_warn_text else ""\n warnings.warn(\n f"{subject} is experimental. It may break in future versions, even between dot"\n f" releases.{extra_text} {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\n# ########################\n# ##### DISABLE DAGSTER WARNINGS\n# ########################\n\n\n@contextmanager\ndef disable_dagster_warnings() -> Iterator[None]:\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=DeprecationWarning)\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n yield\n\n\nT_Decoratable = TypeVar("T_Decoratable", bound=Decoratable)\n\n\ndef suppress_dagster_warnings(__obj: T_Decoratable) -> T_Decoratable:\n """Mark a method/function as ignoring Dagster-generated warnings. This suppresses any\n `ExperimentalWarnings` or `DeprecationWarnings` when the function is called.\n\n Usage:\n\n .. code-block:: python\n\n @suppress_dagster_warnings\n def invokes_some_experimental_stuff(my_arg):\n my_experimental_function(my_arg)\n """\n return apply_context_manager_decorator(__obj, disable_dagster_warnings)\n
", "current_page_name": "_modules/dagster/_utils/warnings", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.warnings"}}}, "dagster_airbyte": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.asset_defs

\nimport hashlib\nimport inspect\nimport os\nimport re\nfrom abc import abstractmethod\nfrom functools import partial\nfrom itertools import chain\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    Nothing,\n    Output,\n    ResourceDefinition,\n    SourceAsset,\n    _check as check,\n)\nfrom dagster._core.definitions import AssetsDefinition, multi_asset\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataValue, TableSchemaMetadataValue\nfrom dagster._core.definitions.metadata.table import TableSchema\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._core.execution.context.init import build_init_resource_context\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_airbyte.resources import AirbyteCloudResource, AirbyteResource, BaseAirbyteResource\nfrom dagster_airbyte.types import AirbyteTableMetadata\nfrom dagster_airbyte.utils import (\n    generate_materializations,\n    generate_table_schema,\n    is_basic_normalization_operation,\n)\n\n\ndef _table_to_output_name_fn(table: str) -> str:\n    return table.replace("-", "_")\n\n\ndef _build_airbyte_asset_defn_metadata(\n    connection_id: str,\n    destination_tables: Sequence[str],\n    table_to_asset_key_fn: Callable[[str], AssetKey],\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n    upstream_assets: Optional[Iterable[AssetKey]] = None,\n    group_name: Optional[str] = None,\n    
io_manager_key: Optional[str] = None,\n    schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n    freshness_policy: Optional[FreshnessPolicy] = None,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n) -> AssetsDefinitionCacheableData:\n    asset_key_prefix = (\n        check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str) or []\n    )\n\n    # Generate a list of outputs, the set of destination tables plus any affiliated\n    # normalization tables\n    tables = list(\n        chain.from_iterable(\n            chain(\n                [destination_tables], normalization_tables.values() if normalization_tables else []\n            )\n        )\n    )\n\n    outputs = {\n        _table_to_output_name_fn(table): AssetKey(\n            [*asset_key_prefix, *table_to_asset_key_fn(table).path]\n        )\n        for table in tables\n    }\n\n    internal_deps: Dict[str, Set[AssetKey]] = {}\n\n    metadata_encodable_normalization_tables = (\n        {k: list(v) for k, v in normalization_tables.items()} if normalization_tables else {}\n    )\n\n    # If normalization tables are specified, we need to add a dependency from the destination table\n    # to the affilitated normalization table\n    if len(metadata_encodable_normalization_tables) > 0:\n        for base_table, derived_tables in metadata_encodable_normalization_tables.items():\n            for derived_table in derived_tables:\n                internal_deps[derived_table] = {\n                    AssetKey([*asset_key_prefix, *table_to_asset_key_fn(base_table).path])\n                }\n\n    # All non-normalization tables depend on any user-provided upstream assets\n    for table in destination_tables:\n        internal_deps[table] = set(upstream_assets or [])\n\n    return AssetsDefinitionCacheableData(\n        keys_by_input_name=(\n            {asset_key.path[-1]: asset_key for asset_key in upstream_assets}\n            if upstream_assets\n            else 
{}\n        ),\n        keys_by_output_name=outputs,\n        internal_asset_deps=internal_deps,\n        group_name=group_name,\n        key_prefix=asset_key_prefix,\n        can_subset=False,\n        metadata_by_output_name=(\n            {\n                table: {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n                for table in tables\n            }\n            if schema_by_table_name\n            else None\n        ),\n        freshness_policies_by_output_name=(\n            {output: freshness_policy for output in outputs} if freshness_policy else None\n        ),\n        auto_materialize_policies_by_output_name=(\n            {output: auto_materialize_policy for output in outputs}\n            if auto_materialize_policy\n            else None\n        ),\n        extra_metadata={\n            "connection_id": connection_id,\n            "group_name": group_name,\n            "destination_tables": destination_tables,\n            "normalization_tables": metadata_encodable_normalization_tables,\n            "io_manager_key": io_manager_key,\n        },\n    )\n\n\ndef _build_airbyte_assets_from_metadata(\n    assets_defn_meta: AssetsDefinitionCacheableData,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]],\n) -> AssetsDefinition:\n    metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n    connection_id = cast(str, metadata["connection_id"])\n    group_name = cast(Optional[str], metadata["group_name"])\n    destination_tables = cast(List[str], metadata["destination_tables"])\n    normalization_tables = cast(Mapping[str, List[str]], metadata["normalization_tables"])\n    io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n    @multi_asset(\n        name=f"airbyte_sync_{connection_id[:5]}",\n        deps=list((assets_defn_meta.keys_by_input_name or {}).values()),\n        outs={\n            k: AssetOut(\n                key=v,\n                metadata=(\n                    
{\n                        k: cast(TableSchemaMetadataValue, v)\n                        for k, v in assets_defn_meta.metadata_by_output_name.get(k, {}).items()\n                    }\n                    if assets_defn_meta.metadata_by_output_name\n                    else None\n                ),\n                io_manager_key=io_manager_key,\n                freshness_policy=(\n                    assets_defn_meta.freshness_policies_by_output_name.get(k)\n                    if assets_defn_meta.freshness_policies_by_output_name\n                    else None\n                ),\n                dagster_type=Nothing,\n            )\n            for k, v in (assets_defn_meta.keys_by_output_name or {}).items()\n        },\n        internal_asset_deps={\n            k: set(v) for k, v in (assets_defn_meta.internal_asset_deps or {}).items()\n        },\n        compute_kind="airbyte",\n        group_name=group_name,\n        resource_defs=resource_defs,\n    )\n    def _assets(context, airbyte: AirbyteResource):\n        ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n        for materialization in generate_materializations(\n            ab_output, assets_defn_meta.key_prefix or []\n        ):\n            table_name = materialization.asset_key.path[-1]\n            if table_name in destination_tables:\n                yield Output(\n                    value=None,\n                    output_name=_table_to_output_name_fn(table_name),\n                    metadata=materialization.metadata,\n                )\n                # Also materialize any normalization tables affiliated with this destination\n                # e.g. 
nested objects, lists etc\n                if normalization_tables:\n                    for dependent_table in normalization_tables.get(table_name, set()):\n                        yield Output(\n                            value=None,\n                            output_name=_table_to_output_name_fn(dependent_table),\n                        )\n            else:\n                yield materialization\n\n    return _assets\n\n\n
[docs]def build_airbyte_assets(\n connection_id: str,\n destination_tables: Sequence[str],\n asset_key_prefix: Optional[Sequence[str]] = None,\n group_name: Optional[str] = None,\n normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n upstream_assets: Optional[Set[AssetKey]] = None,\n schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n) -> Sequence[AssetsDefinition]:\n """Builds a set of assets representing the tables created by an Airbyte sync operation.\n\n Args:\n connection_id (str): The Airbyte Connection ID that this op will sync. You can retrieve this\n value from the "Connections" tab of a given connector in the Airbyte UI.\n destination_tables (List[str]): The names of the tables that you want to be represented\n in the Dagster asset graph for this sync. This will generally map to the name of the\n stream in Airbyte, unless a stream prefix has been specified in Airbyte.\n normalization_tables (Optional[Mapping[str, List[str]]]): If you are using Airbyte's\n normalization feature, you may specify a mapping of destination table to a list of\n derived tables that will be created by the normalization process.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([table_name])`.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, str, AssetKey]]]):\n A list of assets to add as sources.\n upstream_assets (Optional[Set[AssetKey]]): Deprecated, use deps instead. A list of assets to add as sources.\n freshness_policy (Optional[FreshnessPolicy]): A freshness policy to apply to the assets\n """\n if upstream_assets is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and upstream_assets to build_airbyte_assets. 
Use only deps"\n " instead."\n )\n\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n # Generate a list of outputs, the set of destination tables plus any affiliated\n # normalization tables\n tables = chain.from_iterable(\n chain([destination_tables], normalization_tables.values() if normalization_tables else [])\n )\n outputs = {\n table: AssetOut(\n key=AssetKey([*asset_key_prefix, table]),\n metadata=(\n {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n if schema_by_table_name\n else None\n ),\n freshness_policy=freshness_policy,\n )\n for table in tables\n }\n\n internal_deps = {}\n\n # If normalization tables are specified, we need to add a dependency from the destination table\n # to the affilitated normalization table\n if normalization_tables:\n for base_table, derived_tables in normalization_tables.items():\n for derived_table in derived_tables:\n internal_deps[derived_table] = {AssetKey([*asset_key_prefix, base_table])}\n\n upstream_deps = deps\n if upstream_assets is not None:\n upstream_deps = list(upstream_assets)\n\n # All non-normalization tables depend on any user-provided upstream assets\n for table in destination_tables:\n internal_deps[table] = set(upstream_deps) if upstream_deps else set()\n\n @multi_asset(\n name=f"airbyte_sync_{connection_id[:5]}",\n deps=upstream_deps,\n outs=outputs,\n internal_asset_deps=internal_deps,\n compute_kind="airbyte",\n group_name=group_name,\n )\n def _assets(context, airbyte: BaseAirbyteResource):\n ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n\n # No connection details (e.g. 
using Airbyte Cloud) means we just assume\n # that the outputs were produced\n if len(ab_output.connection_details) == 0:\n for table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n )\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n for materialization in generate_materializations(ab_output, asset_key_prefix):\n table_name = materialization.asset_key.path[-1]\n if table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n metadata=materialization.metadata,\n )\n # Also materialize any normalization tables affiliated with this destination\n # e.g. nested objects, lists etc\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n yield materialization\n\n return [_assets]
\n\n\ndef _get_schema_types(schema: Mapping[str, Any]) -> Sequence[str]:\n """Given a schema definition, return a list of data types that are valid for this schema."""\n types = schema.get("types") or schema.get("type")\n if not types:\n return []\n if isinstance(types, str):\n return [types]\n return types\n\n\ndef _get_sub_schemas(schema: Mapping[str, Any]) -> Sequence[Mapping[str, Any]]:\n """Returns a list of sub-schema definitions for a given schema. This is used to handle union types."""\n return schema.get("anyOf") or schema.get("oneOf") or [schema]\n\n\ndef _get_normalization_tables_for_schema(\n key: str, schema: Mapping[str, Any], prefix: str = ""\n) -> Mapping[str, AirbyteTableMetadata]:\n """Recursively traverses a schema, returning metadata for the tables that will be created by the Airbyte\n normalization process.\n\n For example, a table `cars` with a nested object field `limited_editions` will produce the tables\n `cars` and `cars_limited_editions`.\n\n For more information on Airbyte's normalization process, see:\n https://docs.airbyte.com/understanding-airbyte/basic-normalization/#nesting\n """\n out: Dict[str, AirbyteTableMetadata] = {}\n # Object types are broken into a new table, as long as they have children\n\n sub_schemas = _get_sub_schemas(schema)\n\n for sub_schema in sub_schemas:\n schema_types = _get_schema_types(sub_schema)\n if not schema_types:\n continue\n\n if "object" in schema_types and len(sub_schema.get("properties", {})) > 0:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("properties", {}))\n )\n for k, v in sub_schema["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n # Array types are also broken into a new table\n elif "array" in schema_types:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {}))\n )\n if sub_schema.get("items", 
{}).get("properties"):\n for k, v in sub_schema["items"]["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n\n return out\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\nclass AirbyteConnectionMetadata(\n NamedTuple(\n "_AirbyteConnectionMetadata",\n [\n ("name", str),\n ("stream_prefix", str),\n ("has_basic_normalization", bool),\n ("stream_data", List[Mapping[str, Any]]),\n ],\n )\n):\n """Contains information about an Airbyte connection.\n\n Attributes:\n name (str): The name of the connection.\n stream_prefix (str): A prefix to add to all stream names.\n has_basic_normalization (bool): Whether or not the connection has basic normalization enabled.\n stream_data (List[Mapping[str, Any]]): Unparsed list of dicts with information about each stream.\n """\n\n @classmethod\n def from_api_json(\n cls, contents: Mapping[str, Any], operations: Mapping[str, Any]\n ) -> "AirbyteConnectionMetadata":\n return cls(\n name=contents["name"],\n stream_prefix=contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operatorConfiguration", {}))\n for op in operations.get("operations", [])\n ),\n stream_data=contents.get("syncCatalog", {}).get("streams", []),\n )\n\n @classmethod\n def from_config(cls, contents: Mapping[str, Any]) -> "AirbyteConnectionMetadata":\n config_contents = cast(Mapping[str, Any], contents.get("configuration"))\n check.invariant(\n config_contents is not None, "Airbyte connection config is missing 'configuration' key"\n )\n\n return cls(\n name=contents["resource_name"],\n stream_prefix=config_contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operator_configuration", {}))\n for op in config_contents.get("operations", [])\n ),\n stream_data=config_contents.get("sync_catalog", 
{}).get("streams", []),\n )\n\n def parse_stream_tables(\n self, return_normalization_tables: bool = False\n ) -> Mapping[str, AirbyteTableMetadata]:\n """Parses the stream data and returns a mapping, with keys representing destination\n tables associated with each enabled stream and values representing any affiliated\n tables created by Airbyte's normalization process, if enabled.\n """\n tables: Dict[str, AirbyteTableMetadata] = {}\n\n enabled_streams = [\n stream for stream in self.stream_data if stream.get("config", {}).get("selected", False)\n ]\n\n for stream in enabled_streams:\n name = cast(str, stream.get("stream", {}).get("name"))\n prefixed_name = f"{self.stream_prefix}{name}"\n\n schema = (\n stream["stream"]["json_schema"]\n if "json_schema" in stream["stream"]\n else stream["stream"]["jsonSchema"]\n )\n normalization_tables: Dict[str, AirbyteTableMetadata] = {}\n schema_props = schema.get("properties", schema.get("items", {}).get("properties", {}))\n if self.has_basic_normalization and return_normalization_tables:\n for k, v in schema_props.items():\n for normalization_table_name, meta in _get_normalization_tables_for_schema(\n k, v, f"{name}_"\n ).items():\n prefixed_norm_table_name = f"{self.stream_prefix}{normalization_table_name}"\n normalization_tables[prefixed_norm_table_name] = meta\n tables[prefixed_name] = AirbyteTableMetadata(\n schema=generate_table_schema(schema_props),\n normalization_tables=normalization_tables,\n )\n\n return tables\n\n\ndef _get_schema_by_table_name(\n stream_table_metadata: Mapping[str, AirbyteTableMetadata]\n) -> Mapping[str, TableSchema]:\n schema_by_base_table_name = [(k, v.schema) for k, v in stream_table_metadata.items()]\n schema_by_normalization_table_name = list(\n chain.from_iterable(\n [\n [\n (k, v.schema)\n for k, v in cast(\n Dict[str, AirbyteTableMetadata], meta.normalization_tables\n ).items()\n ]\n for meta in stream_table_metadata.values()\n ]\n )\n )\n\n return dict(schema_by_normalization_table_name 
+ schema_by_base_table_name)\n\n\nclass AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n self._key_prefix = key_prefix\n self._create_assets_for_normalization_tables = create_assets_for_normalization_tables\n self._connection_to_group_fn = connection_to_group_fn\n self._connection_to_io_manager_key_fn = connection_to_io_manager_key_fn\n self._connection_filter = connection_filter\n self._connection_to_asset_key_fn: Callable[[AirbyteConnectionMetadata, str], AssetKey] = (\n connection_to_asset_key_fn or (lambda _, table: AssetKey(path=[table]))\n )\n self._connection_to_freshness_policy_fn = connection_to_freshness_policy_fn or (\n lambda _: None\n )\n self._connection_to_auto_materialize_policy_fn = (\n connection_to_auto_materialize_policy_fn or (lambda _: None)\n )\n\n contents = hashlib.sha1() # so that hexdigest is 40, not 64 bytes\n contents.update(",".join(key_prefix).encode("utf-8"))\n contents.update(str(create_assets_for_normalization_tables).encode("utf-8"))\n if connection_filter:\n contents.update(inspect.getsource(connection_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"airbyte-{contents.hexdigest()}")\n\n @abstractmethod\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n pass\n\n def compute_cacheable_data(self) -> 
Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connection_id, connection in self._get_connections():\n stream_table_metadata = connection.parse_stream_tables(\n self._create_assets_for_normalization_tables\n )\n schema_by_table_name = _get_schema_by_table_name(stream_table_metadata)\n\n table_to_asset_key = partial(self._connection_to_asset_key_fn, connection)\n asset_data_for_conn = _build_airbyte_asset_defn_metadata(\n connection_id=connection_id,\n destination_tables=list(stream_table_metadata.keys()),\n normalization_tables={\n table: set(metadata.normalization_tables.keys())\n for table, metadata in stream_table_metadata.items()\n },\n asset_key_prefix=self._key_prefix,\n group_name=(\n self._connection_to_group_fn(connection.name)\n if self._connection_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connection_to_io_manager_key_fn(connection.name)\n if self._connection_to_io_manager_key_fn\n else None\n ),\n schema_by_table_name=schema_by_table_name,\n table_to_asset_key_fn=table_to_asset_key,\n freshness_policy=self._connection_to_freshness_policy_fn(connection),\n auto_materialize_policy=self._connection_to_auto_materialize_policy_fn(connection),\n )\n\n asset_defn_data.append(asset_data_for_conn)\n\n return asset_defn_data\n\n def _build_definitions_with_resources(\n self,\n data: Sequence[AssetsDefinitionCacheableData],\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n ) -> Sequence[AssetsDefinition]:\n return [_build_airbyte_assets_from_metadata(meta, resource_defs) for meta in data]\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return self._build_definitions_with_resources(data)\n\n\nclass AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n airbyte_resource_def: Union[ResourceDefinition, AirbyteResource],\n workspace_id: Optional[str],\n 
key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._airbyte_instance: AirbyteResource = (\n airbyte_resource_def.process_config_and_initialize()\n if isinstance(airbyte_resource_def, AirbyteResource)\n else airbyte_resource_def(build_init_resource_context())\n )\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n workspace_id = self._workspace_id\n if not workspace_id:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(\n self._airbyte_instance.make_request(endpoint="/workspaces/list", data={})\n ).get("workspaces", []),\n )\n\n check.invariant(len(workspaces) <= 1, "Airbyte instance has more than one workspace")\n check.invariant(len(workspaces) > 0, "Airbyte instance has no workspaces")\n\n workspace_id = workspaces[0].get("workspaceId")\n\n connections = cast(\n List[Dict[str, Any]],\n 
check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/connections/list", data={"workspaceId": workspace_id}\n )\n ).get("connections", []),\n )\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n for connection_json in connections:\n connection_id = cast(str, connection_json.get("connectionId"))\n\n operations_json = cast(\n Dict[str, Any],\n check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/operations/list",\n data={"connectionId": connection_id},\n )\n ),\n )\n connection = AirbyteConnectionMetadata.from_api_json(connection_json, operations_json)\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return super()._build_definitions_with_resources(\n data, {"airbyte": self._airbyte_instance.get_resource_definition()}\n )\n\n\nclass AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n project_dir: str,\n workspace_id: Optional[str],\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_directories: Optional[Sequence[str]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n 
key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._project_dir = project_dir\n self._connection_directories = connection_directories\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n connections_dir = os.path.join(self._project_dir, "connections")\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n\n connection_directories = self._connection_directories or os.listdir(connections_dir)\n for connection_name in connection_directories:\n connection_dir = os.path.join(connections_dir, connection_name)\n with open(os.path.join(connection_dir, "configuration.yaml"), encoding="utf-8") as f:\n connection = AirbyteConnectionMetadata.from_config(yaml.safe_load(f.read()))\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n if self._workspace_id:\n state_file = f"state_{self._workspace_id}.yaml"\n check.invariant(\n state_file in os.listdir(connection_dir),\n f"Workspace state file {state_file} not found",\n )\n else:\n state_files = [\n filename\n for filename in os.listdir(connection_dir)\n if filename.startswith("state_")\n ]\n check.invariant(\n len(state_files) > 0,\n f"No state files found for connection {connection_name} in {connection_dir}",\n )\n check.invariant(\n len(state_files) <= 1,\n "More than one state file found for connection {} in {}, specify a workspace_id"\n " to disambiguate".format(connection_name, connection_dir),\n )\n state_file = 
state_files[0]\n\n with open(os.path.join(connection_dir, cast(str, state_file)), encoding="utf-8") as f:\n state = yaml.safe_load(f.read())\n connection_id = state.get("resource_id")\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n\n
[docs]def load_assets_from_airbyte_instance(\n airbyte: Union[AirbyteResource, ResourceDefinition],\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads Airbyte connection assets from a configured AirbyteResource instance. This fetches information\n about defined connections at initialization time, and will error on workspace load if the Airbyte\n instance is not reachable.\n\n Args:\n airbyte (ResourceDefinition): An AirbyteResource configured with the appropriate connection\n details.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspaces exist in your instance.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. If None, no groups will be created. 
Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which takes\n in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function\n which takes in connection metadata and returns a freshness policy for the connection's assets. If None, no freshness policies\n will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]): Optional\n function which takes in connection metadata and returns an auto materialization policy for the connection's assets. If None, no\n auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. 
code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(airbyte_instance)\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(\n airbyte_instance,\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(airbyte, AirbyteCloudResource):\n raise DagsterInvalidInvocationError(\n "load_assets_from_airbyte_instance is not yet supported for AirbyteCloudResource"\n )\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteInstanceCacheableAssetsDefinition(\n airbyte_resource_def=airbyte,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n\n\n
[docs]def load_assets_from_airbyte_project(\n project_dir: str,\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_directories: Optional[Sequence[str]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads an Airbyte project into a set of Dagster assets.\n\n Point to the root folder of an Airbyte project synced using the Octavia CLI. For\n more information, see https://github.com/airbytehq/airbyte/tree/master/octavia-cli#octavia-import-all.\n\n Args:\n project_dir (str): The path to the root of your Airbyte project, containing sources, destinations,\n and connections folders.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspace state YAMLfiles exist in the project.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. 
If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which\n takes in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_directories (Optional[List[str]]): Optional list of connection directories to load assets from.\n If omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter\n if the project has many connections or if the connection yaml files are large.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. 
Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]):\n Optional function which takes in connection metadata and returns a freshness policy for the connection's assets.\n If None, no freshness policies will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]):\n Optional function which takes in connection metadata and returns an auto materialization policy for the connection's assets.\n If None, no auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n )\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteYAMLCacheableAssetsDefinition(\n project_dir=project_dir,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n 
connection_filter=connection_filter,\n connection_directories=connection_directories,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.asset_defs"}, "managed": {"generated": {"destinations": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.destinations

\n# ruff: noqa: A001, A002\nfrom typing import Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteDestination\n\n\n
[docs]class DynamodbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n dynamodb_table_name_prefix: str,\n dynamodb_region: str,\n access_key_id: str,\n secret_access_key: str,\n dynamodb_endpoint: Optional[str] = None,\n ):\n """Airbyte Destination for Dynamodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/dynamodb\n\n Args:\n name (str): The name of the destination.\n dynamodb_endpoint (Optional[str]): This is your DynamoDB endpoint url.(if you are working with AWS DynamoDB, just leave empty).\n dynamodb_table_name_prefix (str): The prefix to use when naming DynamoDB tables.\n dynamodb_region (str): The region of the DynamoDB.\n access_key_id (str): The access key id to access the DynamoDB. Airbyte requires Read and Write permissions to the DynamoDB.\n secret_access_key (str): The corresponding secret to the access key id.\n """\n self.dynamodb_endpoint = check.opt_str_param(dynamodb_endpoint, "dynamodb_endpoint")\n self.dynamodb_table_name_prefix = check.str_param(\n dynamodb_table_name_prefix, "dynamodb_table_name_prefix"\n )\n self.dynamodb_region = check.str_param(dynamodb_region, "dynamodb_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n super().__init__("Dynamodb", name)
\n\n\n
[docs]class BigqueryDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_location: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDestination.StandardInserts", "BigqueryDestination.GCSStaging"\n ],\n credentials_json: Optional[str] = None,\n transformation_priority: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_location (str): The location of the dataset. Warning: Changes made after creation will not be applied. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n transformation_priority (Optional[str]): Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type here. 
Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don`t count towards your concurrent rate limit. Read more about batch queries here. The default "interactive" value is used if not set explicitly.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_location = check.str_param(dataset_location, "dataset_location")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.transformation_priority = check.opt_str_param(\n transformation_priority, "transformation_priority"\n )\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery", name)
\n\n\n
[docs]class RabbitmqDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n routing_key: str,\n ssl: Optional[bool] = None,\n port: Optional[int] = None,\n virtual_host: Optional[str] = None,\n username: Optional[str] = None,\n password: Optional[str] = None,\n exchange: Optional[str] = None,\n ):\n """Airbyte Destination for Rabbitmq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rabbitmq\n\n Args:\n name (str): The name of the destination.\n ssl (Optional[bool]): SSL enabled.\n host (str): The RabbitMQ host name.\n port (Optional[int]): The RabbitMQ port.\n virtual_host (Optional[str]): The RabbitMQ virtual host name.\n username (Optional[str]): The username to connect.\n password (Optional[str]): The password to connect.\n exchange (Optional[str]): The exchange name.\n routing_key (str): The routing key.\n """\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.virtual_host = check.opt_str_param(virtual_host, "virtual_host")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.exchange = check.opt_str_param(exchange, "exchange")\n self.routing_key = check.str_param(routing_key, "routing_key")\n super().__init__("Rabbitmq", name)
\n\n\n
[docs]class KvdbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, bucket_id: str, secret_key: str):\n """Airbyte Destination for Kvdb.\n\n Documentation can be found at https://kvdb.io/docs/api/\n\n Args:\n name (str): The name of the destination.\n bucket_id (str): The ID of your KVdb bucket.\n secret_key (str): Your bucket Secret Key.\n """\n self.bucket_id = check.str_param(bucket_id, "bucket_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n super().__init__("Kvdb", name)
\n\n\n
[docs]class ClickhouseDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Destination for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): HTTP port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class AmazonSqsDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n message_delay: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n message_body_key: Optional[str] = None,\n message_group_id: Optional[str] = None,\n ):\n """Airbyte Destination for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n message_delay (Optional[int]): Modify the Message Delay of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for sending messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for sending messages\n message_body_key (Optional[str]): Use this property to extract the contents of the named key in the input record to use as the SQS message body. If not set, the entire content of the input record data is used as the message body.\n message_group_id (Optional[str]): The tag that specifies that a message belongs to a specific message group. This parameter applies only to, and is REQUIRED by, FIFO queues.\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.message_delay = check.opt_int_param(message_delay, "message_delay")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n self.message_body_key = check.opt_str_param(message_body_key, "message_body_key")\n self.message_group_id = check.opt_str_param(message_group_id, "message_group_id")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class MariadbColumnstoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mariadb Columnstore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mariadb-columnstore\n\n Args:\n name (str): The name of the destination.\n host (str): The Hostname of the database.\n port (int): The Port of the database.\n database (str): Name of the database.\n username (str): The Username which is used to access the database.\n password (Optional[str]): The Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mariadb Columnstore", name)
\n\n\n
[docs]class KinesisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n region: str,\n shardCount: int,\n accessKey: str,\n privateKey: str,\n bufferSize: int,\n ):\n """Airbyte Destination for Kinesis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kinesis\n\n Args:\n name (str): The name of the destination.\n endpoint (str): AWS Kinesis endpoint.\n region (str): AWS region. Your account determines the Regions that are available to you.\n shardCount (int): Number of shards to which the data should be streamed.\n accessKey (str): Generate the AWS Access Key for current user.\n privateKey (str): The AWS Private Key - a string of numbers and letters that are unique for each account, also known as a "recovery phrase".\n bufferSize (int): Buffer size for storing kinesis records before being batch streamed.\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.region = check.str_param(region, "region")\n self.shardCount = check.int_param(shardCount, "shardCount")\n self.accessKey = check.str_param(accessKey, "accessKey")\n self.privateKey = check.str_param(privateKey, "privateKey")\n self.bufferSize = check.int_param(bufferSize, "bufferSize")\n super().__init__("Kinesis", name)
\n\n\n
[docs]class AzureBlobStorageDestination(GeneratedAirbyteDestination):\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(self, flattening: str):\n self.format_type = "CSV"\n self.flattening = check.str_param(flattening, "flattening")
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n ):\n self.format_type = "JSONL"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_account_key: str,\n format: Union[\n "AzureBlobStorageDestination.CSVCommaSeparatedValues",\n "AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON",\n ],\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n azure_blob_storage_container_name: Optional[str] = None,\n azure_blob_storage_output_buffer_size: Optional[int] = None,\n ):\n """Airbyte Destination for Azure Blob Storage.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/azureblobstorage\n\n Args:\n name (str): The name of the destination.\n azure_blob_storage_endpoint_domain_name (Optional[str]): This is Azure Blob Storage endpoint domain name. Leave default value (or leave it empty if run container from command line) to use Microsoft native from example.\n azure_blob_storage_container_name (Optional[str]): The name of the Azure blob storage container. If not exists - will be created automatically. May be empty, then will be created automatically airbytecontainer+timestamp\n azure_blob_storage_account_name (str): The account's name of the Azure Blob Storage.\n azure_blob_storage_account_key (str): The Azure blob storage account key.\n azure_blob_storage_output_buffer_size (Optional[int]): The amount of megabytes to buffer for the output stream to Azure. 
This will impact memory footprint on workers, but may need adjustment for performance and appropriate block size in Azure.\n format (Union[AzureBlobStorageDestination.CSVCommaSeparatedValues, AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON]): Output data format\n """\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_container_name = check.opt_str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_account_key = check.str_param(\n azure_blob_storage_account_key, "azure_blob_storage_account_key"\n )\n self.azure_blob_storage_output_buffer_size = check.opt_int_param(\n azure_blob_storage_output_buffer_size, "azure_blob_storage_output_buffer_size"\n )\n self.format = check.inst_param(\n format,\n "format",\n (\n AzureBlobStorageDestination.CSVCommaSeparatedValues,\n AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n super().__init__("Azure Blob Storage", name)
\n\n\n
[docs]class KafkaDestination(GeneratedAirbyteDestination):\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n bootstrap_servers: str,\n topic_pattern: str,\n protocol: Union[\n "KafkaDestination.PLAINTEXT",\n "KafkaDestination.SASLPLAINTEXT",\n "KafkaDestination.SASLSSL",\n ],\n acks: str,\n enable_idempotence: bool,\n compression_type: str,\n batch_size: int,\n linger_ms: str,\n max_in_flight_requests_per_connection: int,\n client_dns_lookup: str,\n buffer_memory: str,\n max_request_size: int,\n retries: int,\n socket_connection_setup_timeout_ms: str,\n socket_connection_setup_timeout_max_ms: str,\n max_block_ms: str,\n request_timeout_ms: int,\n delivery_timeout_ms: int,\n send_buffer_bytes: int,\n receive_buffer_bytes: int,\n test_topic: Optional[str] = None,\n sync_producer: Optional[bool] = None,\n client_id: Optional[str] = None,\n ):\n """Airbyte Destination for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kafka\n\n Args:\n name (str): The name of the destination.\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n test_topic (Optional[str]): Topic to test if Airbyte can produce messages.\n sync_producer (Optional[bool]): Wait synchronously until the record has been sent to Kafka.\n protocol (Union[KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL]): Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n acks (str): The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the durability of records that are sent.\n enable_idempotence (bool): When set to 'true', the producer will ensure that exactly one copy of each message is written in the stream. If 'false', producer retries due to broker failures, etc., may write duplicates of the retried message in the stream.\n compression_type (str): The compression type for all data generated by the producer.\n batch_size (int): The producer will attempt to batch records together into fewer requests whenever multiple records are being sent to the same partition.\n linger_ms (str): The producer groups together any records that arrive in between request transmissions into a single batched request.\n max_in_flight_requests_per_connection (int): The maximum number of unacknowledged requests the client will send on a single connection before blocking. Can be greater than 1, and the maximum value supported with idempotency is 5.\n client_dns_lookup (str): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. 
Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n buffer_memory (str): The total bytes of memory the producer can use to buffer records waiting to be sent to the server.\n max_request_size (int): The maximum size of a request in bytes.\n retries (int): Setting a value greater than zero will cause the client to resend any record whose send fails with a potentially transient error.\n socket_connection_setup_timeout_ms (str): The amount of time the client will wait for the socket connection to be established.\n socket_connection_setup_timeout_max_ms (str): The maximum amount of time the client will wait for the socket connection to be established. The connection setup timeout will increase exponentially for each consecutive connection failure up to this maximum.\n max_block_ms (str): The configuration controls how long the KafkaProducer's send(), partitionsFor(), initTransactions(), sendOffsetsToTransaction(), commitTransaction() and abortTransaction() methods will block.\n request_timeout_ms (int): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n delivery_timeout_ms (int): An upper bound on the time to report success or failure after a call to 'send()' returns.\n send_buffer_bytes (int): The size of the TCP send buffer (SO_SNDBUF) to use when sending data. 
If the value is -1, the OS default will be used.\n receive_buffer_bytes (int): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.\n """\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.sync_producer = check.opt_bool_param(sync_producer, "sync_producer")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.acks = check.str_param(acks, "acks")\n self.enable_idempotence = check.bool_param(enable_idempotence, "enable_idempotence")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.batch_size = check.int_param(batch_size, "batch_size")\n self.linger_ms = check.str_param(linger_ms, "linger_ms")\n self.max_in_flight_requests_per_connection = check.int_param(\n max_in_flight_requests_per_connection, "max_in_flight_requests_per_connection"\n )\n self.client_dns_lookup = check.str_param(client_dns_lookup, "client_dns_lookup")\n self.buffer_memory = check.str_param(buffer_memory, "buffer_memory")\n self.max_request_size = check.int_param(max_request_size, "max_request_size")\n self.retries = check.int_param(retries, "retries")\n self.socket_connection_setup_timeout_ms = check.str_param(\n socket_connection_setup_timeout_ms, "socket_connection_setup_timeout_ms"\n )\n self.socket_connection_setup_timeout_max_ms = check.str_param(\n socket_connection_setup_timeout_max_ms, "socket_connection_setup_timeout_max_ms"\n )\n self.max_block_ms = check.str_param(max_block_ms, "max_block_ms")\n self.request_timeout_ms = check.int_param(request_timeout_ms, "request_timeout_ms")\n self.delivery_timeout_ms = 
check.int_param(delivery_timeout_ms, "delivery_timeout_ms")\n self.send_buffer_bytes = check.int_param(send_buffer_bytes, "send_buffer_bytes")\n self.receive_buffer_bytes = check.int_param(receive_buffer_bytes, "receive_buffer_bytes")\n super().__init__("Kafka", name)
\n\n\n
[docs]class ElasticsearchDestination(GeneratedAirbyteDestination):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchDestination.None_",\n "ElasticsearchDestination.ApiKeySecret",\n "ElasticsearchDestination.UsernamePassword",\n ],\n upsert: Optional[bool] = None,\n ):\n r"""Airbyte Destination for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n upsert (Optional[bool]): If a primary key identifier is defined in the source, an upsert will be performed using the primary key value as the elasticsearch doc id. Does not support composite primary keys.\n authenticationMethod (Union[ElasticsearchDestination.None\\\\_, ElasticsearchDestination.ApiKeySecret, ElasticsearchDestination.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.upsert = check.opt_bool_param(upsert, "upsert")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchDestination.None_,\n ElasticsearchDestination.ApiKeySecret,\n ElasticsearchDestination.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class MysqlDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mysql", name)
\n\n\n
[docs]class SftpJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n username: str,\n password: str,\n destination_path: str,\n port: Optional[int] = None,\n ):\n """Airbyte Destination for Sftp Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sftp-json\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the SFTP server.\n port (Optional[int]): Port of the SFTP server.\n username (str): Username to use to access the SFTP server.\n password (str): Password associated with the username.\n destination_path (str): Path to the directory where json files will be written.\n """\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sftp Json", name)
\n\n\n
[docs]class GcsDestination(GeneratedAirbyteDestination):\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, credential_type: str, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = check.str_param(credential_type, "credential_type")\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self,\n codec: str,\n compression_level: Optional[int] = None,\n include_checksum: Optional[bool] = None,\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "GcsDestination.NoCompression",\n "GcsDestination.Deflate",\n "GcsDestination.Bzip2",\n "GcsDestination.Xz",\n "GcsDestination.Zstandard",\n "GcsDestination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n GcsDestination.NoCompression,\n GcsDestination.Deflate,\n GcsDestination.Bzip2,\n GcsDestination.Xz,\n GcsDestination.Zstandard,\n GcsDestination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n flattening: Optional[str] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.opt_str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n credential: "GcsDestination.HMACKey",\n format: Union[\n "GcsDestination.AvroApacheAvro",\n "GcsDestination.CSVCommaSeparatedValues",\n "GcsDestination.JSONLinesNewlineDelimitedJSON",\n "GcsDestination.ParquetColumnarStorage",\n ],\n gcs_bucket_region: Optional[str] = None,\n ):\n """Airbyte Destination for Gcs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/gcs\n\n Args:\n name (str): The name of the destination.\n gcs_bucket_name (str): You can find the bucket name in the App Engine Admin console Application Settings page, under the label Google Cloud Storage Bucket. Read more here.\n gcs_bucket_path (str): GCS Bucket Path string Subdirectory under the above bucket to sync the data into.\n gcs_bucket_region (Optional[str]): Select a Region of the GCS Bucket. Read more here.\n credential (GcsDestination.HMACKey): An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more here.\n format (Union[GcsDestination.AvroApacheAvro, GcsDestination.CSVCommaSeparatedValues, GcsDestination.JSONLinesNewlineDelimitedJSON, GcsDestination.ParquetColumnarStorage]): Output data format. One of the following formats must be selected - AVRO format, PARQUET format, CSV format, or JSONL format.\n """\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.gcs_bucket_region = check.opt_str_param(gcs_bucket_region, "gcs_bucket_region")\n self.credential = check.inst_param(credential, "credential", GcsDestination.HMACKey)\n self.format = check.inst_param(\n format,\n "format",\n (\n GcsDestination.AvroApacheAvro,\n GcsDestination.CSVCommaSeparatedValues,\n GcsDestination.JSONLinesNewlineDelimitedJSON,\n GcsDestination.ParquetColumnarStorage,\n ),\n )\n super().__init__("Gcs", name)
\n\n\n
[docs]class CassandraDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n datacenter: Optional[str] = None,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Cassandra.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/cassandra\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Cassandra keyspace to create data in.\n username (str): Username to use to access Cassandra.\n password (str): Password associated with Cassandra.\n address (str): Address to connect to.\n port (int): Port of Cassandra.\n datacenter (Optional[str]): Datacenter of the cassandra cluster.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.datacenter = check.opt_str_param(datacenter, "datacenter")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Cassandra", name)
\n\n\n
[docs]class FireboltDestination(GeneratedAirbyteDestination):\n
[docs] class SQLInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "SQL"
\n\n
[docs] class ExternalTableViaS3:\n
[docs] @public\n def __init__(self, s3_bucket: str, s3_region: str, aws_key_id: str, aws_key_secret: str):\n self.method = "S3"\n self.s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self.s3_region = check.str_param(s3_region, "s3_region")\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_key_secret = check.str_param(aws_key_secret, "aws_key_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n loading_method: Union[\n "FireboltDestination.SQLInserts", "FireboltDestination.ExternalTableViaS3"\n ],\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Destination for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n loading_method (Union[FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3]): Loading method used to select the way data will be uploaded to Firebolt\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3),\n )\n super().__init__("Firebolt", name)
\n\n\n
[docs]class GoogleSheetsDestination(GeneratedAirbyteDestination):\n
[docs] class AuthenticationViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: "GoogleSheetsDestination.AuthenticationViaGoogleOAuth",\n ):\n """Airbyte Destination for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): The link to your spreadsheet. See this guide for more details.\n credentials (GoogleSheetsDestination.AuthenticationViaGoogleOAuth): Google API Credentials for connecting to Google Sheets and Google Drive APIs\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleSheetsDestination.AuthenticationViaGoogleOAuth\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DatabricksDestination(GeneratedAirbyteDestination):\n
[docs] class AmazonS3:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n s3_access_key_id: str,\n s3_secret_access_key: str,\n file_name_pattern: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.s3_access_key_id = check.str_param(s3_access_key_id, "s3_access_key_id")\n self.s3_secret_access_key = check.str_param(\n s3_secret_access_key, "s3_secret_access_key"\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class AzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n accept_terms: bool,\n databricks_server_hostname: str,\n databricks_http_path: str,\n databricks_personal_access_token: str,\n data_source: Union[\n "DatabricksDestination.AmazonS3", "DatabricksDestination.AzureBlobStorage"\n ],\n databricks_port: Optional[str] = None,\n database_schema: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n """Airbyte Destination for Databricks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/databricks\n\n Args:\n name (str): The name of the destination.\n accept_terms (bool): You must agree to the Databricks JDBC Driver Terms & Conditions to use this connector.\n databricks_server_hostname (str): Databricks Cluster Server Hostname.\n databricks_http_path (str): Databricks Cluster HTTP Path.\n databricks_port (Optional[str]): Databricks Cluster Port.\n databricks_personal_access_token (str): Databricks Personal Access Token for making authenticated requests.\n database_schema (Optional[str]): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n data_source (Union[DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage]): Storage on which the delta lake is built.\n purge_staging_data (Optional[bool]): Default to 'true'. 
Switch it to 'false' for debugging purpose.\n """\n self.accept_terms = check.bool_param(accept_terms, "accept_terms")\n self.databricks_server_hostname = check.str_param(\n databricks_server_hostname, "databricks_server_hostname"\n )\n self.databricks_http_path = check.str_param(databricks_http_path, "databricks_http_path")\n self.databricks_port = check.opt_str_param(databricks_port, "databricks_port")\n self.databricks_personal_access_token = check.str_param(\n databricks_personal_access_token, "databricks_personal_access_token"\n )\n self.database_schema = check.opt_str_param(database_schema, "database_schema")\n self.data_source = check.inst_param(\n data_source,\n "data_source",\n (DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage),\n )\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n super().__init__("Databricks", name)
\n\n\n
[docs]class BigqueryDenormalizedDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDenormalizedDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDenormalizedDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDenormalizedDestination.StandardInserts",\n "BigqueryDenormalizedDestination.GCSStaging",\n ],\n credentials_json: Optional[str] = None,\n dataset_location: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery Denormalized.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDenormalizedDestination.StandardInserts, BigqueryDenormalizedDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n dataset_location (Optional[str]): The location of the dataset. Warning: Changes made after creation will not be applied. The default "US" value is used if not set explicitly. Read more here.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. 
Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n BigqueryDenormalizedDestination.StandardInserts,\n BigqueryDenormalizedDestination.GCSStaging,\n ),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.dataset_location = check.opt_str_param(dataset_location, "dataset_location")\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery Denormalized", name)
\n\n\n
[docs]class SqliteDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Sqlite.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sqlite\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the sqlite.db file. The file will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sqlite", name)
\n\n\n
[docs]class MongodbDestination(GeneratedAirbyteDestination):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.authorization = "none"
\n\n
[docs] class LoginPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.authorization = "login/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbDestination.StandaloneMongoDbInstance",\n "MongodbDestination.ReplicaSet",\n "MongodbDestination.MongoDBAtlas",\n ],\n database: str,\n auth_type: Union["MongodbDestination.None_", "MongodbDestination.LoginPassword"],\n ):\n r"""Airbyte Destination for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mongodb\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbDestination.StandaloneMongoDbInstance, MongodbDestination.ReplicaSet, MongodbDestination.MongoDBAtlas]): MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): Name of the database.\n auth_type (Union[MongodbDestination.None\\\\_, MongodbDestination.LoginPassword]): Authorization type.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbDestination.StandaloneMongoDbInstance,\n MongodbDestination.ReplicaSet,\n MongodbDestination.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.auth_type = check.inst_param(\n auth_type, "auth_type", (MongodbDestination.None_, MongodbDestination.LoginPassword)\n )\n super().__init__("Mongodb", name)
\n\n\n
[docs]class RocksetDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, api_key: str, workspace: str, api_server: Optional[str] = None):\n """Airbyte Destination for Rockset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rockset\n\n Args:\n name (str): The name of the destination.\n api_key (str): Rockset api key\n workspace (str): The Rockset workspace in which collections will be created + written to.\n api_server (Optional[str]): Rockset api URL\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.workspace = check.str_param(workspace, "workspace")\n self.api_server = check.opt_str_param(api_server, "api_server")\n super().__init__("Rockset", name)
\n\n\n
[docs]class OracleDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n sid: str,\n username: str,\n encryption: Union[\n "OracleDestination.Unencrypted",\n "OracleDestination.NativeNetworkEncryptionNNE",\n "OracleDestination.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n sid (str): The System Identifier uniquely distinguishes the instance from any other instance on the same computer.\n username (str): The username to access the database. This user must have CREATE USER privileges in the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n schema (Optional[str]): The default schema is used as the target schema for all statements issued from the connection that do not explicitly specify a schema name. The usual value for this field is "airbyte". 
In Oracle, schemas and users are the same thing, so the "user" parameter is used as the login credentials and this is used for the default Airbyte message schema.\n encryption (Union[OracleDestination.Unencrypted, OracleDestination.NativeNetworkEncryptionNNE, OracleDestination.TLSEncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.sid = check.str_param(sid, "sid")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.schema = check.opt_str_param(schema, "schema")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleDestination.Unencrypted,\n OracleDestination.NativeNetworkEncryptionNNE,\n OracleDestination.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class CsvDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Csv.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-csv\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where csv files will be written. The destination uses the local mount "/local" and any data files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Csv", name)
\n\n\n
[docs]class S3Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "S3Destination.NoCompression",\n "S3Destination.Deflate",\n "S3Destination.Bzip2",\n "S3Destination.Xz",\n "S3Destination.Zstandard",\n "S3Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n S3Destination.NoCompression,\n S3Destination.Deflate,\n S3Destination.Bzip2,\n S3Destination.Xz,\n S3Destination.Zstandard,\n S3Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n format: Union[\n "S3Destination.AvroApacheAvro",\n "S3Destination.CSVCommaSeparatedValues",\n "S3Destination.JSONLinesNewlineDelimitedJSON",\n "S3Destination.ParquetColumnarStorage",\n ],\n access_key_id: Optional[str] = None,\n secret_access_key: Optional[str] = None,\n s3_endpoint: Optional[str] = None,\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/s3\n\n Args:\n name (str): The name of the destination.\n access_key_id (Optional[str]): The access key ID to access the S3 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (Optional[str]): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the S3 bucket. Read more here.\n s3_bucket_path (str): Directory under the S3 bucket where data will be written. Read more here\n s3_bucket_region (str): The region of the S3 bucket. See here for all region codes.\n format (Union[S3Destination.AvroApacheAvro, S3Destination.CSVCommaSeparatedValues, S3Destination.JSONLinesNewlineDelimitedJSON, S3Destination.ParquetColumnarStorage]): Format of the data output. See here for more details\n s3_endpoint (Optional[str]): Your S3 endpoint url. Read more here\n s3_path_format (Optional[str]): Format string on how data will be organized inside the S3 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the S3 staging file(s)\n """\n self.access_key_id = check.opt_str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.opt_str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.format = check.inst_param(\n format,\n "format",\n (\n S3Destination.AvroApacheAvro,\n S3Destination.CSVCommaSeparatedValues,\n S3Destination.JSONLinesNewlineDelimitedJSON,\n S3Destination.ParquetColumnarStorage,\n ),\n )\n self.s3_endpoint = check.opt_str_param(s3_endpoint, "s3_endpoint")\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("S3", name)
\n\n\n
[docs]class AwsDatalakeDestination(GeneratedAirbyteDestination):\n
[docs] class IAMRole:\n
[docs] @public\n def __init__(self, role_arn: str):\n self.credentials_title = "IAM Role"\n self.role_arn = check.str_param(role_arn, "role_arn")
\n\n
[docs] class IAMUser:\n
[docs] @public\n def __init__(self, aws_access_key_id: str, aws_secret_access_key: str):\n self.credentials_title = "IAM User"\n self.aws_access_key_id = check.str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n credentials: Union["AwsDatalakeDestination.IAMRole", "AwsDatalakeDestination.IAMUser"],\n bucket_name: str,\n bucket_prefix: str,\n aws_account_id: Optional[str] = None,\n lakeformation_database_name: Optional[str] = None,\n ):\n """Airbyte Destination for Aws Datalake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/aws-datalake\n\n Args:\n name (str): The name of the destination.\n aws_account_id (Optional[str]): target aws account id\n region (str): Region name\n credentials (Union[AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser]): Choose How to Authenticate to AWS.\n bucket_name (str): Name of the bucket\n bucket_prefix (str): S3 prefix\n lakeformation_database_name (Optional[str]): Which database to use\n """\n self.aws_account_id = check.opt_str_param(aws_account_id, "aws_account_id")\n self.region = check.str_param(region, "region")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser),\n )\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.bucket_prefix = check.str_param(bucket_prefix, "bucket_prefix")\n self.lakeformation_database_name = check.opt_str_param(\n lakeformation_database_name, "lakeformation_database_name"\n )\n super().__init__("Aws Datalake", name)
\n\n\n
[docs]class MssqlDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_method: Union[\n "MssqlDestination.Unencrypted",\n "MssqlDestination.EncryptedTrustServerCertificate",\n "MssqlDestination.EncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the MSSQL database.\n port (int): The port of the MSSQL database.\n database (str): The name of the MSSQL database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlDestination.Unencrypted, MssqlDestination.EncryptedTrustServerCertificate, MssqlDestination.EncryptedVerifyCertificate]): The encryption method which is used to communicate with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlDestination.Unencrypted,\n MssqlDestination.EncryptedTrustServerCertificate,\n MssqlDestination.EncryptedVerifyCertificate,\n ),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class PubsubDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, topic_id: str, credentials_json: str):\n """Airbyte Destination for Pubsub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pubsub\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target PubSub.\n topic_id (str): The PubSub topic ID in the given GCP project ID.\n credentials_json (str): The contents of the JSON service account key. Check out the docs if you need help generating this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.topic_id = check.str_param(topic_id, "topic_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Pubsub", name)
\n\n\n
[docs]class R2Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "R2Destination.NoCompression",\n "R2Destination.Deflate",\n "R2Destination.Bzip2",\n "R2Destination.Xz",\n "R2Destination.Zstandard",\n "R2Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n R2Destination.NoCompression,\n R2Destination.Deflate,\n R2Destination.Bzip2,\n R2Destination.Xz,\n R2Destination.Zstandard,\n R2Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n access_key_id: str,\n secret_access_key: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n format: Union[\n "R2Destination.AvroApacheAvro",\n "R2Destination.CSVCommaSeparatedValues",\n "R2Destination.JSONLinesNewlineDelimitedJSON",\n ],\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for R2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/r2\n\n Args:\n name (str): The name of the destination.\n account_id (str): Cloudflare account ID\n access_key_id (str): The access key ID to access the R2 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (str): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the R2 bucket. Read more here.\n s3_bucket_path (str): Directory under the R2 bucket where data will be written.\n format (Union[R2Destination.AvroApacheAvro, R2Destination.CSVCommaSeparatedValues, R2Destination.JSONLinesNewlineDelimitedJSON]): Format of the data output. See here for more details\n s3_path_format (Optional[str]): Format string on how data will be organized inside the R2 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the R2 staging file(s)\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.format = check.inst_param(\n format,\n "format",\n (\n R2Destination.AvroApacheAvro,\n R2Destination.CSVCommaSeparatedValues,\n R2Destination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("R2", name)
\n\n\n
[docs]class JdbcDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted url. See the standard here.\n schema (Optional[str]): If you leave the schema unspecified, JDBC defaults to a schema named "public".\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.schema = check.opt_str_param(schema, "schema")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class KeenDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, api_key: str, infer_timestamp: Optional[bool] = None\n ):\n """Airbyte Destination for Keen.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/keen\n\n Args:\n name (str): The name of the destination.\n project_id (str): To get Keen Project ID, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n api_key (str): To get Keen Master API Key, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n infer_timestamp (Optional[bool]): Allow connector to guess keen.timestamp value based on the streamed data.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.api_key = check.str_param(api_key, "api_key")\n self.infer_timestamp = check.opt_bool_param(infer_timestamp, "infer_timestamp")\n super().__init__("Keen", name)
\n\n\n
[docs]class TidbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Tidb", name)
\n\n\n
[docs]class FirestoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, credentials_json: Optional[str] = None):\n """Airbyte Destination for Firestore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firestore\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n super().__init__("Firestore", name)
\n\n\n
[docs]class ScyllaDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Scylla.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scylla\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Scylla keyspace to create data in.\n username (str): Username to use to access Scylla.\n password (str): Password associated with Scylla.\n address (str): Address to connect to.\n port (int): Port of Scylla.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Scylla", name)
\n\n\n
[docs]class RedisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, host: str, port: int, username: str, password: str, cache_type: str\n ):\n """Airbyte Destination for Redis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redis\n\n Args:\n name (str): The name of the destination.\n host (str): Redis host to connect to.\n port (int): Port of Redis.\n username (str): Username associated with Redis.\n password (str): Password associated with Redis.\n cache_type (str): Redis cache type to store data in.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.cache_type = check.str_param(cache_type, "cache_type")\n super().__init__("Redis", name)
\n\n\n
[docs]class MqttDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n broker_host: str,\n broker_port: int,\n use_tls: bool,\n topic_pattern: str,\n publisher_sync: bool,\n connect_timeout: int,\n automatic_reconnect: bool,\n clean_session: bool,\n message_retained: bool,\n message_qos: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n topic_test: Optional[str] = None,\n client: Optional[str] = None,\n ):\n """Airbyte Destination for Mqtt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mqtt\n\n Args:\n name (str): The name of the destination.\n broker_host (str): Host of the broker to connect to.\n broker_port (int): Port of the broker.\n use_tls (bool): Whether to use TLS encryption on the connection.\n username (Optional[str]): User name to use for the connection.\n password (Optional[str]): Password to use for the connection.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n client (Optional[str]): A client identifier that is unique on the server being connected to.\n publisher_sync (bool): Wait synchronously until the record has been sent to the broker.\n connect_timeout (int): Maximum time interval (in seconds) the client will wait for the network connection to the MQTT server to be established.\n automatic_reconnect (bool): Whether the client will automatically attempt to reconnect to the server if the connection is lost.\n clean_session (bool): Whether the client and server should remember state across restarts and reconnects.\n message_retained (bool): Whether or not the publish message should be retained by the messaging engine.\n message_qos (str): Quality of service used for each message to be delivered.\n """\n self.broker_host = check.str_param(broker_host, "broker_host")\n self.broker_port = check.int_param(broker_port, "broker_port")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.client = check.opt_str_param(client, "client")\n self.publisher_sync = check.bool_param(publisher_sync, "publisher_sync")\n self.connect_timeout = check.int_param(connect_timeout, "connect_timeout")\n self.automatic_reconnect = check.bool_param(automatic_reconnect, "automatic_reconnect")\n self.clean_session = check.bool_param(clean_session, "clean_session")\n self.message_retained = check.bool_param(message_retained, "message_retained")\n self.message_qos = check.str_param(message_qos, "message_qos")\n super().__init__("Mqtt", name)
\n\n\n
[docs]class RedshiftDestination(GeneratedAirbyteDestination):\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class S3Staging:\n
[docs] @public\n def __init__(\n self,\n s3_bucket_name: str,\n s3_bucket_region: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "RedshiftDestination.NoEncryption", "RedshiftDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_path: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n self.method = "S3 Staging"\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.opt_str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (RedshiftDestination.NoEncryption, RedshiftDestination.AESCBCEnvelopeEncryption),\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n username: str,\n password: str,\n database: str,\n schema: str,\n uploading_method: Union["RedshiftDestination.Standard", "RedshiftDestination.S3Staging"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com)\n port (int): Port of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n uploading_method (Union[RedshiftDestination.Standard, RedshiftDestination.S3Staging]): The method how the data will be uploaded to the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.uploading_method = check.inst_param(\n uploading_method,\n "uploading_method",\n (RedshiftDestination.Standard, RedshiftDestination.S3Staging),\n )\n super().__init__("Redshift", name)
\n\n\n
[docs]class PulsarDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n brokers: str,\n use_tls: bool,\n topic_type: str,\n topic_tenant: str,\n topic_namespace: str,\n topic_pattern: str,\n compression_type: str,\n send_timeout_ms: int,\n max_pending_messages: int,\n max_pending_messages_across_partitions: int,\n batching_enabled: bool,\n batching_max_messages: int,\n batching_max_publish_delay: int,\n block_if_queue_full: bool,\n topic_test: Optional[str] = None,\n producer_name: Optional[str] = None,\n producer_sync: Optional[bool] = None,\n ):\n """Airbyte Destination for Pulsar.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pulsar\n\n Args:\n name (str): The name of the destination.\n brokers (str): A list of host/port pairs to use for establishing the initial connection to the Pulsar cluster.\n use_tls (bool): Whether to use TLS encryption on the connection.\n topic_type (str): It identifies type of topic. Pulsar supports two kind of topics: persistent and non-persistent. In persistent topic, all messages are durably persisted on disk (that means on multiple disks unless the broker is standalone), whereas non-persistent topic does not persist message into storage disk.\n topic_tenant (str): The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters.\n topic_namespace (str): The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the namespace level. Each tenant has one or multiple namespaces.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n producer_name (Optional[str]): Name for the producer. 
If not filled, the system will generate a globally unique name which can be accessed with.\n producer_sync (Optional[bool]): Wait synchronously until the record has been sent to Pulsar.\n compression_type (str): Compression type for the producer.\n send_timeout_ms (int): If a message is not acknowledged by a server before the send-timeout expires, an error occurs (in ms).\n max_pending_messages (int): The maximum size of a queue holding pending messages.\n max_pending_messages_across_partitions (int): The maximum number of pending messages across partitions.\n batching_enabled (bool): Control whether automatic batching of messages is enabled for the producer.\n batching_max_messages (int): Maximum number of messages permitted in a batch.\n batching_max_publish_delay (int): Time period in milliseconds within which the messages sent will be batched.\n block_if_queue_full (bool): If the send operation should block when the outgoing message queue is full.\n """\n self.brokers = check.str_param(brokers, "brokers")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.topic_type = check.str_param(topic_type, "topic_type")\n self.topic_tenant = check.str_param(topic_tenant, "topic_tenant")\n self.topic_namespace = check.str_param(topic_namespace, "topic_namespace")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.producer_name = check.opt_str_param(producer_name, "producer_name")\n self.producer_sync = check.opt_bool_param(producer_sync, "producer_sync")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.send_timeout_ms = check.int_param(send_timeout_ms, "send_timeout_ms")\n self.max_pending_messages = check.int_param(max_pending_messages, "max_pending_messages")\n self.max_pending_messages_across_partitions = check.int_param(\n max_pending_messages_across_partitions, "max_pending_messages_across_partitions"\n )\n self.batching_enabled 
= check.bool_param(batching_enabled, "batching_enabled")\n self.batching_max_messages = check.int_param(batching_max_messages, "batching_max_messages")\n self.batching_max_publish_delay = check.int_param(\n batching_max_publish_delay, "batching_max_publish_delay"\n )\n self.block_if_queue_full = check.bool_param(block_if_queue_full, "block_if_queue_full")\n super().__init__("Pulsar", name)
\n\n\n
[docs]class SnowflakeDestination(GeneratedAirbyteDestination):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class KeyPairAuthentication:\n
[docs] @public\n def __init__(\n self,\n private_key: str,\n auth_type: Optional[str] = None,\n private_key_password: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.private_key = check.str_param(private_key, "private_key")\n self.private_key_password = check.opt_str_param(\n private_key_password, "private_key_password"\n )
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, password: str):\n self.password = check.str_param(password, "password")
\n\n
[docs] class SelectAnotherOption:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class RecommendedInternalStaging:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class AWSS3Staging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n s3_bucket_name: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "SnowflakeDestination.NoEncryption", "SnowflakeDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_region: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n file_name_pattern: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_region = check.opt_str_param(s3_bucket_region, "s3_bucket_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (SnowflakeDestination.NoEncryption, SnowflakeDestination.AESCBCEnvelopeEncryption),\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class GoogleCloudStorageStaging:\n
[docs] @public\n def __init__(self, method: str, project_id: str, bucket_name: str, credentials_json: str):\n self.method = check.str_param(method, "method")\n self.project_id = check.str_param(project_id, "project_id")\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] class AzureBlobStorageStaging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n username: str,\n credentials: Union[\n "SnowflakeDestination.OAuth20",\n "SnowflakeDestination.KeyPairAuthentication",\n "SnowflakeDestination.UsernameAndPassword",\n ],\n loading_method: Union[\n "SnowflakeDestination.SelectAnotherOption",\n "SnowflakeDestination.RecommendedInternalStaging",\n "SnowflakeDestination.AWSS3Staging",\n "SnowflakeDestination.GoogleCloudStorageStaging",\n "SnowflakeDestination.AzureBlobStorageStaging",\n ],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): Enter your Snowflake account's locator (in the format ...snowflakecomputing.com)\n role (str): Enter the role that you want to use to access Snowflake\n warehouse (str): Enter the name of the warehouse that you want to sync data into\n database (str): Enter the name of the database you want to sync data into\n schema (str): Enter the name of the default schema\n username (str): Enter the name of the user you want to use to access the database\n jdbc_url_params (Optional[str]): Enter the additional properties to pass to the JDBC URL string when connecting to the database (formatted as key=value pairs separated by the symbol &). 
Example: key1=value1&key2=value2&key3=value3\n loading_method (Union[SnowflakeDestination.SelectAnotherOption, SnowflakeDestination.RecommendedInternalStaging, SnowflakeDestination.AWSS3Staging, SnowflakeDestination.GoogleCloudStorageStaging, SnowflakeDestination.AzureBlobStorageStaging]): Select a data staging method\n """\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n SnowflakeDestination.OAuth20,\n SnowflakeDestination.KeyPairAuthentication,\n SnowflakeDestination.UsernameAndPassword,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n SnowflakeDestination.SelectAnotherOption,\n SnowflakeDestination.RecommendedInternalStaging,\n SnowflakeDestination.AWSS3Staging,\n SnowflakeDestination.GoogleCloudStorageStaging,\n SnowflakeDestination.AzureBlobStorageStaging,\n ),\n )\n super().__init__("Snowflake", name)
\n\n\n
[docs]class PostgresDestination(GeneratedAirbyteDestination):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(self, ca_certificate: str, client_key_password: Optional[str] = None):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: str,\n client_key: str,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.str_param(client_certificate, "client_certificate")\n self.client_key = check.str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_mode: Union[\n "PostgresDestination.Disable",\n "PostgresDestination.Allow",\n "PostgresDestination.Prefer",\n "PostgresDestination.Require",\n "PostgresDestination.VerifyCa",\n "PostgresDestination.VerifyFull",\n ],\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresDestination.Disable, PostgresDestination.Allow, PostgresDestination.Prefer, PostgresDestination.Require, PostgresDestination.VerifyCa, PostgresDestination.VerifyFull]): SSL connection modes. disable - Chose this mode to disable encryption of communication between Airbyte and destination database allow - Chose this mode to enable encryption only when required by the source database prefer - Chose this mode to allow unencrypted connection only if the source database does not support encryption require - Chose this mode to always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Chose this mode to always require encryption and to verify that the source database server has a valid SSL certificate verify-full - This is the most secure mode. 
Chose this mode to always require encryption and to verify the identity of the source database server See more information - in the docs.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresDestination.Disable,\n PostgresDestination.Allow,\n PostgresDestination.Prefer,\n PostgresDestination.Require,\n PostgresDestination.VerifyCa,\n PostgresDestination.VerifyFull,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Postgres", name)
\n\n\n
[docs]class ScaffoldDestinationPythonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, TODO: Optional[str] = None):\n """Airbyte Destination for Scaffold Destination Python.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scaffold-destination-python\n\n Args:\n name (str): The name of the destination.\n TODO (Optional[str]): FIX ME\n """\n self.TODO = check.opt_str_param(TODO, "TODO")\n super().__init__("Scaffold Destination Python", name)
\n\n\n
[docs]class LocalJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Local Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-json\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where json files will be written. The files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Local Json", name)
\n\n\n
[docs]class MeilisearchDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, host: str, api_key: Optional[str] = None):\n """Airbyte Destination for Meilisearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/meilisearch\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the MeiliSearch instance.\n api_key (Optional[str]): MeiliSearch API Key. See the docs for more information on how to obtain this key.\n """\n self.host = check.str_param(host, "host")\n self.api_key = check.opt_str_param(api_key, "api_key")\n super().__init__("Meilisearch", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/destinations", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.destinations"}, "sources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.sources

\n# ruff: noqa: A001, A002\nfrom typing import List, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteSource\n\n\n
[docs]class StravaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n athlete_id: int,\n start_date: str,\n auth_type: Optional[str] = None,\n ):\n """Airbyte Source for Strava.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/strava\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Strava developer application.\n client_secret (str): The Client Secret of your Strava developer application.\n refresh_token (str): The Refresh Token with the activity: read_all permissions.\n athlete_id (int): The Athlete ID of your Strava developer application.\n start_date (str): UTC date and time. Any data before this date will not be replicated.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.athlete_id = check.int_param(athlete_id, "athlete_id")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Strava", name)
\n\n\n
[docs]class AppsflyerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n app_id: str,\n api_token: str,\n start_date: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Appsflyer.\n\n Args:\n name (str): The name of the destination.\n app_id (str): App identifier as found in AppsFlyer.\n api_token (str): Pull API token for authentication. If you change the account admin, the token changes, and you must update scripts with the new token. Get the API token in the Dashboard.\n start_date (str): The default value to use if no bookmark exists for an endpoint. Raw Reports historical lookback is limited to 90 days.\n timezone (Optional[str]): Time zone in which date times are stored. The project timezone may be found in the App settings in the AppsFlyer console.\n """\n self.app_id = check.str_param(app_id, "app_id")\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n super().__init__("Appsflyer", name)
\n\n\n
[docs]class GoogleWorkspaceAdminReportsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, credentials_json: str, email: str, lookback: Optional[int] = None\n ):\n """Airbyte Source for Google Workspace Admin Reports.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-workspace-admin-reports\n\n Args:\n name (str): The name of the destination.\n credentials_json (str): The contents of the JSON service account key. See the docs for more information on how to generate this key.\n email (str): The email of the user, who has permissions to access the Google Workspace Admin APIs.\n lookback (Optional[int]): Sets the range of time shown in the report. The maximum value allowed by the Google API is 180 days.\n """\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")\n self.lookback = check.opt_int_param(lookback, "lookback")\n super().__init__("Google Workspace Admin Reports", name)
\n\n\n
[docs]class CartSource(GeneratedAirbyteSource):\n
[docs] class CentralAPIRouter:\n
[docs] @public\n def __init__(self, user_name: str, user_secret: str, site_id: str):\n self.auth_type = "CENTRAL_API_ROUTER"\n self.user_name = check.str_param(user_name, "user_name")\n self.user_secret = check.str_param(user_secret, "user_secret")\n self.site_id = check.str_param(site_id, "site_id")
\n\n
[docs] class SingleStoreAccessToken:\n
[docs] @public\n def __init__(self, access_token: str, store_name: str):\n self.auth_type = "SINGLE_STORE_ACCESS_TOKEN"\n self.access_token = check.str_param(access_token, "access_token")\n self.store_name = check.str_param(store_name, "store_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["CartSource.CentralAPIRouter", "CartSource.SingleStoreAccessToken"],\n start_date: str,\n ):\n """Airbyte Source for Cart.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cart\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (CartSource.CentralAPIRouter, CartSource.SingleStoreAccessToken),\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Cart", name)
\n\n\n
[docs]class LinkedinAdsSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["LinkedinAdsSource.OAuth20", "LinkedinAdsSource.AccessToken"],\n start_date: str,\n account_ids: Optional[List[int]] = None,\n ):\n """Airbyte Source for Linkedin Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-ads\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2020-09-17. Any data before this date will not be replicated.\n account_ids (Optional[List[int]]): Specify the account IDs separated by a space, to pull the data from. Leave empty, if you want to pull the data from all associated accounts. See the LinkedIn Ads docs for more info.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (LinkedinAdsSource.OAuth20, LinkedinAdsSource.AccessToken)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.account_ids = check.opt_nullable_list_param(account_ids, "account_ids", int)\n super().__init__("Linkedin Ads", name)
\n\n\n
[docs]class MongodbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n user: str,\n password: str,\n auth_source: str,\n replica_set: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb\n\n Args:\n name (str): The name of the destination.\n host (str): Host of a Mongo database to be replicated.\n port (int): Port of a Mongo database to be replicated.\n database (str): Database to be replicated.\n user (str): User\n password (str): Password\n auth_source (str): Authentication source where user information is stored. See the Mongo docs for more info.\n replica_set (Optional[str]): The name of the set to filter servers by, when connecting to a replica set (Under this condition, the 'TLS connection' value automatically becomes 'true'). See the Mongo docs for more info.\n ssl (Optional[bool]): If this switch is enabled, TLS connections will be used to connect to MongoDB.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.user = check.str_param(user, "user")\n self.password = check.str_param(password, "password")\n self.auth_source = check.str_param(auth_source, "auth_source")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Mongodb", name)
\n\n\n
[docs]class TimelySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, account_id: str, start_date: str, bearer_token: str):\n """Airbyte Source for Timely.\n\n Args:\n name (str): The name of the destination.\n account_id (str): Timely account id\n start_date (str): start date\n bearer_token (str): Timely bearer token\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.bearer_token = check.str_param(bearer_token, "bearer_token")\n super().__init__("Timely", name)
\n\n\n
[docs]class StockTickerApiTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, stock_ticker: str, api_key: str):\n """Airbyte Source for Stock Ticker Api Tutorial.\n\n Documentation can be found at https://polygon.io/docs/stocks/get_v2_aggs_grouped_locale_us_market_stocks__date\n\n Args:\n name (str): The name of the destination.\n stock_ticker (str): The stock ticker to track\n api_key (str): The Polygon.io Stocks API key to use to hit the API.\n """\n self.stock_ticker = check.str_param(stock_ticker, "stock_ticker")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Stock Ticker Api Tutorial", name)
\n\n\n
[docs]class WrikeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, wrike_instance: str, start_date: Optional[str] = None\n ):\n """Airbyte Source for Wrike.\n\n Args:\n name (str): The name of the destination.\n access_token (str): Permanent access token. You can find documentation on how to acquire a permanent access token here\n wrike_instance (str): Wrike's instance such as `app-us2.wrike.com`\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Only comments after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.wrike_instance = check.str_param(wrike_instance, "wrike_instance")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Wrike", name)
\n\n\n
[docs]class CommercetoolsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n host: str,\n start_date: str,\n project_key: str,\n client_id: str,\n client_secret: str,\n ):\n """Airbyte Source for Commercetools.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/commercetools\n\n Args:\n name (str): The name of the destination.\n region (str): The region of the platform.\n host (str): The cloud provider your shop is hosted. See: https://docs.commercetools.com/api/authorization\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n project_key (str): The project key\n client_id (str): Id of API Client.\n client_secret (str): The password of secret of API Client.\n """\n self.region = check.str_param(region, "region")\n self.host = check.str_param(host, "host")\n self.start_date = check.str_param(start_date, "start_date")\n self.project_key = check.str_param(project_key, "project_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Commercetools", name)
\n\n\n
[docs]class GutendexSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n author_year_start: Optional[str] = None,\n author_year_end: Optional[str] = None,\n copyright: Optional[str] = None,\n languages: Optional[str] = None,\n search: Optional[str] = None,\n sort: Optional[str] = None,\n topic: Optional[str] = None,\n ):\n """Airbyte Source for Gutendex.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gutendex\n\n Args:\n name (str): The name of the destination.\n author_year_start (Optional[str]): (Optional) Defines the minimum birth year of the authors. Books by authors born prior to the start year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n author_year_end (Optional[str]): (Optional) Defines the maximum birth year of the authors. Books by authors born after the end year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n copyright (Optional[str]): (Optional) Use this to find books with a certain copyright status - true for books with existing copyrights, false for books in the public domain in the USA, or null for books with no available copyright information.\n languages (Optional[str]): (Optional) Use this to find books in any of a list of languages. They must be comma-separated, two-character language codes.\n search (Optional[str]): (Optional) Use this to search author names and book titles with given words. They must be separated by a space (i.e. 
%20 in URL-encoded format) and are case-insensitive.\n sort (Optional[str]): (Optional) Use this to sort books - ascending for Project Gutenberg ID numbers from lowest to highest, descending for IDs highest to lowest, or popular (the default) for most popular to least popular by number of downloads.\n topic (Optional[str]): (Optional) Use this to search for a case-insensitive key-phrase in books' bookshelves or subjects.\n """\n self.author_year_start = check.opt_str_param(author_year_start, "author_year_start")\n self.author_year_end = check.opt_str_param(author_year_end, "author_year_end")\n self.copyright = check.opt_str_param(copyright, "copyright")\n self.languages = check.opt_str_param(languages, "languages")\n self.search = check.opt_str_param(search, "search")\n self.sort = check.opt_str_param(sort, "sort")\n self.topic = check.opt_str_param(topic, "topic")\n super().__init__("Gutendex", name)
\n\n\n
[docs]class IterableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Iterable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/iterable\n\n Args:\n name (str): The name of the destination.\n api_key (str): Iterable API Key. See the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for Iterable, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Iterable", name)
\n\n\n
[docs]class QuickbooksSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n realm_id: str,\n user_agent: str,\n start_date: str,\n sandbox: bool,\n ):\n """Airbyte Source for Quickbooks Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/quickbooks\n\n Args:\n name (str): The name of the destination.\n client_id (str): Identifies which app is making the request. Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n client_secret (str): Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n refresh_token (str): A token used when refreshing the access token.\n realm_id (str): Labeled Company ID. The Make API Calls panel is populated with the realm id and the current access token.\n user_agent (str): Process and email for API logging purposes. Example: tap-quickbooks .\n start_date (str): The default value to use if no bookmark exists for an endpoint (rfc3339 date string). E.g, 2021-03-20T00:00:00Z. Any data before this date will not be replicated.\n sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.realm_id = check.str_param(realm_id, "realm_id")\n self.user_agent = check.str_param(user_agent, "user_agent")\n self.start_date = check.str_param(start_date, "start_date")\n self.sandbox = check.bool_param(sandbox, "sandbox")\n super().__init__("Quickbooks Singer", name)
\n\n\n
[docs]class BigcommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, store_hash: str, access_token: str):\n """Airbyte Source for Bigcommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigcommerce\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n store_hash (str): The hash code of the store. For https://api.bigcommerce.com/stores/HASH_CODE/v3/, The store's hash code is 'HASH_CODE'.\n access_token (str): Access Token for making authenticated requests.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.store_hash = check.str_param(store_hash, "store_hash")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Bigcommerce", name)
\n\n\n
[docs]class ShopifySource(GeneratedAirbyteSource):\n
[docs] class APIPassword:\n
[docs] @public\n def __init__(self, api_password: str):\n self.auth_method = "api_password"\n self.api_password = check.str_param(api_password, "api_password")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n credentials: Union["ShopifySource.APIPassword", "ShopifySource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Shopify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/shopify\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of your Shopify store found in the URL. For example, if your URL was https://NAME.myshopify.com, then the name would be 'NAME'.\n credentials (Union[ShopifySource.APIPassword, ShopifySource.OAuth20]): The authorization method to use to retrieve data from Shopify\n start_date (str): The date you would like to replicate data from. Format: YYYY-MM-DD. Any data before this date will not be replicated.\n """\n self.shop = check.str_param(shop, "shop")\n self.credentials = check.inst_param(\n credentials, "credentials", (ShopifySource.APIPassword, ShopifySource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shopify", name)
\n\n\n
[docs]class AppstoreSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, key_id: str, private_key: str, issuer_id: str, vendor: str, start_date: str\n ):\n """Airbyte Source for Appstore Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appstore\n\n Args:\n name (str): The name of the destination.\n key_id (str): Appstore Key ID. See the docs for more information on how to obtain this key.\n private_key (str): Appstore Private Key. See the docs for more information on how to obtain this key.\n issuer_id (str): Appstore Issuer ID. See the docs for more information on how to obtain this ID.\n vendor (str): Appstore Vendor ID. See the docs for more information on how to obtain this ID.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.key_id = check.str_param(key_id, "key_id")\n self.private_key = check.str_param(private_key, "private_key")\n self.issuer_id = check.str_param(issuer_id, "issuer_id")\n self.vendor = check.str_param(vendor, "vendor")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Appstore Singer", name)
\n\n\n
[docs]class GreenhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Greenhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/greenhouse\n\n Args:\n name (str): The name of the destination.\n api_key (str): Greenhouse API Key. See the docs for more information on how to generate this key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Greenhouse", name)
\n\n\n
[docs]class ZoomSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, jwt: str):\n """Airbyte Source for Zoom Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoom\n\n Args:\n name (str): The name of the destination.\n jwt (str): Zoom JWT Token. See the docs for more information on how to obtain this key.\n """\n self.jwt = check.str_param(jwt, "jwt")\n super().__init__("Zoom Singer", name)
\n\n\n
[docs]class TiktokMarketingSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self, app_id: str, secret: str, access_token: str, auth_type: Optional[str] = None\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.app_id = check.str_param(app_id, "app_id")\n self.secret = check.str_param(secret, "secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class SandboxAccessToken:\n
[docs] @public\n def __init__(self, advertiser_id: str, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.advertiser_id = check.str_param(advertiser_id, "advertiser_id")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "TiktokMarketingSource.OAuth20", "TiktokMarketingSource.SandboxAccessToken"\n ],\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n report_granularity: Optional[str] = None,\n ):\n """Airbyte Source for Tiktok Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tiktok-marketing\n\n Args:\n name (str): The name of the destination.\n credentials (Union[TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken]): Authentication method\n start_date (Optional[str]): The Start Date in format: YYYY-MM-DD. Any data before this date will not be replicated. If this parameter is not set, all data will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DD. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the data till the current date.\n report_granularity (Optional[str]): The granularity used for aggregating performance data in reports. See the docs.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken),\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.report_granularity = check.opt_str_param(report_granularity, "report_granularity")\n super().__init__("Tiktok Marketing", name)
\n\n\n
[docs]class ZendeskChatSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.credentials = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["ZendeskChatSource.OAuth20", "ZendeskChatSource.AccessToken"],\n subdomain: Optional[str] = None,\n ):\n """Airbyte Source for Zendesk Chat.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-chat\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Chat API, in the format YYYY-MM-DDT00:00:00Z.\n subdomain (Optional[str]): Required if you access Zendesk Chat from a Zendesk Support subdomain.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskChatSource.OAuth20, ZendeskChatSource.AccessToken)\n )\n super().__init__("Zendesk Chat", name)
\n\n\n
[docs]class AwsCloudtrailSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, aws_key_id: str, aws_secret_key: str, aws_region_name: str, start_date: str\n ):\n """Airbyte Source for Aws Cloudtrail.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/aws-cloudtrail\n\n Args:\n name (str): The name of the destination.\n aws_key_id (str): AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.\n aws_secret_key (str): AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.\n aws_region_name (str): The default AWS Region to use, for example, us-west-1 or us-west-2. When specifying a Region inline during client initialization, this property is named region_name.\n start_date (str): The date you would like to replicate data. Data in AWS CloudTrail is available for last 90 days only. Format: YYYY-MM-DD.\n """\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.aws_region_name = check.str_param(aws_region_name, "aws_region_name")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Aws Cloudtrail", name)
\n\n\n
[docs]class OktaSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["OktaSource.OAuth20", "OktaSource.APIToken"],\n domain: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Okta.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/okta\n\n Args:\n name (str): The name of the destination.\n domain (Optional[str]): The Okta domain. See the docs for instructions on how to find it.\n start_date (Optional[str]): UTC date and time in the format YYYY-MM-DDTHH:MM:SSZ. Any data before this date will not be replicated.\n """\n self.domain = check.opt_str_param(domain, "domain")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (OktaSource.OAuth20, OktaSource.APIToken)\n )\n super().__init__("Okta", name)
\n\n\n
[docs]class InsightlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: Optional[str] = None, start_date: Optional[str] = None):\n """Airbyte Source for Insightly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/insightly\n\n Args:\n name (str): The name of the destination.\n token (Optional[str]): Your Insightly API token.\n start_date (Optional[str]): The date from which you'd like to replicate data for Insightly in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only for incremental streams.\n """\n self.token = check.opt_str_param(token, "token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Insightly", name)
\n\n\n
[docs]class LinkedinPagesSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n org_id: int,\n credentials: Union["LinkedinPagesSource.OAuth20", "LinkedinPagesSource.AccessToken"],\n ):\n """Airbyte Source for Linkedin Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-pages/\n\n Args:\n name (str): The name of the destination.\n org_id (int): Specify the Organization ID\n """\n self.org_id = check.int_param(org_id, "org_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (LinkedinPagesSource.OAuth20, LinkedinPagesSource.AccessToken),\n )\n super().__init__("Linkedin Pages", name)
\n\n\n
[docs]class PersistiqSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Persistiq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/persistiq\n\n Args:\n name (str): The name of the destination.\n api_key (str): PersistIq API Key. See the docs for more information on where to find that key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Persistiq", name)
\n\n\n
[docs]class FreshcallerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n start_date: str,\n requests_per_minute: Optional[int] = None,\n sync_lag_minutes: Optional[int] = None,\n ):\n """Airbyte Source for Freshcaller.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshcaller\n\n Args:\n name (str): The name of the destination.\n domain (str): Used to construct Base URL for the Freshcaller APIs\n api_key (str): Freshcaller API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (str): UTC date and time. Any data created after this date will be replicated.\n sync_lag_minutes (Optional[int]): Lag in minutes for each sync, i.e., at time T, data for the time range [prev_sync_time, T-30] will be fetched\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.str_param(start_date, "start_date")\n self.sync_lag_minutes = check.opt_int_param(sync_lag_minutes, "sync_lag_minutes")\n super().__init__("Freshcaller", name)
\n\n\n
[docs]class AppfollowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, ext_id: str, cid: str, api_secret: str, country: str):\n """Airbyte Source for Appfollow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appfollow\n\n Args:\n name (str): The name of the destination.\n ext_id (str): for App Store \u2014 this is 9-10 digits identification number; for Google Play \u2014 this is bundle name;\n cid (str): client id provided by Appfollow\n api_secret (str): api secret provided by Appfollow\n country (str): getting data by Country\n """\n self.ext_id = check.str_param(ext_id, "ext_id")\n self.cid = check.str_param(cid, "cid")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.country = check.str_param(country, "country")\n super().__init__("Appfollow", name)
\n\n\n
[docs]class FacebookPagesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, access_token: str, page_id: str):\n """Airbyte Source for Facebook Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-pages\n\n Args:\n name (str): The name of the destination.\n access_token (str): Facebook Page Access Token\n page_id (str): Page ID\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.page_id = check.str_param(page_id, "page_id")\n super().__init__("Facebook Pages", name)
\n\n\n
[docs]class JiraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n domain: str,\n email: str,\n projects: Optional[List[str]] = None,\n start_date: Optional[str] = None,\n additional_fields: Optional[List[str]] = None,\n expand_issue_changelog: Optional[bool] = None,\n render_fields: Optional[bool] = None,\n enable_experimental_streams: Optional[bool] = None,\n ):\n """Airbyte Source for Jira.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/jira\n\n Args:\n name (str): The name of the destination.\n api_token (str): Jira API Token. See the docs for more information on how to generate this key.\n domain (str): The Domain for your Jira account, e.g. airbyteio.atlassian.net\n email (str): The user email for your Jira account.\n projects (Optional[List[str]]): List of Jira project keys to replicate data for.\n start_date (Optional[str]): The date from which you'd like to replicate data for Jira in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only in the following incremental streams: issues.\n additional_fields (Optional[List[str]]): List of additional fields to include in replicating issues.\n expand_issue_changelog (Optional[bool]): Expand the changelog when replicating issues.\n render_fields (Optional[bool]): Render issue fields in HTML format in addition to Jira JSON-like format.\n enable_experimental_streams (Optional[bool]): Allow the use of experimental streams which rely on undocumented Jira API endpoints. 
See https://docs.airbyte.com/integrations/sources/jira#experimental-tables for more info.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain = check.str_param(domain, "domain")\n self.email = check.str_param(email, "email")\n self.projects = check.opt_nullable_list_param(projects, "projects", str)\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.additional_fields = check.opt_nullable_list_param(\n additional_fields, "additional_fields", str\n )\n self.expand_issue_changelog = check.opt_bool_param(\n expand_issue_changelog, "expand_issue_changelog"\n )\n self.render_fields = check.opt_bool_param(render_fields, "render_fields")\n self.enable_experimental_streams = check.opt_bool_param(\n enable_experimental_streams, "enable_experimental_streams"\n )\n super().__init__("Jira", name)
\n\n\n
[docs]class GoogleSheetsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: Union[\n "GoogleSheetsSource.AuthenticateViaGoogleOAuth",\n "GoogleSheetsSource.ServiceAccountKeyAuthentication",\n ],\n row_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): Enter the link to the Google spreadsheet you want to sync\n row_batch_size (Optional[int]): Number of rows fetched when making a Google Sheet API call. Defaults to 200.\n credentials (Union[GoogleSheetsSource.AuthenticateViaGoogleOAuth, GoogleSheetsSource.ServiceAccountKeyAuthentication]): Credentials for connecting to the Google Sheets API\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.row_batch_size = check.opt_int_param(row_batch_size, "row_batch_size")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleSheetsSource.AuthenticateViaGoogleOAuth,\n GoogleSheetsSource.ServiceAccountKeyAuthentication,\n ),\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DockerhubSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, docker_username: str):\n """Airbyte Source for Dockerhub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dockerhub\n\n Args:\n name (str): The name of the destination.\n docker_username (str): Username of DockerHub person or organization (for https://hub.docker.com/v2/repositories/USERNAME/ API call)\n """\n self.docker_username = check.str_param(docker_username, "docker_username")\n super().__init__("Dockerhub", name)
\n\n\n
[docs]class UsCensusSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, query_path: str, api_key: str, query_params: Optional[str] = None\n ):\n """Airbyte Source for Us Census.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/us-census\n\n Args:\n name (str): The name of the destination.\n query_params (Optional[str]): The query parameters portion of the GET request, without the api key\n query_path (str): The path portion of the GET request\n api_key (str): Your API Key. Get your key here.\n """\n self.query_params = check.opt_str_param(query_params, "query_params")\n self.query_path = check.str_param(query_path, "query_path")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Us Census", name)
\n\n\n
[docs]class KustomerSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, start_date: str):\n """Airbyte Source for Kustomer Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kustomer\n\n Args:\n name (str): The name of the destination.\n api_token (str): Kustomer API Token. See the docs on how to obtain this\n start_date (str): The date from which you'd like to replicate the data\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Kustomer Singer", name)
\n\n\n
[docs]class AzureTableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n storage_account_name: str,\n storage_access_key: str,\n storage_endpoint_suffix: Optional[str] = None,\n ):\n """Airbyte Source for Azure Table.\n\n Args:\n name (str): The name of the destination.\n storage_account_name (str): The name of your storage account.\n storage_access_key (str): Azure Table Storage Access Key. See the docs for more information on how to obtain this key.\n storage_endpoint_suffix (Optional[str]): Azure Table Storage service account URL suffix. See the docs for more information on how to obtain endpoint suffix\n """\n self.storage_account_name = check.str_param(storage_account_name, "storage_account_name")\n self.storage_access_key = check.str_param(storage_access_key, "storage_access_key")\n self.storage_endpoint_suffix = check.opt_str_param(\n storage_endpoint_suffix, "storage_endpoint_suffix"\n )\n super().__init__("Azure Table", name)
\n\n\n
[docs]class ScaffoldJavaJdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n replication_method: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Scaffold Java Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/scaffold_java_jdbc\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n replication_method (str): Replication method to use for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses the Binlog to detect inserts, updates, and deletes. This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.replication_method = check.str_param(replication_method, "replication_method")\n super().__init__("Scaffold Java Jdbc", name)
\n\n\n
[docs]class TidbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Tidb", name)
\n\n\n
[docs]class QualarooSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n survey_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Qualaroo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/qualaroo\n\n Args:\n name (str): The name of the destination.\n token (str): A Qualaroo token. See the docs for instructions on how to generate it.\n key (str): A Qualaroo token. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all surveys to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Qualaroo", name)
\n\n\n
[docs]class YahooFinancePriceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, tickers: str, interval: Optional[str] = None, range: Optional[str] = None\n ):\n """Airbyte Source for Yahoo Finance Price.\n\n Args:\n name (str): The name of the destination.\n tickers (str): Comma-separated identifiers for the stocks to be queried. Whitespaces are allowed.\n interval (Optional[str]): The interval of between prices queried.\n range (Optional[str]): The range of prices to be queried.\n """\n self.tickers = check.str_param(tickers, "tickers")\n self.interval = check.opt_str_param(interval, "interval")\n self.range = check.opt_str_param(range, "range")\n super().__init__("Yahoo Finance Price", name)
\n\n\n
[docs]class GoogleAnalyticsV4Source(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication",\n ],\n start_date: str,\n view_id: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics V4.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-universal-analytics\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth, GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication]): Credentials for the service\n start_date (str): The date in the format YYYY-MM-DD. Any data before this date will not be replicated.\n view_id (str): The ID for the Google Analytics View you want to fetch data from. This can be found from the Google Analytics Account Explorer.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth,\n GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication,\n ),\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.view_id = check.str_param(view_id, "view_id")\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics V4", name)
\n\n\n
[docs]class JdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted URL. See the standard here.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class FakerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n count: int,\n seed: Optional[int] = None,\n records_per_sync: Optional[int] = None,\n records_per_slice: Optional[int] = None,\n ):\n """Airbyte Source for Faker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/faker\n\n Args:\n name (str): The name of the destination.\n count (int): How many users should be generated in total. This setting does not apply to the purchases or products stream.\n seed (Optional[int]): Manually control the faker random seed to return the same values on subsequent runs (leave -1 for random)\n records_per_sync (Optional[int]): How many fake records will be returned for each sync, for each stream? By default, it will take 2 syncs to create the requested 1000 records.\n records_per_slice (Optional[int]): How many fake records will be in each page (stream slice), before a state message is emitted?\n """\n self.count = check.int_param(count, "count")\n self.seed = check.opt_int_param(seed, "seed")\n self.records_per_sync = check.opt_int_param(records_per_sync, "records_per_sync")\n self.records_per_slice = check.opt_int_param(records_per_slice, "records_per_slice")\n super().__init__("Faker", name)
\n\n\n
[docs]class TplcentralSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n url_base: str,\n client_id: str,\n client_secret: str,\n user_login_id: Optional[int] = None,\n user_login: Optional[str] = None,\n tpl_key: Optional[str] = None,\n customer_id: Optional[int] = None,\n facility_id: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Tplcentral.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tplcentral\n\n Args:\n name (str): The name of the destination.\n user_login_id (Optional[int]): User login ID and/or name is required\n user_login (Optional[str]): User login ID and/or name is required\n start_date (Optional[str]): Date and time together in RFC 3339 format, for example, 2018-11-13T20:20:39+00:00.\n """\n self.url_base = check.str_param(url_base, "url_base")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.user_login_id = check.opt_int_param(user_login_id, "user_login_id")\n self.user_login = check.opt_str_param(user_login, "user_login")\n self.tpl_key = check.opt_str_param(tpl_key, "tpl_key")\n self.customer_id = check.opt_int_param(customer_id, "customer_id")\n self.facility_id = check.opt_int_param(facility_id, "facility_id")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Tplcentral", name)
\n\n\n
[docs]class ClickhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): The host endpoint of the Clickhouse cluster.\n port (int): The port of the database.\n database (str): The name of the database.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class FreshserviceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str, start_date: str):\n """Airbyte Source for Freshservice.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshservice\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The name of your Freshservice domain\n api_key (str): Freshservice API Key. See here. The key is case sensitive.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Freshservice", name)
\n\n\n
[docs]class ZenloopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n date_from: Optional[str] = None,\n survey_id: Optional[str] = None,\n survey_group_id: Optional[str] = None,\n ):\n """Airbyte Source for Zenloop.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zenloop\n\n Args:\n name (str): The name of the destination.\n api_token (str): Zenloop API Token. You can get the API token in settings page here\n date_from (Optional[str]): Zenloop date_from. Format: 2021-10-24T03:30:30Z or 2021-10-24. Leave empty if only data from current data should be synced\n survey_id (Optional[str]): Zenloop Survey ID. Can be found here. Leave empty to pull answers from all surveys\n survey_group_id (Optional[str]): Zenloop Survey Group ID. Can be found by pulling All Survey Groups via SurveyGroups stream. Leave empty to pull answers from all survey groups\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.date_from = check.opt_str_param(date_from, "date_from")\n self.survey_id = check.opt_str_param(survey_id, "survey_id")\n self.survey_group_id = check.opt_str_param(survey_group_id, "survey_group_id")\n super().__init__("Zenloop", name)
\n\n\n
[docs]class OracleSource(GeneratedAirbyteSource):\n
[docs] class ServiceName:\n
[docs] @public\n def __init__(self, service_name: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.service_name = check.str_param(service_name, "service_name")
\n\n
[docs] class SystemIDSID:\n
[docs] @public\n def __init__(self, sid: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.sid = check.str_param(sid, "sid")
\n\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n connection_data: Union["OracleSource.ServiceName", "OracleSource.SystemIDSID"],\n username: str,\n encryption: Union[\n "OracleSource.Unencrypted",\n "OracleSource.NativeNetworkEncryptionNNE",\n "OracleSource.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database. Oracle Corporations recommends the following port numbers: 1521 - Default listening port for client connections to the listener. 2484 - Recommended and officially registered listening port for client connections to the listener using TCP/IP with SSL\n connection_data (Union[OracleSource.ServiceName, OracleSource.SystemIDSID]): Connect data that will be used for DB connection\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n encryption (Union[OracleSource.Unencrypted, OracleSource.NativeNetworkEncryptionNNE, OracleSource.TLSEncryptedVerifyCertificate]): The encryption method with is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.connection_data = check.inst_param(\n connection_data, "connection_data", (OracleSource.ServiceName, OracleSource.SystemIDSID)\n )\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleSource.Unencrypted,\n OracleSource.NativeNetworkEncryptionNNE,\n OracleSource.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class KlaviyoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Klaviyo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/klaviyo\n\n Args:\n name (str): The name of the destination.\n api_key (str): Klaviyo API Key. See our docs if you need help finding this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Klaviyo", name)
\n\n\n
[docs]class GoogleDirectorySource(GeneratedAirbyteSource):\n
[docs] class SignInViaGoogleOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n credentials_title: Optional[str] = None,\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKey:\n
[docs] @public\n def __init__(\n self, credentials_json: str, email: str, credentials_title: Optional[str] = None\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleDirectorySource.SignInViaGoogleOAuth", "GoogleDirectorySource.ServiceAccountKey"\n ],\n ):\n """Airbyte Source for Google Directory.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-directory\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey]): Google APIs use the OAuth 2.0 protocol for authentication and authorization. The Source supports Web server application and Service accounts scenarios.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey),\n )\n super().__init__("Google Directory", name)
\n\n\n
[docs]class InstagramSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Instagram.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/instagram\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for User Insights, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n access_token (str): The value of the access token generated. See the docs for more information\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Instagram", name)
\n\n\n
[docs]class ShortioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_id: str, secret_key: str, start_date: str):\n """Airbyte Source for Shortio.\n\n Documentation can be found at https://developers.short.io/reference\n\n Args:\n name (str): The name of the destination.\n secret_key (str): Short.io Secret Key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_id = check.str_param(domain_id, "domain_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shortio", name)
\n\n\n
[docs]class SquareSource(GeneratedAirbyteSource):\n
[docs] class OauthAuthentication:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Oauth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.auth_type = "Apikey"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n is_sandbox: bool,\n credentials: Union["SquareSource.OauthAuthentication", "SquareSource.APIKey"],\n start_date: Optional[str] = None,\n include_deleted_objects: Optional[bool] = None,\n ):\n """Airbyte Source for Square.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/square\n\n Args:\n name (str): The name of the destination.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n start_date (Optional[str]): UTC date in the format YYYY-MM-DD. Any data before this date will not be replicated. If not set, all data will be replicated.\n include_deleted_objects (Optional[bool]): In some streams there is an option to include deleted objects (Items, Categories, Discounts, Taxes)\n """\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.include_deleted_objects = check.opt_bool_param(\n include_deleted_objects, "include_deleted_objects"\n )\n self.credentials = check.inst_param(\n credentials, "credentials", (SquareSource.OauthAuthentication, SquareSource.APIKey)\n )\n super().__init__("Square", name)
\n\n\n
[docs]class DelightedSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, since: str, api_key: str):\n """Airbyte Source for Delighted.\n\n Args:\n name (str): The name of the destination.\n since (str): The date from which you'd like to replicate the data\n api_key (str): A Delighted API key.\n """\n self.since = check.str_param(since, "since")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Delighted", name)
\n\n\n
[docs]class AmazonSqsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n delete_messages: bool,\n max_batch_size: Optional[int] = None,\n max_wait_time: Optional[int] = None,\n attributes_to_return: Optional[str] = None,\n visibility_timeout: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n ):\n """Airbyte Source for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n delete_messages (bool): If Enabled, messages will be deleted from the SQS Queue after being read. If Disabled, messages are left in the queue and can be read more than once. WARNING: Enabling this option can result in data loss in cases of failure, use with caution, see documentation for more detail.\n max_batch_size (Optional[int]): Max amount of messages to get in one batch (10 max)\n max_wait_time (Optional[int]): Max amount of time in seconds to wait for messages in a single poll (20 max)\n attributes_to_return (Optional[str]): Comma separated list of Mesage Attribute names to return\n visibility_timeout (Optional[int]): Modify the Visibility Timeout of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for pulling messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for pulling messages\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.delete_messages = check.bool_param(delete_messages, "delete_messages")\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n self.max_wait_time = check.opt_int_param(max_wait_time, "max_wait_time")\n self.attributes_to_return = check.opt_str_param(\n attributes_to_return, "attributes_to_return"\n )\n 
self.visibility_timeout = check.opt_int_param(visibility_timeout, "visibility_timeout")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class YoutubeAnalyticsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaOAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(self, name: str, credentials: "YoutubeAnalyticsSource.AuthenticateViaOAuth20"):\n """Airbyte Source for Youtube Analytics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/youtube-analytics\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", YoutubeAnalyticsSource.AuthenticateViaOAuth20\n )\n super().__init__("Youtube Analytics", name)
\n\n\n
[docs]class ScaffoldSourcePythonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, fix_me: Optional[str] = None):\n """Airbyte Source for Scaffold Source Python.\n\n Args:\n name (str): The name of the destination.\n fix_me (Optional[str]): describe me\n """\n self.fix_me = check.opt_str_param(fix_me, "fix_me")\n super().__init__("Scaffold Source Python", name)
\n\n\n
[docs]class LookerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n client_id: str,\n client_secret: str,\n run_look_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Looker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/looker\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain for your Looker account, e.g. airbyte.cloud.looker.com,looker.[clientname].com,IP address\n client_id (str): The Client ID is first part of an API3 key that is specific to each Looker user. See the docs for more information on how to generate this key.\n client_secret (str): The Client Secret is second part of an API3 key.\n run_look_ids (Optional[List[str]]): The IDs of any Looks to run\n """\n self.domain = check.str_param(domain, "domain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.run_look_ids = check.opt_nullable_list_param(run_look_ids, "run_look_ids", str)\n super().__init__("Looker", name)
\n\n\n
[docs]class GitlabSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_url: str,\n private_token: str,\n start_date: str,\n groups: Optional[str] = None,\n projects: Optional[str] = None,\n ):\n """Airbyte Source for Gitlab.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gitlab\n\n Args:\n name (str): The name of the destination.\n api_url (str): Please enter your basic URL from GitLab instance.\n private_token (str): Log into your GitLab account and then generate a personal Access Token.\n groups (Optional[str]): Space-delimited list of groups. e.g. airbyte.io.\n projects (Optional[str]): Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.\n start_date (str): The date from which you'd like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_url = check.str_param(api_url, "api_url")\n self.private_token = check.str_param(private_token, "private_token")\n self.groups = check.opt_str_param(groups, "groups")\n self.projects = check.opt_str_param(projects, "projects")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gitlab", name)
\n\n\n
[docs]class ExchangeRatesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n access_key: str,\n base: Optional[str] = None,\n ignore_weekends: Optional[bool] = None,\n ):\n """Airbyte Source for Exchange Rates.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start getting data from that date.\n access_key (str): Your API Key. See here. The key is case sensitive.\n base (Optional[str]): ISO reference currency. See here. Free plan doesn't support Source Currency Switching, default base currency is EUR\n ignore_weekends (Optional[bool]): Ignore weekends? (Exchanges don't run on weekends)\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_key = check.str_param(access_key, "access_key")\n self.base = check.opt_str_param(base, "base")\n self.ignore_weekends = check.opt_bool_param(ignore_weekends, "ignore_weekends")\n super().__init__("Exchange Rates", name)
\n\n\n
[docs]class AmazonAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n region: Optional[str] = None,\n report_wait_timeout: Optional[int] = None,\n report_generation_max_retries: Optional[int] = None,\n start_date: Optional[str] = None,\n profiles: Optional[List[int]] = None,\n state_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Amazon Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-ads\n\n Args:\n name (str): The name of the destination.\n client_id (str): The client ID of your Amazon Ads developer application. See the docs for more information.\n client_secret (str): The client secret of your Amazon Ads developer application. See the docs for more information.\n refresh_token (str): Amazon Ads refresh token. See the docs for more information on how to obtain this token.\n region (Optional[str]): Region to pull data from (EU/NA/FE). See docs for more details.\n report_wait_timeout (Optional[int]): Timeout duration in minutes for Reports. Default is 60 minutes.\n report_generation_max_retries (Optional[int]): Maximum retries Airbyte will attempt for fetching report data. Default is 5.\n start_date (Optional[str]): The Start date for collecting reports, should not be more than 60 days in the past. In YYYY-MM-DD format\n profiles (Optional[List[int]]): Profile IDs you want to fetch data for. See docs for more details.\n state_filter (Optional[List[str]]): Reflects the state of the Display, Product, and Brand Campaign streams as enabled, paused, or archived. 
If you do not populate this field, it will be ignored completely.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.region = check.opt_str_param(region, "region")\n self.report_wait_timeout = check.opt_int_param(report_wait_timeout, "report_wait_timeout")\n self.report_generation_max_retries = check.opt_int_param(\n report_generation_max_retries, "report_generation_max_retries"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.profiles = check.opt_nullable_list_param(profiles, "profiles", int)\n self.state_filter = check.opt_nullable_list_param(state_filter, "state_filter", str)\n super().__init__("Amazon Ads", name)
\n\n\n
[docs]class MixpanelSource(GeneratedAirbyteSource):\n
[docs] class ServiceAccount:\n
[docs] @public\n def __init__(self, username: str, secret: str):\n self.username = check.str_param(username, "username")\n self.secret = check.str_param(secret, "secret")
\n\n
[docs] class ProjectSecret:\n
[docs] @public\n def __init__(self, api_secret: str):\n self.api_secret = check.str_param(api_secret, "api_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["MixpanelSource.ServiceAccount", "MixpanelSource.ProjectSecret"],\n project_id: Optional[int] = None,\n attribution_window: Optional[int] = None,\n project_timezone: Optional[str] = None,\n select_properties_by_default: Optional[bool] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n region: Optional[str] = None,\n date_window_size: Optional[int] = None,\n ):\n """Airbyte Source for Mixpanel.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mixpanel\n\n Args:\n name (str): The name of the destination.\n credentials (Union[MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret]): Choose how to authenticate to Mixpanel\n project_id (Optional[int]): Your project ID number. See the docs for more information on how to obtain this.\n attribution_window (Optional[int]): A period of time for attributing results to ads and the lookback period after those actions occur during which ad results are counted. Default attribution window is 5 days.\n project_timezone (Optional[str]): Time zone in which integer date times are stored. The project timezone may be found in the project settings in the Mixpanel console.\n select_properties_by_default (Optional[bool]): Setting this config parameter to TRUE ensures that new properties on events and engage records are captured. Otherwise new properties will be ignored.\n start_date (Optional[str]): The date in the format YYYY-MM-DD. Any data before this date will not be replicated. If this option is not set, the connector will replicate data from up to one year ago by default.\n end_date (Optional[str]): The date in the format YYYY-MM-DD. Any data after this date will not be replicated. 
Left empty to always sync to most recent date\n region (Optional[str]): The region of mixpanel domain instance either US or EU.\n date_window_size (Optional[int]): Defines window size in days, that used to slice through data. You can reduce it, if amount of data in each window is too big for your environment.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret),\n )\n self.project_id = check.opt_int_param(project_id, "project_id")\n self.attribution_window = check.opt_int_param(attribution_window, "attribution_window")\n self.project_timezone = check.opt_str_param(project_timezone, "project_timezone")\n self.select_properties_by_default = check.opt_bool_param(\n select_properties_by_default, "select_properties_by_default"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.region = check.opt_str_param(region, "region")\n self.date_window_size = check.opt_int_param(date_window_size, "date_window_size")\n super().__init__("Mixpanel", name)
\n\n\n
[docs]class OrbitSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, workspace: str, start_date: Optional[str] = None):\n """Airbyte Source for Orbit.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/orbit\n\n Args:\n name (str): The name of the destination.\n api_token (str): Authorizes you to work with Orbit workspaces associated with the token.\n workspace (str): The unique name of the workspace that your API token is associated with.\n start_date (Optional[str]): Date in the format 2022-06-26. Only load members whose last activities are after this date.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.workspace = check.str_param(workspace, "workspace")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Orbit", name)
\n\n\n
[docs]class AmazonSellerPartnerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lwa_app_id: str,\n lwa_client_secret: str,\n refresh_token: str,\n aws_access_key: str,\n aws_secret_key: str,\n role_arn: str,\n replication_start_date: str,\n aws_environment: str,\n region: str,\n app_id: Optional[str] = None,\n auth_type: Optional[str] = None,\n replication_end_date: Optional[str] = None,\n period_in_days: Optional[int] = None,\n report_options: Optional[str] = None,\n max_wait_seconds: Optional[int] = None,\n ):\n """Airbyte Source for Amazon Seller Partner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-seller-partner\n\n Args:\n name (str): The name of the destination.\n app_id (Optional[str]): Your Amazon App ID\n lwa_app_id (str): Your Login with Amazon Client ID.\n lwa_client_secret (str): Your Login with Amazon Client Secret.\n refresh_token (str): The Refresh Token obtained via OAuth flow authorization.\n aws_access_key (str): Specifies the AWS access key used as part of the credentials to authenticate the user.\n aws_secret_key (str): Specifies the AWS secret key used as part of the credentials to authenticate the user.\n role_arn (str): Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. (Needs permission to 'Assume Role' STS).\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n replication_end_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data after this date will not be replicated.\n period_in_days (Optional[int]): Will be used for stream slicing for initial full_refresh sync when no updated state is present for reports that support sliced incremental sync.\n report_options (Optional[str]): Additional information passed to reports. This varies by report type. 
Must be a valid json string.\n max_wait_seconds (Optional[int]): Sometimes report can take up to 30 minutes to generate. This will set the limit for how long to wait for a successful report.\n aws_environment (str): An enumeration.\n region (str): An enumeration.\n """\n self.app_id = check.opt_str_param(app_id, "app_id")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.lwa_app_id = check.str_param(lwa_app_id, "lwa_app_id")\n self.lwa_client_secret = check.str_param(lwa_client_secret, "lwa_client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.aws_access_key = check.str_param(aws_access_key, "aws_access_key")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.role_arn = check.str_param(role_arn, "role_arn")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.replication_end_date = check.opt_str_param(\n replication_end_date, "replication_end_date"\n )\n self.period_in_days = check.opt_int_param(period_in_days, "period_in_days")\n self.report_options = check.opt_str_param(report_options, "report_options")\n self.max_wait_seconds = check.opt_int_param(max_wait_seconds, "max_wait_seconds")\n self.aws_environment = check.str_param(aws_environment, "aws_environment")\n self.region = check.str_param(region, "region")\n super().__init__("Amazon Seller Partner", name)
\n\n\n
[docs]class CourierSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Courier.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/courier\n\n Args:\n name (str): The name of the destination.\n api_key (str): Courier API Key to retrieve your data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Courier", name)
\n\n\n
[docs]class CloseComSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: Optional[str] = None):\n r"""Airbyte Source for Close Com.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/close-com\n\n Args:\n name (str): The name of the destination.\n api_key (str): Close.com API key (usually starts with 'api\\\\_'; find yours here).\n start_date (Optional[str]): The start date to sync data. Leave blank for full sync. Format: YYYY-MM-DD.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Close Com", name)
\n\n\n
[docs]class BingAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n refresh_token: str,\n developer_token: str,\n reports_start_date: str,\n auth_method: Optional[str] = None,\n tenant_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n """Airbyte Source for Bing Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bing-ads\n\n Args:\n name (str): The name of the destination.\n tenant_id (Optional[str]): The Tenant ID of your Microsoft Advertising developer application. Set this to "common" unless you know you need a different value.\n client_id (str): The Client ID of your Microsoft Advertising developer application.\n client_secret (Optional[str]): The Client Secret of your Microsoft Advertising developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n developer_token (str): Developer token associated with user. See more info in the docs.\n reports_start_date (str): The start date from which to begin replicating report data. Any data generated before this date will not be replicated in reports. This is a UTC date in YYYY-MM-DD format.\n """\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.tenant_id = check.opt_str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.reports_start_date = check.str_param(reports_start_date, "reports_start_date")\n super().__init__("Bing Ads", name)
\n\n\n
[docs]class PrimetricSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, client_id: str, client_secret: str):\n """Airbyte Source for Primetric.\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Primetric developer application. The Client ID is visible here.\n client_secret (str): The Client Secret of your Primetric developer application. You can manage your client's credentials here.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Primetric", name)
\n\n\n
[docs]class PivotalTrackerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Pivotal Tracker.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Pivotal Tracker API token\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Pivotal Tracker", name)
\n\n\n
[docs]class ElasticsearchSource(GeneratedAirbyteSource):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchSource.None_",\n "ElasticsearchSource.ApiKeySecret",\n "ElasticsearchSource.UsernamePassword",\n ],\n ):\n r"""Airbyte Source for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n authenticationMethod (Union[ElasticsearchSource.None\\\\_, ElasticsearchSource.ApiKeySecret, ElasticsearchSource.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchSource.None_,\n ElasticsearchSource.ApiKeySecret,\n ElasticsearchSource.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class BigquerySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, credentials_json: str, dataset_id: Optional[str] = None\n ):\n """Airbyte Source for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n dataset_id (Optional[str]): The dataset ID to search for tables and views. If you are only loading data from one dataset, setting this option could result in much faster schema discovery.\n credentials_json (str): The contents of your Service Account Key JSON file. See the docs for more information on how to obtain this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.opt_str_param(dataset_id, "dataset_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Bigquery", name)
\n\n\n
[docs]class WoocommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n start_date: str,\n api_key: str,\n api_secret: str,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Woocommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/woocommerce\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of the store. For https://EXAMPLE.com, the shop name is 'EXAMPLE.com'.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n api_key (str): The CUSTOMER KEY for API in WooCommerce shop.\n api_secret (str): The CUSTOMER SECRET for API in WooCommerce shop.\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads.\n """\n self.shop = check.str_param(shop, "shop")\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Woocommerce", name)
\n\n\n
[docs]class SearchMetricsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_key: str, client_secret: str, country_code: str, start_date: str\n ):\n """Airbyte Source for Search Metrics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/seacrh-metrics\n\n Args:\n name (str): The name of the destination.\n country_code (str): The region of the S3 staging bucket to use if utilising a copy strategy.\n start_date (str): Data generated in SearchMetrics after this date will be replicated. This date must be specified in the format YYYY-MM-DDT00:00:00Z.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.country_code = check.str_param(country_code, "country_code")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Search Metrics", name)
\n\n\n
[docs]class TypeformSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, start_date: str, token: str, form_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Typeform.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/typeform\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format: YYYY-MM-DDTHH:mm:ss[Z]. Any data before this date will not be replicated.\n token (str): The API Token for a Typeform account.\n form_ids (Optional[List[str]]): When this parameter is set, the connector will replicate data only from the input forms. Otherwise, all forms in your Typeform account will be replicated. You can find form IDs in your form URLs. For example, in the URL "https://mysite.typeform.com/to/u6nXL7" the form_id is u6nXL7. You can find form URLs on Share panel\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.token = check.str_param(token, "token")\n self.form_ids = check.opt_nullable_list_param(form_ids, "form_ids", str)\n super().__init__("Typeform", name)
\n\n\n
[docs]class WebflowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, site_id: str, api_key: str):\n """Airbyte Source for Webflow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/webflow\n\n Args:\n name (str): The name of the destination.\n site_id (str): The id of the Webflow site you are requesting data from. See https://developers.webflow.com/#sites\n api_key (str): The API token for authenticating to Webflow. See https://university.webflow.com/lesson/intro-to-the-webflow-api\n """\n self.site_id = check.str_param(site_id, "site_id")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Webflow", name)
\n\n\n
[docs]class FireboltSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Source for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n super().__init__("Firebolt", name)
\n\n\n
[docs]class FaunaSource(GeneratedAirbyteSource):\n
[docs] class Disabled:\n
[docs] @public\n def __init__(\n self,\n ):\n self.deletion_mode = "ignore"
\n\n
[docs] class Enabled:\n
[docs] @public\n def __init__(self, column: str):\n self.deletion_mode = "deleted_field"\n self.column = check.str_param(column, "column")
\n\n
[docs] class Collection:\n
[docs] @public\n def __init__(\n self, page_size: int, deletions: Union["FaunaSource.Disabled", "FaunaSource.Enabled"]\n ):\n self.page_size = check.int_param(page_size, "page_size")\n self.deletions = check.inst_param(\n deletions, "deletions", (FaunaSource.Disabled, FaunaSource.Enabled)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n port: int,\n scheme: str,\n secret: str,\n collection: "FaunaSource.Collection",\n ):\n """Airbyte Source for Fauna.\n\n Documentation can be found at https://github.com/fauna/airbyte/blob/source-fauna/docs/integrations/sources/fauna.md\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain of Fauna to query. Defaults db.fauna.com. See the docs.\n port (int): Endpoint port.\n scheme (str): URL scheme.\n secret (str): Fauna secret, used when authenticating with the database.\n collection (FaunaSource.Collection): Settings for the Fauna Collection.\n """\n self.domain = check.str_param(domain, "domain")\n self.port = check.int_param(port, "port")\n self.scheme = check.str_param(scheme, "scheme")\n self.secret = check.str_param(secret, "secret")\n self.collection = check.inst_param(collection, "collection", FaunaSource.Collection)\n super().__init__("Fauna", name)
\n\n\n
[docs]class IntercomSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Intercom.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/intercom\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n access_token (str): Access token for making authenticated requests. See the Intercom docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Intercom", name)
\n\n\n
[docs]class FreshsalesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str):\n """Airbyte Source for Freshsales.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshsales\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The Name of your Freshsales domain\n api_key (str): Freshsales API Key. See here. The key is case sensitive.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Freshsales", name)
\n\n\n
[docs]class AdjustSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n dimensions: List[str],\n ingest_start: str,\n metrics: List[str],\n additional_metrics: Optional[List[str]] = None,\n until_today: Optional[bool] = None,\n ):\n """Airbyte Source for Adjust.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/adjust\n\n Args:\n name (str): The name of the destination.\n additional_metrics (Optional[List[str]]): Metrics names that are not pre-defined, such as cohort metrics or app specific metrics.\n api_token (str): Adjust API key, see https://help.adjust.com/en/article/report-service-api-authentication\n dimensions (List[str]): Dimensions allow a user to break down metrics into groups using one or several parameters. For example, the number of installs by date, country and network. See https://help.adjust.com/en/article/reports-endpoint#dimensions for more information about the dimensions.\n ingest_start (str): Data ingest start date.\n metrics (List[str]): Select at least one metric to query.\n until_today (Optional[bool]): Syncs data up until today. Useful when running daily incremental syncs, and duplicates are not desired.\n """\n self.additional_metrics = check.opt_nullable_list_param(\n additional_metrics, "additional_metrics", str\n )\n self.api_token = check.str_param(api_token, "api_token")\n self.dimensions = check.list_param(dimensions, "dimensions", str)\n self.ingest_start = check.str_param(ingest_start, "ingest_start")\n self.metrics = check.list_param(metrics, "metrics", str)\n self.until_today = check.opt_bool_param(until_today, "until_today")\n super().__init__("Adjust", name)
\n\n\n
[docs]class BambooHrSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n api_key: str,\n custom_reports_fields: Optional[str] = None,\n custom_reports_include_default_fields: Optional[bool] = None,\n ):\n """Airbyte Source for Bamboo Hr.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bamboo-hr\n\n Args:\n name (str): The name of the destination.\n subdomain (str): Sub Domain of bamboo hr\n api_key (str): Api key of bamboo hr\n custom_reports_fields (Optional[str]): Comma-separated list of fields to include in custom reports.\n custom_reports_include_default_fields (Optional[bool]): If true, the custom reports endpoint will include the default fields defined here: https://documentation.bamboohr.com/docs/list-of-field-names.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.api_key = check.str_param(api_key, "api_key")\n self.custom_reports_fields = check.opt_str_param(\n custom_reports_fields, "custom_reports_fields"\n )\n self.custom_reports_include_default_fields = check.opt_bool_param(\n custom_reports_include_default_fields, "custom_reports_include_default_fields"\n )\n super().__init__("Bamboo Hr", name)
\n\n\n
[docs]class GoogleAdsSource(GeneratedAirbyteSource):\n
[docs] class GoogleCredentials:\n
[docs] @public\n def __init__(\n self,\n developer_token: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class CustomGAQLQueriesEntry:\n
[docs] @public\n def __init__(self, query: str, table_name: str):\n self.query = check.str_param(query, "query")\n self.table_name = check.str_param(table_name, "table_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "GoogleAdsSource.GoogleCredentials",\n customer_id: str,\n start_date: str,\n end_date: Optional[str] = None,\n custom_queries: Optional[List[CustomGAQLQueriesEntry]] = None,\n login_customer_id: Optional[str] = None,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-ads\n\n Args:\n name (str): The name of the destination.\n customer_id (str): Comma separated list of (client) customer IDs. Each customer ID must be specified as a 10-digit number without dashes. More instruction on how to find this value in our docs. Metrics streams like AdGroupAdReport cannot be requested for a manager account.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n login_customer_id (Optional[str]): If your access to the customer account is through a manager account, this field is required and must be set to the customer ID of the manager account (10-digit number without dashes). More information about this field you can see here\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads. 
For more information, see Google's documentation.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleAdsSource.GoogleCredentials\n )\n self.customer_id = check.str_param(customer_id, "customer_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.custom_queries = check.opt_nullable_list_param(\n custom_queries, "custom_queries", GoogleAdsSource.CustomGAQLQueriesEntry\n )\n self.login_customer_id = check.opt_str_param(login_customer_id, "login_customer_id")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Google Ads", name)
\n\n\n
[docs]class HellobatonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, company: str):\n """Airbyte Source for Hellobaton.\n\n Args:\n name (str): The name of the destination.\n api_key (str): authentication key required to access the api endpoints\n company (str): Company name that generates your base api url\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.company = check.str_param(company, "company")\n super().__init__("Hellobaton", name)
\n\n\n
[docs]class SendgridSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, apikey: str, start_time: Union[int, str]):\n """Airbyte Source for Sendgrid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sendgrid\n\n Args:\n name (str): The name of the destination.\n apikey (str): API Key, use admin to generate this key.\n start_time (Union[int, str]): Start time in ISO8601 format. Any data before this time point will not be replicated.\n """\n self.apikey = check.str_param(apikey, "apikey")\n self.start_time = check.inst_param(start_time, "start_time", (int, str))\n super().__init__("Sendgrid", name)
\n\n\n
[docs]class MondaySource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n subdomain: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MondaySource.OAuth20", "MondaySource.APIToken"]\n ):\n """Airbyte Source for Monday.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/monday\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MondaySource.OAuth20, MondaySource.APIToken)\n )\n super().__init__("Monday", name)
\n\n\n
[docs]class DixaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_token: str, start_date: str, batch_size: Optional[int] = None\n ):\n """Airbyte Source for Dixa.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dixa\n\n Args:\n name (str): The name of the destination.\n api_token (str): Dixa API token\n start_date (str): The connector pulls records updated from this date onwards.\n batch_size (Optional[int]): Number of days to batch into one request. Max 31.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n super().__init__("Dixa", name)
\n\n\n
[docs]class SalesforceSource(GeneratedAirbyteSource):\n
[docs] class FilterSalesforceObjectsEntry:\n
[docs] @public\n def __init__(self, criteria: str, value: str):\n self.criteria = check.str_param(criteria, "criteria")\n self.value = check.str_param(value, "value")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n is_sandbox: Optional[bool] = None,\n auth_type: Optional[str] = None,\n start_date: Optional[str] = None,\n streams_criteria: Optional[List[FilterSalesforceObjectsEntry]] = None,\n ):\n """Airbyte Source for Salesforce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesforce\n\n Args:\n name (str): The name of the destination.\n is_sandbox (Optional[bool]): Toggle if you're using a Salesforce Sandbox\n client_id (str): Enter your Salesforce developer application's Client ID\n client_secret (str): Enter your Salesforce developer application's Client secret\n refresh_token (str): Enter your application's Salesforce Refresh Token used for Airbyte to access your Salesforce account.\n start_date (Optional[str]): Enter the date in the YYYY-MM-DD format. Airbyte will replicate the data added on and after this date. If this field is blank, Airbyte will replicate all data.\n streams_criteria (Optional[List[SalesforceSource.FilterSalesforceObjectsEntry]]): Filter streams relevant to you\n """\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.streams_criteria = check.opt_nullable_list_param(\n streams_criteria, "streams_criteria", SalesforceSource.FilterSalesforceObjectsEntry\n )\n super().__init__("Salesforce", name)
\n\n\n
[docs]class PipedriveSource(GeneratedAirbyteSource):\n
[docs] class SignInViaPipedriveOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKeyAuthentication:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "Token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n authorization: Union[\n "PipedriveSource.SignInViaPipedriveOAuth", "PipedriveSource.APIKeyAuthentication"\n ],\n replication_start_date: str,\n ):\n """Airbyte Source for Pipedrive.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pipedrive\n\n Args:\n name (str): The name of the destination.\n authorization (Union[PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication]): Choose one of the possible authorization method\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. When specified and not None, then stream will behave as incremental\n """\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication),\n )\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n super().__init__("Pipedrive", name)
\n\n\n
[docs]class FileSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class LocalFilesystemLimited:\n
[docs] @public\n def __init__(\n self,\n ):\n self.storage = "local"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSource.HTTPSPublicWeb",\n "FileSource.GCSGoogleCloudStorage",\n "FileSource.S3AmazonWebServices",\n "FileSource.AzBlobAzureBlobStorage",\n "FileSource.SSHSecureShell",\n "FileSource.SCPSecureCopyProtocol",\n "FileSource.SFTPSecureFileTransferProtocol",\n "FileSource.LocalFilesystemLimited",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSource.HTTPSPublicWeb, FileSource.GCSGoogleCloudStorage, FileSource.S3AmazonWebServices, FileSource.AzBlobAzureBlobStorage, FileSource.SSHSecureShell, FileSource.SCPSecureCopyProtocol, FileSource.SFTPSecureFileTransferProtocol, FileSource.LocalFilesystemLimited]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSource.HTTPSPublicWeb,\n FileSource.GCSGoogleCloudStorage,\n FileSource.S3AmazonWebServices,\n FileSource.AzBlobAzureBlobStorage,\n FileSource.SSHSecureShell,\n 
FileSource.SCPSecureCopyProtocol,\n FileSource.SFTPSecureFileTransferProtocol,\n FileSource.LocalFilesystemLimited,\n ),\n )\n super().__init__("File", name)
\n\n\n
[docs]class GlassfrogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Glassfrog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/glassfrog\n\n Args:\n name (str): The name of the destination.\n api_key (str): API key provided by Glassfrog\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Glassfrog", name)
\n\n\n
[docs]class ChartmogulSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str, interval: str):\n """Airbyte Source for Chartmogul.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chartmogul\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chartmogul API key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. When feasible, any data before this date will not be replicated.\n interval (str): Some APIs such as Metrics require intervals to cluster data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.interval = check.str_param(interval, "interval")\n super().__init__("Chartmogul", name)
\n\n\n
[docs]class OrbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n start_date: Optional[str] = None,\n lookback_window_days: Optional[int] = None,\n string_event_properties_keys: Optional[List[str]] = None,\n numeric_event_properties_keys: Optional[List[str]] = None,\n ):\n """Airbyte Source for Orb.\n\n Documentation can be found at https://docs.withorb.com/\n\n Args:\n name (str): The name of the destination.\n api_key (str): Orb API Key, issued from the Orb admin console.\n start_date (Optional[str]): UTC date and time in the format 2022-03-01T00:00:00Z. Any data with created_at before this data will not be synced.\n lookback_window_days (Optional[int]): When set to N, the connector will always refresh resources created within the past N days. By default, updated objects that are not newly created are not incrementally synced.\n string_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n numeric_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.string_event_properties_keys = check.opt_nullable_list_param(\n string_event_properties_keys, "string_event_properties_keys", str\n )\n self.numeric_event_properties_keys = check.opt_nullable_list_param(\n numeric_event_properties_keys, "numeric_event_properties_keys", str\n )\n super().__init__("Orb", name)
\n\n\n
[docs]class CockroachdbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Cockroachdb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cockroachdb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt client/server communications for increased security.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Cockroachdb", name)
\n\n\n
[docs]class ConfluenceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, domain_name: str, email: str):\n """Airbyte Source for Confluence.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Please follow the Jira confluence for generating an API token: https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/\n domain_name (str): Your Confluence domain name\n email (str): Your Confluence login email\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.email = check.str_param(email, "email")\n super().__init__("Confluence", name)
\n\n\n
[docs]class PlaidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n api_key: str,\n client_id: str,\n plaid_env: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Plaid.\n\n Documentation can be found at https://plaid.com/docs/api/\n\n Args:\n name (str): The name of the destination.\n access_token (str): The end-user's Link access token.\n api_key (str): The Plaid API key to use to hit the API.\n client_id (str): The Plaid client id\n plaid_env (str): The Plaid environment\n start_date (Optional[str]): The date from which you'd like to replicate data for Plaid in the format YYYY-MM-DD. All data generated after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.api_key = check.str_param(api_key, "api_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.plaid_env = check.str_param(plaid_env, "plaid_env")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Plaid", name)
\n\n\n
[docs]class SnapchatMarketingSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Snapchat Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snapchat-marketing\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Snapchat developer application.\n client_secret (str): The Client Secret of your Snapchat developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n start_date (Optional[str]): Date in the format 2022-01-01. Any data before this date will not be replicated.\n end_date (Optional[str]): Date in the format 2017-01-25. Any data after this date will not be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Snapchat Marketing", name)
\n\n\n
[docs]class MicrosoftTeamsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaMicrosoftOAuth20:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateViaMicrosoft:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n period: str,\n credentials: Union[\n "MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20",\n "MicrosoftTeamsSource.AuthenticateViaMicrosoft",\n ],\n ):\n """Airbyte Source for Microsoft Teams.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/microsoft-teams\n\n Args:\n name (str): The name of the destination.\n period (str): Specifies the length of time over which the Team Device Report stream is aggregated. The supported values are: D7, D30, D90, and D180.\n credentials (Union[MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20, MicrosoftTeamsSource.AuthenticateViaMicrosoft]): Choose how to authenticate to Microsoft\n """\n self.period = check.str_param(period, "period")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20,\n MicrosoftTeamsSource.AuthenticateViaMicrosoft,\n ),\n )\n super().__init__("Microsoft Teams", name)
\n\n\n
[docs]class LeverHiringSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "LeverHiringSource.OAuthCredentials",\n start_date: str,\n environment: Optional[str] = None,\n ):\n """Airbyte Source for Lever Hiring.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lever-hiring\n\n Args:\n name (str): The name of the destination.\n credentials (LeverHiringSource.OAuthCredentials): Choose how to authenticate to Lever Hiring.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Note that it will be used only in the following incremental streams: comments, commits, and issues.\n environment (Optional[str]): The environment in which you'd like to replicate data for Lever. This is used to determine which Lever API endpoint to use.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", LeverHiringSource.OAuthCredentials\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.environment = check.opt_str_param(environment, "environment")\n super().__init__("Lever Hiring", name)
\n\n\n
[docs]class TwilioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_sid: str,\n auth_token: str,\n start_date: str,\n lookback_window: Optional[int] = None,\n ):\n """Airbyte Source for Twilio.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/twilio\n\n Args:\n name (str): The name of the destination.\n account_sid (str): Twilio account SID\n auth_token (str): Twilio Auth Token.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (Optional[int]): How far into the past to look for records. (in minutes)\n """\n self.account_sid = check.str_param(account_sid, "account_sid")\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.opt_int_param(lookback_window, "lookback_window")\n super().__init__("Twilio", name)
\n\n\n
[docs]class StripeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n client_secret: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n slice_range: Optional[int] = None,\n ):\n r"""Airbyte Source for Stripe.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/stripe\n\n Args:\n name (str): The name of the destination.\n account_id (str): Your Stripe account ID (starts with 'acct\\\\_', find yours here).\n client_secret (str): Stripe API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Only data generated after this date will be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always re-export data from the past N days, where N is the value set here. This is useful if your data is frequently updated after creation. More info here\n slice_range (Optional[int]): The time increment used by the connector when requesting data from the Stripe API. The bigger the value is, the less requests will be made and faster the sync will be. On the other hand, the more seldom the state is persisted.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.slice_range = check.opt_int_param(slice_range, "slice_range")\n super().__init__("Stripe", name)
\n\n\n
[docs]class Db2Source(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str, key_store_password: Optional[str] = None):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")\n self.key_store_password = check.opt_str_param(key_store_password, "key_store_password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n db: str,\n username: str,\n password: str,\n encryption: Union["Db2Source.Unencrypted", "Db2Source.TLSEncryptedVerifyCertificate"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Db2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/db2\n\n Args:\n name (str): The name of the destination.\n host (str): Host of the Db2.\n port (int): Port of the database.\n db (str): Name of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n encryption (Union[Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate]): Encryption method to use when communicating with the database\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.db = check.str_param(db, "db")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate),\n )\n super().__init__("Db2", name)
\n\n\n
[docs]class SlackSource(GeneratedAirbyteSource):\n
[docs] class DefaultOAuth20Authorization:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: Optional[str] = None,\n ):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class APITokenCredentials:\n
[docs] @public\n def __init__(self, api_token: str):\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n lookback_window: int,\n join_channels: bool,\n credentials: Union[\n "SlackSource.DefaultOAuth20Authorization", "SlackSource.APITokenCredentials"\n ],\n channel_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Slack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/slack\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (int): How far into the past to look for messages in threads.\n join_channels (bool): Whether to join all channels or to sync data only from channels the bot is already in. If false, you'll need to manually add the bot to all the channels from which you'd like to sync messages.\n channel_filter (Optional[List[str]]): A channel name list (without leading '#' char) which limit the channels from which you'd like to sync. Empty list means no filter.\n credentials (Union[SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials]): Choose how to authenticate into Slack\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.int_param(lookback_window, "lookback_window")\n self.join_channels = check.bool_param(join_channels, "join_channels")\n self.channel_filter = check.opt_nullable_list_param(channel_filter, "channel_filter", str)\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials),\n )\n super().__init__("Slack", name)
\n\n\n
[docs]class RechargeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Recharge.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recharge\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Recharge API, in the format YYYY-MM-DDT00:00:00Z. Any data before this date will not be replicated.\n access_token (str): The value of the Access Token generated. See the docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Recharge", name)
\n\n\n
[docs]class OpenweatherSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lat: str,\n lon: str,\n appid: str,\n units: Optional[str] = None,\n lang: Optional[str] = None,\n ):\n """Airbyte Source for Openweather.\n\n Args:\n name (str): The name of the destination.\n lat (str): Latitude for which you want to get weather condition from. (min -90, max 90)\n lon (str): Longitude for which you want to get weather condition from. (min -180, max 180)\n appid (str): Your OpenWeather API Key. See here. The key is case sensitive.\n units (Optional[str]): Units of measurement. standard, metric and imperial units are available. If you do not use the units parameter, standard units will be applied by default.\n lang (Optional[str]): You can use lang parameter to get the output in your language. The contents of the description field will be translated. See here for the list of supported languages.\n """\n self.lat = check.str_param(lat, "lat")\n self.lon = check.str_param(lon, "lon")\n self.appid = check.str_param(appid, "appid")\n self.units = check.opt_str_param(units, "units")\n self.lang = check.opt_str_param(lang, "lang")\n super().__init__("Openweather", name)
\n\n\n
[docs]class RetentlySource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaRetentlyOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithAPIToken:\n
[docs] @public\n def __init__(self, api_key: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "RetentlySource.AuthenticateViaRetentlyOAuth", "RetentlySource.AuthenticateWithAPIToken"\n ],\n ):\n """Airbyte Source for Retently.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken]): Choose how to authenticate to Retently\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken),\n )\n super().__init__("Retently", name)
\n\n\n
[docs]class ScaffoldSourceHttpSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, TODO: str):\n """Airbyte Source for Scaffold Source Http.\n\n Args:\n name (str): The name of the destination.\n TODO (str): describe me\n """\n self.TODO = check.str_param(TODO, "TODO")\n super().__init__("Scaffold Source Http", name)
\n\n\n
[docs]class YandexMetricaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, auth_token: str, counter_id: str, start_date: str, end_date: str):\n """Airbyte Source for Yandex Metrica.\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Your Yandex Metrica API access token\n counter_id (str): Counter ID\n start_date (str): UTC date and time in the format YYYY-MM-DD.\n end_date (str): UTC date and time in the format YYYY-MM-DD.\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.counter_id = check.str_param(counter_id, "counter_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.str_param(end_date, "end_date")\n super().__init__("Yandex Metrica", name)
\n\n\n
[docs]class TalkdeskExploreSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n auth_url: str,\n api_key: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Talkdesk Explore.\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Talkdesk Explore API, in the format YYYY-MM-DDT00:00:00. All data generated after this date will be replicated.\n timezone (Optional[str]): Timezone to use when generating reports. Only IANA timezones are supported (https://nodatime.org/TimeZones)\n auth_url (str): Talkdesk Auth URL. Only 'client_credentials' auth type supported at the moment.\n api_key (str): Talkdesk API key.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n self.auth_url = check.str_param(auth_url, "auth_url")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Talkdesk Explore", name)
\n\n\n
[docs]class ChargifySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, domain: str):\n """Airbyte Source for Chargify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chargify\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chargify API Key.\n domain (str): Chargify domain. Normally this domain follows the following format companyname.chargify.com\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.domain = check.str_param(domain, "domain")\n super().__init__("Chargify", name)
\n\n\n
[docs]class RkiCovidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str):\n """Airbyte Source for Rki Covid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/rki-covid\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n """\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Rki Covid", name)
\n\n\n
[docs]class PostgresSource(GeneratedAirbyteSource):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n replication_slot: str,\n publication: str,\n plugin: Optional[str] = None,\n initial_waiting_seconds: Optional[int] = None,\n ):\n self.method = "CDC"\n self.plugin = check.opt_str_param(plugin, "plugin")\n self.replication_slot = check.str_param(replication_slot, "replication_slot")\n self.publication = check.str_param(publication, "publication")\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )
\n\n
[docs] class NoTunnel:\n
[docs] @public\n def __init__(\n self,\n ):\n self.tunnel_method = "NO_TUNNEL"
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, tunnel_host: str, tunnel_port: int, tunnel_user: str, ssh_key: str):\n self.tunnel_method = "SSH_KEY_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.ssh_key = check.str_param(ssh_key, "ssh_key")
\n\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(\n self, tunnel_host: str, tunnel_port: int, tunnel_user: str, tunnel_user_password: str\n ):\n self.tunnel_method = "SSH_PASSWORD_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.tunnel_user_password = check.str_param(\n tunnel_user_password, "tunnel_user_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "PostgresSource.Disable",\n "PostgresSource.Allow",\n "PostgresSource.Prefer",\n "PostgresSource.Require",\n "PostgresSource.VerifyCa",\n "PostgresSource.VerifyFull",\n ],\n replication_method: Union[\n "PostgresSource.Standard", "PostgresSource.LogicalReplicationCDC"\n ],\n tunnel_method: Union[\n "PostgresSource.NoTunnel",\n "PostgresSource.SSHKeyAuthentication",\n "PostgresSource.PasswordAuthentication",\n ],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas (case sensitive) to sync from. Defaults to public.\n username (str): Username to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresSource.Disable, PostgresSource.Allow, PostgresSource.Prefer, PostgresSource.Require, PostgresSource.VerifyCa, PostgresSource.VerifyFull]): SSL connection modes. 
disable - Disables encryption of communication between Airbyte and source database allow - Enables encryption only when required by the source database prefer - allows unencrypted connection only if the source database does not support encryption require - Always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Always require encryption and verifies that the source database server has a valid SSL certificate verify-full - This is the most secure mode. Always require encryption and verifies the identity of the source database server Read more in the docs.\n replication_method (Union[PostgresSource.Standard, PostgresSource.LogicalReplicationCDC]): Replication method for extracting data from the database.\n tunnel_method (Union[PostgresSource.NoTunnel, PostgresSource.SSHKeyAuthentication, PostgresSource.PasswordAuthentication]): Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresSource.Disable,\n PostgresSource.Allow,\n PostgresSource.Prefer,\n PostgresSource.Require,\n PostgresSource.VerifyCa,\n PostgresSource.VerifyFull,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (PostgresSource.Standard, PostgresSource.LogicalReplicationCDC),\n )\n self.tunnel_method = check.inst_param(\n tunnel_method,\n "tunnel_method",\n (\n PostgresSource.NoTunnel,\n 
PostgresSource.SSHKeyAuthentication,\n PostgresSource.PasswordAuthentication,\n ),\n )\n super().__init__("Postgres", name)
\n\n\n
[docs]class TrelloSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n board_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Trello.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/trello\n\n Args:\n name (str): The name of the destination.\n token (str): Trello v API token. See the docs for instructions on how to generate it.\n key (str): Trello API key. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n board_ids (Optional[List[str]]): IDs of the boards to replicate data from. If left empty, data from all boards to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.board_ids = check.opt_nullable_list_param(board_ids, "board_ids", str)\n super().__init__("Trello", name)
\n\n\n
[docs]class PrestashopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, url: str, access_key: str):\n """Airbyte Source for Prestashop.\n\n Args:\n name (str): The name of the destination.\n url (str): Shop URL without trailing slash (domain name or IP address)\n access_key (str): Your PrestaShop access key. See the docs for info on how to obtain this.\n """\n self.url = check.str_param(url, "url")\n self.access_key = check.str_param(access_key, "access_key")\n super().__init__("Prestashop", name)
\n\n\n
[docs]class PaystackSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n secret_key: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n ):\n r"""Airbyte Source for Paystack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paystack\n\n Args:\n name (str): The name of the destination.\n secret_key (str): The Paystack API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always reload data from the past N days, where N is the value set here. This is useful if your data is updated after creation.\n """\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n super().__init__("Paystack", name)
\n\n\n
[docs]class S3Source(GeneratedAirbyteSource):\n
[docs] class CSV:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n delimiter: Optional[str] = None,\n infer_datatypes: Optional[bool] = None,\n quote_char: Optional[str] = None,\n escape_char: Optional[str] = None,\n encoding: Optional[str] = None,\n double_quote: Optional[bool] = None,\n newlines_in_values: Optional[bool] = None,\n additional_reader_options: Optional[str] = None,\n advanced_options: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.delimiter = check.opt_str_param(delimiter, "delimiter")\n self.infer_datatypes = check.opt_bool_param(infer_datatypes, "infer_datatypes")\n self.quote_char = check.opt_str_param(quote_char, "quote_char")\n self.escape_char = check.opt_str_param(escape_char, "escape_char")\n self.encoding = check.opt_str_param(encoding, "encoding")\n self.double_quote = check.opt_bool_param(double_quote, "double_quote")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.additional_reader_options = check.opt_str_param(\n additional_reader_options, "additional_reader_options"\n )\n self.advanced_options = check.opt_str_param(advanced_options, "advanced_options")\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class Parquet:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n columns: Optional[List[str]] = None,\n batch_size: Optional[int] = None,\n buffer_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.columns = check.opt_nullable_list_param(columns, "columns", str)\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n self.buffer_size = check.opt_int_param(buffer_size, "buffer_size")
\n\n
[docs] class Avro:\n
[docs] @public\n def __init__(self, filetype: Optional[str] = None):\n self.filetype = check.opt_str_param(filetype, "filetype")
\n\n
[docs] class Jsonl:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n newlines_in_values: Optional[bool] = None,\n unexpected_field_behavior: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.unexpected_field_behavior = check.opt_str_param(\n unexpected_field_behavior, "unexpected_field_behavior"\n )\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n bucket: str,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n path_prefix: Optional[str] = None,\n endpoint: Optional[str] = None,\n ):\n self.bucket = check.str_param(bucket, "bucket")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n self.path_prefix = check.opt_str_param(path_prefix, "path_prefix")\n self.endpoint = check.opt_str_param(endpoint, "endpoint")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset: str,\n path_pattern: str,\n format: Union["S3Source.CSV", "S3Source.Parquet", "S3Source.Avro", "S3Source.Jsonl"],\n provider: "S3Source.S3AmazonWebServices",\n schema: Optional[str] = None,\n ):\n """Airbyte Source for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/s3\n\n Args:\n name (str): The name of the destination.\n dataset (str): The name of the stream you would like this source to output. Can contain letters, numbers, or underscores.\n path_pattern (str): A regular expression which tells the connector which files to replicate. All files which match this pattern will be replicated. Use | to separate multiple patterns. See this page to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern ** to pick up all files.\n format (Union[S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl]): The format of the files you'd like to replicate\n schema (Optional[str]): Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of { "column" : "type" }, where types are valid JSON Schema datatypes. Leave as {} to auto-infer the schema.\n provider (S3Source.S3AmazonWebServices): Use this to load files from S3 or S3-compatible services\n """\n self.dataset = check.str_param(dataset, "dataset")\n self.path_pattern = check.str_param(path_pattern, "path_pattern")\n self.format = check.inst_param(\n format, "format", (S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl)\n )\n self.schema = check.opt_str_param(schema, "schema")\n self.provider = check.inst_param(provider, "provider", S3Source.S3AmazonWebServices)\n super().__init__("S3", name)
\n\n\n
[docs]class SnowflakeSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.auth_type = "OAuth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.auth_type = "username/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["SnowflakeSource.OAuth20", "SnowflakeSource.UsernameAndPassword"],\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): The host domain of the snowflake instance (must include the account, region, cloud environment, and end with snowflakecomputing.com).\n role (str): The role you created for Airbyte to access Snowflake.\n warehouse (str): The warehouse you created for Airbyte to access data.\n database (str): The database you created for Airbyte to access data.\n schema (str): The source Snowflake schema tables.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SnowflakeSource.OAuth20, SnowflakeSource.UsernameAndPassword),\n )\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Snowflake", name)
\n\n\n
[docs]class AmplitudeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, secret_key: str, start_date: str):\n """Airbyte Source for Amplitude.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amplitude\n\n Args:\n name (str): The name of the destination.\n api_key (str): Amplitude API Key. See the setup guide for more information on how to obtain this key.\n secret_key (str): Amplitude Secret Key. See the setup guide for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Amplitude", name)
\n\n\n
[docs]class PosthogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, api_key: str, base_url: Optional[str] = None):\n """Airbyte Source for Posthog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/posthog\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data. Any data before this date will not be replicated.\n api_key (str): API Key. See the docs for information on how to generate this key.\n base_url (Optional[str]): Base PostHog url. Defaults to PostHog Cloud (https://app.posthog.com).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.base_url = check.opt_str_param(base_url, "base_url")\n super().__init__("Posthog", name)
\n\n\n
[docs]class PaypalTransactionSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n is_sandbox: bool,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n """Airbyte Source for Paypal Transaction.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paypal-transactions\n\n Args:\n name (str): The name of the destination.\n client_id (Optional[str]): The Client ID of your Paypal developer application.\n client_secret (Optional[str]): The Client Secret of your Paypal developer application.\n refresh_token (Optional[str]): The key to refresh the expired access token.\n start_date (str): Start Date for data extraction in ISO format. Date must be in range from 3 years till 12 hrs before present time.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n super().__init__("Paypal Transaction", name)
\n\n\n
[docs]class MssqlSource(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self, data_to_sync: Optional[str] = None, snapshot_isolation: Optional[str] = None\n ):\n self.method = "CDC"\n self.data_to_sync = check.opt_str_param(data_to_sync, "data_to_sync")\n self.snapshot_isolation = check.opt_str_param(snapshot_isolation, "snapshot_isolation")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_method: Union[\n "MssqlSource.Unencrypted",\n "MssqlSource.EncryptedTrustServerCertificate",\n "MssqlSource.EncryptedVerifyCertificate",\n ],\n replication_method: Union["MssqlSource.Standard", "MssqlSource.LogicalReplicationCDC"],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n database (str): The name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlSource.Unencrypted, MssqlSource.EncryptedTrustServerCertificate, MssqlSource.EncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n replication_method (Union[MssqlSource.Standard, MssqlSource.LogicalReplicationCDC]): The replication method used for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses {TBC} to detect inserts, updates, and deletes. 
This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlSource.Unencrypted,\n MssqlSource.EncryptedTrustServerCertificate,\n MssqlSource.EncryptedVerifyCertificate,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MssqlSource.Standard, MssqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class ZohoCrmSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n dc_region: str,\n environment: str,\n edition: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Zoho Crm.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoho-crm\n\n Args:\n name (str): The name of the destination.\n client_id (str): OAuth2.0 Client ID\n client_secret (str): OAuth2.0 Client Secret\n refresh_token (str): OAuth2.0 Refresh Token\n dc_region (str): Please choose the region of your Data Center location. More info by this Link\n environment (str): Please choose the environment\n start_datetime (Optional[str]): ISO 8601, for instance: `YYYY-MM-DD`, `YYYY-MM-DD HH:MM:SS+HH:MM`\n edition (str): Choose your Edition of Zoho CRM to determine API Concurrency Limits\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.dc_region = check.str_param(dc_region, "dc_region")\n self.environment = check.str_param(environment, "environment")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n self.edition = check.str_param(edition, "edition")\n super().__init__("Zoho Crm", name)
\n\n\n
[docs]class RedshiftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: str,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com).\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Specify one or more explicitly or keep empty to process all schemas. Schema names are case sensitive.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Redshift", name)
\n\n\n
[docs]class AsanaSource(GeneratedAirbyteSource):\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["AsanaSource.PATCredentials", "AsanaSource.OAuthCredentials"],\n ):\n """Airbyte Source for Asana.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[AsanaSource.PATCredentials, AsanaSource.OAuthCredentials]): Choose how to authenticate to Github\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (AsanaSource.PATCredentials, AsanaSource.OAuthCredentials)\n )\n super().__init__("Asana", name)
\n\n\n
[docs]class SmartsheetsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n spreadsheet_id: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Smartsheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/smartsheets\n\n Args:\n name (str): The name of the destination.\n access_token (str): The access token to use for accessing your data from Smartsheets. This access token must be generated by a user with at least read access to the data you'd like to replicate. Generate an access token in the Smartsheets main menu by clicking Account > Apps & Integrations > API Access. See the setup guide for information on how to obtain this token.\n spreadsheet_id (str): The spreadsheet ID. Find it by opening the spreadsheet then navigating to File > Properties\n start_datetime (Optional[str]): Only rows modified after this date/time will be replicated. This should be an ISO 8601 string, for instance: `2000-01-01T13:00:00`\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n super().__init__("Smartsheets", name)
\n\n\n
[docs]class MailchimpSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, apikey: str):\n self.auth_type = "apikey"\n self.apikey = check.str_param(apikey, "apikey")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MailchimpSource.OAuth20", "MailchimpSource.APIKey"]\n ):\n """Airbyte Source for Mailchimp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailchimp\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MailchimpSource.OAuth20, MailchimpSource.APIKey)\n )\n super().__init__("Mailchimp", name)
\n\n\n
[docs]class SentrySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n auth_token: str,\n organization: str,\n project: str,\n hostname: Optional[str] = None,\n discover_fields: Optional[List[str]] = None,\n ):\n """Airbyte Source for Sentry.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sentry\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Log into Sentry and then create authentication tokens.For self-hosted, you can find or create authentication tokens by visiting "{instance_url_prefix}/settings/account/api/auth-tokens/"\n hostname (Optional[str]): Host name of Sentry API server.For self-hosted, specify your host name here. Otherwise, leave it empty.\n organization (str): The slug of the organization the groups belong to.\n project (str): The name (slug) of the Project you want to sync.\n discover_fields (Optional[List[str]]): Fields to retrieve when fetching discover events\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.hostname = check.opt_str_param(hostname, "hostname")\n self.organization = check.str_param(organization, "organization")\n self.project = check.str_param(project, "project")\n self.discover_fields = check.opt_nullable_list_param(\n discover_fields, "discover_fields", str\n )\n super().__init__("Sentry", name)
\n\n\n
[docs]class MailgunSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n private_key: str,\n domain_region: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Mailgun.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailgun\n\n Args:\n name (str): The name of the destination.\n private_key (str): Primary account API key to access your Mailgun data.\n domain_region (Optional[str]): Domain region code. 'EU' or 'US' are possible values. The default is 'US'.\n start_date (Optional[str]): UTC date and time in the format 2020-10-01 00:00:00. Any data before this date will not be replicated. If omitted, defaults to 3 days ago.\n """\n self.private_key = check.str_param(private_key, "private_key")\n self.domain_region = check.opt_str_param(domain_region, "domain_region")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Mailgun", name)
\n\n\n
[docs]class OnesignalSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, user_auth_key: str, start_date: str, outcome_names: str):\n """Airbyte Source for Onesignal.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/onesignal\n\n Args:\n name (str): The name of the destination.\n user_auth_key (str): OneSignal User Auth Key, see the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for OneSignal API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n outcome_names (str): Comma-separated list of names and the value (sum/count) for the returned outcome data. See the docs for more details\n """\n self.user_auth_key = check.str_param(user_auth_key, "user_auth_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.outcome_names = check.str_param(outcome_names, "outcome_names")\n super().__init__("Onesignal", name)
\n\n\n
[docs]class PythonHttpTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, base: str, access_key: Optional[str] = None):\n """Airbyte Source for Python Http Tutorial.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n access_key (Optional[str]): API access key used to retrieve data from the Exchange Rates API.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n base (str): ISO reference currency. See here.\n """\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.base = check.str_param(base, "base")\n super().__init__("Python Http Tutorial", name)
\n\n\n
[docs]class AirtableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, base_id: str, tables: List[str]):\n """Airbyte Source for Airtable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/airtable\n\n Args:\n name (str): The name of the destination.\n api_key (str): The API Key for the Airtable account. See the Support Guide for more information on how to obtain this key.\n base_id (str): The Base ID to integrate the data from. You can find the Base ID following the link Airtable API, log in to your account, select the base you need and find Base ID in the docs.\n tables (List[str]): The list of Tables to integrate.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.base_id = check.str_param(base_id, "base_id")\n self.tables = check.list_param(tables, "tables", str)\n super().__init__("Airtable", name)
\n\n\n
[docs]class MongodbV2Source(GeneratedAirbyteSource):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbV2Source.StandaloneMongoDbInstance",\n "MongodbV2Source.ReplicaSet",\n "MongodbV2Source.MongoDBAtlas",\n ],\n database: str,\n user: Optional[str] = None,\n password: Optional[str] = None,\n auth_source: Optional[str] = None,\n ):\n """Airbyte Source for Mongodb V2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb-v2\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbV2Source.StandaloneMongoDbInstance, MongodbV2Source.ReplicaSet, MongodbV2Source.MongoDBAtlas]): The MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): The database you want to replicate.\n user (Optional[str]): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n auth_source (Optional[str]): The authentication source where the user information is stored.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbV2Source.StandaloneMongoDbInstance,\n MongodbV2Source.ReplicaSet,\n MongodbV2Source.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.user = check.opt_str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.auth_source = check.opt_str_param(auth_source, "auth_source")\n super().__init__("Mongodb V2", name)
\n\n\n
[docs]class FileSecureSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSecureSource.HTTPSPublicWeb",\n "FileSecureSource.GCSGoogleCloudStorage",\n "FileSecureSource.S3AmazonWebServices",\n "FileSecureSource.AzBlobAzureBlobStorage",\n "FileSecureSource.SSHSecureShell",\n "FileSecureSource.SCPSecureCopyProtocol",\n "FileSecureSource.SFTPSecureFileTransferProtocol",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File Secure.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSecureSource.HTTPSPublicWeb, FileSecureSource.GCSGoogleCloudStorage, FileSecureSource.S3AmazonWebServices, FileSecureSource.AzBlobAzureBlobStorage, FileSecureSource.SSHSecureShell, FileSecureSource.SCPSecureCopyProtocol, FileSecureSource.SFTPSecureFileTransferProtocol]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSecureSource.HTTPSPublicWeb,\n FileSecureSource.GCSGoogleCloudStorage,\n FileSecureSource.S3AmazonWebServices,\n FileSecureSource.AzBlobAzureBlobStorage,\n 
FileSecureSource.SSHSecureShell,\n FileSecureSource.SCPSecureCopyProtocol,\n FileSecureSource.SFTPSecureFileTransferProtocol,\n ),\n )\n super().__init__("File Secure", name)
\n\n\n
[docs]class ZendeskSupportSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n subdomain: str,\n credentials: Union["ZendeskSupportSource.OAuth20", "ZendeskSupportSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Support.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-support\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Support API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken),\n )\n super().__init__("Zendesk Support", name)
\n\n\n
[docs]class TempoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Tempo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/\n\n Args:\n name (str): The name of the destination.\n api_token (str): Tempo API Token. Go to Tempo>Settings, scroll down to Data Access and select API integration.\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Tempo", name)
\n\n\n
[docs]class BraintreeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n merchant_id: str,\n public_key: str,\n private_key: str,\n environment: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Braintree.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/braintree\n\n Args:\n name (str): The name of the destination.\n merchant_id (str): The unique identifier for your entire gateway account. See the docs for more information on how to obtain this ID.\n public_key (str): Braintree Public Key. See the docs for more information on how to obtain this key.\n private_key (str): Braintree Private Key. See the docs for more information on how to obtain this key.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n environment (str): Environment specifies where the data will come from.\n """\n self.merchant_id = check.str_param(merchant_id, "merchant_id")\n self.public_key = check.str_param(public_key, "public_key")\n self.private_key = check.str_param(private_key, "private_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.environment = check.str_param(environment, "environment")\n super().__init__("Braintree", name)
\n\n\n
[docs]class SalesloftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, client_id: str, client_secret: str, refresh_token: str, start_date: str\n ):\n """Airbyte Source for Salesloft.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesloft\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Salesloft developer application.\n client_secret (str): The Client Secret of your Salesloft developer application.\n refresh_token (str): The token for obtaining a new access token.\n start_date (str): The date from which you'd like to replicate data for Salesloft API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Salesloft", name)
\n\n\n
[docs]class LinnworksSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, application_id: str, application_secret: str, token: str, start_date: str\n ):\n """Airbyte Source for Linnworks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linnworks\n\n Args:\n name (str): The name of the destination.\n application_id (str): Linnworks Application ID\n application_secret (str): Linnworks Application Secret\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.application_id = check.str_param(application_id, "application_id")\n self.application_secret = check.str_param(application_secret, "application_secret")\n self.token = check.str_param(token, "token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Linnworks", name)
\n\n\n
[docs]class ChargebeeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, site: str, site_api_key: str, start_date: str, product_catalog: str\n ):\n """Airbyte Source for Chargebee.\n\n Documentation can be found at https://apidocs.chargebee.com/docs/api\n\n Args:\n name (str): The name of the destination.\n site (str): The site prefix for your Chargebee instance.\n site_api_key (str): Chargebee API Key. See the docs for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n product_catalog (str): Product Catalog version of your Chargebee site. Instructions on how to find your version you may find here under `API Version` section.\n """\n self.site = check.str_param(site, "site")\n self.site_api_key = check.str_param(site_api_key, "site_api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.product_catalog = check.str_param(product_catalog, "product_catalog")\n super().__init__("Chargebee", name)
\n\n\n
[docs]class GoogleAnalyticsDataApiSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n property_id: str,\n credentials: Union[\n "GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication",\n ],\n date_ranges_start_date: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics Data Api.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-v4\n\n Args:\n name (str): The name of the destination.\n property_id (str): A Google Analytics GA4 property identifier whose events are tracked. Specified in the URL path and not the body\n credentials (Union[GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth, GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication]): Credentials for the service\n date_ranges_start_date (str): The start date. One of the values Ndaysago, yesterday, today or in the format YYYY-MM-DD\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.property_id = check.str_param(property_id, "property_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth,\n GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.date_ranges_start_date = check.str_param(\n date_ranges_start_date, "date_ranges_start_date"\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics Data Api", name)
\n\n\n
[docs]class OutreachSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n redirect_uri: str,\n start_date: str,\n ):\n """Airbyte Source for Outreach.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/outreach\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Outreach developer application.\n client_secret (str): The Client Secret of your Outreach developer application.\n refresh_token (str): The token for obtaining the new access token.\n redirect_uri (str): A Redirect URI is the location where the authorization server sends the user once the app has been successfully authorized and granted an authorization code or access token.\n start_date (str): The date from which you'd like to replicate data for Outreach API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.redirect_uri = check.str_param(redirect_uri, "redirect_uri")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Outreach", name)
\n\n\n
[docs]class LemlistSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Lemlist.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lemlist\n\n Args:\n name (str): The name of the destination.\n api_key (str): Lemlist API key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Lemlist", name)
\n\n\n
[docs]class ApifyDatasetSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, datasetId: str, clean: Optional[bool] = None):\n """Airbyte Source for Apify Dataset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/apify-dataset\n\n Args:\n name (str): The name of the destination.\n datasetId (str): ID of the dataset you would like to load to Airbyte.\n clean (Optional[bool]): If set to true, only clean items will be downloaded from the dataset. See description of what clean means in Apify API docs. If not sure, set clean to false.\n """\n self.datasetId = check.str_param(datasetId, "datasetId")\n self.clean = check.opt_bool_param(clean, "clean")\n super().__init__("Apify Dataset", name)
\n\n\n
[docs]class RecurlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n begin_time: Optional[str] = None,\n end_time: Optional[str] = None,\n ):\n """Airbyte Source for Recurly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recurly\n\n Args:\n name (str): The name of the destination.\n api_key (str): Recurly API Key. See the docs for more information on how to generate this key.\n begin_time (Optional[str]): ISO8601 timestamp from which the replication from Recurly API will start from.\n end_time (Optional[str]): ISO8601 timestamp to which the replication from Recurly API will stop. Records after that date won't be imported.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.begin_time = check.opt_str_param(begin_time, "begin_time")\n self.end_time = check.opt_str_param(end_time, "end_time")\n super().__init__("Recurly", name)
\n\n\n
[docs]class ZendeskTalkSource(GeneratedAirbyteSource):\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n credentials: Union["ZendeskTalkSource.APIToken", "ZendeskTalkSource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Zendesk Talk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-talk\n\n Args:\n name (str): The name of the destination.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n start_date (str): The date from which you'd like to replicate data for Zendesk Talk API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Zendesk Talk", name)
\n\n\n
[docs]class SftpSource(GeneratedAirbyteSource):\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(self, auth_user_password: str):\n self.auth_method = "SSH_PASSWORD_AUTH"\n self.auth_user_password = check.str_param(auth_user_password, "auth_user_password")
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, auth_ssh_key: str):\n self.auth_method = "SSH_KEY_AUTH"\n self.auth_ssh_key = check.str_param(auth_ssh_key, "auth_ssh_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n user: str,\n host: str,\n port: int,\n credentials: Union["SftpSource.PasswordAuthentication", "SftpSource.SSHKeyAuthentication"],\n file_types: Optional[str] = None,\n folder_path: Optional[str] = None,\n file_pattern: Optional[str] = None,\n ):\n """Airbyte Source for Sftp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/sftp\n\n Args:\n name (str): The name of the destination.\n user (str): The server user\n host (str): The server host address\n port (int): The server port\n credentials (Union[SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication]): The server authentication method\n file_types (Optional[str]): Coma separated file types. Currently only 'csv' and 'json' types are supported.\n folder_path (Optional[str]): The directory to search files for sync\n file_pattern (Optional[str]): The regular expression to specify files for sync in a chosen Folder Path\n """\n self.user = check.str_param(user, "user")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication),\n )\n self.file_types = check.opt_str_param(file_types, "file_types")\n self.folder_path = check.opt_str_param(folder_path, "folder_path")\n self.file_pattern = check.opt_str_param(file_pattern, "file_pattern")\n super().__init__("Sftp", name)
\n\n\n
[docs]class WhiskyHunterSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n ):\n """Airbyte Source for Whisky Hunter.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/whisky-hunter\n\n Args:\n name (str): The name of the destination.\n\n """\n super().__init__("Whisky Hunter", name)
\n\n\n
[docs]class FreshdeskSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n requests_per_minute: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Freshdesk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshdesk\n\n Args:\n name (str): The name of the destination.\n domain (str): Freshdesk domain\n api_key (str): Freshdesk API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (Optional[str]): UTC date and time. Any data created after this date will be replicated. If this parameter is not set, all data will be replicated.\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Freshdesk", name)
\n\n\n
[docs]class GocardlessSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n gocardless_environment: str,\n gocardless_version: str,\n start_date: str,\n ):\n """Airbyte Source for Gocardless.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gocardless\n\n Args:\n name (str): The name of the destination.\n access_token (str): Gocardless API TOKEN\n gocardless_environment (str): Environment you are trying to connect to.\n gocardless_version (str): GoCardless version. This is a date. You can find the latest here: https://developer.gocardless.com/api-reference/#api-usage-making-requests\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.gocardless_environment = check.str_param(\n gocardless_environment, "gocardless_environment"\n )\n self.gocardless_version = check.str_param(gocardless_version, "gocardless_version")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gocardless", name)
\n\n\n
[docs]class ZuoraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n tenant_endpoint: str,\n data_query: str,\n client_id: str,\n client_secret: str,\n window_in_days: Optional[str] = None,\n ):\n """Airbyte Source for Zuora.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zuora\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start Date in format: YYYY-MM-DD\n window_in_days (Optional[str]): The amount of days for each data-chunk begining from start_date. Bigger the value - faster the fetch. (0.1 - as for couple of hours, 1 - as for a Day; 364 - as for a Year).\n tenant_endpoint (str): Please choose the right endpoint where your Tenant is located. More info by this Link\n data_query (str): Choose between `Live`, or `Unlimited` - the optimized, replicated database at 12 hours freshness for high volume extraction Link\n client_id (str): Your OAuth user Client ID\n client_secret (str): Your OAuth user Client Secret\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.window_in_days = check.opt_str_param(window_in_days, "window_in_days")\n self.tenant_endpoint = check.str_param(tenant_endpoint, "tenant_endpoint")\n self.data_query = check.str_param(data_query, "data_query")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Zuora", name)
\n\n\n
[docs]class MarketoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, domain_url: str, client_id: str, client_secret: str, start_date: str\n ):\n """Airbyte Source for Marketo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/marketo\n\n Args:\n name (str): The name of the destination.\n domain_url (str): Your Marketo Base URL. See the docs for info on how to obtain this.\n client_id (str): The Client ID of your Marketo developer application. See the docs for info on how to obtain this.\n client_secret (str): The Client Secret of your Marketo developer application. See the docs for info on how to obtain this.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_url = check.str_param(domain_url, "domain_url")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Marketo", name)
\n\n\n
[docs]class DriftSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: str,\n credentials: Optional[str] = None,\n ):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["DriftSource.OAuth20", "DriftSource.AccessToken"]\n ):\n """Airbyte Source for Drift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/drift\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (DriftSource.OAuth20, DriftSource.AccessToken)\n )\n super().__init__("Drift", name)
\n\n\n
[docs]class PokeapiSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, pokemon_name: str):\n """Airbyte Source for Pokeapi.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pokeapi\n\n Args:\n name (str): The name of the destination.\n pokemon_name (str): Pokemon requested from the API.\n """\n self.pokemon_name = check.str_param(pokemon_name, "pokemon_name")\n super().__init__("Pokeapi", name)
\n\n\n
[docs]class NetsuiteSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n realm: str,\n consumer_key: str,\n consumer_secret: str,\n token_key: str,\n token_secret: str,\n start_datetime: str,\n object_types: Optional[List[str]] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Netsuite.\n\n Args:\n name (str): The name of the destination.\n realm (str): Netsuite realm e.g. 2344535, as for `production` or 2344535_SB1, as for the `sandbox`\n consumer_key (str): Consumer key associated with your integration\n consumer_secret (str): Consumer secret associated with your integration\n token_key (str): Access token key\n token_secret (str): Access token secret\n object_types (Optional[List[str]]): The API names of the Netsuite objects you want to sync. Setting this speeds up the connection setup process by limiting the number of schemas that need to be retrieved from Netsuite.\n start_datetime (str): Starting point for your data replication, in format of "YYYY-MM-DDTHH:mm:ssZ"\n window_in_days (Optional[int]): The amount of days used to query the data with date chunks. Set smaller value, if you have lots of data.\n """\n self.realm = check.str_param(realm, "realm")\n self.consumer_key = check.str_param(consumer_key, "consumer_key")\n self.consumer_secret = check.str_param(consumer_secret, "consumer_secret")\n self.token_key = check.str_param(token_key, "token_key")\n self.token_secret = check.str_param(token_secret, "token_secret")\n self.object_types = check.opt_nullable_list_param(object_types, "object_types", str)\n self.start_datetime = check.str_param(start_datetime, "start_datetime")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Netsuite", name)
\n\n\n
[docs]class HubplannerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Hubplanner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubplanner\n\n Args:\n name (str): The name of the destination.\n api_key (str): Hubplanner API key. See https://github.com/hubplanner/API#authentication for more details.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Hubplanner", name)
\n\n\n
[docs]class Dv360Source(GeneratedAirbyteSource):\n
[docs] class Oauth2Credentials:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n token_uri: str,\n client_id: str,\n client_secret: str,\n ):\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.token_uri = check.str_param(token_uri, "token_uri")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "Dv360Source.Oauth2Credentials",\n partner_id: int,\n start_date: str,\n end_date: Optional[str] = None,\n filters: Optional[List[str]] = None,\n ):\n """Airbyte Source for Dv 360.\n\n Args:\n name (str): The name of the destination.\n credentials (Dv360Source.Oauth2Credentials): Oauth2 credentials\n partner_id (int): Partner ID\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n filters (Optional[List[str]]): filters for the dimensions. each filter object had 2 keys: 'type' for the name of the dimension to be used as. and 'value' for the value of the filter\n """\n self.credentials = check.inst_param(\n credentials, "credentials", Dv360Source.Oauth2Credentials\n )\n self.partner_id = check.int_param(partner_id, "partner_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.filters = check.opt_nullable_list_param(filters, "filters", str)\n super().__init__("Dv 360", name)
\n\n\n
[docs]class NotionSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_type = "OAuth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, token: str):\n self.auth_type = "token"\n self.token = check.str_param(token, "token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["NotionSource.OAuth20", "NotionSource.AccessToken"],\n ):\n """Airbyte Source for Notion.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/notion\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00.000Z. Any data before this date will not be replicated.\n credentials (Union[NotionSource.OAuth20, NotionSource.AccessToken]): Pick an authentication method.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (NotionSource.OAuth20, NotionSource.AccessToken)\n )\n super().__init__("Notion", name)
\n\n\n
[docs]class ZendeskSunshineSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_method = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str, email: str):\n self.auth_method = "api_token"\n self.api_token = check.str_param(api_token, "api_token")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n start_date: str,\n credentials: Union["ZendeskSunshineSource.OAuth20", "ZendeskSunshineSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Sunshine.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk_sunshine\n\n Args:\n name (str): The name of the destination.\n subdomain (str): The subdomain for your Zendesk Account.\n start_date (str): The date from which you'd like to replicate data for Zendesk Sunshine API, in the format YYYY-MM-DDT00:00:00Z.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSunshineSource.OAuth20, ZendeskSunshineSource.APIToken),\n )\n super().__init__("Zendesk Sunshine", name)
\n\n\n
[docs]class PinterestSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.auth_method = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["PinterestSource.OAuth20", "PinterestSource.AccessToken"],\n ):\n """Airbyte Source for Pinterest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pinterest\n\n Args:\n name (str): The name of the destination.\n start_date (str): A date in the format YYYY-MM-DD. If you have not set a date, it would be defaulted to latest allowed date by api (914 days from today).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (PinterestSource.OAuth20, PinterestSource.AccessToken)\n )\n super().__init__("Pinterest", name)
\n\n\n
[docs]class MetabaseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_api_url: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n session_token: Optional[str] = None,\n ):\n r"""Airbyte Source for Metabase.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/metabase\n\n Args:\n name (str): The name of the destination.\n instance_api_url (str): URL to your metabase instance API\n session_token (Optional[str]): To generate your session token, you need to run the following command: ``` curl -X POST \\\\ -H "Content-Type: application/json" \\\\ -d '{"username": "person@metabase.com", "password": "fakepassword"}' \\\\ http://localhost:3000/api/session ``` Then copy the value of the `id` field returned by a successful call to that API. Note that by default, sessions are good for 14 days and needs to be regenerated.\n """\n self.instance_api_url = check.str_param(instance_api_url, "instance_api_url")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.session_token = check.opt_str_param(session_token, "session_token")\n super().__init__("Metabase", name)
\n\n\n
[docs]class HubspotSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.credentials_title = "OAuth Credentials"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.credentials_title = "API Key Credentials"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] class PrivateAPP:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials_title = "Private App Credentials"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union[\n "HubspotSource.OAuth", "HubspotSource.APIKey", "HubspotSource.PrivateAPP"\n ],\n ):\n """Airbyte Source for Hubspot.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubspot\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP]): Choose how to authenticate to HubSpot.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP),\n )\n super().__init__("Hubspot", name)
\n\n\n
[docs]class HarvestSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaHarvestOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithPersonalAccessToken:\n
[docs] @public\n def __init__(self, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n replication_start_date: str,\n credentials: Union[\n "HarvestSource.AuthenticateViaHarvestOAuth",\n "HarvestSource.AuthenticateWithPersonalAccessToken",\n ],\n ):\n """Airbyte Source for Harvest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/harvest\n\n Args:\n name (str): The name of the destination.\n account_id (str): Harvest account ID. Required for all Harvest requests in pair with Personal Access Token\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HarvestSource.AuthenticateViaHarvestOAuth, HarvestSource.AuthenticateWithPersonalAccessToken]): Choose how to authenticate to Harvest.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n HarvestSource.AuthenticateViaHarvestOAuth,\n HarvestSource.AuthenticateWithPersonalAccessToken,\n ),\n )\n super().__init__("Harvest", name)
\n\n\n
[docs]class GithubSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, access_token: str):\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["GithubSource.OAuthCredentials", "GithubSource.PATCredentials"],\n start_date: str,\n repository: str,\n branch: Optional[str] = None,\n page_size_for_large_streams: Optional[int] = None,\n ):\n """Airbyte Source for Github.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/github\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GithubSource.OAuthCredentials, GithubSource.PATCredentials]): Choose how to authenticate to GitHub\n start_date (str): The date from which you'd like to replicate data from GitHub in the format YYYY-MM-DDT00:00:00Z. For the streams which support this configuration, only data generated on or after the start date will be replicated. This field doesn't apply to all streams, see the docs for more info\n repository (str): Space-delimited list of GitHub organizations/repositories, e.g. `airbytehq/airbyte` for single repository, `airbytehq/*` for get all repositories from organization and `airbytehq/airbyte airbytehq/another-repo` for multiple repositories.\n branch (Optional[str]): Space-delimited list of GitHub repository branches to pull commits for, e.g. `airbytehq/airbyte/master`. If no branches are specified for a repository, the default branch will be pulled.\n page_size_for_large_streams (Optional[int]): The Github connector contains several streams with a large amount of data. The page size of such streams depends on the size of your repository. 
We recommended that you specify values between 10 and 30.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (GithubSource.OAuthCredentials, GithubSource.PATCredentials)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.repository = check.str_param(repository, "repository")\n self.branch = check.opt_str_param(branch, "branch")\n self.page_size_for_large_streams = check.opt_int_param(\n page_size_for_large_streams, "page_size_for_large_streams"\n )\n super().__init__("Github", name)
\n\n\n
[docs]class E2eTestSource(GeneratedAirbyteSource):\n
[docs] class SingleSchema:\n
[docs] @public\n def __init__(\n self, stream_name: str, stream_schema: str, stream_duplication: Optional[int] = None\n ):\n self.type = "SINGLE_STREAM"\n self.stream_name = check.str_param(stream_name, "stream_name")\n self.stream_schema = check.str_param(stream_schema, "stream_schema")\n self.stream_duplication = check.opt_int_param(stream_duplication, "stream_duplication")
\n\n
[docs] class MultiSchema:\n
[docs] @public\n def __init__(self, stream_schemas: str):\n self.type = "MULTI_STREAM"\n self.stream_schemas = check.str_param(stream_schemas, "stream_schemas")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n max_messages: int,\n mock_catalog: Union["E2eTestSource.SingleSchema", "E2eTestSource.MultiSchema"],\n type: Optional[str] = None,\n seed: Optional[int] = None,\n message_interval_ms: Optional[int] = None,\n ):\n """Airbyte Source for E2e Test.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/e2e-test\n\n Args:\n name (str): The name of the destination.\n max_messages (int): Number of records to emit per stream. Min 1. Max 100 billion.\n seed (Optional[int]): When the seed is unspecified, the current time millis will be used as the seed. Range: [0, 1000000].\n message_interval_ms (Optional[int]): Interval between messages in ms. Min 0 ms. Max 60000 ms (1 minute).\n """\n self.type = check.opt_str_param(type, "type")\n self.max_messages = check.int_param(max_messages, "max_messages")\n self.seed = check.opt_int_param(seed, "seed")\n self.message_interval_ms = check.opt_int_param(message_interval_ms, "message_interval_ms")\n self.mock_catalog = check.inst_param(\n mock_catalog, "mock_catalog", (E2eTestSource.SingleSchema, E2eTestSource.MultiSchema)\n )\n super().__init__("E2e Test", name)
\n\n\n
[docs]class MysqlSource(GeneratedAirbyteSource):\n
[docs] class Preferred:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "preferred"
\n\n
[docs] class Required:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "required"
\n\n
[docs] class VerifyCA:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyIdentity:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_identity"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n initial_waiting_seconds: Optional[int] = None,\n server_time_zone: Optional[str] = None,\n ):\n self.method = "CDC"\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )\n self.server_time_zone = check.opt_str_param(server_time_zone, "server_time_zone")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "MysqlSource.Preferred",\n "MysqlSource.Required",\n "MysqlSource.VerifyCA",\n "MysqlSource.VerifyIdentity",\n ],\n replication_method: Union["MysqlSource.Standard", "MysqlSource.LogicalReplicationCDC"],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the database.\n port (int): The port to connect to.\n database (str): The database name.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n ssl_mode (Union[MysqlSource.Preferred, MysqlSource.Required, MysqlSource.VerifyCA, MysqlSource.VerifyIdentity]): SSL connection modes. preferred - Automatically attempt SSL connection. If the MySQL server does not support SSL, continue with a regular connection.required - Always connect with SSL. If the MySQL server doesn`t support SSL, the connection will not be established. Certificate Authority (CA) and Hostname are not verified.verify-ca - Always connect with SSL. Verifies CA, but allows connection even if Hostname does not match.Verify Identity - Always connect with SSL. 
Verify both CA and Hostname.Read more in the docs.\n replication_method (Union[MysqlSource.Standard, MysqlSource.LogicalReplicationCDC]): Replication method to use for extracting data from the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n MysqlSource.Preferred,\n MysqlSource.Required,\n MysqlSource.VerifyCA,\n MysqlSource.VerifyIdentity,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MysqlSource.Standard, MysqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mysql", name)
\n\n\n
[docs]class MyHoursSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n email: str,\n password: str,\n start_date: str,\n logs_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for My Hours.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/my-hours\n\n Args:\n name (str): The name of the destination.\n email (str): Your My Hours username\n password (str): The password associated to the username\n start_date (str): Start date for collecting time logs\n logs_batch_size (Optional[int]): Pagination size used for retrieving logs in days\n """\n self.email = check.str_param(email, "email")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.logs_batch_size = check.opt_int_param(logs_batch_size, "logs_batch_size")\n super().__init__("My Hours", name)
\n\n\n
[docs]class KyribaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n username: str,\n password: str,\n start_date: str,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Kyriba.\n\n Args:\n name (str): The name of the destination.\n domain (str): Kyriba domain\n username (str): Username to be used in basic auth\n password (str): Password to be used in basic auth\n start_date (str): The date the sync should start from.\n end_date (Optional[str]): The date the sync should end. If let empty the sync will run to the current date.\n """\n self.domain = check.str_param(domain, "domain")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Kyriba", name)
\n\n\n
[docs]class GoogleSearchConsoleSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str, email: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n site_urls: List[str],\n start_date: str,\n authorization: Union[\n "GoogleSearchConsoleSource.OAuth",\n "GoogleSearchConsoleSource.ServiceAccountKeyAuthentication",\n ],\n end_date: Optional[str] = None,\n custom_reports: Optional[str] = None,\n ):\n """Airbyte Source for Google Search Console.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-search-console\n\n Args:\n name (str): The name of the destination.\n site_urls (List[str]): The URLs of the website property attached to your GSC account. Read more here.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date in the format 2017-01-25. Any data after this date will not be replicated. Must be greater or equal to the start date field.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Search Console. See the docs for more information about the exact format you can use to fill out this field.\n """\n self.site_urls = check.list_param(site_urls, "site_urls", str)\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (\n GoogleSearchConsoleSource.OAuth,\n GoogleSearchConsoleSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n super().__init__("Google Search Console", name)
\n\n\n
[docs]class FacebookMarketingSource(GeneratedAirbyteSource):\n
[docs] class InsightConfig:\n
[docs] @public\n def __init__(\n self,\n name: str,\n fields: Optional[List[str]] = None,\n breakdowns: Optional[List[str]] = None,\n action_breakdowns: Optional[List[str]] = None,\n time_increment: Optional[int] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n insights_lookback_window: Optional[int] = None,\n ):\n self.name = check.str_param(name, "name")\n self.fields = check.opt_nullable_list_param(fields, "fields", str)\n self.breakdowns = check.opt_nullable_list_param(breakdowns, "breakdowns", str)\n self.action_breakdowns = check.opt_nullable_list_param(\n action_breakdowns, "action_breakdowns", str\n )\n self.time_increment = check.opt_int_param(time_increment, "time_increment")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n start_date: str,\n access_token: str,\n end_date: Optional[str] = None,\n include_deleted: Optional[bool] = None,\n fetch_thumbnail_images: Optional[bool] = None,\n custom_insights: Optional[List[InsightConfig]] = None,\n page_size: Optional[int] = None,\n insights_lookback_window: Optional[int] = None,\n max_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Facebook Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-marketing\n\n Args:\n name (str): The name of the destination.\n account_id (str): The Facebook Ad account ID to use when pulling data from the Facebook Marketing API.\n start_date (str): The date from which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the latest data.\n access_token (str): The value of the access token generated. See the docs for more information\n include_deleted (Optional[bool]): Include data from deleted Campaigns, Ads, and AdSets\n fetch_thumbnail_images (Optional[bool]): In each Ad Creative, fetch the thumbnail_url and store the result in thumbnail_data_url\n custom_insights (Optional[List[FacebookMarketingSource.InsightConfig]]): A list which contains insights entries, each entry must have a name and can contains fields, breakdowns or action_breakdowns)\n page_size (Optional[int]): Page size used when sending requests to Facebook API to specify number of records per page when response has pagination. 
Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n insights_lookback_window (Optional[int]): The attribution window\n max_batch_size (Optional[int]): Maximum batch size used when sending batch requests to Facebook API. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.access_token = check.str_param(access_token, "access_token")\n self.include_deleted = check.opt_bool_param(include_deleted, "include_deleted")\n self.fetch_thumbnail_images = check.opt_bool_param(\n fetch_thumbnail_images, "fetch_thumbnail_images"\n )\n self.custom_insights = check.opt_nullable_list_param(\n custom_insights, "custom_insights", FacebookMarketingSource.InsightConfig\n )\n self.page_size = check.opt_int_param(page_size, "page_size")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n super().__init__("Facebook Marketing", name)
\n\n\n
[docs]class SurveymonkeySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, start_date: str, survey_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Surveymonkey.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/surveymonkey\n\n Args:\n name (str): The name of the destination.\n access_token (str): Access Token for making authenticated requests. See the docs for information on how to generate this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all boards to which you have access will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Surveymonkey", name)
\n\n\n
[docs]class PardotSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n pardot_business_unit_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n is_sandbox: Optional[bool] = None,\n ):\n """Airbyte Source for Pardot.\n\n Args:\n name (str): The name of the destination.\n pardot_business_unit_id (str): Pardot Business ID, can be found at Setup > Pardot > Pardot Account Setup\n client_id (str): The Consumer Key that can be found when viewing your app in Salesforce\n client_secret (str): The Consumer Secret that can be found when viewing your app in Salesforce\n refresh_token (str): Salesforce Refresh Token used for Airbyte to access your Salesforce account. If you don't know what this is, follow this guide to retrieve it.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Leave blank to skip this filter\n is_sandbox (Optional[bool]): Whether or not the the app is in a Salesforce sandbox. If you do not know what this, assume it is false.\n """\n self.pardot_business_unit_id = check.str_param(\n pardot_business_unit_id, "pardot_business_unit_id"\n )\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n super().__init__("Pardot", name)
\n\n\n
[docs]class FlexportSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Flexport.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/flexport\n\n Args:\n name (str): The name of the destination.\n\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Flexport", name)
\n\n\n
[docs]class ZenefitsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: str):\n """Airbyte Source for Zenefits.\n\n Args:\n name (str): The name of the destination.\n token (str): Use Sync with Zenefits button on the link given on the readme file, and get the token to access the api\n """\n self.token = check.str_param(token, "token")\n super().__init__("Zenefits", name)
\n\n\n
[docs]class KafkaSource(GeneratedAirbyteSource):\n
[docs] class JSON:\n
[docs] @public\n def __init__(self, deserialization_type: Optional[str] = None):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )
\n\n
[docs] class AVRO:\n
[docs] @public\n def __init__(\n self,\n deserialization_type: Optional[str] = None,\n deserialization_strategy: Optional[str] = None,\n schema_registry_url: Optional[str] = None,\n schema_registry_username: Optional[str] = None,\n schema_registry_password: Optional[str] = None,\n ):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )\n self.deserialization_strategy = check.opt_str_param(\n deserialization_strategy, "deserialization_strategy"\n )\n self.schema_registry_url = check.opt_str_param(\n schema_registry_url, "schema_registry_url"\n )\n self.schema_registry_username = check.opt_str_param(\n schema_registry_username, "schema_registry_username"\n )\n self.schema_registry_password = check.opt_str_param(\n schema_registry_password, "schema_registry_password"\n )
\n\n
[docs] class ManuallyAssignAListOfPartitions:\n
[docs] @public\n def __init__(self, topic_partitions: str):\n self.subscription_type = "assign"\n self.topic_partitions = check.str_param(topic_partitions, "topic_partitions")
\n\n
[docs] class SubscribeToAllTopicsMatchingSpecifiedPattern:\n
[docs] @public\n def __init__(self, topic_pattern: str):\n self.subscription_type = "subscribe"\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")
\n\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n MessageFormat: Union["KafkaSource.JSON", "KafkaSource.AVRO"],\n bootstrap_servers: str,\n subscription: Union[\n "KafkaSource.ManuallyAssignAListOfPartitions",\n "KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern",\n ],\n protocol: Union[\n "KafkaSource.PLAINTEXT", "KafkaSource.SASLPLAINTEXT", "KafkaSource.SASLSSL"\n ],\n test_topic: Optional[str] = None,\n group_id: Optional[str] = None,\n max_poll_records: Optional[int] = None,\n polling_time: Optional[int] = None,\n client_id: Optional[str] = None,\n enable_auto_commit: Optional[bool] = None,\n auto_commit_interval_ms: Optional[int] = None,\n client_dns_lookup: Optional[str] = None,\n retry_backoff_ms: Optional[int] = None,\n request_timeout_ms: Optional[int] = None,\n receive_buffer_bytes: Optional[int] = None,\n auto_offset_reset: Optional[str] = None,\n repeated_calls: Optional[int] = None,\n max_records_process: Optional[int] = None,\n ):\n """Airbyte Source for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kafka\n\n Args:\n name (str): The name of the destination.\n MessageFormat (Union[KafkaSource.JSON, KafkaSource.AVRO]): The serialization used based on this\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... 
Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n subscription (Union[KafkaSource.ManuallyAssignAListOfPartitions, KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern]): You can choose to manually assign a list of partitions, or subscribe to all topics matching specified pattern to get dynamically assigned partitions.\n test_topic (Optional[str]): The Topic to test in case the Airbyte can consume messages.\n group_id (Optional[str]): The Group ID is how you distinguish different consumer groups.\n max_poll_records (Optional[int]): The maximum number of records returned in a single call to poll(). Note, that max_poll_records does not impact the underlying fetching behavior. The consumer will cache the records from each fetch request and returns them incrementally from each poll.\n polling_time (Optional[int]): Amount of time Kafka connector should try to poll for messages.\n protocol (Union[KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL]): The Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n enable_auto_commit (Optional[bool]): If true, the consumer's offset will be periodically committed in the background.\n auto_commit_interval_ms (Optional[int]): The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if enable.auto.commit is set to true.\n client_dns_lookup (Optional[str]): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. 
After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n retry_backoff_ms (Optional[int]): The amount of time to wait before attempting to retry a failed request to a given topic partition. This avoids repeatedly sending requests in a tight loop under some failure scenarios.\n request_timeout_ms (Optional[int]): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n receive_buffer_bytes (Optional[int]): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. 
If the value is -1, the OS default will be used.\n auto_offset_reset (Optional[str]): What to do when there is no initial offset in Kafka or if the current offset does not exist any more on the server - earliest: automatically reset the offset to the earliest offset, latest: automatically reset the offset to the latest offset, none: throw exception to the consumer if no previous offset is found for the consumer's group, anything else: throw exception to the consumer.\n repeated_calls (Optional[int]): The number of repeated calls to poll() if no messages were received.\n max_records_process (Optional[int]): The Maximum to be processed per execution\n """\n self.MessageFormat = check.inst_param(\n MessageFormat, "MessageFormat", (KafkaSource.JSON, KafkaSource.AVRO)\n )\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.subscription = check.inst_param(\n subscription,\n "subscription",\n (\n KafkaSource.ManuallyAssignAListOfPartitions,\n KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern,\n ),\n )\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.group_id = check.opt_str_param(group_id, "group_id")\n self.max_poll_records = check.opt_int_param(max_poll_records, "max_poll_records")\n self.polling_time = check.opt_int_param(polling_time, "polling_time")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.enable_auto_commit = check.opt_bool_param(enable_auto_commit, "enable_auto_commit")\n self.auto_commit_interval_ms = check.opt_int_param(\n auto_commit_interval_ms, "auto_commit_interval_ms"\n )\n self.client_dns_lookup = check.opt_str_param(client_dns_lookup, "client_dns_lookup")\n self.retry_backoff_ms = check.opt_int_param(retry_backoff_ms, "retry_backoff_ms")\n self.request_timeout_ms = check.opt_int_param(request_timeout_ms, 
"request_timeout_ms")\n self.receive_buffer_bytes = check.opt_int_param(\n receive_buffer_bytes, "receive_buffer_bytes"\n )\n self.auto_offset_reset = check.opt_str_param(auto_offset_reset, "auto_offset_reset")\n self.repeated_calls = check.opt_int_param(repeated_calls, "repeated_calls")\n self.max_records_process = check.opt_int_param(max_records_process, "max_records_process")\n super().__init__("Kafka", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/sources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.sources"}}, "reconciliation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.reconciliation

from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
    cast,
)

import dagster._check as check
from dagster import AssetKey
from dagster._annotations import experimental, public
from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition
from dagster._core.definitions.events import CoercibleToAssetKeyPrefix
from dagster._core.definitions.freshness_policy import FreshnessPolicy
from dagster._core.definitions.resource_definition import ResourceDefinition
from dagster._core.execution.context.init import build_init_resource_context
from dagster._utils.merger import deep_merge_dicts
from dagster_managed_elements import (
    ManagedElementCheckResult,
    ManagedElementDiff,
    ManagedElementError,
)
from dagster_managed_elements.types import (
    SECRET_MASK_VALUE,
    ManagedElementReconciler,
    is_key_secret,
)
from dagster_managed_elements.utils import UNSET, diff_dicts

from dagster_airbyte.asset_defs import (
    AirbyteConnectionMetadata,
    AirbyteInstanceCacheableAssetsDefinition,
    _clean_name,
)
from dagster_airbyte.managed.types import (
    AirbyteConnection,
    AirbyteDestination,
    AirbyteDestinationNamespace,
    AirbyteSource,
    AirbyteSyncMode,
    InitializedAirbyteConnection,
    InitializedAirbyteDestination,
    InitializedAirbyteSource,
)
from dagster_airbyte.resources import AirbyteResource
from dagster_airbyte.utils import is_basic_normalization_operation


def gen_configured_stream_json(
    source_stream: Mapping[str, Any], user_stream_config: Mapping[str, AirbyteSyncMode]
) -> Mapping[str, Any]:
    """Generates an Airbyte API stream definition based on the succinct user-provided config and the
    full stream definition from the source.

    Args:
        source_stream: Full stream definition; ``source_stream["stream"]["name"]`` is used to
            look up the user's sync mode.
        user_stream_config: Mapping of stream name to the sync mode configured by the user.

    Returns:
        ``source_stream`` deep-merged with ``{"config": <sync mode json>}``.

    Raises:
        KeyError: If the stream name is not present in ``user_stream_config``.
    """
    config = user_stream_config[source_stream["stream"]["name"]]
    return deep_merge_dicts(
        source_stream,
        {"config": config.to_json()},
    )


def _ignore_secrets_compare_fn(k: str, _cv: Any, dv: Any) -> Optional[bool]:
    """Custom comparison used when secrets should be ignored in a diff.

    For keys that ``is_key_secret`` flags, returns whether the destination value is the
    secret mask. For any other key returns None (presumably deferring to the default
    comparison in ``diff_dicts`` - confirm against that helper).
    """
    if is_key_secret(k):
        return dv == SECRET_MASK_VALUE
    return None


def _diff_configs(
    config_dict: Mapping[str, Any], dst_dict: Mapping[str, Any], ignore_secrets: bool = True
) -> ManagedElementDiff:
    """Diffs two configuration mappings, optionally using the secret-aware comparison."""
    return diff_dicts(
        config_dict=config_dict,
        dst_dict=dst_dict,
        custom_compare_fn=_ignore_secrets_compare_fn if ignore_secrets else None,
    )


def diff_sources(
    config_src: Optional[AirbyteSource],
    curr_src: Optional[AirbyteSource],
    ignore_secrets: bool = True,
) -> ManagedElementCheckResult:
    """Utility to diff two AirbyteSource objects.

    A missing source is treated as an empty configuration. A non-empty diff is nested under
    the configured source's name, falling back to the current source's name, then "Unknown".
    """
    diff = _diff_configs(
        config_src.source_configuration if config_src else {},
        curr_src.source_configuration if curr_src else {},
        ignore_secrets,
    )
    if not diff.is_empty():
        name = config_src.name if config_src else curr_src.name if curr_src else "Unknown"
        return ManagedElementDiff().with_nested(name, diff)

    return ManagedElementDiff()


def diff_destinations(
    config_dst: Optional[AirbyteDestination],
    curr_dst: Optional[AirbyteDestination],
    ignore_secrets: bool = True,
) -> ManagedElementCheckResult:
    """Utility to diff two AirbyteDestination objects.

    A missing destination is treated as an empty configuration. A non-empty diff is nested
    under the configured destination's name, falling back to the current destination's name,
    then "Unknown".
    """
    diff = _diff_configs(
        config_dst.destination_configuration if config_dst else {},
        curr_dst.destination_configuration if curr_dst else {},
        ignore_secrets,
    )
    if not diff.is_empty():
        name = config_dst.name if config_dst else curr_dst.name if curr_dst else "Unknown"
        return ManagedElementDiff().with_nested(name, diff)

    return ManagedElementDiff()


def conn_dict(conn: Optional[AirbyteConnection]) -> Mapping[str, Any]:
    """Flattens a connection into the plain mapping that is compared by ``diff_connections``.

    Returns an empty mapping when ``conn`` is falsy.
    """
    if not conn:
        return {}
    return {
        "source": conn.source.name if conn.source else "Unknown",
        "destination": conn.destination.name if conn.destination else "Unknown",
        "normalize data": conn.normalize_data,
        "streams": {k: v.to_json() for k, v in conn.stream_config.items()},
        "destination namespace": (
            conn.destination_namespace.name
            if isinstance(conn.destination_namespace, AirbyteDestinationNamespace)
            else conn.destination_namespace
        ),
        "prefix": conn.prefix,
    }


OPTIONAL_STREAM_SETTINGS = ("cursorField", "primaryKey")


def _compare_stream_values(k: str, cv: str, _dv: str):
    """Don't register a diff for optional stream settings if the value is not set
    in the user-provided config, this means it will default to the value in the
    source.
    """
    return True if k in OPTIONAL_STREAM_SETTINGS and cv == UNSET else None


def diff_connections(
    config_conn: Optional[AirbyteConnection], curr_conn: Optional[AirbyteConnection]
) -> ManagedElementCheckResult:
    """Utility to diff two AirbyteConnection objects.

    A non-empty diff is nested under the configured connection's name, falling back to the
    current connection's name, then "Unknown".
    """
    diff = diff_dicts(
        conn_dict(config_conn),
        conn_dict(curr_conn),
        custom_compare_fn=_compare_stream_values,
    )
    if not diff.is_empty():
        name = config_conn.name if config_conn else curr_conn.name if curr_conn else "Unknown"
        return ManagedElementDiff().with_nested(name, diff)

    return ManagedElementDiff()


def reconcile_sources(
    res: AirbyteResource,
    config_sources: Mapping[str, AirbyteSource],
    existing_sources: Mapping[str, InitializedAirbyteSource],
    workspace_id: str,
    dry_run: bool,
    should_delete: bool,
    ignore_secrets: bool,
) -> Tuple[Mapping[str, InitializedAirbyteSource], ManagedElementCheckResult]:
    """Generates a diff of the configured and existing sources and reconciles them to match the
    configured state if dry_run is False.

    Returns:
        A tuple of (initialized sources keyed by name, accumulated diff). When a source is
        deleted or recreated, the previous object is kept under ``"<name>_old"`` if a new one
        takes its name.
    """
    diff = ManagedElementDiff()

    initialized_sources: Dict[str, InitializedAirbyteSource] = {}
    for source_name in set(config_sources.keys()).union(existing_sources.keys()):
        configured_source = config_sources.get(source_name)
        existing_source = existing_sources.get(source_name)

        # Ignore sources not mentioned in the user config unless the user specifies to delete
        if not should_delete and existing_source and not configured_source:
            initialized_sources[source_name] = existing_source
            continue

        diff = diff.join(
            diff_sources(  # type: ignore
                configured_source,
                existing_source.source if existing_source else None,
                ignore_secrets,
            )
        )

        if existing_source and (
            not configured_source or (configured_source.must_be_recreated(existing_source.source))
        ):
            initialized_sources[source_name] = existing_source
            if not dry_run:
                res.make_request(
                    endpoint="/sources/delete",
                    data={"sourceId": existing_source.source_id},
                )
            # From here on the source is treated as nonexistent so it is created afresh below.
            existing_source = None

        if configured_source:
            defn_id = check.not_none(
                res.get_source_definition_by_name(configured_source.source_type)
            )
            base_source_defn_dict = {
                "name": configured_source.name,
                "connectionConfiguration": configured_source.source_configuration,
            }
            # Stays empty for a not-yet-existing source during a dry run.
            source_id = ""
            if existing_source:
                source_id = existing_source.source_id
                if not dry_run:
                    res.make_request(
                        endpoint="/sources/update",
                        data={"sourceId": source_id, **base_source_defn_dict},
                    )
            else:
                if not dry_run:
                    create_result = cast(
                        Dict[str, str],
                        check.not_none(
                            res.make_request(
                                endpoint="/sources/create",
                                data={
                                    "sourceDefinitionId": defn_id,
                                    "workspaceId": workspace_id,
                                    **base_source_defn_dict,
                                },
                            )
                        ),
                    )
                    source_id = create_result["sourceId"]

            if source_name in initialized_sources:
                # Preserve to be able to initialize old connection object
                initialized_sources[f"{source_name}_old"] = initialized_sources[source_name]
            initialized_sources[source_name] = InitializedAirbyteSource(
                source=configured_source,
                source_id=source_id,
                source_definition_id=defn_id,
            )
    return initialized_sources, diff


def reconcile_destinations(
    res: AirbyteResource,
    config_destinations: Mapping[str, AirbyteDestination],
    existing_destinations: Mapping[str, InitializedAirbyteDestination],
    workspace_id: str,
    dry_run: bool,
    should_delete: bool,
    ignore_secrets: bool,
) -> Tuple[Mapping[str, InitializedAirbyteDestination], ManagedElementCheckResult]:
    """Generates a diff of the configured and existing destinations and reconciles them to match the
    configured state if dry_run is False.

    Returns:
        A tuple of (initialized destinations keyed by name, accumulated diff). When a
        destination is deleted or recreated, the previous object is kept under
        ``"<name>_old"`` if a new one takes its name.
    """
    diff = ManagedElementDiff()

    initialized_destinations: Dict[str, InitializedAirbyteDestination] = {}
    for destination_name in set(config_destinations.keys()).union(existing_destinations.keys()):
        configured_destination = config_destinations.get(destination_name)
        existing_destination = existing_destinations.get(destination_name)

        # Ignore destinations not mentioned in the user config unless the user specifies to delete
        if not should_delete and existing_destination and not configured_destination:
            initialized_destinations[destination_name] = existing_destination
            continue

        diff = diff.join(
            diff_destinations(  # type: ignore
                configured_destination,
                existing_destination.destination if existing_destination else None,
                ignore_secrets,
            )
        )

        if existing_destination and (
            not configured_destination
            or (configured_destination.must_be_recreated(existing_destination.destination))
        ):
            initialized_destinations[destination_name] = existing_destination
            if not dry_run:
                res.make_request(
                    endpoint="/destinations/delete",
                    data={"destinationId": existing_destination.destination_id},
                )
            # From here on the destination is treated as nonexistent so it is created afresh.
            existing_destination = None

        if configured_destination:
            # NOTE(review): unlike reconcile_sources, the definition id is not wrapped in
            # check.not_none here - confirm whether a missing definition should raise.
            defn_id = res.get_destination_definition_by_name(
                configured_destination.destination_type
            )
            base_destination_defn_dict = {
                "name": configured_destination.name,
                "connectionConfiguration": configured_destination.destination_configuration,
            }
            # Stays empty for a not-yet-existing destination during a dry run.
            destination_id = ""
            if existing_destination:
                destination_id = existing_destination.destination_id
                if not dry_run:
                    res.make_request(
                        endpoint="/destinations/update",
                        data={"destinationId": destination_id, **base_destination_defn_dict},
                    )
            else:
                if not dry_run:
                    create_result = cast(
                        Dict[str, str],
                        check.not_none(
                            res.make_request(
                                endpoint="/destinations/create",
                                data={
                                    "destinationDefinitionId": defn_id,
                                    "workspaceId": workspace_id,
                                    **base_destination_defn_dict,
                                },
                            )
                        ),
                    )
                    destination_id = create_result["destinationId"]

            if destination_name in initialized_destinations:
                # Preserve to be able to initialize old connection object
                initialized_destinations[f"{destination_name}_old"] = initialized_destinations[
                    destination_name
                ]
            initialized_destinations[destination_name] = InitializedAirbyteDestination(
                destination=configured_destination,
                destination_id=destination_id,
                destination_definition_id=defn_id,
            )
    return initialized_destinations, diff


def reconcile_config(
    res: AirbyteResource,
    objects: Sequence[AirbyteConnection],
    dry_run: bool = False,
    should_delete: bool = False,
    ignore_secrets: bool = True,
) -> ManagedElementCheckResult:
    """Main entry point for the reconciliation process. Takes a list of AirbyteConnection objects
    and a pointer to an Airbyte instance and returns a diff, along with applying the diff
    if dry_run is False.

    Args:
        res: The Airbyte resource used to issue API requests.
        objects: The connections that describe the desired state.
        dry_run: If True, only compute the diff; no mutating request is issued.
        should_delete: If True, sources, destinations and connections that are not mentioned
            in ``objects`` are deleted as well.
        ignore_secrets: If True, secret keys are compared using the secret-aware comparison.

    Returns:
        The joined diff of sources, destinations and connections.
    """
    with res.cache_requests():
        config_connections = {conn.name: conn for conn in objects}
        config_sources = {conn.source.name: conn.source for conn in objects}
        config_dests = {conn.destination.name: conn.destination for conn in objects}

        workspace_id = res.get_default_workspace()

        existing_sources_raw = cast(
            Dict[str, List[Dict[str, Any]]],
            check.not_none(
                res.make_request(endpoint="/sources/list", data={"workspaceId": workspace_id})
            ),
        )
        existing_dests_raw = cast(
            Dict[str, List[Dict[str, Any]]],
            check.not_none(
                res.make_request(endpoint="/destinations/list", data={"workspaceId": workspace_id})
            ),
        )

        existing_sources: Dict[str, InitializedAirbyteSource] = {
            source_json["name"]: InitializedAirbyteSource.from_api_json(source_json)
            for source_json in existing_sources_raw.get("sources", [])
        }
        existing_dests: Dict[str, InitializedAirbyteDestination] = {
            destination_json["name"]: InitializedAirbyteDestination.from_api_json(destination_json)
            for destination_json in existing_dests_raw.get("destinations", [])
        }

        # First, remove any connections that need to be deleted, so that we can
        # safely delete any sources/destinations that are no longer referenced
        # or that need to be recreated.
        connections_diff = reconcile_connections_pre(
            res,
            config_connections,
            existing_sources,
            existing_dests,
            workspace_id,
            dry_run,
            should_delete,
        )

        all_sources, sources_diff = reconcile_sources(
            res,
            config_sources,
            existing_sources,
            workspace_id,
            dry_run,
            should_delete,
            ignore_secrets,
        )
        all_dests, dests_diff = reconcile_destinations(
            res, config_dests, existing_dests, workspace_id, dry_run, should_delete, ignore_secrets
        )

        # Now that we have updated the set of sources and destinations, we can
        # recreate or update any connections which depend on them.
        reconcile_connections_post(
            res,
            config_connections,
            all_sources,
            all_dests,
            workspace_id,
            dry_run,
        )

        return ManagedElementDiff().join(sources_diff).join(dests_diff).join(connections_diff)  # type: ignore


def reconcile_normalization(
    res: AirbyteResource,
    existing_connection_id: Optional[str],
    destination: InitializedAirbyteDestination,
    normalization_config: Optional[bool],
    workspace_id: str,
) -> Optional[str]:
    """Reconciles the normalization configuration for a connection.

    If normalization_config is None, then defaults to True on destinations that support normalization
    and False on destinations that do not.

    Returns:
        The id of the basic normalization operation to attach to the connection (an existing
        one if found on ``existing_connection_id``, otherwise a newly created one), or None
        when normalization is disabled or unsupported.

    Raises:
        Exception: If normalization is explicitly requested but the destination does not
            support it.
    """
    existing_basic_norm_op_id = None
    if existing_connection_id:
        operations = cast(
            Dict[str, List[Dict[str, str]]],
            check.not_none(
                res.make_request(
                    endpoint="/operations/list",
                    data={"connectionId": existing_connection_id},
                )
            ),
        )
        existing_basic_norm_op = next(
            (
                operation
                for operation in operations["operations"]
                if is_basic_normalization_operation(operation)
            ),
            None,
        )
        existing_basic_norm_op_id = (
            existing_basic_norm_op["operationId"] if existing_basic_norm_op else None
        )

    if normalization_config is not False:
        if destination.destination_definition_id and res.does_dest_support_normalization(
            destination.destination_definition_id, workspace_id
        ):
            if existing_basic_norm_op_id:
                return existing_basic_norm_op_id
            else:
                return cast(
                    Dict[str, str],
                    check.not_none(
                        res.make_request(
                            endpoint="/operations/create",
                            data={
                                "workspaceId": workspace_id,
                                "name": "Normalization",
                                "operatorConfiguration": {
                                    "operatorType": "normalization",
                                    "normalization": {"option": "basic"},
                                },
                            },
                        )
                    ),
                )["operationId"]
        elif normalization_config is True:
            raise Exception(
                f"Destination {destination.destination.name} does not support normalization."
            )

    return None


def reconcile_connections_pre(
    res: AirbyteResource,
    config_connections: Mapping[str, AirbyteConnection],
    existing_sources: Mapping[str, InitializedAirbyteSource],
    existing_destinations: Mapping[str, InitializedAirbyteDestination],
    workspace_id: str,
    dry_run: bool,
    should_delete: bool,
) -> ManagedElementCheckResult:
    """Generates the diff for connections, and deletes any connections that are not in the config if
    dry_run is False.

    It's necessary to do this in two steps because we need to remove connections that depend on
    sources and destinations that are being deleted or recreated before Airbyte will allow us to
    delete or recreate them.
    """
    diff = ManagedElementDiff()

    existing_connections_raw = cast(
        Dict[str, List[Dict[str, Any]]],
        check.not_none(
            res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})
        ),
    )
    existing_connections: Dict[str, InitializedAirbyteConnection] = {
        connection_json["name"]: InitializedAirbyteConnection.from_api_json(
            connection_json, existing_sources, existing_destinations
        )
        for connection_json in existing_connections_raw.get("connections", [])
    }

    for conn_name in set(config_connections.keys()).union(existing_connections.keys()):
        config_conn = config_connections.get(conn_name)
        existing_conn = existing_connections.get(conn_name)

        # Ignore connections not mentioned in the user config unless the user specifies to delete
        if not should_delete and not config_conn:
            continue

        diff = diff.join(
            diff_connections(config_conn, existing_conn.connection if existing_conn else None)  # type: ignore
        )

        if existing_conn and (
            not config_conn or config_conn.must_be_recreated(existing_conn.connection)
        ):
            if not dry_run:
                res.make_request(
                    endpoint="/connections/delete",
                    data={"connectionId": existing_conn.connection_id},
                )
    return diff


def reconcile_connections_post(
    res: AirbyteResource,
    config_connections: Mapping[str, AirbyteConnection],
    init_sources: Mapping[str, InitializedAirbyteSource],
    init_dests: Mapping[str, InitializedAirbyteDestination],
    workspace_id: str,
    dry_run: bool,
) -> None:
    """Creates new and modifies existing connections based on the config if dry_run is False.

    The list of existing connections is fetched again here, so connections deleted by
    ``reconcile_connections_pre`` are seen as missing and get created.
    """
    existing_connections_raw = cast(
        Dict[str, List[Dict[str, Any]]],
        check.not_none(
            res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})
        ),
    )
    existing_connections = {
        connection_json["name"]: InitializedAirbyteConnection.from_api_json(
            connection_json, init_sources, init_dests
        )
        for connection_json in existing_connections_raw.get("connections", [])
    }

    for conn_name, config_conn in config_connections.items():
        existing_conn = existing_connections.get(conn_name)

        normalization_operation_id = None
        if not dry_run:
            destination = init_dests[config_conn.destination.name]

            # Enable or disable basic normalization based on config.
            # Pass the id of *this* connection (if it already exists) so an existing
            # normalization operation is reused. The previous lookup used the literal key
            # "name" on the name -> connection mapping and therefore never found one.
            normalization_operation_id = reconcile_normalization(
                res,
                existing_conn.connection_id if existing_conn else None,
                destination,
                config_conn.normalize_data,
                workspace_id,
            )

        configured_streams = []
        if not dry_run:
            source = init_sources[config_conn.source.name]
            schema = res.get_source_schema(source.source_id)
            base_streams = schema["catalog"]["streams"]

            # Only streams the user configured are included in the sync catalog.
            configured_streams = [
                gen_configured_stream_json(stream, config_conn.stream_config)
                for stream in base_streams
                if stream["stream"]["name"] in config_conn.stream_config
            ]

        connection_base_json = {
            "name": conn_name,
            "namespaceDefinition": "source",
            "namespaceFormat": "${SOURCE_NAMESPACE}",
            "prefix": "",
            "operationIds": [normalization_operation_id] if normalization_operation_id else [],
            "syncCatalog": {"streams": configured_streams},
            "scheduleType": "manual",
            "status": "active",
        }

        if isinstance(config_conn.destination_namespace, AirbyteDestinationNamespace):
            connection_base_json["namespaceDefinition"] = config_conn.destination_namespace.value
        else:
            # Any non-enum namespace is sent as a custom format string.
            connection_base_json["namespaceDefinition"] = "customformat"
            connection_base_json["namespaceFormat"] = cast(str, config_conn.destination_namespace)

        if config_conn.prefix:
            connection_base_json["prefix"] = config_conn.prefix

        if existing_conn:
            if not dry_run:
                source = init_sources[config_conn.source.name]
                res.make_request(
                    endpoint="/connections/update",
                    data={
                        **connection_base_json,
                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),
                        "connectionId": existing_conn.connection_id,
                    },
                )
        else:
            if not dry_run:
                source = init_sources[config_conn.source.name]
                destination = init_dests[config_conn.destination.name]

                res.make_request(
                    endpoint="/connections/create",
                    data={
                        **connection_base_json,
                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),
                        "sourceId": source.source_id,
                        "destinationId": destination.destination_id,
                    },
                )


@experimental
class AirbyteManagedElementReconciler(ManagedElementReconciler):
    """Reconciles Python-specified Airbyte connections with an Airbyte instance.

    Passing the module containing an AirbyteManagedElementReconciler to the dagster-airbyte
    CLI will allow you to check the state of your Python-code-specified Airbyte connections
    against an Airbyte instance, and reconcile them if necessary.

    This functionality is experimental and subject to change.
    """

    @public
    def __init__(
        self,
        airbyte: Union[AirbyteResource, ResourceDefinition],
        connections: Iterable[AirbyteConnection],
        delete_unmentioned_resources: bool = False,
    ):
        """Reconciles Python-specified Airbyte connections with an Airbyte instance.

        Args:
            airbyte (Union[AirbyteResource, ResourceDefinition]): The Airbyte resource definition to reconcile against.
            connections (Iterable[AirbyteConnection]): The Airbyte connection objects to reconcile.
            delete_unmentioned_resources (bool): Whether to delete resources that are not mentioned in
                the set of connections provided. When True, all Airbyte instance contents are effectively
                managed by the reconciler. Defaults to False.
        """
        # A ready-made AirbyteResource is used directly; anything else is called with a
        # freshly built init context to produce the resource.
        if isinstance(airbyte, AirbyteResource):
            resolved_instance = airbyte
        else:
            resolved_instance = airbyte(build_init_resource_context())
        self._airbyte_instance: AirbyteResource = resolved_instance

        validated_connections = check.iterable_param(
            connections, "connections", of_type=AirbyteConnection
        )
        self._connections = list(validated_connections)

        self._delete_unmentioned_resources = check.bool_param(
            delete_unmentioned_resources, "delete_unmentioned_resources"
        )

        super().__init__()

    def _run_reconcile(self, dry_run: bool, options: Mapping[str, Any]) -> ManagedElementCheckResult:
        """Shared implementation of ``check`` and ``apply``; only ``dry_run`` differs."""
        include_all_secrets = options.get("include_all_secrets", False)
        return reconcile_config(
            self._airbyte_instance,
            self._connections,
            dry_run=dry_run,
            should_delete=self._delete_unmentioned_resources,
            ignore_secrets=(not include_all_secrets),
        )

    def check(self, **kwargs) -> ManagedElementCheckResult:
        """Computes the diff against the Airbyte instance without applying it (dry run)."""
        return self._run_reconcile(True, kwargs)

    def apply(self, **kwargs) -> ManagedElementCheckResult:
        """Computes the diff against the Airbyte instance and applies it."""
        return self._run_reconcile(False, kwargs)
class AirbyteManagedElementCacheableAssetsDefinition(AirbyteInstanceCacheableAssetsDefinition):
    """Cacheable assets definition restricted to a user-provided set of connections.

    Before the connections are loaded, the provided configuration is checked against the
    Airbyte instance with a dry-run reconcile; loading fails if they are out of sync.
    """

    def __init__(
        self,
        airbyte_resource_def: AirbyteResource,
        key_prefix: Sequence[str],
        create_assets_for_normalization_tables: bool,
        connection_to_group_fn: Optional[Callable[[str], Optional[str]]],
        connections: Iterable[AirbyteConnection],
        connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],
        connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],
        connection_to_freshness_policy_fn: Optional[
            Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
        ],
    ):
        # Materialize exactly once: `connections` is only promised to be an Iterable, and a
        # one-shot iterator would be exhausted by the name set below, leaving
        # `self._connections` empty and making the sync check pass vacuously.
        materialized_connections: List[AirbyteConnection] = list(connections)
        defined_conn_names = {conn.name for conn in materialized_connections}
        super().__init__(
            airbyte_resource_def=airbyte_resource_def,
            workspace_id=None,
            key_prefix=key_prefix,
            create_assets_for_normalization_tables=create_assets_for_normalization_tables,
            connection_to_group_fn=connection_to_group_fn,
            connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,
            # Only connections named in the provided configuration are turned into assets.
            connection_filter=lambda conn: conn.name in defined_conn_names,
            connection_to_asset_key_fn=connection_to_asset_key_fn,
            connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,
        )
        self._connections: List[AirbyteConnection] = materialized_connections

    def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:
        """Returns the parent's connections after verifying the instance matches the config.

        Raises:
            ValueError: If the dry-run reconcile reports a non-empty diff or an error.
        """
        # NOTE(review): `_airbyte_instance` is not set in this class; presumably the parent
        # initializer provides it - confirm.
        diff = reconcile_config(self._airbyte_instance, self._connections, dry_run=True)
        if isinstance(diff, ManagedElementDiff) and not diff.is_empty():
            raise ValueError(
                f"Airbyte connections are not in sync with provided configuration, diff:\n{diff!s}"
            )
        elif isinstance(diff, ManagedElementError):
            raise ValueError(f"Error checking Airbyte connections: {diff}")

        return super()._get_connections()


@experimental
def load_assets_from_connections(
    airbyte: Union[AirbyteResource, ResourceDefinition],
    connections: Iterable[AirbyteConnection],
    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
    create_assets_for_normalization_tables: bool = True,
    connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,
    io_manager_key: Optional[str] = None,
    connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,
    connection_to_asset_key_fn: Optional[
        Callable[[AirbyteConnectionMetadata, str], AssetKey]
    ] = None,
    connection_to_freshness_policy_fn: Optional[
        Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
    ] = None,
) -> CacheableAssetsDefinition:
    """Loads Airbyte connection assets from a configured AirbyteResource instance, checking against a list of AirbyteConnection objects.
    This method will raise an error on repo load if the passed AirbyteConnection objects are not in sync with the Airbyte instance.

    Args:
        airbyte (Union[AirbyteResource, ResourceDefinition]): An AirbyteResource configured with the appropriate connection
            details.
        connections (Iterable[AirbyteConnection]): A list of AirbyteConnection objects to build assets for.
        key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.
        create_assets_for_normalization_tables (bool): If True, assets will be created for tables
            created by Airbyte's normalization feature. If False, only the destination tables
            will be created. Defaults to True.
        connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset
            group name for a given Airbyte connection name. If None, no groups will be created. Defaults
            to a basic sanitization function.
        io_manager_key (Optional[str]): The IO manager key to use for all assets. Defaults to "io_manager".
            Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.
        connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an
            IO manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,
            the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".
        connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which
            takes in connection metadata and table name and returns an asset key for the table. If None, the default asset
            key is based on the table name. Any asset key prefix will be applied to the output of this function.
        connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function which
            takes in connection metadata and returns a freshness policy for the connection. If None, no freshness policy will be applied.

    **Examples:**

    .. code-block:: python

        from dagster_airbyte import (
            AirbyteConnection,
            AirbyteResource,
            load_assets_from_connections,
        )

        airbyte_instance = AirbyteResource(
            host="localhost",
            port="8000",
        )
        airbyte_connections = [
            AirbyteConnection(...),
            AirbyteConnection(...)
        ]
        airbyte_assets = load_assets_from_connections(airbyte_instance, airbyte_connections)
    """
    # Accept a bare string as a single-element prefix.
    if isinstance(key_prefix, str):
        key_prefix = [key_prefix]
    key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)

    check.invariant(
        not io_manager_key or not connection_to_io_manager_key_fn,
        "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",
    )
    if not connection_to_io_manager_key_fn:
        # Without a per-connection function, every connection maps to the single key
        # (which may itself be None).
        connection_to_io_manager_key_fn = lambda _: io_manager_key

    return AirbyteManagedElementCacheableAssetsDefinition(
        airbyte_resource_def=(
            airbyte
            if isinstance(airbyte, AirbyteResource)
            else airbyte(build_init_resource_context())
        ),
        key_prefix=key_prefix,
        create_assets_for_normalization_tables=check.bool_param(
            create_assets_for_normalization_tables, "create_assets_for_normalization_tables"
        ),
        connection_to_group_fn=check.opt_callable_param(
            connection_to_group_fn, "connection_to_group_fn"
        ),
        connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,
        connections=check.iterable_param(connections, "connections", of_type=AirbyteConnection),
        connection_to_asset_key_fn=connection_to_asset_key_fn,
        connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,
    )
\n
", "current_page_name": "_modules/dagster_airbyte/managed/reconciliation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.reconciliation"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.types

\nimport json\nfrom abc import ABC\nfrom enum import Enum\nfrom typing import Any, Dict, List, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\n\n
class AirbyteSyncMode(ABC):
    """Represents the sync mode for a given Airbyte stream, which governs how Airbyte reads
    from a source and writes to a destination.

    For more information, see https://docs.airbyte.com/understanding-airbyte/connections/.
    """

    def __init__(self, json_repr: Dict[str, Any]):
        # The raw mapping is stored as-is and handed back by to_json().
        self.json_repr = json_repr

    def __eq__(self, other: Any) -> bool:
        # Two sync modes are equal when their JSON representations are equal.
        if not isinstance(other, AirbyteSyncMode):
            return False
        return self.to_json() == other.to_json()

    def to_json(self) -> Dict[str, Any]:
        """Returns the stored JSON representation."""
        return self.json_repr

    @classmethod
    def from_json(cls, json_repr: Dict[str, Any]) -> "AirbyteSyncMode":
        """Builds a sync mode from a mapping, keeping only the sync-mode related keys."""
        recognized_keys = ("syncMode", "destinationSyncMode", "cursorField", "primaryKey")
        filtered: Dict[str, Any] = {}
        for key, value in json_repr.items():
            if key in recognized_keys:
                filtered[key] = value
        return cls(filtered)

    @public
    @classmethod
    def full_refresh_append(cls) -> "AirbyteSyncMode":
        """Syncs the entire data stream from the source, appending rows to the destination.

        https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-append/
        """
        spec = {"syncMode": "full_refresh", "destinationSyncMode": "append"}
        return cls(spec)

    @public
    @classmethod
    def full_refresh_overwrite(cls) -> "AirbyteSyncMode":
        """Syncs the entire data stream from the source, replaces data in the destination by
        overwriting it.

        https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-overwrite
        """
        spec = {"syncMode": "full_refresh", "destinationSyncMode": "overwrite"}
        return cls(spec)

    @public
    @classmethod
    def incremental_append(
        cls,
        cursor_field: Optional[str] = None,
    ) -> "AirbyteSyncMode":
        """Syncs only new records from the source, appending rows to the destination.
        May optionally specify the cursor field used to determine which records
        are new.

        https://docs.airbyte.com/understanding-airbyte/connections/incremental-append/
        """
        cursor_field = check.opt_str_param(cursor_field, "cursor_field")

        spec: Dict[str, Any] = {
            "syncMode": "incremental",
            "destinationSyncMode": "append",
        }
        # The cursor is only emitted when provided, wrapped in a single-element list.
        if cursor_field:
            spec["cursorField"] = [cursor_field]
        return cls(spec)

    @public
    @classmethod
    def incremental_append_dedup(
        cls,
        cursor_field: Optional[str] = None,
        primary_key: Optional[Union[str, List[str]]] = None,
    ) -> "AirbyteSyncMode":
        """Syncs new records from the source, appending to an append-only history
        table in the destination. Also generates a deduplicated view mirroring the
        source table. May optionally specify the cursor field used to determine
        which records are new, and the primary key used to determine which records
        are duplicates.

        https://docs.airbyte.com/understanding-airbyte/connections/incremental-append-dedup/
        """
        cursor_field = check.opt_str_param(cursor_field, "cursor_field")
        # A single key given as a string is treated as a one-element list.
        if isinstance(primary_key, str):
            primary_key = [primary_key]
        primary_key = check.opt_list_param(primary_key, "primary_key", of_type=str)

        spec: Dict[str, Any] = {
            "syncMode": "incremental",
            "destinationSyncMode": "append_dedup",
        }
        if cursor_field:
            spec["cursorField"] = [cursor_field]
        if primary_key:
            # Each key column is wrapped in its own single-element list.
            spec["primaryKey"] = [[column] for column in primary_key]
        return cls(spec)
\n\n\n
[docs]class AirbyteSource:\n """Represents a user-defined Airbyte source.\n\n Args:\n name (str): The display name of the source.\n source_type (str): The type of the source, from Airbyte's list\n of sources https://airbytehq.github.io/category/sources/.\n source_configuration (Mapping[str, Any]): The configuration for the\n source, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(self, name: str, source_type: str, source_configuration: Mapping[str, Any]):\n self.name = check.str_param(name, "name")\n self.source_type = check.str_param(source_type, "source_type")\n self.source_configuration = check.mapping_param(\n source_configuration, "source_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteSource") -> bool:\n return self.name != other.name or self.source_type != other.source_type
\n\n\nclass InitializedAirbyteSource:\n """User-defined Airbyte source bound to actual created Airbyte source."""\n\n def __init__(self, source: AirbyteSource, source_id: str, source_definition_id: Optional[str]):\n self.source = source\n self.source_id = source_id\n self.source_definition_id = source_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n source=AirbyteSource(\n name=api_json["name"],\n source_type=api_json["sourceName"],\n source_configuration=api_json["connectionConfiguration"],\n ),\n source_id=api_json["sourceId"],\n source_definition_id=None,\n )\n\n\n
[docs]class AirbyteDestination:\n """Represents a user-defined Airbyte destination.\n\n Args:\n name (str): The display name of the destination.\n destination_type (str): The type of the destination, from Airbyte's list\n of destinations https://airbytehq.github.io/category/destinations/.\n destination_configuration (Mapping[str, Any]): The configuration for the\n destination, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(\n self, name: str, destination_type: str, destination_configuration: Mapping[str, Any]\n ):\n self.name = check.str_param(name, "name")\n self.destination_type = check.str_param(destination_type, "destination_type")\n self.destination_configuration = check.mapping_param(\n destination_configuration, "destination_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteDestination") -> bool:\n return self.name != other.name or self.destination_type != other.destination_type
\n\n\nclass InitializedAirbyteDestination:\n """User-defined Airbyte destination bound to actual created Airbyte destination."""\n\n def __init__(\n self,\n destination: AirbyteDestination,\n destination_id: str,\n destination_definition_id: Optional[str],\n ):\n self.destination = destination\n self.destination_id = destination_id\n self.destination_definition_id = destination_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n destination=AirbyteDestination(\n name=api_json["name"],\n destination_type=api_json["destinationName"],\n destination_configuration=api_json["connectionConfiguration"],\n ),\n destination_id=api_json["destinationId"],\n destination_definition_id=None,\n )\n\n\nclass AirbyteDestinationNamespace(Enum):\n """Represents the sync mode for a given Airbyte stream."""\n\n SAME_AS_SOURCE = "source"\n DESTINATION_DEFAULT = "destination"\n\n\n
[docs]class AirbyteConnection:\n """A user-defined Airbyte connection, pairing an Airbyte source and destination and configuring\n which streams to sync.\n\n Args:\n name (str): The display name of the connection.\n source (AirbyteSource): The source to sync from.\n destination (AirbyteDestination): The destination to sync to.\n stream_config (Mapping[str, AirbyteSyncMode]): A mapping from stream name to\n the sync mode for that stream, including any additional configuration\n of primary key or cursor field.\n normalize_data (Optional[bool]): Whether to normalize the data in the\n destination.\n destination_namespace (Optional[Union[AirbyteDestinationNamespace, str]]):\n The namespace to sync to in the destination. If set to\n AirbyteDestinationNamespace.SAME_AS_SOURCE, the namespace will be the\n same as the source namespace. If set to\n AirbyteDestinationNamespace.DESTINATION_DEFAULT, the namespace will be\n the default namespace for the destination. If set to a string, the\n namespace will be that string.\n prefix (Optional[str]): A prefix to add to the table names in the destination.\n\n Example:\n .. code-block:: python\n\n from dagster_airbyte.managed.generated.sources import FileSource\n from dagster_airbyte.managed.generated.destinations import LocalJsonDestination\n from dagster_airbyte import AirbyteConnection, AirbyteSyncMode\n\n cereals_csv_source = FileSource(...)\n local_json_destination = LocalJsonDestination(...)\n\n cereals_connection = AirbyteConnection(\n name="download-cereals",\n source=cereals_csv_source,\n destination=local_json_destination,\n stream_config={"cereals": AirbyteSyncMode.full_refresh_overwrite()},\n )\n """\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n source: AirbyteSource,\n destination: AirbyteDestination,\n stream_config: Mapping[str, AirbyteSyncMode],\n normalize_data: Optional[bool] = None,\n destination_namespace: Optional[\n Union[AirbyteDestinationNamespace, str]\n ] = AirbyteDestinationNamespace.SAME_AS_SOURCE,\n prefix: Optional[str] = None,\n ):\n self.name = check.str_param(name, "name")\n self.source = check.inst_param(source, "source", AirbyteSource)\n self.destination = check.inst_param(destination, "destination", AirbyteDestination)\n self.stream_config = check.mapping_param(\n stream_config, "stream_config", key_type=str, value_type=AirbyteSyncMode\n )\n self.normalize_data = check.opt_bool_param(normalize_data, "normalize_data")\n self.destination_namespace = check.opt_inst_param(\n destination_namespace, "destination_namespace", (str, AirbyteDestinationNamespace)\n )\n self.prefix = check.opt_str_param(prefix, "prefix")
\n\n def must_be_recreated(self, other: Optional["AirbyteConnection"]) -> bool:\n return (\n not other\n or self.source.must_be_recreated(other.source)\n or self.destination.must_be_recreated(other.destination)\n )
\n\n\nclass InitializedAirbyteConnection:\n """User-defined Airbyte connection bound to actual created Airbyte connection."""\n\n def __init__(\n self,\n connection: AirbyteConnection,\n connection_id: str,\n ):\n self.connection = connection\n self.connection_id = connection_id\n\n @classmethod\n def from_api_json(\n cls,\n api_dict: Mapping[str, Any],\n init_sources: Mapping[str, InitializedAirbyteSource],\n init_dests: Mapping[str, InitializedAirbyteDestination],\n ):\n source = next(\n (\n source.source\n for source in init_sources.values()\n if source.source_id == api_dict["sourceId"]\n ),\n None,\n )\n dest = next(\n (\n dest.destination\n for dest in init_dests.values()\n if dest.destination_id == api_dict["destinationId"]\n ),\n None,\n )\n\n source = check.not_none(source, f"Could not find source with id {api_dict['sourceId']}")\n dest = check.not_none(\n dest, f"Could not find destination with id {api_dict['destinationId']}"\n )\n\n streams = {\n stream["stream"]["name"]: AirbyteSyncMode.from_json(stream["config"])\n for stream in api_dict["syncCatalog"]["streams"]\n }\n return cls(\n AirbyteConnection(\n name=api_dict["name"],\n source=source,\n destination=dest,\n stream_config=streams,\n normalize_data=len(api_dict["operationIds"]) > 0,\n destination_namespace=(\n api_dict["namespaceFormat"]\n if api_dict["namespaceDefinition"] == "customformat"\n else AirbyteDestinationNamespace(api_dict["namespaceDefinition"])\n ),\n prefix=api_dict["prefix"] if api_dict.get("prefix") else None,\n ),\n api_dict["connectionId"],\n )\n\n\ndef _remove_none_values(obj: Dict[str, Any]) -> Dict[str, Any]:\n return {k: v for k, v in obj.items() if v is not None}\n\n\ndef _dump_class(obj: Any) -> Dict[str, Any]:\n return json.loads(json.dumps(obj, default=lambda o: _remove_none_values(o.__dict__)))\n\n\nclass GeneratedAirbyteSource(AirbyteSource):\n """Base class used by the codegen Airbyte sources. 
This class is not intended to be used directly.\n\n Converts all of its attributes into a source configuration dict which is passed down to the base\n AirbyteSource class.\n """\n\n def __init__(self, source_type: str, name: str):\n source_configuration = _dump_class(self)\n super().__init__(\n name=name, source_type=source_type, source_configuration=source_configuration\n )\n\n\nclass GeneratedAirbyteDestination(AirbyteDestination):\n """Base class used by the codegen Airbyte destinations. This class is not intended to be used directly.\n\n Converts all of its attributes into a destination configuration dict which is passed down to the\n base AirbyteDestination class.\n """\n\n def __init__(self, source_type: str, name: str):\n destination_configuration = _dump_class(self)\n super().__init__(\n name=name,\n destination_type=source_type,\n destination_configuration=destination_configuration,\n )\n
", "current_page_name": "_modules/dagster_airbyte/managed/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.types"}}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.ops

\nfrom typing import Any, Iterable, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom dagster_airbyte.types import AirbyteOutput\nfrom dagster_airbyte.utils import _get_attempt, generate_materializations\n\nfrom .resources import DEFAULT_POLL_INTERVAL_SECONDS, BaseAirbyteResource\n\n\nclass AirbyteSyncConfig(Config):\n    connection_id: str = Field(\n        ...,\n        description=(\n            "Parsed json dictionary representing the details of the Airbyte connector after the"\n            " sync successfully completes. See the [Airbyte API"\n            " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n            " to see detailed information on this response."\n        ),\n    )\n    poll_interval: float = Field(\n        DEFAULT_POLL_INTERVAL_SECONDS,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    poll_timeout: Optional[float] = Field(\n        None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        True,\n        description=(\n            "If True, materializations corresponding to the results of the Airbyte sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        ["airbyte"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n AirbyteOutput,\n description=(\n "Parsed json dictionary representing the details of the Airbyte connector after the"\n " sync successfully completes. See the [Airbyte API"\n " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n " to see detailed information on this response."\n ),\n ),\n tags={"kind": "airbyte"},\n)\ndef airbyte_sync_op(\n context, config: AirbyteSyncConfig, airbyte: BaseAirbyteResource\n) -> Iterable[Any]:\n """Executes a Airbyte job sync for a given ``connection_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a AirbyteOutput which contains\n the job details for a given ``connection_id``.\n\n It requires the use of the :py:class:`~dagster_airbyte.airbyte_resource`, which allows it to\n communicate with the Airbyte API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource, airbyte_sync_op\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n sync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_simple_airbyte_job():\n sync_foobar()\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_composed_airbyte_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n airbyte_output = airbyte.sync_and_poll(\n connection_id=config.connection_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n airbyte_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(\n airbyte_output,\n metadata={\n **_get_attempt(airbyte_output.job_details.get("attempts", [{}])[-1]).get(\n "totalStats", {}\n )\n 
},\n )
\n
", "current_page_name": "_modules/dagster_airbyte/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.resources

\nimport hashlib\nimport json\nimport logging\nimport sys\nimport time\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, List, Mapping, Optional, cast\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom dagster_airbyte.types import AirbyteOutput\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n\n\nclass AirbyteState:\n    RUNNING = "running"\n    SUCCEEDED = "succeeded"\n    CANCELLED = "cancelled"\n    PENDING = "pending"\n    FAILED = "failed"\n    ERROR = "error"\n    INCOMPLETE = "incomplete"\n\n\nclass AirbyteResourceState:\n    def __init__(self) -> None:\n        self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}\n        # Int in case we nest contexts\n        self.cache_enabled = 0\n\n\nclass BaseAirbyteResource(ConfigurableResource):\n    request_max_retries: int = Field(\n        default=3,\n        description=(\n            "The maximum number of times requests to the Airbyte API should be retried "\n            "before failing."\n        ),\n    )\n    request_retry_delay: float = Field(\n        default=0.25,\n        description="Time (in seconds) to wait between each request retry.",\n    )\n    request_timeout: int = Field(\n        default=15,\n        description="Time (in seconds) after which the requests to Airbyte are declared timed out.",\n    )\n    cancel_sync_on_run_termination: bool = Field(\n        default=True,\n        description=(\n            "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. 
This may"\n            " be useful to disable if using Airbyte sources that cannot be cancelled and"\n            " resumed easily, or if your Dagster deployment may experience runner interruptions"\n            " that do not impact your Airbyte deployment."\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL_SECONDS,\n        description="Time (in seconds) to wait between checking a sync's status.",\n    )\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    @property\n    @cached_method\n    def _log(self) -> logging.Logger:\n        return get_dagster_logger()\n\n    @property\n    @abstractmethod\n    def api_base_url(self) -> str:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        raise NotImplementedError()\n\n    def make_request(\n        self, endpoint: str, data: Optional[Mapping[str, object]] = None, method: str = "POST"\n    ) -> Optional[Mapping[str, object]]:\n        """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n        Args:\n            endpoint (str): The Airbyte API endpoint to send this request to.\n            data (Optional[str]): JSON-formatted data string to be included in the request.\n\n        Returns:\n            Optional[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        url = self.api_base_url + endpoint\n        headers = {"accept": "application/json"}\n\n        num_retries = 0\n        while True:\n            try:\n                request_args: Dict[str, Any] = dict(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    timeout=self.request_timeout,\n                )\n                if data:\n                    request_args["json"] = data\n\n                request_args = deep_merge_dicts(\n                    
request_args,\n                    self.all_additional_request_params,\n                )\n\n                response = requests.request(\n                    **request_args,\n                )\n                response.raise_for_status()\n                if response.status_code == 204:\n                    return None\n                return response.json()\n            except RequestException as e:\n                self._log.error("Request to Airbyte API failed: %s", e)\n                if num_retries == self.request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self.request_retry_delay)\n\n        raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n    @abstractmethod\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def cancel_job(self, job_id: int):\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def _should_forward_logs(self) -> bool:\n        raise NotImplementedError()\n\n    def sync_and_poll(\n        self,\n        connection_id: str,\n        poll_interval: Optional[float] = None,\n        poll_timeout: Optional[float] = None,\n    ) -> AirbyteOutput:\n        """Initializes a sync operation for the given connector, and polls until it completes.\n\n        Args:\n            connection_id (str): The Airbyte Connector ID. 
You can retrieve this value from the\n                "Connection" tab of a given connection in the Arbyte UI.\n            poll_interval (float): The time (in seconds) that will be waited between successive polls.\n            poll_timeout (float): The maximum time that will waited before this operation is timed\n                out. By default, this will never time out.\n\n        Returns:\n            :py:class:`~AirbyteOutput`:\n                Details of the sync job.\n        """\n        connection_details = self.get_connection_details(connection_id)\n        job_details = self.start_sync(connection_id)\n        job_info = cast(Dict[str, object], job_details.get("job", {}))\n        job_id = cast(int, job_info.get("id"))\n\n        self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n        start = time.monotonic()\n        logged_attempts = 0\n        logged_lines = 0\n        state = None\n\n        try:\n            while True:\n                if poll_timeout and start + poll_timeout < time.monotonic():\n                    raise Failure(\n                        f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n                        f" {poll_timeout} seconds"\n                    )\n                time.sleep(poll_interval or self.poll_interval)\n                job_details = self.get_job_status(connection_id, job_id)\n                attempts = cast(List, job_details.get("attempts", []))\n                cur_attempt = len(attempts)\n                # spit out the available Airbyte log info\n                if cur_attempt:\n                    if self._should_forward_logs:\n                        log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n                        for line in log_lines[logged_lines:]:\n                            sys.stdout.write(line + "\\n")\n                            sys.stdout.flush()\n                        logged_lines = len(log_lines)\n\n             
       # if there's a next attempt, this one will have no more log messages\n                    if logged_attempts < cur_attempt - 1:\n                        logged_lines = 0\n                        logged_attempts += 1\n\n                job_info = cast(Dict[str, object], job_details.get("job", {}))\n                state = job_info.get("status")\n\n                if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n                    continue\n                elif state == AirbyteState.SUCCEEDED:\n                    break\n                elif state == AirbyteState.ERROR:\n                    raise Failure(f"Job failed: {job_id}")\n                elif state == AirbyteState.CANCELLED:\n                    raise Failure(f"Job was cancelled: {job_id}")\n                else:\n                    raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n        finally:\n            # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n            # the python process\n            if (\n                state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n                and self.cancel_sync_on_run_termination\n            ):\n                self.cancel_job(job_id)\n\n        return AirbyteOutput(job_details=job_details, connection_details=connection_details)\n\n\nclass AirbyteCloudResource(BaseAirbyteResource):\n    """This resource allows users to programatically interface with the Airbyte Cloud API to launch\n    syncs and monitor their progress.\n\n    **Examples:**\n\n    .. 
code-block:: python\n\n        from dagster import job, EnvVar\n        from dagster_airbyte import AirbyteResource\n\n        my_airbyte_resource = AirbyteCloudResource(\n            api_key=EnvVar("AIRBYTE_API_KEY"),\n        )\n\n        airbyte_assets = build_airbyte_assets(\n            connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n            destination_tables=["releases", "tags", "teams"],\n        )\n\n        defs = Definitions(\n            assets=[airbyte_assets],\n            resources={"airbyte": my_airbyte_resource},\n        )\n    """\n\n    api_key: str = Field(..., description="The Airbyte Cloud API key.")\n\n    @property\n    def api_base_url(self) -> str:\n        return "https://api.airbyte.com/v1"\n\n    @property\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        return {"headers": {"Authorization": f"Bearer {self.api_key}", "User-Agent": "dagster"}}\n\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        job_sync = check.not_none(\n            self.make_request(\n                endpoint="/jobs",\n                data={\n                    "connectionId": connection_id,\n                    "jobType": "sync",\n                },\n            )\n        )\n        return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}\n\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        return {}\n\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))\n        return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}\n\n    def cancel_job(self, job_id: int):\n        self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")\n\n    @property\n    def _should_forward_logs(self) -> bool:\n        # Airbyte Cloud does not support streaming logs yet\n        return False\n\n\n
[docs]class AirbyteResource(BaseAirbyteResource):\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job, EnvVar\n from dagster_airbyte import AirbyteResource\n\n my_airbyte_resource = AirbyteResource(\n host=EnvVar("AIRBYTE_HOST"),\n port=EnvVar("AIRBYTE_PORT"),\n # If using basic auth\n username=EnvVar("AIRBYTE_USERNAME"),\n password=EnvVar("AIRBYTE_PASSWORD"),\n )\n\n airbyte_assets = build_airbyte_assets(\n connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n destination_tables=["releases", "tags", "teams"],\n )\n\n defs = Definitions(\n assets=[airbyte_assets],\n resources={"airbyte": my_airbyte_resource},\n )\n """\n\n host: str = Field(description="The Airbyte server address.")\n port: str = Field(description="Port used for the Airbyte server.")\n username: Optional[str] = Field(default=None, description="Username if using basic auth.")\n password: Optional[str] = Field(default=None, description="Password if using basic auth.")\n use_https: bool = Field(\n default=False, description="Whether to use HTTPS to connect to the Airbyte server."\n )\n forward_logs: bool = Field(\n default=True,\n description=(\n "Whether to forward Airbyte logs to the compute log, can be expensive for"\n " long-running syncs."\n ),\n )\n request_additional_params: Mapping[str, Any] = Field(\n default=dict(),\n description=(\n "Any additional kwargs to pass to the requests library when making requests to Airbyte."\n ),\n )\n\n @property\n @cached_method\n def _state(self) -> AirbyteResourceState:\n return AirbyteResourceState()\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return (\n ("https://" if self.use_https else "http://")\n + (f"{self.host}:{self.port}" if self.port else self.host)\n + "/api/v1"\n )\n\n @property\n def 
_should_forward_logs(self) -> bool:\n return self.forward_logs\n\n @contextmanager\n def cache_requests(self):\n """Context manager that enables caching certain requests to the Airbyte API,\n cleared when the context is exited.\n """\n self.clear_request_cache()\n self._state.cache_enabled += 1\n try:\n yield\n finally:\n self.clear_request_cache()\n self._state.cache_enabled -= 1\n\n def clear_request_cache(self) -> None:\n self._state.request_cache = {}\n\n def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):\n if not self._state.cache_enabled > 0:\n return self.make_request(endpoint, data)\n data_json = json.dumps(data, sort_keys=True)\n sha = hashlib.sha1()\n sha.update(endpoint.encode("utf-8"))\n sha.update(data_json.encode("utf-8"))\n digest = sha.hexdigest()\n\n if digest not in self._state.request_cache:\n self._state.request_cache[digest] = self.make_request(endpoint, data)\n return self._state.request_cache[digest]\n\n @property\n def all_additional_request_params(self) -> Mapping[str, Any]:\n auth_param = (\n {"auth": (self.username, self.password)} if self.username and self.password else {}\n )\n return {**auth_param, **self.request_additional_params}\n\n def make_request(\n self, endpoint: str, data: Optional[Mapping[str, object]]\n ) -> Optional[Mapping[str, object]]:\n """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n Args:\n endpoint (str): The Airbyte API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Optional[Dict[str, Any]]: Parsed json data from the response to this request\n """\n url = self.api_base_url + endpoint\n headers = {"accept": "application/json"}\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n **deep_merge_dicts( # type: ignore\n dict(\n method="POST",\n url=url,\n headers=headers,\n json=data,\n timeout=self.request_timeout,\n auth=(\n (self.username, self.password)\n 
if self.username and self.password\n else None\n ),\n ),\n self.request_additional_params,\n ),\n )\n response.raise_for_status()\n if response.status_code == 204:\n return None\n return response.json()\n except RequestException as e:\n self._log.error("Request to Airbyte API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def cancel_job(self, job_id: int):\n self.make_request(endpoint="/jobs/cancel", data={"id": job_id})\n\n def get_default_workspace(self) -> str:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(\n "workspaces", []\n ),\n )\n return workspaces[0]["workspaceId"]\n\n def get_source_definition_by_name(self, name: str) -> Optional[str]:\n name_lower = name.lower()\n definitions = self.make_request_cached(endpoint="/source_definitions/list", data={})\n\n return next(\n (\n definition["sourceDefinitionId"]\n for definition in definitions["sourceDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_destination_definition_by_name(self, name: str):\n name_lower = name.lower()\n definitions = cast(\n Dict[str, List[Dict[str, str]]],\n check.not_none(\n self.make_request_cached(endpoint="/destination_definitions/list", data={})\n ),\n )\n return next(\n (\n definition["destinationDefinitionId"]\n for definition in definitions["destinationDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_source_catalog_id(self, source_id: str):\n result = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n return result["catalogId"]\n\n def get_source_schema(self, source_id: str) -> Mapping[str, Any]:\n return cast(\n Dict[str, Any],\n check.not_none(\n 
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n\n def does_dest_support_normalization(\n self, destination_definition_id: str, workspace_id: str\n ) -> bool:\n # Airbyte API changed source of truth for normalization in PR\n # https://github.com/airbytehq/airbyte/pull/21005\n norm_dest_def_spec: bool = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definition_specifications/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n "workspaceId": workspace_id,\n },\n )\n ),\n ).get("supportsNormalization", False)\n\n norm_dest_def: bool = (\n cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definitions/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n },\n )\n ),\n )\n .get("normalizationConfig", {})\n .get("supported", False)\n )\n\n return any([norm_dest_def_spec, norm_dest_def])\n\n def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n if self.forward_logs:\n return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))\n else:\n # the "list all jobs" endpoint doesn't return logs, which actually makes it much more\n # lightweight for long-running syncs with many logs\n out = check.not_none(\n self.make_request(\n endpoint="/jobs/list",\n data={\n "configTypes": ["sync"],\n "configId": connection_id,\n # sync should be the most recent, so pageSize 5 is sufficient\n "pagination": {"pageSize": 5},\n },\n )\n )\n job = next((job for job in cast(List, out["jobs"]) if job["job"]["id"] == job_id), None)\n\n return check.not_none(job)\n\n def start_sync(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})\n )\n\n def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n 
self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})\n )\n\n def sync_and_poll(\n self,\n connection_id: str,\n poll_interval: Optional[float] = None,\n poll_timeout: Optional[float] = None,\n ) -> AirbyteOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connection_id (str): The Airbyte Connector ID. You can retrieve this value from the\n "Connection" tab of a given connection in the Arbyte UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~AirbyteOutput`:\n Details of the sync job.\n """\n connection_details = self.get_connection_details(connection_id)\n job_details = self.start_sync(connection_id)\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n job_id = cast(int, job_info.get("id"))\n\n self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n start = time.monotonic()\n logged_attempts = 0\n logged_lines = 0\n state = None\n\n try:\n while True:\n if poll_timeout and start + poll_timeout < time.monotonic():\n raise Failure(\n f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n f" {poll_timeout} seconds"\n )\n time.sleep(poll_interval or self.poll_interval)\n job_details = self.get_job_status(connection_id, job_id)\n attempts = cast(List, job_details.get("attempts", []))\n cur_attempt = len(attempts)\n # spit out the available Airbyte log info\n if cur_attempt:\n if self.forward_logs:\n log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n for line in log_lines[logged_lines:]:\n sys.stdout.write(line + "\\n")\n sys.stdout.flush()\n logged_lines = len(log_lines)\n\n # if there's a next attempt, this one will have no more log messages\n if logged_attempts < cur_attempt - 1:\n logged_lines = 0\n 
logged_attempts += 1\n\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n state = job_info.get("status")\n\n if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n continue\n elif state == AirbyteState.SUCCEEDED:\n break\n elif state == AirbyteState.ERROR:\n raise Failure(f"Job failed: {job_id}")\n elif state == AirbyteState.CANCELLED:\n raise Failure(f"Job was cancelled: {job_id}")\n else:\n raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n finally:\n # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n # the python process\n if (\n state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n and self.cancel_sync_on_run_termination\n ):\n self.cancel_job(job_id)\n\n return AirbyteOutput(job_details=job_details, connection_details=connection_details)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=AirbyteResource.to_config_schema())\ndef airbyte_resource(context) -> AirbyteResource:\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Airbyte REST API, including expected response JSON\n schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n # If using basic auth\n "username": {"env": "AIRBYTE_USERNAME"},\n "password": {"env": "AIRBYTE_PASSWORD"},\n }\n )\n\n @job(resource_defs={"airbyte":my_airbyte_resource})\n def my_airbyte_job():\n ...\n\n """\n return AirbyteResource.from_resource_context(context)
\n\n\n@dagster_maintained_resource\n@resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))\ndef airbyte_cloud_resource(context) -> AirbyteCloudResource:\n """This resource allows users to programatically interface with the Airbyte Cloud REST API to launch\n syncs and monitor their progress. Currently, this resource may only be used with the more basic\n `dagster-airbyte` APIs, including the ops and assets.\n\n """\n return AirbyteCloudResource.from_resource_context(context)\n
", "current_page_name": "_modules/dagster_airbyte/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.resources"}}, "dagster_airflow": {"dagster_asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_asset_factory

\nfrom typing import AbstractSet, List, Mapping, Optional, Set, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    AssetKey,\n    AssetsDefinition,\n    GraphDefinition,\n    OutputMapping,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster._core.definitions.graph_definition import create_adjacency_lists\nfrom dagster._utils.schedules import is_valid_cron_schedule\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.utils import (\n    DagsterAirflowError,\n    normalized_name,\n)\n\n\ndef _build_asset_dependencies(\n    dag: DAG,\n    graph: GraphDefinition,\n    task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]],\n    upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]],\n) -> Tuple[AbstractSet[OutputMapping], Mapping[str, AssetKey], Mapping[str, Set[AssetKey]]]:\n    """Builds the asset dependency graph for a given set of airflow task mappings and a dagster graph."""\n    output_mappings = set()\n    keys_by_output_name = {}\n    internal_asset_deps: dict[str, Set[AssetKey]] = {}\n\n    visited_nodes: dict[str, bool] = {}\n    upstream_deps = set()\n\n    def find_upstream_dependency(node_name: str) -> None:\n        """Uses Depth-Firs-Search to find all upstream asset dependencies\n        as described in task_ids_by_asset_key.\n        """\n        # node has been visited\n        if visited_nodes[node_name]:\n            return\n        # mark node as visted\n        visited_nodes[node_name] = True\n        # traverse upstream nodes\n        for output_handle in graph.dependency_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = output_handle.node_name\n            match = False\n            # find any assets produced by upstream nodes and add them to the internal asset deps\n            for asset_key in task_ids_by_asset_key:\n                if (\n                    
forward_node.replace(f"{normalized_name(dag.dag_id)}__", "")\n                    in task_ids_by_asset_key[asset_key]\n                ):\n                    upstream_deps.add(asset_key)\n                    match = True\n            # don't traverse past nodes that have assets\n            if not match:\n                find_upstream_dependency(forward_node)\n\n    # iterate through each asset to find all upstream asset dependencies\n    for asset_key in task_ids_by_asset_key:\n        asset_upstream_deps = set()\n        for task_id in task_ids_by_asset_key[asset_key]:\n            visited_nodes = {s.name: False for s in graph.nodes}\n            upstream_deps = set()\n            find_upstream_dependency(normalized_name(dag.dag_id, task_id))\n            for dep in upstream_deps:\n                asset_upstream_deps.add(dep)\n            keys_by_output_name[f"result_{normalized_name(dag.dag_id, task_id)}"] = asset_key\n            output_mappings.add(\n                OutputMapping(\n                    graph_output_name=f"result_{normalized_name(dag.dag_id, task_id)}",\n                    mapped_node_name=normalized_name(dag.dag_id, task_id),\n                    mapped_node_output_name="airflow_task_complete",  # Default output name\n                )\n            )\n\n        # the tasks for a given asset should have the same internal deps\n        for task_id in task_ids_by_asset_key[asset_key]:\n            if f"result_{normalized_name(dag.dag_id, task_id)}" in internal_asset_deps:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"].update(\n                    asset_upstream_deps\n                )\n            else:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"] = (\n                    asset_upstream_deps\n                )\n\n    # add new upstream asset dependencies to the internal deps\n    for asset_key in upstream_dependencies_by_asset_key:\n        for key in 
keys_by_output_name:\n            if keys_by_output_name[key] == asset_key:\n                internal_asset_deps[key].update(upstream_dependencies_by_asset_key[asset_key])\n\n    return (output_mappings, keys_by_output_name, internal_asset_deps)\n\n\n
[docs]def load_assets_from_airflow_dag(\n dag: DAG,\n task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]] = {},\n upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]] = {},\n connections: Optional[List[Connection]] = None,\n) -> List[AssetsDefinition]:\n """[Experimental] Construct Dagster Assets for a given Airflow DAG.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n task_ids_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[str]]]): A mapping from asset\n keys to task ids. Used break up the Airflow Dag into multiple SDAs\n upstream_dependencies_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[AssetKey]]]): A\n mapping from upstream asset keys to assets provided in task_ids_by_asset_key. Used to\n declare new upstream SDA depenencies.\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n List[AssetsDefinition]\n """\n cron_schedule = dag.normalized_schedule_interval\n if cron_schedule is not None and not is_valid_cron_schedule(str(cron_schedule)):\n raise DagsterAirflowError(f"Invalid cron schedule: {cron_schedule} in DAG {dag.dag_id}")\n\n job = make_dagster_job_from_airflow_dag(dag, connections=connections)\n graph = job._graph_def # noqa: SLF001\n start_date = dag.start_date if dag.start_date else dag.default_args.get("start_date")\n if start_date is None:\n raise DagsterAirflowError(f"Invalid start_date: {start_date} in DAG {dag.dag_id}")\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph.nodes, graph.dependency_structure)\n leaf_nodes = {\n node_name.replace(f"{normalized_name(dag.dag_id)}__", "")\n for node_name, downstream_nodes in forward_edges.items()\n if not downstream_nodes\n }\n\n mutated_task_ids_by_asset_key: dict[AssetKey, set[str]] = {}\n\n if task_ids_by_asset_key is None or task_ids_by_asset_key == {}:\n # if no mappings are provided the dag becomes a single SDA\n task_ids_by_asset_key = 
{AssetKey(dag.dag_id): leaf_nodes}\n else:\n # if mappings were provide any unmapped leaf nodes are added to a default asset\n used_nodes: set[str] = set()\n for key in task_ids_by_asset_key:\n used_nodes.update(task_ids_by_asset_key[key])\n\n mutated_task_ids_by_asset_key[AssetKey(dag.dag_id)] = leaf_nodes - used_nodes\n\n for key in task_ids_by_asset_key:\n if key not in mutated_task_ids_by_asset_key:\n mutated_task_ids_by_asset_key[key] = set(task_ids_by_asset_key[key])\n else:\n mutated_task_ids_by_asset_key[key].update(task_ids_by_asset_key[key])\n\n output_mappings, keys_by_output_name, internal_asset_deps = _build_asset_dependencies(\n dag, graph, mutated_task_ids_by_asset_key, upstream_dependencies_by_asset_key\n )\n\n new_graph = graph.copy(\n output_mappings=list(output_mappings),\n )\n\n asset_def = AssetsDefinition.from_graph(\n graph_def=new_graph,\n partitions_def=(\n TimeWindowPartitionsDefinition(\n cron_schedule=str(cron_schedule),\n timezone=dag.timezone.name,\n start=start_date.strftime("%Y-%m-%dT%H:%M:%S"),\n fmt="%Y-%m-%dT%H:%M:%S",\n )\n if cron_schedule is not None\n else None\n ),\n group_name=dag.dag_id,\n keys_by_output_name=keys_by_output_name,\n internal_asset_deps=internal_asset_deps,\n can_subset=True,\n )\n return [asset_def]
\n
", "current_page_name": "_modules/dagster_airflow/dagster_asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_asset_factory"}, "dagster_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_factory

\nimport os\nfrom typing import List, Mapping, Optional, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dagbag import DagBag\nfrom dagster import (\n    Definitions,\n    JobDefinition,\n    ResourceDefinition,\n    ScheduleDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.dagster_schedule_factory import (\n    _is_dag_is_schedule,\n    make_dagster_schedule_from_airflow_dag,\n)\nfrom dagster_airflow.patch_airflow_example_dag import patch_airflow_example_dag\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.resources.airflow_ephemeral_db import AirflowEphemeralDatabase\nfrom dagster_airflow.resources.airflow_persistent_db import AirflowPersistentDatabase\nfrom dagster_airflow.utils import (\n    is_airflow_2_loaded_in_environment,\n)\n\n\n
[docs]def make_dagster_definitions_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster definition corresponding to Airflow DAGs in DagBag.\n\n Usage:\n Create `make_dagster_definition.py`:\n from dagster_airflow import make_dagster_definition_from_airflow_dag_bag\n from airflow_home import my_dag_bag\n\n def make_definition_from_dag_bag():\n return make_dagster_definition_from_airflow_dag_bag(my_dag_bag)\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definition.py`\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n schedules, jobs = make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )\n\n return Definitions(\n schedules=schedules,\n jobs=jobs,\n resources=resource_defs,\n )
\n\n\n
[docs]def make_dagster_definitions_from_airflow_dags_path(\n dag_path: str,\n safe_mode: bool = True,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository corresponding to Airflow DAGs in dag_path.\n\n Usage:\n Create ``make_dagster_definitions.py``:\n\n .. code-block:: python\n\n from dagster_airflow import make_dagster_definitions_from_airflow_dags_path\n\n def make_definitions_from_dir():\n return make_dagster_definitions_from_airflow_dags_path(\n '/path/to/dags/',\n )\n\n Use RepositoryDefinition as usual, for example:\n ``dagster-webserver -f path/to/make_dagster_repo.py -n make_repo_from_dir``\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n include_examples (bool): True to include Airflow's example DAGs. (default: False)\n safe_mode (bool): True to use Airflow's default heuristic to find files that contain DAGs\n (ie find files that contain both b'DAG' and b'airflow') (default: True)\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.str_param(dag_path, "dag_path")\n check.bool_param(safe_mode, "safe_mode")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n if (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowEphemeralDatabase"\n ):\n AirflowEphemeralDatabase._initialize_database(connections=connections) # noqa: SLF001\n elif (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowPersistentDatabase"\n ):\n 
AirflowPersistentDatabase._initialize_database( # noqa: SLF001\n uri=(\n os.getenv("AIRFLOW__DATABASE__SQL_ALCHEMY_CONN", "")\n if is_airflow_2_loaded_in_environment()\n else os.getenv("AIRFLOW__CORE__SQL_ALCHEMY_CONN", "")\n ),\n connections=connections,\n )\n\n dag_bag = DagBag(\n dag_folder=dag_path,\n include_examples=False, # Exclude Airflow example dags\n safe_mode=safe_mode,\n )\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )
\n\n\ndef make_dagster_definitions_from_airflow_example_dags(\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository for Airflow's example DAGs.\n\n Usage:\n\n Create `make_dagster_definitions.py`:\n from dagster_airflow import make_dagster_definitions_from_airflow_example_dags\n\n def make_airflow_example_dags():\n return make_dagster_definitions_from_airflow_example_dags()\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definitions.py`\n\n Args:\n resource_defs: Optional[Mapping[str, ResourceDefinition]]\n Resource definitions to be used with the definitions\n\n Returns:\n Definitions\n """\n dag_bag = DagBag(\n dag_folder="some/empty/folder/with/no/dags", # prevent defaulting to settings.DAGS_FOLDER\n include_examples=True,\n )\n\n # There is a bug in Airflow v1 where the python_callable for task\n # 'search_catalog' is missing a required position argument '_'. It is fixed in airflow v2\n patch_airflow_example_dag(dag_bag)\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag, resource_defs=resource_defs\n )\n\n\n
[docs]def make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Tuple[List[ScheduleDefinition], List[JobDefinition]]:\n """Construct Dagster Schedules and Jobs corresponding to Airflow DagBag.\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n - List[ScheduleDefinition]: The generated Dagster Schedules\n - List[JobDefinition]: The generated Dagster Jobs\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n job_defs = []\n schedule_defs = []\n count = 0\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n dag = dag_bag.dags.get(dag_id)\n if not dag:\n continue\n if _is_dag_is_schedule(dag):\n schedule_defs.append(\n make_dagster_schedule_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n else:\n job_defs.append(\n make_dagster_job_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n\n count += 1\n\n return schedule_defs, job_defs
\n
", "current_page_name": "_modules/dagster_airflow/dagster_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_factory"}, "dagster_job_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_job_factory

\nfrom typing import List, Mapping, Optional\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    GraphDefinition,\n    JobDefinition,\n    ResourceDefinition,\n    _check as check,\n)\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import IS_AIRFLOW_INGEST_PIPELINE_STR\n\nfrom dagster_airflow.airflow_dag_converter import get_graph_definition_args\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.utils import (\n    normalized_name,\n)\n\n\n
[docs]def make_dagster_job_from_airflow_dag(\n dag: DAG,\n tags: Optional[Mapping[str, str]] = None,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> JobDefinition:\n """Construct a Dagster job corresponding to a given Airflow DAG.\n\n Tasks in the resulting job will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Execute job directly. This will set execution_date to the\n time (in UTC) of the run.\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override\n behavior from (1).\n\n .. code-block:: python\n\n my_dagster_job = make_dagster_job_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n my_dagster_job.execute_in_process()\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,\n such as in the Dagster UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating job name and op\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n tags (Dict[str, Field]): Job tags. 
Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n connections (List[Connection]): List of Airflow Connections to be created in the Ephemeral\n Airflow DB, if use_emphemeral_airflow_db is False this will be ignored.\n\n Returns:\n JobDefinition: The generated Dagster job\n\n """\n check.inst_param(dag, "dag", DAG)\n tags = check.opt_mapping_param(tags, "tags")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n mutated_tags = dict(tags)\n if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:\n mutated_tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"\n\n mutated_tags = validate_tags(mutated_tags)\n\n node_dependencies, node_defs = get_graph_definition_args(dag=dag)\n\n graph_def = GraphDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n node_defs=node_defs,\n dependencies=node_dependencies,\n tags=mutated_tags,\n )\n\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n job_def = JobDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n graph_def=graph_def,\n resource_defs=resource_defs,\n tags=mutated_tags,\n metadata={},\n op_retry_policy=None,\n version_strategy=None,\n )\n return job_def
\n
", "current_page_name": "_modules/dagster_airflow/dagster_job_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_job_factory"}, "operators": {"dagster_operator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.operators.dagster_operator

\nimport json\n\nfrom airflow.models import BaseOperator\nfrom airflow.utils.decorators import apply_defaults\n\nfrom dagster_airflow.hooks.dagster_hook import DagsterHook\nfrom dagster_airflow.links.dagster_link import LINK_FMT, DagsterLink\nfrom dagster_airflow.utils import is_airflow_2_loaded_in_environment\n\n\n
[docs]class DagsterOperator(BaseOperator):\n """DagsterOperator.\n\n Uses the dagster graphql api to run and monitor dagster jobs on remote dagster infrastructure\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """\n\n template_fields = ["run_config"]\n template_ext = (".yaml", ".yml", ".json")\n ui_color = "#663399"\n ui_fgcolor = "#e0e3fc"\n operator_extra_links = (DagsterLink(),)\n\n @apply_defaults\n def __init__(\n self,\n dagster_conn_id="dagster_default",\n run_config=None,\n repository_name="",\n repostitory_location_name="",\n job_name="",\n # params for airflow < 2.0.0 were custom connections aren't supported\n deployment_name="prod",\n user_token=None,\n organization_id="",\n url="https://dagster.cloud/",\n *args,\n **kwargs,\n ) -> None:\n super().__init__(*args, **kwargs)\n self.run_id = None\n self.dagster_conn_id = dagster_conn_id if is_airflow_2_loaded_in_environment() else None\n self.run_config = run_config or {}\n self.repository_name = repository_name\n self.repostitory_location_name = repostitory_location_name\n self.job_name = job_name\n\n self.user_token = user_token\n self.url = url\n self.organization_id = organization_id\n self.deployment_name = deployment_name\n\n self.hook = DagsterHook(\n dagster_conn_id=self.dagster_conn_id,\n user_token=self.user_token,\n url=f"{self.url}{self.organization_id}/{self.deployment_name}/graphql",\n )\n\n def _is_json(self, blob):\n try:\n json.loads(blob)\n except ValueError:\n 
return False\n return True\n\n def pre_execute(self, context):\n # force re-rendering to ensure run_config renders any templated\n # content from run_config that couldn't be accessed on init\n setattr(\n self,\n "run_config",\n self.render_template(self.run_config, context),\n )\n\n def on_kill(self):\n self.log.info("Terminating Run")\n self.hook.terminate_run(\n run_id=self.run_id,\n )\n\n def execute(self, context):\n try:\n return self._execute(context)\n except Exception as e:\n raise e\n\n def _execute(self, context):\n self.run_id = self.hook.launch_run(\n repository_name=self.repository_name,\n repostitory_location_name=self.repostitory_location_name,\n job_name=self.job_name,\n run_config=self.run_config,\n )\n # save relevant info in xcom for use in links\n context["task_instance"].xcom_push(key="run_id", value=self.run_id)\n context["task_instance"].xcom_push(\n key="organization_id",\n value=self.hook.organization_id if self.dagster_conn_id else self.organization_id,\n )\n context["task_instance"].xcom_push(\n key="deployment_name",\n value=self.hook.deployment_name if self.dagster_conn_id else self.deployment_name,\n )\n\n self.log.info("Run Starting....")\n self.log.info(\n "Run tracking: %s",\n LINK_FMT.format(\n organization_id=self.hook.organization_id,\n deployment_name=self.hook.deployment_name,\n run_id=self.run_id,\n ),\n )\n self.hook.wait_for_run(\n run_id=self.run_id,\n )
\n\n\n
[docs]class DagsterCloudOperator(DagsterOperator):\n """DagsterCloudOperator.\n\n Uses the dagster cloud graphql api to run and monitor dagster jobs on dagster cloud\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """
\n
", "current_page_name": "_modules/dagster_airflow/operators/dagster_operator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.operators.dagster_operator"}}, "resources": {"airflow_ephemeral_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_ephemeral_db

\nimport importlib\nimport os\nimport tempfile\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom airflow.utils import db\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    Noneable,\n    ResourceDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    Locker,\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowEphemeralDatabase(AirflowDatabase):\n    """A ephemeral Airflow database Dagster resource."""\n\n    def __init__(\n        self, airflow_home_path: str, dagster_run: DagsterRun, dag_run_config: Optional[dict] = None\n    ):\n        self.airflow_home_path = airflow_home_path\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(\n        airflow_home_path: str = os.path.join(tempfile.gettempdir(), "dagster_airflow"),\n        connections: List[Connection] = [],\n    ):\n        os.environ["AIRFLOW_HOME"] = airflow_home_path\n        os.makedirs(airflow_home_path, exist_ok=True)\n        with Locker(airflow_home_path):\n            airflow_initialized = os.path.exists(f"{airflow_home_path}/airflow.db")\n            # because AIRFLOW_HOME has been overriden airflow needs to be reloaded\n            if is_airflow_2_loaded_in_environment():\n                importlib.reload(airflow.configuration)\n                importlib.reload(airflow.settings)\n                importlib.reload(airflow)\n            else:\n                importlib.reload(airflow)\n            if not airflow_initialized:\n                db.initdb()\n                create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowEphemeralDatabase":\n        airflow_home_path = 
os.path.join(tempfile.gettempdir(), f"dagster_airflow_{context.run_id}")\n        AirflowEphemeralDatabase._initialize_database(\n            airflow_home_path=airflow_home_path,\n            connections=[Connection(**c) for c in context.resource_config["connections"]],\n        )\n        return AirflowEphemeralDatabase(\n            airflow_home_path=airflow_home_path,\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            dag_run_config=context.resource_config.get("dag_run_config"),\n        )\n\n\n
[docs]def make_ephemeral_airflow_db_resource(\n connections: List[Connection] = [], dag_run_config: Optional[dict] = None\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an ephemeral Airflow database.\n\n Args:\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The ephemeral Airflow DB resource\n\n """\n serialized_connections = serialize_connections(connections)\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowEphemeralDatabase.from_resource_context,\n config_schema={\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n Noneable(dict),\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Ephemeral Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_ephemeral_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_ephemeral_db"}, "airflow_persistent_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_persistent_db

\nimport importlib\nimport os\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    ResourceDefinition,\n    StringSource,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowPersistentDatabase(AirflowDatabase):\n    """A persistent Airflow database Dagster resource."""\n\n    def __init__(self, dagster_run: DagsterRun, uri: str, dag_run_config: Optional[dict] = None):\n        self.uri = uri\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(uri: str, connections: List[Connection] = []):\n        if is_airflow_2_loaded_in_environment("2.3.0"):\n            os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow.configuration)\n            importlib.reload(airflow.settings)\n            importlib.reload(airflow)\n        else:\n            os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow)\n        create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowPersistentDatabase":\n        uri = context.resource_config["uri"]\n        AirflowPersistentDatabase._initialize_database(\n            uri=uri, connections=[Connection(**c) for c in context.resource_config["connections"]]\n        )\n        return AirflowPersistentDatabase(\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            uri=uri,\n            dag_run_config=context.resource_config["dag_run_config"],\n        )\n\n\n
[docs]def make_persistent_airflow_db_resource(\n uri: str = "",\n connections: List[Connection] = [],\n dag_run_config: Optional[dict] = {},\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an persistent Airflow database.\n\n\n Usage:\n .. code-block:: python\n\n from dagster_airflow import (\n make_dagster_definitions_from_airflow_dags_path,\n make_persistent_airflow_db_resource,\n )\n postgres_airflow_db = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"\n airflow_db = make_persistent_airflow_db_resource(uri=postgres_airflow_db)\n definitions = make_dagster_definitions_from_airflow_example_dags(\n '/path/to/dags/',\n resource_defs={"airflow_db": airflow_db}\n )\n\n\n Args:\n uri: SQLAlchemy URI of the Airflow DB to be used\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The persistent Airflow DB resource\n\n """\n if is_airflow_2_loaded_in_environment():\n os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n else:\n os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n\n serialized_connections = serialize_connections(connections)\n\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowPersistentDatabase.from_resource_context,\n config_schema={\n "uri": Field(\n StringSource,\n default_value=uri,\n is_required=False,\n ),\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n dict,\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Persistent Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_persistent_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_persistent_db"}}}, "dagster_aws": {"ecs": {"launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.ecs.launcher

\nimport json\nimport logging\nimport os\nimport uuid\nimport warnings\nfrom collections import namedtuple\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Array,\n    DagsterRunStatus,\n    Field,\n    Noneable,\n    Permissive,\n    ScalarUnion,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import RUN_WORKER_ID_TAG\nfrom dagster._grpc.types import ExecuteRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.backoff import backoff\nfrom typing_extensions import Self\n\nfrom ..secretsmanager import get_secrets_from_arns\nfrom .container_context import SHARED_ECS_SCHEMA, SHARED_TASK_DEFINITION_FIELDS, EcsContainerContext\nfrom .tasks import (\n    DagsterEcsTaskDefinitionConfig,\n    get_current_ecs_task,\n    get_current_ecs_task_metadata,\n    get_task_definition_dict_from_current_task,\n    get_task_kwargs_from_current_task,\n)\nfrom .utils import get_task_definition_family, get_task_logs, task_definitions_match\n\nTags = namedtuple("Tags", ["arn", "cluster", "cpu", "memory"])\n\nRUNNING_STATUSES = [\n    "PROVISIONING",\n    "PENDING",\n    "ACTIVATING",\n    "RUNNING",\n    "DEACTIVATING",\n    "STOPPING",\n    "DEPROVISIONING",\n]\nSTOPPED_STATUSES = ["STOPPED"]\n\nDEFAULT_WINDOWS_RESOURCES = {"cpu": "1024", "memory": "2048"}\n\nDEFAULT_LINUX_RESOURCES = {"cpu": "256", "memory": "512"}\n\n\n
[docs]class EcsRunLauncher(RunLauncher[T_DagsterInstance], ConfigurableClass):\n """RunLauncher that starts a task in ECS for each Dagster job run."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n task_definition=None,\n container_name="run",\n secrets=None,\n secrets_tag="dagster",\n env_vars=None,\n include_sidecars=False,\n use_current_ecs_task_config: bool = True,\n run_task_kwargs: Optional[Mapping[str, Any]] = None,\n run_resources: Optional[Dict[str, Any]] = None,\n run_ecs_tags: Optional[List[Dict[str, Optional[str]]]] = None,\n ):\n self._inst_data = inst_data\n self.ecs = boto3.client("ecs")\n self.ec2 = boto3.resource("ec2")\n self.secrets_manager = boto3.client("secretsmanager")\n self.logs = boto3.client("logs")\n\n self.task_definition = None\n self.task_definition_dict = {}\n if isinstance(task_definition, str):\n self.task_definition = task_definition\n elif task_definition and "env" in task_definition:\n check.invariant(\n len(task_definition) == 1,\n "If `task_definition` is set to a dictionary with `env`, `env` must be the only"\n " key.",\n )\n env_var = task_definition["env"]\n self.task_definition = os.getenv(env_var)\n if not self.task_definition:\n raise Exception(\n f"You have attempted to fetch the environment variable {env_var} which is not"\n " set."\n )\n else:\n self.task_definition_dict = task_definition or {}\n\n self.container_name = container_name\n\n self.secrets = check.opt_list_param(secrets, "secrets")\n\n self.env_vars = check.opt_list_param(env_vars, "env_vars")\n\n if self.secrets and all(isinstance(secret, str) for secret in self.secrets):\n warnings.warn(\n "Setting secrets as a list of ARNs is deprecated. 
"\n "Secrets should instead follow the same structure as the ECS API: "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html",\n DeprecationWarning,\n )\n self.secrets = [\n {"name": name, "valueFrom": value_from}\n for name, value_from in get_secrets_from_arns(\n self.secrets_manager, self.secrets\n ).items()\n ]\n\n self.secrets_tags = [secrets_tag] if secrets_tag else []\n self.include_sidecars = include_sidecars\n\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=self.task_definition)\n container_names = [\n container.get("name")\n for container in task_definition["taskDefinition"]["containerDefinitions"]\n ]\n check.invariant(\n container_name in container_names,\n f"Cannot override container '{container_name}' in task definition "\n f"'{self.task_definition}' because the container is not defined.",\n )\n self.task_definition = task_definition["taskDefinition"]["taskDefinitionArn"]\n\n self.use_current_ecs_task_config = check.opt_bool_param(\n use_current_ecs_task_config, "use_current_ecs_task_config"\n )\n\n self.run_task_kwargs = check.opt_mapping_param(run_task_kwargs, "run_task_kwargs")\n if run_task_kwargs:\n check.invariant(\n "taskDefinition" not in run_task_kwargs,\n "Use the `taskDefinition` config field to pass in a task definition to run.",\n )\n check.invariant(\n "overrides" not in run_task_kwargs,\n "Task overrides are set by the run launcher and cannot be set in run_task_kwargs.",\n )\n\n expected_keys = [\n key for key in self.ecs.meta.service_model.shape_for("RunTaskRequest").members\n ]\n\n for key in run_task_kwargs:\n check.invariant(\n key in expected_keys, f"Found an unexpected key {key} in run_task_kwargs"\n )\n\n self.run_resources = check.opt_mapping_param(run_resources, "run_resources")\n\n self.run_ecs_tags = check.opt_sequence_param(run_ecs_tags, "run_ecs_tags")\n\n self._current_task_metadata = None\n self._current_task = None\n\n @property\n def 
inst_data(self):\n return self._inst_data\n\n @property\n def task_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("task_role_arn")\n\n @property\n def execution_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("execution_role_arn")\n\n @property\n def runtime_platform(self) -> Optional[Mapping[str, Any]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("runtime_platform")\n\n @property\n def mount_points(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("mount_points")\n\n @property\n def volumes(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("volumes")\n\n @property\n def repository_credentials(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("repository_credentials")\n\n @property\n def run_sidecar_containers(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("sidecar_containers")\n\n @classmethod\n def config_type(cls):\n return {\n "task_definition": Field(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={\n "log_group": Field(StringSource, is_required=False),\n "sidecar_containers": Field(Array(Permissive({})), is_required=False),\n "requires_compatibilities": Field(Array(str), is_required=False),\n "env": Field(\n str,\n is_required=False,\n description=(\n "Backwards-compatibility for when task_definition was a"\n " StringSource.Can be used to source the task_definition scalar"\n " from an environment variable."\n ),\n ),\n **SHARED_TASK_DEFINITION_FIELDS,\n },\n ),\n is_required=False,\n description=(\n "Either the short name of an existing task 
definition to use when launching new"\n " tasks, or a dictionary configuration to use when creating a task definition"\n " for the run.If neither is provided, the task definition will be created based"\n " on the current task's task definition."\n ),\n ),\n "container_name": Field(\n StringSource,\n is_required=False,\n default_value="run",\n description=(\n "The container name to use when launching new tasks. Defaults to 'run'."\n ),\n ),\n "secrets": Field(\n Array(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={"name": StringSource, "valueFrom": StringSource},\n )\n ),\n is_required=False,\n description=(\n "An array of AWS Secrets Manager secrets. These secrets will "\n "be mounted as environment variables in the container. See "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html."\n ),\n ),\n "secrets_tag": Field(\n Noneable(StringSource),\n is_required=False,\n default_value="dagster",\n description=(\n "AWS Secrets Manager secrets with this tag will be mounted as "\n "environment variables in the container. Defaults to 'dagster'."\n ),\n ),\n "include_sidecars": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "Whether each run should use the same sidecars as the task that launches it. "\n "Defaults to False."\n ),\n ),\n "use_current_ecs_task_config": Field(\n bool,\n is_required=False,\n default_value=True,\n description=(\n "Whether to use the run launcher's current ECS task in order to determine "\n "the cluster and networking configuration for the launched task. Defaults to "\n "True. Should only be called if the run launcher is running within an ECS "\n "task."\n ),\n ),\n "run_task_kwargs": Field(\n Permissive(\n {\n "cluster": Field(\n StringSource,\n is_required=False,\n description="Name of the ECS cluster to launch ECS tasks in.",\n ),\n }\n ),\n is_required=False,\n description=(\n "Additional arguments to include while running the task. 
See"\n " https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs.html#ECS.Client.run_task"\n " for the available parameters. The overrides and taskDefinition arguments will"\n " always be set by the run launcher."\n ),\n ),\n **SHARED_ECS_SCHEMA,\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return EcsRunLauncher(inst_data=inst_data, **config_value)\n\n def _set_run_tags(self, run_id: str, cluster: str, task_arn: str):\n tags = {\n "ecs/task_arn": task_arn,\n "ecs/cluster": cluster,\n RUN_WORKER_ID_TAG: str(uuid.uuid4().hex)[0:6],\n }\n self._instance.add_run_tags(run_id, tags)\n\n def build_ecs_tags_for_run_task(self, run, container_context: EcsContainerContext):\n if any(tag["key"] == "dagster/run_id" for tag in container_context.run_ecs_tags):\n raise Exception("Cannot override system ECS tag: dagster/run_id")\n\n return [{"key": "dagster/run_id", "value": run.run_id}, *container_context.run_ecs_tags]\n\n def _get_run_tags(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n tags = run.tags if run else {}\n arn = tags.get("ecs/task_arn")\n cluster = tags.get("ecs/cluster")\n cpu = tags.get("ecs/cpu")\n memory = tags.get("ecs/memory")\n\n return Tags(arn, cluster, cpu, memory)\n\n def _get_command_args(self, run_args: ExecuteRunArgs, context: LaunchRunContext):\n return run_args.get_command_args()\n\n def _get_image_for_run(self, context: LaunchRunContext) -> Optional[str]:\n job_origin = check.not_none(context.job_code_origin)\n return job_origin.repository_origin.container_image\n\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run in an ECS task."""\n run = context.dagster_run\n container_context = EcsContainerContext.create_for_run(run, self)\n\n job_origin = check.not_none(context.job_code_origin)\n\n # ECS limits overrides to 8192 characters including json formatting\n # 
https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_RunTask.html\n # When container_context is serialized as part of the ExecuteRunArgs, we risk\n # going over this limit (for example, if many secrets have been set). This strips\n # the container context off of our job origin because we don't actually need\n # it to launch the run; we only needed it to create the task definition.\n repository_origin = job_origin.repository_origin\n\n stripped_repository_origin = repository_origin._replace(container_context={})\n stripped_job_origin = job_origin._replace(repository_origin=stripped_repository_origin)\n\n args = ExecuteRunArgs(\n job_origin=stripped_job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n command = self._get_command_args(args, context)\n image = self._get_image_for_run(context)\n\n run_task_kwargs = self._run_task_kwargs(run, image, container_context)\n\n # Set cpu or memory overrides\n # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html\n cpu_and_memory_overrides = self.get_cpu_and_memory_overrides(container_context, run)\n\n task_overrides = self._get_task_overrides(container_context, run)\n\n container_overrides: List[Dict[str, Any]] = [\n {\n "name": self._get_container_name(container_context),\n "command": command,\n # containerOverrides expects cpu/memory as integers\n **{k: int(v) for k, v in cpu_and_memory_overrides.items()},\n }\n ]\n\n run_task_kwargs["overrides"] = {\n "containerOverrides": container_overrides,\n # taskOverrides expects cpu/memory as strings\n **cpu_and_memory_overrides,\n **task_overrides,\n }\n run_task_kwargs["tags"] = [\n *run_task_kwargs.get("tags", []),\n *self.build_ecs_tags_for_run_task(run, container_context),\n ]\n\n run_task_kwargs_from_run = self._get_run_task_kwargs_from_run(run)\n run_task_kwargs.update(run_task_kwargs_from_run)\n\n # launchType and capacityProviderStrategy are incompatible - prefer the latter if it is set\n if "launchType" 
in run_task_kwargs and run_task_kwargs.get("capacityProviderStrategy"):\n del run_task_kwargs["launchType"]\n\n # Run a task using the same network configuration as this processes's task.\n response = self.ecs.run_task(**run_task_kwargs)\n\n tasks = response["tasks"]\n\n if not tasks:\n failures = response["failures"]\n failure_messages = []\n for failure in failures:\n arn = failure.get("arn")\n reason = failure.get("reason")\n detail = failure.get("detail")\n\n failure_message = (\n "Task"\n + (f" {arn}" if arn else "")\n + " failed."\n + (f" Failure reason: {reason}" if reason else "")\n + (f" Failure details: {detail}" if detail else "")\n )\n failure_messages.append(failure_message)\n\n raise Exception("\\n".join(failure_messages) if failure_messages else "Task failed.")\n\n arn = tasks[0]["taskArn"]\n cluster_arn = tasks[0]["clusterArn"]\n self._set_run_tags(run.run_id, cluster=cluster_arn, task_arn=arn)\n self.report_launch_events(run, arn, cluster_arn)\n\n def report_launch_events(\n self, run: DagsterRun, arn: Optional[str] = None, cluster: Optional[str] = None\n ):\n # Extracted method to allow for subclasses to customize the launch reporting behavior\n\n metadata = {}\n if arn:\n metadata["ECS Task ARN"] = arn\n if cluster:\n metadata["ECS Cluster"] = cluster\n\n metadata["Run ID"] = run.run_id\n self._instance.report_engine_event(\n message="Launching run in ECS task",\n dagster_run=run,\n engine_event_data=EngineEventData(metadata),\n cls=self.__class__,\n )\n\n def get_cpu_and_memory_overrides(\n self, container_context: EcsContainerContext, run: DagsterRun\n ) -> Mapping[str, str]:\n overrides = {}\n\n cpu = run.tags.get("ecs/cpu", container_context.run_resources.get("cpu"))\n memory = run.tags.get("ecs/memory", container_context.run_resources.get("memory"))\n\n if cpu:\n overrides["cpu"] = cpu\n if memory:\n overrides["memory"] = memory\n\n return overrides\n\n def _get_task_overrides(\n self, container_context: EcsContainerContext, run: 
DagsterRun\n ) -> Mapping[str, Any]:\n tag_overrides = run.tags.get("ecs/task_overrides")\n\n overrides = {}\n\n if tag_overrides:\n overrides = json.loads(tag_overrides)\n\n ephemeral_storage = run.tags.get(\n "ecs/ephemeral_storage", container_context.run_resources.get("ephemeral_storage")\n )\n if ephemeral_storage:\n overrides["ephemeralStorage"] = {"sizeInGiB": int(ephemeral_storage)}\n\n return overrides\n\n def _get_run_task_kwargs_from_run(self, run: DagsterRun) -> Mapping[str, Any]:\n run_task_kwargs = run.tags.get("ecs/run_task_kwargs")\n if run_task_kwargs:\n return json.loads(run_task_kwargs)\n return {}\n\n def terminate(self, run_id):\n tags = self._get_run_tags(run_id)\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n if not (tags.arn and tags.cluster):\n return False\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return False\n\n status = tasks[0].get("lastStatus")\n if status == "STOPPED":\n return False\n\n self.ecs.stop_task(task=tags.arn, cluster=tags.cluster)\n return True\n\n def _get_current_task_metadata(self):\n if self._current_task_metadata is None:\n self._current_task_metadata = get_current_ecs_task_metadata()\n return self._current_task_metadata\n\n def _get_current_task(self):\n if self._current_task is None:\n current_task_metadata = self._get_current_task_metadata()\n self._current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n\n return self._current_task\n\n def _get_run_task_definition_family(self, run: DagsterRun) -> str:\n return get_task_definition_family("run", check.not_none(run.external_job_origin))\n\n def _get_container_name(self, container_context) -> str:\n return container_context.container_name or self.container_name\n\n def _run_task_kwargs(self, run, image, container_context) -> Dict[str, Any]:\n """Return a dictionary of 
args to launch the ECS task, registering a new task\n definition if needed.\n """\n environment = self._environment(container_context)\n environment.append({"name": "DAGSTER_RUN_JOB_NAME", "value": run.job_name})\n\n secrets = self._secrets(container_context)\n\n if container_context.task_definition_arn:\n task_definition = container_context.task_definition_arn\n else:\n family = self._get_run_task_definition_family(run)\n\n if self.task_definition_dict or not self.use_current_ecs_task_config:\n runtime_platform = container_context.runtime_platform\n is_windows = container_context.runtime_platform.get(\n "operatingSystemFamily"\n ) not in {None, "LINUX"}\n\n default_resources = (\n DEFAULT_WINDOWS_RESOURCES if is_windows else DEFAULT_LINUX_RESOURCES\n )\n task_definition_config = DagsterEcsTaskDefinitionConfig(\n family,\n image,\n self._get_container_name(container_context),\n command=None,\n log_configuration=(\n {\n "logDriver": "awslogs",\n "options": {\n "awslogs-group": self.task_definition_dict["log_group"],\n "awslogs-region": self.ecs.meta.region_name,\n "awslogs-stream-prefix": family,\n },\n }\n if self.task_definition_dict.get("log_group")\n else None\n ),\n secrets=secrets if secrets else [],\n environment=environment,\n execution_role_arn=container_context.execution_role_arn,\n task_role_arn=container_context.task_role_arn,\n sidecars=container_context.run_sidecar_containers,\n requires_compatibilities=self.task_definition_dict.get(\n "requires_compatibilities", []\n ),\n cpu=container_context.run_resources.get("cpu", default_resources["cpu"]),\n memory=container_context.run_resources.get(\n "memory", default_resources["memory"]\n ),\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n runtime_platform=runtime_platform,\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n repository_credentials=container_context.repository_credentials,\n )\n task_definition_dict = 
task_definition_config.task_definition_dict()\n else:\n task_definition_dict = get_task_definition_dict_from_current_task(\n self.ecs,\n family,\n self._get_current_task(),\n image,\n self._get_container_name(container_context),\n environment=environment,\n secrets=secrets if secrets else {},\n include_sidecars=self.include_sidecars,\n task_role_arn=container_context.task_role_arn,\n execution_role_arn=container_context.execution_role_arn,\n cpu=container_context.run_resources.get("cpu"),\n memory=container_context.run_resources.get("memory"),\n runtime_platform=container_context.runtime_platform,\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n additional_sidecars=container_context.run_sidecar_containers,\n repository_credentials=container_context.repository_credentials,\n )\n\n task_definition_config = DagsterEcsTaskDefinitionConfig.from_task_definition_dict(\n task_definition_dict,\n self._get_container_name(container_context),\n )\n\n container_name = self._get_container_name(container_context)\n\n backoff(\n self._reuse_or_register_task_definition,\n retry_on=(Exception,),\n kwargs={\n "desired_task_definition_config": task_definition_config,\n "container_name": container_name,\n "task_definition_dict": task_definition_dict,\n },\n max_retries=5,\n )\n\n task_definition = family\n\n if self.use_current_ecs_task_config:\n current_task_metadata = get_current_ecs_task_metadata()\n current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n task_kwargs = get_task_kwargs_from_current_task(\n self.ec2,\n current_task_metadata.cluster,\n current_task,\n )\n else:\n task_kwargs = {}\n\n return {**task_kwargs, **self.run_task_kwargs, "taskDefinition": task_definition}\n\n def _reuse_task_definition(\n self, desired_task_definition_config: DagsterEcsTaskDefinitionConfig, container_name: str\n ):\n 
family = desired_task_definition_config.family\n\n try:\n existing_task_definition = self.ecs.describe_task_definition(taskDefinition=family)[\n "taskDefinition"\n ]\n except ClientError:\n # task definition does not exist, do not reuse\n return False\n\n return task_definitions_match(\n desired_task_definition_config,\n existing_task_definition,\n container_name=container_name,\n )\n\n def _reuse_or_register_task_definition(\n self,\n desired_task_definition_config: DagsterEcsTaskDefinitionConfig,\n container_name: str,\n task_definition_dict: dict,\n ):\n if not self._reuse_task_definition(desired_task_definition_config, container_name):\n self.ecs.register_task_definition(**task_definition_dict)\n\n def _environment(self, container_context):\n return [\n {"name": key, "value": value}\n for key, value in container_context.get_environment_dict().items()\n ]\n\n def _secrets(self, container_context):\n secrets = container_context.get_secrets_dict(self.secrets_manager)\n return (\n [{"name": key, "valueFrom": value} for key, value in secrets.items()] if secrets else []\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def include_cluster_info_in_failure_messages(self):\n return True\n\n def _is_transient_startup_failure(self, run, task):\n if not task.get("stoppedReason"):\n return False\n return (\n run.status == DagsterRunStatus.STARTING\n and "Timeout waiting for network interface provisioning to complete"\n in task.get("stoppedReason")\n )\n\n def check_run_worker_health(self, run: DagsterRun):\n run_worker_id = run.tags.get(RUN_WORKER_ID_TAG)\n\n tags = self._get_run_tags(run.run_id)\n container_context = EcsContainerContext.create_for_run(run, self)\n\n if not (tags.arn and tags.cluster):\n return CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return 
CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n t = tasks[0]\n\n if t.get("lastStatus") in RUNNING_STATUSES:\n return CheckRunHealthResult(WorkerStatus.RUNNING, run_worker_id=run_worker_id)\n elif t.get("lastStatus") in STOPPED_STATUSES:\n failed_containers = []\n for c in t.get("containers"):\n if c.get("exitCode") != 0:\n failed_containers.append(c)\n if len(failed_containers) > 0:\n if len(failed_containers) > 1:\n container_str = "Containers"\n else:\n container_str = "Container"\n\n failure_text = []\n\n if self.include_cluster_info_in_failure_messages:\n failure_text.append(\n f"Task {t.get('taskArn')} failed. Stop code: {t.get('stopCode')}. Stop"\n f" reason: {t.get('stoppedReason')}."\n + f" {container_str} {[c.get('name') for c in failed_containers]} failed."\n )\n\n logs = []\n\n try:\n logs = get_task_logs(\n self.ecs,\n logs_client=self.logs,\n cluster=tags.cluster,\n task_arn=tags.arn,\n container_name=self._get_container_name(container_context),\n )\n except:\n logging.exception(f"Error trying to get logs for failed task {tags.arn}")\n\n if logs:\n failure_text.append("Run worker logs:\\n" + "\\n".join(logs))\n\n return CheckRunHealthResult(\n WorkerStatus.FAILED,\n "\\n\\n".join(failure_text),\n transient=self._is_transient_startup_failure(run, t),\n run_worker_id=run_worker_id,\n )\n\n return CheckRunHealthResult(WorkerStatus.SUCCESS, run_worker_id=run_worker_id)\n\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, "ECS task health status is unknown.", run_worker_id=run_worker_id\n )
\n
", "current_page_name": "_modules/dagster_aws/ecs/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.ecs.launcher"}}, "emr": {"emr": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.emr

\n# Portions of this file are copied from the Yelp MRJob project:\n#\n#   https://github.com/Yelp/mrjob\n#\n#\n# Copyright 2009-2013 Yelp, David Marin\n# Copyright 2015 Yelp\n# Copyright 2017 Yelp\n# Copyright 2018 Contributors\n# Copyright 2019 Yelp and Contributors\n#\n# Licensed under the Apache License, Version 2.0 (the "License");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an "AS IS" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport gzip\nimport re\nfrom io import BytesIO\nfrom urllib.parse import urlparse\n\nimport boto3\nimport dagster\nimport dagster._check as check\nfrom botocore.exceptions import WaiterError\n\nfrom dagster_aws.utils.mrjob.utils import _boto3_now, _wrap_aws_client, strip_microseconds\n\nfrom .types import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState, EmrStepState\n\n# if we can't create or find our own service role, use the one\n# created by the AWS console and CLI\n_FALLBACK_SERVICE_ROLE = "EMR_DefaultRole"\n\n# if we can't create or find our own instance profile, use the one\n# created by the AWS console and CLI\n_FALLBACK_INSTANCE_PROFILE = "EMR_EC2_DefaultRole"\n\n\n
[docs]class EmrError(Exception):\n pass
\n\n\n
[docs]class EmrJobRunner:\n def __init__(\n self,\n region,\n check_cluster_every=30,\n aws_access_key_id=None,\n aws_secret_access_key=None,\n ):\n """This object encapsulates various utilities for interacting with EMR clusters and invoking\n steps (jobs) on them.\n\n See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a\n resource for pyspark workloads.\n\n Args:\n region (str): AWS region to use\n check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates.\n Defaults to 30 seconds.\n aws_access_key_id ([type], optional): AWS access key ID. Defaults to None, which will\n use the default boto3 credentials chain.\n aws_secret_access_key ([type], optional): AWS secret access key. Defaults to None, which\n will use the default boto3 credentials chain.\n """\n self.region = check.str_param(region, "region")\n\n # This is in seconds\n self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n\n def make_emr_client(self):\n """Creates a boto3 EMR client. 
Construction is wrapped in retries in case client connection\n fails transiently.\n\n Returns:\n botocore.client.EMR: An EMR client\n """\n raw_emr_client = boto3.client(\n "emr",\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n region_name=self.region,\n )\n return _wrap_aws_client(raw_emr_client, min_backoff=self.check_cluster_every)\n\n def cluster_id_from_name(self, cluster_name):\n """Get a cluster ID in the format "j-123ABC123ABC1" given a cluster name "my cool cluster".\n\n Args:\n cluster_name (str): The name of the cluster for which to find an ID\n\n Returns:\n str: The ID of the cluster\n\n Raises:\n EmrError: No cluster with the specified name exists\n """\n check.str_param(cluster_name, "cluster_name")\n\n response = self.make_emr_client().list_clusters().get("Clusters", [])\n for cluster in response:\n if cluster["Name"] == cluster_name:\n return cluster["Id"]\n\n raise EmrError(f"cluster {cluster_name} not found in region {self.region}")\n\n @staticmethod\n def construct_step_dict_for_command(step_name, command, action_on_failure="CONTINUE"):\n """Construct an EMR step definition which uses command-runner.jar to execute a shell command\n on the EMR master.\n\n Args:\n step_name (str): The name of the EMR step (will show up in the EMR UI)\n command (str): The shell command to execute with command-runner.jar\n action_on_failure (str, optional): Configure action on failure (e.g., continue, or\n terminate the cluster). 
Defaults to 'CONTINUE'.\n\n Returns:\n dict: Step definition dict\n """\n check.str_param(step_name, "step_name")\n check.list_param(command, "command", of_type=str)\n check.str_param(action_on_failure, "action_on_failure")\n\n return {\n "Name": step_name,\n "ActionOnFailure": action_on_failure,\n "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},\n }\n\n def add_tags(self, log, tags, cluster_id):\n """Add tags in the dict tags to cluster cluster_id.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n tags (dict): Dictionary of {'key': 'value'} tags\n cluster_id (str): The ID of the cluster to tag\n """\n check.dict_param(tags, "tags")\n check.str_param(cluster_id, "cluster_id")\n\n tags_items = sorted(tags.items())\n\n self.make_emr_client().add_tags(\n ResourceId=cluster_id, Tags=[dict(Key=k, Value=v) for k, v in tags_items]\n )\n\n log.info(\n "Added EMR tags to cluster %s: %s"\n % (cluster_id, ", ".join("%s=%s" % (tag, value) for tag, value in tags_items))\n )\n\n def run_job_flow(self, log, cluster_config):\n """Create an empty cluster on EMR, and return the ID of that job flow.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_config (dict): Configuration for this EMR job flow. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html\n\n Returns:\n str: The cluster ID, e.g. 
"j-ZKIY4CKQRX72"\n """\n check.dict_param(cluster_config, "cluster_config")\n\n log.debug("Creating Elastic MapReduce cluster")\n emr_client = self.make_emr_client()\n\n log.debug(\n "Calling run_job_flow(%s)"\n % (", ".join("%s=%r" % (k, v) for k, v in sorted(cluster_config.items())))\n )\n cluster_id = emr_client.run_job_flow(**cluster_config)["JobFlowId"]\n\n log.info("Created new cluster %s" % cluster_id)\n\n # set EMR tags for the cluster\n tags_items = cluster_config.get("Tags", [])\n tags = {k: v for k, v in tags_items}\n tags["__dagster_version"] = dagster.__version__\n self.add_tags(log, tags, cluster_id)\n return cluster_id\n\n def describe_cluster(self, cluster_id):\n """Thin wrapper over boto3 describe_cluster.\n\n Args:\n cluster_id (str): Cluster to inspect\n\n Returns:\n dict: The cluster info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeCluster.html\n """\n check.str_param(cluster_id, "cluster_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_cluster(ClusterId=cluster_id)\n\n def describe_step(self, cluster_id, step_id):\n """Thin wrapper over boto3 describe_step.\n\n Args:\n cluster_id (str): Cluster to inspect\n step_id (str): Step ID to describe\n\n Returns:\n dict: The step info. 
See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeStep.html\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_step(ClusterId=cluster_id, StepId=step_id)\n\n def add_job_flow_steps(self, log, cluster_id, step_defs):\n """Submit the constructed job flow steps to EMR for execution.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): The ID of the cluster\n step_defs (List[dict]): List of steps; see also `construct_step_dict_for_command`\n\n Returns:\n List[str]: list of step IDs.\n """\n check.str_param(cluster_id, "cluster_id")\n check.list_param(step_defs, "step_defs", of_type=dict)\n\n emr_client = self.make_emr_client()\n\n steps_kwargs = dict(JobFlowId=cluster_id, Steps=step_defs)\n log.debug(\n "Calling add_job_flow_steps(%s)"\n % ",".join(("%s=%r" % (k, v)) for k, v in steps_kwargs.items())\n )\n return emr_client.add_job_flow_steps(**steps_kwargs)["StepIds"]\n\n def is_emr_step_complete(self, log, cluster_id, emr_step_id):\n step = self.describe_step(cluster_id, emr_step_id)["Step"]\n step_state = EmrStepState(step["Status"]["State"])\n\n if step_state == EmrStepState.Pending:\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n\n log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], reason_desc))\n return False\n\n elif step_state == EmrStepState.Running:\n time_running_desc = ""\n\n start = step["Status"]["Timeline"].get("StartDateTime")\n if start:\n time_running_desc = " for %s" % strip_microseconds(_boto3_now() - start)\n\n log.info("RUNNING%s" % time_running_desc)\n return False\n\n # we're done, will return at the end of this\n elif step_state == EmrStepState.Completed:\n log.info("COMPLETED")\n return True\n else:\n # step has failed somehow. 
*reason* seems to only be set\n # when job is cancelled (e.g. 'Job terminated')\n reason = _get_reason(step)\n reason_desc = (" (%s)" % reason) if reason else ""\n\n log.info("%s%s" % (step_state.value, reason_desc))\n\n # print cluster status; this might give more context\n # why step didn't succeed\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n log.info(\n "Cluster %s %s %s%s"\n % (\n cluster["Id"],\n "was" if "ED" in cluster["Status"]["State"] else "is",\n cluster["Status"]["State"],\n reason_desc,\n )\n )\n\n if EmrClusterState(cluster["Status"]["State"]) in EMR_CLUSTER_TERMINATED_STATES:\n # was it caused by IAM roles?\n self._check_for_missing_default_iam_roles(log, cluster)\n\n # TODO: extract logs here to surface failure reason\n # See: https://github.com/dagster-io/dagster/issues/1954\n\n if step_state == EmrStepState.Failed:\n log.error("EMR step %s failed" % emr_step_id)\n\n raise EmrError("EMR step %s failed" % emr_step_id)\n\n def _check_for_missing_default_iam_roles(self, log, cluster):\n """If cluster couldn't start due to missing IAM roles, tell user what to do."""\n check.dict_param(cluster, "cluster")\n\n reason = _get_reason(cluster)\n if any(\n reason.endswith("/%s is invalid" % role)\n for role in (_FALLBACK_INSTANCE_PROFILE, _FALLBACK_SERVICE_ROLE)\n ):\n log.warning(\n "IAM roles are missing. See documentation for IAM roles on EMR here: "\n "https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html"\n )\n\n def log_location_for_cluster(self, cluster_id):\n """EMR clusters are typically launched with S3 logging configured. 
This method inspects a\n cluster using boto3 describe_cluster to retrieve the log URI.\n\n Args:\n cluster_id (str): The cluster to inspect.\n\n Raises:\n EmrError: the log URI was missing (S3 log mirroring not enabled for this cluster)\n\n Returns:\n (str, str): log bucket and key\n """\n check.str_param(cluster_id, "cluster_id")\n\n # The S3 log URI is specified per job flow (cluster)\n log_uri = self.describe_cluster(cluster_id)["Cluster"].get("LogUri", None)\n\n # ugh, seriously boto3?! This will come back as string "None"\n if log_uri == "None" or log_uri is None:\n raise EmrError("Log URI not specified, cannot retrieve step execution logs")\n\n # For some reason the API returns an s3n:// protocol log URI instead of s3://\n log_uri = re.sub("^s3n", "s3", log_uri)\n log_uri_parsed = urlparse(log_uri)\n log_bucket = log_uri_parsed.netloc\n log_key_prefix = log_uri_parsed.path.lstrip("/")\n return log_bucket, log_key_prefix\n\n def retrieve_logs_for_step_id(self, log, cluster_id, step_id):\n """Retrieves stdout and stderr logs for the given step ID.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): EMR cluster ID\n step_id (str): EMR step ID for the job that was submitted.\n\n Returns:\n (str, str): Tuple of stdout log string contents, and stderr log string contents\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n log_bucket, log_key_prefix = self.log_location_for_cluster(cluster_id)\n\n prefix = f"{log_key_prefix}{cluster_id}/steps/{step_id}"\n stdout_log = self.wait_for_log(log, log_bucket, f"{prefix}/stdout.gz")\n stderr_log = self.wait_for_log(log, log_bucket, f"{prefix}/stderr.gz")\n return stdout_log, stderr_log\n\n def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):\n """Wait for gzipped EMR logs to appear on S3. 
Note that EMR syncs logs to S3 every 5\n minutes, so this may take a long time.\n\n Args:\n log_bucket (str): S3 bucket where log is expected to appear\n log_key (str): S3 key for the log file\n waiter_delay (int): How long to wait between attempts to check S3 for the log file\n waiter_max_attempts (int): Number of attempts before giving up on waiting\n\n Raises:\n EmrError: Raised if we waited the full duration and the logs did not appear\n\n Returns:\n str: contents of the log file\n """\n check.str_param(log_bucket, "log_bucket")\n check.str_param(log_key, "log_key")\n check.int_param(waiter_delay, "waiter_delay")\n check.int_param(waiter_max_attempts, "waiter_max_attempts")\n\n log.info(f"Attempting to get log: s3://{log_bucket}/{log_key}")\n\n s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)\n waiter = s3.get_waiter("object_exists")\n try:\n waiter.wait(\n Bucket=log_bucket,\n Key=log_key,\n WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},\n )\n except WaiterError as err:\n raise EmrError("EMR log file did not appear on S3 after waiting") from err\n\n obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())\n gzip_file = gzip.GzipFile(fileobj=obj)\n return gzip_file.read().decode("utf-8")
\n\n\ndef _get_reason(cluster_or_step):\n """Get state change reason message."""\n # StateChangeReason is {} before the first state change\n return cluster_or_step["Status"]["StateChangeReason"].get("Message", "")\n
", "current_page_name": "_modules/dagster_aws/emr/emr", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.emr"}, "pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.pyspark_step_launcher

\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport time\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.errors import DagsterInvariantViolationError, raise_execution_interrupts\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._serdes import deserialize_value\n\nfrom dagster_aws.emr import EmrError, EmrJobRunner, emr_step_main\nfrom dagster_aws.emr.configs_spark import spark_config as get_spark_config\nfrom dagster_aws.utils.mrjob.log4j import parse_hadoop_log4j_records\n\n# On EMR, Spark is installed here\nEMR_SPARK_HOME = "/usr/lib/spark/"\n\nCODE_ZIP_NAME = "code.zip"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "spark_config": get_spark_config(),\n "cluster_id": Field(\n StringSource, description="Name of the job flow (cluster) on which to execute."\n ),\n "region_name": Field(StringSource, description="The AWS region that the cluster is in."),\n "action_on_failure": Field(\n str,\n is_required=False,\n default_value="CANCEL_AND_WAIT",\n description=(\n "The EMR action to take when the cluster step fails: "\n "https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html"\n ),\n ),\n "staging_bucket": Field(\n StringSource,\n is_required=True,\n description=(\n "S3 bucket to use for passing files between the plan process and EMR process."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="emr_staging",\n description=(\n "S3 key prefix inside the staging_bucket to use for files passed the plan "\n "process and EMR process"\n ),\n ),\n "wait_for_logs": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, the system will wait for EMR logs to appear on S3. Note that logs "\n "are copied every 5 minutes, so enabling this will add several minutes to the job "\n "runtime."\n ),\n ),\n "local_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to the package that contains the job definition(s) whose steps will"\n " execute remotely on EMR. This is a path on the local fileystem of the process"\n " executing the job. 
The expectation is that this package will also be available on"\n " the python path of the launched process running the Spark step on EMR, either"\n " deployed on step launch via the deploy_local_job_package option, referenced on s3"\n " via the s3_job_package_path option, or installed on the cluster via bootstrap"\n " actions."\n ),\n ),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "(legacy) Absolute path to the package that contains the pipeline definition(s)"\n " whose steps will execute remotely on EMR. This is a path on the local fileystem"\n " of the process executing the pipeline. The expectation is that this package will"\n " also be available on the python path of the launched process running the Spark"\n " step on EMR, either deployed on step launch via the deploy_local_pipeline_package"\n " option, referenced on s3 via the s3_pipeline_package_path option, or installed on"\n " the cluster via bootstrap actions."\n ),\n ),\n "deploy_local_job_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "deploy_local_pipeline_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "(legacy) If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. 
If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "s3_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_job_package should not be set to True."\n ),\n ),\n "s3_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_pipeline_package should not be set to True."\n ),\n ),\n }\n)\ndef emr_pyspark_step_launcher(context):\n # Resolve legacy arguments\n if context.resource_config.get("local_job_package_path") and context.resource_config.get(\n "local_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``local_job_package_path`` and legacy version "\n "``local_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n if not context.resource_config.get(\n "local_job_package_path"\n ) and not context.resource_config.get("local_pipeline_package_path"):\n raise DagsterInvariantViolationError(\n "For resource ``emr_pyspark_step_launcher``, no config value provided for required "\n "schema entry ``local_job_package_path``."\n )\n\n local_job_package_path = context.resource_config.get(\n "local_job_package_path"\n ) or context.resource_config.get("local_pipeline_package_path")\n\n if context.resource_config.get("deploy_local_job_package") and context.resource_config.get(\n "deploy_local_pipeline_package"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``deploy_local_job_package`` and legacy version "\n "``deploy_local_pipeline_package`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. Please choose one or the other."\n )\n\n deploy_local_job_package = context.resource_config.get(\n "deploy_local_job_package"\n ) or context.resource_config.get("deploy_local_pipeline_package")\n\n if context.resource_config.get("s3_job_package_path") and context.resource_config.get(\n "s3_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``s3_job_package_path`` and legacy version "\n "``s3_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n s3_job_package_path = context.resource_config.get(\n "s3_job_package_path"\n ) or context.resource_config.get("s3_pipeline_package_path")\n\n return EmrPySparkStepLauncher(\n region_name=context.resource_config.get("region_name"),\n staging_bucket=context.resource_config.get("staging_bucket"),\n staging_prefix=context.resource_config.get("staging_prefix"),\n wait_for_logs=context.resource_config.get("wait_for_logs"),\n action_on_failure=context.resource_config.get("action_on_failure"),\n cluster_id=context.resource_config.get("cluster_id"),\n spark_config=context.resource_config.get("spark_config"),\n local_job_package_path=local_job_package_path,\n deploy_local_job_package=deploy_local_job_package,\n s3_job_package_path=s3_job_package_path,\n )
\n\n\nemr_pyspark_step_launcher.__doc__ = "\\n".join(\n "- **" + option + "**: " + (field.description or "")\n for option, field in emr_pyspark_step_launcher.config_schema.config_type.fields.items() # type: ignore\n)\n\n\nclass EmrPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n region_name,\n staging_bucket,\n staging_prefix,\n wait_for_logs,\n action_on_failure,\n cluster_id,\n spark_config,\n local_job_package_path,\n deploy_local_job_package,\n s3_job_package_path=None,\n ):\n self.region_name = check.str_param(region_name, "region_name")\n self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")\n self.cluster_id = check.str_param(cluster_id, "cluster_id")\n self.spark_config = spark_config\n\n check.invariant(\n not deploy_local_job_package or not s3_job_package_path,\n "If deploy_local_job_package is set to True, s3_job_package_path should not "\n "also be set.",\n )\n\n self.local_job_package_path = check.str_param(\n local_job_package_path, "local_job_package_path"\n )\n self.deploy_local_job_package = check.bool_param(\n deploy_local_job_package, "deploy_local_job_package"\n )\n self.s3_job_package_path = check.opt_str_param(s3_job_package_path, "s3_job_package_path")\n\n self.emr_job_runner = EmrJobRunner(region=self.region_name)\n\n def _post_artifacts(self, log, step_run_ref, run_id, step_key):\n """Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.\n\n For the zip file, consider the following toy example:\n\n # Folder: my_pyspark_project/\n # a.py\n def foo():\n print(1)\n\n # b.py\n def bar():\n print(2)\n\n # main.py\n from a import foo\n from b import bar\n\n foo()\n bar()\n\n This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. 
Then, when running\n `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will\n print 1, 2.\n """\n from dagster_pyspark.utils import build_pyspark_zip\n\n with tempfile.TemporaryDirectory() as temp_dir:\n s3 = boto3.client("s3", region_name=self.region_name)\n\n # Upload step run ref\n def _upload_file_to_s3(local_path, s3_filename):\n key = self._artifact_s3_key(run_id, step_key, s3_filename)\n s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)\n log.debug(f"Uploading file {local_path} to {s3_uri}")\n s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)\n\n # Upload main file.\n # The remote Dagster installation should also have the file, but locating it there\n # could be a pain.\n main_local_path = self._main_file_local_path()\n _upload_file_to_s3(main_local_path, self._main_file_name())\n\n if self.deploy_local_job_package:\n # Zip and upload package containing job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n\n build_pyspark_zip(zip_local_path, self.local_job_package_path)\n _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)\n\n # Create step run ref pickle file\n step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)\n with open(step_run_ref_local_path, "wb") as step_pickle_file:\n pickle.dump(step_run_ref, step_pickle_file)\n\n _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)\n\n def launch_step(self, step_context):\n step_run_ref = step_context_to_step_run_ref(step_context, self.local_job_package_path)\n\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._post_artifacts(log, step_run_ref, run_id, step_key)\n\n emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.op.name)\n emr_step_id = self.emr_job_runner.add_job_flow_steps(log, self.cluster_id, [emr_step_def])[\n 0\n ]\n\n yield from self.wait_for_completion_and_log(run_id, step_key, emr_step_id, 
step_context)\n\n def wait_for_completion_and_log(self, run_id, step_key, emr_step_id, step_context):\n s3 = boto3.resource("s3", region_name=self.region_name)\n try:\n for event in self.wait_for_completion(step_context, s3, run_id, step_key, emr_step_id):\n yield event\n except EmrError as emr_error:\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n raise emr_error\n\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n\n def wait_for_completion(\n self, step_context, s3, run_id, step_key, emr_step_id, check_interval=15\n ):\n """We want to wait for the EMR steps to complete, and while that's happening, we want to\n yield any events that have been written to S3 for us by the remote process.\n After the the EMR steps complete, we want a final chance to fetch events before finishing\n the step.\n """\n done = False\n all_events = []\n # If this is being called within a `capture_interrupts` context, allow interrupts\n # while waiting for the pyspark execution to complete, so that we can terminate slow or\n # hanging steps\n while not done:\n with raise_execution_interrupts():\n time.sleep(check_interval) # AWS rate-limits us if we poll it too often\n done = self.emr_job_runner.is_emr_step_complete(\n step_context.log, self.cluster_id, emr_step_id\n )\n\n all_events_new = self.read_events(s3, run_id, step_key)\n\n if len(all_events_new) > len(all_events):\n for i in range(len(all_events), len(all_events_new)):\n event = all_events_new[i]\n # write each event from the EMR instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.dagster_event\n all_events = all_events_new\n\n def read_events(self, s3, run_id, step_key):\n events_s3_obj = s3.Object(\n self.staging_bucket, self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n )\n\n try:\n events_data = events_s3_obj.get()["Body"].read()\n return 
deserialize_value(pickle.loads(events_data))\n except ClientError as ex:\n # The file might not be there yet, which is fine\n if ex.response["Error"]["Code"] == "NoSuchKey":\n return []\n else:\n raise ex\n\n def _log_logs_from_s3(self, log, emr_step_id):\n """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs\n them to the given log.\n """\n stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(\n log, self.cluster_id, emr_step_id\n )\n # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for\n # Dagster's logging system.\n records = parse_hadoop_log4j_records(stderr_log)\n for record in records:\n if record.level:\n log.log(\n level=record.level,\n msg="".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),\n )\n else:\n log.debug(f"Spark Driver stderr: {record.message}")\n\n sys.stdout.write(\n "---------- Spark Driver stdout: ----------\\n"\n + stdout_log\n + "\\n"\n + "---------- End of Spark Driver stdout ----------\\n"\n )\n\n def _get_emr_step_def(self, run_id, step_key, solid_name):\n """From the local Dagster instance, construct EMR steps that will kick off execution on a\n remote EMR cluster.\n """\n from dagster_spark.utils import flatten_dict, format_for_cli\n\n action_on_failure = self.action_on_failure\n\n # Execute Solid via spark-submit\n conf = dict(flatten_dict(self.spark_config))\n conf["spark.app.name"] = conf.get("spark.app.name", solid_name)\n\n check.invariant(\n conf.get("spark.master", "yarn") == "yarn",\n desc=(\n "spark.master is configured as %s; cannot set Spark master on EMR to anything "\n 'other than "yarn"'\n )\n % conf.get("spark.master"),\n )\n\n command = (\n [\n EMR_SPARK_HOME + "bin/spark-submit",\n "--master",\n "yarn",\n "--deploy-mode",\n conf.get("spark.submit.deployMode", "client"),\n ]\n + format_for_cli(list(flatten_dict(conf)))\n + [\n "--py-files",\n self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),\n 
self._artifact_s3_uri(run_id, step_key, self._main_file_name()),\n self.staging_bucket,\n self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n ]\n )\n\n return EmrJobRunner.construct_step_dict_for_command(\n "Execute Solid/Op %s" % solid_name, command, action_on_failure=action_on_failure\n )\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return emr_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _artifact_s3_uri(self, run_id, step_key, filename):\n key = self._artifact_s3_key(run_id, self._sanitize_step_key(step_key), filename)\n return f"s3://{self.staging_bucket}/{key}"\n\n def _artifact_s3_key(self, run_id, step_key, filename):\n return "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n
", "current_page_name": "_modules/dagster_aws/emr/pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.pyspark_step_launcher"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.types

\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\n\nEbsVolumeType = Enum(\n    name="EbsVolumeType", enum_values=[EnumValue("gp2"), EnumValue("io1"), EnumValue("standard")]\n)\n\n\n
[docs]class EmrClusterState(PyEnum):\n Starting = "STARTING"\n Bootstrapping = "BOOTSTRAPPING"\n Running = "RUNNING"\n Waiting = "WAITING"\n Terminating = "TERMINATING"\n Terminated = "TERMINATED"\n TerminatedWithErrors = "TERMINATED_WITH_ERRORS"
\n\n\nEMR_CLUSTER_TERMINATED_STATES = [\n EmrClusterState.Terminating,\n EmrClusterState.Terminated,\n EmrClusterState.TerminatedWithErrors,\n]\n\nEMR_CLUSTER_DONE_STATES = EMR_CLUSTER_TERMINATED_STATES + [EmrClusterState.Waiting]\n\n\n
[docs]class EmrStepState(PyEnum):\n Pending = "PENDING"\n Running = "RUNNING"\n Continue = "CONTINUE"\n Completed = "COMPLETED"\n Cancelled = "CANCELLED"\n Failed = "FAILED"\n Interrupted = "INTERRUPTED"
\n\n\nEmrActionOnFailure = Enum(\n name="EmrActionOnFailure",\n enum_values=[\n EnumValue("TERMINATE_JOB_FLOW"),\n EnumValue("TERMINATE_CLUSTER"),\n EnumValue("CANCEL_AND_WAIT"),\n EnumValue("CONTINUE"),\n ],\n)\n\nEmrAdjustmentType = Enum(\n name="EmrAdjustmentType",\n enum_values=[\n EnumValue("CHANGE_IN_CAPACITY"),\n EnumValue("PERCENT_CHANGE_IN_CAPACITY"),\n EnumValue("EXACT_CAPACITY"),\n ],\n)\n\nEmrComparisonOperator = Enum(\n name="EmrComparisonOperator",\n enum_values=[\n EnumValue("GREATER_THAN_OR_EQUAL"),\n EnumValue("GREATER_THAN"),\n EnumValue("LESS_THAN"),\n EnumValue("LESS_THAN_OR_EQUAL"),\n ],\n)\n\nEmrInstanceRole = Enum(\n name="EmrInstanceRole", enum_values=[EnumValue("MASTER"), EnumValue("CORE"), EnumValue("TASK")]\n)\n\nEmrMarket = Enum(name="EmrMarket", enum_values=[EnumValue("ON_DEMAND"), EnumValue("SPOT")])\n\nEmrRepoUpgradeOnBoot = Enum(\n name="EmrRepoUpgradeOnBoot", enum_values=[EnumValue("SECURITY"), EnumValue("NONE")]\n)\n\nEmrScaleDownBehavior = Enum(\n name="EmrScaleDownBehavior",\n enum_values=[\n EnumValue("TERMINATE_AT_INSTANCE_HOUR"),\n EnumValue("TERMINATE_AT_TASK_COMPLETION"),\n ],\n)\n\nEmrStatistic = Enum(\n name="EmrStatistic",\n enum_values=[\n EnumValue("SAMPLE_COUNT"),\n EnumValue("AVERAGE"),\n EnumValue("SUM"),\n EnumValue("MINIMUM"),\n EnumValue("MAXIMUM"),\n ],\n)\n\nEmrSupportedProducts = Enum(\n name="EmrSupportedProducts", enum_values=[EnumValue("mapr-m3"), EnumValue("mapr-m5")]\n)\n\nEmrTimeoutAction = Enum(\n name="EmrTimeoutAction",\n enum_values=[EnumValue("SWITCH_TO_ON_DEMAND"), EnumValue("TERMINATE_CLUSTER")],\n)\n\nEmrUnit = Enum(\n name="EmrUnit",\n enum_values=[\n EnumValue("NONE"),\n EnumValue("SECONDS"),\n EnumValue("MICRO_SECONDS"),\n EnumValue("MILLI_SECONDS"),\n EnumValue("BYTES"),\n EnumValue("KILO_BYTES"),\n EnumValue("MEGA_BYTES"),\n EnumValue("GIGA_BYTES"),\n EnumValue("TERA_BYTES"),\n EnumValue("BITS"),\n EnumValue("KILO_BITS"),\n EnumValue("MEGA_BITS"),\n EnumValue("GIGA_BITS"),\n 
EnumValue("TERA_BITS"),\n EnumValue("PERCENT"),\n EnumValue("COUNT"),\n EnumValue("BYTES_PER_SECOND"),\n EnumValue("KILO_BYTES_PER_SECOND"),\n EnumValue("MEGA_BYTES_PER_SECOND"),\n EnumValue("GIGA_BYTES_PER_SECOND"),\n EnumValue("TERA_BYTES_PER_SECOND"),\n EnumValue("BITS_PER_SECOND"),\n EnumValue("KILO_BITS_PER_SECOND"),\n EnumValue("MEGA_BITS_PER_SECOND"),\n EnumValue("GIGA_BITS_PER_SECOND"),\n EnumValue("TERA_BITS_PER_SECOND"),\n EnumValue("COUNT_PER_SECOND"),\n ],\n)\n
", "current_page_name": "_modules/dagster_aws/emr/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.types"}}, "redshift": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.redshift.resources

\nimport abc\nfrom contextlib import contextmanager\nfrom logging import Logger\nfrom typing import Any, Dict, Optional, cast\n\nimport psycopg2\nimport psycopg2.extensions\nfrom dagster import (\n    ConfigurableResource,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\nclass RedshiftError(Exception):\n    pass\n\n\nclass BaseRedshiftClient(abc.ABC):\n    @abc.abstractmethod\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        pass\n\n    @abc.abstractmethod\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        pass\n\n\nclass RedshiftClient(BaseRedshiftClient):\n    def __init__(self, conn_args: Dict[str, Any], autocommit: Optional[bool], log: Logger):\n        # Extract parameters from resource config\n        self.conn_args = conn_args\n\n        self.autocommit = autocommit\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Synchronously execute a single query against Redshift. Will return a list of rows, where\n        each row is a tuple of values, e.g. SELECT 1 will return [(1,)].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. 
Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                try:\n                    self.log.info(f"Executing query '{query}'")\n                    cursor.execute(query)\n\n                    if fetch_results and cursor.rowcount > 0:\n                        return cursor.fetchall()\n                    else:\n                        self.log.info("Empty result from query")\n\n                except Exception as e:\n                    # If autocommit is disabled or not set (it is disabled by default), Redshift\n                    # will be in the middle of a transaction at exception time, and because of\n                    # the failure the current transaction will not accept any further queries.\n                    #\n                    # This conn.commit() call closes the open transaction before handing off\n                    # control to the error callback, so that the 
user can issue additional\n                    # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                    # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                    # things are in a usable state in the error callback.\n                    if not self.autocommit:\n                        conn.commit()\n\n                    if error_callback is not None:\n                        error_callback(e, cursor, self.log)\n                    else:\n                        raise\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Synchronously execute a list of queries against Redshift. Will return a list of list of\n        rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return\n        [[(1,)], [(1,)]].\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n            cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. 
If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        results = []\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                for query in queries:\n                    try:\n                        self.log.info(f"Executing query '{query}'")\n                        cursor.execute(query)\n\n                        if fetch_results and cursor.rowcount > 0:\n                            results.append(cursor.fetchall())\n                        else:\n                            results.append([])\n                            self.log.info("Empty result from query")\n\n                    except Exception as e:\n                        # If autocommit is disabled or not set (it is disabled by default), Redshift\n                        # will be in the middle of a transaction at exception time, and because of\n                        # the failure the current transaction will not accept any further queries.\n                        #\n                        # This conn.commit() call closes the open transaction before handing off\n                        # control to the error callback, so that the user can issue additional\n                        # queries. Notably, for e.g. 
pg_last_copy_id() to work, it requires you to\n                        # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                        # things are in a usable state in the error callback.\n                        if not self.autocommit:\n                            conn.commit()\n\n                        if error_callback is not None:\n                            error_callback(e, cursor, self.log)\n                        else:\n                            raise\n\n        if fetch_results:\n            return results\n\n    @contextmanager\n    def _get_conn(self):\n        conn = None\n        try:\n            conn = psycopg2.connect(**self.conn_args)\n            yield conn\n        finally:\n            if conn:\n                conn.close()\n\n    @contextmanager\n    def _get_cursor(self, conn, cursor_factory=None):\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n\n        # Could be none, in which case we should respect the connection default. Otherwise\n        # explicitly set to true/false.\n        if self.autocommit is not None:\n            conn.autocommit = self.autocommit\n\n        with conn:\n            with conn.cursor(cursor_factory=cursor_factory) as cursor:\n                yield cursor\n\n            # If autocommit is set, we'll commit after each and every query execution. 
Otherwise, we\n            # want to do a final commit after we're wrapped up executing the full set of one or more\n            # queries.\n            if not self.autocommit:\n                conn.commit()\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use RedshiftClientResource instead.")\nclass RedshiftResource(RedshiftClient):\n    """This class was used by the function-style Redshift resource."""\n\n\nclass FakeRedshiftClient(BaseRedshiftClient):\n    QUERY_RESULT = [(1,)]\n\n    def __init__(self, log: Logger):\n        # Extract parameters from resource config\n\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Fake for execute_query; returns [self.QUERY_RESULT].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. 
Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return self.QUERY_RESULT\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Fake for execute_queries; returns [self.QUERY_RESULT] * 3.\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        for query in queries:\n            self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return [self.QUERY_RESULT] * 3\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use FakeRedshiftClientResource instead.")\nclass FakeRedshiftResource(FakeRedshiftClient):\n    """This class was used by the function-style fake Redshift resource."""\n\n\n
[docs]class RedshiftClientResource(ConfigurableResource):\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import Definitions, asset, EnvVar\n from dagster_aws.redshift import RedshiftClientResource\n\n @asset\n def example_redshift_asset(context, redshift: RedshiftClientResource):\n redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = RedshiftClientResource(\n host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n port=5439,\n user='dagster',\n password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n database='dev',\n )\n\n defs = Definitions(\n assets=[example_redshift_asset],\n resources={'redshift': redshift_configured},\n )\n\n """\n\n host: str = Field(description="Redshift host")\n port: int = Field(default=5439, description="Redshift port")\n user: Optional[str] = Field(default=None, description="Username for Redshift connection")\n password: Optional[str] = Field(default=None, description="Password for Redshift connection")\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use USE DATABASE to change"\n " the database."\n ),\n )\n autocommit: Optional[bool] = Field(default=None, description="Whether to autocommit queries")\n connect_timeout: int = Field(\n default=5, description="Timeout for connection to Redshift cluster. Defaults to 5 seconds."\n )\n sslmode: str = Field(\n default="require",\n description=(\n "SSL mode to use. 
See the Redshift documentation for reference:"\n " https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> RedshiftClient:\n conn_args = {\n k: getattr(self, k, None)\n for k in (\n "host",\n "port",\n "user",\n "password",\n "database",\n "connect_timeout",\n "sslmode",\n )\n if getattr(self, k, None) is not None\n }\n\n return RedshiftClient(conn_args, self.autocommit, get_dagster_logger())
\n\n\n
[docs]class FakeRedshiftClientResource(RedshiftClientResource):\n def get_client(self) -> FakeRedshiftClient:\n return FakeRedshiftClient(get_dagster_logger())
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=RedshiftClientResource.to_config_schema(),\n description="Resource for connecting to the Redshift data warehouse",\n)\ndef redshift_resource(context) -> RedshiftClient:\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, op\n from dagster_aws.redshift import redshift_resource\n\n @op(required_resource_keys={'redshift'})\n def example_redshift_op(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = redshift_resource.configured({\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n })\n context = build_op_context(resources={'redshift': redshift_configured})\n assert example_redshift_op(context) == [(1,)]\n\n """\n return RedshiftClientResource.from_resource_context(context).get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=FakeRedshiftClientResource.to_config_schema(),\n description=(\n "Fake resource for connecting to the Redshift data warehouse. Usage is identical "\n "to the real redshift_resource. Will always return [(1,)] for the single query case and "\n "[[(1,)], [(1,)], [(1,)]] for the multi query case."\n ),\n)\ndef fake_redshift_resource(context) -> FakeRedshiftClient:\n return cast(\n FakeRedshiftClient,\n FakeRedshiftClientResource.from_resource_context(context).get_client(),\n )
\n
", "current_page_name": "_modules/dagster_aws/redshift/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.redshift.resources"}}, "s3": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence\n\nimport boto3\nimport dagster._seven as seven\nfrom botocore.errorfactory import ClientError\nfrom dagster import (\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nPOLLING_INTERVAL = 5\n\n\n
[docs]class S3ComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs compute function stdout and stderr to S3.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n skip_empty_files: true\n upload_interval: 30\n upload_extra_args:\n ServerSideEncryption: "AES256"\n show_url_only: false\n region: "us-west-1"\n\n Args:\n bucket (str): The name of the s3 bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n use_ssl (Optional[bool]): Whether or not to use SSL. Default True.\n verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.\n verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if\n `verify` set to False.\n endpoint_url (Optional[str]): Override for the S3 endpoint url.\n skip_empty_files: (Optional[bool]): Skip upload of empty log files.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to S3. By default, will only upload when the capture is complete.\n upload_extra_args: (Optional[dict]): Extra args for S3 file upload\n show_url_only: (Optional[bool]): Only show the URL of the log file in the UI, instead of fetching and displaying the full content. Default False.\n region: (Optional[str]): The region of the S3 bucket. 
If not specified, will use the default region of the AWS session.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n use_ssl=True,\n verify=True,\n verify_cert_path=None,\n endpoint_url=None,\n skip_empty_files=False,\n upload_interval=None,\n upload_extra_args=None,\n show_url_only=False,\n region=None,\n ):\n _verify = False if not verify else verify_cert_path\n self._s3_session = boto3.resource(\n "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url\n ).meta.client\n self._s3_bucket = check.str_param(bucket, "bucket")\n self._s3_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n check.opt_dict_param(upload_extra_args, "upload_extra_args")\n self._upload_extra_args = upload_extra_args\n self._show_url_only = show_url_only\n if region is None:\n # if unspecified, use the current session name\n self._region = self._s3_session.meta.region_name\n else:\n self._region = region\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "use_ssl": Field(bool, is_required=False, default_value=True),\n 
"verify": Field(bool, is_required=False, default_value=True),\n "verify_cert_path": Field(StringSource, is_required=False),\n "endpoint_url": Field(StringSource, is_required=False),\n "skip_empty_files": Field(bool, is_required=False, default_value=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n "upload_extra_args": Field(\n Permissive(), is_required=False, description="Extra args for S3 file upload"\n ),\n "show_url_only": Field(bool, is_required=False, default_value=False),\n "region": Field(StringSource, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return S3ComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _s3_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._s3_prefix, "storage", *namespace, filename]\n return "/".join(paths) # s3 path delimiter\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Iterator[CapturedLogContext]:\n with super().capture_logs(log_key) as local_context:\n if not self._show_url_only:\n yield local_context\n else:\n out_key = self._s3_key(log_key, ComputeIOType.STDOUT)\n err_key = self._s3_key(log_key, ComputeIOType.STDERR)\n s3_base = f"https://s3.console.aws.amazon.com/s3/object/{self._s3_bucket}?region={self._region}"\n yield CapturedLogContext(\n local_context.log_key,\n 
external_stdout_url=f"{s3_base}&prefix={out_key}",\n external_stderr_url=f"{s3_base}&prefix={err_key}",\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n\n s3_keys_to_remove = None\n if log_key:\n s3_keys_to_remove = [\n self._s3_key(log_key, ComputeIOType.STDOUT),\n self._s3_key(log_key, ComputeIOType.STDERR),\n self._s3_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._s3_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n elif prefix:\n # add the trailing '' to make sure that ['a'] does not match ['apple']\n s3_prefix = "/".join([self._s3_prefix, "storage", *prefix, ""])\n matching = self._s3_session.list_objects(Bucket=self._s3_bucket, Prefix=s3_prefix)\n s3_keys_to_remove = [obj["Key"] for obj in matching.get("Contents", [])]\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if s3_keys_to_remove:\n to_delete = [{"Key": key} for key in s3_keys_to_remove]\n self._s3_session.delete_objects(Bucket=self._s3_bucket, Delete={"Objects": to_delete})\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n s3_key = self._s3_key(log_key, io_type)\n return self._s3_session.generate_presigned_url(\n ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": s3_key}\n )\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n s3_key = self._s3_key(log_key, io_type)\n return f"s3://{self._s3_bucket}/{s3_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n try: # https://stackoverflow.com/a/38376288/14656695\n self._s3_session.head_object(Bucket=self._s3_bucket, 
Key=s3_key)\n except ClientError:\n return False\n return True\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if (self._skip_empty_files or partial) and os.stat(path).st_size == 0:\n return\n\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n extra_args = {\n "ContentType": "text/plain",\n **(self._upload_extra_args if self._upload_extra_args else {}),\n }\n self._s3_session.upload_fileobj(data, self._s3_bucket, s3_key, ExtraArgs=extra_args)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self._local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._s3_session.download_fileobj(self._s3_bucket, s3_key, fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_aws/s3/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class S3FileHandle(FileHandle):\n """A reference to a file on S3."""\n\n def __init__(self, s3_bucket: str, s3_key: str):\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_key = check.str_param(s3_key, "s3_key")\n\n @property\n def s3_bucket(self) -> str:\n """str: The name of the S3 bucket."""\n return self._s3_bucket\n\n @property\n def s3_key(self) -> str:\n """str: The S3 key."""\n return self._s3_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's S3 URL."""\n return self.s3_path\n\n @property\n def s3_path(self) -> str:\n """str: The file's S3 URL."""\n return f"s3://{self.s3_bucket}/{self.s3_key}"
\n\n\nclass S3FileManager(FileManager):\n def __init__(self, s3_session, s3_bucket, s3_base_key):\n self._s3_session = s3_session\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_base_key = check.str_param(s3_base_key, "s3_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n self._s3_session.download_file(\n Bucket=file_handle.s3_bucket, Key=file_handle.s3_key, Filename=temp_name\n )\n self._local_handle_cache[file_handle.s3_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", S3FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.s3_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.s3_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n s3_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n self._s3_session.put_object(Body=file_obj, Bucket=self._s3_bucket, Key=s3_key)\n return S3FileHandle(self._s3_bucket, s3_key)\n\n def get_full_key(self, file_key):\n return f"{self._s3_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_aws/s3/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.io_manager

\nimport io\nimport pickle\nfrom typing import Any, Dict, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    MetadataValue,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import S3Resource\n\n\nclass PickledObjectS3IOManager(UPathIOManager):\n    def __init__(\n        self,\n        s3_bucket: str,\n        s3_session: Any,\n        s3_prefix: Optional[str] = None,\n    ):\n        self.bucket = check.str_param(s3_bucket, "s3_bucket")\n        check.opt_str_param(s3_prefix, "s3_prefix")\n        self.s3 = s3_session\n        self.s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, MaxKeys=1)\n        base_path = UPath(s3_prefix) if s3_prefix else None\n        super().__init__(base_path=base_path)\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        try:\n            s3_obj = self.s3.get_object(Bucket=self.bucket, Key=str(path))["Body"].read()\n            return pickle.loads(s3_obj)\n        except self.s3.exceptions.NoSuchKey:\n            raise FileNotFoundError(f"Could not find file {path} in S3 bucket {self.bucket}")\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing S3 object: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        pickled_obj_bytes = io.BytesIO(pickled_obj)\n        self.s3.upload_fileobj(pickled_obj_bytes, self.bucket, str(path))\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n       
     self.s3.get_object(Bucket=self.bucket, Key=str(path))\n        except self.s3.exceptions.NoSuchKey:\n            return False\n        return True\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading S3 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing S3 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        self.s3.delete_object(Bucket=self.bucket, Key=str(path))\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in S3\n        return None\n\n    def get_metadata(self, context: OutputContext, obj: Any) -> Dict[str, MetadataValue]:\n        path = self._get_path(context)\n        return {"uri": MetadataValue.path(self._uri_for_path(path))}\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        return UPath("storage", super().get_op_output_relative_path(context))\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"s3://{self.bucket}/{path}"\n\n\n
[docs]class S3PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_aws.s3 import S3PickleIOManager, S3Resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": S3PickleIOManager(\n s3_resource=S3Resource(),\n s3_bucket="my-cool-bucket",\n s3_prefix="my-cool-prefix",\n )\n }\n )\n\n """\n\n s3_resource: ResourceDependency[S3Resource]\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @cached_method\n def inner_io_manager(self) -> PickledObjectS3IOManager:\n return PickledObjectS3IOManager(\n s3_bucket=self.s3_bucket,\n s3_session=self.s3_resource.get_client(),\n s3_prefix=self.s3_prefix,\n )\n\n def load_input(self, context: InputContext) -> 
Any:\n return self.inner_io_manager().load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n return self.inner_io_manager().handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use S3PickleIOManager instead.",\n)\nclass ConfigurablePickledObjectS3IOManager(S3PickleIOManager):\n """Renamed to S3PickleIOManager. See S3PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=S3PickleIOManager.to_config_schema(),\n required_resource_keys={"s3"},\n)\ndef s3_pickle_io_manager(init_context):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n @job(\n resource_defs={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n def my_job():\n ...\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3IOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_aws/s3/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.ops

\nfrom typing import Any, Generator, Mapping\n\nfrom dagster import (\n    AssetMaterialization,\n    Field,\n    FileHandle,\n    In,\n    MetadataValue,\n    Out,\n    Output,\n    StringSource,\n    _check as check,\n    dagster_type_loader,\n    op,\n)\nfrom dagster._core.types.dagster_type import PythonObjectDagsterType\n\nfrom .file_manager import S3FileHandle\n\n\ndef dict_with_fields(name: str, fields: Mapping[str, object]):\n    check.str_param(name, "name")\n    check.mapping_param(fields, "fields", key_type=str)\n    field_names = set(fields.keys())\n\n    @dagster_type_loader(fields)\n    def _input_schema(_context, value):\n        check.dict_param(value, "value")\n        check.param_invariant(set(value.keys()) == field_names, "value")\n        return value\n\n    class _DictWithSchema(PythonObjectDagsterType):\n        def __init__(self):\n            super(_DictWithSchema, self).__init__(python_type=dict, name=name, loader=_input_schema)\n\n    return _DictWithSchema()\n\n\nS3Coordinate = dict_with_fields(\n    "S3Coordinate",\n    fields={\n        "bucket": Field(StringSource, description="S3 bucket name"),\n        "key": Field(StringSource, description="S3 key name"),\n    },\n)\n\n\ndef last_key(key: str) -> str:\n    if "/" not in key:\n        return key\n    comps = key.split("/")\n    return comps[-1]\n\n\n@op(\n    config_schema={\n        "Bucket": Field(\n            StringSource, description="The name of the bucket to upload to.", is_required=True\n        ),\n        "Key": Field(\n            StringSource, description="The name of the key to upload to.", is_required=True\n        ),\n    },\n    ins={"file_handle": In(FileHandle, description="The file to upload.")},\n    out={"s3_file_handle": Out(S3FileHandle)},\n    description="""Take a file handle and upload it to s3. 
Returns an S3FileHandle.""",\n    required_resource_keys={"s3", "file_manager"},\n)\ndef file_handle_to_s3(context, file_handle) -> Generator[Any, None, None]:\n    bucket = context.op_config["Bucket"]\n    key = context.op_config["Key"]\n\n    file_manager = context.resources.file_manager\n    s3 = context.resources.s3\n\n    with file_manager.read(file_handle, "rb") as fileobj:\n        s3.upload_fileobj(fileobj, bucket, key)\n        s3_file_handle = S3FileHandle(bucket, key)\n\n        yield AssetMaterialization(\n            asset_key=s3_file_handle.s3_path,\n            metadata={last_key(key): MetadataValue.path(s3_file_handle.s3_path)},\n        )\n\n        yield Output(value=s3_file_handle, output_name="s3_file_handle")\n
", "current_page_name": "_modules/dagster_aws/s3/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.resources

\nfrom typing import Any, Optional, TypeVar\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom .file_manager import S3FileManager\nfrom .utils import construct_s3_client\n\nT = TypeVar("T")\n\n\nclass ResourceWithS3Configuration(ConfigurableResource):\n    use_unsigned_session: bool = Field(\n        default=False, description="Specifies whether to use an unsigned S3 session."\n    )\n    region_name: Optional[str] = Field(\n        default=None, description="Specifies a custom region for the S3 session."\n    )\n    endpoint_url: Optional[str] = Field(\n        default=None, description="Specifies a custom endpoint for the S3 session."\n    )\n    max_attempts: int = Field(\n        default=5,\n        description=(\n            "This provides Boto3's retry handler with a value of maximum retry attempts, where the"\n            " initial call counts toward the max_attempts value that you provide."\n        ),\n    )\n    profile_name: Optional[str] = Field(\n        default=None, description="Specifies a profile to connect that session."\n    )\n    use_ssl: bool = Field(\n        default=True, description="Whether or not to use SSL. By default, SSL is used."\n    )\n    verify: Optional[str] = Field(\n        default=None,\n        description=(\n            "Whether or not to verify SSL certificates. 
By default SSL certificates are verified."\n            " You can also specify this argument if you want to use a different CA cert bundle than"\n            " the one used by botocore."\n        ),\n    )\n    aws_access_key_id: Optional[str] = Field(\n        default=None, description="AWS access key ID to use when creating the boto3 session."\n    )\n    aws_secret_access_key: Optional[str] = Field(\n        default=None, description="AWS secret access key to use when creating the boto3 session."\n    )\n    aws_session_token: str = Field(\n        default=None, description="AWS session token to use when creating the boto3 session."\n    )\n\n\n
[docs]class S3Resource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op, Definitions\n from dagster_aws.s3 import S3Resource\n\n @op\n def example_s3_op(s3: S3Resource):\n return s3.get_client().list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job\n def example_job():\n example_s3_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={'s3': S3Resource(region_name='us-west-1')}\n )\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> Any:\n return construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=S3Resource.to_config_schema())\ndef s3_resource(context) -> Any:\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.s3 import s3_resource\n\n @op(required_resource_keys={'s3'})\n def example_s3_op(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job(resource_defs={'s3': s3_resource})\n def example_job():\n example_s3_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for S3 session. Default is default\n # profile as specified in ~/.aws/credentials file\n use_ssl: true\n # Optional[bool]: Whether or not to use SSL. By default, SSL is used.\n verify: None\n # Optional[str]: Whether or not to verify SSL certificates. 
By default SSL certificates are verified.\n # You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore."\n aws_access_key_id: None\n # Optional[str]: The access key to use when creating the client.\n aws_secret_access_key: None\n # Optional[str]: The secret key to use when creating the client.\n aws_session_token: None\n # Optional[str]: The session token to use when creating the client.\n """\n return S3Resource.from_resource_context(context).get_client()
\n\n\n
[docs]class S3FileManagerResource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n def get_client(self) -> S3FileManager:\n return S3FileManager(\n s3_session=construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n ),\n s3_bucket=self.s3_bucket,\n s3_base_key=self.s3_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=S3FileManagerResource.to_config_schema(),\n)\ndef s3_file_manager(context) -> S3FileManager:\n """FileManager that provides abstract access to S3.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return S3FileManagerResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_aws/s3/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.resources"}}, "secretsmanager": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.secretsmanager.resources

\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Dict, Generator, List, Optional, cast\n\nfrom dagster import (\n    Field as LegacyDagsterField,\n    resource,\n)\nfrom dagster._config.field_utils import Shape\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.test_utils import environ\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom dagster_aws.utils import ResourceWithBoto3Configuration\n\nfrom .secrets import construct_secretsmanager_client, get_secrets_from_arns, get_tagged_secrets\n\nif TYPE_CHECKING:\n    import botocore\n\n\n
[docs]class SecretsManagerResource(ResourceWithBoto3Configuration):\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import SecretsManagerResource\n\n @op\n def example_secretsmanager_op(secretsmanager: SecretsManagerResource):\n return secretsmanager.get_client().get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job\n def example_job():\n example_secretsmanager_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secretsmanager': SecretsManagerResource(\n region_name='us-west-1'\n )\n }\n )\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> "botocore.client.SecretsManager":\n return construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(SecretsManagerResource.to_config_schema())\ndef secretsmanager_resource(context) -> "botocore.client.SecretsManager":\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_resource\n\n @op(required_resource_keys={'secretsmanager'})\n def example_secretsmanager_op(context):\n return context.resources.secretsmanager.get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job(resource_defs={'secretsmanager': secretsmanager_resource})\n def example_job():\n example_secretsmanager_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secretsmanager': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return SecretsManagerResource.from_resource_context(context).get_client()
\n\n\n
[docs]class SecretsManagerSecretsResource(ResourceWithBoto3Configuration):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op, ResourceParam\n from dagster_aws.secretsmanager import SecretsManagerSecretsResource\n\n @op\n def example_secretsmanager_secrets_op(secrets: SecretsManagerSecretsResource):\n return secrets.fetch_secrets().get("my-secret-name")\n\n @op\n def example_secretsmanager_secrets_op_2(secrets: SecretsManagerSecretsResource):\n with secrets.secrets_in_environment():\n return os.getenv("my-other-secret-name")\n\n @job\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secrets': SecretsManagerSecretsResource(\n region_name='us-west-1',\n secrets_tag="dagster",\n add_to_environment=True,\n )\n }\n )\n\n Note that your ops must also declare that they require this resource with or it will not be initialized\n for the execution of their compute functions.\n """\n\n secrets: List[str] = Field(\n default=[], description="An array of AWS Secrets Manager secrets arns to fetch."\n )\n secrets_tag: Optional[str] = Field(\n default=None,\n description="AWS Secrets Manager secrets with this tag will be fetched and made available.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def secrets_in_environment(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Generator[Dict[str, str], None, None]:\n """Yields a dict which maps selected SecretsManager secrets to their string values. 
Also\n sets chosen secrets as environment variables.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n secrets_manager = construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )\n\n secrets_tag_to_fetch = secrets_tag if secrets_tag is not None else self.secrets_tag\n secrets_to_fetch = secrets if secrets is not None else self.secrets\n\n secret_arns = merge_dicts(\n (\n get_tagged_secrets(secrets_manager, [secrets_tag_to_fetch])\n if secrets_tag_to_fetch\n else {}\n ),\n get_secrets_from_arns(secrets_manager, secrets_to_fetch),\n )\n\n secrets_map = {\n name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")\n for name, arn in secret_arns.items()\n }\n with environ(secrets_map):\n yield secrets_map\n\n def fetch_secrets(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Dict[str, str]:\n """Fetches secrets from AWS Secrets Manager and returns them as a dict.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n with self.secrets_in_environment(secrets=secrets, secrets_tag=secrets_tag) as secret_values:\n return secret_values
\n\n\nLEGACY_SECRETSMANAGER_SECRETS_SCHEMA = {\n **cast(Shape, SecretsManagerSecretsResource.to_config_schema().as_field().config_type).fields,\n "add_to_environment": LegacyDagsterField(\n bool,\n default_value=False,\n description="Whether to add the secrets to the environment. Defaults to False.",\n ),\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=LEGACY_SECRETSMANAGER_SECRETS_SCHEMA)\n@contextmanager\ndef secretsmanager_secrets_resource(context):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op(context):\n return context.resources.secrets.get("my-secret-name")\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op_2(context):\n return os.getenv("my-other-secret-name")\n\n @job(resource_defs={'secrets': secretsmanager_secrets_resource})\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secrets': {\n 'config': {\n 'region_name': 'us-west-1',\n 'secrets_tag': 'dagster',\n 'add_to_environment': True,\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. 
Default is default\n # profile as specified in ~/.aws/credentials file\n secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n secrets_tag: "dagster"\n # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n # from SecretsManager.\n add_to_environment: true\n # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n # to false.\n\n """\n add_to_environment = context.resource_config.get("add_to_environment", False)\n if add_to_environment:\n with SecretsManagerSecretsResource.from_resource_context(\n context\n ).secrets_in_environment() as secrets:\n yield secrets\n else:\n yield SecretsManagerSecretsResource.from_resource_context(context).fetch_secrets()
\n
", "current_page_name": "_modules/dagster_aws/secretsmanager/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.secretsmanager.resources"}}}, "dagster_azure": {"adls2": {"fake_adls2_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.fake_adls2_resource

\nimport io\nimport random\nfrom typing import Any, Dict, Optional\nfrom unittest import mock\n\nfrom dagster import resource\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\n\nfrom dagster_azure.blob import FakeBlobServiceClient\n\nfrom .utils import ResourceNotFoundError\n\n\n@dagster_maintained_resource\n@resource({"account_name": str})\ndef fake_adls2_resource(context):\n    return FakeADLS2Resource(account_name=context.resource_config["account_name"])\n\n\n
[docs]class FakeADLS2Resource(ConfigurableResource):\n """Stateful mock of an ADLS2Resource for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n account_name: str\n storage_account: Optional[str] = None\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def adls2_client(self) -> "FakeADLS2ServiceClient":\n return FakeADLS2ServiceClient(self.account_name)\n\n @property\n @cached_method\n def blob_client(self) -> FakeBlobServiceClient:\n return FakeBlobServiceClient(self.account_name)\n\n @property\n def lease_client_constructor(self) -> Any:\n return FakeLeaseClient
\n\n\nclass FakeLeaseClient:\n def __init__(self, client):\n self.client = client\n self.id = None\n\n # client needs a ref to self to check if a given lease is valid\n self.client._lease = self # noqa: SLF001\n\n def acquire(self, lease_duration=-1):\n if self.id is None:\n self.id = random.randint(0, 2**9)\n else:\n raise Exception("Lease already held")\n\n def release(self):\n self.id = None\n\n def is_valid(self, lease):\n if self.id is None:\n # no lease is held so any operation is valid\n return True\n return lease == self.id\n\n\nclass FakeADLS2ServiceClient:\n """Stateful mock of an ADLS2 service client for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(self, account_name, credential="fake-creds"):\n self._account_name = account_name\n self._credential = mock.MagicMock()\n self._credential.account_key = credential\n self._file_systems = {}\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def credential(self):\n return self._credential\n\n @property\n def file_systems(self):\n return self._file_systems\n\n def get_file_system_client(self, file_system):\n return self._file_systems.setdefault(\n file_system, FakeADLS2FilesystemClient(self.account_name, file_system)\n )\n\n def get_file_client(self, file_system, file_path):\n return self.get_file_system_client(file_system).get_file_client(file_path)\n\n\nclass FakeADLS2FilesystemClient:\n """Stateful mock of an ADLS2 filesystem client for testing."""\n\n def __init__(self, account_name, file_system_name):\n self._file_system: Dict[str, FakeADLS2FileClient] = {}\n self._account_name = account_name\n self._file_system_name = file_system_name\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def file_system_name(self):\n return self._file_system_name\n\n def keys(self):\n return self._file_system.keys()\n\n def get_file_system_properties(self):\n return {"account_name": 
self.account_name, "file_system_name": self.file_system_name}\n\n def has_file(self, path):\n return bool(self._file_system.get(path))\n\n def get_file_client(self, file_path):\n # pass fileclient a ref to self and its name so the file can delete itself\n self._file_system.setdefault(file_path, FakeADLS2FileClient(self, file_path))\n return self._file_system[file_path]\n\n def create_file(self, file):\n # pass fileclient a ref to self and the file's name so the file can delete itself by\n # accessing the self._file_system dict\n self._file_system.setdefault(file, FakeADLS2FileClient(fs_client=self, name=file))\n return self._file_system[file]\n\n def delete_file(self, file):\n for k in list(self._file_system.keys()):\n if k.startswith(file):\n del self._file_system[k]\n\n\nclass FakeADLS2FileClient:\n """Stateful mock of an ADLS2 file client for testing."""\n\n def __init__(self, name, fs_client):\n self.name = name\n self.contents = None\n self._lease = None\n self.fs_client = fs_client\n\n @property\n def lease(self):\n return self._lease if self._lease is None else self._lease.id\n\n def get_file_properties(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n lease_id = None if self._lease is None else self._lease.id\n return {"lease": lease_id}\n\n def upload_data(self, contents, overwrite=False, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n if self.contents is not None or overwrite is True:\n if isinstance(contents, str):\n self.contents = contents.encode("utf8")\n elif isinstance(contents, io.BytesIO):\n self.contents = contents.read()\n elif isinstance(contents, io.StringIO):\n self.contents = contents.read().encode("utf8")\n elif isinstance(contents, bytes):\n self.contents = contents\n else:\n self.contents = contents\n\n def download_file(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n return 
FakeADLS2FileDownloader(contents=self.contents)\n\n def delete_file(self, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n self.fs_client.delete_file(self.name)\n\n\nclass FakeADLS2FileDownloader:\n """Mock of an ADLS2 file downloader for testing."""\n\n def __init__(self, contents):\n self.contents = contents\n\n def readall(self):\n return self.contents\n\n def readinto(self, fileobj):\n fileobj.write(self.contents)\n
", "current_page_name": "_modules/dagster_azure/adls2/fake_adls2_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.fake_adls2_resource"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class ADLS2FileHandle(FileHandle):\n """A reference to a file on ADLS2."""\n\n def __init__(self, account: str, file_system: str, key: str):\n self._account = check.str_param(account, "account")\n self._file_system = check.str_param(file_system, "file_system")\n self._key = check.str_param(key, "key")\n\n @property\n def account(self):\n """str: The name of the ADLS2 account."""\n return self._account\n\n @property\n def file_system(self):\n """str: The name of the ADLS2 file system."""\n return self._file_system\n\n @property\n def key(self):\n """str: The ADLS2 key."""\n return self._key\n\n @property\n def path_desc(self):\n """str: The file's ADLS2 URL."""\n return self.adls2_path\n\n @property\n def adls2_path(self):\n """str: The file's ADLS2 URL."""\n return f"adfss://{self.file_system}@{self.account}.dfs.core.windows.net/{self.key}"
\n\n\nclass ADLS2FileManager(FileManager):\n def __init__(self, adls2_client, file_system, prefix):\n self._client = adls2_client\n self._file_system = check.str_param(file_system, "file_system")\n self._prefix = check.str_param(prefix, "prefix")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n file = self._client.get_file_client(\n file_system=file_handle.file_system,\n file_path=file_handle.key,\n )\n download = file.download_file()\n with open(temp_name, "wb") as file_obj:\n download.readinto(file_obj)\n self._local_handle_cache[file_handle.adls2_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", ADLS2FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if "b" in mode else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.adls2_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.adls2_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n adls2_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n adls2_file = self._client.get_file_client(\n file_system=self._file_system, file_path=adls2_key\n )\n adls2_file.upload_data(file_obj, overwrite=True)\n return ADLS2FileHandle(self._client.account_name, self._file_system, adls2_key)\n\n def get_full_key(self, file_key):\n return f"{self._prefix}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_azure/adls2/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.io_manager

\nimport pickle\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Union\n\nfrom dagster import (\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.pythonic_config import ConfigurableIOManager\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom dagster_azure.adls2.resources import ADLS2Resource\nfrom dagster_azure.adls2.utils import ResourceNotFoundError\n\n_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectADLS2IOManager(UPathIOManager):\n    def __init__(\n        self,\n        file_system: Any,\n        adls2_client: Any,\n        blob_client: Any,\n        lease_client_constructor: Any,\n        prefix: str = "dagster",\n    ):\n        self.adls2_client = adls2_client\n        self.file_system_client = self.adls2_client.get_file_system_client(file_system)\n        # We also need a blob client to handle copying as ADLS doesn't have a copy API yet\n        self.blob_client = blob_client\n        self.blob_container_client = self.blob_client.get_container_client(file_system)\n        self.prefix = check.str_param(prefix, "prefix")\n\n        self.lease_client_constructor = lease_client_constructor\n        self.lease_duration = _LEASE_DURATION\n        self.file_system_client.get_file_system_properties()\n        super().__init__(base_path=UPath(self.prefix))\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def 
get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading ADLS2 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing ADLS2 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        file_client = self.file_system_client.get_file_client(str(path))\n        with self._acquire_lease(file_client, is_rm=True) as lease:\n            file_client.delete_file(lease=lease, recursive=True)\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n            self.file_system_client.get_file_client(str(path)).get_file_properties()\n        except ResourceNotFoundError:\n            return False\n        return True\n\n    def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str:\n        return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format(\n            protocol=protocol,\n            filesystem=self.file_system_client.file_system_name,\n            account=self.file_system_client.account_name,\n            key=path,\n        )\n\n    @contextmanager\n    def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]:\n        lease_client = self.lease_client_constructor(client=client)\n        try:\n            lease_client.acquire(lease_duration=self.lease_duration)\n            yield lease_client.id\n        finally:\n            # cannot release a lease on a file that no longer exists, so need to check\n            if not is_rm:\n                lease_client.release()\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        if context.dagster_type.typing_type == type(None):\n            return None\n        file = self.file_system_client.get_file_client(str(path))\n        stream = file.download_file()\n        return pickle.loads(stream.readall())\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if 
self.path_exists(path):\n            context.log.warning(f"Removing existing ADLS2 key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        file = self.file_system_client.create_file(str(path))\n        with self._acquire_lease(file) as lease:\n            file.upload_data(pickled_obj, lease=lease, overwrite=True)\n\n\n
[docs]class ADLS2PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n\n adls2: ResourceDependency[ADLS2Resource]\n adls2_file_system: str = Field(description="ADLS Gen2 file system name.")\n adls2_prefix: str = Field(\n default="dagster", description="ADLS Gen2 file system prefix to write to."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectADLS2IOManager:\n return PickledObjectADLS2IOManager(\n self.adls2_file_system,\n self.adls2.adls2_client,\n self.adls2.blob_client,\n self.adls2.lease_client_constructor,\n self.adls2_prefix,\n )\n\n def load_input(self, context: "InputContext") -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use ADLS2PickleIOManager instead.",\n)\nclass ConfigurablePickledObjectADLS2IOManager(ADLS2PickleIOManager):\n """Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=ADLS2PickleIOManager.to_config_schema(),\n required_resource_keys={"adls2"},\n)\ndef adls2_pickle_io_manager(init_context):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n adls_resource = init_context.resources.adls2\n adls2_client = adls_resource.adls2_client\n blob_client = adls_resource.blob_client\n lease_client = adls_resource.lease_client_constructor\n pickled_io_manager = PickledObjectADLS2IOManager(\n init_context.resource_config["adls2_file_system"],\n adls2_client,\n blob_client,\n lease_client,\n init_context.resource_config.get("adls2_prefix"),\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_azure/adls2/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.resources

\nfrom typing import Any, Dict, Union\n\nfrom azure.identity import DefaultAzureCredential\nfrom azure.storage.filedatalake import DataLakeLeaseClient\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    Field as DagsterField,\n    Permissive,\n    Selector,\n    StringSource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\nfrom typing_extensions import Literal\n\nfrom dagster_azure.blob.utils import BlobServiceClient, create_blob_client\n\nfrom .file_manager import ADLS2FileManager\nfrom .utils import DataLakeServiceClient, create_adls2_client\n\n\nclass ADLS2SASToken(Config):\n    credential_type: Literal["sas"] = "sas"\n    token: str\n\n\nclass ADLS2Key(Config):\n    credential_type: Literal["key"] = "key"\n    key: str\n\n\nclass ADLS2DefaultAzureCredential(Config):\n    credential_type: Literal["default_azure_credential"] = "default_azure_credential"\n    kwargs: Dict[str, Any]\n\n\nclass ADLS2BaseResource(ConfigurableResource):\n    storage_account: str = Field(description="The storage account name.")\n    credential: Union[ADLS2SASToken, ADLS2Key, ADLS2DefaultAzureCredential] = Field(\n        discriminator="credential_type", description="The credentials with which to authenticate."\n    )\n\n\nDEFAULT_AZURE_CREDENTIAL_CONFIG = DagsterField(\n    Permissive(\n        description="Uses DefaultAzureCredential to authenticate and passed as keyword arguments",\n    )\n)\n\nADLS2_CLIENT_CONFIG = {\n    "storage_account": DagsterField(StringSource, description="The storage account name."),\n    "credential": DagsterField(\n        Selector(\n            {\n                "sas": DagsterField(StringSource, description="SAS token for the account."),\n                "key": DagsterField(StringSource, description="Shared Access Key for the account."),\n               
 "DefaultAzureCredential": DEFAULT_AZURE_CREDENTIAL_CONFIG,\n            }\n        ),\n        description="The credentials with which to authenticate.",\n    ),\n}\n\n\n
[docs]class ADLS2Resource(ADLS2BaseResource):\n """Resource containing clients to access Azure Data Lake Storage Gen2.\n\n Contains a client for both the Data Lake and Blob APIs, to work around the limitations\n of each.\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _raw_credential(self) -> Any:\n if isinstance(self.credential, ADLS2Key):\n return self.credential.key\n elif isinstance(self.credential, ADLS2SASToken):\n return self.credential.token\n else:\n return DefaultAzureCredential(**self.credential.kwargs)\n\n @property\n @cached_method\n def adls2_client(self) -> DataLakeServiceClient:\n return create_adls2_client(self.storage_account, self._raw_credential)\n\n @property\n @cached_method\n def blob_client(self) -> BlobServiceClient:\n return create_blob_client(self.storage_account, self._raw_credential)\n\n @property\n def lease_client_constructor(self) -> Any:\n return DataLakeLeaseClient
\n\n\n# Due to a limitation of the discriminated union type, we can't directly mirror these old\n# config fields in the new resource config. Instead, we'll just use the old config fields\n# to construct the new config and then use that to construct the resource.\n
[docs]@dagster_maintained_resource\n@resource(ADLS2_CLIENT_CONFIG)\ndef adls2_resource(context):\n """Resource that gives ops access to Azure Data Lake Storage Gen2.\n\n The underlying client is a :py:class:`~azure.storage.filedatalake.DataLakeServiceClient`.\n\n Attach this resource definition to a :py:class:`~dagster.JobDefinition` in order to make it\n available to your ops.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_azure.adls2 import adls2_resource\n\n @op(required_resource_keys={'adls2'})\n def example_adls2_op(context):\n return list(context.resources.adls2.adls2_client.list_file_systems())\n\n @job(resource_defs={"adls2": adls2_resource})\n def my_job():\n example_adls2_op()\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may pass credentials to this resource using either a SAS token, a key or by passing the\n `DefaultAzureCredential` object.\n\n .. code-block:: YAML\n\n resources:\n adls2:\n config:\n storage_account: my_storage_account\n # str: The storage account name.\n credential:\n sas: my_sas_token\n # str: the SAS token for the account.\n key:\n env: AZURE_DATA_LAKE_STORAGE_KEY\n # str: The shared access key for the account.\n DefaultAzureCredential: {}\n # dict: The keyword arguments used for DefaultAzureCredential\n # or leave the object empty for no arguments\n DefaultAzureCredential:\n exclude_environment_credential: true\n\n """\n return _adls2_resource_from_config(context.resource_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n merge_dicts(\n ADLS2_CLIENT_CONFIG,\n {\n "adls2_file_system": DagsterField(\n StringSource, description="ADLS Gen2 file system name"\n ),\n "adls2_prefix": DagsterField(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef adls2_file_manager(context):\n """FileManager that provides abstract access to ADLS2.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n adls2_client = _adls2_resource_from_config(context.resource_config).adls2_client\n\n return ADLS2FileManager(\n adls2_client=adls2_client,\n file_system=context.resource_config["adls2_file_system"],\n prefix=context.resource_config["adls2_prefix"],\n )
\n\n\ndef _adls2_resource_from_config(config) -> ADLS2Resource:\n """Args:\n config: A configuration containing the fields in ADLS2_CLIENT_CONFIG.\n\n Returns: An ADLS2Resource built from the configured storage account and credential.\n """\n storage_account = config["storage_account"]\n if "DefaultAzureCredential" in config["credential"]:\n credential = ADLS2DefaultAzureCredential(\n kwargs=config["credential"]["DefaultAzureCredential"]\n )\n elif "sas" in config["credential"]:\n credential = ADLS2SASToken(token=config["credential"]["sas"])\n else:\n credential = ADLS2Key(key=config["credential"]["key"])\n\n return ADLS2Resource(storage_account=storage_account, credential=credential)\n
", "current_page_name": "_modules/dagster_azure/adls2/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.resources"}}, "blob": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.blob.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom azure.identity import DefaultAzureCredential\nfrom dagster import (\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nfrom .utils import create_blob_client, generate_blob_sas\n\n\n
[docs]class AzureBlobComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to Azure Blob Storage.\n\n This is also compatible with Azure Data Lake Storage.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_azure.blob.compute_log_manager\n class: AzureBlobComputeLogManager\n config:\n storage_account: my-storage-account\n container: my-container\n secret_key: my-secret-key\n default_azure_credential:\n exclude_environment_credential: true\n prefix: "dagster-test-"\n local_dir: "/tmp/cool"\n upload_interval: 30\n\n Args:\n storage_account (str): The storage account name to which to log.\n container (str): The container (or ADLS2 filesystem) to which to log.\n secret_key (Optional[str]): Secret key for the storage account. SAS tokens are not\n supported because we need a secret key to generate a SAS token for a download URL.\n default_azure_credential (Optional[dict]): Use and configure DefaultAzureCredential.\n Cannot be used with sas token or secret key config.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n upload_interval (Optional[int]): Interval in seconds to upload partial log files to blob storage. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n storage_account,\n container,\n secret_key=None,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n upload_interval=None,\n default_azure_credential=None,\n ):\n self._storage_account = check.str_param(storage_account, "storage_account")\n self._container = check.str_param(container, "container")\n self._blob_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n self._default_azure_credential = check.opt_dict_param(\n default_azure_credential, "default_azure_credential"\n )\n check.opt_str_param(secret_key, "secret_key")\n check.invariant(\n secret_key is not None or default_azure_credential is not None,\n "Missing config: need to provide one of secret_key or default_azure_credential",\n )\n\n if default_azure_credential is None:\n self._blob_client = create_blob_client(storage_account, secret_key)\n else:\n credential = DefaultAzureCredential(**self._default_azure_credential)\n self._blob_client = create_blob_client(storage_account, credential)\n\n self._container_client = self._blob_client.get_container_client(container)\n self._download_urls = {}\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, dagster_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs(dagster_run, 
step_key): # noqa: SLF001\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "storage_account": StringSource,\n "container": StringSource,\n "secret_key": Field(StringSource, is_required=False),\n "default_azure_credential": Field(\n Noneable(Permissive(description="keyword arguments for DefaultAzureCredential")),\n is_required=False,\n default_value=None,\n ),\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return AzureBlobComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _blob_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._blob_prefix, "storage", *namespace, filename]\n return "/".join(paths) # blob path delimiter\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n if log_key:\n prefix_path = "/".join([self._blob_prefix, "storage", *log_key])\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n prefix_path = "/".join([self._blob_prefix, "storage", *prefix, 
""])\n else:\n prefix_path = None\n\n blob_list = {\n b.name for b in list(self._container_client.list_blobs(name_starts_with=prefix_path))\n }\n\n to_remove = None\n if log_key:\n # filter to the known set of keys\n known_keys = [\n self._blob_key(log_key, ComputeIOType.STDOUT),\n self._blob_key(log_key, ComputeIOType.STDERR),\n self._blob_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._blob_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n to_remove = [key for key in known_keys if key in blob_list]\n elif prefix:\n to_remove = list(blob_list)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if to_remove:\n self._container_client.delete_blobs(*to_remove)\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n blob_key = self._blob_key(log_key, io_type)\n if blob_key in self._download_urls:\n return self._download_urls[blob_key]\n blob = self._container_client.get_blob_client(blob_key)\n sas = generate_blob_sas(\n self._storage_account,\n self._container,\n blob_key,\n account_key=self._blob_client.credential.account_key,\n )\n url = blob.url + sas\n self._download_urls[blob_key] = url\n return url\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n blob_key = self._blob_key(log_key, io_type)\n return f"https://{self._storage_account}.blob.core.windows.net/{self._container}/{blob_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n blob_objects = self._container_client.list_blobs(blob_key)\n exact_matches = [blob for blob in blob_objects if blob.name == blob_key]\n return len(exact_matches) > 0\n\n def 
upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n blob = self._container_client.get_blob_client(blob_key)\n blob.upload_blob(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n blob = self._container_client.get_blob_client(blob_key)\n blob.download_blob().readinto(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)
\n
", "current_page_name": "_modules/dagster_azure/blob/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.blob.compute_log_manager"}}}, "dagster_celery": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery.executor

\nfrom dagster import (\n    Executor,\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes import pack_value\n\nfrom .config import DEFAULT_CONFIG, dict_wrapper\nfrom .defaults import broker_url, result_backend\n\nCELERY_CONFIG = {\n    "broker": Field(\n        Noneable(StringSource),\n        is_required=False,\n        description=(\n            "The URL of the Celery broker. Default: "\n            "'pyamqp://guest@{os.getenv('DAGSTER_CELERY_BROKER_HOST',"\n            "'localhost')}//'."\n        ),\n    ),\n    "backend": Field(\n        Noneable(StringSource),\n        is_required=False,\n        default_value="rpc://",\n        description="The URL of the Celery results backend. Default: 'rpc://'.",\n    ),\n    "include": Field(\n        [str], is_required=False, description="List of modules every worker should import"\n    ),\n    "config_source": Field(\n        Noneable(Permissive()),\n        is_required=False,\n        description="Additional settings for the Celery app.",\n    ),\n    "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="celery",\n config_schema=CELERY_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_executor(init_context):\n """Celery-based executor.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery import celery_executor\n\n @job(executor_def=celery_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... 
# argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n """\n return CeleryExecutor(\n broker=init_context.executor_config.get("broker"),\n backend=init_context.executor_config.get("backend"),\n config_source=init_context.executor_config.get("config_source"),\n include=init_context.executor_config.get("include"),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n )
\n\n\ndef _submit_task(app, plan_context, step, queue, priority, known_state):\n from .tasks import create_task\n\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True, # Not actually checked by the celery task\n )\n\n task = create_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n executable_dict=plan_context.reconstructable_job.to_dict(),\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_plan",\n )\n\n\nclass CeleryExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self._retries = check.inst_param(retries, "retries", RetryMode)\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from .core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task\n )\n\n @staticmethod\n def for_cli(broker=None, backend=None, include=None, config_source=None):\n return CeleryExecutor(\n retries=RetryMode(RetryMode.DISABLED),\n broker=broker,\n backend=backend,\n include=include,\n config_source=config_source,\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": 
self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n
", "current_page_name": "_modules/dagster_celery/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery.executor"}}, "dagster_celery_docker": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_docker.executor

\nimport os\n\nimport docker.client\nfrom dagster import (\n    DagsterInstance,\n    Executor,\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER, core_celery_execution_loop\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_celery.executor import CELERY_CONFIG\n\nCELERY_DOCKER_CONFIG_KEY = "celery-docker"\n\n\ndef celery_docker_config():\n    additional_config = {\n        "docker": Field(\n            {\n                "image": Field(\n                    StringSource,\n                    is_required=False,\n                    description="The docker image to be used for step execution.",\n                ),\n                "registry": Field(\n                    {\n                        "url": Field(StringSource),\n                        "username": Field(StringSource),\n                        "password": Field(StringSource),\n                    },\n                    is_required=False,\n                    description="Information for using a non local/public docker registry",\n                ),\n                "env_vars": Field(\n                    [str],\n                    is_required=False,\n                    description=(\n                        "The list of environment variables names to forward from the celery worker"\n                        " in to the docker container"\n                    ),\n             
   ),\n                "network": Field(\n                    str,\n                    is_required=False,\n                    description=(\n                        "Name of the network this container will be connected to at creation time"\n                    ),\n                ),\n                "container_kwargs": Field(\n                    Permissive(),\n                    is_required=False,\n                    description="Additional keyword args for the docker container",\n                ),\n            },\n            is_required=True,\n            description="The configuration for interacting with docker in the celery worker.",\n        ),\n    }\n\n    cfg = merge_dicts(CELERY_CONFIG, additional_config)\n    return cfg\n\n\n
[docs]@executor(\n name=CELERY_DOCKER_CONFIG_KEY,\n config_schema=celery_docker_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_docker_executor(init_context):\n """Celery-based executor which launches tasks in docker containers.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_docker_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery_docker.executor import celery_docker_executor\n\n @job(executor_def=celery_docker_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n container_kwargs: # keyword args to be passed to the container. 
example:\n volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_docker_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_docker.app` argument.\n """\n exc_cfg = init_context.executor_config\n\n return CeleryDockerExecutor(\n broker=exc_cfg.get("broker"),\n backend=exc_cfg.get("backend"),\n config_source=exc_cfg.get("config_source"),\n include=exc_cfg.get("include"),\n retries=RetryMode.from_config(exc_cfg.get("retries")),\n docker_config=exc_cfg.get("docker"),\n )
\n\n\nclass CeleryDockerExecutor(Executor):\n def __init__(\n self,\n retries,\n docker_config,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.docker_config = check.dict_param(docker_config, "docker_config")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_docker\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_docker(app, plan_context, step, queue, priority, known_state):\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True,\n )\n\n task = create_docker_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n docker_config=plan_context.executor.docker_config,\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_docker",\n )\n\n\ndef create_docker_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_docker", **task_kwargs)\n def _execute_step_docker(\n self,\n execute_step_args_packed,\n 
docker_config,\n ):\n """Run step execution in a Docker container."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n ),\n as_type=ExecuteStepArgs,\n )\n\n check.dict_param(docker_config, "docker_config")\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)\n\n docker_image = (\n docker_config["image"]\n if docker_config.get("image")\n else dagster_run.job_code_origin.repository_origin.container_image\n )\n\n if not docker_image:\n raise Exception("No docker image specified by either the job or the repository")\n\n client = docker.client.from_env()\n\n if docker_config.get("registry"):\n client.login(\n registry=docker_config["registry"]["url"],\n username=docker_config["registry"]["username"],\n password=docker_config["registry"]["password"],\n )\n\n # Post event for starting execution\n engine_event = instance.report_engine_event(\n f"Executing steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(\n {\n "Step keys": step_keys_str,\n "Image": docker_image,\n "Celery worker": self.request.hostname,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n\n serialized_events = [serialize_value(engine_event)]\n\n docker_env = {}\n if docker_config.get("env_vars"):\n docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}\n\n container_kwargs = check.opt_dict_param(\n docker_config.get("container_kwargs"), "container_kwargs", key_type=str\n )\n\n # set defaults for detach and auto_remove\n container_kwargs["detach"] = container_kwargs.get("detach", False)\n container_kwargs["auto_remove"] = 
container_kwargs.get("auto_remove", True)\n\n # if environment variables are provided via container_kwargs, merge with env_vars\n if container_kwargs.get("environment") is not None:\n e_vars = container_kwargs.get("environment")\n if isinstance(e_vars, dict):\n docker_env.update(e_vars)\n else:\n for v in e_vars:\n key, val = v.split("=")\n docker_env[key] = val\n del container_kwargs["environment"]\n\n try:\n docker_response = client.containers.run(\n docker_image,\n command=execute_step_args.get_command_args(),\n # pass through this worker's environment for things like AWS creds etc.\n environment=docker_env,\n network=docker_config.get("network", None),\n **container_kwargs,\n )\n\n res = docker_response.decode("utf-8")\n except docker.errors.ContainerError as err:\n metadata = {"Job image": docker_image}\n if err.stderr is not None:\n metadata["Docker stderr"] = err.stderr\n\n instance.report_engine_event(\n f"Failed to run steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(metadata),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n raise\n else:\n if res is None:\n raise Exception("No response from execute_step in CeleryDockerExecutor")\n\n events = filter_dagster_events_from_cli_logs(res.split("\\n"))\n serialized_events += [serialize_value(event) for event in events]\n\n return serialized_events\n\n return _execute_step_docker\n
", "current_page_name": "_modules/dagster_celery_docker/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_docker.executor"}}, "dagster_celery_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.executor

\nimport logging\nimport os\nimport sys\nimport time\n\nimport kubernetes\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    DagsterInstance,\n    Executor,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.plan.objects import StepFailureData, UserFailureData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_k8s import DagsterK8sJobConfig, construct_dagster_k8s_job\nfrom dagster_k8s.client import (\n    DagsterK8sAPIRetryLimitExceeded,\n    DagsterK8sError,\n    DagsterK8sJobStatusException,\n    DagsterK8sTimeoutError,\n    DagsterK8sUnrecoverableAPIError,\n    DagsterKubernetesClient,\n)\nfrom dagster_k8s.job import (\n    UserDefinedDagsterK8sConfig,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\nfrom .launcher import CeleryK8sRunLauncher\n\n\n
[docs]@executor(\n name=CELERY_K8S_CONFIG_KEY,\n config_schema=celery_k8s_executor_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_k8s_job_executor(init_context):\n """Celery-based executor which launches tasks as Kubernetes Jobs.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute dagster jobs\n with variations on these settings.\n\n To use the `celery_k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py\n :language: python\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. 
This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_k8s.app` argument.\n """\n run_launcher = init_context.instance.run_launcher\n exc_cfg = init_context.executor_config\n\n if not isinstance(run_launcher, CeleryK8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a CeleryK8sRunLauncher; configure the "\n "CeleryK8sRunLauncher on your instance to use it.",\n )\n\n job_config = run_launcher.get_k8s_job_config(\n job_image=exc_cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"), exc_config=exc_cfg\n )\n\n # Set on the instance but overrideable here\n broker = run_launcher.broker or exc_cfg.get("broker")\n backend = run_launcher.backend or exc_cfg.get("backend")\n config_source = run_launcher.config_source or exc_cfg.get("config_source")\n include = run_launcher.include or exc_cfg.get("include")\n retries = run_launcher.retries or RetryMode.from_config(exc_cfg.get("retries"))\n\n return CeleryK8sJobExecutor(\n broker=broker,\n backend=backend,\n config_source=config_source,\n include=include,\n retries=retries,\n job_config=job_config,\n job_namespace=exc_cfg.get("job_namespace", run_launcher.job_namespace),\n load_incluster_config=exc_cfg.get("load_incluster_config"),\n kubeconfig_file=exc_cfg.get("kubeconfig_file"),\n repo_location_name=exc_cfg.get("repo_location_name"),\n job_wait_timeout=exc_cfg.get("job_wait_timeout"),\n )
\n\n\nclass CeleryK8sJobExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n job_config=None,\n job_namespace=None,\n load_incluster_config=False,\n kubeconfig_file=None,\n repo_location_name=None,\n job_wait_timeout=None,\n ):\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.job_config = check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = check.bool_param(\n load_incluster_config, "load_incluster_config"\n )\n\n self.kubeconfig_file = check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n self.repo_location_name = check.opt_str_param(repo_location_name, "repo_location_name")\n self.job_wait_timeout = check.float_param(job_wait_timeout, "job_wait_timeout")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from dagster_celery.core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_k8s_job\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_k8s_job(app, plan_context, step, queue, 
priority, known_state):\n user_defined_k8s_config = get_user_defined_k8s_config(step.tags)\n\n job_origin = plan_context.reconstructable_job.get_python_origin()\n\n execute_step_args = ExecuteStepArgs(\n job_origin=job_origin,\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n should_verify_step=True,\n print_serialized_events=True,\n )\n\n job_config = plan_context.executor.job_config\n if not job_config.job_image:\n job_config = job_config.with_image(job_origin.repository_origin.container_image)\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the dagster job")\n\n task = create_k8s_job_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n job_config_dict=job_config.to_dict(),\n job_namespace=plan_context.executor.job_namespace,\n user_defined_k8s_config_dict=user_defined_k8s_config.to_dict(),\n load_incluster_config=plan_context.executor.load_incluster_config,\n job_wait_timeout=plan_context.executor.job_wait_timeout,\n kubeconfig_file=plan_context.executor.kubeconfig_file,\n )\n\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_k8s_job",\n )\n\n\ndef construct_step_failure_event_and_handle(dagster_run, step_key, err, instance):\n step_failure_event = DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n job_name=dagster_run.job_name,\n step_key=step_key,\n event_specific_data=StepFailureData(\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n user_failure_data=UserFailureData(label="K8sError"),\n ),\n )\n event_record = EventLogEntry(\n user_message=str(err),\n level=logging.ERROR,\n job_name=dagster_run.job_name,\n run_id=dagster_run.run_id,\n error_info=None,\n step_key=step_key,\n timestamp=time.time(),\n 
dagster_event=step_failure_event,\n )\n instance.handle_new_event(event_record)\n return step_failure_event\n\n\ndef create_k8s_job_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_k8s_job", **task_kwargs)\n def _execute_step_k8s_job(\n self,\n execute_step_args_packed,\n job_config_dict,\n job_namespace,\n load_incluster_config,\n job_wait_timeout,\n user_defined_k8s_config_dict=None,\n kubeconfig_file=None,\n ):\n """Run step execution in a K8s job pod."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n check.invariant(\n len(execute_step_args.step_keys_to_execute) == 1,\n "Celery K8s task executor can only execute 1 step at a time",\n )\n\n # Celery will serialize this as a list\n job_config = DagsterK8sJobConfig.from_dict(job_config_dict)\n check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n check.str_param(job_namespace, "job_namespace")\n\n check.bool_param(load_incluster_config, "load_incluster_config")\n\n user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(\n user_defined_k8s_config_dict\n )\n check.opt_inst_param(\n user_defined_k8s_config,\n "user_defined_k8s_config",\n UserDefinedDagsterK8sConfig,\n )\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n # For when launched via DinD or running the cluster\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n api_client = DagsterKubernetesClient.production_client()\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_key = execute_step_args.step_keys_to_execute[0]\n\n celery_worker_name = self.request.hostname\n 
celery_pod_name = os.environ.get("HOSTNAME")\n instance.report_engine_event(\n f"Task for step {step_key} picked up by Celery",\n dagster_run,\n EngineEventData(\n {\n "Celery worker name": celery_worker_name,\n "Celery worker Kubernetes Pod name": celery_pod_name,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n if dagster_run.status != DagsterRunStatus.STARTED:\n instance.report_engine_event(\n "Not scheduling step because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Ensure we stay below k8s name length limits\n k8s_name_key = get_k8s_job_name(execute_step_args.run_id, step_key)\n\n retry_state = execute_step_args.known_state.get_retry_state()\n\n if retry_state.get_attempt_count(step_key):\n attempt_number = retry_state.get_attempt_count(step_key)\n job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n pod_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n else:\n job_name = "dagster-step-%s" % (k8s_name_key)\n pod_name = "dagster-step-%s" % (k8s_name_key)\n\n args = execute_step_args.get_command_args()\n\n labels = {\n "dagster/job": dagster_run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": execute_step_args.run_id,\n }\n if dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config,\n args,\n job_name,\n user_defined_k8s_config,\n pod_name,\n component="step_worker",\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": dagster_run.job_name,\n },\n {"name": "DAGSTER_RUN_STEP_KEY", "value": step_key},\n ],\n )\n\n # Running list of events generated from this task execution\n events = []\n\n # Post event for starting execution\n job_name = job.metadata.name\n engine_event = instance.report_engine_event(\n 
f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Job image": job_config.job_image,\n "Image pull policy": job_config.image_pull_policy,\n "Image pull secrets": str(job_config.image_pull_secrets),\n "Service account name": str(job_config.service_account_name),\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n # validated above that step_keys is length 1, and it is not possible to use ETH or\n # execution plan in this function (Celery K8s workers should not access to user code)\n step_key=step_key,\n )\n events.append(engine_event)\n try:\n api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n if e.reason == "Conflict":\n # There is an existing job with the same name so proceed and see if the existing job succeeded\n instance.report_engine_event(\n "Did not create Kubernetes job {} for step {} since job name already "\n "exists, proceeding with existing job.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n else:\n instance.report_engine_event(\n "Encountered unexpected error while creating Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n api_client.wait_for_job_success(\n job_name=job_name,\n namespace=job_namespace,\n instance=instance,\n run_id=execute_step_args.run_id,\n wait_timeout=job_wait_timeout,\n )\n except (DagsterK8sError, DagsterK8sTimeoutError) as err:\n step_failure_event = construct_step_failure_event_and_handle(\n dagster_run, step_key, err, 
instance=instance\n )\n events.append(step_failure_event)\n except DagsterK8sJobStatusException:\n instance.report_engine_event(\n "Terminating Kubernetes Job because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Kubernetes Job namespace": job_namespace,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n api_client.delete_job(job_name=job_name, namespace=job_namespace)\n return []\n except (\n DagsterK8sUnrecoverableAPIError,\n DagsterK8sAPIRetryLimitExceeded,\n # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in\n # a retry boundary. We still catch it here just in case we missed one so that we can\n # report it to the event log\n kubernetes.client.rest.ApiException,\n ):\n instance.report_engine_event(\n "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n pod_names = api_client.get_pod_names_in_job(job_name, namespace=job_namespace)\n except kubernetes.client.rest.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Post engine event for log retrieval\n engine_event = instance.report_engine_event(\n "Retrieving logs from Kubernetes Job pods",\n dagster_run,\n EngineEventData({"Pod names": "\\n".join(pod_names)}),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n events.append(engine_event)\n\n logs = []\n for pod_name in pod_names:\n try:\n 
raw_logs = api_client.retrieve_pod_logs(pod_name, namespace=job_namespace)\n logs += raw_logs.split("\\n")\n except kubernetes.client.exceptions.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "\n "Pod name {} for step {}. Will attempt to continue with other pods.".format(\n job_name, pod_name, step_key\n ),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n events += filter_dagster_events_from_cli_logs(logs)\n serialized_events = [serialize_value(event) for event in events]\n return serialized_events\n\n return _execute_step_k8s_job\n
", "current_page_name": "_modules/dagster_celery_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.launcher

\nimport sys\nfrom typing import Optional, cast\n\nimport kubernetes\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom dagster._config import process_config, resolve_to_config_type\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.launcher import LaunchRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_k8s.client import DagsterKubernetesClient\nfrom dagster_k8s.job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\n\n\n
[docs]class CeleryK8sRunLauncher(RunLauncher, ConfigurableClass):\n """In contrast to the :py:class:`K8sRunLauncher`, which launches dagster runs as single K8s\n Jobs, this run launcher is intended for use in concert with\n :py:func:`dagster_celery_k8s.celery_k8s_job_executor`.\n\n With this run launcher, execution is delegated to:\n\n 1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\n submits steps to Celery queues for execution;\n 2. The step executions which are submitted to Celery queues are picked up by Celery workers,\n and each step execution spawns a step execution Kubernetes Job. See the implementation\n defined in :py:func:`dagster_celery_k8.executor.create_k8s_job_task`.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n\n """\n\n def __init__(\n self,\n instance_config_map,\n dagster_home,\n postgres_password_secret,\n load_incluster_config=True,\n kubeconfig_file=None,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n retries=None,\n inst_data: Optional[ConfigurableClassData] = None,\n k8s_client_batch_api=None,\n env_config_maps=None,\n env_secrets=None,\n volume_mounts=None,\n volumes=None,\n service_account_name=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n labels=None,\n fail_pod_on_run_failure=None,\n job_namespace=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n 
kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self.postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self.broker = check.opt_str_param(broker, "broker")\n self.backend = check.opt_str_param(backend, "backend")\n self.include = check.opt_list_param(include, "include")\n self.config_source = check.opt_dict_param(config_source, "config_source")\n\n retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}\n self.retries = RetryMode.from_config(retries)\n\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n\n self._service_account_name = check.opt_str_param(\n service_account_name, "service_account_name"\n )\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace", default="default")\n\n super().__init__()\n\n @classmethod\n def config_type(cls):\n from dagster_celery.executor import CELERY_CONFIG\n\n return 
merge_dicts(DagsterK8sJobConfig.config_type_run_launcher(), CELERY_CONFIG)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n\n job_name = get_job_name_from_run_id(run.run_id)\n pod_name = job_name\n exc_config = _get_validated_celery_k8s_executor_config(run.run_config)\n\n job_image_from_executor_config = exc_config.get("job_image")\n\n job_origin = cast(JobPythonOrigin, context.job_code_origin)\n repository_origin = job_origin.repository_origin\n\n job_image = repository_origin.container_image\n\n if job_image:\n if job_image_from_executor_config:\n job_image = job_image_from_executor_config\n self._instance.report_engine_event(\n f"You have specified a job_image {job_image_from_executor_config} in your"\n f" executor configuration, but also {job_image} in your user-code"\n f" deployment. Using the job image {job_image_from_executor_config} from"\n " executor configuration as it takes precedence.",\n run,\n cls=self.__class__,\n )\n else:\n if not job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have not specified a job_image in your executor configuration. To resolve"\n " this error, specify the job_image configuration in the executor config"\n " section in your run config. \\nNote: You may also be seeing this error because"\n " you are using the configured API. 
Using configured with the celery-k8s"\n " executor is not supported at this time, and the job_image must be configured"\n " at the top-level executor config without using configured."\n )\n\n job_image = job_image_from_executor_config\n\n job_config = self.get_k8s_job_config(job_image, exc_config)\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n user_defined_k8s_config = get_user_defined_k8s_config(run.tags)\n\n from dagster._cli.api import ExecuteRunArgs\n\n run_args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config,\n args=run_args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[{"name": "DAGSTER_RUN_JOB_NAME", "value": job_origin.job_name}],\n )\n\n job_namespace = exc_config.get("job_namespace", self.job_namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n def get_k8s_job_config(self, job_image, exc_config):\n return DagsterK8sJobConfig(\n 
dagster_home=self.dagster_home,\n instance_config_map=self.instance_config_map,\n postgres_password_secret=self.postgres_password_secret,\n job_image=check.opt_str_param(job_image, "job_image"),\n image_pull_policy=exc_config.get("image_pull_policy", self._image_pull_policy),\n image_pull_secrets=exc_config.get("image_pull_secrets", []) + self._image_pull_secrets,\n service_account_name=exc_config.get("service_account_name", self._service_account_name),\n env_config_maps=exc_config.get("env_config_maps", []) + self._env_config_maps,\n env_secrets=exc_config.get("env_secrets", []) + self._env_secrets,\n volume_mounts=exc_config.get("volume_mounts", []) + self._volume_mounts,\n volumes=exc_config.get("volumes", []) + self._volumes,\n labels=merge_dicts(self._labels, exc_config.get("labels", {})),\n )\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n job_name = get_job_name_from_run_id(run_id)\n\n job_namespace = self.get_namespace_from_run_config(run_id)\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=job_namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Dagster Job was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; delete_job returned {}"\n .format(termination_result)\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; encountered error in delete_job"\n ),\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n def get_namespace_from_run_config(self, run_id):\n 
check.str_param(run_id, "run_id")\n\n dagster_run = self._instance.get_run_by_id(run_id)\n run_config = dagster_run.run_config\n executor_config = _get_validated_celery_k8s_executor_config(run_config)\n return executor_config.get("job_namespace", self.job_namespace)\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n job_namespace = _get_validated_celery_k8s_executor_config(run.run_config).get(\n "job_namespace", self.job_namespace\n )\n job_name = get_job_name_from_run_id(run.run_id)\n try:\n status = self._api_client.get_job_status(namespace=job_namespace, job_name=job_name)\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n\n\ndef _get_validated_celery_k8s_executor_config(run_config):\n check.dict_param(run_config, "run_config")\n\n executor_config = run_config.get("execution", {})\n execution_config_schema = resolve_to_config_type(celery_k8s_executor_config())\n\n # In run config on jobs, we don't have an executor key\n if CELERY_K8S_CONFIG_KEY not in executor_config:\n execution_run_config = executor_config.get("config", {})\n else:\n execution_run_config = (run_config["execution"][CELERY_K8S_CONFIG_KEY] or {}).get(\n "config", {}\n )\n\n res = process_config(execution_config_schema, execution_run_config)\n\n check.invariant(\n res.success,\n "Incorrect execution schema provided. Note: You may also be seeing this error "\n "because you are using the configured API. "\n "Using configured with the {config_key} executor is not supported at this time, "\n "and all executor config must be directly in the run config without using configured."\n .format(\n config_key=CELERY_K8S_CONFIG_KEY,\n ),\n )\n\n return res.value\n
", "current_page_name": "_modules/dagster_celery_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.launcher"}}, "dagster_census": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.ops

\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import CensusOutput\nfrom .utils import generate_materialization\n\n\n
[docs]@op(\n required_resource_keys={"census"},\n ins={"start_after": In(Nothing)},\n out=Out(\n CensusOutput,\n description=(\n "Parsed json dictionary representing the details of the Census sync after "\n "the sync successfully completes."\n ),\n ),\n config_schema={\n "sync_id": Field(\n int,\n is_required=True,\n description="Id of the parent sync.",\n ),\n "force_full_sync": Field(\n config=Bool,\n default_value=False,\n description=(\n "If this trigger request should be a Full Sync. "\n "Note that some sync configurations such as Append do not support full syncs."\n ),\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) to wait between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description=(\n "The maximum time to wait before this operation is timed out. By "\n "default, this will never time out."\n ),\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Census sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["census"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "census"},\n)\ndef census_trigger_sync_op(context):\n """Executes a Census sync for a given ``sync_id`` and polls until that sync completes, raising\n an error if it is unsuccessful.\n\n It outputs a :py:class:`~dagster_census.CensusOutput` which contains the details of the Census\n sync after it successfully completes.\n\n It requires the use of the :py:class:`~dagster_census.census_resource`, which allows it to\n communicate with the Census API.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_census import census_resource, census_sync_op\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n sync_foobar = census_sync_op.configured({"sync_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"census": my_census_resource})\n def my_simple_census_job():\n sync_foobar()\n\n """\n census_output = context.resources.census.trigger_sync_and_poll(\n sync_id=context.op_config["sync_id"],\n force_full_sync=context.op_config["force_full_sync"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield generate_materialization(\n census_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(census_output)
\n
", "current_page_name": "_modules/dagster_census/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional\n\nimport requests\nfrom dagster import Failure, Field, StringSource, __version__, get_dagster_logger, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom .types import CensusOutput\n\nCENSUS_API_BASE = "app.getcensus.com/api"\nCENSUS_VERSION = "v1"\n\nDEFAULT_POLL_INTERVAL = 10\n\nSYNC_RUN_STATUSES = {"completed", "failed", "queued", "skipped", "working"}\n\n\n
[docs]class CensusResource:\n """This class exposes methods on top of the Census REST API."""\n\n def __init__(\n self,\n api_key: str,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self.api_key = api_key\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def _api_key(self):\n if self.api_key.startswith("secret-token:"):\n return self.api_key\n return "secret-token:" + self.api_key\n\n @property\n def api_base_url(self) -> str:\n return f"https://{CENSUS_API_BASE}/{CENSUS_VERSION}"\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Census API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The Census API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n url = f"{self.api_base_url}/{endpoint}"\n headers = {\n "User-Agent": f"dagster-census/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=HTTPBasicAuth("bearer", self._api_key),\n data=data,\n )\n response.raise_for_status()\n return response.json()\n except RequestException as e:\n self._log.error("Request to Census API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n def get_sync(self, sync_id: int) -> Mapping[str, Any]:\n """Gets details about a given sync from the Census API.\n\n Args:\n sync_id (int): The Census 
Sync ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"syncs/{sync_id}")\n\n def get_source(self, source_id: int) -> Mapping[str, Any]:\n """Gets details about a given source from the Census API.\n\n Args:\n source_id (int): The Census Source ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sources/{source_id}")\n\n def get_destination(self, destination_id: int) -> Mapping[str, Any]:\n """Gets details about a given destination from the Census API.\n\n Args:\n destination_id (int): The Census Destination ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"destinations/{destination_id}")\n\n def get_sync_run(self, sync_run_id: int) -> Mapping[str, Any]:\n """Gets details about a specific sync run from the Census API.\n\n Args:\n sync_run_id (int): The Census Sync Run ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sync_runs/{sync_run_id}")\n\n def poll_sync_run(\n self,\n sync_run_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Census sync run, poll until the run is complete.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n log_url = f"https://app.getcensus.com/syncs_runs/{sync_run_id}"\n poll_start = datetime.datetime.now()\n\n while True:\n time.sleep(poll_interval)\n response_dict = self.get_sync_run(sync_run_id)\n if "data" not in response_dict.keys():\n raise ValueError(\n f"Getting status of sync failed, please visit Census Logs at {log_url} to see"\n " more."\n )\n\n sync_status = response_dict["data"]["status"]\n sync_id = response_dict["data"]["sync_id"]\n\n if sync_status not in SYNC_RUN_STATUSES:\n raise ValueError(\n f"Unexpected response status '{sync_status}'; "\n f"must be one of {','.join(sorted(SYNC_RUN_STATUSES))}. "\n "See Management API docs for more information: "\n "https://docs.getcensus.com/basics/developers/api/sync-runs"\n )\n\n if sync_status in {"queued", "working"}:\n self._log.debug(\n f"Sync {sync_id} still running after {datetime.datetime.now() - poll_start}."\n )\n continue\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for sync '{sync_id}' timed out after"\n f" {datetime.datetime.now() - poll_start}."\n )\n\n break\n\n self._log.debug(\n f"Sync {sync_id} has finished running after {datetime.datetime.now() - poll_start}."\n )\n self._log.info(f"View sync details here: {log_url}.")\n\n return response_dict\n\n def trigger_sync(self, sync_id: int, force_full_sync: bool = False) -> Mapping[str, Any]:\n """Trigger an asynchronous run for a specific sync.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n data = {"force_full_sync": force_full_sync}\n return self.make_request(\n method="POST", endpoint=f"syncs/{sync_id}/trigger", data=json.dumps(data)\n )\n\n def trigger_sync_and_poll(\n self,\n sync_id: 
int,\n force_full_sync: bool = False,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> CensusOutput:\n """Trigger a run for a specific sync and poll until it has completed.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~CensusOutput`:\n Object containing details about the sync run and the sync details\n """\n sync_details = self.get_sync(sync_id=sync_id)\n source_details = self.get_source(\n source_id=sync_details["data"]["source_attributes"]["connection_id"]\n )["data"]\n destination_details = self.get_destination(\n destination_id=sync_details["data"]["destination_attributes"]["connection_id"]\n )["data"]\n\n trigger_sync_resp = self.trigger_sync(sync_id=sync_id, force_full_sync=force_full_sync)\n sync_run_details = self.poll_sync_run(\n sync_run_id=trigger_sync_resp["data"]["sync_run_id"],\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )["data"]\n return CensusOutput(\n sync_run=sync_run_details,\n source=source_details,\n destination=destination_details,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n is_required=True,\n description="Census API Key.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description=(\n "The maximum number of times requests to the Census API should be retried "\n "before failing."\n ),\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Census connectors",\n)\ndef census_resource(context) -> CensusResource:\n """This resource allows users to programatically interface with the Census REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_census import census_resource\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n @job(resource_defs={"census":my_census_resource})\n def my_census_job():\n ...\n\n """\n return CensusResource(\n api_key=context.resource_config["api_key"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_census/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.types

\nfrom typing import Any, Mapping, NamedTuple\n\n\n
[docs]class CensusOutput(\n NamedTuple(\n "_CensusOutput",\n [\n ("sync_run", Mapping[str, Any]),\n ("source", Mapping[str, Any]),\n ("destination", Mapping[str, Any]),\n ],\n )\n):\n """Contains recorded information about the state of a Census sync after a sync completes.\n\n Attributes:\n sync_run (Dict[str, Any]):\n The details of the specific sync run.\n source (Dict[str, Any]):\n Information about the source for the Census sync.\n destination (Dict[str, Any]):\n Information about the destination for the Census sync.\n """
\n
", "current_page_name": "_modules/dagster_census/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.types"}}, "dagster_dask": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dask.executor

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dask\nimport dask.distributed\nfrom dagster import (\n    Executor,\n    Field,\n    Permissive,\n    Selector,\n    StringSource,\n    _check as check,\n    _seven,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.definitions.executor_definition import executor\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, execute_plan\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils import iterate_with_context\n\n# Dask resource requirements are specified under this key\nDASK_RESOURCE_REQUIREMENTS_KEY = "dagster-dask/resource_requirements"\n\n\n
[docs]@executor(\n name="dask",\n requirements=multiple_process_executor_requirements(),\n config_schema={\n "cluster": Field(\n Selector(\n {\n "existing": Field(\n {"address": StringSource},\n description="Connect to an existing scheduler.",\n ),\n "local": Field(\n Permissive(), is_required=False, description="Local cluster configuration."\n ),\n "yarn": Field(\n Permissive(), is_required=False, description="YARN cluster configuration."\n ),\n "ssh": Field(\n Permissive(), is_required=False, description="SSH cluster configuration."\n ),\n "pbs": Field(\n Permissive(), is_required=False, description="PBS cluster configuration."\n ),\n "moab": Field(\n Permissive(), is_required=False, description="Moab cluster configuration."\n ),\n "sge": Field(\n Permissive(), is_required=False, description="SGE cluster configuration."\n ),\n "lsf": Field(\n Permissive(), is_required=False, description="LSF cluster configuration."\n ),\n "slurm": Field(\n Permissive(), is_required=False, description="SLURM cluster configuration."\n ),\n "oar": Field(\n Permissive(), is_required=False, description="OAR cluster configuration."\n ),\n "kube": Field(\n Permissive(),\n is_required=False,\n description="Kubernetes cluster configuration.",\n ),\n }\n )\n )\n },\n)\ndef dask_executor(init_context):\n """Dask-based executor.\n\n The 'cluster' can be one of the following:\n ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').\n\n If the Dask executor is used without providing executor-specific config, a local Dask cluster\n will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`\n with :py:class:`dask.distributed.LocalCluster() <dask:distributed.LocalCluster>`).\n\n The Dask executor optionally takes the following config:\n\n .. 
code-block:: none\n\n cluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n\n To use the `dask_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dask import dask_executor\n\n @job(executor_def=dask_executor)\n def dask_enabled_job():\n pass\n\n """\n ((cluster_type, cluster_configuration),) = init_context.executor_config["cluster"].items()\n return DaskExecutor(cluster_type, cluster_configuration)
\n\n\ndef query_on_dask_worker(\n dependencies: Any,\n recon_job: ReconstructableJob,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]],\n step_keys: Optional[Sequence[str]],\n instance_ref: InstanceRef,\n known_state: Optional[KnownExecutionState],\n) -> Sequence[DagsterEvent]:\n """Note that we need to pass "dependencies" to ensure Dask sequences futures during task\n scheduling, even though we do not use this argument within the function.\n """\n with DagsterInstance.from_ref(instance_ref) as instance:\n subset_job = recon_job.get_subset(op_selection=dagster_run.resolved_op_selection)\n\n execution_plan = create_execution_plan(\n subset_job,\n run_config=run_config,\n step_keys_to_execute=step_keys,\n known_state=known_state,\n )\n\n return execute_plan(\n execution_plan, subset_job, instance, dagster_run, run_config=run_config\n )\n\n\ndef get_dask_resource_requirements(tags: Mapping[str, str]):\n check.mapping_param(tags, "tags", key_type=str, value_type=str)\n req_str = tags.get(DASK_RESOURCE_REQUIREMENTS_KEY)\n if req_str is not None:\n return _seven.json.loads(req_str)\n\n return {}\n\n\nclass DaskExecutor(Executor):\n def __init__(self, cluster_type, cluster_configuration):\n self.cluster_type = check.opt_str_param(cluster_type, "cluster_type", default="local")\n self.cluster_configuration = check.opt_dict_param(\n cluster_configuration, "cluster_configuration"\n )\n\n @property\n def retries(self):\n return RetryMode.DISABLED\n\n def execute(self, plan_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):\n check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.param_invariant(\n isinstance(plan_context.executor, DaskExecutor),\n "plan_context",\n f"Expected executor to be DaskExecutor got {plan_context.executor}",\n )\n\n check.invariant(\n plan_context.instance.is_persistent,\n "Dask execution requires a persistent 
DagsterInstance",\n )\n\n step_levels = execution_plan.get_steps_to_execute_by_level()\n\n job_name = plan_context.job_name\n\n instance = plan_context.instance\n\n cluster_type = self.cluster_type\n if cluster_type == "existing":\n # address passed directly to Client() below to connect to existing Scheduler\n cluster = self.cluster_configuration["address"]\n elif cluster_type == "local":\n from dask.distributed import LocalCluster\n\n cluster = LocalCluster(**self.build_dict(job_name))\n elif cluster_type == "yarn":\n from dask_yarn import YarnCluster\n\n cluster = YarnCluster(**self.build_dict(job_name))\n elif cluster_type == "ssh":\n from dask.distributed import SSHCluster\n\n cluster = SSHCluster(**self.build_dict(job_name))\n elif cluster_type == "pbs":\n from dask_jobqueue import PBSCluster\n\n cluster = PBSCluster(**self.build_dict(job_name))\n elif cluster_type == "moab":\n from dask_jobqueue import MoabCluster\n\n cluster = MoabCluster(**self.build_dict(job_name))\n elif cluster_type == "sge":\n from dask_jobqueue import SGECluster\n\n cluster = SGECluster(**self.build_dict(job_name))\n elif cluster_type == "lsf":\n from dask_jobqueue import LSFCluster\n\n cluster = LSFCluster(**self.build_dict(job_name))\n elif cluster_type == "slurm":\n from dask_jobqueue import SLURMCluster\n\n cluster = SLURMCluster(**self.build_dict(job_name))\n elif cluster_type == "oar":\n from dask_jobqueue import OARCluster\n\n cluster = OARCluster(**self.build_dict(job_name))\n elif cluster_type == "kube":\n from dask_kubernetes import KubeCluster\n\n cluster = KubeCluster(**self.build_dict(job_name))\n else:\n raise ValueError(\n "Must be providing one of the following ('existing', 'local', 'yarn', 'ssh',"\n f" 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"\n )\n\n with dask.distributed.Client(cluster) as client:\n execution_futures = []\n execution_futures_dict = {}\n\n for step_level in step_levels:\n for step in step_level:\n # We ensure correctness 
in sequencing by letting Dask schedule futures and\n # awaiting dependencies within each step.\n dependencies = []\n for step_input in step.step_inputs:\n for key in step_input.dependency_keys:\n dependencies.append(execution_futures_dict[key])\n\n run_config = plan_context.run_config\n\n dask_task_name = "%s.%s" % (job_name, step.key)\n\n recon_job = plan_context.reconstructable_job\n\n future = client.submit(\n query_on_dask_worker,\n dependencies,\n recon_job,\n plan_context.dagster_run,\n run_config,\n [step.key],\n instance.get_ref(),\n execution_plan.known_state,\n key=dask_task_name,\n resources=get_dask_resource_requirements(step.tags),\n )\n\n execution_futures.append(future)\n execution_futures_dict[step.key] = future\n\n # This tells Dask to awaits the step executions and retrieve their results to the\n # master\n futures = dask.distributed.as_completed(execution_futures, with_results=True)\n\n # Allow interrupts while waiting for the results from Dask\n for future, result in iterate_with_context(raise_execution_interrupts, futures):\n for step_event in result:\n check.inst(step_event, DagsterEvent)\n yield step_event\n\n def build_dict(self, job_name):\n """Returns a dict we can use for kwargs passed to dask client instantiation.\n\n Intended to be used like:\n\n with dask.distributed.Client(**cfg.build_dict()) as client:\n << use client here >>\n\n """\n if self.cluster_type in ["yarn", "pbs", "moab", "sge", "lsf", "slurm", "oar", "kube"]:\n dask_cfg = {"name": job_name}\n else:\n dask_cfg = {}\n\n if self.cluster_configuration:\n for k, v in self.cluster_configuration.items():\n dask_cfg[k] = v\n\n # if address is set, don't add LocalCluster args\n # context: https://github.com/dask/distributed/issues/3313\n if (self.cluster_type == "local") and ("address" not in dask_cfg):\n # We set threads_per_worker because Dagster is not thread-safe. 
Even though\n # environments=True by default, there is a clever piece of machinery\n # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution\n # multithreaded by default when the number of available cores is greater than 4.\n # See: https://github.com/dagster-io/dagster/issues/2181\n # We may want to try to figure out a way to enforce this on remote Dask clusters against\n # which users run Dagster workloads.\n dask_cfg["threads_per_worker"] = 1\n\n return dask_cfg\n
", "current_page_name": "_modules/dagster_dask/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dask.executor"}}, "dagster_databricks": {"databricks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks

\nimport base64\nimport logging\nimport time\nfrom typing import IO, Any, Mapping, Optional, Tuple, Union, cast\n\nimport dagster\nimport dagster._check as check\nimport dagster_pyspark\nimport databricks_api\nimport databricks_cli.sdk\nimport requests.exceptions\nfrom dagster._annotations import deprecated, public\nfrom databricks.sdk import WorkspaceClient\nfrom databricks.sdk.service import compute, jobs\nfrom typing_extensions import Final\n\nimport dagster_databricks\n\nfrom .types import (\n    DatabricksRunState,\n)\nfrom .version import __version__\n\n# wait at most 24 hours by default for run execution\nDEFAULT_RUN_MAX_WAIT_TIME_SEC: Final = 24 * 60 * 60\n\n\n
[docs]class DatabricksError(Exception):\n pass
\n\n\n
[docs]class DatabricksClient:\n """A thin wrapper over the Databricks REST API."""\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n workspace_id: Optional[str] = None,\n ):\n self.host = host\n self.workspace_id = workspace_id\n\n self._workspace_client = WorkspaceClient(\n host=host,\n token=token,\n client_id=oauth_client_id,\n client_secret=oauth_client_secret,\n product="dagster-databricks",\n product_version=__version__,\n )\n\n # TODO: This is the old shim client that we were previously using. Arguably this is\n # confusing for users to use since this is an unofficial wrapper around the documented\n # Databricks REST API. We should consider removing this in the next minor release.\n if token:\n self._client = databricks_api.DatabricksAPI(host=host, token=token)\n self.__setup_user_agent(self._client.client)\n # TODO: This is the old `databricks_cli` client that was previously recommended by Databricks.\n # It is no longer supported and should be removed in favour of `databricks-sdk` in the next\n # minor release.\n self._api_client = databricks_cli.sdk.ApiClient(host=host, token=token)\n self.__setup_user_agent(self._api_client)\n else:\n self._client = None\n self._api_client = None\n\n def __setup_user_agent(\n self,\n client: Union[WorkspaceClient, databricks_api.DatabricksAPI, databricks_cli.sdk.ApiClient],\n ) -> None:\n """Overrides the user agent for the Databricks API client."""\n client.default_headers["user-agent"] = f"dagster-databricks/{__version__}"\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def client(self) -> databricks_api.DatabricksAPI:\n """Retrieve the legacy Databricks API client. 
Note: accessing this property will throw an exception if oauth\n credentials are used to initialize the DatabricksClient, because oauth credentials are not supported by the\n legacy Databricks API client.\n """\n if self._client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-api` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._client\n\n @client.setter\n def client(self, value: Optional[databricks_api.DatabricksAPI]) -> None:\n self._client = value\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def api_client(self) -> databricks_cli.sdk.ApiClient:\n """Retrieve a reference to the underlying Databricks API client. For more information,\n see the `Databricks Python API <https://docs.databricks.com/dev-tools/python-api.html>`_.\n Noe: accessing this property will throw an exception if oauth credentials are used to initialize the\n DatabricksClient, because oauth credentials are not supported by the legacy Databricks API client.\n **Examples:**.\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks_cli.jobs.api import JobsApi\n from databricks_cli.runs.api import RunsApi\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n jobs_client = JobsApi(context.resources.databricks_client.api_client)\n runs_client = RunsApi(context.resources.databricks_client.api_client)\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n jobs_client.run_now(...)\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n runs_client.submit_run(...)\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n runs_client.get_run(...)\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n runs_client.cancel_run(...)\n client.jobs.cancel_run(...)\n\n Returns:\n ApiClient: The authenticated Databricks API client.\n """\n if self._api_client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-cli` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._api_client\n\n @public\n @property\n def workspace_client(self) -> WorkspaceClient:\n """Retrieve a reference to the underlying Databricks Workspace client. For more information,\n see the `Databricks SDK for Python <https://docs.databricks.com/dev-tools/sdk-python.html>`_.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n client.jobs.cancel_run(...)\n\n Returns:\n WorkspaceClient: The authenticated Databricks SDK Workspace Client.\n """\n return self._workspace_client\n\n def read_file(self, dbfs_path: str, block_size: int = 1024**2) -> bytes:\n """Read a file from DBFS to a **byte string**."""\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n data = b""\n bytes_read = 0\n dbfs_service = self.workspace_client.dbfs\n\n jdoc = dbfs_service.read(path=dbfs_path, length=block_size)\n data += base64.b64decode(jdoc.data)\n while jdoc.bytes_read == block_size:\n bytes_read += jdoc.bytes_read\n jdoc = dbfs_service.read(path=dbfs_path, offset=bytes_read, length=block_size)\n data += base64.b64decode(jdoc.data)\n\n return data\n\n def put_file(\n self, file_obj: IO, dbfs_path: str, overwrite: bool = False, block_size: int = 1024**2\n ) -> None:\n """Upload an arbitrary large file to DBFS.\n\n This doesn't use the DBFS `Put` API because that endpoint is limited to 1MB.\n """\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n dbfs_service = self.workspace_client.dbfs\n\n create_response = dbfs_service.create(path=dbfs_path, overwrite=overwrite)\n handle = create_response.handle\n\n block = file_obj.read(block_size)\n while block:\n data = base64.b64encode(block).decode("utf-8")\n dbfs_service.add_block(data=data, handle=handle)\n block = file_obj.read(block_size)\n\n dbfs_service.close(handle=handle)\n\n def get_run_state(self, 
databricks_run_id: int) -> "DatabricksRunState":\n """Get the state of a run by Databricks run ID.\n\n Return a `DatabricksRunState` object. Note that the `result_state`\n attribute may be `None` if the run hasn't yet terminated.\n """\n run = self.workspace_client.jobs.get_run(databricks_run_id)\n return DatabricksRunState.from_databricks(run.state)\n\n def poll_run_state(\n self,\n logger: logging.Logger,\n start_poll_time: float,\n databricks_run_id: int,\n max_wait_time_sec: float,\n verbose_logs: bool = True,\n ) -> bool:\n run_state = self.get_run_state(databricks_run_id)\n\n if run_state.has_terminated():\n if run_state.is_successful():\n logger.info(f"Run `{databricks_run_id}` completed successfully.")\n return True\n if run_state.is_skipped():\n logger.info(f"Run `{databricks_run_id}` was skipped.")\n return True\n else:\n error_message = (\n f"Run `{databricks_run_id}` failed with result state:"\n f" `{run_state.result_state}`. Message: {run_state.state_message}."\n )\n logger.error(error_message)\n raise DatabricksError(error_message)\n else:\n if verbose_logs:\n logger.debug(f"Run `{databricks_run_id}` in state {run_state}.")\n if time.time() - start_poll_time > max_wait_time_sec:\n raise DatabricksError(\n f"Run `{databricks_run_id}` took more than {max_wait_time_sec}s to complete."\n " Failing the run."\n )\n return False\n\n def wait_for_run_to_complete(\n self,\n logger: logging.Logger,\n databricks_run_id: int,\n poll_interval_sec: float,\n max_wait_time_sec: int,\n verbose_logs: bool = True,\n ) -> None:\n logger.info(f"Waiting for Databricks run `{databricks_run_id}` to complete...")\n\n start_poll_time = time.time()\n while True:\n if self.poll_run_state(\n logger=logger,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=max_wait_time_sec,\n verbose_logs=verbose_logs,\n ):\n return\n\n time.sleep(poll_interval_sec)
\n\n\nclass DatabricksJobRunner:\n """Submits jobs created using Dagster config to Databricks, and monitors their progress.\n\n Attributes:\n host (str): Databricks host, e.g. https://uksouth.azuredatabricks.net.\n token (str): Databricks authentication token.\n poll_interval_sec (float): How often to poll Databricks for run status.\n max_wait_time_sec (int): How long to wait for a run to complete before failing.\n """\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n poll_interval_sec: float = 5,\n max_wait_time_sec: int = DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n ):\n self.host = check.str_param(host, "host")\n check.invariant(\n token is None or (oauth_client_id is None and oauth_client_secret is None),\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.token = check.opt_str_param(token, "token")\n self.oauth_client_id = check.opt_str_param(oauth_client_id, "oauth_client_id")\n self.oauth_client_secret = check.opt_str_param(oauth_client_secret, "oauth_client_secret")\n self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")\n self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")\n\n self._client: DatabricksClient = DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=oauth_client_id,\n oauth_client_secret=oauth_client_secret,\n )\n\n @property\n def client(self) -> DatabricksClient:\n """Return the underlying `DatabricksClient` object."""\n return self._client\n\n def submit_run(self, run_config: Mapping[str, Any], task: Mapping[str, Any]) -> int:\n """Submit a new run using the 'Runs submit' API."""\n existing_cluster_id = run_config["cluster"].get("existing")\n\n new_cluster = run_config["cluster"].get("new")\n\n # The Databricks API needs different keys to be present in API calls depending\n # on new/existing cluster, so we need to process 
the new_cluster\n # config first.\n if new_cluster:\n new_cluster = new_cluster.copy()\n\n nodes = new_cluster.pop("nodes")\n if "instance_pool_id" in nodes:\n new_cluster["instance_pool_id"] = nodes["instance_pool_id"]\n else:\n node_types = nodes["node_types"]\n new_cluster["node_type_id"] = node_types["node_type_id"]\n if "driver_node_type_id" in node_types:\n new_cluster["driver_node_type_id"] = node_types["driver_node_type_id"]\n\n cluster_size = new_cluster.pop("size")\n if "num_workers" in cluster_size:\n new_cluster["num_workers"] = cluster_size["num_workers"]\n else:\n new_cluster["autoscale"] = cluster_size["autoscale"]\n\n tags = new_cluster.get("custom_tags", {})\n if isinstance(tags, list):\n tags = {x["key"]: x["value"] for x in tags}\n tags["__dagster_version"] = dagster.__version__\n new_cluster["custom_tags"] = tags\n\n check.invariant(\n existing_cluster_id is not None or new_cluster is not None,\n "Invalid value for run_config.cluster",\n )\n\n # We'll always need some libraries, namely dagster/dagster_databricks/dagster_pyspark,\n # since they're imported by our scripts.\n # Add them if they're not already added by users in config.\n libraries = list(run_config.get("libraries", []))\n install_default_libraries = run_config.get("install_default_libraries", True)\n if install_default_libraries:\n python_libraries = {\n x["pypi"]["package"].split("==")[0].replace("_", "-")\n for x in libraries\n if "pypi" in x\n }\n\n for library_name, library in [\n ("dagster", dagster),\n ("dagster-databricks", dagster_databricks),\n ("dagster-pyspark", dagster_pyspark),\n ]:\n if library_name not in python_libraries:\n libraries.append(\n {"pypi": {"package": f"{library_name}=={library.__version__}"}}\n )\n\n # Only one task should be able to be chosen really; make sure of that here.\n check.invariant(\n sum(\n task.get(key) is not None\n for key in [\n "notebook_task",\n "spark_python_task",\n "spark_jar_task",\n "spark_submit_task",\n ]\n )\n == 1,\n "Multiple 
tasks specified in Databricks run",\n )\n\n return self.client.workspace_client.jobs.submit(\n run_name=run_config.get("run_name"),\n tasks=[\n jobs.SubmitTask.from_dict(\n {\n "new_cluster": new_cluster,\n "existing_cluster_id": existing_cluster_id,\n # "libraries": [compute.Library.from_dict(lib) for lib in libraries],\n "libraries": libraries,\n **task,\n "task_key": "dagster-task",\n },\n )\n ],\n ).bind()["run_id"]\n\n def retrieve_logs_for_run_id(\n self, log: logging.Logger, databricks_run_id: int\n ) -> Optional[Tuple[Optional[str], Optional[str]]]:\n """Retrieve the stdout and stderr logs for a run."""\n run = self.client.workspace_client.jobs.get_run(databricks_run_id)\n cluster = self.client.workspace_client.clusters.get(run.cluster_instance.cluster_id)\n log_config = cluster.cluster_log_conf\n if log_config is None:\n log.warn(\n "Logs not configured for cluster {cluster} used for run {run}".format(\n cluster=cluster.cluster_id, run=databricks_run_id\n )\n )\n return None\n if cast(Optional[compute.S3StorageInfo], log_config.s3) is not None:\n logs_prefix = log_config.s3.destination\n log.warn("Retrieving S3 logs not yet implemented")\n return None\n elif cast(Optional[compute.DbfsStorageInfo], log_config.dbfs) is not None:\n logs_prefix = log_config.dbfs.destination\n stdout = self.wait_for_dbfs_logs(log, logs_prefix, cluster.cluster_id, "stdout")\n stderr = self.wait_for_dbfs_logs(log, logs_prefix, cluster.cluster_id, "stderr")\n return stdout, stderr\n\n def wait_for_dbfs_logs(\n self,\n log: logging.Logger,\n prefix: str,\n cluster_id: str,\n filename: str,\n waiter_delay: int = 10,\n waiter_max_attempts: int = 10,\n ) -> Optional[str]:\n """Attempt up to `waiter_max_attempts` attempts to get logs from DBFS."""\n path = "/".join([prefix, cluster_id, "driver", filename])\n log.info(f"Retrieving logs from {path}")\n num_attempts = 0\n while num_attempts <= waiter_max_attempts:\n try:\n logs = self.client.read_file(path)\n return logs.decode("utf-8")\n 
except requests.exceptions.HTTPError:\n num_attempts += 1\n time.sleep(waiter_delay)\n log.warn("Could not retrieve cluster logs!")\n
", "current_page_name": "_modules/dagster_databricks/databricks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks"}, "databricks_pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks_pyspark_step_launcher

\nimport gzip\nimport io\nimport os.path\nimport pickle\nimport sys\nimport tempfile\nimport time\nimport zlib\nfrom typing import Any, Dict, Iterator, Mapping, Optional, Sequence, cast\n\nfrom dagster import (\n    Bool,\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher, StepRunRef\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._serdes import deserialize_value\nfrom dagster._utils.backoff import backoff\nfrom dagster_pyspark.utils import build_pyspark_zip\nfrom databricks.sdk.core import DatabricksError\nfrom databricks.sdk.service import jobs\n\nfrom dagster_databricks import databricks_step_main\nfrom dagster_databricks.databricks import (\n    DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n    DatabricksJobRunner,\n)\n\nfrom .configs import (\n    define_databricks_env_variables,\n    define_databricks_permissions,\n    define_databricks_secrets_config,\n    define_databricks_storage_config,\n    define_databricks_submit_run_config,\n    define_oauth_credentials,\n)\n\nCODE_ZIP_NAME = "code.zip"\nPICKLED_CONFIG_FILE_NAME = "config.pkl"\nDAGSTER_SYSTEM_ENV_VARS = {\n    "DAGSTER_CLOUD_DEPLOYMENT_NAME",\n    "DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT",\n    "DAGSTER_CLOUD_GIT_SHA",\n    "DAGSTER_CLOUD_GIT_TIMESTAMP",\n    "DAGSTER_CLOUD_GIT_AUTHOR_EMAIL",\n    "DAGSTER_CLOUD_GIT_AUTHOR_NAME",\n    
"DAGSTER_CLOUD_GIT_MESSAGE",\n    "DAGSTER_CLOUD_GIT_BRANCH",\n    "DAGSTER_CLOUD_GIT_REPO",\n    "DAGSTER_CLOUD_PULL_REQUEST_ID",\n    "DAGSTER_CLOUD_PULL_REQUEST_STATUS",\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "run_config": define_databricks_submit_run_config(),\n "permissions": define_databricks_permissions(),\n "databricks_host": Field(\n StringSource,\n is_required=True,\n description="Databricks host, e.g. uksouth.azuredatabricks.com",\n ),\n "databricks_token": Field(\n Noneable(StringSource),\n default_value=None,\n description="Databricks access token",\n ),\n "oauth_credentials": define_oauth_credentials(),\n "env_variables": define_databricks_env_variables(),\n "secrets_to_env_variables": define_databricks_secrets_config(),\n "storage": define_databricks_storage_config(),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "local_dagster_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. 
This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="/dagster_staging",\n description="Directory in DBFS to use for uploaded job code. Must be absolute.",\n ),\n "wait_for_logs": Field(\n Bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, and if the specified cluster is configured to export logs, the system will"\n " wait after job completion for the logs to appear in the configured location. Note"\n " that logs are copied every 5 minutes, so enabling this will add several minutes"\n " to the job runtime. NOTE: this integration will export stdout/stderrfrom the"\n " remote Databricks process automatically, so this option is not generally"\n " necessary."\n ),\n ),\n "max_completion_wait_time_seconds": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n description=(\n "If the Databricks job run takes more than this many seconds, then "\n "consider it failed and terminate the step."\n ),\n ),\n "poll_interval_sec": Field(\n float,\n is_required=False,\n default_value=5.0,\n description=(\n "How frequently Dagster will poll Databricks to determine the state of the job."\n ),\n ),\n "verbose_logs": Field(\n bool,\n default_value=True,\n description=(\n "Determines whether to display debug logs emitted while job is being polled. It can"\n " be helpful for Dagster UI performance to set to False when running long-running"\n " or fan-out Databricks jobs, to avoid forcing the UI to fetch large amounts of"\n " debug logs."\n ),\n ),\n "add_dagster_env_variables": Field(\n bool,\n default_value=True,\n description=(\n "Automatically add Dagster system environment variables. This option is only"\n " applicable when the code being executed is deployed on Dagster Cloud. 
It will be"\n " ignored when the environment variables provided by Dagster Cloud are not present."\n ),\n ),\n }\n)\ndef databricks_pyspark_step_launcher(\n context: InitResourceContext,\n) -> "DatabricksPySparkStepLauncher":\n """Resource for running ops as a Databricks Job.\n\n When this resource is used, the op will be executed in Databricks using the 'Run Submit'\n API. Pipeline code will be zipped up and copied to a directory in DBFS along with the op's\n execution context.\n\n Use the 'run_config' configuration to specify the details of the Databricks cluster used, and\n the 'storage' key to configure persistent storage on that cluster. Storage is accessed by\n setting the credentials in the Spark context, as documented `here for S3`_ and `here for ADLS`_.\n\n .. _`here for S3`: https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context\n .. _`here for ADLS`: https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n """\n return DatabricksPySparkStepLauncher(**context.resource_config)
\n\n\nclass DatabricksPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n run_config: Mapping[str, Any],\n permissions: Mapping[str, Any],\n databricks_host: str,\n secrets_to_env_variables: Sequence[Mapping[str, Any]],\n staging_prefix: str,\n wait_for_logs: bool,\n max_completion_wait_time_seconds: int,\n databricks_token: Optional[str] = None,\n oauth_credentials: Optional[Mapping[str, str]] = None,\n env_variables: Optional[Mapping[str, str]] = None,\n storage: Optional[Mapping[str, Any]] = None,\n poll_interval_sec: int = 5,\n local_pipeline_package_path: Optional[str] = None,\n local_dagster_job_package_path: Optional[str] = None,\n verbose_logs: bool = True,\n add_dagster_env_variables: bool = True,\n ):\n self.run_config = check.mapping_param(run_config, "run_config")\n self.permissions = check.mapping_param(permissions, "permissions")\n self.databricks_host = check.str_param(databricks_host, "databricks_host")\n\n check.invariant(\n databricks_token is not None or oauth_credentials is not None,\n "Must provide either databricks_token or oauth_credentials",\n )\n check.invariant(\n databricks_token is None or oauth_credentials is None,\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.databricks_token = check.opt_str_param(databricks_token, "databricks_token")\n oauth_credentials = check.opt_mapping_param(\n oauth_credentials,\n "oauth_credentials",\n key_type=str,\n value_type=str,\n )\n\n self.secrets = check.sequence_param(\n secrets_to_env_variables, "secrets_to_env_variables", dict\n )\n self.env_variables = check.opt_mapping_param(env_variables, "env_variables")\n self.storage = check.opt_mapping_param(storage, "storage")\n check.invariant(\n local_dagster_job_package_path is not None or local_pipeline_package_path is not None,\n "Missing config: need to provide either 'local_dagster_job_package_path' or"\n " 'local_pipeline_package_path' config entry",\n )\n check.invariant(\n 
local_dagster_job_package_path is None or local_pipeline_package_path is None,\n "Error in config: Provided both 'local_dagster_job_package_path' and"\n " 'local_pipeline_package_path' entries. Need to specify one or the other.",\n )\n self.local_dagster_job_package_path = check.str_param(\n local_pipeline_package_path or local_dagster_job_package_path,\n "local_dagster_job_package_path",\n )\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n check.invariant(staging_prefix.startswith("/"), "staging_prefix must be an absolute path")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n\n self.databricks_runner = DatabricksJobRunner(\n host=databricks_host,\n token=databricks_token,\n oauth_client_id=oauth_credentials.get("client_id"),\n oauth_client_secret=oauth_credentials.get("client_secret"),\n poll_interval_sec=poll_interval_sec,\n max_wait_time_sec=max_completion_wait_time_seconds,\n )\n self.verbose_logs = check.bool_param(verbose_logs, "verbose_logs")\n self.add_dagster_env_variables = check.bool_param(\n add_dagster_env_variables, "add_dagster_env_variables"\n )\n\n def launch_step(self, step_context: StepExecutionContext) -> Iterator[DagsterEvent]:\n step_run_ref = step_context_to_step_run_ref(\n step_context, self.local_dagster_job_package_path\n )\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._upload_artifacts(log, step_run_ref, run_id, step_key)\n\n task = self._get_databricks_task(run_id, step_key)\n databricks_run_id = self.databricks_runner.submit_run(self.run_config, task)\n\n if self.permissions:\n self._grant_permissions(log, databricks_run_id)\n\n try:\n # If this is being called within a `capture_interrupts` context, allow interrupts while\n # waiting for the execution to complete, so that we can terminate slow or hanging steps\n with raise_execution_interrupts():\n yield from self.step_events_iterator(step_context, step_key, 
databricks_run_id)\n except:\n # if executon is interrupted before the step is completed, cancel the run\n self.databricks_runner.client.workspace_client.jobs.cancel_run(databricks_run_id)\n raise\n finally:\n self.log_compute_logs(log, run_id, step_key)\n # this is somewhat obsolete\n if self.wait_for_logs:\n self._log_logs_from_cluster(log, databricks_run_id)\n\n def log_compute_logs(self, log: DagsterLogManager, run_id: str, step_key: str) -> None:\n try:\n stdout = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stdout")\n ).decode()\n log.info(f"Captured stdout for step {step_key}:")\n log.info(stdout)\n sys.stdout.write(stdout)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stdout logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n try:\n stderr = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stderr")\n ).decode()\n log.info(f"Captured stderr for step {step_key}:")\n log.info(stderr)\n sys.stderr.write(stderr)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stderr logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n\n def step_events_iterator(\n self, step_context: StepExecutionContext, step_key: str, databricks_run_id: int\n ) -> Iterator[DagsterEvent]:\n """The launched Databricks job writes all event records to a specific dbfs file. This iterator\n regularly reads the contents of the file, adds any events that have not yet been seen to\n the instance, and yields any DagsterEvents.\n\n By doing this, we simulate having the remote Databricks process able to directly write to\n the local DagsterInstance. 
Importantly, this means that timestamps (and all other record\n properties) will be sourced from the Databricks process, rather than recording when this\n process happens to log them.\n """\n check.int_param(databricks_run_id, "databricks_run_id")\n processed_events = 0\n start_poll_time = time.time()\n done = False\n step_context.log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n while not done:\n with raise_execution_interrupts():\n if self.verbose_logs:\n step_context.log.debug(\n "Waiting %.1f seconds...", self.databricks_runner.poll_interval_sec\n )\n time.sleep(self.databricks_runner.poll_interval_sec)\n try:\n done = self.databricks_runner.client.poll_run_state(\n logger=step_context.log,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=self.databricks_runner.max_wait_time_sec,\n verbose_logs=self.verbose_logs,\n )\n finally:\n all_events = self.get_step_events(\n step_context.run_id, step_key, step_context.previous_attempt_count\n )\n # we get all available records on each poll, but we only want to process the\n # ones we haven't seen before\n for event in all_events[processed_events:]:\n # write each event from the DataBricks instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.get_dagster_event()\n processed_events = len(all_events)\n\n step_context.log.info(f"Databricks run {databricks_run_id} completed.")\n\n def get_step_events(\n self, run_id: str, step_key: str, retry_number: int\n ) -> Sequence[EventLogEntry]:\n path = self._dbfs_path(run_id, step_key, f"{retry_number}_{PICKLED_EVENTS_FILE_NAME}")\n\n def _get_step_records() -> Sequence[EventLogEntry]:\n serialized_records = self.databricks_runner.client.read_file(path)\n if not serialized_records:\n return []\n return cast(\n Sequence[EventLogEntry],\n deserialize_value(pickle.loads(gzip.decompress(serialized_records))),\n )\n\n try:\n # reading from dbfs 
while it writes can be flaky\n # allow for retry if we get malformed data\n return backoff(\n fn=_get_step_records,\n retry_on=(pickle.UnpicklingError, OSError, zlib.error, EOFError),\n max_retries=4,\n )\n # if you poll before the Databricks process has had a chance to create the file,\n # we expect to get this error\n except DatabricksError as e:\n if e.error_code == "RESOURCE_DOES_NOT_EXIST":\n return []\n raise\n\n def _grant_permissions(\n self, log: DagsterLogManager, databricks_run_id: int, request_retries: int = 3\n ) -> None:\n client = self.databricks_runner.client.workspace_client\n # Retrieve run info\n cluster_id = None\n for i in range(1, request_retries + 1):\n run_info = client.jobs.get_run(databricks_run_id)\n # if a new job cluster is created, the cluster_instance key may not be immediately present in the run response\n try:\n cluster_id = run_info.cluster_instance.cluster_id\n break\n except:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id}. "\n f"Retrying {i} of {request_retries} times."\n )\n time.sleep(5)\n if not cluster_id:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id} "\n f"{request_retries} times. Skipping permission updates..."\n )\n return\n\n # Update job permissions\n if "job_permissions" in self.permissions:\n job_permissions = self._format_permissions(self.permissions["job_permissions"])\n job_id = run_info.job_id # type: ignore # (??)\n log.debug(f"Updating job permissions with following json: {job_permissions}")\n client.permissions.update("jobs", job_id, access_control_list=job_permissions)\n log.info("Successfully updated cluster permissions")\n\n # Update cluster permissions\n if "cluster_permissions" in self.permissions:\n if "existing" in self.run_config["cluster"]:\n raise ValueError(\n "Attempting to update permissions of an existing cluster. 
"\n "This is dangerous and thus unsupported."\n )\n cluster_permissions = self._format_permissions(self.permissions["cluster_permissions"])\n log.debug(f"Updating cluster permissions with following json: {cluster_permissions}")\n client.permissions.update(\n "clusters", cluster_id, access_control_list=cluster_permissions\n )\n log.info("Successfully updated cluster permissions")\n\n def _format_permissions(\n self, input_permissions: Mapping[str, Sequence[Mapping[str, str]]]\n ) -> Sequence[Mapping[str, str]]:\n access_control_list = []\n for permission, accessors in input_permissions.items():\n access_control_list.extend(\n [\n jobs.JobAccessControlRequest.from_dict(\n {"permission_level": permission, **accessor}\n )\n for accessor in accessors\n ]\n )\n return access_control_list\n\n def _get_databricks_task(self, run_id: str, step_key: str) -> Mapping[str, Any]:\n """Construct the 'task' parameter to be submitted to the Databricks API.\n\n This will create a 'spark_python_task' dict where `python_file` is a path on DBFS\n pointing to the 'databricks_step_main.py' file, and `parameters` is an array with a single\n element, a path on DBFS pointing to the picked `step_run_ref` data.\n\n See https://docs.databricks.com/dev-tools/api/latest/jobs.html#jobssparkpythontask.\n """\n python_file = self._dbfs_path(run_id, step_key, self._main_file_name())\n parameters = [\n self._internal_dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, CODE_ZIP_NAME),\n ]\n return {"spark_python_task": {"python_file": python_file, "parameters": parameters}}\n\n def _upload_artifacts(\n self, log: DagsterLogManager, step_run_ref: StepRunRef, run_id: str, step_key: str\n ) -> None:\n """Upload the step run ref and pyspark code to DBFS to run as a job."""\n log.info("Uploading main file to DBFS")\n main_local_path = self._main_file_local_path()\n with 
open(main_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, self._main_file_name()), overwrite=True\n )\n\n log.info("Uploading dagster job to DBFS")\n with tempfile.TemporaryDirectory() as temp_dir:\n # Zip and upload package containing dagster job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n build_pyspark_zip(zip_local_path, self.local_dagster_job_package_path)\n with open(zip_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME), overwrite=True\n )\n\n log.info("Uploading step run ref file to DBFS")\n step_pickle_file = io.BytesIO()\n\n pickle.dump(step_run_ref, step_pickle_file)\n step_pickle_file.seek(0)\n self.databricks_runner.client.put_file(\n step_pickle_file,\n self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n overwrite=True,\n )\n\n databricks_config = self.create_remote_config()\n log.info("Uploading Databricks configuration to DBFS")\n databricks_config_file = io.BytesIO()\n pickle.dump(databricks_config, databricks_config_file)\n databricks_config_file.seek(0)\n self.databricks_runner.client.put_file(\n databricks_config_file,\n self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n overwrite=True,\n )\n\n def get_dagster_env_variables(self) -> Dict[str, str]:\n out = {}\n if self.add_dagster_env_variables:\n for var in DAGSTER_SYSTEM_ENV_VARS:\n if os.getenv(var):\n out.update({var: os.getenv(var)})\n return out\n\n def create_remote_config(self) -> "DatabricksConfig":\n env_variables = self.get_dagster_env_variables()\n env_variables.update(self.env_variables)\n databricks_config = DatabricksConfig(\n env_variables=env_variables,\n storage=self.storage,\n secrets=self.secrets,\n )\n return databricks_config\n\n def _log_logs_from_cluster(self, log: DagsterLogManager, run_id: int) -> None:\n logs = self.databricks_runner.retrieve_logs_for_run_id(log, run_id)\n if logs is 
None:\n return\n stdout, stderr = logs\n if stderr:\n log.info(stderr)\n if stdout:\n log.info(stdout)\n\n def _main_file_name(self) -> str:\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self) -> str:\n return databricks_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"dbfs://{path}"\n\n def _internal_dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n """Scripts running on Databricks should access DBFS at /dbfs/."""\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"/dbfs/{path}"\n\n\nclass DatabricksConfig:\n """Represents configuration required by Databricks to run jobs.\n\n Instances of this class will be created when a Databricks step is launched and will contain\n all configuration and secrets required to set up storage and environment variables within\n the Databricks environment. The instance will be serialized and uploaded to Databricks\n by the step launcher, then deserialized as part of the 'main' script when the job is running\n in Databricks.\n\n The `setup` method handles the actual setup prior to op execution on the Databricks side.\n\n This config is separated out from the regular Dagster run config system because the setup\n is done by the 'main' script before entering a Dagster context (i.e. 
using `run_step_from_ref`).\n We use a separate class to avoid coupling the setup to the format of the `step_run_ref` object.\n """\n\n def __init__(\n self,\n env_variables: Mapping[str, str],\n storage: Mapping[str, Any],\n secrets: Sequence[Mapping[str, Any]],\n ):\n """Create a new DatabricksConfig object.\n\n `storage` and `secrets` should be of the same shape as the `storage` and\n `secrets_to_env_variables` config passed to `databricks_pyspark_step_launcher`.\n """\n self.env_variables = env_variables\n self.storage = storage\n self.secrets = secrets\n\n def setup(self, dbutils: Any, sc: Any) -> None:\n """Set up storage and environment variables on Databricks.\n\n The `dbutils` and `sc` arguments must be passed in by the 'main' script, as they\n aren't accessible by any other modules.\n """\n self.setup_storage(dbutils, sc)\n self.setup_environment(dbutils)\n\n def setup_storage(self, dbutils: Any, sc: Any) -> None:\n """Set up storage using either S3 or ADLS2."""\n if "s3" in self.storage:\n self.setup_s3_storage(self.storage["s3"], dbutils, sc)\n elif "adls2" in self.storage:\n self.setup_adls2_storage(self.storage["adls2"], dbutils, sc)\n\n def setup_s3_storage(self, s3_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain AWS credentials from Databricks secrets and export so both Spark and boto can use them."""\n scope = s3_storage["secret_scope"]\n\n access_key = dbutils.secrets.get(scope=scope, key=s3_storage["access_key_key"])\n secret_key = dbutils.secrets.get(scope=scope, key=s3_storage["secret_key_key"])\n\n # Spark APIs will use this.\n # See https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context.\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access_key) # noqa: SLF001\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secret_key) # noqa: SLF001\n\n # Boto will use these.\n os.environ["AWS_ACCESS_KEY_ID"] = access_key\n 
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key\n\n def setup_adls2_storage(self, adls2_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain an Azure Storage Account key from Databricks secrets and export so Spark can use it."""\n storage_account_key = dbutils.secrets.get(\n scope=adls2_storage["secret_scope"], key=adls2_storage["storage_account_key_key"]\n )\n # Spark APIs will use this.\n # See https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n # sc is globally defined in the Databricks runtime and points to the Spark context\n sc._jsc.hadoopConfiguration().set( # noqa: SLF001\n "fs.azure.account.key.{}.dfs.core.windows.net".format(\n adls2_storage["storage_account_name"]\n ),\n storage_account_key,\n )\n\n def setup_environment(self, dbutils: Any) -> None:\n """Setup any environment variables required by the run.\n\n Extract any secrets in the run config and export them as environment variables.\n\n This is important for any `StringSource` config since the environment variables\n won't ordinarily be available in the Databricks execution environment.\n """\n for env_k, env_v in self.env_variables.items():\n os.environ[env_k] = env_v\n\n for secret in self.secrets:\n name = secret["name"]\n key = secret["key"]\n scope = secret["scope"]\n print(f"Exporting {name} from Databricks secret {key}, scope {scope}") # noqa: T201\n val = dbutils.secrets.get(scope=scope, key=key)\n os.environ[name] = val\n
", "current_page_name": "_modules/dagster_databricks/databricks_pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks_pyspark_step_launcher"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.ops

\nfrom typing import TYPE_CHECKING, Optional\n\nfrom dagster import (\n    In,\n    Nothing,\n    OpExecutionContext,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom databricks.sdk.service import jobs\nfrom pydantic import Field\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n# wait at most 24 hours by default for run execution\nDEFAULT_MAX_WAIT_TIME_SECONDS = 24 * 60 * 60\nfrom dagster import Config\n\nif TYPE_CHECKING:\n    from .databricks import DatabricksClient\n\n\n
[docs]def create_databricks_run_now_op(\n databricks_job_id: int,\n databricks_job_configuration: Optional[dict] = None,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that launches an existing databricks job.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/runnow. The only required field is\n ``job_id``, which is the ID of the job to be executed. Additional fields can be used to specify\n override parameters for the Databricks Job.\n\n Arguments:\n databricks_job_id (int): The ID of the Databricks Job to be executed.\n databricks_job_configuration (dict): Configuration for triggering a new job run of a\n Databricks Job. See https://docs.databricks.com/api-explorer/workspace/jobs/runnow\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_run_now_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to run the Databricks Job.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_run_now_op, DatabricksClientResource\n\n DATABRICKS_JOB_ID = 1234\n\n\n run_now_op = create_databricks_run_now_op(\n databricks_job_id=DATABRICKS_JOB_ID,\n databricks_job_configuration={\n "python_params": [\n "--input",\n "schema.db.input_table",\n "--output",\n "schema.db.output_table",\n ],\n },\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n run_now_op()\n """\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksRunNowOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_run_now_op(context: OpExecutionContext, config: DatabricksRunNowOpConfig):\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.run_now(\n job_id=databricks_job_id,\n **(databricks_job_configuration or {}),\n )\n run_id = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_run_now_op
\n\n\n
[docs]def create_databricks_submit_run_op(\n databricks_job_configuration: dict,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that submits a one-time run of a set of tasks on Databricks.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/submit.\n\n Arguments:\n databricks_job_configuration (dict): Configuration for submitting a one-time run of a set\n of tasks on Databricks. See https://docs.databricks.com/api-explorer/workspace/jobs/submit\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_submit_run_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to submit a one-time run of a set of tasks on Databricks.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_submit_run_op, DatabricksClientResource\n\n\n submit_run_op = create_databricks_submit_run_op(\n databricks_job_configuration={\n "new_cluster": {\n "spark_version": '2.1.0-db3-scala2.11',\n "num_workers": 2\n },\n "notebook_task": {\n "notebook_path": "/Users/dagster@example.com/PrepareData",\n },\n }\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n submit_run_op()\n """\n check.invariant(\n bool(databricks_job_configuration),\n "Configuration for the one-time Databricks Job is required.",\n )\n\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksSubmitRunOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_submit_run_op(\n context: OpExecutionContext, config: DatabricksSubmitRunOpConfig\n ) -> None:\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.submit(\n tasks=[jobs.SubmitTask.from_dict(databricks_job_configuration)],\n )\n run_id: int = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_submit_run_op
\n
", "current_page_name": "_modules/dagster_databricks/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.resources

\nfrom typing import Any, Optional\n\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field, root_validator\n\nfrom .databricks import DatabricksClient\n\n\nclass OauthCredentials(Config):\n    """OAuth credentials for Databricks.\n\n    See https://docs.databricks.com/dev-tools/api/latest/authentication.html#oauth-2-0.\n    """\n\n    client_id: str = Field(description="OAuth client ID")\n    client_secret: str = Field(description="OAuth client secret")\n\n\n
[docs]class DatabricksClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource which provides a Python client for interacting with Databricks within an\n op or asset.\n """\n\n host: str = Field(description="Databricks host, e.g. https://uksouth.azuredatabricks.com")\n token: Optional[str] = Field(default=None, description="Databricks access token")\n oauth_credentials: Optional[OauthCredentials] = Field(\n default=None,\n description=(\n "Databricks OAuth credentials for using a service principal. See"\n " https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0"\n ),\n )\n workspace_id: Optional[str] = Field(\n default=None,\n description=(\n "DEPRECATED: The Databricks workspace ID, as described in"\n " https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids."\n " This is no longer used and will be removed in a 0.21."\n ),\n )\n\n @root_validator()\n def has_token_or_oauth_credentials(cls, values):\n token = values.get("token")\n oauth_credentials = values.get("oauth_credentials")\n if not token and not oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials")\n if token and oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials, not both")\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatabricksClient:\n if self.oauth_credentials:\n client_id = self.oauth_credentials.client_id\n client_secret = self.oauth_credentials.client_secret\n else:\n client_id = None\n client_secret = None\n\n return DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=client_id,\n oauth_client_secret=client_secret,\n workspace_id=self.workspace_id,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatabricksClientResource.to_config_schema())\ndef databricks_client(init_context) -> DatabricksClient:\n return DatabricksClientResource.from_resource_context(init_context).get_client()
\n
", "current_page_name": "_modules/dagster_databricks/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.resources"}}, "dagster_datadog": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datadog.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datadog import DogStatsd, initialize, statsd\nfrom pydantic import Field\n\n\nclass DatadogClient:\n    # Mirroring levels from the dogstatsd library\n    OK, WARNING, CRITICAL, UNKNOWN = (\n        DogStatsd.OK,\n        DogStatsd.WARNING,\n        DogStatsd.CRITICAL,\n        DogStatsd.UNKNOWN,\n    )\n\n    def __init__(self, api_key: str, app_key: str):\n        self.api_key = api_key\n        self.app_key = app_key\n        initialize(api_key=api_key, app_key=app_key)\n\n        # Pull in methods from the dogstatsd library\n        for method in [\n            "event",\n            "gauge",\n            "increment",\n            "decrement",\n            "histogram",\n            "distribution",\n            "set",\n            "service_check",\n            "timed",\n            "timing",\n        ]:\n            setattr(self, method, getattr(statsd, method))\n\n\n
[docs]class DatadogResource(ConfigurableResource):\n """This resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op\n def datadog_op(datadog_client: ResourceParam[DatadogClient]):\n datadog_client.event('Man down!', 'This server needs assistance.')\n datadog_client.gauge('users.online', 1001, tags=["protocol:http"])\n datadog_client.increment('page.views')\n datadog_client.decrement('page.views')\n datadog_client.histogram('album.photo.count', 26, tags=["gender:female"])\n datadog_client.distribution('album.photo.count', 26, tags=["color:blue"])\n datadog_client.set('visitors.uniques', 999, tags=["browser:ie"])\n datadog_client.service_check('svc.check_name', datadog_client.WARNING)\n datadog_client.timing("query.response.time", 1234)\n\n # Use timed decorator\n @datadog_client.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job\n def job_for_datadog_op() -> None:\n datadog_op()\n\n job_for_datadog_op.execute_in_process(\n resources={"datadog_client": DatadogResource(api_key="FOO", app_key="BAR")}\n )\n\n """\n\n api_key: str = Field(\n description=(\n "Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/"\n )\n )\n app_key: str = Field(\n description=(\n "Datadog application key. See"\n " https://docs.datadoghq.com/account_management/api-app-keys/."\n )\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatadogClient:\n return DatadogClient(self.api_key, self.app_key)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DatadogResource.to_config_schema(),\n description="This resource is for publishing to DataDog",\n)\ndef datadog_resource(context) -> DatadogClient:\n """This legacy resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n Prefer using :py:class:`DatadogResource`.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op(required_resource_keys={'datadog'})\n def datadog_op(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job(resource_defs={'datadog': datadog_resource})\n def dd_job():\n datadog_op()\n\n result = dd_job.execute_in_process(\n run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n )\n\n """\n return DatadogResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_datadog/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datadog.resources"}}, "dagster_datahub": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datahub.resources

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import InitResourceContext, resource\nfrom dagster._config.pythonic_config import Config, ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datahub.emitter.kafka_emitter import (\n    DEFAULT_MCE_KAFKA_TOPIC,\n    DEFAULT_MCP_KAFKA_TOPIC,\n    MCE_KEY,\n    MCP_KEY,\n    DatahubKafkaEmitter,\n    KafkaEmitterConfig,\n)\nfrom datahub.emitter.rest_emitter import DatahubRestEmitter\nfrom pydantic import Field\n\n\n
[docs]class DatahubRESTEmitterResource(ConfigurableResource):\n connection: str = Field(description="Datahub GMS Server")\n token: Optional[str] = Field(default=None, description="Personal Access Token")\n connect_timeout_sec: Optional[float] = None\n read_timeout_sec: Optional[float] = None\n retry_status_codes: Optional[List[int]] = None\n retry_methods: Optional[List[str]] = None\n retry_max_times: Optional[int] = None\n extra_headers: Optional[Dict[str, str]] = None\n ca_certificate_path: Optional[str] = None\n server_telemetry_id: Optional[str] = None\n disable_ssl_verification: bool = False\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubRestEmitter:\n return DatahubRestEmitter(\n gms_server=self.connection,\n token=self.token,\n connect_timeout_sec=self.connect_timeout_sec,\n read_timeout_sec=self.read_timeout_sec,\n retry_status_codes=self.retry_status_codes,\n retry_methods=self.retry_methods,\n retry_max_times=self.retry_max_times,\n extra_headers=self.extra_headers,\n ca_certificate_path=self.ca_certificate_path,\n server_telemetry_id=self.server_telemetry_id,\n disable_ssl_verification=self.disable_ssl_verification,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubRESTEmitterResource.to_config_schema())\ndef datahub_rest_emitter(init_context: InitResourceContext) -> DatahubRestEmitter:\n emitter = DatahubRestEmitter(\n gms_server=init_context.resource_config.get("connection"),\n token=init_context.resource_config.get("token"),\n connect_timeout_sec=init_context.resource_config.get("connect_timeout_sec"),\n read_timeout_sec=init_context.resource_config.get("read_timeout_sec"),\n retry_status_codes=init_context.resource_config.get("retry_status_codes"),\n retry_methods=init_context.resource_config.get("retry_methods"),\n retry_max_times=init_context.resource_config.get("retry_max_times"),\n extra_headers=init_context.resource_config.get("extra_headers"),\n ca_certificate_path=init_context.resource_config.get("ca_certificate_path"),\n server_telemetry_id=init_context.resource_config.get("server_telemetry_id"),\n disable_ssl_verification=init_context.resource_config.get("disable_ssl_verification"),\n )\n # Attempt to hit the server to ensure the resource is properly configured\n emitter.test_connection()\n return emitter
\n\n\nclass DatahubConnection(Config):\n bootstrap: str = Field(description="Kafka Boostrap Servers. Comma delimited")\n schema_registry_url: str = Field(description="Schema Registry Location.")\n schema_registry_config: Dict[str, Any] = Field(\n default={}, description="Extra Schema Registry Config."\n )\n\n\n
[docs]class DatahubKafkaEmitterResource(ConfigurableResource):\n connection: DatahubConnection\n topic: Optional[str] = None\n topic_routes: Dict[str, str] = Field(\n default={\n MCE_KEY: DEFAULT_MCE_KAFKA_TOPIC,\n MCP_KEY: DEFAULT_MCP_KAFKA_TOPIC,\n }\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(\n KafkaEmitterConfig.parse_obj(self._convert_to_config_dictionary())\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubKafkaEmitterResource.to_config_schema())\ndef datahub_kafka_emitter(init_context: InitResourceContext) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(KafkaEmitterConfig.parse_obj(init_context.resource_config))
\n
", "current_page_name": "_modules/dagster_datahub/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datahub.resources"}}, "dagster_dbt": {"asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_decorator

\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    Nothing,\n    PartitionsDefinition,\n    multi_asset,\n)\n\nfrom .asset_utils import (\n    DAGSTER_DBT_TRANSLATOR_METADATA_KEY,\n    MANIFEST_METADATA_KEY,\n    default_asset_check_fn,\n    default_code_version_fn,\n    get_deps,\n)\nfrom .dagster_dbt_translator import DagsterDbtTranslator, DbtManifestWrapper\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    output_name_fn,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]def dbt_assets(\n *,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),\n) -> Callable[..., AssetsDefinition]:\n """Create a definition for how to compute a set of dbt resources, described by a manifest.json.\n When invoking dbt commands using :py:class:`~dagster_dbt.DbtCliResource`'s\n :py:meth:`~dagster_dbt.DbtCliResource.cli` method, Dagster events are emitted by calling\n ``yield from`` on the event stream returned by :py:meth:`~dagster_dbt.DbtCliInvocation.stream`.\n\n Args:\n manifest (Union[Mapping[str, Any], str, Path]): The contents of a manifest.json file\n or the path to a manifest.json file. A manifest.json contains a representation of a\n dbt project (models, tests, macros, etc). We use this representation to create\n corresponding Dagster assets.\n select (str): A dbt selection string for the models in a project that you want\n to include. Defaults to ``fqn:*``.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. to asset keys and asset metadata.\n\n Examples:\n Running ``dbt build`` for a dbt project:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Running dbt commands with flags:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build", "--full-refresh"], context=context).stream()\n\n Running dbt commands with ``--vars``:\n\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_vars = {"key": "value"}\n\n yield from dbt.cli(["build", "--vars", json.dumps(dbt_vars)], context=context).stream()\n\n Retrieving dbt artifacts after running a dbt command:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_build_invocation = dbt.cli(["build"], context=context)\n\n yield from dbt_build_invocation.stream()\n\n run_results_json = dbt_build_invocation.get_artifact("run_results.json")\n\n Running multiple dbt commands for a dbt project:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n yield from dbt.cli(["test"], context=context).stream()\n\n Customizing the Dagster asset metadata inferred from a dbt project using :py:class:`~dagster_dbt.DagsterDbtTranslator`:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n ...\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n dagster_dbt_translator=CustomDagsterDbtTranslator(),\n )\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Invoking another Dagster :py:class:`~dagster.ResourceDefinition` alongside dbt:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n from dagster_slack import SlackResource\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, slack: SlackResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n slack_client = slack.get_client()\n slack_client.chat_postMessage(channel="#my-channel", text="dbt build succeeded!")\n\n Defining and accessing Dagster :py:class:`~dagster.Config` alongside dbt:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext, Config\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class MyDbtConfig(Config):\n full_refresh: bool\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, config: MyDbtConfig):\n dbt_build_args = ["build"]\n if config.full_refresh:\n dbt_build_args += ["--full-refresh"]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n Defining Dagster :py:class:`~dagster.PartitionDefinition` alongside dbt:\n\n\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext, DailyPartitionDefinition\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n partitions_def=DailyPartitionsDefinition(start_date="2023-01-01")\n )\n def partitionshop_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n time_window = context.asset_partitions_time_window_for_output(\n list(context.selected_output_names)[0]\n )\n\n dbt_vars = {\n "min_date": time_window.start.isoformat(),\n "max_date": time_window.end.isoformat()\n }\n dbt_build_args = ["build", "--vars", json.dumps(dbt_vars)]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n """\n check.inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n additional_message=(\n "Ensure that the argument is an instantiated class that subclasses"\n " DagsterDbtTranslator."\n ),\n )\n manifest = validate_manifest(manifest)\n\n unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude or "", manifest_json=manifest\n )\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n deps = get_deps(\n dbt_nodes=node_info_by_dbt_unique_id,\n selected_unique_ids=unique_ids,\n 
asset_resource_types=ASSET_RESOURCE_TYPES,\n )\n (\n non_argument_deps,\n outs,\n internal_asset_deps,\n check_specs,\n ) = get_dbt_multi_asset_args(\n dbt_nodes=node_info_by_dbt_unique_id,\n deps=deps,\n io_manager_key=io_manager_key,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n )\n\n def inner(fn) -> AssetsDefinition:\n asset_definition = multi_asset(\n outs=outs,\n internal_asset_deps=internal_asset_deps,\n deps=non_argument_deps,\n compute_kind="dbt",\n partitions_def=partitions_def,\n can_subset=True,\n op_tags={\n **({"dagster-dbt/select": select} if select else {}),\n **({"dagster-dbt/exclude": exclude} if exclude else {}),\n },\n check_specs=check_specs,\n )(fn)\n\n return asset_definition\n\n return inner
\n\n\ndef get_dbt_multi_asset_args(\n dbt_nodes: Mapping[str, Any],\n deps: Mapping[str, FrozenSet[str]],\n io_manager_key: Optional[str],\n manifest: Mapping[str, Any],\n dagster_dbt_translator: DagsterDbtTranslator,\n) -> Tuple[\n Sequence[AssetKey],\n Dict[str, AssetOut],\n Dict[str, Set[AssetKey]],\n Sequence[AssetCheckSpec],\n]:\n non_argument_deps: Set[AssetKey] = set()\n outs: Dict[str, AssetOut] = {}\n internal_asset_deps: Dict[str, Set[AssetKey]] = {}\n check_specs: Sequence[AssetCheckSpec] = []\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n outs[output_name] = AssetOut(\n key=asset_key,\n dagster_type=Nothing,\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n is_required=False,\n metadata={ # type: ignore\n **dagster_dbt_translator.get_metadata(dbt_resource_props),\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest),\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n group_name=dagster_dbt_translator.get_group_name(dbt_resource_props),\n code_version=default_code_version_fn(dbt_resource_props),\n freshness_policy=dagster_dbt_translator.get_freshness_policy(dbt_resource_props),\n auto_materialize_policy=dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n ),\n )\n\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(asset_key, unique_id, test_resource_props)\n\n if check_spec:\n check_specs.append(check_spec)\n\n # Translate parent unique ids to internal asset deps and non argument dep\n output_internal_deps = 
internal_asset_deps.setdefault(output_name, set())\n for parent_unique_id in parent_unique_ids:\n parent_resource_props = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_resource_props)\n\n # Add this parent as an internal dependency\n output_internal_deps.add(parent_asset_key)\n\n # Mark this parent as an input if it has no dependencies\n if parent_unique_id not in deps:\n non_argument_deps.add(parent_asset_key)\n\n return list(non_argument_deps), outs, internal_asset_deps, check_specs\n
", "current_page_name": "_modules/dagster_dbt/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_decorator"}, "asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_defs

\nimport hashlib\nimport json\nimport os\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetCheckResult,\n    AssetKey,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    In,\n    OpExecutionContext,\n    Out,\n    PartitionsDefinition,\n    PermissiveConfig,\n    _check as check,\n    get_dagster_logger,\n    op,\n)\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKeyPrefix,\n    Output,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput, RawMetadataValue\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._utils.merger import deep_merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.core.resources import DbtCliClient\nfrom dagster_dbt.core.resources_v2 import DbtCliResource\nfrom dagster_dbt.core.types import DbtCliOutput\nfrom dagster_dbt.core.utils import build_command_args_from_flags, execute_cli\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\nfrom dagster_dbt.errors import DagsterDbtError\nfrom dagster_dbt.types import DbtOutput\nfrom dagster_dbt.utils import (\n    ASSET_RESOURCE_TYPES,\n    output_name_fn,\n    result_to_events,\n    select_unique_ids_from_manifest,\n)\n\n\ndef _load_manifest_for_project(\n    project_dir: str,\n    profiles_dir: str,\n    
target_dir: str,\n    select: str,\n    exclude: str,\n) -> Tuple[Mapping[str, Any], DbtCliOutput]:\n    # running "dbt ls" regenerates the manifest.json, which includes a superset of the actual\n    # "dbt ls" output\n    cli_output = execute_cli(\n        executable="dbt",\n        command="ls",\n        log=get_dagster_logger(),\n        flags_dict={\n            "project-dir": project_dir,\n            "profiles-dir": profiles_dir,\n            "select": select,\n            "exclude": exclude,\n            "output": "json",\n        },\n        warn_error=False,\n        ignore_handled_error=False,\n        target_path=target_dir,\n        json_log_format=True,\n        capture_logs=True,\n    )\n    manifest_path = os.path.join(target_dir, "manifest.json")\n    with open(manifest_path, "r", encoding="utf8") as f:\n        return json.load(f), cli_output\n\n\ndef _can_stream_events(dbt_resource: Union[DbtCliClient, DbtCliResource]) -> bool:\n    """Check if the installed dbt version supports streaming events."""\n    import dbt.version\n    from packaging import version\n\n    if version.parse(dbt.version.__version__) >= version.parse("1.4.0"):\n        # The json log format is required for streaming events. 
DbtCliResource always uses this format, but\n        # DbtCliClient has an option to disable it.\n        if isinstance(dbt_resource, DbtCliResource):\n            return True\n        else:\n            return dbt_resource._json_log_format  # noqa: SLF001\n    else:\n        return False\n\n\ndef _batch_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: DbtCliClient,\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n) -> Iterator[Union[AssetObservation, AssetMaterialization, Output]]:\n    """Yields events for a dbt cli invocation. Waits until the entire command has completed before\n    emitting outputs.\n    """\n    # clean up any run results from the last run\n    dbt_resource.remove_run_results_json()\n\n    dbt_output: Optional[DbtOutput] = None\n    try:\n        if use_build_command:\n            dbt_output = dbt_resource.build(**kwargs)\n        else:\n            dbt_output = dbt_resource.run(**kwargs)\n    finally:\n        # in the case that the project only partially runs successfully, still attempt to generate\n        # events for the parts that were successful\n        if dbt_output is None:\n            dbt_output = DbtOutput(result=check.not_none(dbt_resource.get_run_results_json()))\n\n        manifest_json = check.not_none(dbt_resource.get_manifest_json())\n\n        dbt_output = check.not_none(dbt_output)\n        for result in dbt_output.result["results"]:\n            extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None\n            if runtime_metadata_fn:\n                node_info = manifest_json["nodes"][result["unique_id"]]\n                extra_metadata = runtime_metadata_fn(context, node_info)\n            yield from result_to_events(\n                result=result,\n                
docs_url=dbt_output.docs_url,\n                node_info_to_asset_key=node_info_to_asset_key,\n                manifest_json=manifest_json,\n                extra_metadata=extra_metadata,\n                generate_asset_outputs=True,\n            )\n\n\ndef _events_for_structured_json_line(\n    json_line: Mapping[str, Any],\n    context: OpExecutionContext,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output]]:\n    """Parses a json line into a Dagster event. Attempts to replicate the behavior of result_to_events\n    as closely as possible.\n    """\n    runtime_node_info = json_line.get("data", {}).get("node_info", {})\n    if not runtime_node_info:\n        return\n\n    node_resource_type = runtime_node_info.get("resource_type")\n    node_status = runtime_node_info.get("node_status")\n    unique_id = runtime_node_info.get("unique_id")\n\n    if not node_resource_type or not unique_id:\n        return\n\n    compiled_node_info = manifest_json["nodes"][unique_id]\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and node_status == "success":\n        metadata = dict(\n            runtime_metadata_fn(context, compiled_node_info) if runtime_metadata_fn else {}\n        )\n        started_at_str = runtime_node_info.get("node_started_at")\n        finished_at_str = runtime_node_info.get("node_finished_at")\n        if started_at_str is None or finished_at_str is None:\n            return\n\n        started_at = dateutil.parser.isoparse(started_at_str)  # type: ignore\n        completed_at = dateutil.parser.isoparse(finished_at_str)  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                "Execution Started At": started_at.isoformat(timespec="seconds"),\n               
 "Execution Completed At": completed_at.isoformat(timespec="seconds"),\n                "Execution Duration": duration.total_seconds(),\n            }\n        )\n        yield Output(\n            value=None,\n            output_name=output_name_fn(compiled_node_info),\n            metadata=metadata,\n        )\n    elif node_resource_type == "test" and runtime_node_info.get("node_finished_at"):\n        upstream_unique_ids = (\n            manifest_json["nodes"][unique_id].get("depends_on", {}).get("nodes", [])\n        )\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            upstream_node_info = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if upstream_node_info is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(upstream_node_info)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": unique_id,\n                    "Test Status": node_status,\n                },\n            )\n\n\ndef _stream_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: Union[DbtCliResource, DbtCliClient],\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output, AssetCheckResult]]:\n    """Yields events for a dbt cli invocation. 
Emits outputs as soon as the relevant dbt logs are\n    emitted.\n    """\n    if isinstance(dbt_resource, DbtCliClient):\n        for parsed_json_line in dbt_resource.cli_stream_json(\n            command="build" if use_build_command else "run",\n            **kwargs,\n        ):\n            yield from _events_for_structured_json_line(\n                parsed_json_line,\n                context,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                manifest_json,\n            )\n    else:\n        if runtime_metadata_fn is not None:\n            raise DagsterDbtError(\n                "The runtime_metadata_fn argument on the load_assets_from_dbt_manifest and"\n                " load_assets_from_dbt_project functions is not supported when using the"\n                " DbtCliResource resource. Use the @dbt_assets decorator instead if you want"\n                " control over what metadata is yielded at runtime."\n            )\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n                return node_info_to_asset_key(dbt_resource_props)\n\n        cli_output = dbt_resource.cli(\n            args=["build" if use_build_command else "run", *build_command_args_from_flags(kwargs)],\n            manifest=manifest_json,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n        )\n        yield from cli_output.stream()\n\n\nclass DbtOpConfig(PermissiveConfig):\n    """Keyword arguments to pass to the underlying dbt command. Additional arguments not listed in the schema will\n    be passed through as well, e.g. 
{'bool_flag': True, 'string_flag': 'hi'} will result in the flags\n    '--bool_flag --string_flag hi' being passed to the dbt command.\n    """\n\n    select: Optional[str] = None\n    exclude: Optional[str] = None\n    vars: Optional[Dict[str, Any]] = None\n    full_refresh: Optional[bool] = None\n\n\ndef _get_dbt_op(\n    op_name: str,\n    ins: Mapping[str, In],\n    outs: Mapping[str, Out],\n    select: str,\n    exclude: str,\n    use_build_command: bool,\n    fqns_by_output_name: Mapping[str, List[str]],\n    dbt_resource_key: str,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n):\n    @op(\n        name=op_name,\n        tags={"kind": "dbt"},\n        ins=ins,\n        out=outs,\n        required_resource_keys={dbt_resource_key},\n    )\n    def _dbt_op(context, config: DbtOpConfig):\n        dbt_resource: Union[DbtCliResource, DbtCliClient] = getattr(\n            context.resources, dbt_resource_key\n        )\n        check.inst(\n            dbt_resource,\n            (DbtCliResource, DbtCliClient),\n            "Resource with key 'dbt_resource_key' must be a DbtCliResource or DbtCliClient"\n            f" object, but is a {type(dbt_resource)}",\n        )\n\n        kwargs: Dict[str, Any] = {}\n        # in the case that we're running everything, opt for the cleaner selection string\n        if len(context.selected_output_names) == len(outs):\n            kwargs["select"] = select\n            kwargs["exclude"] = exclude\n        else:\n            # for each output that we want to emit, translate to a dbt select string by converting\n            # the out to its corresponding fqn\n            kwargs["select"] = [\n                ".".join(fqns_by_output_name[output_name])\n           
     for output_name in context.selected_output_names\n            ]\n        # variables to pass into the command\n        if partition_key_to_vars_fn:\n            kwargs["vars"] = partition_key_to_vars_fn(context.partition_key)\n        # merge in any additional kwargs from the config\n        kwargs = deep_merge_dicts(kwargs, context.op_config)\n\n        if _can_stream_events(dbt_resource):\n            yield from _stream_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n                manifest_json=manifest_json,\n            )\n        else:\n            if not isinstance(dbt_resource, DbtCliClient):\n                check.failed(\n                    "Chose batch event iterator, but it only works with DbtCliClient, and"\n                    f" resource has type {type(dbt_resource)}"\n                )\n            yield from _batch_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n            )\n\n    return _dbt_op\n\n\ndef _dbt_nodes_to_assets(\n    dbt_nodes: Mapping[str, Any],\n    select: str,\n    exclude: str,\n    selected_unique_ids: AbstractSet[str],\n    project_id: str,\n    dbt_resource_key: str,\n    manifest_json: Mapping[str, Any],\n    op_name: Optional[str],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    io_manager_key: Optional[str],\n    use_build_command: bool,\n    partitions_def: Optional[PartitionsDefinition],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    dagster_dbt_translator: DagsterDbtTranslator,\n) -> AssetsDefinition:\n    if use_build_command:\n        deps = get_deps(\n        
    dbt_nodes,\n            selected_unique_ids,\n            asset_resource_types=["model", "seed", "snapshot"],\n        )\n    else:\n        deps = get_deps(dbt_nodes, selected_unique_ids, asset_resource_types=["model"])\n\n    (\n        asset_deps,\n        asset_ins,\n        asset_outs,\n        group_names_by_key,\n        freshness_policies_by_key,\n        auto_materialize_policies_by_key,\n        check_specs_by_output_name,\n        fqns_by_output_name,\n        _,\n    ) = get_asset_deps(\n        dbt_nodes=dbt_nodes,\n        deps=deps,\n        io_manager_key=io_manager_key,\n        manifest=manifest_json,\n        dagster_dbt_translator=dagster_dbt_translator,\n    )\n\n    # prevent op name collisions between multiple dbt multi-assets\n    if not op_name:\n        op_name = f"run_dbt_{project_id}"\n        if select != "fqn:*" or exclude:\n            op_name += "_" + hashlib.md5(select.encode() + exclude.encode()).hexdigest()[-5:]\n\n    check_outs_by_output_name: Mapping[str, Out] = {}\n    if check_specs_by_output_name:\n        check_outs_by_output_name = {\n            output_name: Out(dagster_type=None, is_required=False)\n            for output_name in check_specs_by_output_name.keys()\n        }\n\n    dbt_op = _get_dbt_op(\n        op_name=op_name,\n        ins=dict(asset_ins.values()),\n        outs={\n            **dict(asset_outs.values()),\n            **check_outs_by_output_name,\n        },\n        select=select,\n        exclude=exclude,\n        use_build_command=use_build_command,\n        fqns_by_output_name=fqns_by_output_name,\n        dbt_resource_key=dbt_resource_key,\n        node_info_to_asset_key=dagster_dbt_translator.get_asset_key,\n        partition_key_to_vars_fn=partition_key_to_vars_fn,\n        runtime_metadata_fn=runtime_metadata_fn,\n        manifest_json=manifest_json,\n    )\n\n    return AssetsDefinition(\n        keys_by_input_name={\n            input_name: asset_key for asset_key, (input_name, _) in 
asset_ins.items()\n        },\n        keys_by_output_name={\n            output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n        },\n        node_def=dbt_op,\n        can_subset=True,\n        asset_deps=asset_deps,\n        group_names_by_key=group_names_by_key,\n        freshness_policies_by_key=freshness_policies_by_key,\n        auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n        check_specs_by_output_name=check_specs_by_output_name,\n        partitions_def=partitions_def,\n    )\n\n\n
[docs]def load_assets_from_dbt_project(\n project_dir: str,\n profiles_dir: Optional[str] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n io_manager_key: Optional[str] = None,\n target_dir: Optional[str] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n op_name: Optional[str] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models from a dbt project into Dagster assets.\n\n Creates one Dagster asset for each dbt model. 
All assets will be re-materialized using a single\n `dbt run` or `dbt build` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n project_dir (Optional[str]): The directory containing the dbt project to load.\n profiles_dir (Optional[str]): The profiles directory to use for loading the DBT project.\n Defaults to a directory called "config" inside the project_dir.\n target_dir (Optional[str]): The target directory where dbt will place compiled artifacts.\n Defaults to "target" underneath the project_dir.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". 
Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n manifest_json (Optional[Mapping[str, Any]]): [Deprecated] Use the manifest argument instead.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model. Deprecated: instead,\n provide a custom DagsterDbtTranslator that overrides node_info_to_asset_key.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). 
Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. 
This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n project_dir = check.str_param(project_dir, "project_dir")\n profiles_dir = check.opt_str_param(\n profiles_dir, "profiles_dir", os.path.join(project_dir, "config")\n )\n target_dir = check.opt_str_param(target_dir, "target_dir", os.path.join(project_dir, "target"))\n select = check.opt_str_param(select, "select", "fqn:*")\n exclude = check.opt_str_param(exclude, "exclude", "")\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=None,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n manifest, cli_output = _load_manifest_for_project(\n project_dir, profiles_dir, target_dir, select, exclude\n )\n selected_unique_ids: Set[str] = set(\n filter(None, (line.get("unique_id") for line in cli_output.logs))\n )\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n exclude=exclude,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n 
dagster_dbt_translator=dagster_dbt_translator,\n op_name=op_name,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n selected_unique_ids=selected_unique_ids,\n node_info_to_asset_key=node_info_to_asset_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n )
\n\n\n
[docs]@deprecated_param(\n param="manifest_json", breaking_version="0.21", additional_warn_text="Use manifest instead"\n)\n@deprecated_param(\n param="selected_unique_ids",\n breaking_version="0.21",\n additional_warn_text="Use the select parameter instead.",\n)\n@deprecated_param(\n param="dbt_resource_key",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize your resource key."\n ),\n)\n@deprecated_param(\n param="use_build_command",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize the underlying dbt commands."\n ),\n)\n@deprecated_param(\n param="partitions_def",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="partition_key_to_vars_fn",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="runtime_metadata_fn",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize runtime metadata."\n ),\n)\ndef load_assets_from_dbt_manifest(\n manifest: Optional[Union[Path, Mapping[str, Any]]] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n selected_unique_ids: Optional[AbstractSet[str]] = None,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n op_name: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, 
Any]]] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models, described in a manifest.json, into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n manifest (Optional[Mapping[str, Any]]): The contents of a DBT manifest.json, which contains\n a set of models to load into assets.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. 
to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. 
If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. 
Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n manifest = normalize_renamed_param(\n manifest,\n "manifest",\n manifest_json,\n "manifest_json",\n )\n manifest = cast(\n Union[Mapping[str, Any], Path], check.inst_param(manifest, "manifest", (Path, dict))\n )\n if isinstance(manifest, Path):\n manifest = cast(Mapping[str, Any], json.loads(manifest.read_bytes()))\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n 
exclude=exclude,\n io_manager_key=io_manager_key,\n dagster_dbt_translator=dagster_dbt_translator,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n selected_unique_ids=selected_unique_ids,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )
\n\n\ndef _load_assets_from_dbt_manifest(\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n io_manager_key: Optional[str],\n dagster_dbt_translator: Optional[DagsterDbtTranslator],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n selected_unique_ids: Optional[AbstractSet[str]],\n display_raw_sql: Optional[bool],\n dbt_resource_key: str,\n op_name: Optional[str],\n use_build_command: bool,\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n) -> Sequence[AssetsDefinition]:\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n dbt_resource_key = check.str_param(dbt_resource_key, "dbt_resource_key")\n\n dbt_nodes = {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["metrics"],\n **manifest["exposures"],\n }\n\n if selected_unique_ids:\n select = (\n " ".join(".".join(dbt_nodes[uid]["fqn"]) for uid in selected_unique_ids)\n if select is None\n else select\n )\n exclude = "" if exclude is None else exclude\n else:\n select = select if select is not None else "fqn:*"\n exclude = exclude if exclude is not None else ""\n\n selected_unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude, manifest_json=manifest\n 
)\n if len(selected_unique_ids) == 0:\n raise DagsterInvalidSubsetError(f"No dbt models match the selection string '{select}'.")\n\n if dagster_dbt_translator is not None:\n check.invariant(\n node_info_to_asset_key == default_asset_key_fn,\n "Can't specify both dagster_dbt_translator and node_info_to_asset_key",\n )\n check.invariant(\n key_prefix is None,\n "Can't specify both dagster_dbt_translator and key_prefix",\n )\n check.invariant(\n source_key_prefix is None,\n "Can't specify both dagster_dbt_translator and source_key_prefix",\n )\n check.invariant(\n node_info_to_group_fn == default_group_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_group_fn",\n )\n check.invariant(\n display_raw_sql is None,\n "Can't specify both dagster_dbt_translator and display_raw_sql",\n )\n check.invariant(\n node_info_to_definition_metadata_fn is default_metadata_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_definition_metadata_fn",\n )\n else:\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props):\n base_key = node_info_to_asset_key(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(source_key_prefix or [])\n else:\n return base_key.with_prefix(key_prefix or [])\n\n @classmethod\n def get_metadata(cls, dbt_resource_props):\n return node_info_to_definition_metadata_fn(dbt_resource_props)\n\n @classmethod\n def get_description(cls, dbt_resource_props):\n return default_description_fn(\n dbt_resource_props,\n display_raw_sql=display_raw_sql if display_raw_sql is not None else True,\n )\n\n @classmethod\n def get_group_name(cls, dbt_resource_props):\n return node_info_to_group_fn(dbt_resource_props)\n\n @classmethod\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n return 
node_info_to_freshness_policy_fn(dbt_resource_props)\n\n @classmethod\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n return node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n dagster_dbt_translator = CustomDagsterDbtTranslator()\n\n dbt_assets_def = _dbt_nodes_to_assets(\n dbt_nodes,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n select=select,\n exclude=exclude,\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n project_id=manifest["metadata"]["project_id"][:5],\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n dagster_dbt_translator=dagster_dbt_translator,\n manifest_json=manifest,\n )\n\n return [dbt_assets_def]\n\n\ndef _raise_warnings_for_deprecated_args(\n public_fn_name: str,\n selected_unique_ids: Optional[AbstractSet[str]],\n dbt_resource_key: Optional[str],\n use_build_command: Optional[bool],\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n):\n if node_info_to_asset_key != default_asset_key_fn:\n deprecation_warning(\n f"The node_info_to_asset_key_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_asset_key.",\n stacklevel=4,\n 
)\n\n if node_info_to_group_fn != default_group_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_group_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure dagster groups on a dbt resource's meta field or assign dbt"\n " groups or provide a custom DagsterDbtTranslator that overrides get_group_name.",\n stacklevel=4,\n )\n\n if node_info_to_auto_materialize_policy_fn != default_auto_materialize_policy_fn:\n deprecation_warning(\n f"The node_info_to_auto_materialize_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster auto-materialize policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_freshness_policy_fn != default_freshness_policy_fn:\n deprecation_warning(\n f"The node_info_to_freshness_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster freshness policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_definition_metadata_fn != default_metadata_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_definition_metadata_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_metadata.",\n stacklevel=4,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_defs"}, "asset_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_utils

\nimport hashlib\nimport textwrap\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetsDefinition,\n    AssetSelection,\n    AutoMaterializePolicy,\n    DagsterInvariantViolationError,\n    FreshnessPolicy,\n    In,\n    MetadataValue,\n    Nothing,\n    Out,\n    RunConfig,\n    ScheduleDefinition,\n    TableColumn,\n    TableSchema,\n    _check as check,\n    define_asset_job,\n)\nfrom dagster._core.definitions.decorators.asset_decorator import (\n    _validate_and_assign_output_names_to_check_specs,\n)\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import deprecation_warning\n\nfrom .utils import input_name_fn, output_name_fn\n\nif TYPE_CHECKING:\n    from .dagster_dbt_translator import DagsterDbtTranslator, DbtManifestWrapper\n\nMANIFEST_METADATA_KEY = "dagster_dbt/manifest"\nDAGSTER_DBT_TRANSLATOR_METADATA_KEY = "dagster_dbt/dagster_dbt_translator"\n\n\n
[docs]def get_asset_key_for_model(dbt_assets: Sequence[AssetsDefinition], model_name: str) -> AssetKey:\n """Return the corresponding Dagster asset key for a dbt model.\n\n Args:\n dbt_assets (AssetsDefinition): An AssetsDefinition object produced by\n load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets.\n model_name (str): The name of the dbt model.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_model\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n\n @asset(deps={get_asset_key_for_model([all_dbt_assets], "customers")})\n def cleaned_customers():\n ...\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(model_name, "model_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_models = [\n value\n for value in manifest["nodes"].values()\n if value["name"] == model_name and value["resource_type"] == "model"\n ]\n\n if len(matching_models) == 0:\n raise KeyError(f"Could not find a dbt model with name: {model_name}")\n\n return dagster_dbt_translator.get_asset_key(next(iter(matching_models)))
\n\n\n
[docs]def get_asset_keys_by_output_name_for_source(\n dbt_assets: Sequence[AssetsDefinition], source_name: str\n) -> Mapping[str, AssetKey]:\n """Returns the corresponding Dagster asset keys for all tables in a dbt source.\n\n This is a convenience method that makes it easy to define a multi-asset that generates\n all the tables for a given dbt source.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Returns:\n Mapping[str, AssetKey]: A mapping of the table name to corresponding Dagster asset key\n for all tables in the given dbt source.\n\n Examples:\n .. code-block:: python\n\n from dagster import AssetOut, multi_asset\n from dagster_dbt import dbt_assets, get_asset_keys_by_output_name_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @multi_asset(\n outs={\n name: AssetOut(key=asset_key)\n for name, asset_key in get_asset_keys_by_output_name_for_source(\n [all_dbt_assets], "raw_data"\n ).items()\n },\n )\n def upstream_python_asset():\n ...\n\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(source_name, "source_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_nodes = [\n value for value in manifest["sources"].values() if value["source_name"] == source_name\n ]\n\n if len(matching_nodes) == 0:\n raise KeyError(f"Could not find a dbt source with name: {source_name}")\n\n return {\n output_name_fn(value): dagster_dbt_translator.get_asset_key(value)\n for value in matching_nodes\n }
\n\n\n
[docs]def get_asset_key_for_source(dbt_assets: Sequence[AssetsDefinition], source_name: str) -> AssetKey:\n """Returns the corresponding Dagster asset key for a dbt source with a singular table.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Raises:\n DagsterInvalidInvocationError: If the source has more than one table.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @asset(key=get_asset_key_for_source([all_dbt_assets], "my_source"))\n def upstream_python_asset():\n ...\n """\n asset_keys_by_output_name = get_asset_keys_by_output_name_for_source(dbt_assets, source_name)\n\n if len(asset_keys_by_output_name) > 1:\n raise KeyError(\n f"Source {source_name} has more than one table:"\n f" {asset_keys_by_output_name.values()}. Use"\n " `get_asset_keys_by_output_name_for_source` instead to get all tables for a"\n " source."\n )\n\n return next(iter(asset_keys_by_output_name.values()))
\n\n\n
[docs]def build_dbt_asset_selection(\n dbt_assets: Sequence[AssetsDefinition],\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n) -> AssetSelection:\n """Build an asset selection for a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Returns:\n AssetSelection: An asset selection for the selected dbt nodes.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import dbt_assets, build_dbt_asset_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n # Select the dbt assets that have the tag "foo".\n foo_selection = build_dbt_asset_selection([dbt_assets], dbt_select="tag:foo")\n\n # Select the dbt assets that have the tag "foo" and all Dagster assets downstream\n # of them (dbt-related or otherwise)\n foo_and_downstream_selection = foo_selection.downstream()\n\n """\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n from .dbt_manifest_asset_selection import DbtManifestAssetSelection\n\n return DbtManifestAssetSelection(\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n select=dbt_select,\n exclude=dbt_exclude,\n )
\n\n\n
[docs]def build_schedule_from_dbt_selection(\n dbt_assets: Sequence[AssetsDefinition],\n job_name: str,\n cron_schedule: str,\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n config: Optional[RunConfig] = None,\n execution_timezone: Optional[str] = None,\n) -> ScheduleDefinition:\n """Build a schedule to materialize a specified set of dbt resources from a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n job_name (str): The name of the job to materialize the dbt resources.\n cron_schedule (str): The cron schedule to define the schedule.\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n config (Optional[RunConfig]): The config that parameterizes the execution of this schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n\n Returns:\n ScheduleDefinition: A definition to materialize the selected dbt resources on a cron schedule.\n\n Examples:\n .. 
code-block:: python\n\n from dagster_dbt import dbt_assets, build_schedule_from_dbt_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n daily_dbt_assets_schedule = build_schedule_from_dbt_selection(\n [all_dbt_assets],\n job_name="all_dbt_assets",\n cron_schedule="0 0 * * *",\n dbt_select="fqn:*",\n )\n """\n return ScheduleDefinition(\n cron_schedule=cron_schedule,\n job=define_asset_job(\n name=job_name,\n selection=build_dbt_asset_selection(\n dbt_assets,\n dbt_select=dbt_select,\n dbt_exclude=dbt_exclude,\n ),\n config=config,\n tags=tags,\n ),\n execution_timezone=execution_timezone,\n )
\n\n\ndef get_manifest_and_translator_from_dbt_assets(\n dbt_assets: Sequence[AssetsDefinition],\n) -> Tuple[Mapping[str, Any], "DagsterDbtTranslator"]:\n check.invariant(len(dbt_assets) == 1, "Exactly one dbt AssetsDefinition is required")\n dbt_assets_def = dbt_assets[0]\n metadata_by_key = dbt_assets_def.metadata_by_key or {}\n first_asset_key = next(iter(dbt_assets_def.keys))\n first_metadata = metadata_by_key.get(first_asset_key, {})\n manifest_wrapper: Optional["DbtManifestWrapper"] = first_metadata.get(MANIFEST_METADATA_KEY)\n if manifest_wrapper is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt manifest metadata on asset {first_asset_key.to_user_string()},"\n " but did not. Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n dagster_dbt_translator = first_metadata.get(DAGSTER_DBT_TRANSLATOR_METADATA_KEY)\n if dagster_dbt_translator is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt translator metadata on asset {first_asset_key.to_user_string()},"\n " but did not. 
Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n return manifest_wrapper.manifest, dagster_dbt_translator\n\n\n###################\n# DEFAULT FUNCTIONS\n###################\n\n\ndef default_asset_key_fn(dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """Get the asset key for a dbt node.\n\n By default, if the dbt node has a Dagster asset key configured in its metadata, then that is\n parsed and used.\n\n Otherwise:\n dbt sources: a dbt source's key is the union of its source name and its table name\n dbt models: a dbt model's key is the union of its model name and any schema configured on\n the model itself.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n asset_key_config = dagster_metadata.get("asset_key", [])\n if asset_key_config:\n return AssetKey(asset_key_config)\n\n if dbt_resource_props["resource_type"] == "source":\n components = [dbt_resource_props["source_name"], dbt_resource_props["name"]]\n else:\n configured_schema = dbt_resource_props["config"].get("schema")\n if configured_schema is not None:\n components = [configured_schema, dbt_resource_props["name"]]\n else:\n components = [dbt_resource_props["name"]]\n\n return AssetKey(components)\n\n\n
[docs]def default_metadata_from_dbt_resource_props(\n dbt_resource_props: Mapping[str, Any]\n) -> Mapping[str, Any]:\n metadata: Dict[str, Any] = {}\n columns = dbt_resource_props.get("columns", {})\n if len(columns) > 0:\n metadata["table_schema"] = MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(\n name=column_name,\n type=column_info.get("data_type") or "?",\n description=column_info.get("description"),\n )\n for column_name, column_info in columns.items()\n ]\n )\n )\n return metadata
\n\n\n
[docs]def default_group_from_dbt_resource_props(dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """Get the group name for a dbt node.\n\n If a Dagster group is configured in the metadata for the node, use that.\n\n Otherwise, if a dbt group is configured for the node, use that.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n\n dagster_group = dagster_metadata.get("group")\n if dagster_group:\n return dagster_group\n\n dbt_group = dbt_resource_props.get("config", {}).get("group")\n if dbt_group:\n return dbt_group\n\n return None
\n\n\n
[docs]def group_from_dbt_resource_props_fallback_to_directory(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[str]:\n """Get the group name for a dbt node.\n\n Has the same behavior as the default_group_from_dbt_resource_props, except for that, if no group can be determined\n from config or metadata, falls back to using the subdirectory of the models directory that the\n source file is in.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import group_from_dbt_resource_props_fallback_to_directory\n\n dbt_assets = load_assets_from_dbt_manifest(\n manifest=manifest,\n node_info_to_group_fn=group_from_dbt_resource_props_fallback_to_directory,\n )\n """\n group_name = default_group_from_dbt_resource_props(dbt_resource_props)\n if group_name is not None:\n return group_name\n\n fqn = dbt_resource_props.get("fqn", [])\n # the first component is the package name, and the last component is the model name\n if len(fqn) < 3:\n return None\n return fqn[1]
\n\n\ndef default_freshness_policy_fn(dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n freshness_policy_config = dagster_metadata.get("freshness_policy", {})\n\n freshness_policy = _legacy_freshness_policy_fn(freshness_policy_config)\n if freshness_policy:\n return freshness_policy\n\n legacy_freshness_policy_config = dbt_resource_props["config"].get(\n "dagster_freshness_policy", {}\n )\n legacy_freshness_policy = _legacy_freshness_policy_fn(legacy_freshness_policy_config)\n\n if legacy_freshness_policy:\n deprecation_warning(\n "dagster_freshness_policy",\n "0.21.0",\n "Instead, configure a Dagster freshness policy on a dbt model using"\n " +meta.dagster.freshness_policy.",\n )\n\n return legacy_freshness_policy\n\n\ndef _legacy_freshness_policy_fn(\n freshness_policy_config: Mapping[str, Any]\n) -> Optional[FreshnessPolicy]:\n if freshness_policy_config:\n return FreshnessPolicy(\n maximum_lag_minutes=float(freshness_policy_config["maximum_lag_minutes"]),\n cron_schedule=freshness_policy_config.get("cron_schedule"),\n cron_schedule_timezone=freshness_policy_config.get("cron_schedule_timezone"),\n )\n return None\n\n\ndef default_auto_materialize_policy_fn(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n auto_materialize_policy_config = dagster_metadata.get("auto_materialize_policy", {})\n\n auto_materialize_policy = _auto_materialize_policy_fn(auto_materialize_policy_config)\n if auto_materialize_policy:\n return auto_materialize_policy\n\n legacy_auto_materialize_policy_config = dbt_resource_props["config"].get(\n "dagster_auto_materialize_policy", {}\n )\n legacy_auto_materialize_policy = _auto_materialize_policy_fn(\n legacy_auto_materialize_policy_config\n )\n\n if legacy_auto_materialize_policy:\n deprecation_warning(\n "dagster_auto_materialize_policy",\n 
"0.21.0",\n "Instead, configure a Dagster auto-materialize policy on a dbt model using"\n " +meta.dagster.auto_materialize_policy.",\n )\n\n return legacy_auto_materialize_policy\n\n\ndef _auto_materialize_policy_fn(\n auto_materialize_policy_config: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n if auto_materialize_policy_config.get("type") == "eager":\n return AutoMaterializePolicy.eager()\n elif auto_materialize_policy_config.get("type") == "lazy":\n return AutoMaterializePolicy.lazy()\n return None\n\n\ndef default_description_fn(dbt_resource_props: Mapping[str, Any], display_raw_sql: bool = True):\n code_block = textwrap.indent(\n dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", ""), " "\n )\n description_sections = [\n dbt_resource_props["description"]\n or f"dbt {dbt_resource_props['resource_type']} {dbt_resource_props['name']}",\n ]\n if display_raw_sql:\n description_sections.append(f"#### Raw SQL:\\n```\\n{code_block}\\n```")\n return "\\n\\n".join(filter(None, description_sections))\n\n\ndef is_asset_check_from_dbt_resource_props(dbt_resource_props: Mapping[str, Any]) -> bool:\n return dbt_resource_props["meta"].get("dagster", {}).get("asset_check", False)\n\n\ndef is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id: str, dbt_resource_props: Mapping[str, Any]\n) -> bool:\n attached_node_unique_id = dbt_resource_props.get("attached_node")\n is_generic_test = bool(attached_node_unique_id)\n\n return is_generic_test and attached_node_unique_id == unique_id\n\n\ndef default_asset_check_fn(\n asset_key: AssetKey, unique_id: str, dbt_resource_props: Mapping[str, Any]\n) -> Optional[AssetCheckSpec]:\n is_asset_check = is_asset_check_from_dbt_resource_props(dbt_resource_props)\n is_generic_test_on_attached_node = is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id, dbt_resource_props\n )\n\n if not all([is_asset_check, is_generic_test_on_attached_node]):\n return None\n\n return 
AssetCheckSpec(\n name=dbt_resource_props["name"],\n asset=asset_key,\n description=dbt_resource_props["description"],\n )\n\n\ndef default_code_version_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n return hashlib.sha1(\n (dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", "")).encode(\n "utf-8"\n )\n ).hexdigest()\n\n\n###################\n# DEPENDENCIES\n###################\n\n\ndef is_non_asset_node(dbt_resource_props: Mapping[str, Any]):\n # some nodes exist inside the dbt graph but are not assets\n resource_type = dbt_resource_props["resource_type"]\n if resource_type == "metric":\n return True\n if (\n resource_type == "model"\n and dbt_resource_props.get("config", {}).get("materialized") == "ephemeral"\n ):\n return True\n return False\n\n\ndef get_deps(\n dbt_nodes: Mapping[str, Any],\n selected_unique_ids: AbstractSet[str],\n asset_resource_types: List[str],\n) -> Mapping[str, FrozenSet[str]]:\n def _valid_parent_node(dbt_resource_props):\n # sources are valid parents, but not assets\n return dbt_resource_props["resource_type"] in asset_resource_types + ["source"]\n\n asset_deps: Dict[str, Set[str]] = {}\n for unique_id in selected_unique_ids:\n dbt_resource_props = dbt_nodes[unique_id]\n node_resource_type = dbt_resource_props["resource_type"]\n\n # skip non-assets, such as metrics, tests, and ephemeral models\n if is_non_asset_node(dbt_resource_props) or node_resource_type not in asset_resource_types:\n continue\n\n asset_deps[unique_id] = set()\n for parent_unique_id in dbt_resource_props.get("depends_on", {}).get("nodes", []):\n parent_node_info = dbt_nodes[parent_unique_id]\n # for metrics or ephemeral dbt models, BFS to find valid parents\n if is_non_asset_node(parent_node_info):\n visited = set()\n replaced_parent_ids = set()\n # make a copy to avoid mutating the actual dictionary\n queue = list(parent_node_info.get("depends_on", {}).get("nodes", []))\n while queue:\n candidate_parent_id = queue.pop()\n if 
candidate_parent_id in visited:\n continue\n visited.add(candidate_parent_id)\n\n candidate_parent_info = dbt_nodes[candidate_parent_id]\n if is_non_asset_node(candidate_parent_info):\n queue.extend(candidate_parent_info.get("depends_on", {}).get("nodes", []))\n elif _valid_parent_node(candidate_parent_info):\n replaced_parent_ids.add(candidate_parent_id)\n\n asset_deps[unique_id] |= replaced_parent_ids\n # ignore nodes which are not assets / sources\n elif _valid_parent_node(parent_node_info):\n asset_deps[unique_id].add(parent_unique_id)\n\n frozen_asset_deps = {\n unique_id: frozenset(parent_ids) for unique_id, parent_ids in asset_deps.items()\n }\n\n return frozen_asset_deps\n\n\ndef get_asset_deps(\n dbt_nodes,\n deps,\n io_manager_key,\n manifest: Optional[Mapping[str, Any]],\n dagster_dbt_translator: "DagsterDbtTranslator",\n) -> Tuple[\n Dict[AssetKey, Set[AssetKey]],\n Dict[AssetKey, Tuple[str, In]],\n Dict[AssetKey, Tuple[str, Out]],\n Dict[AssetKey, str],\n Dict[AssetKey, FreshnessPolicy],\n Dict[AssetKey, AutoMaterializePolicy],\n Dict[str, AssetCheckSpec],\n Dict[str, List[str]],\n Dict[str, Dict[str, Any]],\n]:\n from .dagster_dbt_translator import DbtManifestWrapper\n\n asset_deps: Dict[AssetKey, Set[AssetKey]] = {}\n asset_ins: Dict[AssetKey, Tuple[str, In]] = {}\n asset_outs: Dict[AssetKey, Tuple[str, Out]] = {}\n\n # These dicts could be refactored as a single dict, mapping from output name to arbitrary\n # metadata that we need to store for reference.\n group_names_by_key: Dict[AssetKey, str] = {}\n freshness_policies_by_key: Dict[AssetKey, FreshnessPolicy] = {}\n auto_materialize_policies_by_key: Dict[AssetKey, AutoMaterializePolicy] = {}\n check_specs: List[AssetCheckSpec] = []\n fqns_by_output_name: Dict[str, List[str]] = {}\n metadata_by_output_name: Dict[str, Dict[str, Any]] = {}\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n 
fqns_by_output_name[output_name] = dbt_resource_props["fqn"]\n\n metadata_by_output_name[output_name] = {\n key: dbt_resource_props[key] for key in ["unique_id", "resource_type"]\n }\n\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n asset_deps[asset_key] = set()\n\n metadata = merge_dicts(\n dagster_dbt_translator.get_metadata(dbt_resource_props),\n {\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest) if manifest else None,\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n )\n asset_outs[asset_key] = (\n output_name,\n Out(\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n metadata=metadata,\n is_required=False,\n dagster_type=Nothing,\n code_version=default_code_version_fn(dbt_resource_props),\n ),\n )\n\n group_name = dagster_dbt_translator.get_group_name(dbt_resource_props)\n if group_name is not None:\n group_names_by_key[asset_key] = group_name\n\n freshness_policy = dagster_dbt_translator.get_freshness_policy(dbt_resource_props)\n if freshness_policy is not None:\n freshness_policies_by_key[asset_key] = freshness_policy\n\n auto_materialize_policy = dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n )\n if auto_materialize_policy is not None:\n auto_materialize_policies_by_key[asset_key] = auto_materialize_policy\n\n test_unique_ids = []\n if manifest:\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(asset_key, unique_id, test_resource_props)\n\n if check_spec:\n check_specs.append(check_spec)\n\n for parent_unique_id in parent_unique_ids:\n parent_node_info = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_node_info)\n\n 
asset_deps[asset_key].add(parent_asset_key)\n\n # if this parent is not one of the selected nodes, it's an input\n if parent_unique_id not in deps:\n input_name = input_name_fn(parent_node_info)\n asset_ins[parent_asset_key] = (input_name, In(Nothing))\n\n check_specs_by_output_name = cast(\n Dict[str, AssetCheckSpec],\n _validate_and_assign_output_names_to_check_specs(check_specs, list(asset_outs.keys())),\n )\n\n return (\n asset_deps,\n asset_ins,\n asset_outs,\n group_names_by_key,\n freshness_policies_by_key,\n auto_materialize_policies_by_key,\n check_specs_by_output_name,\n fqns_by_output_name,\n metadata_by_output_name,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_utils"}, "cloud": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.asset_defs

\nimport json\nimport shlex\nfrom argparse import Namespace\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetExecutionContext,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    MetadataValue,\n    PartitionsDefinition,\n    ResourceDefinition,\n    multi_asset,\n    with_resources,\n)\nfrom dagster._annotations import experimental, experimental_param\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\n\nfrom ..errors import DagsterDbtCloudJobInvariantViolationError\nfrom ..utils import ASSET_RESOURCE_TYPES, result_to_events\nfrom .resources import DbtCloudClient, DbtCloudClientResource, DbtCloudRunStatus\n\nDAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR = "DBT_DAGSTER_COMPILE_RUN_ID"\n\n\nclass DbtCloudCacheableAssetsDefinition(CacheableAssetsDefinition):\n    def __init__(\n        self,\n        dbt_cloud_resource_def: Union[DbtCloudClientResource, ResourceDefinition],\n        job_id: int,\n        node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n        node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n        node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n        node_info_to_auto_materialize_policy_fn: 
Callable[\n            [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n        ],\n        partitions_def: Optional[PartitionsDefinition] = None,\n        partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n    ):\n        self._dbt_cloud_resource_def: ResourceDefinition = (\n            dbt_cloud_resource_def.get_resource_definition()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def\n        )\n\n        self._dbt_cloud: DbtCloudClient = (\n            dbt_cloud_resource_def.process_config_and_initialize().get_dbt_client()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def(build_init_resource_context())\n        )\n        self._job_id = job_id\n        self._project_id: int\n        self._has_generate_docs: bool\n        self._job_commands: List[str]\n        self._job_materialization_command_step: int\n        self._node_info_to_asset_key = node_info_to_asset_key\n        self._node_info_to_group_fn = node_info_to_group_fn\n        self._node_info_to_freshness_policy_fn = node_info_to_freshness_policy_fn\n        self._node_info_to_auto_materialize_policy_fn = node_info_to_auto_materialize_policy_fn\n        self._partitions_def = partitions_def\n        self._partition_key_to_vars_fn = partition_key_to_vars_fn\n\n        super().__init__(unique_id=f"dbt-cloud-{job_id}")\n\n    def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n        dbt_nodes, dbt_dependencies = self._get_dbt_nodes_and_dependencies()\n        return [self._build_dbt_cloud_assets_cacheable_data(dbt_nodes, dbt_dependencies)]\n\n    def build_definitions(\n        self, data: Sequence[AssetsDefinitionCacheableData]\n    ) -> Sequence[AssetsDefinition]:\n        return with_resources(\n            [\n                self._build_dbt_cloud_assets_from_cacheable_data(assets_definition_metadata)\n               
 for assets_definition_metadata in data\n            ],\n            {"dbt_cloud": self._dbt_cloud_resource_def},\n        )\n\n    @staticmethod\n    def parse_dbt_command(dbt_command: str) -> Namespace:\n        args = shlex.split(dbt_command)[1:]\n        try:\n            from dbt.cli.flags import (\n                Flags,\n                args_to_context,\n            )\n\n            # nasty hack to get dbt to parse the args\n            # dbt >= 1.5.0 requires that profiles-dir is set to an existing directory\n            return Namespace(**vars(Flags(args_to_context(args + ["--profiles-dir", "."]))))\n        except ImportError:\n            # dbt < 1.5.0 compat\n            from dbt.main import parse_args  # type: ignore\n\n            return parse_args(args=args)\n\n    @staticmethod\n    def get_job_materialization_command_step(execute_steps: List[str]) -> int:\n        materialization_command_filter = [\n            DbtCloudCacheableAssetsDefinition.parse_dbt_command(command).which in ["run", "build"]\n            for command in execute_steps\n        ]\n\n        if sum(materialization_command_filter) != 1:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                "The dbt Cloud job must have a single `dbt run` or `dbt build` in its commands. 
"\n                f"Received commands: {execute_steps}."\n            )\n\n        return materialization_command_filter.index(True)\n\n    @staticmethod\n    def get_compile_filters(parsed_args: Namespace) -> List[str]:\n        dbt_compile_options: List[str] = []\n\n        selected_models = parsed_args.select or []\n        if selected_models:\n            dbt_compile_options.append(f"--select {' '.join(selected_models)}")\n\n        excluded_models = parsed_args.exclude or []\n        if excluded_models:\n            dbt_compile_options.append(f"--exclude {' '.join(excluded_models)}")\n\n        selector = getattr(parsed_args, "selector_name", None) or getattr(\n            parsed_args, "selector", None\n        )\n        if selector:\n            dbt_compile_options.append(f"--selector {selector}")\n\n        return dbt_compile_options\n\n    def _get_cached_compile_dbt_cloud_job_run(self, compile_run_id: int) -> Tuple[int, int]:\n        compile_run = self._dbt_cloud.get_run(\n            run_id=compile_run_id, include_related=["trigger", "run_steps"]\n        )\n\n        compile_run_status: str = compile_run["status_humanized"]\n        if compile_run_status != DbtCloudRunStatus.SUCCESS:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The cached dbt Cloud job run `{compile_run_id}` must have a status of"\n                f" `{DbtCloudRunStatus.SUCCESS}`. Received status: `{compile_run_status}. You can"\n                f" view the full status of your dbt Cloud run at {compile_run['href']}. Once it has"\n                " successfully completed, reload your Dagster definitions. 
If your run has failed,"\n                " you must manually refresh the cache using the `dagster-dbt"\n                " cache-compile-references` CLI."\n            )\n\n        compile_run_has_generate_docs = compile_run["trigger"]["generate_docs_override"]\n\n        compile_job_materialization_command_step = len(compile_run["run_steps"])\n        if compile_run_has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_id, compile_job_materialization_command_step\n\n    def _compile_dbt_cloud_job(self, dbt_cloud_job: Mapping[str, Any]) -> Tuple[int, int]:\n        # Retrieve the filters options from the dbt Cloud job's materialization command.\n        #\n        # There are three filters: `--select`, `--exclude`, and `--selector`.\n        materialization_command = self._job_commands[self._job_materialization_command_step]\n        parsed_args = DbtCloudCacheableAssetsDefinition.parse_dbt_command(materialization_command)\n        dbt_compile_options = DbtCloudCacheableAssetsDefinition.get_compile_filters(\n            parsed_args=parsed_args\n        )\n\n        # Add the partition variable as a variable to the dbt Cloud job command.\n        #\n        # If existing variables passed through the dbt Cloud job's command, an error will be\n        # raised. Since these are static variables anyways, they can be moved to the\n        # `dbt_project.yml` without loss of functionality.\n        #\n        # Since we're only doing this to generate the dependency structure, just use an arbitrary\n        # partition key (e.g. 
the last one) to retrieve the partition variable.\n        if parsed_args.vars and parsed_args.vars != "{}":\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{dbt_cloud_job['name']}' ({dbt_cloud_job['id']}) must not have"\n                " variables defined from `--vars` in its `dbt run` or `dbt build` command."\n                " Instead, declare the variables in the `dbt_project.yml` file. Received commands:"\n                f" {self._job_commands}."\n            )\n\n        if self._partitions_def and self._partition_key_to_vars_fn:\n            last_partition_key = self._partitions_def.get_last_partition_key()\n            if last_partition_key is None:\n                check.failed("PartitionsDefinition has no partitions")\n            partition_var = self._partition_key_to_vars_fn(last_partition_key)\n\n            dbt_compile_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n        # We need to retrieve the dependency structure for the assets in the dbt Cloud project.\n        # However, we can't just use the dependency structure from the latest run, because\n        # this historical structure may not be up-to-date with the current state of the project.\n        #\n        # By always doing a compile step, we can always get the latest dependency structure.\n        # This incurs some latency, but at least it doesn't run through the entire materialization\n        # process.\n        dbt_compile_command = f"dbt compile {' '.join(dbt_compile_options)}"\n        compile_run_dbt_output = self._dbt_cloud.run_job_and_poll(\n            job_id=self._job_id,\n            cause="Generating software-defined assets for Dagster.",\n            steps_override=[dbt_compile_command],\n        )\n\n        # Target the compile execution step when retrieving run artifacts, rather than assuming\n        # that the last step is the correct target.\n        #\n        # Here, we ignore the `dbt docs generate` 
step.\n        compile_job_materialization_command_step = len(\n            compile_run_dbt_output.run_details.get("run_steps", [])\n        )\n        if self._has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_dbt_output.run_id, compile_job_materialization_command_step\n\n    def _get_dbt_nodes_and_dependencies(\n        self,\n    ) -> Tuple[Mapping[str, Any], Mapping[str, FrozenSet[str]]]:\n        """For a given dbt Cloud job, fetch the latest run's dependency structure of executed nodes."""\n        # Fetch information about the job.\n        job = self._dbt_cloud.get_job(job_id=self._job_id)\n        self._project_id = job["project_id"]\n        self._has_generate_docs = job["generate_docs"]\n\n        # We constraint the kinds of dbt Cloud jobs that we support running.\n        #\n        # A simple constraint is that we only support jobs that run multiple steps,\n        # but it must contain one of either `dbt run` or `dbt build`.\n        #\n        # As a reminder, `dbt deps` is automatically run before the job's configured commands.\n        # And if the settings are enabled, `dbt docs generate` and `dbt source freshness` can\n        # automatically run after the job's configured commands.\n        #\n        # These commands that execute before and after the job's configured commands do not count\n        # towards the single command constraint.\n        self._job_commands = job["execute_steps"]\n        self._job_materialization_command_step = (\n            DbtCloudCacheableAssetsDefinition.get_job_materialization_command_step(\n                execute_steps=self._job_commands\n            )\n        )\n\n        # Determine whether to use a cached compile run. 
This should only be set up if the user is\n        # using a GitHub action along with their dbt project.\n        dbt_cloud_job_env_vars = self._dbt_cloud.get_job_environment_variables(\n            project_id=self._project_id, job_id=self._job_id\n        )\n        compile_run_id = (\n            dbt_cloud_job_env_vars.get(DAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR, {})\n            .get("job", {})\n            .get("value")\n        )\n\n        compile_run_id, compile_job_materialization_command_step = (\n            # If a compile run is cached, then use it.\n            self._get_cached_compile_dbt_cloud_job_run(compile_run_id=int(compile_run_id))\n            if compile_run_id\n            # Otherwise, compile the dbt Cloud project in an ad-hoc manner.\n            else self._compile_dbt_cloud_job(dbt_cloud_job=job)\n        )\n\n        manifest_json = self._dbt_cloud.get_manifest(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n        run_results_json = self._dbt_cloud.get_run_results(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n\n        # Filter the manifest to only include the nodes that were executed.\n        dbt_nodes: Dict[str, Any] = {\n            **manifest_json.get("nodes", {}),\n            **manifest_json.get("sources", {}),\n            **manifest_json.get("metrics", {}),\n        }\n        executed_node_ids: Set[str] = set(\n            result["unique_id"] for result in run_results_json["results"]\n        )\n\n        # If there are no executed nodes, then there are no assets to generate.\n        # Inform the user to inspect their dbt Cloud job's command.\n        if not executed_node_ids:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{job['name']}' ({job['id']}) does not generate any "\n                "software-defined assets. 
Ensure that your dbt project has nodes to execute, "\n                "and that your dbt Cloud job's materialization command has the proper filter "\n                f"options applied. Received commands: {self._job_commands}."\n            )\n\n        # Generate the dependency structure for the executed nodes.\n        dbt_dependencies = get_deps(\n            dbt_nodes=dbt_nodes,\n            selected_unique_ids=executed_node_ids,\n            asset_resource_types=ASSET_RESOURCE_TYPES,\n        )\n\n        return dbt_nodes, dbt_dependencies\n\n    def _build_dbt_cloud_assets_cacheable_data(\n        self, dbt_nodes: Mapping[str, Any], dbt_dependencies: Mapping[str, FrozenSet[str]]\n    ) -> AssetsDefinitionCacheableData:\n        """Given all of the nodes and dependencies for a dbt Cloud job, build the cacheable\n        representation that generate the asset definition for the job.\n        """\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props):\n                return self._node_info_to_asset_key(dbt_resource_props)\n\n            @classmethod\n            def get_description(cls, dbt_resource_props):\n                # We shouldn't display the raw sql. 
Instead, inspect if dbt docs were generated,\n                # and attach metadata to link to the docs.\n                return default_description_fn(dbt_resource_props, display_raw_sql=False)\n\n            @classmethod\n            def get_group_name(cls, dbt_resource_props):\n                return self._node_info_to_group_fn(dbt_resource_props)\n\n            @classmethod\n            def get_freshness_policy(cls, dbt_resource_props):\n                return self._node_info_to_freshness_policy_fn(dbt_resource_props)\n\n            @classmethod\n            def get_auto_materialize_policy(cls, dbt_resource_props):\n                return self._node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n        (\n            asset_deps,\n            asset_ins,\n            asset_outs,\n            group_names_by_key,\n            freshness_policies_by_key,\n            auto_materialize_policies_by_key,\n            _,\n            fqns_by_output_name,\n            metadata_by_output_name,\n        ) = get_asset_deps(\n            dbt_nodes=dbt_nodes,\n            deps=dbt_dependencies,\n            # TODO: In the future, allow the IO manager to be specified.\n            io_manager_key=None,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n            manifest=None,\n        )\n\n        return AssetsDefinitionCacheableData(\n            # TODO: In the future, we should allow additional upstream assets to be specified.\n            keys_by_input_name={\n                input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n            },\n            keys_by_output_name={\n                output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n            },\n            internal_asset_deps={\n                asset_outs[asset_key][0]: asset_deps for asset_key, asset_deps in asset_deps.items()\n            },\n            # We don't rely on a static group name. 
Instead, we map over the dbt metadata to\n            # determine the group name for each asset.\n            group_name=None,\n            metadata_by_output_name={\n                output_name: self._build_dbt_cloud_assets_metadata(dbt_metadata)\n                for output_name, dbt_metadata in metadata_by_output_name.items()\n            },\n            # TODO: In the future, we should allow the key prefix to be specified.\n            key_prefix=None,\n            can_subset=True,\n            extra_metadata={\n                "job_id": self._job_id,\n                "job_commands": self._job_commands,\n                "job_materialization_command_step": self._job_materialization_command_step,\n                "group_names_by_output_name": {\n                    asset_outs[asset_key][0]: group_name\n                    for asset_key, group_name in group_names_by_key.items()\n                },\n                "fqns_by_output_name": fqns_by_output_name,\n            },\n            freshness_policies_by_output_name={\n                asset_outs[asset_key][0]: freshness_policy\n                for asset_key, freshness_policy in freshness_policies_by_key.items()\n            },\n            auto_materialize_policies_by_output_name={\n                asset_outs[asset_key][0]: auto_materialize_policy\n                for asset_key, auto_materialize_policy in auto_materialize_policies_by_key.items()\n            },\n        )\n\n    def _build_dbt_cloud_assets_metadata(self, dbt_metadata: Dict[str, Any]) -> MetadataUserInput:\n        metadata = {\n            "dbt Cloud Job": MetadataValue.url(\n                self._dbt_cloud.build_url_for_job(\n                    project_id=self._project_id,\n                    job_id=self._job_id,\n                )\n            ),\n        }\n\n        if self._has_generate_docs:\n            metadata["dbt Cloud Documentation"] = MetadataValue.url(\n                self._dbt_cloud.build_url_for_cloud_docs(\n                   
 job_id=self._job_id,\n                    resource_type=dbt_metadata["resource_type"],\n                    unique_id=dbt_metadata["unique_id"],\n                )\n            )\n\n        return metadata\n\n    def _build_dbt_cloud_assets_from_cacheable_data(\n        self, assets_definition_cacheable_data: AssetsDefinitionCacheableData\n    ) -> AssetsDefinition:\n        metadata = cast(Mapping[str, Any], assets_definition_cacheable_data.extra_metadata)\n        job_id = cast(int, metadata["job_id"])\n        job_commands = cast(List[str], list(metadata["job_commands"]))\n        job_materialization_command_step = cast(int, metadata["job_materialization_command_step"])\n        group_names_by_output_name = cast(Mapping[str, str], metadata["group_names_by_output_name"])\n        fqns_by_output_name = cast(Mapping[str, List[str]], metadata["fqns_by_output_name"])\n\n        @multi_asset(\n            name=f"dbt_cloud_job_{job_id}",\n            deps=list((assets_definition_cacheable_data.keys_by_input_name or {}).values()),\n            outs={\n                output_name: AssetOut(\n                    key=asset_key,\n                    group_name=group_names_by_output_name.get(output_name),\n                    freshness_policy=(\n                        assets_definition_cacheable_data.freshness_policies_by_output_name or {}\n                    ).get(\n                        output_name,\n                    ),\n                    auto_materialize_policy=(\n                        assets_definition_cacheable_data.auto_materialize_policies_by_output_name\n                        or {}\n                    ).get(\n                        output_name,\n                    ),\n                    metadata=(assets_definition_cacheable_data.metadata_by_output_name or {}).get(\n                        output_name\n                    ),\n                    is_required=False,\n                )\n                for output_name, asset_key in (\n                   
 assets_definition_cacheable_data.keys_by_output_name or {}\n                ).items()\n            },\n            internal_asset_deps={\n                output_name: set(asset_deps)\n                for output_name, asset_deps in (\n                    assets_definition_cacheable_data.internal_asset_deps or {}\n                ).items()\n            },\n            partitions_def=self._partitions_def,\n            can_subset=assets_definition_cacheable_data.can_subset,\n            required_resource_keys={"dbt_cloud"},\n            compute_kind="dbt",\n        )\n        def _assets(context: AssetExecutionContext):\n            dbt_cloud = cast(DbtCloudClient, context.resources.dbt_cloud)\n\n            # Add the partition variable as a variable to the dbt Cloud job command.\n            dbt_options: List[str] = []\n            if context.has_partition_key and self._partition_key_to_vars_fn:\n                partition_var = self._partition_key_to_vars_fn(context.partition_key)\n\n                dbt_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n            # Prepare the materialization step to be overriden with the selection filter\n            materialization_command = job_commands[job_materialization_command_step]\n\n            # Map the selected outputs to dbt models that should be materialized.\n            #\n            # HACK: This selection filter works even if an existing `--select` is specified in the\n            # dbt Cloud job. 
We take advantage of the fact that the last `--select` will be used.\n            #\n            # This is not ideal, as the triggered run for the dbt Cloud job will still have both\n            # `--select` options when displayed in the UI, but parsing the command line argument\n            # to remove the initial select using argparse.\n            if len(context.selected_output_names) != len(\n                assets_definition_cacheable_data.keys_by_output_name or {}\n            ):\n                selected_models = [\n                    ".".join(fqns_by_output_name[output_name])\n                    for output_name in context.selected_output_names\n                ]\n\n                dbt_options.append(f"--select {' '.join(sorted(selected_models))}")\n\n                # If the `--selector` option is used, we need to remove it from the command, since\n                # it disables other selection options from being functional.\n                #\n                # See https://docs.getdbt.com/reference/node-selection/syntax for details.\n                split_materialization_command = shlex.split(materialization_command)\n                if "--selector" in split_materialization_command:\n                    idx = split_materialization_command.index("--selector")\n\n                    materialization_command = " ".join(\n                        split_materialization_command[:idx]\n                        + split_materialization_command[idx + 2 :]\n                    )\n\n            job_commands[job_materialization_command_step] = (\n                f"{materialization_command} {' '.join(dbt_options)}".strip()\n            )\n\n            # Run the dbt Cloud job to rematerialize the assets.\n            dbt_cloud_output = dbt_cloud.run_job_and_poll(\n                job_id=job_id,\n                cause=f"Materializing software-defined assets in Dagster run {context.run_id[:8]}",\n                steps_override=job_commands,\n            )\n\n            # 
Target the materialization step when retrieving run artifacts, rather than assuming\n            # that the last step is the correct target.\n            #\n            # We ignore the commands in front of the materialization command. And again, we ignore\n            # the `dbt docs generate` step.\n            materialization_command_step = len(dbt_cloud_output.run_details.get("run_steps", []))\n            materialization_command_step -= len(job_commands) - job_materialization_command_step - 1\n            if dbt_cloud_output.run_details.get("job", {}).get("generate_docs"):\n                materialization_command_step -= 1\n\n            # TODO: Assume the run completely fails or completely succeeds.\n            # In the future, we can relax this assumption.\n            manifest_json = dbt_cloud.get_manifest(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n            run_results_json = self._dbt_cloud.get_run_results(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n\n            for result in run_results_json.get("results", []):\n                yield from result_to_events(\n                    result=result,\n                    docs_url=dbt_cloud_output.docs_url,\n                    node_info_to_asset_key=self._node_info_to_asset_key,\n                    manifest_json=manifest_json,\n                    # TODO: In the future, allow arbitrary mappings to Dagster output metadata from\n                    # the dbt metadata.\n                    extra_metadata=None,\n                    generate_asset_outputs=True,\n                )\n\n        return _assets\n\n\n
[docs]@experimental\n@experimental_param(param="partitions_def")\n@experimental_param(param="partition_key_to_vars_fn")\ndef load_assets_from_dbt_cloud_job(\n dbt_cloud: ResourceDefinition,\n job_id: int,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n) -> CacheableAssetsDefinition:\n """Loads a set of dbt models, managed by a dbt Cloud job, into Dagster assets. In order to\n determine the set of dbt models, the project is compiled to generate the necessary artifacts\n that define the dbt models and their dependencies.\n\n One Dagster asset is created for each dbt model.\n\n Args:\n dbt_cloud (ResourceDefinition): The dbt Cloud resource to use to connect to the dbt Cloud API.\n job_id (int): The ID of the dbt Cloud job to load assets from.\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt metadata and returns the AssetKey that you want to represent a given model or\n source. 
By default: dbt model -> AssetKey([model_name]) and\n dbt source -> AssetKey([source_name, table_name])\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]):\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]):\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. 
{"run_date": "2022-01-01"})\n\n Returns:\n CacheableAssetsDefinition: A definition for the loaded assets.\n\n Examples:\n .. code-block:: python\n\n from dagster import repository\n from dagster_dbt import dbt_cloud_resource, load_assets_from_dbt_cloud_job\n\n DBT_CLOUD_JOB_ID = 1234\n\n dbt_cloud = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_API_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n dbt_cloud_assets = load_assets_from_dbt_cloud_job(\n dbt_cloud=dbt_cloud, job_id=DBT_CLOUD_JOB_ID\n )\n\n\n @repository\n def dbt_cloud_sandbox():\n return [dbt_cloud_assets]\n """\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n return DbtCloudCacheableAssetsDefinition(\n dbt_cloud_resource_def=dbt_cloud,\n job_id=job_id,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.ops

\nfrom typing import List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom ..utils import generate_materializations\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import DbtCloudOutput\n\n\nclass DbtCloudRunOpConfig(Config):\n    job_id: int = Field(\n        description=(\n            "The integer ID of the relevant dbt Cloud job. You can find this value by going to the"\n            " details page of your job in the dbt Cloud UI. It will be the final number in the url,"\n            " e.g.:    "\n            " https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/"\n        )\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes."\n        ),\n    )\n\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n required_resource_keys={"dbt_cloud"},\n ins={"start_after": In(Nothing)},\n out=Out(DbtCloudOutput, description="Parsed output from running the dbt Cloud job."),\n tags={"kind": "dbt_cloud"},\n)\ndef dbt_cloud_run_op(context, config: DbtCloudRunOpConfig):\n """Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\n fails or is otherwised stopped before succeeding, a `dagster.Failure` exception will be raised,\n and this op will fail.\n\n It requires the use of a 'dbt_cloud' resource, which is used to connect to the dbt Cloud API.\n\n **Config Options:**\n\n job_id (int)\n The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\n page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\n ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float)\n The time (in seconds) that will be waited between successive polls. Defaults to ``10``.\n poll_timeout (float)\n The maximum time (in seconds) that will waited before this operation is timed out. By\n default, this will never time out.\n yield_materializations (bool)\n If True, materializations corresponding to the results of the dbt operation will be\n yielded when the solid executes. Defaults to ``True``.\n rasset_key_prefix (float)\n If provided and yield_materializations is True, these components will be used to "\n prefix the generated asset keys. Defaults to ["dbt"].\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n )\n run_dbt_nightly_sync = dbt_cloud_run_op.configured(\n {"job_id": 54321}, name="run_dbt_nightly_sync"\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def dbt_cloud():\n run_dbt_nightly_sync()\n\n\n """\n dbt_output = context.resources.dbt_cloud.run_job_and_poll(\n config.job_id, poll_interval=config.poll_interval, poll_timeout=config.poll_timeout\n )\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(\n dbt_output,\n metadata={\n "created_at": dbt_output.run_details["created_at"],\n "started_at": dbt_output.run_details["started_at"],\n "finished_at": dbt_output.run_details["finished_at"],\n "total_duration": dbt_output.run_details["duration"],\n "run_duration": dbt_output.run_details["run_duration"],\n },\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom enum import Enum\nfrom typing import Any, Mapping, Optional, Sequence, cast\nfrom urllib.parse import urlencode, urljoin\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    IAttachDifferentObjectToOpContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom .types import DbtCloudOutput\n\nDBT_DEFAULT_HOST = "https://cloud.getdbt.com/"\nDBT_API_V2_PATH = "api/v2/accounts/"\nDBT_API_V3_PATH = "api/v3/accounts/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\nclass DbtCloudRunStatus(str, Enum):\n    QUEUED = "Queued"\n    STARTING = "Starting"\n    RUNNING = "Running"\n    SUCCESS = "Success"\n    ERROR = "Error"\n    CANCELLED = "Cancelled"\n\n\n# TODO: This resource should be a wrapper over an existing client for a accessing dbt Cloud,\n# rather than using requests to the API directly.\nclass DbtCloudClient:\n    """This class exposes methods on top of the dbt Cloud REST API v2.\n\n    For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n    response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n    """\n\n    def __init__(\n        self,\n        auth_token: str,\n        account_id: int,\n        disable_schedule_on_trigger: bool = True,\n        request_max_retries: int = 3,\n        request_retry_delay: float = 0.25,\n        dbt_cloud_host: str = DBT_DEFAULT_HOST,\n        log: logging.Logger = get_dagster_logger(),\n        log_requests: bool = False,\n    ):\n        self._auth_token = auth_token\n        self._account_id = account_id\n        
self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n        self._request_max_retries = request_max_retries\n        self._request_retry_delay = request_retry_delay\n\n        self._dbt_cloud_host = dbt_cloud_host\n        self._log = log\n        self._log_requests = log_requests\n\n    @property\n    def api_v2_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V2_PATH)\n\n    @property\n    def api_v3_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V3_PATH)\n\n    def build_url_for_job(self, project_id: int, job_id: int) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"next/deploy/{self._account_id}/projects/{project_id}/jobs/{job_id}/",\n        )\n\n    def build_url_for_cloud_docs(self, job_id: int, resource_type: str, unique_id: str) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"/accounts/{self._account_id}/jobs/{job_id}/docs/#!/{resource_type}/{unique_id}",\n        )\n\n    def make_request(\n        self,\n        method: str,\n        endpoint: str,\n        data: Optional[Mapping[str, Any]] = None,\n        params: Optional[Mapping[str, Any]] = None,\n        return_text: bool = False,\n        base_url: Optional[str] = None,\n    ) -> Any:\n        """Creates and sends a request to the desired dbt Cloud API endpoint.\n\n        Args:\n            method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n            endpoint (str): The dbt Cloud API endpoint to send this request to.\n            data (Optional[Mapping[str, Any]]): JSON-formatable data string to be included in the request.\n            params (Optional[Mapping[str, Any]]): Payload to add to query string of the request.\n            return_text (bool): Override default behavior and return unparsed {"text": response.text}\n                blob instead of json.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        headers = {\n            "User-Agent": f"dagster-dbt/{__version__}",\n            "Content-Type": "application/json",\n            "Authorization": f"Bearer {self._auth_token}",\n        }\n        base_url = base_url or self.api_v2_base_url\n        url = urljoin(base_url, endpoint)\n\n        if self._log_requests:\n            self._log.debug(f"Making Request: method={method} url={url} data={data}")\n\n        num_retries = 0\n        while True:\n            try:\n                response = requests.request(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    data=json.dumps(data),\n                    params=params,\n                )\n                response.raise_for_status()\n                return {"text": response.text} if return_text else response.json()["data"]\n            except RequestException as e:\n                self._log.error("Request to dbt Cloud API failed: %s", e)\n                if num_retries == self._request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self._request_retry_delay)\n\n        raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n    def list_jobs(\n        self, project_id: int, order_by: Optional[str] = "-id"\n    ) -> Sequence[Mapping[str, Any]]:\n        """List all dbt jobs in a dbt Cloud 
project.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n\n        Returns:\n            List[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/jobs",\n            params={"project_id": project_id, "order_by": order_by},\n        )\n\n    def get_job(self, job_id: int) -> Mapping[str, Any]:\n        """Gets details about a given dbt job from the dbt Cloud API.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        return self.make_request("GET", f"{self._account_id}/jobs/{job_id}/")\n\n    def update_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Updates specific properties of a dbt job.\n\n        Documentation on the full set of potential parameters can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be changed.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n\n        Examples:\n        .. code-block:: python\n\n            # disable schedule for job with id=12345\n            my_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n        """\n        # API requires you to supply a bunch of values, so we can just use the current state\n        # as the defaults\n        job_data = self.get_job(job_id)\n        return self.make_request(\n            "POST", f"{self._account_id}/jobs/{job_id}/", data=deep_merge_dicts(job_data, kwargs)\n        )\n\n    def run_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Initializes a run for a job.\n\n        Overrides for specific properties can be set by passing in values to the kwargs. 
A full list\n        of overridable properties can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be overridden.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        self._log.info(f"Initializing run for job with job_id={job_id}")\n        if "cause" not in kwargs:\n            kwargs["cause"] = "Triggered via Dagster"\n        resp = self.make_request("POST", f"{self._account_id}/jobs/{job_id}/run/", data=kwargs)\n\n        has_schedule: bool = resp.get("job", {}).get("triggers", {}).get("schedule", False)\n        if has_schedule and self._disable_schedule_on_trigger:\n            self._log.info("Disabling dbt Cloud job schedule.")\n            self.update_job(job_id, triggers={"schedule": False})\n\n        self._log.info(\n            f"Run initialized with run_id={resp['id']}. View this run in "\n            f"the dbt Cloud UI: {resp['href']}"\n        )\n        return resp\n\n    def get_runs(\n        self,\n        include_related: Optional[Sequence[str]] = None,\n        job_id: Optional[int] = None,\n        order_by: Optional[str] = "-id",\n        offset: int = 0,\n        limit: int = 100,\n    ) -> Sequence[Mapping[str, object]]:\n        """Returns a list of runs from dbt Cloud. This can be optionally filtered to a specific job\n        using the job_definition_id. 
It supports pagination using offset and limit as well and\n        can be configured to load a variety of related information about the runs.\n\n        Args:\n            include_related (Optional[List[str]]): A list of resources to include in the response\n                from dbt Cloud. This is technically a required field according to the API, but it\n                can be passed with an empty list where it will only load the default run\n                information. Valid values are "trigger", "job", "repository", and "environment".\n            job_definition_id (Optional[int]): This method can be optionally filtered to only\n                load runs for a specific job id if it is included here. If omitted it will pull\n                runs for every job.\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n            offset (int): An offset to apply when listing runs. Can be used to paginate results\n                when combined with order_by and limit. Defaults to 0.\n            limit (int): Limits the amount of rows returned by the API. 
Defaults to 100.\n\n        Returns:\n            List[Dict[str, Any]]: A list of dictionaries containing the runs and any included\n                related information.\n        """\n        query_dict = {\n            "include_related": include_related or [],\n            "order_by": order_by,\n            "offset": offset,\n            "limit": limit,\n        }\n        if job_id:\n            query_dict["job_definition_id"] = job_id\n        return self.make_request("GET", f"{self._account_id}/runs/?{urlencode(query_dict)}")\n\n    def get_run(\n        self, run_id: int, include_related: Optional[Sequence[str]] = None\n    ) -> Mapping[str, Any]:\n        """Gets details about a specific job run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            include_related (List[str]): List of related fields to pull with the run. Valid values\n                are "trigger", "job", and "debug_logs".\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        query_params = f"?include_related={','.join(include_related)}" if include_related else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/{query_params}",\n        )\n\n    def get_run_steps(self, run_id: int) -> Sequence[str]:\n        """Gets the steps of an initialized dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            List[str, Any]: List of commands for each step of the run.\n        """\n        run_details = self.get_run(run_id, include_related=["trigger", "job"])\n        steps = run_details["job"]["execute_steps"]\n        steps_override = run_details["trigger"]["steps_override"]\n        return steps_override or steps\n\n    def cancel_run(self, run_id: int) -> Mapping[str, Any]:\n        """Cancels a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        self._log.info(f"Cancelling run with id '{run_id}'")\n        return self.make_request("POST", f"{self._account_id}/runs/{run_id}/cancel/")\n\n    def list_run_artifacts(self, run_id: int, step: Optional[int] = None) -> Sequence[str]:\n        """Lists the paths of the available run artifacts from a completed dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run\n\n        Returns:\n            List[str]: List of the paths of the available run artifacts\n        """\n        query_params = f"?step={step}" if step else ""\n        return cast(\n            list,\n            self.make_request(\n                "GET",\n                f"{self._account_id}/runs/{run_id}/artifacts/{query_params}",\n                data={"step": step} if step else None,\n            ),\n        )\n\n    def get_run_artifact(self, run_id: int, path: str, step: Optional[int] = None) -> str:\n        """The string contents of a run artifact from a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            path (str): The path to this run artifact (e.g. 'run/my_new_project/models/example/my_first_dbt_model.sql')\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            List[str]: List of the names of the available run artifacts\n        """\n        query_params = f"?step={step}" if step else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/artifacts/{path}{query_params}",\n            data={"step": step} if step else None,\n            return_text=True,\n        )["text"]\n\n    def get_manifest(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a manifest.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the manifest.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "manifest.json", step=step))\n\n    def get_run_results(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a run_results.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the run_results.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "run_results.json", step=step))\n\n    def poll_run(\n        self,\n        run_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        href: Optional[str] = None,\n    ) -> Mapping[str, Any]:\n        """Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if the\n        run does not complete successfully.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forver.\n            href (str): For internal use, generally should not be set manually.\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        status: Optional[str] = None\n\n        if href is None:\n            href = self.get_run(run_id).get("href")\n        assert isinstance(href, str), "Run must have an href"\n\n        poll_start = datetime.datetime.now()\n        try:\n            while True:\n                run_details = self.get_run(run_id)\n                status = run_details["status_humanized"]\n                self._log.info(f"Polled run {run_id}. Status: [{status}]")\n\n                # completed successfully\n                if status == DbtCloudRunStatus.SUCCESS:\n                    return self.get_run(run_id, include_related=["job", "trigger", "run_steps"])\n                elif status in [DbtCloudRunStatus.ERROR, DbtCloudRunStatus.CANCELLED]:\n                    break\n                elif status not in [\n                    DbtCloudRunStatus.QUEUED,\n                    DbtCloudRunStatus.STARTING,\n                    DbtCloudRunStatus.RUNNING,\n                ]:\n                    check.failed(f"Received unexpected status '{status}'. This should never happen")\n\n                if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n                    seconds=poll_timeout\n                ):\n                    self.cancel_run(run_id)\n                    raise Failure(\n                        f"Run {run_id} timed out after "\n                        f"{datetime.datetime.now() - poll_start}. 
Attempted to cancel.",\n                        metadata={"run_page_url": MetadataValue.url(href)},\n                    )\n\n                # Sleep for the configured time interval before polling again.\n                time.sleep(poll_interval)\n        finally:\n            if status not in (\n                DbtCloudRunStatus.SUCCESS,\n                DbtCloudRunStatus.ERROR,\n                DbtCloudRunStatus.CANCELLED,\n            ):\n                self.cancel_run(run_id)\n\n        run_details = self.get_run(run_id, include_related=["trigger"])\n        raise Failure(\n            f"Run {run_id} failed. Status Message: {run_details['status_message']}",\n            metadata={\n                "run_details": MetadataValue.json(run_details),\n                "run_page_url": MetadataValue.url(href),\n            },\n        )\n\n    def run_job_and_poll(\n        self,\n        job_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        **kwargs,\n    ) -> DbtCloudOutput:\n        """Runs a dbt Cloud job and polls until it completes. Will raise a `dagster.Failure` exception\n        if the run does not complete successfully.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forver.\n\n        Returns:\n            :py:class:`~DbtCloudOutput`: Class containing details about the specific job run and the\n                parsed run results.\n        """\n        run_details = self.run_job(job_id, **kwargs)\n        run_id = run_details["id"]\n        href = run_details["href"]\n        final_run_details = self.poll_run(\n            run_id, poll_interval=poll_interval, poll_timeout=poll_timeout, href=href\n        )\n        try:\n            run_results = self.get_run_results(run_id)\n        # if you fail to get run_results for this job, just leave it empty\n        except Failure:\n            self._log.info(\n                "run_results.json not available for this run. Defaulting to empty value."\n            )\n            run_results = {}\n        output = DbtCloudOutput(run_details=final_run_details, result=run_results)\n        if output.docs_url:\n            self._log.info(f"Docs for this run can be viewed here: {output.docs_url}")\n        return output\n\n    def get_job_environment_variables(self, project_id: int, job_id: int) -> Mapping[str, Any]:\n        """Get the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/projects/{project_id}/environment-variables/job",\n            params={"job_definition_id": job_id},\n            base_url=self.api_v3_base_url,\n        )\n\n    def set_job_environment_variable(\n        self, project_id: int, job_id: int, environment_variable_id: int, name: str, value: str\n    ) -> Mapping[str, Any]:\n        """Set the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            name (str): The name of the environment variable to set.\n            value (str): The raw value of the environment variable.\n        """\n        return self.make_request(\n            "POST",\n            f"{self._account_id}/projects/{project_id}/environment-variables/{environment_variable_id}",\n            data={\n                "id": environment_variable_id,\n                "account_id": self._account_id,\n                "project_id": project_id,\n                "job_definition_id": job_id,\n                "type": "job",\n                "name": name,\n                "raw_value": value,\n            },\n            base_url=self.api_v3_base_url,\n        )\n\n\nclass DbtCloudResource(DbtCloudClient):\n    pass\n\n\n
[docs]class DbtCloudClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """This resource helps interact with dbt Cloud connectors."""\n\n auth_token: str = Field(\n description=(\n "dbt Cloud API Token. User tokens can be found in the [dbt Cloud"\n " UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud"\n " Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for"\n " instructions on creating a Service Account token."\n ),\n )\n account_id: int = Field(\n description=(\n "dbt Cloud Account ID. This value can be found in the url of a variety of views in"\n " the dbt Cloud UI, e.g."\n " https://cloud.getdbt.com/#/accounts/{account_id}/settings/."\n ),\n )\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any job that is triggered using this "\n "resource to automatically disable its schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the dbt Cloud API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n dbt_cloud_host: str = Field(\n default=DBT_DEFAULT_HOST,\n description=(\n "The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/)."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_dbt_client(self) -> DbtCloudClient:\n context = self.get_resource_context()\n assert context.log\n\n return DbtCloudClient(\n auth_token=self.auth_token,\n account_id=self.account_id,\n disable_schedule_on_trigger=self.disable_schedule_on_trigger,\n request_max_retries=self.request_max_retries,\n request_retry_delay=self.request_retry_delay,\n log=context.log,\n dbt_cloud_host=self.dbt_cloud_host,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_dbt_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DbtCloudClientResource.to_config_schema(),\n description="This resource helps interact with dbt Cloud connectors",\n)\ndef dbt_cloud_resource(context) -> DbtCloudResource:\n """This resource allows users to programatically interface with the dbt Cloud Administrative REST\n API (v2) to launch jobs and monitor their progress. This currently implements only a subset of\n the functionality exposed by the API.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def my_dbt_cloud_job():\n ...\n """\n return DbtCloudResource(\n auth_token=context.resource_config["auth_token"],\n account_id=context.resource_config["account_id"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n dbt_cloud_host=context.resource_config["dbt_cloud_host"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.resources"}}, "core": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources

\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._annotations import deprecated, public\nfrom dagster._config.pythonic_config import ConfigurableResource, IAttachDifferentObjectToOpContext\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom ..dbt_resource import DbtClient\nfrom .types import DbtCliOutput\nfrom .utils import (\n    DEFAULT_DBT_TARGET_PATH,\n    execute_cli,\n    execute_cli_stream,\n    parse_manifest,\n    parse_run_results,\n    remove_run_results,\n)\n\nDEFAULT_DBT_EXECUTABLE = "dbt"\n\n# The set of dbt cli commands that result in the creation of a run_results.json output file\n# https://docs.getdbt.com/reference/artifacts/run-results-json\nDBT_RUN_RESULTS_COMMANDS = ["run", "test", "seed", "snapshot", "docs generate", "build"]\n\n# The following config fields correspond to flags that apply to all dbt CLI commands. For details\n# on dbt CLI flags, see\n# https://github.com/fishtown-analytics/dbt/blob/1f8e29276e910c697588c43f08bc881379fff178/core/dbt/main.py#L260-L329\n\nCOMMON_OPTION_KEYS = {\n    "warn_error",\n    "dbt_executable",\n    "ignore_handled_error",\n    "target_path",\n    "docs_url",\n    "json_log_format",\n    "capture_logs",\n    "debug",\n}\n\n\nclass ConfigurableResourceWithCliFlags(ConfigurableResource):\n    project_dir: str = Field(\n        default=".",\n        description=(\n            "Which directory to look in for the dbt_project.yml file. Default is the current "\n            "working directory and its parents."\n        ),\n    )\n    profiles_dir: Optional[str] = Field(\n        default=None,\n        description=(\n            "Which directory to look in for the profiles.yml file. 
Default = $DBT_PROFILES_DIR or "\n            "$HOME/.dbt"\n        ),\n    )\n    profile: Optional[str] = Field(\n        default=None, description="Which profile to load. Overrides setting in dbt_project.yml."\n    )\n    target: Optional[str] = Field(\n        default=None, description="Which target to load for the given profile."\n    )\n    vars: Optional[Mapping[str, Any]] = Field(\n        default=None,\n        description=(\n            "Supply variables to the project. This argument overrides variables defined in your "\n            "dbt_project.yml file. This argument should be a dictionary, eg. "\n            "{'my_variable': 'my_value'}"\n        ),\n    )\n    bypass_cache: bool = Field(\n        default=False, description="If set, bypass the adapter-level cache of database state"\n    )\n    warn_error: bool = Field(\n        default=False,\n        description=(\n            "If dbt would normally warn, instead raise an exception. Examples include --models "\n            "that selects nothing, deprecations, configurations with no associated models, "\n            "invalid test configurations, and missing sources/refs in tests."\n        ),\n    )\n    dbt_executable: str = Field(\n        default=DEFAULT_DBT_EXECUTABLE,\n        description=f"Path to the dbt executable. Default is {DEFAULT_DBT_EXECUTABLE}",\n    )\n    ignore_handled_error: bool = Field(\n        default=False,\n        description=(\n            "When True, will not raise an exception when the dbt CLI returns error code 1. 
"\n            "Default is False."\n        ),\n    )\n    target_path: str = Field(\n        default=DEFAULT_DBT_TARGET_PATH,\n        description=(\n            "The directory path for target if different from the default `target-path` in "\n            "your dbt project configuration file."\n        ),\n    )\n    docs_url: Optional[str] = Field(\n        default=None, description="The url for where dbt docs are being served for this project."\n    )\n    json_log_format: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--log-format json` flag, allowing "\n            "Dagster to parse the log messages and emit simpler log messages to the event log."\n        ),\n    )\n    capture_logs: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--capture-output` flag, allowing "\n            "Dagster to capture the logs and emit them to the event log."\n        ),\n    )\n    debug: bool = Field(\n        default=False,\n        description=(\n            "When True, dbt will invoked with the `--debug` flag, which will print "\n            "additional debug information to the console."\n        ),\n    )\n\n\nclass DbtCliClient(DbtClient):\n    """A resource that allows you to execute dbt cli commands.\n\n    For the most up-to-date documentation on the specific parameters available to you for each\n    command, check out the dbt docs:\n\n    https://docs.getdbt.com/reference/commands/run\n\n    To use this as a dagster resource, we recommend using\n    :func:`dbt_cli_resource <dagster_dbt.dbt_cli_resource>`.\n    """\n\n    def __init__(\n        self,\n        executable: str,\n        default_flags: Mapping[str, Any],\n        warn_error: bool,\n        ignore_handled_error: bool,\n        target_path: str,\n        logger: Optional[Any] = None,\n        docs_url: Optional[str] = None,\n        json_log_format: bool = True,\n        
capture_logs: bool = True,\n        debug: bool = False,\n    ):\n        self._default_flags = default_flags\n        self._executable = executable\n        self._warn_error = warn_error\n        self._ignore_handled_error = ignore_handled_error\n        self._target_path = target_path\n        self._docs_url = docs_url\n        self._json_log_format = json_log_format\n        self._capture_logs = capture_logs\n        self._debug = debug\n        super().__init__(logger)\n\n    @property\n    def default_flags(self) -> Mapping[str, Any]:\n        """A set of params populated from resource config that are passed as flags to each dbt CLI command."""\n        return self._format_params(self._default_flags, replace_underscores=True)\n\n    @property\n    def strict_flags(self) -> Set[str]:\n        """A set of flags that should not be auto-populated from the default flags unless they are\n        arguments to the associated function.\n        """\n        return {"models", "exclude", "select"}\n\n    def _get_flags_dict(self, kwargs) -> Mapping[str, Any]:\n        extra_flags = {} if kwargs is None else kwargs\n\n        # remove default flags that are declared as "strict" and not explicitly passed in\n        default_flags = {\n            k: v\n            for k, v in self.default_flags.items()\n            if not (k in self.strict_flags and k not in extra_flags)\n        }\n\n        return merge_dicts(\n            default_flags, self._format_params(extra_flags, replace_underscores=True)\n        )\n\n    @public\n    def cli(self, command: str, **kwargs) -> DbtCliOutput:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        command = check.str_param(command, "command")\n        return execute_cli(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            target_path=self._target_path,\n            docs_url=self._docs_url,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        )\n\n    def cli_stream_json(self, command: str, **kwargs) -> Iterator[Mapping[str, Any]]:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n        """\n        check.invariant(self._json_log_format, "Cannot stream JSON if json_log_format is False.")\n        for event in execute_cli_stream(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        ):\n            if event.parsed_json_line is not None:\n                yield event.parsed_json_line\n\n    @public\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n            select (List[str], optional): the models to include in compilation.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("compile", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``run`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n            select (List[str], optional): the models to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("run", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("snapshot", select=select, exclude=exclude, **kwargs)\n\n    @public\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n            select (List[str], optional): the models to include in testing.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        if data and schema:\n            # do not include these arguments if both are True, as these are deprecated in later\n            # versions of dbt, and for older versions the functionality is the same regardless of\n            # if both are set or neither are set.\n            return self.cli("test", models=models, exclude=exclude, select=select, **kwargs)\n        return self.cli(\n            "test",\n            models=models,\n            exclude=exclude,\n            data=data,\n            schema=schema,\n            select=select,\n            **kwargs,\n        )\n\n    @public\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. 
Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("seed", show=show, select=select, exclude=exclude, **kwargs)\n\n    @public\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("ls", select=select, models=models, exclude=exclude, **kwargs)\n\n    @public\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``build`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("build", select=select, **kwargs)\n\n    @public\n    def freshness(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``source snapshot-freshness`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the sources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("source snapshot-freshness", select=select, **kwargs)\n\n    @public\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtCliOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("docs generate", compile=compile_project, **kwargs)\n\n    @public\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtCliOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli(f"run-operation {macro}", args=args, **kwargs)\n\n    @public\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_run_results(project_dir, target_path)\n\n    @public\n    def remove_run_results_json(self, **kwargs):\n        """Remove the run_results.json file from previous runs (if it exists)."""\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        remove_run_results(project_dir, target_path)\n\n    @public\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_manifest(project_dir, 
target_path)\n\n\nclass DbtCliClientResource(ConfigurableResourceWithCliFlags, IAttachDifferentObjectToOpContext):\n    """Resource which issues dbt CLI commands against a configured dbt project."""\n\n    class Config:\n        extra = "allow"\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    def get_dbt_client(self) -> DbtCliClient:\n        context = self.get_resource_context()\n        default_flags = {\n            k: v\n            for k, v in self._get_non_none_public_field_values().items()\n            if k not in COMMON_OPTION_KEYS\n        }\n\n        return DbtCliClient(\n            executable=self.dbt_executable,\n            default_flags=default_flags,\n            warn_error=self.warn_error,\n            ignore_handled_error=self.ignore_handled_error,\n            target_path=self.target_path,\n            docs_url=self.docs_url,\n            logger=context.log,\n            json_log_format=self.json_log_format,\n            capture_logs=self.capture_logs,\n            debug=self.debug,\n        )\n\n    def get_object_to_set_on_execution_context(self) -> Any:\n        return self.get_dbt_client()\n\n\n
[docs]@deprecated(breaking_version="0.21", additional_warn_text="Use DbtCliResource instead.")\n@dagster_maintained_resource\n@resource(config_schema=DbtCliClientResource.to_config_schema())\ndef dbt_cli_resource(context) -> DbtCliClient:\n """This resource issues dbt CLI commands against a configured dbt project. It is deprecated\n in favor of :py:class:`~dagster_dbt.DbtCliResource`.\n """\n # all config options that are intended to be used as flags for dbt commands\n\n default_flags = {\n k: v for k, v in context.resource_config.items() if k not in COMMON_OPTION_KEYS\n }\n return DbtCliClient(\n executable=context.resource_config["dbt_executable"],\n default_flags=default_flags,\n warn_error=context.resource_config["warn_error"],\n ignore_handled_error=context.resource_config["ignore_handled_error"],\n target_path=context.resource_config["target_path"],\n logger=context.log,\n docs_url=context.resource_config.get("docs_url"),\n capture_logs=context.resource_config["capture_logs"],\n json_log_format=context.resource_config["json_log_format"],\n debug=context.resource_config["debug"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/core/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources"}, "resources_v2": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources_v2

\nimport atexit\nimport contextlib\nimport os\nimport shutil\nimport subprocess\nimport sys\nimport uuid\nfrom contextlib import suppress\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import (\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Union,\n)\n\nimport dateutil.parser\nimport orjson\nfrom dagster import (\n    AssetCheckResult,\n    AssetCheckSeverity,\n    AssetObservation,\n    AssetsDefinition,\n    ConfigurableResource,\n    Output,\n    get_dagster_logger,\n)\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidPropertyError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dbt.contracts.results import NodeStatus, TestStatus\nfrom dbt.node_types import NodeType\nfrom packaging import version\nfrom pydantic import Field, root_validator, validator\nfrom typing_extensions import Literal\n\nfrom ..asset_utils import (\n    get_manifest_and_translator_from_dbt_assets,\n    is_asset_check_from_dbt_resource_props,\n    output_name_fn,\n)\nfrom ..dagster_dbt_translator import DagsterDbtTranslator\nfrom ..dbt_manifest import DbtManifestParam, validate_manifest\nfrom ..errors import DagsterDbtCliRuntimeError\nfrom ..utils import ASSET_RESOURCE_TYPES, get_dbt_resource_props_by_dbt_unique_id_from_manifest\n\nlogger = get_dagster_logger()\n\n\nDBT_PROJECT_YML_NAME = "dbt_project.yml"\nDBT_PROFILES_YML_NAME = "profiles.yml"\nPARTIAL_PARSE_FILE_NAME = "partial_parse.msgpack"\n\n\ndef _get_dbt_target_path() -> Path:\n    return Path(os.getenv("DBT_TARGET_PATH", "target"))\n\n\n
[docs]@dataclass\nclass DbtCliEventMessage:\n """The representation of a dbt CLI event.\n\n Args:\n raw_event (Dict[str, Any]): The raw event dictionary.\n See https://docs.getdbt.com/reference/events-logging#structured-logging for more\n information.\n """\n\n raw_event: Dict[str, Any]\n\n @classmethod\n def from_log(cls, log: str) -> "DbtCliEventMessage":\n """Parse an event according to https://docs.getdbt.com/reference/events-logging#structured-logging.\n\n We assume that the log format is json.\n """\n raw_event: Dict[str, Any] = orjson.loads(log)\n\n return cls(raw_event=raw_event)\n\n def __str__(self) -> str:\n return self.raw_event["info"]["msg"]\n\n
[docs] @public\n def to_default_asset_events(\n self,\n manifest: DbtManifestParam,\n dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),\n ) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:\n """Convert a dbt CLI event to a set of corresponding Dagster events.\n\n Args:\n manifest (Union[Mapping[str, Any], str, Path]): The dbt manifest blob.\n dagster_dbt_translator (DagsterDbtTranslator): Optionally, a custom translator for\n linking dbt nodes to Dagster assets.\n\n Returns:\n Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.\n - Output for refables (e.g. models, seeds, snapshots.)\n - AssetObservation for dbt test results that are not enabled as asset checks.\n - AssetCheckResult for dbt test results that are enabled as asset checks.\n """\n if self.raw_event["info"]["level"] == "debug":\n return\n\n event_node_info: Dict[str, Any] = self.raw_event["data"].get("node_info")\n if not event_node_info:\n return\n\n manifest = validate_manifest(manifest)\n\n if not manifest:\n logger.info(\n "No dbt manifest was provided. 
Dagster events for dbt tests will not be created."\n )\n\n unique_id: str = event_node_info["unique_id"]\n node_resource_type: str = event_node_info["resource_type"]\n node_status: str = event_node_info["node_status"]\n\n is_node_successful = node_status == NodeStatus.Success\n is_node_finished = bool(event_node_info.get("node_finished_at"))\n if node_resource_type in NodeType.refable() and is_node_successful:\n started_at = dateutil.parser.isoparse(event_node_info["node_started_at"])\n finished_at = dateutil.parser.isoparse(event_node_info["node_finished_at"])\n duration_seconds = (finished_at - started_at).total_seconds()\n\n yield Output(\n value=None,\n output_name=output_name_fn(event_node_info),\n metadata={\n "unique_id": unique_id,\n "Execution Duration": duration_seconds,\n },\n )\n elif manifest and node_resource_type == NodeType.Test and is_node_finished:\n upstream_unique_ids: List[str] = manifest["parent_map"][unique_id]\n test_resource_props = manifest["nodes"][unique_id]\n metadata = {"unique_id": unique_id, "status": node_status}\n\n is_asset_check = is_asset_check_from_dbt_resource_props(test_resource_props)\n attached_node_unique_id = test_resource_props.get("attached_node")\n is_generic_test = bool(attached_node_unique_id)\n\n if is_asset_check and is_generic_test:\n is_test_successful = node_status == TestStatus.Pass\n severity = AssetCheckSeverity(test_resource_props["config"]["severity"].upper())\n\n attached_node_resource_props: Dict[str, Any] = manifest["nodes"].get(\n attached_node_unique_id\n ) or manifest["sources"].get(attached_node_unique_id)\n attached_node_asset_key = dagster_dbt_translator.get_asset_key(\n attached_node_resource_props\n )\n\n yield AssetCheckResult(\n success=is_test_successful,\n asset_key=attached_node_asset_key,\n check_name=event_node_info["node_name"],\n metadata=metadata,\n severity=severity,\n )\n else:\n for upstream_unique_id in upstream_unique_ids:\n upstream_resource_props: Dict[str, Any] = 
manifest["nodes"].get(\n upstream_unique_id\n ) or manifest["sources"].get(upstream_unique_id)\n upstream_asset_key = dagster_dbt_translator.get_asset_key(\n upstream_resource_props\n )\n\n yield AssetObservation(\n asset_key=upstream_asset_key,\n metadata=metadata,\n )
\n\n\n
[docs]@dataclass\nclass DbtCliInvocation:\n """The representation of an invoked dbt command.\n\n Args:\n process (subprocess.Popen): The process running the dbt command.\n manifest (Mapping[str, Any]): The dbt manifest blob.\n project_dir (Path): The path to the dbt project.\n target_path (Path): The path to the dbt target folder.\n raise_on_error (bool): Whether to raise an exception if the dbt command fails.\n """\n\n process: subprocess.Popen\n manifest: Mapping[str, Any]\n dagster_dbt_translator: DagsterDbtTranslator\n project_dir: Path\n target_path: Path\n raise_on_error: bool\n\n @classmethod\n def run(\n cls,\n args: List[str],\n env: Dict[str, str],\n manifest: Mapping[str, Any],\n dagster_dbt_translator: DagsterDbtTranslator,\n project_dir: Path,\n target_path: Path,\n raise_on_error: bool,\n ) -> "DbtCliInvocation":\n # Attempt to take advantage of partial parsing. If there is a `partial_parse.msgpack` in\n # in the target folder, then copy it to the dynamic target path.\n #\n # This effectively allows us to skip the parsing of the manifest, which can be expensive.\n # See https://docs.getdbt.com/reference/programmatic-invocations#reusing-objects for more\n # details.\n current_target_path = _get_dbt_target_path()\n partial_parse_file_path = (\n current_target_path.joinpath(PARTIAL_PARSE_FILE_NAME)\n if current_target_path.is_absolute()\n else project_dir.joinpath(current_target_path, PARTIAL_PARSE_FILE_NAME)\n )\n partial_parse_destination_target_path = target_path.joinpath(PARTIAL_PARSE_FILE_NAME)\n\n if partial_parse_file_path.exists():\n logger.info(\n f"Copying `{partial_parse_file_path}` to `{partial_parse_destination_target_path}`"\n " to take advantage of partial parsing."\n )\n\n partial_parse_destination_target_path.parent.mkdir(parents=True, exist_ok=True)\n shutil.copy(partial_parse_file_path, partial_parse_destination_target_path)\n\n # Create a subprocess that runs the dbt CLI command.\n logger.info(f"Running dbt command: `{' 
'.join(args)}`.")\n process = subprocess.Popen(\n args=args,\n stdout=subprocess.PIPE,\n stderr=subprocess.STDOUT,\n env=env,\n cwd=project_dir,\n )\n\n # Add handler to terminate child process if running.\n # See https://stackoverflow.com/a/18258391 for more details.\n def cleanup_dbt_subprocess(process: subprocess.Popen) -> None:\n if process.returncode is None:\n logger.info(\n "The main process is being terminated, but the dbt command has not yet"\n " completed. Terminating the execution of dbt command."\n )\n process.terminate()\n process.wait()\n\n atexit.register(cleanup_dbt_subprocess, process)\n\n return cls(\n process=process,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n project_dir=project_dir,\n target_path=target_path,\n raise_on_error=raise_on_error,\n )\n\n
[docs] @public\n def wait(self) -> "DbtCliInvocation":\n """Wait for the dbt CLI process to complete.\n\n Returns:\n DbtCliInvocation: The current representation of the dbt CLI invocation.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"]).wait()\n """\n list(self.stream_raw_events())\n\n return self
\n\n
[docs] @public\n def is_successful(self) -> bool:\n """Return whether the dbt CLI process completed successfully.\n\n Returns:\n bool: True, if the dbt CLI process returns with a zero exit code, and False otherwise.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"], raise_on_error=False)\n\n if dbt_cli_invocation.is_successful():\n ...\n """\n return self.process.wait() == 0
\n\n
[docs] @public\n def stream(self) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:\n """Stream the events from the dbt CLI process and convert them to Dagster events.\n\n Returns:\n Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.\n - Output for refables (e.g. models, seeds, snapshots.)\n - AssetObservation for dbt test results that are not enabled as asset checks.\n - AssetCheckResult for dbt test results that are enabled as asset checks.\n\n Examples:\n .. code-block:: python\n\n from pathlib import Path\n from dagster_dbt import DbtCliResource, dbt_assets\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n """\n for event in self.stream_raw_events():\n yield from event.to_default_asset_events(\n manifest=self.manifest, dagster_dbt_translator=self.dagster_dbt_translator\n )
\n\n
[docs] @public\n def stream_raw_events(self) -> Iterator[DbtCliEventMessage]:\n """Stream the events from the dbt CLI process.\n\n Returns:\n Iterator[DbtCliEventMessage]: An iterator of events from the dbt CLI process.\n """\n with self.process.stdout or contextlib.nullcontext():\n for raw_line in self.process.stdout or []:\n log: str = raw_line.decode().strip()\n try:\n event = DbtCliEventMessage.from_log(log=log)\n\n # Re-emit the logs from dbt CLI process into stdout.\n sys.stdout.write(str(event) + "\\n")\n sys.stdout.flush()\n\n yield event\n except:\n # If we can't parse the log, then just emit it as a raw log.\n sys.stdout.write(log + "\\n")\n sys.stdout.flush()\n\n # Ensure that the dbt CLI process has completed.\n self._raise_on_error()
\n\n
[docs] @public\n def get_artifact(\n self,\n artifact: Union[\n Literal["manifest.json"],\n Literal["catalog.json"],\n Literal["run_results.json"],\n Literal["sources.json"],\n ],\n ) -> Dict[str, Any]:\n """Retrieve a dbt artifact from the target path.\n\n See https://docs.getdbt.com/reference/artifacts/dbt-artifacts for more information.\n\n Args:\n artifact (Union[Literal["manifest.json"], Literal["catalog.json"], Literal["run_results.json"], Literal["sources.json"]]): The name of the artifact to retrieve.\n\n Returns:\n Dict[str, Any]: The artifact as a dictionary.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"]).wait()\n\n # Retrieve the run_results.json artifact.\n run_results = dbt_cli_invocation.get_artifact("run_results.json")\n """\n artifact_path = self.target_path.joinpath(artifact)\n\n return orjson.loads(artifact_path.read_bytes())
\n\n def _raise_on_error(self) -> None:\n """Ensure that the dbt CLI process has completed. If the process has not successfully\n completed, then optionally raise an error.\n """\n if not self.is_successful() and self.raise_on_error:\n raise DagsterDbtCliRuntimeError(\n description=(\n f"The dbt CLI process failed with exit code {self.process.returncode}. Check"\n " the Dagster compute logs for the full information about the error, or view"\n f" the dbt debug log file: {self.target_path.joinpath('dbt.log')}."\n )\n )
\n\n\n
[docs]class DbtCliResource(ConfigurableResource):\n """A resource used to execute dbt CLI commands.\n\n Attributes:\n project_dir (str): The path to the dbt project directory. This directory should contain a\n `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more\n information.\n global_config_flags (List[str]): A list of global flags configuration to pass to the dbt CLI\n invocation. See https://docs.getdbt.com/reference/global-configs for a full list of\n configuration.\n profiles_dir (Optional[str]): The path to the directory containing your dbt `profiles.yml`.\n By default, the current working directory is used, which is the dbt project directory.\n See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n profile (Optional[str]): The profile from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n target (Optional[str]): The target from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n\n Examples:\n Creating a dbt resource with only a reference to ``project_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n Creating a dbt resource with a custom ``profiles_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n )\n\n Creating a dbt resource with a custom ``profile`` and ``target``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n profile="jaffle_shop",\n target="dev",\n )\n\n Creating a dbt resource with global configs, e.g. 
disabling colored logs with ``--no-use-color``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n global_config_flags=["--no-use-color"],\n )\n """\n\n project_dir: str = Field(\n ...,\n description=(\n "The path to your dbt project directory. This directory should contain a"\n " `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more"\n " information."\n ),\n )\n global_config_flags: List[str] = Field(\n default=[],\n description=(\n "A list of global flags configuration to pass to the dbt CLI invocation. See"\n " https://docs.getdbt.com/reference/global-configs for a full list of configuration."\n ),\n )\n profiles_dir: Optional[str] = Field(\n default=None,\n description=(\n "The path to the directory containing your dbt `profiles.yml`. By default, the current"\n " working directory is used, which is the dbt project directory."\n " See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for "\n " more information."\n ),\n )\n profile: Optional[str] = Field(\n default=None,\n description=(\n "The profile from your dbt `profiles.yml` to use for execution. See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n target: Optional[str] = Field(\n default=None,\n description=(\n "The target from your dbt `profiles.yml` to use for execution. 
See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n\n @classmethod\n def _validate_absolute_path_exists(cls, path: Union[str, Path]) -> Path:\n absolute_path = Path(path).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{path}' ('{absolute_path}') does not exist")\n\n return resolved_path\n\n @classmethod\n def _validate_path_contains_file(cls, path: Path, file_name: str, error_message: str):\n if not path.joinpath(file_name).exists():\n raise ValueError(error_message)\n\n @validator("project_dir", "profiles_dir", pre=True)\n def convert_path_to_str(cls, v: Any) -> Any:\n """Validate that the path is converted to a string."""\n if isinstance(v, Path):\n resolved_path = cls._validate_absolute_path_exists(v)\n\n absolute_path = Path(v).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{v}' ('{absolute_path}') does not exist")\n return os.fspath(resolved_path)\n\n return v\n\n @validator("project_dir")\n def validate_project_dir(cls, project_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(project_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROJECT_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROJECT_YML_NAME} file. Please"\n " specify a valid path to a dbt project."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @validator("profiles_dir")\n def validate_profiles_dir(cls, profiles_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(profiles_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROFILES_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROFILES_YML_NAME} file. 
Please"\n " specify a valid path to a dbt profile directory."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @root_validator(pre=True)\n def validate_dbt_version(cls, values: Dict[str, Any]) -> Dict[str, Any]:\n """Validate that the dbt version is supported."""\n from dbt.version import __version__ as dbt_version\n\n if version.parse(dbt_version) < version.parse("1.4.0"):\n raise ValueError(\n "To use `dagster_dbt.DbtCliResource`, you must use `dbt-core>=1.4.0`. Currently,"\n f" you are using `dbt-core=={dbt_version}`. Please install a compatible dbt-core"\n " version."\n )\n\n return values\n\n def _get_unique_target_path(self, *, context: Optional[OpExecutionContext]) -> Path:\n """Get a unique target path for the dbt CLI invocation.\n\n Args:\n context (Optional[OpExecutionContext]): The execution context.\n\n Returns:\n str: A unique target path for the dbt CLI invocation.\n """\n unique_id = str(uuid.uuid4())[:7]\n path = unique_id\n if context:\n path = f"{context.op.name}-{context.run_id[:7]}-{unique_id}"\n\n current_target_path = _get_dbt_target_path()\n\n return current_target_path.joinpath(path)\n\n
[docs] @public\n def cli(\n self,\n args: List[str],\n *,\n raise_on_error: bool = True,\n manifest: Optional[DbtManifestParam] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n context: Optional[OpExecutionContext] = None,\n ) -> DbtCliInvocation:\n """Create a subprocess to execute a dbt CLI command.\n\n Args:\n args (List[str]): The dbt CLI command to execute.\n raise_on_error (bool): Whether to raise an exception if the dbt CLI command fails.\n manifest (Optional[Union[Mapping[str, Any], str, Path]]): The dbt manifest blob. If an\n execution context from within `@dbt_assets` is provided to the context argument,\n then the manifest provided to `@dbt_assets` will be used.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): The translator to link dbt\n nodes to Dagster assets. If an execution context from within `@dbt_assets` is\n provided to the context argument, then the dagster_dbt_translator provided to\n `@dbt_assets` will be used.\n context (Optional[OpExecutionContext]): The execution context from within `@dbt_assets`.\n\n Returns:\n DbtCliInvocation: A invocation instance that can be used to retrieve the output of the\n dbt CLI command.\n\n Examples:\n Streaming Dagster events for dbt asset materializations and observations:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n\n Retrieving a dbt artifact after streaming the Dagster events:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context)\n\n yield from dbt_run_invocation.stream()\n\n # Retrieve the `run_results.json` dbt artifact as a dictionary:\n run_results_json = dbt_run_invocation.get_artifact("run_results.json")\n\n # Retrieve the `run_results.json` dbt artifact as a file path:\n run_results_path = dbt_run_invocation.target_path.joinpath("run_results.json")\n\n Customizing the asset materialization metadata when streaming the Dagster events:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_cli_invocation = dbt.cli(["run"], context=context)\n\n for dbt_event in dbt_cli_invocation.stream_raw_events():\n for dagster_event in dbt_event.to_default_asset_events(manifest=dbt_cli_invocation.manifest):\n if isinstance(dagster_event, Output):\n context.add_output_metadata(\n metadata={\n "my_custom_metadata": "my_custom_metadata_value",\n },\n output_name=dagster_event.output_name,\n )\n\n yield dagster_event\n\n Suppressing exceptions from a dbt CLI command when a non-zero exit code is returned:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context, raise_on_error=False)\n\n if dbt_run_invocation.is_successful():\n yield from dbt_run_invocation.stream()\n else:\n ...\n\n Invoking a dbt CLI command in a custom asset or op:\n\n .. code-block:: python\n\n import json\n\n from dagster import asset, op\n from dagster_dbt import DbtCliResource\n\n\n @asset\n def my_dbt_asset(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n\n\n @op\n def my_dbt_op(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n """\n target_path = self._get_unique_target_path(context=context)\n env = {\n **os.environ.copy(),\n # Run dbt with unbuffered output.\n "PYTHONUNBUFFERED": "1",\n # Disable anonymous usage statistics for performance.\n "DBT_SEND_ANONYMOUS_USAGE_STATS": "false",\n # The DBT_LOG_FORMAT environment variable must be set to `json`. 
We use this\n # environment variable to ensure that the dbt CLI outputs structured logs.\n "DBT_LOG_FORMAT": "json",\n # The DBT_TARGET_PATH environment variable is set to a unique value for each dbt\n # invocation so that artifact paths are separated.\n # See https://discourse.getdbt.com/t/multiple-run-results-json-and-manifest-json-files/7555\n # for more information.\n "DBT_TARGET_PATH": os.fspath(target_path),\n # The DBT_LOG_PATH environment variable is set to the same value as DBT_TARGET_PATH\n # so that logs for each dbt invocation has separate log files.\n "DBT_LOG_PATH": os.fspath(target_path),\n # The DBT_PROFILES_DIR environment variable is set to the path containing the dbt\n # profiles.yml file.\n # See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles#advanced-customizing-a-profile-directory\n # for more information.\n **({"DBT_PROFILES_DIR": self.profiles_dir} if self.profiles_dir else {}),\n }\n\n assets_def: Optional[AssetsDefinition] = None\n with suppress(DagsterInvalidPropertyError):\n assets_def = context.assets_def if context else None\n\n selection_args: List[str] = []\n dagster_dbt_translator = dagster_dbt_translator or DagsterDbtTranslator()\n if context and assets_def is not None:\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(\n [assets_def]\n )\n selection_args = get_subset_selection_for_context(\n context=context,\n manifest=manifest,\n select=context.op.tags.get("dagster-dbt/select"),\n exclude=context.op.tags.get("dagster-dbt/exclude"),\n )\n else:\n manifest = validate_manifest(manifest) if manifest else {}\n\n # TODO: verify that args does not have any selection flags if the context and manifest\n # are passed to this function.\n profile_args: List[str] = []\n if self.profile:\n profile_args = ["--profile", self.profile]\n\n if self.target:\n profile_args += ["--target", self.target]\n\n args = ["dbt"] + self.global_config_flags + args + profile_args + selection_args\n 
project_dir = Path(self.project_dir)\n\n if not target_path.is_absolute():\n target_path = project_dir.joinpath(target_path)\n\n return DbtCliInvocation.run(\n args=args,\n env=env,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n project_dir=project_dir,\n target_path=target_path,\n raise_on_error=raise_on_error,\n )
\n\n\ndef get_subset_selection_for_context(\n context: OpExecutionContext,\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n) -> List[str]:\n """Generate a dbt selection string to materialize the selected resources in a subsetted execution context.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work.\n\n Args:\n context (OpExecutionContext): The execution context for the current execution step.\n select (Optional[str]): A dbt selection string to select resources to materialize.\n exclude (Optional[str]): A dbt selection string to exclude resources from materializing.\n\n Returns:\n List[str]: dbt CLI arguments to materialize the selected resources in a\n subsetted execution context.\n\n If the current execution context is not performing a subsetted execution,\n return CLI arguments composed of the inputed selection and exclusion arguments.\n """\n default_dbt_selection = []\n if select:\n default_dbt_selection += ["--select", select]\n if exclude:\n default_dbt_selection += ["--exclude", exclude]\n\n dbt_resource_props_by_output_name = get_dbt_resource_props_by_output_name(manifest)\n\n # TODO: this should be a property on the context if this is a permanent indicator for\n # determining whether the current execution context is performing a subsetted execution.\n is_subsetted_execution = len(context.selected_output_names) != len(\n context.assets_def.node_keys_by_output_name\n )\n if not is_subsetted_execution:\n logger.info(\n "A dbt subsetted execution is not being performed. 
Using the default dbt selection"\n f" arguments `{default_dbt_selection}`."\n )\n return default_dbt_selection\n\n selected_dbt_resources = []\n for output_name in context.selected_output_names:\n dbt_resource_props = dbt_resource_props_by_output_name[output_name]\n\n # Explicitly select a dbt resource by its fully qualified name (FQN).\n # https://docs.getdbt.com/reference/node-selection/methods#the-file-or-fqn-method\n fqn_selector = f"fqn:{'.'.join(dbt_resource_props['fqn'])}"\n\n selected_dbt_resources.append(fqn_selector)\n\n # Take the union of all the selected resources.\n # https://docs.getdbt.com/reference/node-selection/set-operators#unions\n union_selected_dbt_resources = ["--select"] + [" ".join(selected_dbt_resources)]\n\n logger.info(\n "A dbt subsetted execution is being performed. Overriding default dbt selection"\n f" arguments `{default_dbt_selection}` with arguments: `{union_selected_dbt_resources}`"\n )\n\n return union_selected_dbt_resources\n\n\ndef get_dbt_resource_props_by_output_name(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n\n return {\n output_name_fn(node): node\n for node in node_info_by_dbt_unique_id.values()\n if node["resource_type"] in ASSET_RESOURCE_TYPES\n }\n
", "current_page_name": "_modules/dagster_dbt/core/resources_v2", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources_v2"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.types

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\n\nfrom ..types import DbtOutput\n\n\n
[docs]class DbtCliOutput(DbtOutput):\n """The results of executing a dbt command, along with additional metadata about the dbt CLI\n process that was run.\n\n This class is deprecated, because it's only produced by methods of the DbtCliClientResource class,\n which is deprecated in favor of DbtCliResource.\n\n Note that users should not construct instances of this class directly. This class is intended\n to be constructed from the JSON output of dbt commands.\n\n Attributes:\n command (str): The full shell command that was executed.\n return_code (int): The return code of the dbt CLI process.\n raw_output (str): The raw output (``stdout``) of the dbt CLI process.\n logs (List[Dict[str, Any]]): List of parsed JSON logs produced by the dbt command.\n result (Optional[Dict[str, Any]]): Dictionary containing dbt-reported result information\n contained in run_results.json. Some dbt commands do not produce results, and will\n therefore have result = None.\n docs_url (Optional[str]): Hostname where dbt docs are being served for this project.\n """\n\n def __init__(\n self,\n command: str,\n return_code: int,\n raw_output: str,\n logs: Sequence[Mapping[str, Any]],\n result: Mapping[str, Any],\n docs_url: Optional[str] = None,\n ):\n self._command = check.str_param(command, "command")\n self._return_code = check.int_param(return_code, "return_code")\n self._raw_output = check.str_param(raw_output, "raw_output")\n self._logs = check.sequence_param(logs, "logs", of_type=dict)\n self._docs_url = check.opt_str_param(docs_url, "docs_url")\n super().__init__(result)\n\n @property\n def command(self) -> str:\n return self._command\n\n @property\n def return_code(self) -> int:\n return self._return_code\n\n @property\n def raw_output(self) -> str:\n return self._raw_output\n\n @property\n def logs(self) -> Sequence[Mapping[str, Any]]:\n return self._logs\n\n @property\n def docs_url(self) -> Optional[str]:\n return self._docs_url
\n
", "current_page_name": "_modules/dagster_dbt/core/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.types"}}, "dagster_dbt_translator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dagster_dbt_translator

\nfrom dataclasses import dataclass\nfrom typing import Any, Mapping, Optional\n\nfrom dagster import AssetKey, AutoMaterializePolicy, FreshnessPolicy\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import (\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\n\nfrom .asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n)\n\n\n
[docs]class DagsterDbtTranslator:\n """Holds a set of methods that derive Dagster asset definition metadata given a representation\n of a dbt resource (models, tests, sources, etc).\n\n This class is exposed so that methods can be overriden to customize how Dagster asset metadata\n is derived.\n """\n\n
[docs] @classmethod\n @public\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster asset key that represents that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom asset key for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n AssetKey: The Dagster asset key for the dbt resource.\n\n Examples:\n Adding a prefix to the default asset key generated for each dbt resource:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n return super().get_asset_key(dbt_resource_props).with_prefix("prefix")\n\n Adding a prefix to the default asset key generated for each dbt resource, but only for dbt sources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n asset_key = super().get_asset_key(dbt_resource_props)\n\n if dbt_resource_props["resource_type"] == "source":\n asset_key = asset_key.with_prefix("my_prefix")\n\n return asset_key\n """\n return default_asset_key_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster description for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom description for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n str: The description for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n return "custom description"\n """\n return default_description_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster metadata for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom metadata for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Mapping[str, Any]: A dictionary representing the Dagster metadata for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n return {"custom": "metadata"}\n """\n return default_metadata_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster group name for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom group name for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[str]: A Dagster group name.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n return "custom_group_prefix" + dbt_resource_props.get("config", {}).get("group")\n """\n return default_group_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.FreshnessPolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom freshness policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[FreshnessPolicy]: A Dagster freshness policy.\n\n Examples:\n Set a custom freshness policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n return FreshnessPolicy(maximum_lag_minutes=60)\n\n Set a custom freshness policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n freshness_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n freshness_policy = FreshnessPolicy(maximum_lag_minutes=60)\n\n return freshness_policy\n """\n return default_freshness_policy_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.AutoMaterializePolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom auto-materialize policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[AutoMaterializePolicy]: A Dagster auto-materialize policy.\n\n Examples:\n Set a custom auto-materialize policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n return AutoMaterializePolicy.eager()\n\n Set a custom auto-materialize policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n auto_materialize_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n auto_materialize_policy = AutoMaterializePolicy.eager()\n\n return auto_materialize_policy\n\n """\n return default_auto_materialize_policy_fn(dbt_resource_props)
\n\n\nclass KeyPrefixDagsterDbtTranslator(DagsterDbtTranslator):\n """A DagsterDbtTranslator that applies prefixes to the asset keys generated from dbt resources.\n\n Attributes:\n asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt models,\n seeds, snapshots, etc. This will *not* apply to dbt sources.\n source_asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt\n sources.\n """\n\n def __init__(\n self,\n asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ):\n self._asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(asset_key_prefix, "asset_key_prefix")\n or []\n )\n self._source_asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(\n source_asset_key_prefix, "source_asset_key_prefix"\n )\n or []\n )\n\n @public\n def get_asset_key(self, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n base_key = default_asset_key_fn(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(self._source_asset_key_prefix)\n else:\n return base_key.with_prefix(self._asset_key_prefix)\n\n\n@dataclass\nclass DbtManifestWrapper:\n manifest: Mapping[str, Any]\n
", "current_page_name": "_modules/dagster_dbt/dagster_dbt_translator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dagster_dbt_translator"}, "dbt_manifest_asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_manifest_asset_selection

\nfrom typing import AbstractSet, Optional\n\nfrom dagster import (\n    AssetKey,\n    AssetSelection,\n    _check as check,\n)\nfrom dagster._core.definitions.asset_graph import AssetGraph\n\nfrom .asset_utils import is_non_asset_node\nfrom .dagster_dbt_translator import DagsterDbtTranslator\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]class DbtManifestAssetSelection(AssetSelection):\n """Defines a selection of assets from a dbt manifest wrapper and a dbt selection string.\n\n Args:\n manifest (Mapping[str, Any]): The dbt manifest blob.\n select (str): A dbt selection string to specify a set of dbt resources.\n exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Examples:\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster_dbt import DbtManifestAssetSelection\n\n manifest = json.loads(Path("path/to/manifest.json").read_text())\n\n # select the dbt assets that have the tag "foo".\n my_selection = DbtManifestAssetSelection(manifest=manifest, select="tag:foo")\n """\n\n def __init__(\n self,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n *,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n exclude: Optional[str] = None,\n ) -> None:\n self.manifest = validate_manifest(manifest)\n self.select = check.str_param(select, "select")\n self.exclude = check.opt_str_param(exclude, "exclude", default="")\n self.dagster_dbt_translator = check.opt_inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n DagsterDbtTranslator(),\n )\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n dbt_nodes = get_dbt_resource_props_by_dbt_unique_id_from_manifest(self.manifest)\n\n keys = set()\n for unique_id in select_unique_ids_from_manifest(\n select=self.select,\n exclude=self.exclude,\n manifest_json=self.manifest,\n ):\n dbt_resource_props = dbt_nodes[unique_id]\n is_dbt_asset = dbt_resource_props["resource_type"] in ASSET_RESOURCE_TYPES\n if is_dbt_asset and not is_non_asset_node(dbt_resource_props):\n asset_key = self.dagster_dbt_translator.get_asset_key(dbt_resource_props)\n keys.add(asset_key)\n\n return keys
\n
", "current_page_name": "_modules/dagster_dbt/dbt_manifest_asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_manifest_asset_selection"}, "dbt_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_resource

\nimport logging\nfrom abc import abstractmethod\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import get_dagster_logger\n\nfrom .types import DbtOutput\n\n\nclass DbtClient:\n    """Base class for a client allowing users to interface with dbt."""\n\n    def __init__(\n        self,\n        logger: Optional[logging.Logger] = None,\n    ):\n        """Constructor.\n\n        Args:\n            logger (Optional[Any]): A property for injecting a logger dependency.\n                Default is ``None``.\n        """\n        self._logger = logger or get_dagster_logger()\n\n    def _format_params(\n        self, flags: Mapping[str, Any], replace_underscores: bool = False\n    ) -> Mapping[str, Any]:\n        """Reformats arguments that are easier to express as a list into the format that dbt expects,\n        and deletes and keys with no value.\n        """\n        # remove any keys with a value of None\n        if replace_underscores:\n            flags = {k.replace("_", "-"): v for k, v in flags.items() if v is not None}\n        else:\n            flags = {k: v for k, v in flags.items() if v is not None}\n\n        for param in ["select", "exclude", "models"]:\n            if param in flags:\n                if isinstance(flags[param], list):\n                    # if it's a list, format as space-separated\n                    flags[param] = " ".join(set(flags[param]))\n\n        return flags\n\n    @property\n    def logger(self) -> logging.Logger:\n        """logging.Logger: A property for injecting a logger dependency."""\n        return self._logger\n\n    @abstractmethod\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``compile`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``ls`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtOutput:\n        """Run the ``build`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n        raise NotImplementedError()\n\n    @abstractmethod\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the run_results json file\n                for this dbt project.\n        """\n\n    @abstractmethod\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n\n\n
[docs]class DbtResource(DbtClient):\n pass
\n
", "current_page_name": "_modules/dagster_dbt/dbt_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_resource"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.errors

\nimport warnings\nfrom abc import ABC\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Failure,\n    MetadataValue,\n    _check as check,\n)\n\n\n
[docs]class DagsterDbtError(Failure, ABC):\n """The base exception of the ``dagster-dbt`` library."""
\n\n\n
[docs]class DagsterDbtCliUnexpectedOutputError(DagsterDbtError):\n """Represents an error when parsing the output of a dbt CLI command."""\n\n invalid_line_nos: Sequence[int]\n\n def __init__(self, invalid_line_nos: Sequence[int]):\n check.sequence_param(invalid_line_nos, "invalid_line_nos", int)\n line_nos_str = ", ".join(map(str, invalid_line_nos))\n description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"\n metadata = {\n "Invalid CLI Output Line Numbers": MetadataValue.json({"line_nos": invalid_line_nos})\n }\n super().__init__(description, metadata=metadata)\n self.invalid_line_nos = invalid_line_nos
\n\n\n
[docs]class DagsterDbtCliRuntimeError(DagsterDbtError, ABC):\n """Represents an error while executing a dbt CLI command."""\n\n def __init__(\n self,\n description: str,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n if logs is not None:\n warnings.warn(\n "`logs` is a deprecated argument to DagsterDbtCliRuntimeError and will be discarded"\n )\n if raw_output is not None:\n warnings.warn(\n "`raw_output` is a deprecated argument to DagsterDbtCliRuntimeError and will be"\n " discarded"\n )\n metadata = {"Parsed CLI Messages": "\\n".join(messages or [])}\n super().__init__(description, metadata=metadata)
\n\n\n
[docs]class DagsterDbtCliHandledRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a model error reported by the dbt CLI at runtime (return code 1)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__("Handled error in the dbt CLI (return code 1)", logs, raw_output, messages)
\n\n\n
[docs]class DagsterDbtCliFatalRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a fatal error in the dbt CLI (return code 2)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__(\n "Fatal error in the dbt CLI (return code 2): " + " ".join(messages or []),\n logs,\n raw_output,\n messages,\n )
\n\n\n
[docs]class DagsterDbtCliOutputsNotFoundError(DagsterDbtError):\n """Represents a problem in finding the ``target/run_results.json`` artifact when executing a dbt\n CLI command.\n\n For more details on ``target/run_results.json``, see\n https://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.\n """\n\n def __init__(self, path: str):\n super().__init__(f"Expected to find file at path {path}")
\n\n\nclass DagsterDbtCloudJobInvariantViolationError(DagsterDbtError, DagsterInvariantViolationError):\n """Represents an error when a dbt Cloud job is not supported by the ``dagster-dbt`` library."""\n
", "current_page_name": "_modules/dagster_dbt/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.errors"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom .types import DbtOutput\nfrom .utils import generate_events, generate_materializations\n\n_DEFAULT_OP_PROPS: Dict[str, Any] = dict(\n    required_resource_keys={"dbt"},\n    ins={"start_after": In(Nothing)},\n    out=Out(DbtOutput, description="Parsed output from running the dbt command."),\n    tags={"kind": "dbt"},\n)\n\n\ndef _get_doc(op_name: str, dbt_command: str) -> str:\n    return f"""\nThis op executes a ``dbt {dbt_command}`` command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the :py:class:`~dagster_dbt.dbt_cli_resource`).\n\nExamples:\n\n.. code-block:: python\n\n    from dagster import job\n    from dagster_dbt import {op_name}, dbt_cli_resource\n\n    @job(resource_defs={{"dbt":dbt_cli_resource}})\n    def my_dbt_cli_job():\n        {op_name}()\n    """\n\n\n# NOTE: mypy fails to properly track the type of `_DEFAULT_OP_PROPS` items when they are\n# double-splatted, so we type-ignore the below op declarations.\n\n\nclass DbtBuildOpConfig(Config):\n    yield_asset_events: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations and asset observations corresponding to the results of "\n            "the dbt operation will be yielded when the op executes. 
Default: True"\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n@op(**_DEFAULT_OP_PROPS)\ndef dbt_build_op(context, config: DbtBuildOpConfig) -> Any:\n    dbt_output = context.resources.dbt.build()\n    if config.yield_asset_events and "results" in dbt_output.result:\n        yield from generate_events(\n            dbt_output,\n            node_info_to_asset_key=lambda info: config.asset_key_prefix\n            + info["unique_id"].split("."),\n            manifest_json=context.resources.dbt.get_manifest_json(),\n        )\n    yield Output(dbt_output)\n\n\nclass DbtRunOpConfig(Config):\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes. Default: True"\n        ),\n    )\n    asset_key_prefix: Optional[List[str]] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_run_op(context, config: DbtRunOpConfig):\n dbt_output = context.resources.dbt.run()\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(dbt_output)
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_compile_op(context):\n return context.resources.dbt.compile()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_ls_op(context):\n return context.resources.dbt.ls()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_test_op(context):\n return context.resources.dbt.test()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_snapshot_op(context):\n return context.resources.dbt.snapshot()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_seed_op(context):\n return context.resources.dbt.seed()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_docs_generate_op(context):\n return context.resources.dbt.generate_docs()
\n\n\nfor dbt_op, cmd in [\n (dbt_build_op, "build"),\n (dbt_run_op, "run"),\n (dbt_compile_op, "compile"),\n (dbt_ls_op, "ls"),\n (dbt_test_op, "test"),\n (dbt_snapshot_op, "snapshot"),\n (dbt_seed_op, "seed"),\n (dbt_docs_generate_op, "docs generate"),\n]:\n dbt_op.__doc__ = _get_doc(dbt_op.name, cmd)\n
", "current_page_name": "_modules/dagster_dbt/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.ops"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.types

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\n\n\n
[docs]class DbtOutput:\n """Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, `result`, which\n represents the dbt-formatted result of the command that was run (if any).\n\n Used internally, should not be instantiated directly by the user.\n """\n\n def __init__(self, result: Mapping[str, Any]):\n self._result = check.mapping_param(result, "result", key_type=str)\n\n @property\n def result(self) -> Mapping[str, Any]:\n return self._result\n\n @property\n def docs_url(self) -> Optional[str]:\n return None
\n
", "current_page_name": "_modules/dagster_dbt/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.utils

\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    MetadataValue,\n    Output,\n    _check as check,\n)\nfrom dagster._core.definitions.metadata import RawMetadataValue\n\nfrom .types import DbtOutput\n\n# dbt resource types that may be considered assets\nASSET_RESOURCE_TYPES = ["model", "seed", "snapshot"]\n\n\ndef default_node_info_to_asset_key(node_info: Mapping[str, Any]) -> AssetKey:\n    return AssetKey(node_info["unique_id"].split("."))\n\n\ndef _resource_type(unique_id: str) -> str:\n    # returns the type of the node (e.g. model, test, snapshot)\n    return unique_id.split(".")[0]\n\n\ndef input_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # * can be present when sources are sharded tables\n    return dbt_resource_props["unique_id"].replace(".", "_").replace("*", "_star")\n\n\ndef output_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # hyphens are valid in dbt model names, but not in output names\n    return dbt_resource_props["unique_id"].split(".")[-1].replace("-", "_")\n\n\ndef _node_result_to_metadata(node_result: Mapping[str, Any]) -> Mapping[str, RawMetadataValue]:\n    return {\n        "Materialization Strategy": node_result["config"]["materialized"],\n        "Database": node_result["database"],\n        "Schema": node_result["schema"],\n        "Alias": node_result["alias"],\n        "Description": node_result["description"],\n    }\n\n\ndef _timing_to_metadata(timings: Sequence[Mapping[str, Any]]) -> Mapping[str, RawMetadataValue]:\n    metadata: Dict[str, RawMetadataValue] = {}\n    for timing in timings:\n        if timing["name"] == "execute":\n            desc = "Execution"\n        elif timing["name"] == "compile":\n            desc = "Compilation"\n        else:\n     
       continue\n\n        # dateutil does not properly expose its modules to static checkers\n        started_at = dateutil.parser.isoparse(timing["started_at"])  # type: ignore\n        completed_at = dateutil.parser.isoparse(timing["completed_at"])  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                f"{desc} Started At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Completed At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Duration": duration.total_seconds(),\n            }\n        )\n    return metadata\n\n\ndef result_to_events(\n    result: Mapping[str, Any],\n    docs_url: Optional[str] = None,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n    extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n    generate_asset_outputs: bool = False,\n) -> Iterator[Union[AssetMaterialization, AssetObservation, Output]]:\n    """This is a hacky solution that attempts to consolidate parsing many of the potential formats\n    that dbt can provide its results in. 
This is known to work for CLI Outputs for dbt versions 0.18+,\n    as well as RPC responses for a similar time period, but as the RPC response schema is not documented\n    nor enforced, this can become out of date easily.\n    """\n    node_info_to_asset_key = check.opt_callable_param(\n        node_info_to_asset_key, "node_info_to_asset_key", default=default_node_info_to_asset_key\n    )\n\n    # status comes from set of fields rather than "status"\n    if "fail" in result:\n        status = (\n            "fail"\n            if result.get("fail")\n            else "skip" if result.get("skip") else "error" if result.get("error") else "success"\n        )\n    else:\n        status = result["status"]\n\n    # all versions represent timing the same way\n    metadata = {"Status": status, "Execution Time (seconds)": result["execution_time"]}\n    metadata.update(_timing_to_metadata(result["timing"]))\n\n    # working with a response that contains the node block (RPC and CLI 0.18.x)\n    if "node" in result:\n        unique_id = result["node"]["unique_id"]\n        metadata.update(_node_result_to_metadata(result["node"]))\n    else:\n        unique_id = result["unique_id"]\n\n    if docs_url:\n        metadata["docs_url"] = MetadataValue.url(f"{docs_url}#!/model/{unique_id}")\n\n    if extra_metadata:\n        metadata.update(extra_metadata)\n\n    # if you have a manifest available, get the full node info, otherwise just populate unique_id\n    dbt_resource_props = (\n        manifest_json["nodes"][unique_id] if manifest_json else {"unique_id": unique_id}\n    )\n\n    node_resource_type = _resource_type(unique_id)\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and status == "success":\n        if generate_asset_outputs:\n            yield Output(\n                value=None,\n                output_name=output_name_fn(dbt_resource_props),\n                metadata=metadata,\n            )\n        else:\n            yield AssetMaterialization(\n                
asset_key=node_info_to_asset_key(dbt_resource_props),\n                description=f"dbt node: {unique_id}",\n                metadata=metadata,\n            )\n    # can only associate tests with assets if we have manifest_json available\n    elif node_resource_type == "test" and manifest_json and status != "skipped":\n        upstream_unique_ids = manifest_json["nodes"][unique_id]["depends_on"]["nodes"]\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            dbt_resource_props = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if dbt_resource_props is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(dbt_resource_props)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": result["unique_id"],\n                    "Test Status": status,\n                    "Test Message": result.get("message") or "",\n                },\n            )\n\n\ndef generate_events(\n    dbt_output: DbtOutput,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n) -> Iterator[Union[AssetMaterialization, AssetObservation]]:\n    """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n    a dbt command, and :py:class:`dagster.AssetObservation` events for each test run.\n\n    Information parsed from a :py:class:`~DbtOutput` object.\n    """\n    for result in dbt_output.result["results"]:\n        for event in result_to_events(\n            result,\n            docs_url=dbt_output.docs_url,\n            node_info_to_asset_key=node_info_to_asset_key,\n            manifest_json=manifest_json,\n        ):\n            yield 
check.inst(\n                cast(Union[AssetMaterialization, AssetObservation], event),\n                (AssetMaterialization, AssetObservation),\n            )\n\n\n
[docs]def generate_materializations(\n dbt_output: DbtOutput,\n asset_key_prefix: Optional[Sequence[str]] = None,\n) -> Iterator[AssetMaterialization]:\n """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n a dbt command.\n\n Information parsed from a :py:class:`~DbtOutput` object.\n\n Examples:\n .. code-block:: python\n\n from dagster import op, Output\n from dagster_dbt.utils import generate_materializations\n from dagster_dbt import dbt_cli_resource\n\n @op(required_resource_keys={"dbt"})\n def my_custom_dbt_run(context):\n dbt_output = context.resources.dbt.run()\n for materialization in generate_materializations(dbt_output):\n # you can modify the materialization object to add extra metadata, if desired\n yield materialization\n yield Output(my_dbt_output)\n\n @job(resource_defs={{"dbt":dbt_cli_resource}})\n def my_dbt_cli_job():\n my_custom_dbt_run()\n """\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n for event in generate_events(\n dbt_output,\n node_info_to_asset_key=lambda info: AssetKey(\n asset_key_prefix + info["unique_id"].split(".")\n ),\n ):\n yield check.inst(cast(AssetMaterialization, event), AssetMaterialization)
\n\n\ndef select_unique_ids_from_manifest(\n select: str,\n exclude: str,\n state_path: Optional[str] = None,\n manifest_json_path: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n manifest_parsed: Optional[Any] = None,\n) -> AbstractSet[str]:\n """Method to apply a selection string to an existing manifest.json file."""\n import dbt.graph.cli as graph_cli\n import dbt.graph.selector as graph_selector\n from dbt.contracts.graph.manifest import Manifest, WritableManifest\n from dbt.contracts.state import PreviousState\n from dbt.graph.selector_spec import IndirectSelection, SelectionSpec\n from networkx import DiGraph\n\n if state_path is not None:\n previous_state = PreviousState(\n path=Path(state_path), # type: ignore # (unused path, slated for deletion)\n current_path=( # type: ignore # (unused path, slated for deletion)\n Path("/tmp/null") if manifest_json_path is None else Path(manifest_json_path)\n ),\n )\n else:\n previous_state = None\n\n if manifest_json_path is not None:\n manifest = WritableManifest.read_and_check_versions(manifest_json_path)\n child_map = manifest.child_map\n elif manifest_json is not None:\n\n class _DictShim(dict):\n """Shim to enable hydrating a dictionary into a dot-accessible object."""\n\n def __getattr__(self, item):\n ret = super().get(item)\n # allow recursive access e.g. 
foo.bar.baz\n return _DictShim(ret) if isinstance(ret, dict) else ret\n\n manifest = Manifest(\n # dbt expects dataclasses that can be accessed with dot notation, not bare dictionaries\n nodes={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["nodes"].items() # type: ignore\n },\n sources={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["sources"].items() # type: ignore\n },\n metrics={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["metrics"].items() # type: ignore\n },\n exposures={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["exposures"].items() # type: ignore\n },\n )\n child_map = manifest_json["child_map"]\n elif manifest_parsed is not None:\n manifest = manifest_parsed\n child_map = manifest.child_map\n else:\n check.failed("Must provide either a manifest_json_path, manifest_json, or manifest_parsed.")\n graph = graph_selector.Graph(DiGraph(incoming_graph_data=child_map))\n\n # create a parsed selection from the select string\n try:\n from dbt.flags import GLOBAL_FLAGS\n except ImportError:\n # dbt < 1.5.0 compat\n import dbt.flags as GLOBAL_FLAGS\n setattr(GLOBAL_FLAGS, "INDIRECT_SELECTION", IndirectSelection.Eager)\n setattr(GLOBAL_FLAGS, "WARN_ERROR", True)\n parsed_spec: SelectionSpec = graph_cli.parse_union([select], True)\n\n if exclude:\n parsed_spec = graph_cli.SelectionDifference(\n components=[parsed_spec, graph_cli.parse_union([exclude], True)]\n )\n\n # execute this selection against the graph\n selector = graph_selector.NodeSelector(graph, manifest, previous_state=previous_state)\n selected, _ = selector.select_nodes(parsed_spec)\n return selected\n\n\ndef get_dbt_resource_props_by_dbt_unique_id_from_manifest(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n """A mapping of a dbt node's unique id to the node's dictionary representation in the manifest."""\n return {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["exposures"],\n 
**manifest["metrics"],\n }\n
", "current_page_name": "_modules/dagster_dbt/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.utils"}}, "dagster_docker": {"docker_executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_executor

\nfrom typing import Iterator, Optional, cast\n\nimport dagster._check as check\nimport docker\nimport docker.errors\nfrom dagster import Field, IntSource, executor\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster._core.executor.step_delegating.step_handler.base import (\n    CheckStepHealthResult,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes.utils import hash_str\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\n\n
[docs]@executor(\n name="docker",\n config_schema=merge_dicts(\n DOCKER_CONFIG_SCHEMA,\n {\n "retries": get_retries_config(),\n "max_concurrent": Field(\n IntSource,\n is_required=False,\n description=(\n "Limit on the number of containers that will run concurrently within the scope "\n "of a Dagster run. Note that this limit is per run, not global."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n },\n ),\n requirements=multiple_process_executor_requirements(),\n)\n@experimental\ndef docker_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Docker containers.\n\n To use the `docker_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_executor.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n registry: ...\n network: ...\n networks: ...\n container_kwargs: ...\n\n If you're using the DockerRunLauncher, configuration set on the containers created by the run\n launcher will also be set on the containers that are created for each step.\n """\n config = init_context.executor_config\n image = check.opt_str_elem(config, "image")\n registry = check.opt_dict_elem(config, "registry", key_type=str)\n env_vars = check.opt_list_elem(config, "env_vars", of_type=str)\n network = check.opt_str_elem(config, "network")\n networks = check.opt_list_elem(config, "networks", of_type=str)\n container_kwargs = check.opt_dict_elem(config, "container_kwargs", key_type=str)\n retries = check.dict_elem(config, "retries", key_type=str)\n max_concurrent = check.opt_int_elem(config, "max_concurrent")\n tag_concurrency_limits = check.opt_list_elem(config, "tag_concurrency_limits")\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network 
and not networks:\n networks = [network]\n\n container_context = DockerContainerContext(\n registry=registry,\n env_vars=env_vars or [],\n networks=networks or [],\n container_kwargs=container_kwargs,\n )\n\n return StepDelegatingExecutor(\n DockerStepHandler(image, container_context),\n retries=check.not_none(RetryMode.from_config(retries)),\n max_concurrent=max_concurrent,\n tag_concurrency_limits=tag_concurrency_limits,\n )
\n\n\nclass DockerStepHandler(StepHandler):\n def __init__(\n self,\n image: Optional[str],\n container_context: DockerContainerContext,\n ):\n super().__init__()\n\n self._image = check.opt_str_param(image, "image")\n self._container_context = check.inst_param(\n container_context, "container_context", DockerContainerContext\n )\n\n def _get_image(self, step_handler_context: StepHandlerContext):\n from . import DockerRunLauncher\n\n image = cast(\n JobPythonOrigin, step_handler_context.dagster_run.job_code_origin\n ).repository_origin.container_image\n if not image:\n image = self._image\n\n run_launcher = step_handler_context.instance.run_launcher\n\n if not image and isinstance(run_launcher, DockerRunLauncher):\n image = run_launcher.image\n\n if not image:\n raise Exception("No docker image specified by the executor config or repository")\n\n return image\n\n def _get_docker_container_context(self, step_handler_context: StepHandlerContext):\n # This doesn't vary per step: would be good to have a hook where it can be set once\n # for the whole StepHandler but we need access to the DagsterRun for that\n\n from .docker_run_launcher import DockerRunLauncher\n\n run_launcher = step_handler_context.instance.run_launcher\n run_target = DockerContainerContext.create_for_run(\n step_handler_context.dagster_run,\n run_launcher if isinstance(run_launcher, DockerRunLauncher) else None,\n )\n\n merged_container_context = run_target.merge(self._container_context)\n\n validate_docker_config(\n network=None,\n networks=merged_container_context.networks,\n container_kwargs=merged_container_context.container_kwargs,\n )\n\n return merged_container_context\n\n @property\n def name(self) -> str:\n return "DockerStepHandler"\n\n def _get_client(self, docker_container_context: DockerContainerContext):\n client = docker.client.from_env()\n if docker_container_context.registry:\n client.login(\n registry=docker_container_context.registry["url"],\n 
username=docker_container_context.registry["username"],\n password=docker_container_context.registry["password"],\n )\n return client\n\n def _get_container_name(self, execute_step_args: ExecuteStepArgs):\n run_id = execute_step_args.run_id\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n step_name = f"dagster-step-{hash_str(run_id + step_key)}"\n\n if execute_step_args.known_state:\n retry_state = execute_step_args.known_state.get_retry_state()\n retry_number = retry_state.get_attempt_count(step_key)\n if retry_number:\n step_name = f"{step_name}-{retry_number}"\n\n return step_name\n\n def _create_step_container(\n self,\n client,\n container_context,\n step_image,\n step_handler_context: StepHandlerContext,\n ):\n execute_step_args = step_handler_context.execute_step_args\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n env_vars["DAGSTER_RUN_JOB_NAME"] = step_handler_context.dagster_run.job_name\n env_vars["DAGSTER_RUN_STEP_KEY"] = step_key\n return client.containers.create(\n step_image,\n name=self._get_container_name(execute_step_args),\n detach=True,\n network=container_context.networks[0] if len(container_context.networks) else None,\n command=execute_step_args.get_command_args(),\n environment=env_vars,\n **container_context.container_kwargs,\n )\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n step_image = self._get_image(step_handler_context)\n 
validate_docker_image(step_image)\n\n try:\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n except docker.errors.ImageNotFound:\n client.images.pull(step_image)\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(step_container)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message="Launching step in Docker container.",\n metadata={\n "Docker container id": step_container.id,\n },\n )\n step_container.start()\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n container = client.containers.get(container_name)\n\n if container.status == "running":\n return CheckStepHealthResult.healthy()\n\n try:\n container_info = container.wait(timeout=0.1)\n except Exception as e:\n raise Exception(\n f"Container status is {container.status}. Raised exception attempting to get its"\n " return code."\n ) from e\n\n ret_code = container_info.get("StatusCode")\n if ret_code == 0:\n return CheckStepHealthResult.healthy()\n\n return CheckStepHealthResult.unhealthy(\n reason=f"Container status is {container.status}. 
Return code is {ret_code}."\n )\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert (\n len(step_keys_to_execute) == 1\n ), "Terminating multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Stopping Docker container {container_name} for step.",\n event_specific_data=EngineEventData(),\n )\n\n client = self._get_client(container_context)\n\n container = client.containers.get(container_name)\n\n container.stop()\n
", "current_page_name": "_modules/dagster_docker/docker_executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_executor"}, "docker_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_run_launcher

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\nimport docker\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    ResumeRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteRunArgs, ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom typing_extensions import Self\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\nDOCKER_CONTAINER_ID_TAG = "docker/container_id"\n\n\n
[docs]class DockerRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs in a Docker container."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n image=None,\n registry=None,\n env_vars=None,\n network=None,\n networks=None,\n container_kwargs=None,\n ):\n self._inst_data = inst_data\n self.image = image\n self.registry = registry\n self.env_vars = env_vars\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network:\n self.networks = [network]\n elif networks:\n self.networks = networks\n else:\n self.networks = []\n\n self.container_kwargs = check.opt_dict_param(\n container_kwargs, "container_kwargs", key_type=str\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return DOCKER_CONFIG_SCHEMA\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DockerRunLauncher(inst_data=inst_data, **config_value)\n\n def get_container_context(self, dagster_run: DagsterRun) -> DockerContainerContext:\n return DockerContainerContext.create_for_run(dagster_run, self)\n\n def _get_client(self, container_context: DockerContainerContext):\n client = docker.client.from_env()\n if container_context.registry:\n client.login(\n registry=container_context.registry["url"],\n username=container_context.registry["username"],\n password=container_context.registry["password"],\n )\n return client\n\n def _get_docker_image(self, job_code_origin):\n docker_image = job_code_origin.repository_origin.container_image\n\n if not docker_image:\n docker_image = self.image\n\n if not docker_image:\n raise Exception("No docker image specified by the instance config or repository")\n\n validate_docker_image(docker_image)\n return docker_image\n\n def _launch_container_with_command(self, run, docker_image, command):\n container_context = self.get_container_context(run)\n docker_env = 
dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n docker_env["DAGSTER_RUN_JOB_NAME"] = run.job_name\n\n client = self._get_client(container_context)\n\n try:\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n except docker.errors.ImageNotFound:\n client.images.pull(docker_image)\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n self._instance.report_engine_event(\n message=f"Launching run in a new container {container.id} with image {docker_image}",\n dagster_run=run,\n cls=self.__class__,\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_CONTAINER_ID_TAG: container.id, DOCKER_IMAGE_TAG: docker_image},\n )\n\n container.start()\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ExecuteRunArgs(\n job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ResumeRunArgs(\n 
job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n def _get_container(self, run):\n if not run or run.is_finished:\n return None\n\n container_id = run.tags.get(DOCKER_CONTAINER_ID_TAG)\n\n if not container_id:\n return None\n\n container_context = self.get_container_context(run)\n\n try:\n return self._get_client(container_context).containers.get(container_id)\n except Exception:\n return None\n\n def terminate(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container = self._get_container(run)\n\n if not container:\n self._instance.report_engine_event(\n message="Unable to get docker container to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n container.stop()\n\n return True\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n container = self._get_container(run)\n if container is None:\n return CheckRunHealthResult(WorkerStatus.NOT_FOUND)\n if container.status == "running":\n return CheckRunHealthResult(WorkerStatus.RUNNING)\n return CheckRunHealthResult(\n WorkerStatus.FAILED, msg=f"Container status is {container.status}"\n )
\n
", "current_page_name": "_modules/dagster_docker/docker_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_run_launcher"}, "ops": {"docker_container_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.ops.docker_container_op

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport docker\nfrom dagster import Field, In, Nothing, OpExecutionContext, StringSource, op\nfrom dagster._annotations import experimental\nfrom dagster._core.utils import parse_env_var\nfrom dagster._serdes.utils import hash_str\n\nfrom ..container_context import DockerContainerContext\nfrom ..docker_run_launcher import DockerRunLauncher\nfrom ..utils import DOCKER_CONFIG_SCHEMA, validate_docker_image\n\nDOCKER_CONTAINER_OP_CONFIG = {\n    **DOCKER_CONFIG_SCHEMA,\n    "image": Field(\n        StringSource,\n        is_required=True,\n        description="The image in which to run the Docker container.",\n    ),\n    "entrypoint": Field(\n        [str],\n        is_required=False,\n        description="The ENTRYPOINT for the Docker container",\n    ),\n    "command": Field(\n        [str],\n        is_required=False,\n        description="The command to run in the container within the launched Docker container.",\n    ),\n}\n\n\ndef _get_client(docker_container_context: DockerContainerContext):\n    client = docker.client.from_env()\n    if docker_container_context.registry:\n        client.login(\n            registry=docker_container_context.registry["url"],\n            username=docker_container_context.registry["username"],\n            password=docker_container_context.registry["password"],\n        )\n    return client\n\n\ndef _get_container_name(run_id, op_name, retry_number):\n    container_name = hash_str(run_id + op_name)\n\n    if retry_number > 0:\n        container_name = f"{container_name}-{retry_number}"\n\n    return container_name\n\n\ndef _create_container(\n    op_context: OpExecutionContext,\n    client,\n    container_context: DockerContainerContext,\n    image: str,\n    entrypoint: Optional[Sequence[str]],\n    command: Optional[Sequence[str]],\n):\n    env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n    return client.containers.create(\n        
image,\n        name=_get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),\n        detach=True,\n        network=container_context.networks[0] if len(container_context.networks) else None,\n        entrypoint=entrypoint,\n        command=command,\n        environment=env_vars,\n        **container_context.container_kwargs,\n    )\n\n\n
[docs]@experimental\ndef execute_docker_container(\n context: OpExecutionContext,\n image: str,\n entrypoint: Optional[Sequence[str]] = None,\n command: Optional[Sequence[str]] = None,\n networks: Optional[Sequence[str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n env_vars: Optional[Sequence[str]] = None,\n container_kwargs: Optional[Mapping[str, Any]] = None,\n):\n """This function is a utility for executing a Docker container from within a Dagster op.\n\n Args:\n image (str): The image to use for the launched Docker container.\n entrypoint (Optional[Sequence[str]]): The ENTRYPOINT to run in the launched Docker\n container. Default: None.\n command (Optional[Sequence[str]]): The CMD to run in the launched Docker container.\n Default: None.\n networks (Optional[Sequence[str]]): Names of the Docker networks to which to connect the\n launched container. Default: None.\n registry: (Optional[Mapping[str, str]]): Information for using a non local/public Docker\n registry. Can have "url", "username", or "password" keys.\n env_vars (Optional[Sequence[str]]): List of environemnt variables to include in the launched\n container. ach can be of the form KEY=VALUE or just KEY (in which case the value will be\n pulled from the calling environment.\n container_kwargs (Optional[Dict[str[Any]]]): key-value pairs that can be passed into\n containers.create in the Docker Python API. 
See\n https://docker-py.readthedocs.io/en/stable/containers.html for the full list\n of available options.\n """\n run_container_context = DockerContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, DockerRunLauncher)\n else None\n ),\n )\n\n validate_docker_image(image)\n\n op_container_context = DockerContainerContext(\n registry=registry, env_vars=env_vars, networks=networks, container_kwargs=container_kwargs\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n client = _get_client(container_context)\n\n try:\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n except docker.errors.ImageNotFound:\n client.images.pull(image)\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n container.start()\n\n for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):\n print(line) # noqa: T201\n\n exit_status = container.wait()["StatusCode"]\n\n if exit_status != 0:\n raise Exception(f"Docker container returned exit code {exit_status}")
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=DOCKER_CONTAINER_OP_CONFIG)\n@experimental\ndef docker_container_op(context):\n """An op that runs a Docker container using the docker Python API.\n\n Contrast with the `docker_executor`, which runs each Dagster op in a Dagster job in its\n own Docker container.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in docker.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_docker_container_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_docker_container` function\n inside your own op.\n """\n execute_docker_container(context, **context.op_config)
\n
", "current_page_name": "_modules/dagster_docker/ops/docker_container_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.ops.docker_container_op"}}}, "dagster_duckdb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Optional, Sequence, Type, cast\n\nimport duckdb\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\nDUCKDB_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_duckdb_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n DuckDB tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import build_duckdb_io_manager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n duckdb_io_manager = build_duckdb_io_manager([DuckDBPandasTypeHandler()])\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key. For ops, the schema can be\n specified by including a "schema" entry in output metadata. If none of these is provided, the schema will\n default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=DuckDBIOManager.to_config_schema())\n def duckdb_io_manager(init_context):\n """IO Manager for storing outputs in a DuckDB database.\n\n Assets will be stored in the schema and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the schema specified by output metadata (defaults to public) in a\n table of the name of the output.\n """\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=DuckDbClient(),\n io_manager_name="DuckDBIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return duckdb_io_manager
\n\n\n
[docs]class DuckDBIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If none\n of these is provided, the schema will default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n database: str = Field(description="Path to the DuckDB database.")\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=DuckDbClient(),\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n io_manager_name="DuckDBIOManager",\n )
\n\n\nclass DuckDbClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except duckdb.CatalogException:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.execute(f"create schema if not exists {table_slice.schema};")\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"SELECT {col_str} FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.schema}.{table_slice.table}"""\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={"database": context.resource_config["database"], "read_only": False},\n max_retries=10,\n )\n\n yield conn\n\n conn.close()\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"DELETE FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"DELETE FROM {table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if isinstance(partition_dimension.partitions, TimeWindow)\n else 
_static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(DUCKDB_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(DUCKDB_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_duckdb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.resource

\nfrom contextlib import contextmanager\n\nimport duckdb\nfrom dagster import ConfigurableResource\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\n\n
[docs]class DuckDBResource(ConfigurableResource):\n """Resource for interacting with a DuckDB database.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_duckdb import DuckDBResource\n\n @asset\n def my_table(duckdb: DuckDBResource):\n with duckdb.get_connection() as conn:\n conn.execute("SELECT * from MY_SCHEMA.MY_TABLE")\n\n defs = Definitions(\n assets=[my_table],\n resources={"duckdb": DuckDBResource(database="path/to/db.duckdb")}\n )\n\n """\n\n database: str = Field(\n description=(\n "Path to the DuckDB database. Setting database=':memory:' will use an in-memory"\n " database "\n )\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_connection(self):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={"database": self.database, "read_only": False},\n max_retries=10,\n )\n\n yield conn\n\n conn.close()
\n
", "current_page_name": "_modules/dagster_duckdb/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.resource"}}, "dagster_duckdb_pandas": {"duckdb_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pandas.duckdb_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\n\n\n
[docs]class DuckDBPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Stores and loads Pandas DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in duckdb."""\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n return connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nduckdb_pandas_io_manager = build_duckdb_io_manager(\n [DuckDBPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nduckdb_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the duckdb_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pandas import duckdb_pandas_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pandas_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPandasIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\n using the DuckDBPandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pandas import DuckDBPandasIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPandasIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pandas/duckdb_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pandas.duckdb_pandas_type_handler"}}, "dagster_duckdb_polars": {"duckdb_polars_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_polars.duckdb_polars_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport polars as pl\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import DuckDbClient, DuckDBIOManager, build_duckdb_io_manager\n\n\n
[docs]class DuckDBPolarsTypeHandler(DbTypeHandler[pl.DataFrame]):\n """Stores and loads Polars DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_polars import DuckDBPolarsTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pl.DataFrame, connection\n ):\n """Stores the polars DataFrame in duckdb."""\n obj_arrow = obj.to_arrow() # noqa: F841 # need obj_arrow symbol to exist for duckdb query\n connection.execute(f"create schema if not exists {table_slice.schema};")\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj_arrow;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj_arrow"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype))\n for name, dtype in zip(obj.columns, obj.dtypes)\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pl.DataFrame:\n """Loads the input as a Polars DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pl.DataFrame()\n select_statement = connection.execute(\n DuckDbClient.get_select_statement(table_slice=table_slice)\n )\n duckdb_to_arrow = select_statement.arrow()\n return pl.DataFrame(duckdb_to_arrow)\n\n @property\n def supported_types(self):\n return [pl.DataFrame]
\n\n\nduckdb_polars_io_manager = build_duckdb_io_manager(\n [DuckDBPolarsTypeHandler()], default_load_type=pl.DataFrame\n)\nduckdb_polars_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes polars dataframes to DuckDB. When\nusing the duckdb_polars_io_manager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_polars import duckdb_polars_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_polars_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPolarsIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\n using the DuckDBPolarsIOManager, any inputs and outputs without type annotations will be loaded\n as Polars DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_polars import DuckDBPolarsIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPolarsIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pl.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_polars/duckdb_polars_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_polars.duckdb_polars_type_handler"}}, "dagster_duckdb_pyspark": {"duckdb_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pyspark.duckdb_pyspark_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pyarrow as pa\nimport pyspark\nimport pyspark.sql\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef pyspark_df_to_arrow_table(df: pyspark.sql.DataFrame) -> pa.Table:\n    """Converts a PySpark DataFrame to a PyArrow Table."""\n    # `_collect_as_arrow` API call sourced from:\n    #   https://stackoverflow.com/questions/73203318/how-to-transform-spark-dataframe-to-polars-dataframe\n    return pa.Table.from_batches(df._collect_as_arrow())  # noqa: SLF001\n\n\n
[docs]class DuckDBPySparkTypeHandler(DbTypeHandler[pyspark.sql.DataFrame]):\n """Stores PySpark DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pyspark import DuckDBPySparkTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n """\n\n def handle_output(\n self,\n context: OutputContext,\n table_slice: TableSlice,\n obj: pyspark.sql.DataFrame,\n connection,\n ):\n """Stores the given object at the provided filepath."""\n pa_df = pyspark_df_to_arrow_table(obj) # noqa: F841\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " pa_df;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from pa_df;"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.count(),\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) for name, dtype in obj.dtypes\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pyspark.sql.DataFrame:\n """Loads the return of the query as the correct type."""\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n pd_df = connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n return spark.createDataFrame(pd_df)\n\n @property\n def supported_types(self):\n return [pyspark.sql.DataFrame]
\n\n\nduckdb_pyspark_io_manager = build_duckdb_io_manager(\n [DuckDBPySparkTypeHandler()], default_load_type=pyspark.sql.DataFrame\n)\nduckdb_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pyspark import duckdb_pyspark_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPySparkIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\n using the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pyspark import DuckDBPySparkIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pyspark.sql.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pyspark/duckdb_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pyspark.duckdb_pyspark_type_handler"}}, "dagster_embedded_elt": {"sling": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.asset_defs

\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom dagster import (\n    AssetExecutionContext,\n    AssetsDefinition,\n    AssetSpec,\n    MaterializeResult,\n    multi_asset,\n)\n\nfrom dagster_embedded_elt.sling.resources import SlingMode, SlingResource\n\n\n
[docs]def build_sling_asset(\n asset_spec: AssetSpec,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[Union[str, List[str]]] = None,\n update_key: Optional[Union[str, List[str]]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n sling_resource_key: str = "sling",\n) -> AssetsDefinition:\n """Asset Factory for using Sling to sync data from a source stream to a target object.\n\n Args:\n asset_spec (AssetSpec): The AssetSpec to use to materialize this asset.\n source_stream (str): The source stream to sync from. This can be a table, a query, or a path.\n target_object (str): The target object to sync to. This can be a table, or a path.\n mode (SlingMode, optional): The sync mode to use when syncing. Defaults to SlingMode.FULL_REFRESH.\n primary_key (Optional[Union[str, List[str]]], optional): The optional primary key to use when syncing.\n update_key (Optional[Union[str, List[str]]], optional): The optional update key to use when syncing.\n source_options (Optional[Dict[str, Any]], optional): Any optional Sling source options to use when syncing.\n target_options (Optional[Dict[str, Any]], optional): Any optional target options to use when syncing.\n sling_resource_key (str, optional): The resource key for the SlingResource. Defaults to "sling".\n\n Examples:\n Creating a Sling asset that syncs from a file to a table:\n\n .. code-block:: python\n\n asset_spec = AssetSpec(key=["main", "dest_tbl"])\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="file:///tmp/test.csv",\n target_object="main.dest_table",\n mode=SlingMode.INCREMENTAL,\n primary_key="id"\n )\n\n Creating a Sling asset that syncs from a table to a file with a full refresh:\n\n .. 
code-block:: python\n\n asset_spec = AssetSpec(key="test.csv")\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="main.dest_table",\n table_object="file:///tmp/test.csv",\n mode=SlingMode.FULL_REFRESH\n primary_key="id"\n )\n\n\n """\n if primary_key is not None and not isinstance(primary_key, list):\n primary_key = [primary_key]\n\n if update_key is not None and not isinstance(update_key, list):\n update_key = [update_key]\n\n @multi_asset(\n compute_kind="sling", specs=[asset_spec], required_resource_keys={sling_resource_key}\n )\n def sync(context: AssetExecutionContext) -> MaterializeResult:\n sling: SlingResource = getattr(context.resources, sling_resource_key)\n for stdout_line in sling.sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n ):\n match = re.search(r"(\\d+) rows", stdout_line)\n if match:\n last_row_count_observed = int(match.group(1))\n context.log.info(stdout_line)\n\n return MaterializeResult(\n metadata=(\n {} if last_row_count_observed is None else {"row_count": last_row_count_observed}\n )\n )\n\n return sync
\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.asset_defs"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.resources

\nimport contextlib\nimport json\nimport re\nfrom enum import Enum\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Any, Dict, Generator, List, Optional\n\nfrom dagster import ConfigurableResource, PermissiveConfig, get_dagster_logger\nfrom dagster._utils.env import environ\nfrom pydantic import Field\nfrom sling import Sling  # type: ignore\n\nlogger = get_dagster_logger()\n\n\nclass SlingMode(str, Enum):\n    """The mode to use when syncing.\n\n    See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n    """\n\n    INCREMENTAL = "incremental"\n    TRUNCATE = "truncate"\n    FULL_REFRESH = "full-refresh"\n    SNAPSHOT = "snapshot"\n\n\n
[docs]class SlingSourceConnection(PermissiveConfig):\n """A Sling Source Connection defines the source connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n\n Creating a Sling Source for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingSourceConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n source = SlingSourceConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema"\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n """\n\n type: str = Field(description="Type of the source connection. Use 'file' for local storage.")\n connection_string: Optional[str] = Field(\n description="The connection string for the source database."\n )
\n\n\n
[docs]class SlingTargetConnection(PermissiveConfig):\n """A Sling Target Connection defines the target connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n Creating a Sling Target for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema"\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block::python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n\n """\n\n type: str = Field(\n description="Type of the destination connection. Use 'file' for local storage."\n )\n connection_string: Optional[str] = Field(\n description="The connection string for the target database."\n )
\n\n\n
[docs]class SlingResource(ConfigurableResource):\n """Resource for interacting with the Sling package.\n\n Examples:\n\n .. code-block:: python\n\n from dagster_etl.sling import SlingResource\n sling_resource = SlingResource(\n source_connection=SlingSourceConnection(\n type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING")\n ),\n target_connection=SlingTargetConnection(\n type="snowflake",\n host="host",\n user="user",\n database="database",\n password="password",\n role="role",\n ),\n )\n\n """\n\n source_connection: SlingSourceConnection\n target_connection: SlingTargetConnection\n\n @contextlib.contextmanager\n def _setup_config(self) -> Generator[None, None, None]:\n """Uses environment variables to set the Sling source and target connections."""\n sling_source = self.source_connection.dict()\n sling_target = self.target_connection.dict()\n if self.source_connection.connection_string:\n sling_source["url"] = self.source_connection.connection_string\n if self.target_connection.connection_string:\n sling_target["url"] = self.target_connection.connection_string\n with environ(\n {\n "SLING_SOURCE": json.dumps(sling_source),\n "SLING_TARGET": json.dumps(sling_target),\n }\n ):\n yield\n\n @staticmethod\n def _exec_sling_cmd(cmd, stdin=None, stdout=PIPE, stderr=STDOUT) -> Generator[str, None, None]:\n ansi_escape = re.compile(r"\\x1B(?:[@-Z\\\\-_]|\\[[0-?]*[ -/]*[@-~])")\n with Popen(cmd, shell=True, stdin=stdin, stdout=stdout, stderr=stderr) as proc:\n assert proc.stdout\n\n for line in proc.stdout:\n fmt_line = str(line, "utf-8")\n clean_line = ansi_escape.sub("", fmt_line).replace("INF", "")\n yield clean_line\n\n proc.wait()\n if proc.returncode != 0:\n raise Exception("Sling command failed with error code %s", proc.returncode)\n\n def _sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n source_options: 
Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Runs a Sling sync from the given source table to the given destination table. Generates\n output lines from the Sling CLI.\n """\n if self.source_connection.type == "file" and not source_stream.startswith("file://"):\n source_stream = "file://" + source_stream\n\n if self.target_connection.type == "file" and not target_object.startswith("file://"):\n target_object = "file://" + target_object\n\n with self._setup_config():\n config = {\n "source": {\n "conn": "SLING_SOURCE",\n "stream": source_stream,\n "primary_key": primary_key,\n "update_key": update_key,\n "options": source_options,\n },\n "target": {\n "conn": "SLING_TARGET",\n "object": target_object,\n "options": target_options,\n },\n }\n config["source"] = {k: v for k, v in config["source"].items() if v is not None}\n config["target"] = {k: v for k, v in config["target"].items() if v is not None}\n\n sling_cli = Sling(**config)\n logger.info("Starting Sling sync with mode: %s", mode)\n cmd = sling_cli._prep_cmd() # noqa: SLF001\n\n yield from self._exec_sling_cmd(cmd)\n\n def sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Initiate a Sling Sync between a source stream and a target object.\n\n Args:\n source_stream (str): The source stream to read from. For database sources, the source stream can be either\n a table name, a SQL statement or a path to a SQL file e.g. `TABLE1` or `SCHEMA1.TABLE2` or\n `SELECT * FROM TABLE`. For file sources, the source stream is a path or an url to a file.\n For file targets, the target object is a path or a url to a file, e.g. 
file:///tmp/file.csv or\n s3://my_bucket/my_folder/file.csv\n target_object (str): The target object to write into. For database targets, the target object is a table\n name, e.g. TABLE1, SCHEMA1.TABLE2. For file targets, the target object is a path or an url to a file.\n mode (SlingMode): The Sling mode to use when syncing, i.e. incremental, full-refresh\n See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n primary_key (str): For incremental syncs, a primary key is used during merge statements to update\n existing rows.\n update_key (str): For incremental syncs, an update key is used to stream records after max(update_key)\n source_options (Dict[str, Any]): Other source options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#source-options-src-options-flag-source.options-key\n for details\n target_options (Dict[str, Any[): Other target options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#target-options-tgt-options-flag-target.options-key\n for details\n\n Examples:\n Sync from a source file to a sqlite database:\n\n .. 
code-block:: python\n\n sqllite_path = "/path/to/sqlite.db"\n csv_path = "/path/to/file.csv"\n\n @asset\n def run_sync(context, sling: SlingResource):\n res = sling.sync(\n source_stream=csv_path,\n target_object="events",\n mode=SlingMode.FULL_REFRESH,\n )\n for stdout in res:\n context.log.debug(stdout)\n counts = sqlite3.connect(sqllitepath).execute("SELECT count(1) FROM events").fetchone()\n assert counts[0] == 3\n\n source = SlingSourceConnection(\n type="file",\n )\n target = SlingTargetConnection(type="sqlite", instance=sqllitepath)\n\n materialize(\n [run_sync],\n resources={\n "sling": SlingResource(\n source_connection=source,\n target_connection=target,\n mode=SlingMode.TRUNCATE,\n )\n },\n )\n\n """\n yield from self._sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n )
\n\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.resources"}}}, "dagster_fivetran": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.asset_defs

\nimport hashlib\nimport inspect\nimport re\nfrom functools import partial\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    Nothing,\n    OpExecutionContext,\n    Output,\n    _check as check,\n    multi_asset,\n)\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterStepOutputNotFoundError\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.utils import (\n    generate_materializations,\n    get_fivetran_connector_url,\n    metadata_for_table,\n)\n\n\ndef _build_fivetran_assets(\n    connector_id: str,\n    destination_tables: Sequence[str],\n    poll_interval: float = DEFAULT_POLL_INTERVAL,\n    poll_timeout: Optional[float] = None,\n    io_manager_key: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n    table_to_asset_key_map: Optional[Mapping[str, AssetKey]] = None,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n    group_name: Optional[str] = None,\n    infer_missing_tables: bool = False,\n    op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n    asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    tracked_asset_keys = {\n        table: AssetKey([*asset_key_prefix, *table.split(".")]) for table in 
destination_tables\n    }\n    user_facing_asset_keys = table_to_asset_key_map or tracked_asset_keys\n\n    _metadata_by_table_name = check.opt_mapping_param(\n        metadata_by_table_name, "metadata_by_table_name", key_type=str\n    )\n\n    @multi_asset(\n        name=f"fivetran_sync_{connector_id}",\n        outs={\n            "_".join(key.path): AssetOut(\n                io_manager_key=io_manager_key,\n                key=user_facing_asset_keys[table],\n                metadata=_metadata_by_table_name.get(table),\n                dagster_type=Nothing,\n            )\n            for table, key in tracked_asset_keys.items()\n        },\n        compute_kind="fivetran",\n        resource_defs=resource_defs,\n        group_name=group_name,\n        op_tags=op_tags,\n    )\n    def _assets(context: OpExecutionContext, fivetran: FivetranResource) -> Any:\n        fivetran_output = fivetran.sync_and_poll(\n            connector_id=connector_id,\n            poll_interval=poll_interval,\n            poll_timeout=poll_timeout,\n        )\n\n        materialized_asset_keys = set()\n        for materialization in generate_materializations(\n            fivetran_output, asset_key_prefix=asset_key_prefix\n        ):\n            # scan through all tables actually created, if it was expected then emit an Output.\n            # otherwise, emit a runtime AssetMaterialization\n            if materialization.asset_key in tracked_asset_keys.values():\n                yield Output(\n                    value=None,\n                    output_name="_".join(materialization.asset_key.path),\n                    metadata=materialization.metadata,\n                )\n                materialized_asset_keys.add(materialization.asset_key)\n\n            else:\n                yield materialization\n\n        unmaterialized_asset_keys = set(tracked_asset_keys.values()) - materialized_asset_keys\n        if infer_missing_tables:\n            for asset_key in 
unmaterialized_asset_keys:\n                yield Output(\n                    value=None,\n                    output_name="_".join(asset_key.path),\n                )\n\n        else:\n            if unmaterialized_asset_keys:\n                asset_key = next(iter(unmaterialized_asset_keys))\n                output_name = "_".join(asset_key.path)\n                raise DagsterStepOutputNotFoundError(\n                    f"Core compute for {context.op_def.name} did not return an output for"\n                    f' non-optional output "{output_name}".',\n                    step_key=context.get_step_execution_context().step.key,\n                    output_name=output_name,\n                )\n\n    return [_assets]\n\n\n
[docs]def build_fivetran_assets(\n connector_id: str,\n destination_tables: Sequence[str],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n io_manager_key: Optional[str] = None,\n asset_key_prefix: Optional[Sequence[str]] = None,\n metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n group_name: Optional[str] = None,\n infer_missing_tables: bool = False,\n op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n """Build a set of assets for a given Fivetran connector.\n\n Returns an AssetsDefinition which connects the specified ``asset_keys`` to the computation that\n will update them. Internally, executes a Fivetran sync for a given ``connector_id``, and\n polls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\n :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to communicate with the\n Fivetran API.\n\n Args:\n connector_id (str): The Fivetran Connector ID that this op will sync. You can retrieve this\n value from the "Setup" tab of a given connector in the Fivetran UI.\n destination_tables (List[str]): `schema_name.table_name` for each table that you want to be\n represented in the Dagster asset graph for this connection.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. 
By default, this will never time out.\n io_manager_key (Optional[str]): The io_manager to be used to handle each of these assets.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([schema_name, table_name])`.\n metadata_by_table_name (Optional[Mapping[str, MetadataUserInput]]): A mapping from destination\n table name to user-supplied metadata that should be associated with the asset for that table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n infer_missing_tables (bool): If True, will create asset materializations for tables specified\n in destination_tables even if they are not present in the Fivetran sync output. This is useful\n in cases where Fivetran does not sync any data for a table and therefore does not include it\n in the sync output API response.\n op_tags (Optional[Dict[str, Any]]):\n A dictionary of tags for the op that computes the asset. Frameworks may expect and\n require certain metadata to be attached to a op. Values that are not strings will be\n json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.\n\n **Examples:**\n\n Basic example:\n\n .. code-block:: python\n\n from dagster import AssetKey, repository, with_resources\n\n from dagster_fivetran import fivetran_resource\n from dagster_fivetran.assets import build_fivetran_assets\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n Attaching metadata:\n\n .. 
code-block:: python\n\n fivetran_assets = build_fivetran_assets(\n connector_id="foobar",\n table_names=["schema1.table1", "schema2.table2"],\n metadata_by_table_name={\n "schema1.table1": {\n "description": "This is a table that contains foo and bar",\n },\n "schema2.table2": {\n "description": "This is a table that contains baz and quux",\n },\n },\n )\n """\n return _build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=destination_tables,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n io_manager_key=io_manager_key,\n asset_key_prefix=asset_key_prefix,\n metadata_by_table_name=metadata_by_table_name,\n group_name=group_name,\n infer_missing_tables=infer_missing_tables,\n op_tags=op_tags,\n )
\n\n\nclass FivetranConnectionMetadata(\n NamedTuple(\n "_FivetranConnectionMetadata",\n [\n ("name", str),\n ("connector_id", str),\n ("connector_url", str),\n ("schemas", Mapping[str, Any]),\n ],\n )\n):\n def build_asset_defn_metadata(\n self,\n key_prefix: Sequence[str],\n group_name: Optional[str],\n table_to_asset_key_fn: Callable[[str], AssetKey],\n io_manager_key: Optional[str] = None,\n ) -> AssetsDefinitionCacheableData:\n schema_table_meta: Dict[str, MetadataUserInput] = {}\n if "schemas" in self.schemas:\n schemas_inner = cast(Dict[str, Any], self.schemas["schemas"])\n for schema in schemas_inner.values():\n if schema["enabled"]:\n schema_name = schema["name_in_destination"]\n schema_tables = cast(Dict[str, Dict[str, Any]], schema["tables"])\n for table in schema_tables.values():\n if table["enabled"]:\n table_name = table["name_in_destination"]\n schema_table_meta[f"{schema_name}.{table_name}"] = metadata_for_table(\n table, self.connector_url\n )\n else:\n schema_table_meta[self.name] = {}\n\n outputs = {\n table: AssetKey([*key_prefix, *list(table_to_asset_key_fn(table).path)])\n for table in schema_table_meta.keys()\n }\n\n internal_deps: Dict[str, Set[AssetKey]] = {}\n\n return AssetsDefinitionCacheableData(\n keys_by_input_name={},\n keys_by_output_name=outputs,\n internal_asset_deps=internal_deps,\n group_name=group_name,\n key_prefix=key_prefix,\n can_subset=False,\n metadata_by_output_name=schema_table_meta,\n extra_metadata={\n "connector_id": self.connector_id,\n "io_manager_key": io_manager_key,\n },\n )\n\n\ndef _build_fivetran_assets_from_metadata(\n assets_defn_meta: AssetsDefinitionCacheableData,\n resource_defs: Mapping[str, ResourceDefinition],\n poll_interval: float,\n poll_timeout: Optional[float] = None,\n) -> AssetsDefinition:\n metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n connector_id = cast(str, metadata["connector_id"])\n io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n return 
_build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=list(\n assets_defn_meta.keys_by_output_name.keys()\n if assets_defn_meta.keys_by_output_name\n else []\n ),\n asset_key_prefix=list(assets_defn_meta.key_prefix or []),\n metadata_by_table_name=cast(\n Dict[str, MetadataUserInput], assets_defn_meta.metadata_by_output_name\n ),\n io_manager_key=io_manager_key,\n table_to_asset_key_map=assets_defn_meta.keys_by_output_name,\n resource_defs=resource_defs,\n group_name=assets_defn_meta.group_name,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )[0]\n\n\nclass FivetranInstanceCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n fivetran_resource_def: Union[FivetranResource, ResourceDefinition],\n key_prefix: Sequence[str],\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]],\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connector_to_asset_key_fn: Optional[Callable[[FivetranConnectionMetadata, str], AssetKey]],\n poll_interval: float,\n poll_timeout: Optional[float],\n ):\n self._fivetran_resource_def = fivetran_resource_def\n self._fivetran_instance: FivetranResource = (\n fivetran_resource_def.process_config_and_initialize()\n if isinstance(fivetran_resource_def, FivetranResource)\n else fivetran_resource_def(build_init_resource_context())\n )\n\n self._key_prefix = key_prefix\n self._connector_to_group_fn = connector_to_group_fn\n self._connection_filter = connector_filter\n self._connector_to_io_manager_key_fn = connector_to_io_manager_key_fn\n self._connector_to_asset_key_fn: Callable[[FivetranConnectionMetadata, str], AssetKey] = (\n connector_to_asset_key_fn or (lambda _, table: AssetKey(path=table.split(".")))\n )\n self._poll_interval = poll_interval\n self._poll_timeout = poll_timeout\n\n contents = hashlib.sha1()\n contents.update(",".join(key_prefix).encode("utf-8"))\n if 
connector_filter:\n contents.update(inspect.getsource(connector_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"fivetran-{contents.hexdigest()}")\n\n def _get_connectors(self) -> Sequence[FivetranConnectionMetadata]:\n output_connectors: List[FivetranConnectionMetadata] = []\n\n groups = self._fivetran_instance.make_request("GET", "groups")["items"]\n\n for group in groups:\n group_id = group["id"]\n\n connectors = self._fivetran_instance.make_request(\n "GET", f"groups/{group_id}/connectors"\n )["items"]\n for connector in connectors:\n connector_id = connector["id"]\n\n connector_name = connector["schema"]\n\n setup_state = connector.get("status", {}).get("setup_state")\n if setup_state and setup_state in ("incomplete", "broken"):\n continue\n\n connector_url = get_fivetran_connector_url(connector)\n\n schemas = self._fivetran_instance.make_request(\n "GET", f"connectors/{connector_id}/schemas"\n )\n\n output_connectors.append(\n FivetranConnectionMetadata(\n name=connector_name,\n connector_id=connector_id,\n connector_url=connector_url,\n schemas=schemas,\n )\n )\n\n return output_connectors\n\n def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connector in self._get_connectors():\n if not self._connection_filter or self._connection_filter(connector):\n table_to_asset_key = partial(self._connector_to_asset_key_fn, connector)\n asset_defn_data.append(\n connector.build_asset_defn_metadata(\n key_prefix=self._key_prefix,\n group_name=(\n self._connector_to_group_fn(connector.name)\n if self._connector_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connector_to_io_manager_key_fn(connector.name)\n if self._connector_to_io_manager_key_fn\n else None\n ),\n table_to_asset_key_fn=table_to_asset_key,\n )\n )\n\n return asset_defn_data\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return 
[\n _build_fivetran_assets_from_metadata(\n meta,\n {"fivetran": self._fivetran_instance.get_resource_definition()},\n poll_interval=self._poll_interval,\n poll_timeout=self._poll_timeout,\n )\n for meta in data\n ]\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\n
[docs]def load_assets_from_fivetran_instance(\n fivetran: Union[FivetranResource, ResourceDefinition],\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]] = None,\n connector_to_asset_key_fn: Optional[\n Callable[[FivetranConnectionMetadata, str], AssetKey]\n ] = None,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n) -> CacheableAssetsDefinition:\n """Loads Fivetran connector assets from a configured FivetranResource instance. This fetches information\n about defined connectors at initialization time, and will error on workspace load if the Fivetran\n instance is not reachable.\n\n Args:\n fivetran (ResourceDefinition): A FivetranResource configured with the appropriate connection\n details.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n connector_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Fivetran connector name. If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The IO manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connector_to_io_manager_key_fn.\n connector_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n IO manager key for a given Fivetran connector name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. 
Defaults to "io_manager".\n connector_filter (Optional[Callable[[FivetranConnectorMetadata], bool]]): Optional function which takes\n in connector metadata and returns False if the connector should be excluded from the output assets.\n connector_to_asset_key_fn (Optional[Callable[[FivetranConnectorMetadata, str], AssetKey]]): Optional function\n which takes in connector metadata and a table name and returns an AssetKey for that table. Defaults to\n a function that generates an AssetKey matching the table name, split by ".".\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. By default, this will never time out.\n\n **Examples:**\n\n Loading all Fivetran connectors as assets:\n\n .. code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(fivetran_instance)\n\n Filtering the set of loaded connectors:\n\n .. 
code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(\n fivetran_instance,\n connector_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connector_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connector_to_io_manager_key_fn",\n )\n if not connector_to_io_manager_key_fn:\n connector_to_io_manager_key_fn = lambda _: io_manager_key\n\n return FivetranInstanceCacheableAssetsDefinition(\n fivetran_resource_def=fivetran,\n key_prefix=key_prefix,\n connector_to_group_fn=connector_to_group_fn,\n connector_to_io_manager_key_fn=connector_to_io_manager_key_fn,\n connector_filter=connector_filter,\n connector_to_asset_key_fn=connector_to_asset_key_fn,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )
\n
", "current_page_name": "_modules/dagster_fivetran/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import (\n    AssetKey,\n    Config,\n    In,\n    Nothing,\n    Out,\n    Output,\n    op,\n)\nfrom pydantic import Field\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import generate_materializations\n\n\nclass SyncConfig(Config):\n    connector_id: str = Field(\n        description=(\n            "The Fivetran Connector ID that this op will sync. You can retrieve this "\n            'value from the "Setup" tab of a given connector in the Fivetran UI.'\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the Fivetran sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["fivetran"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " sync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_sync_op(config: SyncConfig, fivetran: FivetranResource) -> Any:\n """Executes a Fivetran sync for a given ``connector_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the sync successfully completes, as well as details\n about which tables the sync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_sync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.sync_and_poll(\n connector_id=config.connector_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(fivetran_output)
\n\n\nclass FivetranResyncConfig(SyncConfig):\n resync_parameters: Optional[Dict[str, Any]] = Field(\n None,\n description=(\n "Optional resync parameters to send in the payload to the Fivetran API. You can"\n " find an example resync payload here:"\n " https://fivetran.com/docs/rest-api/connectors#request_7"\n ),\n )\n\n\n@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " resync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_resync_op(\n config: FivetranResyncConfig,\n fivetran: FivetranResource,\n) -> Any:\n """Executes a Fivetran historical resync for a given ``connector_id``, and polls until that resync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the resync successfully completes, as well as details\n about which tables the resync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_resync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_resync_op.configured(\n {\n "connector_id": "foobar",\n "resync_parameters": {\n "schema_a": ["table_a", "table_b"],\n "schema_b": ["table_c"]\n }\n },\n name="sync_foobar"\n )\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.resync_and_poll(\n connector_id=config.connector_id,\n resync_parameters=config.resync_parameters,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n asset_key_filter = (\n [\n AssetKey(config.asset_key_prefix + [schema, table])\n for schema, tables in config.resync_parameters.items()\n for table in tables\n ]\n if config.resync_parameters is not None\n else None\n )\n for mat in generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n ):\n if asset_key_filter is None or mat.asset_key in asset_key_filter:\n yield mat\n\n yield Output(fivetran_output)\n
", "current_page_name": "_modules/dagster_fivetran/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional, Sequence, Tuple\nfrom urllib.parse import urljoin\n\nimport requests\nfrom dagster import (\n    Failure,\n    InitResourceContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dateutil import parser\nfrom pydantic import Field\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import get_fivetran_connector_url, get_fivetran_logs_url\n\nFIVETRAN_API_BASE = "https://api.fivetran.com"\nFIVETRAN_API_VERSION_PATH = "v1/"\nFIVETRAN_CONNECTOR_PATH = "connectors/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class FivetranResource(ConfigurableResource):\n """This class exposes methods on top of the Fivetran REST API."""\n\n api_key: str = Field(description="The Fivetran API key to use for this resource.")\n api_secret: str = Field(description="The Fivetran API secret to use for this resource.")\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any connector that is sync'd using this "\n "resource to be automatically taken off its Fivetran schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the Fivetran API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n def _auth(self) -> HTTPBasicAuth:\n return HTTPBasicAuth(self.api_key, self.api_secret)\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return urljoin(FIVETRAN_API_BASE, FIVETRAN_API_VERSION_PATH)\n\n @property\n def api_connector_url(self) -> str:\n return urljoin(self.api_base_url, FIVETRAN_CONNECTOR_PATH)\n\n def make_connector_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n return self.make_request(method, urljoin(FIVETRAN_CONNECTOR_PATH, endpoint), data)\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Fivetran Connector API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n endpoint (str): The Fivetran API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n url = urljoin(self.api_base_url, endpoint)\n headers = {\n "User-Agent": f"dagster-fivetran/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=self._auth,\n data=data,\n )\n response.raise_for_status()\n resp_dict = response.json()\n return resp_dict["data"] if "data" in resp_dict else resp_dict\n except RequestException as e:\n self._log.error("Request to Fivetran API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def get_connector_details(self, connector_id: str) -> Mapping[str, Any]:\n """Gets details about a given connector from the Fivetran Connector API.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_connector_request(method="GET", endpoint=connector_id)\n\n def _assert_syncable_connector(self, connector_id: str):\n """Confirms that a given connector is eligible to sync. Will raise a Failure in the event that\n the connector is either paused or not fully setup.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n """\n connector_details = self.get_connector_details(connector_id)\n if connector_details["paused"]:\n raise Failure(f"Connector '{connector_id}' cannot be synced as it is currently paused.")\n if connector_details["status"]["setup_state"] != "connected":\n raise Failure(f"Connector '{connector_id}' cannot be synced as it has not been setup")\n\n def get_connector_sync_status(self, connector_id: str) -> Tuple[datetime.datetime, bool, str]:\n """Gets details about the status of the most recent Fivetran sync operation for a given\n connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Tuple[datetime.datetime, bool, str]:\n Tuple representing the timestamp of the last completed sync, if it succeeded, and\n the currently reported sync status.\n """\n connector_details = self.get_connector_details(connector_id)\n\n min_time_str = "0001-01-01 00:00:00+00"\n succeeded_at = parser.parse(connector_details["succeeded_at"] or min_time_str)\n failed_at = parser.parse(connector_details["failed_at"] or min_time_str)\n\n return (\n max(succeeded_at, failed_at),\n succeeded_at > failed_at,\n connector_details["status"]["sync_state"],\n )\n\n def update_connector(\n self, connector_id: str, properties: Optional[Mapping[str, Any]] = None\n ) -> Mapping[str, Any]:\n """Updates properties of a Fivetran Connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n properties (Dict[str, Any]): The properties to be updated. 
For a comprehensive list of\n properties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n return self.make_connector_request(\n method="PATCH", endpoint=connector_id, data=json.dumps(properties)\n )\n\n def update_schedule_type(\n self, connector_id: str, schedule_type: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Updates the schedule type property of the connector to either "auto" or "manual".\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n schedule_type (Optional[str]): Either "auto" (to turn the schedule on) or "manual" (to\n turn it off).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n if schedule_type not in ["auto", "manual"]:\n check.failed(f"schedule_type must be either 'auto' or 'manual': got '{schedule_type}'")\n return self.update_connector(connector_id, properties={"schedule_type": schedule_type})\n\n def get_connector_schema_config(self, connector_id: str) -> Mapping[str, Any]:\n return self.make_connector_request("GET", endpoint=f"{connector_id}/schemas")\n\n def start_sync(self, connector_id: str) -> Mapping[str, Any]:\n """Initiates a sync of a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the sync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(method="POST", endpoint=f"{connector_id}/force")\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this sync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def start_resync(\n self, connector_id: str, resync_parameters: Optional[Mapping[str, Sequence[str]]] = None\n ) -> Mapping[str, Any]:\n """Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Optional[Dict[str, List[str]]]): Optional resync parameters to send to the Fivetran API.\n An example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_7\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the resync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(\n method="POST",\n endpoint=(\n f"{connector_id}/schemas/tables/resync"\n if resync_parameters is not None\n else f"{connector_id}/resync"\n ),\n data=json.dumps(resync_parameters) if resync_parameters is not None else None,\n )\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this resync in the Fivetran"\n " UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def poll_sync(\n self,\n connector_id: str,\n initial_last_sync_completion: datetime.datetime,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Fivetran connector and the timestamp at which the previous sync completed, poll\n until the next sync completes.\n\n The previous sync completion time is necessary because the only way to tell when a sync\n completes is when this value changes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n initial_last_sync_completion (datetime.datetime): The timestamp of the last completed sync\n (successful or otherwise) for this connector, prior to running this method.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n poll_start = datetime.datetime.now()\n while True:\n (\n curr_last_sync_completion,\n curr_last_sync_succeeded,\n curr_sync_state,\n ) = self.get_connector_sync_status(connector_id)\n self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")\n\n if curr_last_sync_completion > initial_last_sync_completion:\n break\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for connector '{connector_id}' timed out after "\n f"{datetime.datetime.now() - poll_start}."\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n connector_details = self.get_connector_details(connector_id)\n if not curr_last_sync_succeeded:\n raise Failure(\n f"Sync for connector '{connector_id}' failed!",\n metadata={\n "connector_details": MetadataValue.json(connector_details),\n "log_url": MetadataValue.url(get_fivetran_logs_url(connector_details)),\n },\n )\n return connector_details\n\n def sync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_sync(connector_id)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)\n\n def resync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n resync_parameters: Optional[Mapping[str, Sequence[str]]] = None,\n ) -> FivetranOutput:\n """Initializes a historical resync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The payload to send to the Fivetran API.\n This should be a dictionary with schema names as the keys and a list of tables\n to resync as the values.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_resync(connector_id, resync_parameters)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=FivetranResource.to_config_schema())\ndef fivetran_resource(context: InitResourceContext) -> FivetranResource:\n """This resource allows users to programmatically interface with the Fivetran REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Fivetran REST API, including expected response JSON\n schemas, see the `Fivetran API Docs <https://fivetran.com/docs/rest-api/connectors>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n @job(resource_defs={"fivetran":my_fivetran_resource})\n def my_fivetran_job():\n ...\n\n """\n return FivetranResource.from_resource_context(context)
\n
", "current_page_name": "_modules/dagster_fivetran/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.resources"}}, "dagster_gcp": {"bigquery": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Generator, Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._annotations import experimental\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n    TimeWindow,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom google.api_core.exceptions import NotFound\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\nBIGQUERY_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]@experimental\ndef build_bigquery_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of BigQuery tables and an in-memory type - e.g. a Pandas DataFrame.\n If only one DbTypeHandler is provided, it will be used as the default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import build_bigquery_io_manager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n bigquery_io_manager = build_bigquery_io_manager([BigQueryPandasTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a `schema` entry in output metadata. 
If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster will store this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. 
You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=BigQueryIOManager.to_config_schema())\n def bigquery_io_manager(init_context):\n """I/O Manager for storing outputs in a BigQuery database.\n\n Assets will be stored in the dataset and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the dataset specified by output metadata (defaults to public) in a\n table of the name of the output.\n\n Note that the BigQuery config is mapped to the DB IO manager table hierarchy as follows:\n BigQuery DB IO\n * project -> database\n * dataset -> schema\n * table -> table\n """\n mgr = DbIOManager(\n type_handlers=type_handlers,\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=init_context.resource_config["project"],\n schema=init_context.resource_config.get("dataset"),\n default_load_type=default_load_type,\n )\n if init_context.resource_config.get("gcp_credentials"):\n with setup_gcp_creds(init_context.resource_config.get("gcp_credentials")):\n yield mgr\n else:\n yield mgr\n\n return bigquery_io_manager
\n\n\n
[docs]class BigQueryIOManager(ConfigurableIOManagerFactory):\n """Base class for an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a ``schema`` entry in output metadata. If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster will store this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n project: str = Field(description="The GCP project to use.")\n dataset: Optional[str] = Field(\n default=None,\n description=(\n "Name of the BigQuery dataset to use. If not provided, the last prefix before"\n " the asset name will be used."\n ),\n )\n location: Optional[str] = Field(\n default=None,\n description=(\n "The GCP location. Note: When using PySpark DataFrames, the default"\n " location of the project will be used. A custom location can be specified in"\n " your SparkSession configuration."\n ),\n )\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. 
You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n temporary_gcs_bucket: Optional[str] = Field(\n default=None,\n description=(\n "When using PySpark DataFrames, optionally specify a temporary GCS bucket to"\n " store data. If not provided, data will be directly written to BigQuery."\n ),\n )\n timeout: Optional[float] = Field(\n default=None,\n description=(\n "When using Pandas DataFrames, optionally specify a timeout for the BigQuery"\n " queries (loading and reading from tables)."\n ),\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> Generator:\n mgr = DbIOManager(\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=self.project,\n schema=self.dataset,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield mgr\n else:\n yield mgr
\n\n\nclass BigQueryClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.query(_get_cleanup_statement(table_slice)).result()\n except NotFound:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"""\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.query(f"CREATE SCHEMA IF NOT EXISTS {table_slice.schema}").result()\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = bigquery.Client(\n project=context.resource_config.get("project"),\n location=context.resource_config.get("location"),\n )\n\n yield conn\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"TRUNCATE TABLE `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if 
isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_gcp/bigquery/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.ops

\nimport hashlib\n\nfrom dagster import (\n    In,\n    List,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster_pandas import DataFrame\nfrom google.cloud.bigquery.encryption_configuration import EncryptionConfiguration\nfrom google.cloud.bigquery.job import LoadJobConfig, QueryJobConfig\nfrom google.cloud.bigquery.table import TimePartitioning\n\nfrom .configs import (\n    define_bigquery_create_dataset_config,\n    define_bigquery_delete_dataset_config,\n    define_bigquery_load_config,\n    define_bigquery_query_config,\n)\nfrom .types import BigQueryLoadSource\n\n_START = "start"\n\n\ndef _preprocess_config(cfg):\n    destination_encryption_configuration = cfg.get("destination_encryption_configuration")\n    time_partitioning = cfg.get("time_partitioning")\n\n    if destination_encryption_configuration is not None:\n        cfg["destination_encryption_configuration"] = EncryptionConfiguration(\n            kms_key_name=destination_encryption_configuration\n        )\n\n    if time_partitioning is not None:\n        cfg["time_partitioning"] = TimePartitioning(**time_partitioning)\n\n    return cfg\n\n\n
[docs]def bq_op_for_queries(sql_queries):\n """Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n sql_queries = check.list_param(sql_queries, "sql queries", of_type=str)\n m = hashlib.sha1()\n for query in sql_queries:\n m.update(query.encode("utf-8"))\n hash_str = m.hexdigest()[:10]\n name = f"bq_op_{hash_str}"\n\n @op(\n name=name,\n ins={_START: In(Nothing)},\n out=Out(List[DataFrame]),\n config_schema=define_bigquery_query_config(),\n required_resource_keys={"bigquery"},\n tags={"kind": "sql", "sql": "\\n".join(sql_queries)},\n )\n def _bq_fn(context):\n query_job_config = _preprocess_config(context.op_config.get("query_job_config", {}))\n\n # Retrieve results as pandas DataFrames\n results = []\n for sql_query in sql_queries:\n # We need to construct a new QueryJobConfig for each query.\n # See: https://bit.ly/2VjD6sl\n cfg = QueryJobConfig(**query_job_config) if query_job_config else None\n context.log.info(\n "executing query %s with config: %s"\n % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")\n )\n results.append(\n context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()\n )\n\n return results\n\n return _bq_fn
\n\n\nBIGQUERY_LOAD_CONFIG = define_bigquery_load_config()\n\n\n
[docs]@op(\n ins={"paths": In(List[str])},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_gcs_paths_to_bq(context, paths):\n return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)
\n\n\n
[docs]@op(\n ins={"df": In(DataFrame)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_df_to_bq(context, df):\n return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
\n\n\n
[docs]@op(\n ins={"path": In(str)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_file_to_bq(context, path):\n return _execute_load_in_source(context, path, BigQueryLoadSource.File)
\n\n\ndef _execute_load_in_source(context, source, source_name):\n destination = context.op_config.get("destination")\n load_job_config = _preprocess_config(context.op_config.get("load_job_config", {}))\n cfg = LoadJobConfig(**load_job_config) if load_job_config else None\n\n context.log.info(\n "executing BQ load with config: %s for source %s"\n % (cfg.to_api_repr() if cfg else "(no config provided)", source)\n )\n\n if source_name == BigQueryLoadSource.DataFrame:\n context.resources.bigquery.load_table_from_dataframe(\n source, destination, job_config=cfg\n ).result()\n\n # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local\n elif source_name == BigQueryLoadSource.File:\n with open(source, "rb") as file_obj:\n context.resources.bigquery.load_table_from_file(\n file_obj, destination, job_config=cfg\n ).result()\n\n # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage\n elif source_name == BigQueryLoadSource.GCS:\n context.resources.bigquery.load_table_from_uri(source, destination, job_config=cfg).result()\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_create_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_create_dataset(context):\n """BigQuery Create Dataset.\n\n This op encapsulates creating a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, exists_ok) = [context.op_config.get(k) for k in ("dataset", "exists_ok")]\n context.log.info("executing BQ create_dataset for dataset %s" % (dataset))\n context.resources.bigquery.create_dataset(dataset, exists_ok)
\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_delete_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_delete_dataset(context):\n """BigQuery Delete Dataset.\n\n This op encapsulates deleting a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, delete_contents, not_found_ok) = [\n context.op_config.get(k) for k in ("dataset", "delete_contents", "not_found_ok")\n ]\n\n context.log.info("executing BQ delete_dataset for dataset %s" % dataset)\n\n context.resources.bigquery.delete_dataset(\n dataset, delete_contents=delete_contents, not_found_ok=not_found_ok\n )
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\n\n
[docs]class BigQueryResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "bigquery": BigQueryResource(project="my-project")\n }\n )\n """\n\n project: Optional[str] = Field(\n default=None,\n description=(\n "Project ID for the project which the client acts on behalf of. Will be passed when"\n " creating a dataset / job. If not passed, falls back to the default inferred from the"\n " environment."\n ),\n )\n\n location: Optional[str] = Field(\n default=None,\n description="Default location for jobs / datasets / tables.",\n )\n\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_client(self) -> Iterator[bigquery.Client]:\n """Context manager to create a BigQuery Client.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n """\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield bigquery.Client(project=self.project, location=self.location)\n\n else:\n yield bigquery.Client(project=self.project, location=self.location)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n with self.get_client() as client:\n yield client
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=BigQueryResource.to_config_schema(),\n description="Dagster resource for connecting to BigQuery",\n)\ndef bigquery_resource(context):\n bq_resource = BigQueryResource.from_resource_context(context)\n with bq_resource.get_client() as client:\n yield client
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.types

\nimport re\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\nfrom dagster._config import ConfigScalar, ConfigScalarKind, PostProcessingError\nfrom google.cloud.bigquery.job import (\n    CreateDisposition,\n    Encoding,\n    QueryPriority,\n    SchemaUpdateOption,\n    SourceFormat,\n    WriteDisposition,\n)\n\n\nclass BigQueryLoadSource(PyEnum):\n    DataFrame = "DATA_FRAME"\n    GCS = "GCS"\n    File = "FILE"\n\n\nBQCreateDisposition = Enum(\n    name="BQCreateDisposition",\n    enum_values=[\n        EnumValue(CreateDisposition.CREATE_IF_NEEDED),\n        EnumValue(CreateDisposition.CREATE_NEVER),\n    ],\n)\n\nBQPriority = Enum(\n    name="BQPriority",\n    enum_values=[EnumValue(QueryPriority.BATCH), EnumValue(QueryPriority.INTERACTIVE)],\n)\n\nBQSchemaUpdateOption = Enum(\n    name="BQSchemaUpdateOption",\n    enum_values=[\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_ADDITION,\n            description="Allow adding a nullable field to the schema.",\n        ),\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_RELAXATION,\n            description="Allow relaxing a required field in the original schema to nullable.",\n        ),\n    ],\n)\n\nBQWriteDisposition = Enum(\n    name="BQWriteDisposition",\n    enum_values=[\n        EnumValue(WriteDisposition.WRITE_APPEND),\n        EnumValue(WriteDisposition.WRITE_EMPTY),\n        EnumValue(WriteDisposition.WRITE_TRUNCATE),\n    ],\n)\n\nBQEncoding = Enum(\n    name="BQEncoding", enum_values=[EnumValue(Encoding.ISO_8859_1), EnumValue(Encoding.UTF_8)]\n)\n\nBQSourceFormat = Enum(\n    name="BQSourceFormat",\n    enum_values=[\n        EnumValue(SourceFormat.AVRO),\n        EnumValue(SourceFormat.CSV),\n        EnumValue(SourceFormat.DATASTORE_BACKUP),\n        EnumValue(SourceFormat.NEWLINE_DELIMITED_JSON),\n        EnumValue(SourceFormat.ORC),\n        EnumValue(SourceFormat.PARQUET),\n    ],\n)\n\n\n# Project names are permitted to have alphanumeric, 
dashes and underscores, up to 1024 characters.\nRE_PROJECT = r"[\\w\\d\\-\\_]{1,1024}"\n\n# Datasets and tables are permitted to have alphanumeric or underscores, no dashes allowed, up to\n# 1024 characters\nRE_DS_TABLE = r"[\\w\\d\\_]{1,1024}"\n\n# BigQuery supports writes directly to date partitions with the syntax foo.bar$20190101\nRE_PARTITION_SUFFIX = r"(\\$\\d{8})?"\n\n\ndef _is_valid_dataset(config_value):\n    """Datasets must be of form "project.dataset" or "dataset"."""\n    return re.match(\n        # regex matches: project.dataset -- OR -- dataset\n        r"^" + RE_PROJECT + r"\\." + RE_DS_TABLE + r"$|^" + RE_DS_TABLE + r"$",\n        config_value,\n    )\n\n\ndef _is_valid_table(config_value):\n    """Tables must be of form "project.dataset.table" or "dataset.table" with optional\n    date-partition suffix.\n    """\n    return re.match(\n        r"^"\n        + RE_PROJECT  #          project\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$|^"  #              -- OR --\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  
#               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$",\n        config_value,\n    )\n\n\nclass _Dataset(ConfigScalar):\n    def __init__(self):\n        super(_Dataset, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_dataset(value):\n            raise PostProcessingError('Datasets must be of the form "project.dataset" or "dataset"')\n        return value\n\n\nclass _Table(ConfigScalar):\n    def __init__(self):\n        super(_Table, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_table(value):\n            raise PostProcessingError(\n                'Tables must be of the form "project.dataset.table" or "dataset.table" '\n                "with optional date-partition suffix"\n            )\n\n        return value\n\n\n# https://github.com/dagster-io/dagster/issues/1971\nTable = _Table()\nDataset = _Dataset()\n\n\n
[docs]class BigQueryError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.types"}}, "dataproc": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.ops

\nfrom typing import Any, Dict\n\nfrom dagster import (\n    Bool,\n    Config,\n    Field as DagsterField,\n    Int,\n    op,\n)\nfrom dagster._seven import json\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_submit_job_config\nfrom .resources import TWENTY_MINUTES, DataprocResource\n\n# maintain the old config schema because of the nested job_config schema\nDATAPROC_CONFIG_SCHEMA = {\n    "job_timeout_in_seconds": DagsterField(\n        Int,\n        description="""Optional. Maximum time in seconds to wait for the job being\n                    completed. Default is set to 1200 seconds (20 minutes).\n                    """,\n        is_required=False,\n        default_value=TWENTY_MINUTES,\n    ),\n    "job_config": define_dataproc_submit_job_config(),\n    "job_scoped_cluster": DagsterField(\n        Bool,\n        description="whether to create a cluster or use an existing cluster",\n        is_required=False,\n        default_value=True,\n    ),\n}\n\n\nclass DataprocOpConfig(Config):\n    job_timeout_in_seconds: int = Field(\n        default=TWENTY_MINUTES,\n        description=(\n            "Maximum time in seconds to wait for the job being completed. Default is set to 1200"\n            " seconds (20 minutes)."\n        ),\n    )\n    job_scoped_cluster: bool = Field(\n        default=True,\n        description="Whether to create a cluster or use an existing cluster. Defaults to True.",\n    )\n    project_id: str = Field(\n        description=(\n            "Required. Project ID for the project which the client acts on behalf of. 
Will be"\n            " passed when creating a dataset/job."\n        )\n    )\n    region: str = Field(description="The GCP region.")\n    job_config: Dict[str, Any] = Field(\n        description="Python dictionary containing configuration for the Dataproc Job."\n    )\n\n\ndef _dataproc_compute(context):\n    job_config = context.op_config["job_config"]\n    job_timeout = context.op_config["job_timeout_in_seconds"]\n\n    context.log.info(\n        "submitting job with config: %s and timeout of: %d seconds"\n        % (str(json.dumps(job_config)), job_timeout)\n    )\n\n    if context.op_config["job_scoped_cluster"]:\n        # Cluster context manager, creates and then deletes cluster\n        with context.resources.dataproc.cluster_context_manager() as cluster:\n            # Submit the job specified by this solid to the cluster defined by the associated resource\n            result = cluster.submit_job(job_config)\n\n            job_id = result["reference"]["jobId"]\n            context.log.info(f"Submitted job ID {job_id}")\n            cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n    else:\n        # Submit to an existing cluster\n        # Submit the job specified by this solid to the cluster defined by the associated resource\n        result = context.resources.dataproc.submit_job(job_config)\n\n        job_id = result["reference"]["jobId"]\n        context.log.info(f"Submitted job ID {job_id}")\n        context.resources.dataproc.wait_for_job(job_id, wait_timeout=job_timeout)\n\n\n@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_solid(context):\n    return _dataproc_compute(context)\n\n\n
[docs]@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_op(context):\n return _dataproc_compute(context)
\n\n\n@op\ndef configurable_dataproc_op(context, dataproc: DataprocResource, config: DataprocOpConfig):\n job_config = {"projectId": config.project_id, "region": config.region, "job": config.job_config}\n job_timeout = config.job_timeout_in_seconds\n\n context.log.info(\n "submitting job with config: %s and timeout of: %d seconds"\n % (str(json.dumps(job_config)), job_timeout)\n )\n\n dataproc_client = dataproc.get_client()\n\n if config.job_scoped_cluster:\n # Cluster context manager, creates and then deletes cluster\n with dataproc_client.cluster_context_manager() as cluster:\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = cluster.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n else:\n # Submit to an existing cluster\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = dataproc_client.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n dataproc_client.wait_for_job(job_id, wait_timeout=job_timeout)\n
", "current_page_name": "_modules/dagster_gcp/dataproc/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.resources

\nimport json\nimport time\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Mapping, Optional\n\nimport dagster._check as check\nimport yaml\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom googleapiclient.discovery import build\nfrom oauth2client.client import GoogleCredentials\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_create_cluster_config\nfrom .types import DataprocError\n\nTWENTY_MINUTES = 20 * 60\nDEFAULT_ITER_TIME_SEC = 5\n\n\nclass DataprocClient:\n    """Builds a client to the dataproc API."""\n\n    def __init__(self, config):\n        # Use Application Default Credentials to check the\n        # GOOGLE_APPLICATION_CREDENTIALS environment variable\n        # for the location of the service account key file.\n        credentials = GoogleCredentials.get_application_default()\n\n        # See https://github.com/googleapis/google-api-python-client/issues/299 for the\n        # cache_discovery=False configuration below\n        self.dataproc = build("dataproc", "v1", credentials=credentials, cache_discovery=False)\n\n        self.config = config\n\n        (self.project_id, self.region, self.cluster_name, self.cluster_config) = (\n            self.config.get(k) for k in ("projectId", "region", "clusterName", "cluster_config")\n        )\n\n    @property\n    def dataproc_clusters(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .clusters()\n        )\n\n    @property\n    def dataproc_jobs(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .jobs()\n        )\n\n    def create_cluster(self):\n        (\n            self.dataproc_clusters.create(\n                
projectId=self.project_id,\n                region=self.region,\n                body={\n                    "projectId": self.project_id,\n                    "clusterName": self.cluster_name,\n                    "config": self.cluster_config,\n                },\n            ).execute()\n        )\n\n        def iter_fn():\n            # TODO: Add logging\n            # See: https://bit.ly/2UW5JaN\n            cluster = self.get_cluster()\n            return cluster["status"]["state"] in {"RUNNING", "UPDATING"}\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn)\n        if not done:\n            cluster = self.get_cluster()\n            raise DataprocError(\n                "Could not provision cluster -- status: %s" % str(cluster["status"])\n            )\n\n    def get_cluster(self):\n        return self.dataproc_clusters.get(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def delete_cluster(self):\n        return self.dataproc_clusters.delete(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def submit_job(self, job_details):\n        return self.dataproc_jobs.submit(\n            projectId=self.project_id, region=self.region, body=job_details\n        ).execute()\n\n    def get_job(self, job_id):\n        return self.dataproc_jobs.get(\n            projectId=self.project_id, region=self.region, jobId=job_id\n        ).execute()\n\n    def wait_for_job(self, job_id, wait_timeout=TWENTY_MINUTES):\n        """This method polls job status every 5 seconds."""\n\n        # TODO: Add logging here print('Waiting for job ID {} to finish...'.format(job_id))\n        def iter_fn():\n            # See: https://bit.ly/2Lg2tHr\n            result = self.get_job(job_id)\n\n            # Handle exceptions\n            if result["status"]["state"] in {"CANCELLED", "ERROR"}:\n                raise DataprocError("Job 
error: %s" % str(result["status"]))\n\n            if result["status"]["state"] == "DONE":\n                return True\n\n            return False\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn, max_wait_time_sec=wait_timeout)\n        if not done:\n            job = self.get_job(job_id)\n            raise DataprocError("Job run timed out: %s" % str(job["status"]))\n\n    @staticmethod\n    def _iter_and_sleep_until_ready(\n        callable_fn, max_wait_time_sec=TWENTY_MINUTES, iter_time=DEFAULT_ITER_TIME_SEC\n    ):\n        """Iterates and sleeps until callable_fn returns true."""\n        # Wait for cluster ready state\n        ready, curr_iter = False, 0\n        max_iter = max_wait_time_sec / iter_time\n        while not ready and curr_iter < max_iter:\n            ready = callable_fn()\n            time.sleep(iter_time)\n            curr_iter += 1\n\n        # Will return false if ran up to max_iter without success\n        return ready\n\n    @contextmanager\n    def cluster_context_manager(self):\n        """Context manager allowing execution with a dataproc cluster.\n\n        Example:\n        .. code-block::\n            with context.resources.dataproc.cluster as cluster:\n                # do stuff...\n        """\n        self.create_cluster()\n        try:\n            yield self\n        finally:\n            self.delete_cluster()\n\n\n
[docs]class DataprocResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for connecting to a Dataproc cluster.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(dataproc: DataprocResource):\n with dataproc.get_client() as client:\n # client is a dagster_gcp.DataprocClient\n ...\n """\n\n project_id: str = Field(\n description=(\n "Required. Project ID for the project which the client acts on behalf of. Will be"\n " passed when creating a dataset/job."\n )\n )\n region: str = Field(description="The GCP region.")\n cluster_name: str = Field(\n description=(\n "Required. The cluster name. Cluster names within a project must be unique. Names of"\n " deleted clusters can be reused."\n )\n )\n cluster_config_yaml_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a YAML file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_json_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a JSON file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_dict: Optional[Dict[str, Any]] = Field(\n default=None,\n description=(\n "Python dictionary containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. 
Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _read_yaml_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return yaml.safe_load(f)\n\n def _read_json_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return json.load(f)\n\n def _get_cluster_config(self) -> Optional[Mapping[str, Any]]:\n methods = 0\n methods += 1 if self.cluster_config_dict is not None else 0\n methods += 1 if self.cluster_config_json_path is not None else 0\n methods += 1 if self.cluster_config_yaml_path is not None else 0\n\n # ensure that at most 1 method is provided\n check.invariant(\n methods <= 1,\n "Dataproc Resource: Incorrect config: Cannot provide cluster config multiple ways."\n " Choose one of cluster_config_dict, cluster_config_json_path, or"\n " cluster_config_yaml_path",\n )\n\n cluster_config = None\n if self.cluster_config_json_path:\n cluster_config = self._read_json_config(self.cluster_config_json_path)\n elif self.cluster_config_yaml_path:\n cluster_config = self._read_yaml_config(self.cluster_config_yaml_path)\n elif self.cluster_config_dict:\n cluster_config = self.cluster_config_dict\n\n return cluster_config\n\n def get_client(self) -> DataprocClient:\n cluster_config = self._get_cluster_config()\n\n client_config_dict = {\n "projectId": self.project_id,\n "region": self.region,\n "clusterName": self.cluster_name,\n "cluster_config": cluster_config,\n }\n\n return DataprocClient(config=client_config_dict)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=define_dataproc_create_cluster_config(),\n description="Manage a Dataproc cluster resource",\n)\ndef dataproc_resource(context):\n return DataprocClient(context.resource_config)
\n
", "current_page_name": "_modules/dagster_gcp/dataproc/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.resources"}}, "gcs": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.compute_log_manager

\nimport datetime\nimport json\nimport os\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom google.cloud import storage\nfrom typing_extensions import Self\n\n\n
[docs]class GCSComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to GCS.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_gcp.gcs.compute_log_manager\n class: GCSComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n upload_interval: 30\n\n There are more configuration examples in the instance documentation guide: https://docs.dagster.io/deployment/dagster-instance#compute-log-storage\n\n Args:\n bucket (str): The name of the GCS bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n json_credentials_envvar (Optional[str]): Environment variable that contains the JSON with a private key\n and other credentials information. If this is set, ``GOOGLE_APPLICATION_CREDENTIALS`` will be ignored.\n Can be used when the private key cannot be used as a file.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to GCS. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when instantiated from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n json_credentials_envvar=None,\n upload_interval=None,\n ):\n self._bucket_name = check.str_param(bucket, "bucket")\n self._prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n if json_credentials_envvar:\n json_info_str = os.environ.get(json_credentials_envvar)\n credentials_info = json.loads(json_info_str) # type: ignore # (possible none)\n self._bucket = (\n storage.Client()\n .from_service_account_info(credentials_info)\n .bucket(self._bucket_name)\n )\n else:\n self._bucket = storage.Client().bucket(self._bucket_name)\n\n # Check if the bucket exists\n check.invariant(self._bucket.exists())\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "json_credentials_envvar": Field(StringSource, is_required=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return GCSComputeLogManager(inst_data=inst_data, 
**config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _gcs_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._prefix, "storage", *namespace, filename]\n return "/".join(paths)\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self._local_manager.delete_logs(log_key, prefix)\n if log_key:\n gcs_keys_to_remove = [\n self._gcs_key(log_key, ComputeIOType.STDOUT),\n self._gcs_key(log_key, ComputeIOType.STDERR),\n self._gcs_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._gcs_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n # if the blob doesn't exist, do nothing instead of raising a not found exception\n self._bucket.delete_blobs(gcs_keys_to_remove, on_error=lambda _: None)\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n delete_prefix = "/".join([self._prefix, "storage", *prefix, ""])\n to_delete = self._bucket.list_blobs(prefix=delete_prefix)\n self._bucket.delete_blobs(list(to_delete))\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n gcs_key = self._gcs_key(log_key, io_type)\n try:\n return self._bucket.blob(gcs_key).generate_signed_url(\n expiration=datetime.timedelta(minutes=60)\n )\n except:\n # fallback to the local download url 
if the current credentials are insufficient to create\n # signed urls\n return self.local_manager.get_captured_log_download_url(log_key, io_type)\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n gcs_key = self._gcs_key(log_key, io_type)\n return f"gs://{self._bucket_name}/{gcs_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n gcs_key = self._gcs_key(log_key, io_type, partial)\n return self._bucket.blob(gcs_key).exists()\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if partial and os.stat(path).st_size == 0:\n return\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n self._bucket.blob(gcs_key).upload_from_file(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._bucket.blob(gcs_key).download_to_file(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_gcp/gcs/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import Optional\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\nfrom google.cloud import storage\n\n\n
[docs]class GCSFileHandle(FileHandle):\n """A reference to a file on GCS."""\n\n def __init__(self, gcs_bucket: str, gcs_key: str):\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_key = check.str_param(gcs_key, "gcs_key")\n\n @property\n def gcs_bucket(self) -> str:\n """str: The name of the GCS bucket."""\n return self._gcs_bucket\n\n @property\n def gcs_key(self) -> str:\n """str: The GCS key."""\n return self._gcs_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's GCS URL."""\n return self.gcs_path\n\n @property\n def gcs_path(self) -> str:\n """str: The file's GCS URL."""\n return f"gs://{self.gcs_bucket}/{self.gcs_key}"
\n\n\nclass GCSFileManager(FileManager):\n def __init__(self, client, gcs_bucket, gcs_base_key):\n self._client = check.inst_param(client, "client", storage.client.Client)\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_base_key = check.str_param(gcs_base_key, "gcs_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n bucket_obj = self._client.bucket(file_handle.gcs_bucket)\n bucket_obj.blob(file_handle.gcs_key).download_to_file(temp_file_obj)\n self._local_handle_cache[file_handle.gcs_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", GCSFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.gcs_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.gcs_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None, key: Optional[str] = None):\n key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", key=key, ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None, key: Optional[str] = None):\n 
key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check_file_like_obj(file_obj)\n gcs_key = self.get_full_key(key + (("." + ext) if ext is not None else ""))\n bucket_obj = self._client.bucket(self._gcs_bucket)\n bucket_obj.blob(gcs_key).upload_from_file(file_obj)\n return GCSFileHandle(self._gcs_bucket, gcs_key)\n\n def get_full_key(self, file_key):\n return f"{self._gcs_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_gcp/gcs/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.io_manager

\nimport pickle\nfrom typing import Any, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.backoff import backoff\nfrom dagster._utils.cached_method import cached_method\nfrom google.api_core.exceptions import Forbidden, ServiceUnavailable, TooManyRequests\nfrom google.cloud import storage\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import GCSResource\n\nDEFAULT_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectGCSIOManager(UPathIOManager):\n    def __init__(self, bucket: str, client: Optional[Any] = None, prefix: str = "dagster"):\n        self.bucket = check.str_param(bucket, "bucket")\n        self.client = client or storage.Client()\n        self.bucket_obj = self.client.bucket(bucket)\n        check.invariant(self.bucket_obj.exists())\n        self.prefix = check.str_param(prefix, "prefix")\n        super().__init__(base_path=UPath(self.prefix))\n\n    def unlink(self, path: UPath) -> None:\n        key = str(path)\n        if self.bucket_obj.blob(key).exists():\n            self.bucket_obj.blob(key).delete()\n\n    def path_exists(self, path: UPath) -> bool:\n        key = str(path)\n        blobs = self.client.list_blobs(self.bucket, prefix=key)\n        return len(list(blobs)) > 0\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading GCS object 
from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing GCS object at: {self._uri_for_path(path)}"\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"gs://{self.bucket}/{path}"\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in GCP\n        return None\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        bytes_obj = self.bucket_obj.blob(str(path)).download_as_bytes()\n        return pickle.loads(bytes_obj)\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing GCS key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        backoff(\n            self.bucket_obj.blob(str(path)).upload_from_string,\n            args=[pickled_obj],\n            retry_on=(TooManyRequests, Forbidden, ServiceUnavailable),\n        )\n\n\n
[docs]class GCSPickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": GCSPickleIOManager(\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n "gcs": GCSResource(project="my-cool-project")\n }\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @job(\n resource_defs={\n "io_manager": GCSPickleIOManager(\n gcs=GCSResource(project="my-cool-project")\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n }\n )\n def my_job():\n ...\n """\n\n gcs: ResourceDependency[GCSResource]\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectGCSIOManager:\n return PickledObjectGCSIOManager(\n bucket=self.gcs_bucket, client=self.gcs.get_client(), prefix=self.gcs_prefix\n )\n\n def load_input(self, context: InputContext) -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use GCSPickleIOManager instead.",\n)\nclass ConfigurablePickledObjectGCSIOManager(GCSPickleIOManager):\n """Renamed to GCSPickleIOManager. See GCSPickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=GCSPickleIOManager.to_config_schema(),\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_io_manager(init_context):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @job(\n resource_defs={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n def my_job():\n ...\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSIOManager(\n bucket=init_context.resource_config["gcs_bucket"],\n client=client,\n prefix=init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_gcp/gcs/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.resources

\nfrom typing import Any, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import storage\nfrom pydantic import Field\n\nfrom .file_manager import GCSFileManager\n\n\n
[docs]class GCSResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google Cloud Storage.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(gcs: GCSResource):\n with gcs.get_client() as client:\n # client is a google.cloud.storage.Client\n ...\n """\n\n project: Optional[str] = Field(default=None, description="Project name")\n\n def get_client(self) -> storage.Client:\n """Creates a GCS Client.\n\n Returns: google.cloud.storage.Client\n """\n return _gcs_client_from_config(project=self.project)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GCSResource.to_config_schema(),\n description="This resource provides a GCS client",\n)\ndef gcs_resource(init_context) -> storage.Client:\n return GCSResource.from_resource_context(init_context).get_client()
\n\n\n
[docs]class GCSFileManagerResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """FileManager that provides abstract access to GCS."""\n\n project: Optional[str] = Field(default=None, description="Project name")\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n def get_client(self) -> GCSFileManager:\n """Creates a :py:class:`~dagster_gcp.GCSFileManager` object that implements the\n :py:class:`~dagster._core.storage.file_manager.FileManager` API .\n\n Returns: GCSFileManager\n """\n gcs_client = _gcs_client_from_config(project=self.project)\n return GCSFileManager(\n client=gcs_client,\n gcs_bucket=self.gcs_bucket,\n gcs_base_key=self.gcs_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=GCSFileManagerResource.to_config_schema())\ndef gcs_file_manager(context):\n """FileManager that provides abstract access to GCS.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return GCSFileManagerResource.from_resource_context(context).get_client()
\n\n\ndef _gcs_client_from_config(project: Optional[str]) -> storage.Client:\n """Creates a GCS Client.\n\n Args:\n project: The GCP project\n\n Returns: A GCS client.\n """\n return storage.client.Client(project=project)\n
", "current_page_name": "_modules/dagster_gcp/gcs/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.resources"}}}, "dagster_gcp_pandas": {"bigquery": {"bigquery_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp.bigquery.io_manager import (\n    BigQueryClient,\n    BigQueryIOManager,\n    build_bigquery_io_manager,\n)\n\n\n
[docs]class BigQueryPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load Pandas DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in BigQuery."""\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n\n job = connection.load_table_from_dataframe(\n dataframe=with_uppercase_cols,\n destination=f"{table_slice.schema}.{table_slice.table}",\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n )\n job.result()\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = 
connection.query(\n query=BigQueryClient.get_select_statement(table_slice),\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n ).to_dataframe()\n\n result.columns = map(str.lower, result.columns)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nbigquery_pandas_io_manager = build_bigquery_io_manager(\n [BigQueryPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nbigquery_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pandas import bigquery_pandas_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pandas_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPandasIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pandas import BigQueryPandasIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPandasIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pandas/bigquery/bigquery_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler"}}}, "dagster_gcp_pyspark": {"bigquery": {"bigquery_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler

\nfrom typing import Any, Mapping, Optional, Sequence, Type\n\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp import BigQueryIOManager, build_bigquery_io_manager\nfrom dagster_gcp.bigquery.io_manager import BigQueryClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef _get_bigquery_write_options(\n    config: Optional[Mapping[str, Any]], table_slice: TableSlice\n) -> Mapping[str, str]:\n    conf = {\n        "table": f"{table_slice.database}.{table_slice.schema}.{table_slice.table}",\n    }\n    if config and config.get("temporary_gcs_bucket") is not None:\n        conf["temporaryGcsBucket"] = config["temporary_gcs_bucket"]\n    else:\n        conf["writeMethod"] = "direct"\n    return conf\n\n\ndef _get_bigquery_read_options(table_slice: TableSlice) -> Mapping[str, str]:\n    conf = {"viewsEnabled": "true", "materializationDataset": table_slice.schema}\n    return conf\n\n\n
[docs]class BigQueryPySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load PySpark DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_bigquery_write_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format("bigquery").options(**options).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_bigquery_read_options(table_slice)\n spark = SparkSession.builder.getOrCreate() # type: ignore\n\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = (\n spark.read.format("bigquery")\n .options(**options)\n .load(BigQueryClient.get_select_statement(table_slice))\n )\n\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return 
[DataFrame]
\n\n\nbigquery_pyspark_io_manager = build_bigquery_io_manager(\n [BigQueryPySparkTypeHandler()], default_load_type=DataFrame\n)\nbigquery_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pyspark import bigquery_pyspark_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pyspark_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPySparkIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pyspark import BigQueryPySparkIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPySparkIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pyspark/bigquery/bigquery_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler"}}}, "dagster_ge": {"factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ge.factory

\nimport datetime\nfrom typing import Any, Dict\n\nimport great_expectations as ge\nfrom dagster import (\n    ConfigurableResource,\n    ExpectationResult,\n    IAttachDifferentObjectToOpContext,\n    In,\n    MetadataValue,\n    OpExecutionContext,\n    Out,\n    Output,\n    _check as check,\n    op,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster_pandas import DataFrame\nfrom great_expectations.render.renderer import ValidationResultsPageRenderer\nfrom great_expectations.render.view import DefaultMarkdownPageView\nfrom pydantic import Field\n\ntry:\n    # ge < v0.13.0\n    from great_expectations.core import convert_to_json_serializable\nexcept ImportError:\n    # ge >= v0.13.0\n    from great_expectations.core.util import convert_to_json_serializable\n\n\nclass GEContextResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n    ge_root_dir: str = Field(\n        default=None,\n        description="The root directory for your Great Expectations project.",\n    )\n\n    def get_data_context(self):\n        if self.ge_root_dir is None:\n            return ge.data_context.DataContext()\n        return ge.data_context.DataContext(context_root_dir=self.ge_root_dir)\n\n    def get_object_to_set_on_execution_context(self):\n        return self.get_data_context()\n\n\n@dagster_maintained_resource\n@resource(config_schema=GEContextResource.to_config_schema())\ndef ge_data_context(context):\n    return GEContextResource.from_resource_context(context).get_data_context()\n\n\n
[docs]def ge_validation_op_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """Generates ops for interacting with GE.\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to\n None, which generates an ephemeral validator. If you want to save data docs, use\n 'action_list_operator'.\n See https://legacy.docs.greatexpectations.io/en/0.12.1/reference/core_concepts/validation_operators_and_actions.html#\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. 
Defaults to `{"dataset": dataset}`, where\n `dataset` is the input to the generated op.\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(suite_name, "suite_name")\n check.opt_str_param(validation_operator_name, "validation_operator_name")\n batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an expectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n if validation_operator_name is not None:\n validation_operator = validation_operator_name\n else:\n data_context.add_validation_operator(\n "ephemeral_validation",\n {"class_name": "ActionListValidationOperator", "action_list": []},\n )\n validation_operator = "ephemeral_validation"\n suite = data_context.get_expectation_suite(suite_name)\n final_batch_kwargs = batch_kwargs or {"dataset": dataset}\n if "datasource" in final_batch_kwargs:\n context.log.warning(\n "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "\n "parameter of the op factory instead."\n )\n final_batch_kwargs["datasource"] = datasource_name\n batch = data_context.get_batch(final_batch_kwargs, suite)\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = data_context.run_validation_operator(\n validation_operator, 
assets_to_validate=[batch], run_id=run_id\n )\n res = convert_to_json_serializable(results.list_validation_results())[0]\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = (\n validation_results_page_renderer.render_validation_operator_result(results)\n )\n md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=res["success"],\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(res)\n\n return _ge_validation_fn
\n\n\ndef ge_validation_op_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates ops for interacting with GE (v3 API).\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this op will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the op input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an\n in-memory object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. 
If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <op input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata and\n an output with all the metadata (for user processing)\n\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(data_connector_name, "data_connector_name")\n check.str_param(suite_name, "suite_name")\n\n _extra_kwargs: Dict[Any, Any] = check.opt_dict_param(extra_kwargs, "extra_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an ExpectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n validator_kwargs = {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": datasource_name or data_asset_name,\n "runtime_parameters": {runtime_method_type: dataset},\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n **_extra_kwargs,\n }\n validator = data_context.get_validator(**validator_kwargs)\n\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = validator.validate(run_id=run_id)\n\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = 
validation_results_page_renderer.render(\n validation_results=results\n )\n md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=bool(results["success"]),\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(results.to_json_dict())\n\n return _ge_validation_fn\n
", "current_page_name": "_modules/dagster_ge/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ge.factory"}}, "dagster_github": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_github.resources

\nimport time\nfrom datetime import datetime\nfrom typing import Optional\n\nimport jwt\nimport requests\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\ndef to_seconds(dt):\n    return (dt - datetime(1970, 1, 1)).total_seconds()\n\n\nclass GithubClient:\n    def __init__(\n        self, client, app_id, app_private_rsa_key, default_installation_id, hostname=None\n    ) -> None:\n        self.client = client\n        self.app_private_rsa_key = app_private_rsa_key\n        self.app_id = app_id\n        self.default_installation_id = default_installation_id\n        self.installation_tokens = {}\n        self.app_token = {}\n        self.hostname = hostname\n\n    def __set_app_token(self):\n        # from https://developer.github.com/apps/building-github-apps/authenticating-with-github-apps/\n        # needing to self-sign a JWT\n        now = int(time.time())\n        # JWT expiration time (10 minute maximum)\n        expires = now + (10 * 60)\n        encoded_token = jwt.encode(\n            {\n                # issued at time\n                "iat": now,\n                # JWT expiration time\n                "exp": expires,\n                # GitHub App's identifier\n                "iss": self.app_id,\n            },\n            self.app_private_rsa_key,\n            algorithm="RS256",\n        )\n        self.app_token = {\n            "value": encoded_token,\n            "expires": expires,\n        }\n\n    def __check_app_token(self):\n        if ("expires" not in self.app_token) or (\n            self.app_token["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_app_token()\n\n    def get_installations(self, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] 
= "application/vnd.github.machine-man-preview+json"\n        request = self.client.get(\n            (\n                "https://api.github.com/app/installations"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/v3/app/installations"\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def __set_installation_token(self, installation_id, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = requests.post(\n            (\n                f"https://api.github.com/app/installations/{installation_id}/access_tokens"\n                if self.hostname is None\n                else "https://{}/api/v3/app/installations/{}/access_tokens".format(\n                    self.hostname, installation_id\n                )\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        auth = request.json()\n        self.installation_tokens[installation_id] = {\n            "value": auth["token"],\n            "expires": to_seconds(datetime.strptime(auth["expires_at"], "%Y-%m-%dT%H:%M:%SZ")),\n        }\n\n    def __check_installation_tokens(self, installation_id):\n        if (installation_id not in self.installation_tokens) or (\n            self.installation_tokens[installation_id]["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_installation_token(installation_id)\n\n    def execute(self, query, variables, headers=None, installation_id=None):\n        if headers is None:\n            headers = {}\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        self.__check_installation_tokens(installation_id)\n        
headers["Authorization"] = "token {}".format(\n            self.installation_tokens[installation_id]["value"]\n        )\n        request = requests.post(\n            (\n                "https://api.github.com/graphql"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/graphql"\n            ),\n            json={"query": query, "variables": variables},\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def create_issue(self, repo_name, repo_owner, title, body, installation_id=None):\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        res = self.execute(\n            query="""\n            query get_repo_id($repo_name: String!, $repo_owner: String!) {\n                repository(name: $repo_name, owner: $repo_owner) {\n                    id\n                }\n            }\n            """,\n            variables={"repo_name": repo_name, "repo_owner": repo_owner},\n            installation_id=installation_id,\n        )\n\n        return self.execute(\n            query="""\n                mutation CreateIssue($id: ID!, $title: String!, $body: String!) {\n                createIssue(input: {\n                    repositoryId: $id,\n                    title: $title,\n                    body: $body\n                }) {\n                    clientMutationId,\n                    issue {\n                        body\n                        title\n                        url\n                    }\n                }\n                }\n            """,\n            variables={\n                "id": res["data"]["repository"]["id"],\n                "title": title,\n                "body": body,\n            },\n            installation_id=installation_id,\n        )\n\n\n
[docs]class GithubResource(ConfigurableResource):\n github_app_id: int = Field(\n description="Github Application ID, for more info see https://developer.github.com/apps/",\n )\n github_app_private_rsa_key: str = Field(\n description=(\n "Github Application Private RSA key text, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_installation_id: Optional[int] = Field(\n default=None,\n description=(\n "Github Application Installation ID, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_hostname: Optional[str] = Field(\n default=None,\n description=(\n "Github hostname. Defaults to `api.github.com`, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> GithubClient:\n return GithubClient(\n client=requests.Session(),\n app_id=self.github_app_id,\n app_private_rsa_key=self.github_app_private_rsa_key,\n default_installation_id=self.github_installation_id,\n hostname=self.github_hostname,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GithubResource.to_config_schema(),\n description="This resource is for connecting to Github",\n)\ndef github_resource(context) -> GithubClient:\n return GithubResource(**context.resource_config).get_client()
\n
", "current_page_name": "_modules/dagster_github/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_github.resources"}}, "dagster_graphql": {"client": {"client": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.client

\nfrom itertools import chain\nfrom typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nimport requests.exceptions\nfrom dagster import DagsterRunStatus\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.definitions.utils import validate_tags\nfrom gql import Client, gql\nfrom gql.transport import Transport\nfrom gql.transport.requests import RequestsHTTPTransport\n\nfrom .client_queries import (\n    CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY,\n    CLIENT_SUBMIT_PIPELINE_RUN_MUTATION,\n    GET_PIPELINE_RUN_STATUS_QUERY,\n    RELOAD_REPOSITORY_LOCATION_MUTATION,\n    SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n    TERMINATE_RUN_JOB_MUTATION,\n)\nfrom .utils import (\n    DagsterGraphQLClientError,\n    InvalidOutputErrorInfo,\n    JobInfo,\n    ReloadRepositoryLocationInfo,\n    ReloadRepositoryLocationStatus,\n    ShutdownRepositoryLocationInfo,\n    ShutdownRepositoryLocationStatus,\n)\n\n\n
[docs]@experimental\nclass DagsterGraphQLClient:\n """Official Dagster Python Client for GraphQL.\n\n Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server\n\n As of now, all operations on this client are synchronous.\n\n Intended usage:\n\n .. code-block:: python\n\n client = DagsterGraphQLClient("localhost", port_number=3000)\n status = client.get_run_status(**SOME_RUN_ID**)\n\n Args:\n hostname (str): Hostname for the Dagster GraphQL API, like `localhost` or\n `dagster.YOUR_ORG_HERE`.\n port_number (Optional[int]): Port number to connect to on the host.\n Defaults to None.\n transport (Optional[Transport], optional): A custom transport to use to connect to the\n GraphQL API with (e.g. for custom auth). Defaults to None.\n use_https (bool, optional): Whether to use https in the URL connection string for the\n GraphQL API. Defaults to False.\n timeout (int): Number of seconds before requests should time out. Defaults to 60.\n headers (Optional[Dict[str, str]]): Additional headers to include in the request. 
To use\n this client in Dagster Cloud, set the "Dagster-Cloud-Api-Token" header to a user token\n generated in the Dagster Cloud UI.\n\n Raises:\n :py:class:`~requests.exceptions.ConnectionError`: if the client cannot connect to the host.\n """\n\n def __init__(\n self,\n hostname: str,\n port_number: Optional[int] = None,\n transport: Optional[Transport] = None,\n use_https: bool = False,\n timeout: int = 300,\n headers: Optional[Dict[str, str]] = None,\n ):\n self._hostname = check.str_param(hostname, "hostname")\n self._port_number = check.opt_int_param(port_number, "port_number")\n self._use_https = check.bool_param(use_https, "use_https")\n\n self._url = (\n ("https://" if self._use_https else "http://")\n + (f"{self._hostname}:{self._port_number}" if self._port_number else self._hostname)\n + "/graphql"\n )\n\n self._transport = check.opt_inst_param(\n transport,\n "transport",\n Transport,\n default=RequestsHTTPTransport(\n url=self._url, use_json=True, timeout=timeout, headers=headers\n ),\n )\n try:\n self._client = Client(transport=self._transport, fetch_schema_from_transport=True)\n except requests.exceptions.ConnectionError as exc:\n raise DagsterGraphQLClientError(\n f"Error when connecting to url {self._url}. 
"\n + f"Did you specify hostname: {self._hostname} "\n + (f"and port_number: {self._port_number} " if self._port_number else "")\n + "correctly?"\n ) from exc\n\n def _execute(self, query: str, variables: Optional[Dict[str, Any]] = None):\n try:\n return self._client.execute(gql(query), variable_values=variables)\n except Exception as exc: # catch generic Exception from the gql client\n raise DagsterGraphQLClientError(\n f"Exception occured during execution of query \\n{query}\\n with variables"\n f" \\n{variables}\\n"\n ) from exc\n\n def _get_repo_locations_and_names_with_pipeline(self, job_name: str) -> List[JobInfo]:\n res_data = self._execute(CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY)\n query_res = res_data["repositoriesOrError"]\n repo_connection_status = query_res["__typename"]\n if repo_connection_status == "RepositoryConnection":\n valid_nodes: Iterable[JobInfo] = chain(*map(JobInfo.from_node, query_res["nodes"]))\n return [info for info in valid_nodes if info.job_name == job_name]\n else:\n raise DagsterGraphQLClientError(repo_connection_status, query_res["message"])\n\n def _core_submit_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Union[RunConfig, Mapping[str, Any]]] = None,\n mode: str = "default",\n preset: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n is_using_job_op_graph_apis: Optional[bool] = False,\n ):\n check.opt_str_param(repository_location_name, "repository_location_name")\n check.opt_str_param(repository_name, "repository_name")\n check.str_param(pipeline_name, "pipeline_name")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")\n\n # The following invariant will never fail when a job is executed\n check.invariant(\n (mode is not None and 
run_config is not None) or preset is not None,\n "Either a mode and run_config or a preset must be specified in order to "\n f"submit the pipeline {pipeline_name} for execution",\n )\n tags = validate_tags(tags)\n\n pipeline_or_job = "Job" if is_using_job_op_graph_apis else "Pipeline"\n\n if not repository_location_name or not repository_name:\n job_info_lst = self._get_repo_locations_and_names_with_pipeline(pipeline_name)\n if len(job_info_lst) == 0:\n raise DagsterGraphQLClientError(\n f"{pipeline_or_job}NotFoundError",\n f"No {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name"\n f" `{pipeline_name}` exist",\n )\n elif len(job_info_lst) == 1:\n job_info = job_info_lst[0]\n repository_location_name = job_info.repository_location_name\n repository_name = job_info.repository_name\n else:\n raise DagsterGraphQLClientError(\n "Must specify repository_location_name and repository_name since there are"\n f" multiple {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the"\n f" name {pipeline_name}.\\n\\tchoose one of: {job_info_lst}"\n )\n\n variables: Dict[str, Any] = {\n "executionParams": {\n "selector": {\n "repositoryLocationName": repository_location_name,\n "repositoryName": repository_name,\n "pipelineName": pipeline_name,\n "solidSelection": op_selection,\n }\n }\n }\n if preset is not None:\n variables["executionParams"]["preset"] = preset\n if mode is not None and run_config is not None:\n variables["executionParams"] = {\n **variables["executionParams"],\n "runConfigData": run_config,\n "mode": mode,\n "executionMetadata": (\n {"tags": [{"key": k, "value": v} for k, v in tags.items()]} if tags else {}\n ),\n }\n\n res_data: Dict[str, Any] = self._execute(CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)\n query_result = res_data["launchPipelineExecution"]\n query_result_type = query_result["__typename"]\n if (\n query_result_type == "LaunchRunSuccess"\n or query_result_type == "LaunchPipelineRunSuccess"\n ):\n return 
query_result["run"]["runId"]\n elif query_result_type == "InvalidStepError":\n raise DagsterGraphQLClientError(query_result_type, query_result["invalidStepKey"])\n elif query_result_type == "InvalidOutputError":\n error_info = InvalidOutputErrorInfo(\n step_key=query_result["stepKey"],\n invalid_output_name=query_result["invalidOutputName"],\n )\n raise DagsterGraphQLClientError(query_result_type, body=error_info)\n elif (\n query_result_type == "RunConfigValidationInvalid"\n or query_result_type == "PipelineConfigValidationInvalid"\n ):\n raise DagsterGraphQLClientError(query_result_type, query_result["errors"])\n else:\n # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError\n # a PipelineNotFoundError, a RunConflict, or a PythonError\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])\n\n
[docs] @public\n def submit_job_execution(\n self,\n job_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Dict[str, Any]] = None,\n tags: Optional[Dict[str, Any]] = None,\n op_selection: Optional[Sequence[str]] = None,\n ) -> str:\n """Submits a job with attached configuration for execution.\n\n Args:\n job_name (str): The job's name\n repository_location_name (Optional[str]): The name of the repository location where\n the job is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str]): The name of the repository where the job is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Dict[str, Any]]): This is the run config to execute the job with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n JobConfigValidationInvalid. 
Defaults to None.\n tags (Optional[Dict[str, Any]]): A set of tags to add to the job execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the job has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting job run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the job\n DagsterGraphQLClientError("JobNotFoundError", message): the requested job does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name=job_name,\n repository_location_name=repository_location_name,\n repository_name=repository_name,\n run_config=run_config,\n mode="default",\n preset=None,\n tags=tags,\n op_selection=op_selection,\n is_using_job_op_graph_apis=True,\n )
\n\n
[docs] @public\n def get_run_status(self, run_id: str) -> DagsterRunStatus:\n """Get the status of a given Pipeline Run.\n\n Args:\n run_id (str): run id of the requested pipeline run.\n\n Raises:\n DagsterGraphQLClientError("PipelineNotFoundError", message): if the requested run id is not found\n DagsterGraphQLClientError("PythonError", message): on internal framework errors\n\n Returns:\n DagsterRunStatus: returns a status Enum describing the state of the requested pipeline run\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n GET_PIPELINE_RUN_STATUS_QUERY, {"runId": run_id}\n )\n query_result: Dict[str, Any] = res_data["pipelineRunOrError"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "PipelineRun" or query_result_type == "Run":\n return DagsterRunStatus(query_result["status"])\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n\n
[docs] @public\n def reload_repository_location(\n self, repository_location_name: str\n ) -> ReloadRepositoryLocationInfo:\n """Reloads a Dagster Repository Location, which reloads all repositories in that repository location.\n\n This is useful in a variety of contexts, including refreshing the Dagster UI without restarting\n the server.\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ReloadRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n RELOAD_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["reloadRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "WorkspaceLocationEntry":\n location_or_error_type = query_result["locationOrLoadError"]["__typename"]\n if location_or_error_type == "RepositoryLocation":\n return ReloadRepositoryLocationInfo(status=ReloadRepositoryLocationStatus.SUCCESS)\n else:\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type="PythonError",\n message=query_result["locationOrLoadError"]["message"],\n )\n else:\n # query_result_type is either ReloadNotSupported or RepositoryLocationNotFound\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type=query_result_type,\n message=query_result["message"],\n )
\n\n
[docs] @public\n def shutdown_repository_location(\n self, repository_location_name: str\n ) -> ShutdownRepositoryLocationInfo:\n """Shuts down the server that is serving metadata for the provided repository location.\n\n This is primarily useful when you want the server to be restarted by the compute environment\n in which it is running (for example, in Kubernetes, the pod in which the server is running\n will automatically restart when the server is shut down, and the repository metadata will\n be reloaded)\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ShutdownRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["shutdownRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "ShutdownRepositoryLocationSuccess":\n return ShutdownRepositoryLocationInfo(status=ShutdownRepositoryLocationStatus.SUCCESS)\n elif (\n query_result_type == "RepositoryLocationNotFound" or query_result_type == "PythonError"\n ):\n return ShutdownRepositoryLocationInfo(\n status=ShutdownRepositoryLocationStatus.FAILURE,\n message=query_result["message"],\n )\n else:\n raise Exception(f"Unexpected query result type {query_result_type}")
\n\n def terminate_run(self, run_id: str):\n """Terminates a pipeline run. This method it is useful when you would like to stop a pipeline run\n based on a external event.\n\n Args:\n run_id (str): The run id of the pipeline run to terminate\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n TERMINATE_RUN_JOB_MUTATION, {"runId": run_id}\n )\n\n query_result: Dict[str, Any] = res_data["terminateRun"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "TerminateRunSuccess":\n return\n\n elif query_result_type == "RunNotFoundError":\n raise DagsterGraphQLClientError("RunNotFoundError", f"Run Id {run_id} not found")\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n
", "current_page_name": "_modules/dagster_graphql/client/client", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.client"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.utils

\nfrom enum import Enum\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\n\n
[docs]class DagsterGraphQLClientError(Exception):\n def __init__(self, *args, body=None):\n super().__init__(*args)\n self.body = body
\n\n\n
[docs]class ReloadRepositoryLocationStatus(Enum):\n """This enum describes the status of a GraphQL mutation to reload a Dagster repository location.\n\n Args:\n Enum (str): can be either `ReloadRepositoryLocationStatus.SUCCESS`\n or `ReloadRepositoryLocationStatus.FAILURE`.\n """\n\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"
\n\n\nclass ShutdownRepositoryLocationStatus(Enum):\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n\n\n
[docs]class ReloadRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of reloading\n a Dagster repository location with a GraphQL mutation.\n\n Args:\n status (ReloadRepositoryLocationStatus): The status of the reload repository location mutation\n failure_type: (Optional[str], optional): the failure type if `status == ReloadRepositoryLocationStatus.FAILURE`.\n Can be one of `ReloadNotSupported`, `RepositoryLocationNotFound`, or `RepositoryLocationLoadFailure`. Defaults to None.\n message (Optional[str], optional): the failure message/reason if\n `status == ReloadRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ReloadRepositoryLocationStatus\n failure_type: Optional[str] = None\n message: Optional[str] = None
\n\n\nclass ShutdownRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of shutting down the server for\n a Dagster repository location using a GraphQL mutation.\n\n Args:\n status (ShutdownRepositoryLocationStatus) Whether the shutdown succeeded or failed.\n message (Optional[str], optional): the failure message/reason if\n `status == ShutdownRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ShutdownRepositoryLocationStatus\n message: Optional[str] = None\n\n\nclass JobInfo(NamedTuple):\n repository_location_name: str\n repository_name: str\n job_name: str\n\n @staticmethod\n def from_node(node: Dict[str, Any]) -> List["JobInfo"]:\n repo_name = node["name"]\n repo_location_name = node["location"]["name"]\n return [\n JobInfo(\n repository_location_name=repo_location_name,\n repository_name=repo_name,\n job_name=job["name"],\n )\n for job in node["pipelines"]\n ]\n\n\n
[docs]class InvalidOutputErrorInfo(NamedTuple):\n """This class gives information about an InvalidOutputError from submitting a pipeline for execution\n from GraphQL.\n\n Args:\n step_key (str): key of the step that failed\n invalid_output_name (str): the name of the invalid output from the given step\n """\n\n step_key: str\n invalid_output_name: str
\n
", "current_page_name": "_modules/dagster_graphql/client/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.utils"}}}, "dagster_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.executor

\nfrom typing import Iterator, List, Optional, cast\n\nimport kubernetes.config\nfrom dagster import (\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    executor,\n)\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import (\n    CheckStepHealthResult,\n    StepDelegatingExecutor,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_k8s.launcher import K8sRunLauncher\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import (\n    USER_DEFINED_K8S_CONFIG_SCHEMA,\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\n_K8S_EXECUTOR_CONFIG_SCHEMA = merge_dicts(\n    DagsterK8sJobConfig.config_type_job(),\n    {\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            description="""Whether or not the executor is running within a k8s cluster already. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.\n            If ``True``, we assume the executor is running within the target cluster and load config\n            using ``kubernetes.config.load_incluster_config``. 
Otherwise, we will use the k8s config\n            specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n            back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            description="""Path to a kubeconfig file to use, if not using default kubeconfig. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.""",\n        ),\n        "job_namespace": Field(StringSource, is_required=False),\n        "retries": get_retries_config(),\n        "max_concurrent": Field(\n            IntSource,\n            is_required=False,\n            description=(\n                "Limit on the number of pods that will run concurrently within the scope "\n                "of a Dagster run. Note that this limit is per run, not global."\n            ),\n        ),\n        "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n        "step_k8s_config": Field(\n            USER_DEFINED_K8S_CONFIG_SCHEMA,\n            is_required=False,\n            description="Raw Kubernetes configuration for each step launched by the executor.",\n        ),\n    },\n)\n\n\n
[docs]@executor(\n name="k8s",\n config_schema=_K8S_EXECUTOR_CONFIG_SCHEMA,\n requirements=multiple_process_executor_requirements(),\n)\ndef k8s_job_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Kubernetes Jobs.\n\n To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_namespace: 'some-namespace'\n image_pull_policy: ...\n image_pull_secrets: ...\n service_account_name: ...\n env_config_maps: ...\n env_secrets: ...\n env_vars: ...\n job_image: ... # leave out if using userDeployments\n max_concurrent: ...\n\n `max_concurrent` limits the number of pods that will execute concurrently for one run. By default\n there is no limit- it will maximally parallel as allowed by the DAG. Note that this is not a\n global limit.\n\n Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be\n set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.\n\n Configuration set using `tags` on a `@job` will only apply to the `run` level. 
For configuration\n to apply at each `step` it must be set using `tags` for each `@op`.\n """\n run_launcher = (\n init_context.instance.run_launcher\n if isinstance(init_context.instance.run_launcher, K8sRunLauncher)\n else None\n )\n\n exc_cfg = init_context.executor_config\n\n k8s_container_context = K8sContainerContext(\n image_pull_policy=exc_cfg.get("image_pull_policy"), # type: ignore\n image_pull_secrets=exc_cfg.get("image_pull_secrets"), # type: ignore\n service_account_name=exc_cfg.get("service_account_name"), # type: ignore\n env_config_maps=exc_cfg.get("env_config_maps"), # type: ignore\n env_secrets=exc_cfg.get("env_secrets"), # type: ignore\n env_vars=exc_cfg.get("env_vars"), # type: ignore\n volume_mounts=exc_cfg.get("volume_mounts"), # type: ignore\n volumes=exc_cfg.get("volumes"), # type: ignore\n labels=exc_cfg.get("labels"), # type: ignore\n namespace=exc_cfg.get("job_namespace"), # type: ignore\n resources=exc_cfg.get("resources"), # type: ignore\n scheduler_name=exc_cfg.get("scheduler_name"), # type: ignore\n # step_k8s_config feeds into the run_k8s_config field because it is merged\n # with any configuration for the run that was set on the run launcher or code location\n run_k8s_config=exc_cfg.get("step_k8s_config"), # type: ignore\n )\n\n if "load_incluster_config" in exc_cfg:\n load_incluster_config = cast(bool, exc_cfg["load_incluster_config"])\n else:\n load_incluster_config = run_launcher.load_incluster_config if run_launcher else True\n\n if "kubeconfig_file" in exc_cfg:\n kubeconfig_file = cast(Optional[str], exc_cfg["kubeconfig_file"])\n else:\n kubeconfig_file = run_launcher.kubeconfig_file if run_launcher else None\n\n return StepDelegatingExecutor(\n K8sStepHandler(\n image=exc_cfg.get("job_image"), # type: ignore\n container_context=k8s_container_context,\n load_incluster_config=load_incluster_config,\n kubeconfig_file=kubeconfig_file,\n ),\n retries=RetryMode.from_config(exc_cfg["retries"]), # type: ignore\n 
max_concurrent=check.opt_int_elem(exc_cfg, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(exc_cfg, "tag_concurrency_limits"),\n should_verify_step=True,\n )
\n\n\nclass K8sStepHandler(StepHandler):\n @property\n def name(self):\n return "K8sStepHandler"\n\n def __init__(\n self,\n image: Optional[str],\n container_context: K8sContainerContext,\n load_incluster_config: bool,\n kubeconfig_file: Optional[str],\n k8s_client_batch_api=None,\n ):\n super().__init__()\n\n self._executor_image = check.opt_str_param(image, "image")\n self._executor_container_context = check.inst_param(\n container_context, "container_context", K8sContainerContext\n )\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n def _get_step_key(self, step_handler_context: StepHandlerContext) -> str:\n step_keys_to_execute = cast(\n List[str], step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n return step_keys_to_execute[0]\n\n def _get_container_context(\n self, step_handler_context: StepHandlerContext\n ) -> K8sContainerContext:\n step_key = self._get_step_key(step_handler_context)\n\n context = K8sContainerContext.create_for_run(\n step_handler_context.dagster_run,\n cast(K8sRunLauncher, step_handler_context.instance.run_launcher),\n include_run_tags=False, # For now don't include job-level dagster-k8s/config tags in step pods\n )\n context = context.merge(self._executor_container_context)\n\n user_defined_k8s_config = get_user_defined_k8s_config(\n step_handler_context.step_tags[step_key]\n )\n return context.merge(K8sContainerContext(run_k8s_config=user_defined_k8s_config.to_dict()))\n\n def _get_k8s_step_job_name(self, step_handler_context: StepHandlerContext):\n step_key 
= self._get_step_key(step_handler_context)\n\n name_key = get_k8s_job_name(\n step_handler_context.execute_step_args.run_id,\n step_key,\n )\n\n if step_handler_context.execute_step_args.known_state:\n retry_state = step_handler_context.execute_step_args.known_state.get_retry_state()\n if retry_state.get_attempt_count(step_key):\n return "dagster-step-%s-%d" % (name_key, retry_state.get_attempt_count(step_key))\n\n return "dagster-step-%s" % (name_key)\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n pod_name = job_name\n\n container_context = self._get_container_context(step_handler_context)\n\n job_config = container_context.get_k8s_job_config(\n self._executor_image, step_handler_context.instance.run_launcher\n )\n\n args = step_handler_context.execute_step_args.get_command_args(\n skip_serialized_namedtuple=True\n )\n\n if not job_config.job_image:\n job_config = job_config.with_image(\n step_handler_context.execute_step_args.job_origin.repository_origin.container_image\n )\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the job")\n\n run = step_handler_context.dagster_run\n labels = {\n "dagster/job": run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": step_handler_context.execute_step_args.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="step_worker",\n user_defined_k8s_config=container_context.get_run_user_defined_k8s_config(),\n labels=labels,\n env_vars=[\n *step_handler_context.execute_step_args.get_command_env(),\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": run.job_name,\n },\n 
{"name": "DAGSTER_RUN_STEP_KEY", "value": step_key},\n *container_context.env,\n ],\n )\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message=f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n metadata={\n "Kubernetes Job name": MetadataValue.text(job_name),\n },\n )\n\n namespace = check.not_none(container_context.namespace)\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n\n container_context = self._get_container_context(step_handler_context)\n\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n if status.failed:\n return CheckStepHealthResult.unhealthy(\n reason=f"Discovered failed Kubernetes job {job_name} for step {step_key}.",\n )\n\n return CheckStepHealthResult.healthy()\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n container_context = self._get_container_context(step_handler_context)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Deleting Kubernetes job {job_name} for step",\n event_specific_data=EngineEventData(),\n )\n\n self._api_client.delete_job(job_name=job_name, namespace=container_context.namespace)\n
", "current_page_name": "_modules/dagster_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.launcher

\nimport logging\nimport sys\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport kubernetes\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._cli.api import ExecuteRunArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.launcher import LaunchRunContext, ResumeRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._grpc.types import ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import DagsterK8sJobConfig, construct_dagster_k8s_job, get_job_name_from_run_id\n\n\n
[docs]class K8sRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a Kubernetes Job for each Dagster job run.\n\n Encapsulates each run in a separate, isolated invocation of ``dagster-graphql``.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: your_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n\n """\n\n def __init__(\n self,\n service_account_name,\n instance_config_map,\n postgres_password_secret=None,\n dagster_home=None,\n job_image=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n load_incluster_config=True,\n kubeconfig_file=None,\n inst_data: Optional[ConfigurableClassData] = None,\n job_namespace="default",\n env_config_maps=None,\n env_secrets=None,\n env_vars=None,\n k8s_client_batch_api=None,\n volume_mounts=None,\n volumes=None,\n labels=None,\n fail_pod_on_run_failure=None,\n resources=None,\n scheduler_name=None,\n security_context=None,\n run_k8s_config=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.job_namespace = check.str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = load_incluster_config\n self.kubeconfig_file = kubeconfig_file\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self._job_config = None\n self._job_image = 
check.opt_str_param(job_image, "job_image")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._service_account_name = check.str_param(service_account_name, "service_account_name")\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.postgres_password_secret = check.opt_str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n self._labels: Mapping[str, str] = check.opt_mapping_param(\n labels, "labels", key_type=str, value_type=str\n )\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self._resources: Mapping[str, Any] = check.opt_mapping_param(resources, "resources")\n self._scheduler_name = check.opt_str_param(scheduler_name, "scheduler_name")\n self._security_context = check.opt_dict_param(security_context, "security_context")\n self._run_k8s_config = check.opt_dict_param(run_k8s_config, "run_k8s_config")\n super().__init__()\n\n @property\n def job_image(self):\n return self._job_image\n\n @property\n def image_pull_policy(self) -> str:\n return self._image_pull_policy\n\n @property\n def image_pull_secrets(self) -> Sequence[Mapping]:\n return self._image_pull_secrets\n\n @property\n def service_account_name(self) -> str:\n return self._service_account_name\n\n @property\n def env_config_maps(self) -> 
Sequence[str]:\n return self._env_config_maps\n\n @property\n def env_secrets(self) -> Sequence[str]:\n return self._env_secrets\n\n @property\n def volume_mounts(self) -> Sequence:\n return self._volume_mounts\n\n @property\n def volumes(self) -> Sequence:\n return self._volumes\n\n @property\n def resources(self) -> Mapping:\n return self._resources\n\n @property\n def scheduler_name(self) -> Optional[str]:\n return self._scheduler_name\n\n @property\n def security_context(self) -> Mapping[str, Any]:\n return self._security_context\n\n @property\n def env_vars(self) -> Sequence[str]:\n return self._env_vars\n\n @property\n def labels(self) -> Mapping[str, str]:\n return self._labels\n\n @property\n def run_k8s_config(self) -> Mapping[str, str]:\n return self._run_k8s_config\n\n @property\n def fail_pod_on_run_failure(self) -> Optional[bool]:\n return self._fail_pod_on_run_failure\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n return DagsterK8sJobConfig.config_type_run_launcher()\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_container_context_for_run(self, dagster_run: DagsterRun) -> K8sContainerContext:\n return K8sContainerContext.create_for_run(dagster_run, self, include_run_tags=True)\n\n def _launch_k8s_job_with_args(\n self, job_name: str, args: Optional[Sequence[str]], run: DagsterRun\n ) -> None:\n container_context = self.get_container_context_for_run(run)\n\n pod_name = job_name\n\n job_origin = check.not_none(run.job_code_origin)\n user_defined_k8s_config = container_context.get_run_user_defined_k8s_config()\n repository_origin = job_origin.repository_origin\n\n job_config = container_context.get_k8s_job_config(\n 
job_image=repository_origin.container_image, run_launcher=self\n )\n job_image = job_config.job_image\n if job_image: # expected to be set\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_image},\n )\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": job_origin.job_name,\n },\n *container_context.env,\n ],\n )\n\n namespace = check.not_none(container_context.namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n cls=self.__class__,\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(run.run_id)\n job_origin = check.not_none(run.job_code_origin)\n\n args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=context.resume_attempt_number\n 
)\n job_origin = check.not_none(run.job_code_origin)\n\n args = ResumeRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container_context = self.get_container_context_for_run(run)\n\n job_name = get_job_name_from_run_id(\n run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=container_context.namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Run was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; encountered error in delete_job",\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def supports_run_worker_crash_recovery(self):\n return True\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n container_context = self.get_container_context_for_run(run)\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, 
resume_attempt_number=resume_attempt_number)\n namespace = container_context.namespace\n user_defined_k8s_config = container_context.get_run_user_defined_k8s_config()\n container_name = user_defined_k8s_config.container_config.get("name", "dagster")\n pod_names = self._api_client.get_pod_names_in_job(job_name, namespace=namespace)\n full_msg = ""\n try:\n pod_debug_info = [\n self._api_client.get_pod_debug_info(\n pod_name, namespace, container_name=container_name\n )\n for pod_name in pod_names\n ]\n full_msg = "\\n".join(pod_debug_info)\n except Exception:\n logging.exception(\n f"Error trying to get debug information for failed k8s job {job_name}"\n )\n if pod_names:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe pod"\n f" {pod_names[0]}`, `kubectl logs {pod_names[0]}`, or `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n else:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n return full_msg\n\n def check_run_worker_health(self, run: DagsterRun):\n container_context = self.get_container_context_for_run(run)\n\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, resume_attempt_number=resume_attempt_number)\n try:\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n\n inactive_job_with_finished_pods = bool(\n (not status.active) and (status.failed or status.succeeded)\n )\n\n # If the run is in a non-terminal (and non-STARTING) state but the k8s job is not active,\n # something went wrong\n if (\n run.status in (DagsterRunStatus.STARTED, 
DagsterRunStatus.CANCELING)\n and inactive_job_with_finished_pods\n ):\n return CheckRunHealthResult(\n WorkerStatus.FAILED, "Run has not completed but K8s job has no active pods"\n )\n\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n if status.succeeded:\n return CheckRunHealthResult(WorkerStatus.SUCCESS)\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n
", "current_page_name": "_modules/dagster_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.launcher"}, "ops": {"k8s_job_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.ops.k8s_job_op

\nimport time\nfrom typing import Any, Dict, List, Optional\n\nimport kubernetes.config\nimport kubernetes.watch\nfrom dagster import Field, In, Noneable, Nothing, OpExecutionContext, Permissive, StringSource, op\nfrom dagster._annotations import experimental\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..client import DEFAULT_JOB_POD_COUNT, DagsterKubernetesClient\nfrom ..container_context import K8sContainerContext\nfrom ..job import DagsterK8sJobConfig, construct_dagster_k8s_job, get_k8s_job_name\nfrom ..launcher import K8sRunLauncher\n\nK8S_JOB_OP_CONFIG = merge_dicts(\n    DagsterK8sJobConfig.config_type_container(),\n    {\n        "image": Field(\n            StringSource,\n            is_required=True,\n            description="The image in which to launch the k8s job.",\n        ),\n        "command": Field(\n            [str],\n            is_required=False,\n            description="The command to run in the container within the launched k8s job.",\n        ),\n        "args": Field(\n            [str],\n            is_required=False,\n            description="The args for the command for the container.",\n        ),\n        "namespace": Field(StringSource, is_required=False),\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            default_value=True,\n            description="""Set this value if you are running the launcher\n            within a k8s cluster. If ``True``, we assume the launcher is running within the target\n            cluster and load config using ``kubernetes.config.load_incluster_config``. 
Otherwise,\n            we will use the k8s config specified in ``kubeconfig_file`` (using\n            ``kubernetes.config.load_kube_config``) or fall back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            default_value=None,\n            description=(\n                "The kubeconfig file from which to load config. Defaults to using the default"\n                " kubeconfig."\n            ),\n        ),\n        "timeout": Field(\n            int,\n            is_required=False,\n            description="How long to wait for the job to succeed before raising an exception",\n        ),\n        "container_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's main container"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_template_spec_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's pod spec"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                
"Raw k8s config for the k8s job's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s job's job spec"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n    },\n)\n\n\n
[docs]@experimental\ndef execute_k8s_job(\n context: OpExecutionContext,\n image: str,\n command: Optional[List[str]] = None,\n args: Optional[List[str]] = None,\n namespace: Optional[str] = None,\n image_pull_policy: Optional[str] = None,\n image_pull_secrets: Optional[List[Dict[str, str]]] = None,\n service_account_name: Optional[str] = None,\n env_config_maps: Optional[List[str]] = None,\n env_secrets: Optional[List[str]] = None,\n env_vars: Optional[List[str]] = None,\n volume_mounts: Optional[List[Dict[str, Any]]] = None,\n volumes: Optional[List[Dict[str, Any]]] = None,\n labels: Optional[Dict[str, str]] = None,\n resources: Optional[Dict[str, Any]] = None,\n scheduler_name: Optional[str] = None,\n load_incluster_config: bool = True,\n kubeconfig_file: Optional[str] = None,\n timeout: Optional[int] = None,\n container_config: Optional[Dict[str, Any]] = None,\n pod_template_spec_metadata: Optional[Dict[str, Any]] = None,\n pod_spec_config: Optional[Dict[str, Any]] = None,\n job_metadata: Optional[Dict[str, Any]] = None,\n job_spec_config: Optional[Dict[str, Any]] = None,\n k8s_job_name: Optional[str] = None,\n):\n """This function is a utility for executing a Kubernetes job from within a Dagster op.\n\n Args:\n image (str): The image in which to launch the k8s job.\n command (Optional[List[str]]): The command to run in the container within the launched\n k8s job. Default: None.\n args (Optional[List[str]]): The args for the command for the container. Default: None.\n namespace (Optional[str]): Override the kubernetes namespace in which to run the k8s job.\n Default: None.\n image_pull_policy (Optional[str]): Allows the image pull policy to be overridden, e.g. to\n facilitate local testing with `kind <https://kind.sigs.k8s.io/>`_. Default:\n ``"Always"``. 
See:\n https://kubernetes.io/docs/concepts/containers/images/#updating-images.\n image_pull_secrets (Optional[List[Dict[str, str]]]): Optionally, a list of dicts, each of\n which corresponds to a Kubernetes ``LocalObjectReference`` (e.g.,\n ``{'name': 'myRegistryName'}``). This allows you to specify the ```imagePullSecrets`` on\n a pod basis. Typically, these will be provided through the service account, when needed,\n and you will not need to pass this argument. See:\n https://kubernetes.io/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod\n and https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#podspec-v1-core\n service_account_name (Optional[str]): The name of the Kubernetes service account under which\n to run the Job. Defaults to "default" env_config_maps (Optional[List[str]]): A list of custom ConfigMapEnvSource names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container\n env_secrets (Optional[List[str]]): A list of custom Secret names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n env_vars (Optional[List[str]]): A list of environment variables to inject into the Job.\n Default: ``[]``. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n volume_mounts (Optional[List[Permissive]]): A list of volume mounts to include in the job's\n container. Default: ``[]``. 
See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core\n volumes (Optional[List[Permissive]]): A list of volumes to include in the Job's Pod. Default: ``[]``. See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core\n labels (Optional[Dict[str, str]]): Additional labels that should be included in the Job's Pod. See:\n https://kubernetes.io/docs/concepts/overview/working-with-objects/labels\n resources (Optional[Dict[str, Any]]) Compute resource requirements for the container. See:\n https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/\n scheduler_name (Optional[str]): Use a custom Kubernetes scheduler for launched Pods. See:\n https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/\n load_incluster_config (bool): Whether the op is running within a k8s cluster. If ``True``,\n we assume the launcher is running within the target cluster and load config using\n ``kubernetes.config.load_incluster_config``. Otherwise, we will use the k8s config\n specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n back to the default kubeconfig. Default: True,\n kubeconfig_file (Optional[str]): The kubeconfig file from which to load config. Defaults to\n using the default kubeconfig. Default: None.\n timeout (Optional[int]): Raise an exception if the op takes longer than this timeout in\n seconds to execute. Default: None.\n container_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's main container\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core).\n Keys can either snake_case or camelCase.Default: None.\n pod_template_spec_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's\n metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. 
Default: None.\n pod_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's pod spec\n (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec).\n Keys can either snake_case or camelCase. Default: None.\n job_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's metadata\n (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. Default: None.\n job_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's job spec\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch).\n Keys can either snake_case or camelCase.Default: None.\n k8s_job_name (Optional[str]): Overrides the name of the the k8s job. If not set, will be set\n to a unique name based on the current run ID and the name of the calling op. If set,\n make sure that the passed in name is a valid Kubernetes job name that does not\n already exist in the cluster.\n """\n run_container_context = K8sContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, K8sRunLauncher)\n else None\n ),\n include_run_tags=False,\n )\n\n container_config = container_config.copy() if container_config else {}\n if command:\n container_config["command"] = command\n\n op_container_context = K8sContainerContext(\n image_pull_policy=image_pull_policy,\n image_pull_secrets=image_pull_secrets,\n service_account_name=service_account_name,\n env_config_maps=env_config_maps,\n env_secrets=env_secrets,\n env_vars=env_vars,\n volume_mounts=volume_mounts,\n volumes=volumes,\n labels=labels,\n namespace=namespace,\n resources=resources,\n scheduler_name=scheduler_name,\n run_k8s_config={\n "container_config": container_config,\n "pod_template_spec_metadata": pod_template_spec_metadata,\n "pod_spec_config": pod_spec_config,\n "job_metadata": job_metadata,\n 
"job_spec_config": job_spec_config,\n },\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n namespace = container_context.namespace\n\n user_defined_k8s_config = container_context.get_run_user_defined_k8s_config()\n\n k8s_job_config = DagsterK8sJobConfig(\n job_image=image,\n dagster_home=None,\n image_pull_policy=container_context.image_pull_policy,\n image_pull_secrets=container_context.image_pull_secrets,\n service_account_name=container_context.service_account_name,\n instance_config_map=None,\n postgres_password_secret=None,\n env_config_maps=container_context.env_config_maps,\n env_secrets=container_context.env_secrets,\n env_vars=container_context.env_vars,\n volume_mounts=container_context.volume_mounts,\n volumes=container_context.volumes,\n labels=container_context.labels,\n resources=container_context.resources,\n )\n\n job_name = k8s_job_name or get_k8s_job_name(\n context.run_id, context.get_step_execution_context().step.key\n )\n\n retry_number = context.retry_number\n if retry_number > 0:\n job_name = f"{job_name}-{retry_number}"\n\n labels = {\n "dagster/job": context.dagster_run.job_name,\n "dagster/op": context.op.name,\n "dagster/run-id": context.dagster_run.run_id,\n }\n if context.dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n context.dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=k8s_job_config,\n args=args,\n job_name=job_name,\n pod_name=job_name,\n component="k8s_job_op",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n )\n\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n # changing this to be able to be passed in will allow for unit testing\n api_client = DagsterKubernetesClient.production_client()\n\n context.log.info(f"Creating Kubernetes job {job_name} in namespace 
{namespace}...")\n\n start_time = time.time()\n\n api_client.batch_api.create_namespaced_job(namespace, job)\n\n context.log.info("Waiting for Kubernetes job to finish...")\n\n timeout = timeout or 0\n\n api_client.wait_for_job(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n restart_policy = user_defined_k8s_config.pod_spec_config.get("restart_policy", "Never")\n\n if restart_policy == "Never":\n container_name = container_config.get("name", "dagster")\n\n pods = api_client.wait_for_job_to_have_pods(\n job_name,\n namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n pod_names = [p.metadata.name for p in pods]\n\n if not pod_names:\n raise Exception("No pod names in job after it started")\n\n pod_to_watch = pod_names[0]\n watch = kubernetes.watch.Watch() # consider moving in to api_client\n\n api_client.wait_for_pod(\n pod_to_watch, namespace, wait_timeout=timeout, start_time=start_time\n )\n\n log_stream = watch.stream(\n api_client.core_api.read_namespaced_pod_log,\n name=pod_to_watch,\n namespace=namespace,\n container=container_name,\n )\n\n while True:\n if timeout and time.time() - start_time > timeout:\n watch.stop()\n raise Exception("Timed out waiting for pod to finish")\n\n try:\n log_entry = next(log_stream)\n print(log_entry) # noqa: T201\n except StopIteration:\n break\n else:\n context.log.info("Pod logs are disabled, because restart_policy is not Never")\n\n if job_spec_config and job_spec_config.get("parallelism"):\n num_pods_to_wait_for = job_spec_config["parallelism"]\n else:\n num_pods_to_wait_for = DEFAULT_JOB_POD_COUNT\n api_client.wait_for_running_job_to_succeed(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n num_pods_to_wait_for=num_pods_to_wait_for,\n )
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=K8S_JOB_OP_CONFIG)\n@experimental\ndef k8s_job_op(context):\n """An op that runs a Kubernetes job using the k8s API.\n\n Contrast with the `k8s_job_executor`, which runs each Dagster op in a Dagster job in its\n own k8s job.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in k8s.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_k8s_job_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_k8s_job` function\n inside your own op.\n\n The service account that is used to run this job should have the following RBAC permissions:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/kubernetes/k8s_job_op_rbac.yaml\n :language: YAML\n """\n execute_k8s_job(context, **context.op_config)
\n
", "current_page_name": "_modules/dagster_k8s/ops/k8s_job_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.ops.k8s_job_op"}}}, "dagster_mlflow": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.hooks

\nfrom dagster._core.definitions.decorators.hook_decorator import event_list_hook\nfrom dagster._core.definitions.events import HookExecutionResult\nfrom mlflow.entities.run_status import RunStatus\n\n\ndef _create_mlflow_run_hook(name):\n    @event_list_hook(name=name, required_resource_keys={"mlflow"})\n    def _hook(context, event_list):\n        for event in event_list:\n            if event.is_step_success:\n                _cleanup_on_success(context)\n            elif event.is_step_failure:\n                mlf = context.resources.mlflow\n                mlf.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n        return HookExecutionResult(hook_name=name, is_skipped=False)\n\n    return _hook\n\n\ndef _cleanup_on_success(context):\n    """Checks if the current solid in the context is the last solid in the job\n    and ends the mlflow run with a successful status when this is the case.\n    """\n    last_solid_name = context._step_execution_context.job_def.nodes_in_topological_order[  # noqa: SLF001  # fmt: skip\n        -1\n    ].name\n\n    if context.op.name == last_solid_name:\n        context.resources.mlflow.end_run()\n\n\nend_mlflow_on_run_finished = _create_mlflow_run_hook("end_mlflow_on_run_finished")\n
", "current_page_name": "_modules/dagster_mlflow/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.resources

\n"""This module contains the mlflow resource provided by the MlFlow\nclass. This resource provides an easy way to configure mlflow for logging various\nthings from dagster runs.\n"""\nimport atexit\nimport sys\nfrom itertools import islice\nfrom os import environ\nfrom typing import Any, Optional\n\nimport mlflow\nfrom dagster import Field, Noneable, Permissive, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom mlflow.entities.run_status import RunStatus\n\nCONFIG_SCHEMA = {\n    "experiment_name": Field(StringSource, is_required=True, description="MlFlow experiment name."),\n    "mlflow_tracking_uri": Field(\n        Noneable(StringSource),\n        default_value=None,\n        is_required=False,\n        description="MlFlow tracking server uri.",\n    ),\n    "parent_run_id": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="Mlflow run ID of parent run if this is a nested run.",\n    ),\n    "env": Field(Permissive(), description="Environment variables for mlflow setup."),\n    "env_to_tag": Field(\n        Noneable(list),\n        default_value=None,\n        is_required=False,\n        description="List of environment variables to log as tags in mlflow.",\n    ),\n    "extra_tags": Field(Permissive(), description="Any extra key-value tags to log to mlflow."),\n}\n\n\nclass MlflowMeta(type):\n    """Mlflow Metaclass to create methods that "inherit" all of Mlflow's\n    methods. 
If the class has a method defined it is excluded from the\n    attribute setting from mlflow.\n    """\n\n    def __new__(cls, name, bases, attrs):\n        class_cls = super(MlflowMeta, cls).__new__(cls, name, bases, attrs)\n        for attr in (attr for attr in dir(mlflow) if attr not in dir(class_cls)):\n            mlflow_attribute = getattr(mlflow, attr)\n            if callable(mlflow_attribute):\n                setattr(class_cls, attr, staticmethod(mlflow_attribute))\n            else:\n                setattr(class_cls, attr, mlflow_attribute)\n        return class_cls\n\n\nclass MlFlow(metaclass=MlflowMeta):\n    """Class for setting up an mlflow resource for dagster runs.\n    This takes care of all the configuration required to use mlflow tracking and the complexities of\n    mlflow tracking dagster parallel runs.\n    """\n\n    def __init__(self, context):\n        # Context associated attributes\n        self.log = context.log\n        self.run_name = context.dagster_run.job_name\n        self.dagster_run_id = context.run_id\n\n        # resource config attributes\n        resource_config = context.resource_config\n        self.tracking_uri = resource_config.get("mlflow_tracking_uri")\n        if self.tracking_uri:\n            mlflow.set_tracking_uri(self.tracking_uri)\n        self.parent_run_id = resource_config.get("parent_run_id")\n        self.experiment_name = resource_config["experiment_name"]\n        self.env_tags_to_log = resource_config.get("env_to_tag") or []\n        self.extra_tags = resource_config.get("extra_tags")\n\n        # Update env variables if any are given\n        self.env_vars = resource_config.get("env", {})\n        if self.env_vars:\n            environ.update(self.env_vars)\n\n        # If the experiment exists then the set won't do anything\n        mlflow.set_experiment(self.experiment_name)\n        self.experiment = mlflow.get_experiment_by_name(self.experiment_name)\n\n        # Get the client object\n        
self.tracking_client = mlflow.tracking.MlflowClient()\n\n        # Set up the active run and tags\n        self._setup()\n\n    def _setup(self):\n        """Sets the active run and tags. If an Mlflow run_id exists then the\n        active run is set to it. This way a single Dagster run outputs data\n        to the same Mlflow run, even when multiprocess executors are used.\n        """\n        # Get the run id\n        run_id = self._get_current_run_id()\n        self._set_active_run(run_id=run_id)\n        self._set_all_tags()\n\n        # hack needed to stop mlflow from marking run as finished when\n        # a process exits in parallel runs\n        atexit.unregister(mlflow.end_run)\n\n    def _get_current_run_id(\n        self, experiment: Optional[Any] = None, dagster_run_id: Optional[str] = None\n    ):\n        """Gets the run id of a specific dagster run and experiment id.\n        If it doesn't exist then it returns a None.\n\n        Args:\n            experiment (optional): Mlflow experiment.\n            When none is passed it fetches the experiment object set in\n            the constructor.  Defaults to None.\n            dagster_run_id (optional): The Dagster run id.\n            When none is passed it fetches the dagster_run_id object set in\n            the constructor.  
Defaults to None.\n\n        Returns:\n            run_id (str or None): run_id if it is found else None\n        """\n        experiment = experiment or self.experiment\n        dagster_run_id = dagster_run_id or self.dagster_run_id\n        if experiment:\n            # Check if a run with this dagster run id has already been started\n            # in mlflow, will get an empty dataframe if not\n            current_run_df = mlflow.search_runs(\n                experiment_ids=[experiment.experiment_id],\n                filter_string=f"tags.dagster_run_id='{dagster_run_id}'",\n            )\n            if not current_run_df.empty:\n                return current_run_df.run_id.values[0]\n\n    def _set_active_run(self, run_id=None):\n        """This method sets the active run to be that of the specified\n        run_id. If None is passed then a new run is started. The new run also\n        takes care of nested runs.\n\n        Args:\n            run_id (str, optional): Mlflow run_id. Defaults to None.\n        """\n        nested_run = False\n        if self.parent_run_id is not None:\n            self._start_run(run_id=self.parent_run_id, run_name=self.run_name)\n            nested_run = True\n        self._start_run(run_id=run_id, run_name=self.run_name, nested=nested_run)\n\n    def _start_run(self, **kwargs):\n        """Catches the Mlflow exception if a run is already active."""\n        try:\n            run = mlflow.start_run(**kwargs)\n            self.log.info(\n                f"Starting a new mlflow run with id {run.info.run_id} "\n                f"in experiment {self.experiment_name}"\n            )\n        except Exception as ex:\n            run = mlflow.active_run()\n            if "is already active" not in str(ex):\n                raise (ex)\n            self.log.info(f"Run with id {run.info.run_id} is already active.")\n\n    def _set_all_tags(self):\n        """Method collects dagster_run_id plus all env variables/tags that have been\n         
   specified by the user in the config_schema and logs them as tags in mlflow.\n\n        Returns:\n            tags [dict]: Dictionary of all the tags\n        """\n        tags = {tag: environ.get(tag) for tag in self.env_tags_to_log}\n        tags["dagster_run_id"] = self.dagster_run_id\n        if self.extra_tags:\n            tags.update(self.extra_tags)\n\n        mlflow.set_tags(tags)\n\n    def cleanup_on_error(self):\n        """Method ends mlflow run with correct exit status for failed runs. Note that\n        this method does not work when a job running in the webserver fails, it seems\n        that in this case a different process runs the job and when it fails\n        the stack trace is therefore not available. For this case we can use the\n        cleanup_on_failure hook defined below.\n        """\n        any_error = sys.exc_info()\n\n        if any_error[1]:\n            if isinstance(any_error[1], KeyboardInterrupt):\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))\n            else:\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n    @staticmethod\n    def log_params(params: dict):\n        """Overload of the mlflow.log_params. If len(params) >100 then\n        params is sent to mlflow in chunks.\n\n        Args:\n            params (dict): Parameters to be logged\n        """\n        for param_chunk in MlFlow.chunks(params, 100):\n            mlflow.log_params(param_chunk)\n\n    @staticmethod\n    def chunks(params: dict, size: int = 100):\n        """Method that chunks a dictionary into batches of size.\n\n        Args:\n            params (dict): Dictionary set to be batched\n            size (int, optional): Number of batches. Defaults to 100.\n\n        Yields:\n            (dict): Batch of dictionary\n        """\n        it = iter(params)\n        for _ in range(0, len(params), size):\n            yield {k: params[k] for k in islice(it, size)}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=CONFIG_SCHEMA)\ndef mlflow_tracking(context):\n """This resource initializes an MLflow run that's used for all steps within a Dagster run.\n\n This resource provides access to all of mlflow's methods as well as the mlflow tracking client's\n methods.\n\n Usage:\n\n 1. Add the mlflow resource to any ops in which you want to invoke mlflow tracking APIs.\n 2. Add the `end_mlflow_on_run_finished` hook to your job to end the MLflow run\n when the Dagster run is finished.\n\n Examples:\n .. code-block:: python\n\n from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n @op(required_resource_keys={"mlflow"})\n def mlflow_op(context):\n mlflow.log_params(some_params)\n mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n @end_mlflow_on_run_finished\n @job(resource_defs={"mlflow": mlflow_tracking})\n def mlf_example():\n mlflow_op()\n\n # example using an mlflow instance with s3 storage\n mlf_example.execute_in_process(run_config={\n "resources": {\n "mlflow": {\n "config": {\n "experiment_name": my_experiment,\n "mlflow_tracking_uri": "http://localhost:5000",\n\n # if want to run a nested run, provide parent_run_id\n "parent_run_id": an_existing_mlflow_run_id,\n\n # env variables to pass to mlflow\n "env": {\n "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n "AWS_ACCESS_KEY_ID": my_aws_key_id,\n "AWS_SECRET_ACCESS_KEY": my_secret,\n },\n\n # env variables you want to log as mlflow tags\n "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n # key-value tags to add to your experiment\n "extra_tags": {"super": "experiment"},\n }\n }\n }\n })\n """\n mlf = MlFlow(context)\n yield mlf\n mlf.cleanup_on_error()
\n
", "current_page_name": "_modules/dagster_mlflow/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.resources"}}, "dagster_msteams": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_failure(\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_failure(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in Dagster UI</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_success(\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_success(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in webserver</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_msteams/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom dagster_msteams.client import TeamsClient\n\n\n
[docs]class MSTeamsResource(ConfigurableResource):\n """This resource is for connecting to Microsoft Teams.\n\n Provides a `dagster_msteams.TeamsClient` which can be used to\n interface with the MS Teams API.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster op,\n asset, schedule, or sensor:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job, Definitions, EnvVar\n from dagster_msteams import Card, MSTeamsResource\n\n\n @op\n def teams_op(msteams: MSTeamsResource):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n msteams.get_client().post_message(payload=card.payload)\n\n\n @job\n def teams_job():\n teams_op()\n\n defs = Definitions(\n jobs=[teams_job],\n resources={\n "msteams": MSTeamsResource(\n hook_url=EnvVar("TEAMS_WEBHOOK_URL")\n )\n }\n )\n """\n\n hook_url: str = Field(\n default=None,\n description=(\n "To send messages to MS Teams channel, an incoming webhook has to be created. The"\n " incoming webhook url must be given as a part of the resource config to the"\n " MSTeamsResource in Dagster. For more information on how to create an incoming"\n " webhook, see"\n " https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook"\n ),\n )\n http_proxy: str = Field(default=None, description="HTTP proxy URL")\n https_proxy: str = Field(default=None, description="HTTPS proxy URL")\n timeout: float = Field(default=60, description="Timeout for requests to MS Teams")\n verify: bool = Field(\n default=True, description="Whether to verify SSL certificates, defaults to True"\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> TeamsClient:\n return TeamsClient(\n hook_url=self.hook_url,\n http_proxy=self.http_proxy,\n https_proxy=self.https_proxy,\n timeout=self.timeout,\n verify=self.verify,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=MSTeamsResource.to_config_schema(),\n description="This resource is for connecting to MS Teams",\n)\ndef msteams_resource(context) -> TeamsClient:\n """This resource is for connecting to Microsoft Teams.\n\n The resource object is a `dagster_msteams.TeamsClient`.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster solid:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job\n from dagster_msteams import Card, msteams_resource\n\n\n @op(required_resource_keys={"msteams"})\n def teams_op(context):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n context.resources.msteams.post_message(payload=card.payload)\n\n\n @job(resource_defs={"msteams": msteams_resource})\n def teams_job():\n teams_op()\n\n\n teams_job.execute_in_process(\n {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}}\n )\n """\n return MSTeamsResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_msteams/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.sensors

\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster import DefaultSensorStatus\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\nfrom dagster_msteams.client import TeamsClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n\n\ndef _default_failure_message(context: RunFailureSensorContext) -> str:\n    return "\\n".join(\n        [\n            f"Job {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef make_teams_on_run_failure_sensor(\n hook_url: str,\n message_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message,\n http_proxy: Optional[str] = None,\n https_proxy: Optional[str] = None,\n timeout: Optional[float] = 60,\n verify: Optional[bool] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on run failures that will message the given MS Teams webhook URL.\n\n Args:\n hook_url (str): MS Teams incoming webhook URL.\n message_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n http_proxy : (Optional[str]): Proxy for requests using http protocol.\n https_proxy : (Optional[str]): Proxy for requests using https protocol.\n timeout: (Optional[float]): Connection timeout in seconds. Defaults to 60.\n verify: (Optional[bool]): Whether to verify the servers TLS certificate.\n name: (Optional[str]): The name of the sensor. Defaults to "teams_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector]]]):\n Jobs in the current repository that will be monitored by this sensor. Defaults to None,\n which means the alert will be sent when any job in the repository matches the requested\n run_status. To monitor jobs in external repositories, use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n\n Examples:\n .. code-block:: python\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n )\n\n @repository\n def my_repo():\n return [my_job + teams_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return "Job {job_name} failed! 
Error: {error}".format(\n job_name=context.dagster_run.job_name,\n error=context.failure_event.message,\n )\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n message_fn=my_message_fn,\n webserver_base_url="http://localhost:3000",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n teams_client = TeamsClient(\n hook_url=hook_url,\n http_proxy=http_proxy,\n https_proxy=https_proxy,\n timeout=timeout,\n verify=verify,\n )\n\n @run_failure_sensor(\n name=name,\n default_status=default_status,\n monitored_jobs=monitored_jobs,\n monitor_all_repositories=monitor_all_repositories,\n )\n def teams_on_run_failure(context: RunFailureSensorContext):\n text = message_fn(context)\n if webserver_base_url:\n text += "<a href='{base_url}/runs/{run_id}'>View in Dagit</a>".format(\n base_url=webserver_base_url,\n run_id=context.dagster_run.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n teams_client.post_message(payload=card.payload)\n\n return teams_on_run_failure
\n
", "current_page_name": "_modules/dagster_msteams/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.sensors"}}, "dagster_mysql": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.event_log.event_log

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.exc as db_exc\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlPollingEventWatcher,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """MySQL-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_event_log\n :end-before: end_marker_event_log\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = check.str_param(mysql_url, "mysql_url")\n self._disposed = False\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n self._secondary_index_cache = {}\n\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "event_logs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n # mark all secondary indexes to be used\n self.reindex_events()\n self.reindex_assets()\n\n self._mysql_version = self.get_server_version()\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n 
SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLEventLogStorage":\n return MySQLEventLogStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(conn_string: str) -> "MySQLEventLogStorage":\n MySQLEventLogStorage.wipe_storage(conn_string)\n return MySQLEventLogStorage(conn_string)\n\n def get_server_version(self) -> Optional[str]:\n with self.index_connection() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n # last_materialization_timestamp is updated upon observation, materialization, materialization_planned\n # See SqlEventLogStorage.store_asset_event method for more details\n\n values = self._get_asset_entry_values(\n 
event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n if values:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n **values,\n )\n .on_duplicate_key_update(\n **values,\n )\n )\n else:\n try:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n )\n )\n except db_exc.IntegrityError:\n pass\n\n def _connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "event log")\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n with self._connect() as conn:\n return table_name in db.inspect(conn).get_table_names()\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n MySQLEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n super(MySQLEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( 
# type: ignore # (possible none)\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n @property\n def event_watcher(self) -> SqlPollingEventWatcher:\n return self._event_watcher\n\n def __del__(self) -> None:\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.run_storage.run_storage

\nfrom typing import ContextManager, Mapping, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BUCKET_VERSION = "8.0.0"\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLRunStorage(SqlRunStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_runs\n :end-before: end_marker_runs\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "runs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n RunStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, 
statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLRunStorage":\n return MySQLRunStorage(inst_data=inst_data, mysql_url=mysql_url_from_config(config_value))\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLRunStorage":\n MySQLRunStorage.wipe_storage(mysql_url)\n return MySQLRunStorage(mysql_url)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "run")\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_built_index(self, migration_name: str) -> None:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n MySQLRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n 
super(MySQLRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( # type: ignore\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n conn.execute(\n db_dialects.mysql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_duplicate_key_update(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n insert_stmt = db_dialects.mysql.insert(KeyValueStoreTable).values(db_values)\n conn.execute(\n insert_stmt.on_duplicate_key_update(\n value=insert_stmt.inserted.value,\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BATCH_VERSION = "8.0.0"\n\n\n
[docs]class MySQLScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_schedules\n :end-before: end_marker_schedules\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n if "jobs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When 
running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLScheduleStorage":\n return MySQLScheduleStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLScheduleStorage":\n MySQLScheduleStorage.wipe_storage(mysql_url)\n return MySQLScheduleStorage(mysql_url)\n\n def connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "schedule")\n\n @property\n def supports_batch_queries(self) -> bool:\n if not self._mysql_version:\n return False\n\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version(\n MINIMUM_MYSQL_BATCH_VERSION\n )\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n alembic_config = mysql_alembic_config(__file__)\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.mysql.insert(InstigatorsTable)\n 
.values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_duplicate_key_update(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.schedule_storage.schedule_storage"}}}, "dagster_pagerduty": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pagerduty.resources

\nfrom typing import Dict, Optional, cast\n\nimport pypd\nfrom dagster import ConfigurableResource, resource\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.warnings import suppress_dagster_warnings\nfrom pydantic import Field as PyField\n\n\n
[docs]class PagerDutyService(ConfigurableResource):\n """This resource is for posting events to PagerDuty."""\n\n """Integrates with PagerDuty via the pypd library.\n\n See:\n https://v2.developer.pagerduty.com/docs/events-api-v2\n https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2\n https://support.pagerduty.com/docs/services-and-integrations#section-events-api-v2\n https://github.com/PagerDuty/pagerduty-api-python-client\n\n for documentation and more information.\n """\n\n routing_key: str = PyField(\n ...,\n description=(\n "The routing key provisions access to your PagerDuty service. You"\n "will need to include the integration key for your new integration, as a"\n "routing_key in the event payload."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def EventV2_create(\n self,\n summary: str,\n source: str,\n severity: str,\n event_action: str = "trigger",\n dedup_key: Optional[str] = None,\n timestamp: Optional[str] = None,\n component: Optional[str] = None,\n group: Optional[str] = None,\n event_class: Optional[str] = None,\n custom_details: Optional[object] = None,\n ) -> object:\n """Events API v2 enables you to add PagerDuty's advanced event and incident management\n functionality to any system that can make an outbound HTTP connection.\n\n Args:\n summary (str):\n A high-level, text summary message of the event. Will be used to construct an\n alert's description. Example:\n\n "PING OK - Packet loss = 0%, RTA = 1.41 ms" "Host\n 'acme-andromeda-sv1-c40 :: 179.21.24.50' is DOWN"\n\n source (str):\n Specific human-readable unique identifier, such as a hostname, for the system having\n the problem. Examples:\n\n "prod05.theseus.acme-widgets.com"\n "171.26.23.22"\n "aws:elasticache:us-east-1:852511987:cluster/api-stats-prod-003"\n "9c09acd49a25"\n\n severity (str):\n How impacted the affected system is. Displayed to users in lists and influences the\n priority of any created incidents. 
Must be one of {info, warning, error, critical}\n\n Keyword Args:\n event_action (str):\n There are three types of events that PagerDuty recognizes, and are used to represent\n different types of activity in your monitored systems. (default: 'trigger')\n\n * trigger: When PagerDuty receives a trigger event, it will either open a new alert,\n or add a new trigger log entry to an existing alert, depending on the\n provided dedup_key. Your monitoring tools should send PagerDuty a trigger\n when a new problem has been detected. You may send additional triggers\n when a previously detected problem has occurred again.\n\n * acknowledge: acknowledge events cause the referenced incident to enter the\n acknowledged state. While an incident is acknowledged, it won't\n generate any additional notifications, even if it receives new\n trigger events. Your monitoring tools should send PagerDuty an\n acknowledge event when they know someone is presently working on the\n problem.\n\n * resolve: resolve events cause the referenced incident to enter the resolved state.\n Once an incident is resolved, it won't generate any additional\n notifications. New trigger events with the same dedup_key as a resolved\n incident won't re-open the incident. Instead, a new incident will be\n created. Your monitoring tools should send PagerDuty a resolve event when\n the problem that caused the initial trigger event has been fixed.\n\n dedup_key (str):\n Deduplication key for correlating triggers and resolves. The maximum permitted\n length of this property is 255 characters.\n\n timestamp (str):\n Timestamp (ISO 8601). When the upstream system detected / created the event. This is\n useful if a system batches or holds events before sending them to PagerDuty. This\n will be auto-generated by PagerDuty if not provided. Example:\n\n 2015-07-17T08:42:58.315+0000\n\n component (str):\n The part or component of the affected system that is broken. 
Examples:\n\n "keepalive"\n "webping"\n "mysql"\n "wqueue"\n\n group (str):\n A cluster or grouping of sources. For example, sources "prod-datapipe-02" and\n "prod-datapipe-03" might both be part of "prod-datapipe". Examples:\n\n "prod-datapipe"\n "www"\n "web_stack"\n\n event_class (str):\n The class/type of the event. Examples:\n\n "High CPU"\n "Latency"\n "500 Error"\n\n custom_details (Dict[str, str]):\n Additional details about the event and affected system. Example:\n\n {"ping time": "1500ms", "load avg": 0.75 }\n """\n data = {\n "routing_key": self.routing_key,\n "event_action": event_action,\n "payload": {"summary": summary, "source": source, "severity": severity},\n }\n\n if dedup_key is not None:\n data["dedup_key"] = dedup_key\n\n payload: Dict[str, object] = cast(Dict[str, object], data["payload"])\n\n if timestamp is not None:\n payload["timestamp"] = timestamp\n\n if component is not None:\n payload["component"] = component\n\n if group is not None:\n payload["group"] = group\n\n if event_class is not None:\n payload["class"] = event_class\n\n if custom_details is not None:\n payload["custom_details"] = custom_details\n\n return pypd.EventV2.create(data=data)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=infer_schema_from_config_class(PagerDutyService),\n description="""This resource is for posting events to PagerDuty.""",\n)\n@suppress_dagster_warnings\ndef pagerduty_resource(context) -> PagerDutyService:\n """A resource for posting events (alerts) to PagerDuty.\n\n Example:\n .. code-block:: python\n\n @op\n def pagerduty_op(pagerduty: PagerDutyService):\n pagerduty.EventV2_create(\n summary='alert from dagster'\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n @job(resource_defs={ 'pagerduty': pagerduty_resource })\n def pagerduty_test():\n pagerduty_op()\n\n pagerduty_test.execute_in_process(\n run_config={\n "resources": {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n }\n )\n """\n return PagerDutyService(**context.resource_config)
\n
", "current_page_name": "_modules/dagster_pagerduty/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pagerduty.resources"}}, "dagster_pandas": {"constraints": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.constraints

\nimport sys\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom functools import wraps\n\nimport pandas as pd\nfrom dagster import (\n    DagsterType,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom pandas import DataFrame\nfrom typing_extensions import Final\n\nCONSTRAINT_METADATA_KEY: Final = "constraint_metadata"\n\n\nclass ConstraintViolationException(Exception):\n    """Indicates that a constraint has been violated."""\n\n\nclass ConstraintWithMetadataException(Exception):\n    """This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a\n    failed typecheck or an exception.\n\n    Args:\n        constraint_name (str):  the name of the violated constraint\n        constraint_description (Optional[str]): the description of the violated constraint\n        expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string\n        offending (Optional[Union[dict,list, str, set]]):  which pieces of the dataframe violated the expectation, typically list or string\n        actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike\n    """\n\n    def __init__(\n        self,\n        constraint_name,\n        constraint_description="",\n        expectation=None,\n        offending=None,\n        actual=None,\n    ):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))\n        self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))\n        self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))\n        super(ConstraintWithMetadataException, self).__init__(\n            "Violated {} - {}, {} was/were expected, but 
we received {} which was/were {}".format(\n                constraint_name,\n                constraint_description,\n                expectation,\n                offending,\n                actual,\n            )\n        )\n\n    def normalize_metadata_json_value(self, val):\n        if isinstance(val, set):\n            return list(val)\n        else:\n            return val\n\n    def convert_to_metadata(self):\n        return {\n            CONSTRAINT_METADATA_KEY: {\n                "constraint_name": self.constraint_name,\n                "constraint_description": self.constraint_description,\n                "expected": self.normalize_metadata_json_value(self.expectation),\n                "offending": self.normalize_metadata_json_value(self.offending),\n                "actual": self.normalize_metadata_json_value(self.actual),\n            },\n        }\n\n    def return_as_typecheck(self):\n        return TypeCheck(\n            success=False, description=self.args[0], metadata=self.convert_to_metadata()\n        )\n\n\nclass DataFrameConstraintViolationException(ConstraintViolationException):\n    """Indicates a dataframe level constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description):\n        super(DataFrameConstraintViolationException, self).__init__(\n            f"Violated {constraint_name} - {constraint_description}"\n        )\n\n\nclass DataFrameWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, actual):\n        super(DataFrameWithMetadataException, self).__init__(\n            constraint_name, constraint_description, expectation, "a malformed dataframe", actual\n        )\n\n\nclass ColumnConstraintViolationException(ConstraintViolationException):\n    """Indicates that a column constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):\n        
self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.column_name = column_name\n        self.offending_rows = offending_rows\n        super(ColumnConstraintViolationException, self).__init__(self.construct_message())\n\n    def construct_message(self):\n        base_message = (\n            'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'\n            .format(\n                constraint_name=self.constraint_name,\n                constraint_description=self.constraint_description,\n                column_name=self.column_name,\n            )\n        )\n        if self.offending_rows is not None:\n            base_message += "The offending (index, row values) are the following: {}".format(\n                self.offending_rows\n            )\n        return base_message\n\n\nclass ColumnWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, offending, actual):\n        super(ColumnWithMetadataException, self).__init__(\n            "the column constraint " + constraint_name,\n            constraint_description,\n            expectation,\n            offending,\n            actual,\n        )\n\n\nclass Constraint:\n    """Base constraint object that all constraints inherit from.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        self.name = self.__class__.__name__\n        self.markdown_description = check.str_param(markdown_description, "markdown_description")\n        self.error_description = check.str_param(error_description, 
"error_description")\n\n\n@experimental\nclass ConstraintWithMetadata:\n    """This class defines a base constraint over pandas DFs with organized metadata.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n                    the validation function to run over inputted data\n                    This function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    # TODO:  validation_fn returning metadata is sorta broken.  
maybe have it yield typecheck events and grab metadata?\n\n    def __init__(\n        self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None\n    ):\n        if name is None:\n            self.name = self.__class__.__name__\n        else:\n            self.name = name\n        self.description = description\n        # should return a tuple of (bool, and either an empty dict or a dict of extra params)\n        self.validation_fn = validation_fn\n        self.resulting_exception = resulting_exception\n        self.raise_or_typecheck = raise_or_typecheck\n\n    def validate(self, data, *args, **kwargs):\n        res = self.validation_fn(data, *args, **kwargs)\n        if not res[0]:\n            exc = self.resulting_exception(\n                constraint_name=self.name, constraint_description=self.description, **res[1]\n            )\n\n            if self.raise_or_typecheck:\n                raise exc\n            else:\n                return exc.return_as_typecheck()\n\n        else:\n            if res[0]:\n                return TypeCheck(success=True)\n\n    # TODO:  composition of validations\n    def as_dagster_type(self, *args, **kwargs):\n        if self.raise_or_typecheck:\n            raise Exception(\n                "Dagster types can only be constructed from constraints that return typechecks"\n            )\n        return DagsterType(\n            name=self.name,\n            description=f"A Pandas DataFrame with the following validation: {self.description}",\n            type_check_fn=lambda x: self.validate(x, *args),\n            **kwargs,\n        )\n\n\nclass MultiConstraintWithMetadata(ConstraintWithMetadata):\n    """Use this class if you have multiple constraints to check over the entire dataframe.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n                    a list of the 
validation functions to run over inputted data\n                    Each function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    def __init__(\n        self,\n        description,\n        validation_fn_arr,\n        resulting_exception,\n        raise_or_typecheck=True,\n        name=None,\n    ):\n        validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")\n\n        def validation_fn(data, *args, **kwargs):\n            results = [f(data, *args, **kwargs) for f in validation_fn_arr]\n            truthparam = all(item[0] for item in results)\n            metadict = defaultdict(dict)\n            for i, dicta in enumerate(item[1] for item in results):\n                if len(dicta.keys()) > 0:\n                    for key in dicta:\n                        metadict[key][validation_fn_arr[i].__name__] = dicta[key]\n            return (truthparam, metadict)\n\n        super(MultiConstraintWithMetadata, self).__init__(\n            description,\n            validation_fn,\n            resulting_exception,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass StrictColumnsWithMetadata(ConstraintWithMetadata):\n    def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):\n        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n        self.column_list = 
check.list_param(column_list, "strict_column_list", of_type=str)\n\n        def validation_fcn(inframe):\n            if list(inframe.columns) == column_list:\n                return (True, {})\n            else:\n                if self.enforce_ordering:\n                    resdict = {"expectation": self.column_list, "actual": list(inframe.columns)}\n                    return (False, resdict)\n                else:\n                    if set(inframe.columns) == set(column_list):\n                        return (True, {})\n                    else:\n                        extra = [x for x in inframe.columns if x not in set(column_list)]\n                        missing = [x for x in set(column_list) if x not in inframe.columns]\n                        resdict = {\n                            "expectation": self.column_list,\n                            "actual": {"extra_columns": extra, "missing_columns": missing},\n                        }\n                        return (False, resdict)\n\n        basestr = f"ensuring that the right columns, {self.column_list} were present"\n        if enforce_ordering:\n            basestr += " in the right order"\n        super(StrictColumnsWithMetadata, self).__init__(\n            basestr,\n            validation_fcn,\n            DataFrameWithMetadataException,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass DataFrameConstraint(Constraint):\n    """Base constraint object that represent Dataframe shape constraints.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        super(DataFrameConstraint, self).__init__(\n            
error_description=error_description, markdown_description=markdown_description\n        )\n\n    def validate(self, dataframe):\n        raise NotImplementedError()\n\n\n
[docs]class StrictColumnsConstraint(DataFrameConstraint):\n """A dataframe constraint that validates column existence and ordering.\n\n Args:\n strict_column_list (List[str]): The exact list of columns that your dataframe must have.\n enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names must match.\n Default is False.\n """\n\n def __init__(self, strict_column_list, enforce_ordering=False):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.strict_column_list = check.list_param(\n strict_column_list, "strict_column_list", of_type=str\n )\n description = f"No columns outside of {self.strict_column_list} allowed. "\n if enforce_ordering:\n description += "Columns must be in that order."\n super(StrictColumnsConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n columns_received = list(dataframe.columns)\n if self.enforce_ordering:\n if self.strict_column_list != columns_received:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected the following ordering of columns {expected}. Received:"\n " {received}".format(\n expected=self.strict_column_list, received=columns_received\n )\n ),\n )\n for column in columns_received:\n if column not in self.strict_column_list:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {}. Recevied {}.".format(\n self.strict_column_list, columns_received\n ),\n )
\n\n\n
[docs]class RowCountConstraint(DataFrameConstraint):\n """A dataframe constraint that validates the expected count of rows.\n\n Args:\n num_allowed_rows (int): The number of allowed rows in your dataframe.\n error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain. Defaults to 0.\n """\n\n def __init__(self, num_allowed_rows, error_tolerance=0):\n self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")\n self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))\n if self.error_tolerance > self.num_allowed_rows:\n raise ValueError("Tolerance can't be greater than the number of rows you expect.")\n description = f"Dataframe must have {self.num_allowed_rows} +- {self.error_tolerance} rows."\n super(RowCountConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n\n if not (\n self.num_allowed_rows - self.error_tolerance\n <= len(dataframe)\n <= self.num_allowed_rows + self.error_tolerance\n ):\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected {expected} +- {tolerance} rows. Got {received}".format(\n expected=self.num_allowed_rows,\n tolerance=self.error_tolerance,\n received=len(dataframe),\n )\n ),\n )
\n\n\ndef apply_ignore_missing_data_to_mask(mask, column):\n return mask & ~column.isnull()\n\n\nclass ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):\n """Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n\n offending_columns = set()\n offending_values = {}\n for column in columns:\n # TODO: grab extra metadata\n res = self.validation_fn(relevant_data[column])\n if not res[0]:\n offending_columns.add(column)\n if res[1].get("actual") is not None:\n offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]\n else:\n offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]\n if len(offending_columns) == 0 and not self.raise_or_typecheck:\n return TypeCheck(success=True)\n elif len(offending_columns) > 0:\n metadict = {\n "expectation": self.description.replace("Confirms", ""),\n "actual": offending_values,\n "offending": offending_columns,\n }\n exc = self.resulting_exception(\n 
constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass ColumnConstraintWithMetadata(ConstraintWithMetadata):\n """This class is useful for constructing single constraints that you want to apply to multiple\n columns of your dataframe.\n\n The main difference from the base class in terms of construction is that now, your validation_fns should operate on\n individual values.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n offending = {}\n offending_values = {}\n # TODO: grab metadata from here\n inverse_validation = lambda x: not self.validation_fn(x)[0]\n for column in columns:\n results = relevant_data[relevant_data[column].apply(inverse_validation)]\n if len(results.index.tolist()) > 0:\n offending[column] = ["row " + str(i) for i in (results.index.tolist())]\n offending_values[column] = results[column].tolist()\n if len(offending) == 0:\n if not self.raise_or_typecheck:\n return TypeCheck(success=True)\n else:\n metadict = {\n 
"expectation": self.validation_fn.__doc__,\n "actual": offending_values,\n "offending": offending,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):\n """This class is useful for constructing more complicated relationships between columns\n and expectations -- i.e. you want some validations on column A, others on column B, etc.\n This lets you package up the metadata neatly, and also allows for cases like 'fail if any one of\n these constraints fails but still run all of them'.\n\n Args:\n description (str): description of the overall set of validations\n fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is 'a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. 
Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n type_for_internal=ColumnConstraintWithMetadata,\n name=None,\n ):\n # TODO: support multiple descriptions\n self.column_to_fn_dict = check.dict_param(\n fn_and_columns_dict, "fn_and_columns_dict", key_type=str\n )\n\n def validation_fn(data, *args, **kwargs):\n metadict = defaultdict(dict)\n truthparam = True\n for column, fn_arr in self.column_to_fn_dict.items():\n if column not in data.columns:\n continue\n for fn in fn_arr:\n # TODO: do this more effectively\n new_validator = type_for_internal(\n fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False\n )\n result = new_validator.validate(\n DataFrame(data[column]), column, *args, **kwargs\n )\n result_val = result.success\n if result_val:\n continue\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n truthparam = truthparam and result_val\n for key in result_dict.keys():\n if "constraint" not in key:\n if key == "expected":\n new_key = "expectation"\n result_dict[key] = result_dict[key].replace("returns", "").strip()\n if column not in metadict[new_key] or new_key not in metadict:\n metadict[new_key][column] = dict()\n metadict[new_key][column][fn.__name__] = result_dict[key]\n else:\n if column not in metadict[key] or key not in metadict:\n metadict[key][column] = dict()\n if isinstance(result_dict[key], dict):\n metadict[key][column][fn.__name__] = result_dict[key][column]\n else:\n metadict[key][column][fn.__name__] = "a violation"\n return truthparam, metadict\n\n super(MultiColumnConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n def validate(self, data, *args, **kwargs):\n return ConstraintWithMetadata.validate(self, data, *args, 
**kwargs)\n\n\nclass MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):\n """This class is similar to multicolumn, but takes in functions that operate on the whole column at once\n rather than ones that operate on each value --\n consider this similar to the difference between apply-map and apply aggregate.\n\n Args:\n description (str): description of the overall set of validations (TODO: support multiple descriptions)\n fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n super(MultiAggregateConstraintWithMetadata, self).__init__(\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n type_for_internal=ColumnAggregateConstraintWithMetadata,\n name=name,\n )\n\n\ndef non_null_validation(x):\n """Validates that a particular value in a column is not null.\n\n Usage:\n pass this as a column validator to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this\n directly.\n """\n return not pd.isnull(x), {}\n\n\ndef all_unique_validator(column, ignore_missing_vals=False):\n 
"""Validates that all values in an iterable are unique.\n\n Returns duplicated values as metadata.\n\n Usage:\n As a validation function for a\n :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'\n Example:\n .. code-block:: python\n aggregate_validator = MultiAggregateConstraintWithMetadata(\n "confirms all values are unique",\n {'bar': [all_unique_validator]},\n ConstraintWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_aggregate_validator=aggregate_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}\n metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}\n """\n column = pd.Series(column)\n duplicated = column.duplicated()\n if ignore_missing_vals:\n duplicated = apply_ignore_missing_data_to_mask(duplicated, column)\n return not duplicated.any(), {"actual": column[duplicated]}\n\n\ndef nonnull(func):\n """Decorator for column validation functions to make them error on nulls.\n\n Usage:\n pass decorated functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Args:\n func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n the column validator you want to error on nulls.\n """\n\n @wraps(func)\n def nvalidator(val):\n origval = func(val)\n nval = non_null_validation(val)\n return origval[0] and nval[0], {}\n\n nvalidator.__doc__ += " and ensures no values are null"\n\n return nvalidator\n\n\ndef column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):\n 
"""Factory for validators testing if column values are within a range.\n\n Args:\n minim(Optional[Comparable]): the low end of the range\n maxim(Optional[Comparable]): the high end of the range\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Examples:\n .. code-block:: python\n in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [in_range_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}\n\n """\n if minim is None:\n if isinstance(maxim, datetime):\n minim = datetime.min\n else:\n minim = -1 * (sys.maxsize - 1)\n if maxim is None:\n if isinstance(minim, datetime):\n maxim = datetime.max\n else:\n maxim = sys.maxsize\n\n def in_range_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}\n\n in_range_validation_fn.__doc__ = f"checks whether values are between {minim} and {maxim}"\n if ignore_missing_vals:\n in_range_validation_fn.__doc__ += ", ignoring nulls"\n\n return in_range_validation_fn\n\n\ndef categorical_column_validator_factory(categories, 
ignore_missing_vals=False):\n """Factory for validators testing if all values are in some set.\n\n Args:\n categories(Union[Sequence, set]): the set of allowed values\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Example:\n .. code-block:: python\n categorical_validation_fn = categorical_column_validator_factory([1, 2])\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [categorical_validation_fn]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}\n\n """\n categories = set(categories)\n\n def categorical_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (x in categories), {}\n\n categorical_validation_fn.__doc__ = (\n f"checks whether values are within this set of values: {categories}"\n )\n if ignore_missing_vals:\n categorical_validation_fn.__doc__ += ", ignoring nulls"\n\n return categorical_validation_fn\n\n\ndef dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):\n """Factory for testing if the dtype of a val falls within some allowed set.\n\n Args:\n datatypes(Union[set[type], type]): which datatype/datatypes are allowed\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a 
validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Examples:\n .. code-block:: python\n dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [dtype_is_num_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}\n\n """\n\n def dtype_in_set_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return isinstance(x, datatypes), {}\n\n dtype_in_set_validation_fn.__doc__ = f"checks whether values are this type/types: {datatypes}"\n if ignore_missing_vals:\n dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"\n\n return dtype_in_set_validation_fn\n\n\nclass ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):\n def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):\n self.name = self.__class__.__name__\n\n description = f"Confirms values are between {minim} and {maxim}"\n super(ColumnRangeConstraintWithMetadata, self).__init__(\n description=description,\n validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),\n resulting_exception=ColumnWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n )\n self.columns = columns\n\n def validate(self, data, *args, 
**kwargs):\n if self.columns is None:\n self.columns = list(data.columns)\n self.columns.extend(args)\n return super(ColumnRangeConstraintWithMetadata, self).validate(\n data, *self.columns, **kwargs\n )\n\n\nclass ColumnConstraint(Constraint):\n """Base constraint object that represent dataframe column shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(ColumnConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe, column_name):\n pass\n\n @staticmethod\n def get_offending_row_pairs(dataframe, column_name):\n return zip(dataframe.index.tolist(), dataframe[column_name].tolist())\n\n\nclass ColumnDTypeFnConstraint(ColumnConstraint):\n """A column constraint that applies a pandas dtype validation function to a columns dtype.\n\n Args:\n type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and\n returns if those dtypes match the types it expects. 
See pandas.core.dtypes.common for examples.\n """\n\n def __init__(self, type_fn):\n self.type_fn = check.callable_param(type_fn, "type_fn")\n description = f'Dtype must satisfy "{self.type_fn.__name__}"'\n super(ColumnDTypeFnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n column_dtype = dataframe[column_name].dtype\n if not self.type_fn(column_dtype):\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=f'{self.error_description}, but was "{column_dtype}"',\n column_name=column_name,\n )\n\n\nclass ColumnDTypeInSetConstraint(ColumnConstraint):\n """A column constraint that validates the pandas column dtypes based on the expected set of dtypes.\n\n Args:\n expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.\n """\n\n def __init__(self, expected_dtype_set):\n self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")\n description = f"Column dtype must be in the following set {self.expected_dtype_set}."\n super(ColumnDTypeInSetConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if str(received_dtypes) not in self.expected_dtype_set:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n f"{self.error_description}. 
DTypes received: {received_dtypes}"\n ),\n column_name=column_name,\n )\n\n\nclass NonNullableColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are not null."""\n\n def __init__(self):\n description = "No Null values allowed."\n super(NonNullableColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n rows_with_null_columns = dataframe[dataframe[column_name].isna()]\n if not rows_with_null_columns.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),\n )\n\n\nclass UniqueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are unique.\n\n Args:\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, ignore_missing_vals):\n description = "Column must be unique."\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(UniqueColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name].duplicated()\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_duplicated_values = dataframe[invalid]\n if not rows_with_duplicated_values.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_duplicated_values,\n )\n\n\nclass CategoricalColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are a valid category.\n\n 
Args:\n categories (Set[str]): Set of categories that values in your pandas column must match.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, categories, ignore_missing_vals):\n self.categories = list(check.set_param(categories, "categories", of_type=str))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(CategoricalColumnConstraint, self).__init__(\n error_description=f"Expected Categories are {self.categories}",\n markdown_description=f"Category examples are {self.categories[:5]}...",\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].isin(self.categories)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_unexpected_buckets = dataframe[invalid]\n if not rows_with_unexpected_buckets.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_unexpected_buckets,\n )\n\n\nclass MinValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are greater than the provided\n lower bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, min_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MinValueColumnConstraint, self).__init__(\n markdown_description=f"values > {self.min_value}",\n error_description=f"Column must have values > {self.min_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = 
dataframe[column_name] < self.min_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass MaxValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are less than the provided\n upper bound [inclusive].\n\n Args:\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, max_value, ignore_missing_vals):\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MaxValueColumnConstraint, self).__init__(\n markdown_description=f"values < {self.max_value}",\n error_description=f"Column must have values < {self.max_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] > self.max_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass InRangeColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are between the lower and upper\n bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n 
ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non\n missing values.\n """\n\n def __init__(self, min_value, max_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(InRangeColumnConstraint, self).__init__(\n markdown_description=f"{self.min_value} < values < {self.max_value}",\n error_description="Column must have values between {} and {} inclusive.".format(\n self.min_value, self.max_value\n ),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].between(self.min_value, self.max_value)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n
", "current_page_name": "_modules/dagster_pandas/constraints", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.constraints"}, "data_frame": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.data_frame

\nimport pandas as pd\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DagsterType,\n    Field,\n    MetadataValue,\n    StringSource,\n    TableColumn,\n    TableSchema,\n    TableSchemaMetadataValue,\n    TypeCheck,\n    _check as check,\n    dagster_type_loader,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import Selector\nfrom dagster._core.definitions.metadata import normalize_metadata\nfrom dagster._utils import dict_without_keys\n\nfrom dagster_pandas.constraints import (\n    CONSTRAINT_METADATA_KEY,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    ConstraintViolationException,\n)\nfrom dagster_pandas.validation import PandasColumn, validate_constraints\n\nCONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}\n\n\n@dagster_type_loader(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_loader(_context, config):\n    file_type, file_options = next(iter(config.items()))\n\n    if file_type == "csv":\n        path = file_options["path"]\n        return pd.read_csv(path, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        return pd.read_parquet(file_options["path"])\n    elif file_type == "table":\n        return pd.read_csv(file_options["path"], sep="\\t")\n    elif file_type == "pickle":\n        return pd.read_pickle(file_options["path"])\n    else:\n        raise DagsterInvariantViolationError(f"Unsupported file_type {file_type}")\n\n\ndef df_type_check(_, value):\n    if not isinstance(value, pd.DataFrame):\n        return TypeCheck(success=False)\n    return TypeCheck(\n        success=True,\n        metadata={\n            
"row_count": str(len(value)),\n            # string cast columns since they may be things like datetime\n            "metadata": {"columns": list(map(str, value.columns))},\n        },\n    )\n\n\nDataFrame = DagsterType(\n    name="PandasDataFrame",\n    description="""Two-dimensional size-mutable, potentially heterogeneous\n    tabular data structure with labeled axes (rows and columns).\n    See http://pandas.pydata.org/""",\n    loader=dataframe_loader,\n    type_check_fn=df_type_check,\n    typing_type=pd.DataFrame,\n)\n\n\ndef _construct_constraint_list(constraints):\n    def add_bullet(constraint_list, constraint_description):\n        return constraint_list + f"+ {constraint_description}\\n"\n\n    constraint_list = ""\n    for constraint in constraints:\n        if constraint.__class__ not in CONSTRAINT_BLACKLIST:\n            constraint_list = add_bullet(constraint_list, constraint.markdown_description)\n    return constraint_list\n\n\ndef _build_column_header(column_name, constraints):\n    header = f"**{column_name}**"\n    for constraint in constraints:\n        if isinstance(constraint, ColumnDTypeInSetConstraint):\n            dtypes_tuple = tuple(constraint.expected_dtype_set)\n            return header + f": `{dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]}`"\n        elif isinstance(constraint, ColumnDTypeFnConstraint):\n            return header + f": Validator `{constraint.type_fn.__name__}`"\n    return header\n\n\ndef create_dagster_pandas_dataframe_description(description, columns):\n    title = "\\n".join([description, "### Columns", ""])\n    buildme = title\n    for column in columns:\n        buildme += "{}\\n{}\\n".format(\n            _build_column_header(column.name, column.constraints),\n            _construct_constraint_list(column.constraints),\n        )\n    return buildme\n\n\ndef create_table_schema_metadata_from_dataframe(\n    pandas_df: pd.DataFrame,\n) -> TableSchemaMetadataValue:\n    """This function takes a 
pandas DataFrame and returns its metadata as a Dagster TableSchema.\n\n    Args:\n        pandas_df (pandas.DataFrame): A pandas DataFrame for which to create metadata.\n\n    Returns:\n        TableSchemaMetadataValue: returns an object with the TableSchema for the DataFrame.\n    """\n    check.inst(pandas_df, pd.DataFrame, "Input must be a pandas DataFrame object")\n    return MetadataValue.table_schema(\n        TableSchema(\n            columns=[\n                TableColumn(name=str(name), type=str(dtype))\n                for name, dtype in pandas_df.dtypes.items()\n            ]\n        )\n    )\n\n\n
[docs]def create_dagster_pandas_dataframe_type(\n name,\n description=None,\n columns=None,\n metadata_fn=None,\n dataframe_constraints=None,\n loader=None,\n):\n """Constructs a custom pandas dataframe dagster type.\n\n Args:\n name (str): Name of the dagster pandas type.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects\n which express dataframe column schemas and constraints.\n metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]])\n A callable which takes your dataframe and returns a dict with string label keys and\n MetadataValue values.\n dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from\n :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n """\n # We allow for the plugging in of a dagster_type_loader so that users can load their custom\n # dataframes via configuration their own way if the default configs don't suffice. This is\n # purely optional.\n check.str_param(name, "name")\n metadata_fn = check.opt_callable_param(metadata_fn, "metadata_fn")\n description = create_dagster_pandas_dataframe_description(\n check.opt_str_param(description, "description", default=""),\n check.opt_list_param(columns, "columns", of_type=PandasColumn),\n )\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n\n try:\n validate_constraints(\n value,\n pandas_columns=columns,\n dataframe_constraints=dataframe_constraints,\n )\n except ConstraintViolationException as e:\n return TypeCheck(success=False, description=str(e))\n\n return TypeCheck(\n success=True,\n metadata=_execute_summary_stats(name, value, metadata_fn) if metadata_fn else None,\n )\n\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n typing_type=pd.DataFrame,\n )
\n\n\n@experimental\ndef create_structured_dataframe_type(\n name,\n description=None,\n columns_validator=None,\n columns_aggregate_validator=None,\n dataframe_validator=None,\n loader=None,\n):\n """Args:\n name (str): the name of the new type\n description (Optional[str]): the description of the new type\n columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):\n what column-level row by row validation you want to have applied.\n Leave empty for no column-level row by row validation.\n columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,\n MultiAggregateConstraintWithMetadata]]):\n what column-level aggregate validation you want to have applied,\n Leave empty for no column-level aggregate validation.\n dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):\n what dataframe-wide validation you want to have applied.\n Leave empty for no dataframe-wide validation.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n\n Returns:\n a DagsterType with the corresponding name and packaged validation.\n\n """\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n individual_result_dict = {}\n\n if dataframe_validator is not None:\n individual_result_dict["dataframe"] = dataframe_validator.validate(value)\n if columns_validator is not None:\n individual_result_dict["columns"] = columns_validator.validate(value)\n\n if columns_aggregate_validator is not None:\n individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(\n value\n )\n\n typechecks_succeeded = True\n metadata = {}\n overall_description = "Failed Constraints: {}"\n constraint_clauses = []\n for key, result in individual_result_dict.items():\n result_val = result.success\n if result_val:\n continue\n typechecks_succeeded = typechecks_succeeded and result_val\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n metadata[f"{key}-constraint-metadata"] = MetadataValue.json(result_dict)\n constraint_clauses.append(f"{key} failing constraints, {result.description}")\n # returns aggregates, then column, then dataframe\n return TypeCheck(\n success=typechecks_succeeded,\n description=overall_description.format(constraint_clauses),\n metadata=metadata,\n )\n\n description = check.opt_str_param(description, "description", default="")\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n )\n\n\ndef _execute_summary_stats(type_name, value, metadata_fn):\n if not metadata_fn:\n return []\n\n user_metadata = metadata_fn(value)\n try:\n return normalize_metadata(user_metadata)\n except:\n raise DagsterInvariantViolationError(\n "The return value of the user-defined summary_statistics function for pandas "\n f"data frame type {type_name} returned {value}. This function must return "\n "Dict[str, RawMetadataValue]."\n )\n
", "current_page_name": "_modules/dagster_pandas/data_frame", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.data_frame"}, "validation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.validation

\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom pandas import DataFrame, Timestamp\nfrom pandas.core.dtypes.common import (\n    is_bool_dtype,\n    is_float_dtype,\n    is_integer_dtype,\n    is_numeric_dtype,\n    is_string_dtype,\n)\n\nfrom dagster_pandas.constraints import (\n    CategoricalColumnConstraint,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    Constraint,\n    ConstraintViolationException,\n    DataFrameConstraint,\n    InRangeColumnConstraint,\n    NonNullableColumnConstraint,\n    UniqueColumnConstraint,\n)\n\nPANDAS_NUMERIC_TYPES = {"int64", "float"}\n\n\ndef _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):\n    non_nullable = check.bool_param(non_nullable, "exists")\n    unique = check.bool_param(unique, "unique")\n    ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n    if non_nullable and ignore_missing_vals:\n        raise DagsterInvariantViolationError(\n            "PandasColumn cannot have a non-null constraint while also ignore missing values"\n        )\n    constraints = []\n    if non_nullable:\n        constraints.append(NonNullableColumnConstraint())\n    if unique:\n        constraints.append(UniqueColumnConstraint(ignore_missing_vals=ignore_missing_vals))\n    return constraints\n\n\n
[docs]class PandasColumn:\n """The main API for expressing column level schemas and constraints for your custom dataframe\n types.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If th column exists, the validate function will validate the column. Defaults to True.\n constraints (Optional[List[Constraint]]): List of constraint objects that indicate the\n validation rules for the pandas column.\n """\n\n def __init__(self, name, constraints=None, is_required=None):\n self.name = check.str_param(name, "name")\n self.is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self.constraints = check.opt_list_param(constraints, "constraints", of_type=Constraint)\n\n def validate(self, dataframe):\n if self.name not in dataframe.columns:\n # Ignore validation if column is missing from dataframe and is not required\n if self.is_required:\n raise ConstraintViolationException(\n f"Required column {self.name} not in dataframe with columns {dataframe.columns}"\n )\n else:\n for constraint in self.constraints:\n constraint.validate(dataframe, self.name)\n\n @staticmethod\n def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):\n """Simple constructor for PandasColumns that expresses existence constraints.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. 
Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=_construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def boolean_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_bool_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def numeric_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_numeric_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def integer_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_integer_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def float_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_float_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def datetime_column(\n name,\n min_datetime=Timestamp.min,\n max_datetime=Timestamp.max,\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n tz=None,\n ):\n """Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_datetime (Optional[Union[int,float]]): The lower bound for values you expect in this column.\n Defaults to pandas.Timestamp.min.\n max_datetime (Optional[Union[int,float]]): The upper bound for values you expect in this column.\n Defaults to pandas.Timestamp.max.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin', tz='US/Eastern'.\n Defaults to None, meaning naive datetime values.\n """\n if tz is None:\n datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})\n else:\n datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})\n # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors when converting min/max to be tz aware\n if min_datetime.tz_localize(None) == Timestamp.min:\n min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")\n if max_datetime.tz_localize(None) == Timestamp.max:\n max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")\n # Convert bounds to same tz\n if Timestamp(min_datetime).tz is None:\n min_datetime = Timestamp(min_datetime).tz_localize(tz)\n if Timestamp(max_datetime).tz is None:\n max_datetime = Timestamp(max_datetime).tz_localize(tz)\n\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n datetime_constraint,\n InRangeColumnConstraint(\n min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def string_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses constraints on string dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. 
If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_string_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def categorical_column(\n name,\n categories,\n of_types=frozenset({"category", "object"}),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n categories (List[Any]): The valid set of buckets that all values in the column must match.\n of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and values must\n abide by.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in\n the column ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the\n constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n of_types = {of_types} if isinstance(of_types, str) else of_types\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint(of_types),\n CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n\ndef validate_constraints(dataframe, pandas_columns=None, dataframe_constraints=None):\n dataframe = check.inst_param(dataframe, "dataframe", DataFrame)\n pandas_columns = check.opt_list_param(\n pandas_columns, "column_constraints", of_type=PandasColumn\n )\n dataframe_constraints = check.opt_list_param(\n dataframe_constraints, "dataframe_constraints", of_type=DataFrameConstraint\n )\n\n if pandas_columns:\n for column in pandas_columns:\n column.validate(dataframe)\n\n if dataframe_constraints:\n for dataframe_constraint in dataframe_constraints:\n dataframe_constraint.validate(dataframe)\n
", "current_page_name": "_modules/dagster_pandas/validation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.validation"}}, "dagster_postgres": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.event_log.event_log

\nfrom typing import Any, ContextManager, Mapping, Optional, Sequence\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import pg_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    DynamicPartitionsTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlEventLogStorageTable,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.event_log.polling_event_watcher import SqlPollingEventWatcher\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, deserialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\nCHANNEL_NAME = "run_events"\n\n\n
[docs]class PostgresEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """Postgres-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your event log storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 12-21\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = check.str_param(postgres_url, "postgres_url")\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n self._disposed = False\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n 
self._secondary_index_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "event_logs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, Any]\n ) -> "PostgresEventLogStorage":\n return PostgresEventLogStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n conn_string: str, 
should_autocreate_tables: bool = True\n ) -> "PostgresEventLogStorage":\n engine = create_engine(\n conn_string, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n return PostgresEventLogStorage(conn_string, should_autocreate_tables)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event) # from SqlEventLogStorage.py\n with self._connect() as conn:\n result = conn.execute(\n insert_event_statement.returning(\n SqlEventLogStorageTable.c.run_id, SqlEventLogStorageTable.c.id\n )\n )\n res = result.fetchone()\n result.close()\n\n # LISTEN/NOTIFY no longer used for pg event watch - preserved here to support version skew\n conn.execute(\n db.text(f"""NOTIFY {CHANNEL_NAME}, :notify_id; """),\n {"notify_id": res[0] + "_" + str(res[1])}, # type: ignore\n )\n event_id = int(res[1]) # type: ignore\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # job, run_id, 
etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. 
This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n\n values = self._get_asset_entry_values(\n event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n query = db_dialects.postgresql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(),\n **values,\n )\n if values:\n query = query.on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(**values),\n )\n else:\n query = query.on_conflict_do_nothing()\n conn.execute(query)\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n if not partition_keys:\n return\n\n # Overload base implementation to push upsert logic down into the db layer\n self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n db_dialects.postgresql.insert(DynamicPartitionsTable)\n .values(\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in partition_keys\n ]\n )\n .on_conflict_do_nothing(),\n )\n\n def _connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n return bool(self._engine.dialect.has_table(self._engine.connect(), table_name))\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n PostgresEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n 
super(PostgresEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(\n self,\n run_id: str,\n cursor: Optional[str],\n callback: EventHandlerFn,\n ) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def _gen_event_log_entry_from_cursor(self, cursor) -> EventLogEntry:\n with self._engine.connect() as conn:\n cursor_res = conn.execute(\n db_select([SqlEventLogStorageTable.c.event]).where(\n SqlEventLogStorageTable.c.id == cursor\n ),\n )\n return deserialize_value(cursor_res.scalar(), EventLogEntry) # type: ignore\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n def __del__(self) -> None:\n # Keep the inherent limitations of __del__ in Python in mind!\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.run_storage.run_storage

\nimport zlib\nfrom typing import ContextManager, Mapping, Optional\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable, SnapshotsTable\nfrom dagster._core.storage.runs.sql_run_storage import SnapshotType\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresRunStorage(SqlRunStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your run storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 1-10\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because 
alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "runs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ):\n return PostgresRunStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresRunStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n 
RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresRunStorage(postgres_url, should_autocreate_tables)\n\n def connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n run_alembic_upgrade(pg_alembic_config(__file__), conn)\n\n def has_built_index(self, migration_name: str) -> bool:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n PostgresRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n super(PostgresRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert or update if already present, using postgres specific on_conflict\n conn.execute(\n db_dialects.postgresql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_conflict_do_update(\n index_elements=[DaemonHeartbeatsTable.c.daemon_type],\n set_={\n "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n "daemon_id": daemon_heartbeat.daemon_id,\n "body": serialize_value(daemon_heartbeat),\n },\n )\n .returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n DaemonHeartbeatsTable.c.daemon_type,\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n\n # pg speciic on_conflict_do_update\n insert_stmt = 
db_dialects.postgresql.insert(KeyValueStoreTable).values(\n [{"key": k, "value": v} for k, v in pairs.items()]\n )\n upsert_stmt = insert_stmt.on_conflict_do_update(\n index_elements=[\n KeyValueStoreTable.c.key,\n ],\n set_={"value": insert_stmt.excluded.value},\n ).returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n KeyValueStoreTable.c.key\n )\n\n with self.connect() as conn:\n conn.execute(upsert_stmt)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n with self.connect() as conn:\n snapshot_insert = (\n db_dialects.postgresql.insert(SnapshotsTable)\n .values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n .on_conflict_do_nothing()\n )\n conn.execute(snapshot_insert)\n return snapshot_id\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.scheduler.instigation import InstigatorState\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your schedule storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 23-32\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared 
with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n missing_main_table = "schedules" not in table_names and "jobs" not in table_names\n if missing_main_table:\n retry_pg_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ) -> "PostgresScheduleStorage":\n return PostgresScheduleStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresScheduleStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n 
finally:\n engine.dispose()\n return PostgresScheduleStorage(postgres_url, should_autocreate_tables)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.postgresql.insert(InstigatorsTable)\n .values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_conflict_do_update(\n index_elements=[InstigatorsTable.c.selector_id],\n set_={\n "status": state.status.value,\n "instigator_type": state.instigator_type.value,\n "instigator_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n },\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.schedule_storage.schedule_storage"}}}, "dagster_prometheus": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_prometheus.resources

\nimport prometheus_client\nfrom dagster import (\n    ConfigurableResource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom prometheus_client.exposition import default_handler\nfrom pydantic import Field, PrivateAttr\n\n\n
[docs]class PrometheusClient:\n """Integrates with Prometheus via the prometheus_client library."""
\n\n\n
[docs]class PrometheusResource(ConfigurableResource):\n """This resource is used to send metrics to a Prometheus Pushgateway.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster_prometheus import PrometheusResource\n from dagster import Definitions, job, op\n\n @op\n def example_prometheus_op(prometheus: PrometheusResource):\n prometheus.push_to_gateway(job="my_job")\n\n @job\n def my_job():\n example_prometheus_op()\n\n defs = Definitions(\n jobs=[my_job],\n resources={"prometheus": PrometheusResource(gateway="http://pushgateway.local")},\n )\n\n """\n\n gateway: str = Field(\n description=(\n "The url for your push gateway. Either of the"\n " form 'http://pushgateway.local', or 'pushgateway.local'."\n " Scheme defaults to 'http' if none is provided"\n )\n )\n timeout: int = Field(\n default=30,\n description="is how long delete will attempt to connect before giving up. Defaults to 30s.",\n )\n _registry: prometheus_client.CollectorRegistry = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._registry = prometheus_client.CollectorRegistry()\n\n @property\n def registry(self) -> prometheus_client.CollectorRegistry:\n return self._registry\n\n def push_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """Push metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n If not None, the argument must be a function which accepts\n the following arguments:\n url, method, timeout, headers, and content\n May be used to implement additional functionality not\n supported by the built-in 
default handler (such as SSL\n client certicates, and HTTP authentication mechanisms).\n 'url' is the URL for the request, the 'gateway' argument\n described earlier will form the basis of this URL.\n 'method' is the HTTP method which should be used when\n carrying out the request.\n 'timeout' requests not successfully completed after this\n many seconds should be aborted. If timeout is None, then\n the handler should not set a timeout.\n 'headers' is a list of ("header-name","header-value") tuples\n which must be passed to the pushgateway in the form of HTTP\n request headers.\n The function should raise an exception (e.g. IOError) on\n failure.\n 'content' is the data which should be used to form the HTTP\n Message Body.\n This overwrites all metrics with the same job and grouping_key.\n This uses the PUT HTTP method.\n """\n prometheus_client.push_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def pushadd_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """PushAdd metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `registry` is an instance of CollectorRegistry\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This replaces metrics with the same name, job and grouping_key.\n This uses the POST HTTP method.\n """\n prometheus_client.pushadd_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def delete_from_gateway(self, job, grouping_key=None, 
handler=default_handler) -> None:\n """Delete metrics from the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This deletes metrics with the given job and grouping_key.\n This uses the DELETE HTTP method.\n """\n prometheus_client.delete_from_gateway(\n gateway=self.gateway,\n job=job,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=PrometheusResource.to_config_schema(),\n description="""This resource is for sending metrics to a Prometheus Pushgateway.""",\n)\ndef prometheus_resource(context):\n return PrometheusResource(\n gateway=context.resource_config["gateway"], timeout=context.resource_config["timeout"]\n )
\n
", "current_page_name": "_modules/dagster_prometheus/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_prometheus.resources"}}, "dagster_pyspark": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pyspark.resources

\nfrom typing import Any, Dict\n\nimport dagster._check as check\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster_spark.configs_spark import spark_config\nfrom dagster_spark.utils import flatten_dict\nfrom pydantic import PrivateAttr\nfrom pyspark.sql import SparkSession\n\n\ndef spark_session_from_config(spark_conf=None):\n    spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n    builder = SparkSession.builder\n    flat = flatten_dict(spark_conf)\n    for key, value in flat:\n        builder = builder.config(key, value)\n\n    return builder.getOrCreate()\n\n\n
[docs]class PySparkResource(ConfigurableResource):\n """This resource provides access to a PySpark Session for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(pyspark: PySparkResource)\n spark_session = pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n\n @job(\n resource_defs={\n "pyspark": PySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n return self.spark_session.sparkContext
\n\n\n
[docs]@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef pyspark_resource(init_context) -> PySparkResource:\n """This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op(required_resource_keys={"pyspark"})\n def my_op(context):\n spark_session = context.resources.pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return PySparkResource.from_resource_context(context_updated_config)
\n\n\nclass LazyPySparkResource(ConfigurableResource):\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(lazy_pyspark: LazyPySparkResource)\n spark_session = lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n @job(\n resource_defs={\n "lazy_pyspark": LazyPySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _init_session(self) -> None:\n if self._spark_session is None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n self._init_session()\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n self._init_session()\n return self._spark_session.sparkContext\n\n\n@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef lazy_pyspark_resource(init_context: InitResourceContext) -> LazyPySparkResource:\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. 
code-block:: python\n\n @op(required_resource_keys={"lazy_pyspark"})\n def my_op(context):\n spark_session = context.resources.lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = lazy_pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"lazy_pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return LazyPySparkResource.from_resource_context(context_updated_config)\n
", "current_page_name": "_modules/dagster_pyspark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pyspark.resources"}}, "dagster_shell": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.ops

\nimport os\nfrom enum import Enum\nfrom typing import AbstractSet, Any, Dict, Mapping, Optional\n\nfrom dagster import (\n    Config,\n    Failure,\n    In,\n    Nothing,\n    OpExecutionContext,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom pydantic import Field\n\nfrom .utils import execute, execute_script_file\n\n\nclass OutputType(Enum):\n    STREAM = "STREAM"\n    """Stream script stdout/stderr."""\n\n    BUFFER = "BUFFER"\n    """Buffer shell script stdout/stderr, then log upon completion."""\n\n    NONE = "NONE"\n    """No logging."""\n\n\nclass ShellOpConfig(Config):\n    env: Optional[Dict[str, str]] = Field(\n        default=None,\n        description="An optional dict of environment variables to pass to the subprocess.",\n    )\n    output_logging: OutputType = Field(\n        OutputType.BUFFER.value,\n    )\n    cwd: Optional[str] = Field(\n        default=None, description="Working directory in which to execute shell script"\n    )\n\n    def to_execute_params(self) -> Dict[str, Any]:\n        return {\n            "env": {**os.environ, **(self.env or {})},\n            "output_logging": self.output_logging.value,\n            "cwd": self.cwd,\n        }\n\n\n
[docs]@op(\n name="shell_op",\n description=(\n "This op executes a shell command it receives as input.\\n\\n"\n "This op is suitable for uses where the command to execute is generated dynamically by "\n "upstream ops. If you know the command to execute at job construction time, "\n "consider `shell_command_op` instead."\n ),\n ins={"shell_command": In(str)},\n out=Out(str),\n)\ndef shell_op(context: OpExecutionContext, shell_command: str, config: ShellOpConfig) -> str:\n """This op executes a shell command it receives as input.\n This op is suitable for uses where the command to execute is generated dynamically by\n upstream ops. If you know the command to execute at job construction time,\n consider ``shell_command_op`` instead.\n\n Args:\n shell_command: The shell command to be executed\n config (ShellOpConfig): A ShellOpConfig object specifying configuration options\n\n Examples:\n .. code-block:: python\n\n @op\n def create_shell_command():\n return "echo hello world!"\n\n @graph\n def echo_graph():\n shell_op(create_shell_command())\n """\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output
\n\n\n
[docs]def create_shell_command_op(\n shell_command: str,\n name: str,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n) -> OpDefinition:\n """This function is a factory that constructs ops to execute a shell command.\n\n Note that you can only use ``shell_command_op`` if you know the command you'd like to execute\n at job construction time. If you'd like to construct shell commands dynamically during\n job execution and pass them between ops, you should use ``shell_op`` instead.\n\n The resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_command_op("echo hello world!", name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_command (str): The shell command that the constructed op will execute.\n name (str): The name of the constructed op.\n description (Optional[str]): Human-readable description of this op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n\n @op(\n name=name,\n description=description,\n ins={"start": In(Nothing)},\n out=Out(str),\n required_resource_keys=required_resource_keys,\n tags=tags,\n )\n def _shell_fn(context, config: ShellOpConfig):\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_fn
\n\n\n
[docs]def create_shell_script_op(\n shell_script_path,\n name="create_shell_script_op",\n ins: Optional[Mapping[str, In]] = None,\n **kwargs: Any,\n) -> OpDefinition:\n """This function is a factory which constructs an op that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@op\n <dagster.op>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@graph <dagster.graph>` to wrap this op\n in the cases where you'd like to configure the shell op with different config fields.\n\n If no ``ins`` are passed then the resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (Optional[str]): The name of this op. Defaults to "create_shell_script_op".\n ins (Optional[Mapping[str, In]]): Ins for the op. 
Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n check.str_param(shell_script_path, "shell_script_path")\n name = check.str_param(name, "name")\n check.opt_mapping_param(ins, "ins", value_type=In)\n\n if "config" in kwargs:\n raise TypeError("Overriding config for shell op is not supported.")\n\n @op(\n name=name,\n description=kwargs.pop("description", "An op to invoke a shell command."),\n ins=ins or {"start": In(Nothing)},\n out=Out(str),\n **kwargs,\n )\n def _shell_script_fn(context, config: ShellOpConfig):\n output, return_code = execute_script_file(\n shell_script_path=shell_script_path, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_script_fn
\n
", "current_page_name": "_modules/dagster_shell/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.ops"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.utils

\n#\n# NOTE: This file is based on the bash operator from Apache Airflow, which can be found here:\n# https://github.com/apache/airflow/blob/master/airflow/operators/bash.py\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# "License"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\nimport os\nimport signal\nfrom logging import Logger\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Mapping, Optional, Tuple\n\nimport dagster._check as check\nfrom dagster._utils import safe_tempfile_path\nfrom typing_extensions import Final\n\nOUTPUT_LOGGING_OPTIONS: Final = ["STREAM", "BUFFER", "NONE"]\n\n\ndef execute_script_file(\n    shell_script_path: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """Execute a shell script file specified by the argument ``shell_script_path``. The script will be\n    invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. 
literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_utility.py\n           :language: python\n\n    Args:\n        shell_script_path (str): The shell script to execute.\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Raises:\n        Exception: When an invalid output_logging is selected. Unreachable from op-based\n            invocation since the config system will check output_logging against the config\n            enum.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_script_path, "shell_script_path")\n    check.str_param(output_logging, "output_logging")\n    check.opt_str_param(cwd, "cwd", default=os.path.dirname(shell_script_path))\n    env = check.opt_nullable_dict_param(env, "env", key_type=str, value_type=str)\n\n    if output_logging not in OUTPUT_LOGGING_OPTIONS:\n        raise Exception("Unrecognized output_logging %s" % output_logging)\n\n    def pre_exec():\n        # Restore default signal disposition and invoke setsid\n        for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):\n            if hasattr(signal, sig):\n                signal.signal(getattr(signal, sig), signal.SIG_DFL)\n        os.setsid()\n\n    with open(shell_script_path, "rb") as f:\n        shell_command = f.read().decode("utf-8")\n\n    log.info(f"Running command:\\n{shell_command}")\n\n    sub_process = None\n  
  try:\n        stdout_pipe = PIPE\n        stderr_pipe = STDOUT\n        if output_logging == "NONE":\n            stdout_pipe = stderr_pipe = None\n\n        sub_process = Popen(\n            ["bash", shell_script_path],\n            stdout=stdout_pipe,\n            stderr=stderr_pipe,\n            cwd=cwd,\n            env=env,\n            preexec_fn=pre_exec,  # noqa: PLW1509\n            encoding="UTF-8",\n        )\n\n        log.info(f"Command pid: {sub_process.pid}")\n\n        output = ""\n        if output_logging == "STREAM":\n            assert sub_process.stdout is not None, "Setting stdout=PIPE should always set stdout."\n            # Stream back logs as they are emitted\n            lines = []\n            for line in sub_process.stdout:\n                log.info(line.rstrip())\n                lines.append(line)\n            output = "".join(lines)\n        elif output_logging == "BUFFER":\n            # Collect and buffer all logs, then emit\n            output, _ = sub_process.communicate()\n            log.info(output)\n\n        sub_process.wait()\n        log.info(f"Command exited with return code {sub_process.returncode}")\n\n        return output, sub_process.returncode\n    finally:\n        # Always terminate subprocess, including in cases where the run is terminated\n        if sub_process:\n            sub_process.terminate()\n\n\ndef execute(\n    shell_command: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """This function is a utility for executing shell commands from within a Dagster op (or from Python in general).\n    It can be used to execute shell commands on either op input data, or any data generated within a generic python op.\n\n    Internally, it executes a shell script specified by the argument ``shell_command``. 
The script will be written\n    to a temporary file first and invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_utility.py\n           :language: python\n\n    Args:\n        shell_command (str): The shell command to execute\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_command, "shell_command")\n    # other args checked in execute_file\n\n    with safe_tempfile_path() as tmp_file_path:\n        tmp_path = os.path.dirname(tmp_file_path)\n        log.info("Using temporary directory: %s" % tmp_path)\n\n        with open(tmp_file_path, "wb") as tmp_file:\n            tmp_file.write(shell_command.encode("utf-8"))\n            tmp_file.flush()\n            script_location = os.path.abspath(tmp_file.name)\n            log.info(f"Temporary script location: {script_location}")\n            return execute_script_file(\n                shell_script_path=tmp_file.name,\n                output_logging=output_logging,\n                log=log,\n                cwd=(cwd or tmp_path),\n                env=env,\n            )\n
", "current_page_name": "_modules/dagster_shell/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.utils"}}, "dagster_slack": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_failure(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_failure("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} failed!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_success(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_success("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} worked!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_slack/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\nfrom slack_sdk.web.client import WebClient\n\n\n
[docs]class SlackResource(ConfigurableResource):\n """This resource is for connecting to Slack.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import EnvVar, job, op\n from dagster_slack import SlackResource\n\n\n @op\n def slack_op(slack: SlackResource):\n slack.get_client().chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job\n def slack_job():\n slack_op()\n\n defs = Definitions(\n jobs=[slack_job],\n resources={\n "slack": SlackResource(token=EnvVar("MY_SLACK_TOKEN")),\n },\n )\n """\n\n token: str = Field(\n description=(\n "To configure access to the Slack API, you'll need an access"\n " token provisioned with access to your Slack workspace."\n " Tokens are typically either user tokens or bot tokens. For programmatic posting"\n " to Slack from this resource, you probably want to provision and use a bot token."\n " More in the Slack API documentation here: https://api.slack.com/docs/token-types"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> WebClient:\n """Returns a ``slack_sdk.WebClient`` for interacting with the Slack API."""\n return WebClient(self.token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SlackResource.to_config_schema(),\n)\ndef slack_resource(context) -> WebClient:\n """This resource is for connecting to Slack.\n\n The resource object is a `slack_sdk.WebClient`.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import job, op\n from dagster_slack import slack_resource\n\n\n @op(required_resource_keys={'slack'})\n def slack_op(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job(resource_defs={'slack': slack_resource})\n def slack_job():\n slack_op()\n\n slack_job.execute_in_process(\n run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n )\n """\n return SlackResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_slack/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.sensors

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n)\n\nfrom dagster import (\n    AssetSelection,\n    DefaultSensorStatus,\n    FreshnessPolicySensorContext,\n    freshness_policy_sensor,\n)\nfrom dagster._annotations import deprecated_param, experimental\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\nfrom slack_sdk.web.client import WebClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nT = TypeVar("T", RunFailureSensorContext, FreshnessPolicySensorContext)\n\n\ndef _build_slack_blocks_and_text(\n    context: T,\n    text_fn: Callable[[T], str],\n    blocks_fn: Optional[Callable[[T], List[Dict[Any, Any]]]],\n    webserver_base_url: Optional[str],\n) -> Tuple[List[Dict[str, Any]], str]:\n    main_body_text = text_fn(context)\n    blocks: List[Dict[Any, Any]] = []\n    if blocks_fn:\n        blocks.extend(blocks_fn(context))\n    else:\n        if isinstance(context, RunFailureSensorContext):\n            text = (\n                f'*Job "{context.dagster_run.job_name}" failed.'\n                f' `{context.dagster_run.run_id.split("-")[0]}`*'\n            )\n        else:\n            text = (\n                f'*Asset "{context.asset_key.to_user_string()}" is now'\n                f' {"on time" if context.minutes_overdue == 0 else f"{context.minutes_overdue:.2f} minutes late.*"}'\n            )\n\n        blocks.extend(\n            [\n                {\n                    "type": "section",\n                    "text": {\n     
                   "type": "mrkdwn",\n                        "text": text,\n                    },\n                },\n                {\n                    "type": "section",\n                    "text": {"type": "mrkdwn", "text": main_body_text},\n                },\n            ]\n        )\n\n    if webserver_base_url:\n        if isinstance(context, RunFailureSensorContext):\n            url = f"{webserver_base_url}/runs/{context.dagster_run.run_id}"\n        else:\n            url = f"{webserver_base_url}/assets/{'/'.join(context.asset_key.path)}"\n        blocks.append(\n            {\n                "type": "actions",\n                "elements": [\n                    {\n                        "type": "button",\n                        "text": {"type": "plain_text", "text": "View in Dagster UI"},\n                        "url": url,\n                    }\n                ],\n            }\n        )\n    return blocks, main_body_text\n\n\ndef _default_failure_message_text_fn(context: RunFailureSensorContext) -> str:\n    return f"Error: ```{context.failure_event.message}```"\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_slack_on_run_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[RunFailureSensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on job failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. 
If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]): Function which takes in\n the ``RunFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed job run.\n minimum_interval_seconds: (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): The jobs in the\n current repository that will be monitored by this failure sensor. Defaults to None, which\n means the alert will be sent when any job in the repository fails. To monitor jobs in external repositories, use RepositorySelector and JobSelector\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): (deprecated in favor of monitored_jobs)\n The jobs in the current repository that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. 
Specify this to allow\n messages to include deeplinks to the failed job run.\n\n Examples:\n .. code-block:: python\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_job + slack_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.dagster_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n minimum_interval_seconds=minimum_interval_seconds,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n )\n def slack_on_run_failure(context: RunFailureSensorContext):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_run_failure
\n\n\ndef _default_freshness_message_text_fn(context: FreshnessPolicySensorContext) -> str:\n return (\n f"Asset `{context.asset_key.to_user_string()}` is now {context.minutes_overdue:.2f} minutes"\n " late."\n )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@experimental\ndef make_slack_on_freshness_policy_status_change_sensor(\n channel: str,\n slack_token: str,\n asset_selection: AssetSelection,\n warn_after_minutes_overdue: float = 0,\n notify_when_back_on_time: bool = False,\n text_fn: Callable[[FreshnessPolicySensorContext], str] = _default_freshness_message_text_fn,\n blocks_fn: Optional[Callable[[FreshnessPolicySensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor that will message the given Slack channel whenever an asset in the provided\n AssetSelection becomes out of date. Messages are only fired when the state changes, meaning\n only a single slack message will be sent (when the asset begins to be out of date). If\n `notify_when_back_on_time` is set to `True`, a second slack message will be sent once the asset\n is on time again.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. 
More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n asset_selection (AssetSelection): The selection of assets which this sensor will monitor.\n Alerts will only be fired for assets that have a FreshnessPolicy defined.\n warn_after_minutes_overdue (float): How many minutes past the specified FreshnessPolicy this\n sensor will wait before firing an alert (by default, an alert will be fired as soon as\n the policy is violated).\n notify_when_back_on_time (bool): If a success message should be sent when the asset becomes on\n time again.\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``FreshnessPolicySensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains the relevant asset key, and the number of\n minutes past its defined freshness policy it currently is.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[FreshnessPolicySensorContext], List[Dict]]): Function which takes in\n the ``FreshnessPolicySensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_freshness_policy".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n\n Examples:\n .. code-block:: python\n\n slack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN"),\n )\n\n .. code-block:: python\n\n def my_message_fn(context: FreshnessPolicySensorContext) -> str:\n if context.minutes_overdue == 0:\n return f"Asset {context.asset_key} is currently on time :)"\n return (\n f"Asset {context.asset_key} is currently {context.minutes_overdue} minutes late!!"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n\n @freshness_policy_sensor(\n name=name, asset_selection=asset_selection, default_status=default_status\n )\n def slack_on_freshness_policy(context: FreshnessPolicySensorContext):\n if context.minutes_overdue is None or context.previous_minutes_overdue is None:\n return\n\n if (\n context.minutes_overdue > warn_after_minutes_overdue\n and context.previous_minutes_overdue <= warn_after_minutes_overdue\n ) or (\n notify_when_back_on_time\n and context.minutes_overdue == 0\n and context.previous_minutes_overdue != 0\n ):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_freshness_policy
\n
", "current_page_name": "_modules/dagster_slack/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.sensors"}}, "dagster_snowflake": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.ops

\nfrom dagster import (\n    Nothing,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.input import In\n\n\ndef _core_create_snowflake_command(dagster_decorator, decorator_name, sql, parameters=None):\n    check.str_param(sql, "sql")\n    check.opt_dict_param(parameters, "parameters")\n\n    @dagster_decorator(\n        name=f"snowflake_{decorator_name}",\n        ins={"start": In(Nothing)},\n        required_resource_keys={"snowflake"},\n        tags={"kind": "sql", "sql": sql},\n    )\n    def snowflake_fn(context):\n        context.resources.snowflake.execute_query(sql=sql, parameters=parameters)\n\n    return snowflake_fn\n\n\ndef snowflake_solid_for_query(sql, parameters=None):\n    """This function is a solid factory that constructs solids to execute a snowflake query.\n\n    Note that you can only use `snowflake_solid_for_query` if you know the query you'd like to\n    execute at job construction time. If you'd like to execute queries dynamically during\n    job execution, you should manually execute those queries in your custom solid using the\n    snowflake resource.\n\n    Args:\n        sql (str): The sql query that will execute against the provided snowflake resource.\n        parameters (dict): The parameters for the sql query.\n\n    Returns:\n        SolidDefinition: Returns the constructed solid definition.\n    """\n    return _core_create_snowflake_command(op, "solid", sql, parameters)\n\n\n
[docs]def snowflake_op_for_query(sql, parameters=None):\n """This function is an op factory that constructs an op to execute a snowflake query.\n\n Note that you can only use `snowflake_op_for_query` if you know the query you'd like to\n execute at graph construction time. If you'd like to execute queries dynamically during\n job execution, you should manually execute those queries in your custom op using the\n snowflake resource.\n\n Args:\n sql (str): The sql query that will execute against the provided snowflake resource.\n parameters (dict): The parameters for the sql query.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return _core_create_snowflake_command(op, "op", sql, parameters)
\n
", "current_page_name": "_modules/dagster_snowflake/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.resources

\nimport base64\nimport sys\nimport warnings\nfrom contextlib import closing, contextmanager\nfrom typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom cryptography.hazmat.backends import default_backend\nfrom cryptography.hazmat.primitives import serialization\nfrom dagster import (\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.storage.event_log.sql_event_log import SqlDbConnection\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field, root_validator, validator\n\ntry:\n    import snowflake.connector\nexcept ImportError:\n    msg = (\n        "Could not import snowflake.connector. This could mean you have an incompatible version "\n        "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; "\n        "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is "\n        "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall "\n        "dagster-snowflake to fix this error."\n    )\n    warnings.warn(msg)\n    raise\n\n\n
[docs]class SnowflakeResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """A resource for connecting to the Snowflake data warehouse.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n object. If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import SnowflakeResource\n\n @op\n def get_one(snowflake_resource: SnowflakeResource):\n with snowflake_resource.get_connection() as conn:\n # conn is a snowflake.connector.Connection object\n conn.cursor().execute("SELECT 1")\n\n @job\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n resources={\n 'snowflake_resource': SnowflakeResource(\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n user=EnvVar("SNOWFLAKE_USER"),\n password=EnvVar("SNOWFLAKE_PASSWORD")\n database="MY_DATABASE",\n schema="MY_SCHEMA",\n warehouse="MY_WAREHOUSE"\n )\n }\n )\n """\n\n account: Optional[str] = Field(\n default=None,\n description=(\n "Your Snowflake account name. 
For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n\n user: str = Field(description="User login name.")\n\n password: Optional[str] = Field(default=None, description="User password.")\n\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use ``USE DATABASE`` "\n " to change the database."\n ),\n )\n\n schema_: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default schema to use. After login, you can use ``USE SCHEMA`` to "\n "change the schema."\n ),\n alias="schema",\n ) # schema is a reserved word for pydantic\n\n role: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default role to use. After login, you can use ``USE ROLE`` to change "\n " the role."\n ),\n )\n\n warehouse: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default warehouse to use. After login, you can use ``USE WAREHOUSE`` "\n "to change the role."\n ),\n )\n\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set private_key_path and private_key_password. To avoid issues with"\n " newlines in the keys, you can base64 encode the key. You can retrieve the base64"\n " encoded key with this shell command: ``cat rsa_key.p8 | base64``"\n ),\n )\n\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key password to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both ``private_key`` and ``private_key_path`` if the private key is"\n " encrypted. 
For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key path to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set the raw private key as ``private_key``."\n ),\n )\n\n autocommit: Optional[bool] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True "\n "or False to enable or disable autocommit mode in the session, respectively."\n ),\n )\n\n client_prefetch_threads: Optional[int] = Field(\n default=None,\n description=(\n "Number of threads used to download the results sets (4 by default). "\n "Increasing the value improves fetch performance but requires more memory."\n ),\n )\n\n client_session_keep_alive: Optional[bool] = Field(\n default=None,\n description=(\n "False by default. Set this to True to keep the session active indefinitely, "\n "even if there is no activity from the user. Make certain to call the close method to "\n "terminate the thread properly or the process may hang."\n ),\n )\n\n login_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for login. By default, 60 seconds. The login request gives "\n 'up after the timeout length if the HTTP response is "success".'\n ),\n )\n\n network_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for all other operations. By default, none/infinite. A general"\n " request gives up after the timeout length if the HTTP response is not 'success'."\n ),\n )\n\n ocsp_response_cache_filename: Optional[str] = Field(\n default=None,\n description=(\n "URI for the OCSP response cache file. 
By default, the OCSP response cache "\n "file is created in the cache directory."\n ),\n )\n\n validate_default_parameters: Optional[bool] = Field(\n default=None,\n description=(\n "If True, raise an exception if the warehouse, database, or schema doesn't exist."\n " Defaults to False."\n ),\n )\n\n paramstyle: Optional[str] = Field(\n default=None,\n description=(\n "pyformat by default for client side binding. Specify qmark or numeric to "\n "change bind variable formats for server side binding."\n ),\n )\n\n timezone: Optional[str] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter TIMEZONE. Set to a "\n "valid time zone (e.g. America/Los_Angeles) to set the session time zone."\n ),\n )\n\n connector: Optional[str] = Field(\n default=None,\n description=(\n "Indicate alternative database connection engine. Permissible option is "\n "'sqlalchemy' otherwise defaults to use the Snowflake Connector for Python."\n ),\n is_required=False,\n )\n\n cache_column_metadata: Optional[str] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a"\n " flag ``cache_column_metadata=True`` such that all of column metadata for all tables"\n ' are "cached"'\n ),\n )\n\n numpy: Optional[bool] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. 
To enable fetching "\n "NumPy data types, add numpy=True to the connection parameters."\n ),\n )\n\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @validator("paramstyle")\n def validate_paramstyle(cls, v: Optional[str]) -> Optional[str]:\n valid_config = ["pyformat", "qmark", "numeric"]\n if v is not None and v not in valid_config:\n raise ValueError(\n "Snowflake Resource: 'paramstyle' configuration value must be one of:"\n f" {','.join(valid_config)}."\n )\n return v\n\n @validator("connector")\n def validate_connector(cls, v: Optional[str]) -> Optional[str]:\n if v is not None and v != "sqlalchemy":\n raise ValueError(\n "Snowflake Resource: 'connector' configuration value must be None or sqlalchemy."\n )\n return v\n\n @root_validator\n def validate_authentication(cls, values):\n auths_set = 0\n auths_set += 1 if values.get("password") is not None else 0\n auths_set += 1 if values.get("private_key") is not None else 0\n auths_set += 1 if values.get("private_key_path") is not None else 0\n\n # if authenticator is set, there can be 0 or 1 additional auth method;\n # otherwise, ensure at least 1 method is provided\n check.invariant(\n auths_set > 0 or values.get("authenticator") is not None,\n "Missing config: Password, private key, or authenticator authentication required"\n " for Snowflake resource.",\n )\n\n # ensure that only 1 non-authenticator method is provided\n check.invariant(\n auths_set <= 1,\n "Incorrect config: Cannot provide both password and private key authentication to"\n " Snowflake Resource.",\n )\n\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _connection_args(self) -> Mapping[str, Any]:\n conn_args = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "autocommit",\n 
"client_prefetch_threads",\n "client_session_keep_alive",\n "login_timeout",\n "network_timeout",\n "ocsp_response_cache_filename",\n "validate_default_parameters",\n "paramstyle",\n "timezone",\n "authenticator",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n if (\n self._resolved_config_dict.get("private_key", None) is not None\n or self._resolved_config_dict.get("private_key_path", None) is not None\n ):\n conn_args["private_key"] = self._snowflake_private_key(self._resolved_config_dict)\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_connection_args(self) -> Mapping[str, Any]:\n conn_args: Dict[str, Any] = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "cache_column_metadata",\n "numpy",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_engine_args(self) -> Mapping[str, Any]:\n config = self._resolved_config_dict\n sqlalchemy_engine_args = {}\n if (\n config.get("private_key", None) is not None\n or config.get("private_key_path", None) is not None\n ):\n # sqlalchemy passes private key args separately, so store them in a new dict\n sqlalchemy_engine_args["private_key"] = self._snowflake_private_key(config)\n if config.get("authenticator", None) is not None:\n sqlalchemy_engine_args["authenticator"] = config["authenticator"]\n\n return sqlalchemy_engine_args\n\n def _snowflake_private_key(self, config) -> bytes:\n # If the user has defined a path to a private key, we will use that.\n if config.get("private_key_path", None) is not None:\n # read the file from the path.\n with open(config.get("private_key_path"), "rb") as key:\n private_key = key.read()\n else:\n private_key = config.get("private_key", None)\n\n kwargs = {}\n if config.get("private_key_password", None) is not None:\n kwargs["password"] = config["private_key_password"].encode()\n else:\n 
kwargs["password"] = None\n\n try:\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except TypeError:\n try:\n private_key = base64.b64decode(private_key)\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except ValueError:\n raise ValueError(\n "Unable to load private key. You may need to base64 encode your private key."\n " You can retrieve the base64 encoded key with this shell command: cat"\n " rsa_key.p8 | base64"\n )\n\n pkb = p_key.private_bytes(\n encoding=serialization.Encoding.DER,\n format=serialization.PrivateFormat.PKCS8,\n encryption_algorithm=serialization.NoEncryption(),\n )\n\n return pkb\n\n @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__\n if raw_conn=True.\n\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def get_query_status(snowflake: SnowflakeResource, query_id):\n with snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n if self.connector == "sqlalchemy":\n from snowflake.sqlalchemy import URL\n from sqlalchemy import create_engine\n\n engine = create_engine(\n URL(**self._sqlalchemy_connection_args), connect_args=self._sqlalchemy_engine_args\n )\n conn = engine.raw_connection() if raw_conn else engine.connect()\n\n yield conn\n conn.close()\n engine.dispose()\n else:\n conn = snowflake.connector.connect(**self._connection_args)\n\n yield conn\n if not self.autocommit:\n conn.commit()\n conn.close()\n\n def get_object_to_set_on_execution_context(self) -> Any:\n # Directly create a SnowflakeConnection here for backcompat since the SnowflakeConnection\n # has methods this resource does not have\n return SnowflakeConnection(\n config=self._resolved_config_dict,\n log=get_dagster_logger(),\n snowflake_connection_resource=self,\n )
\n\n\n
[docs]class SnowflakeConnection:\n """A connection to Snowflake that can execute queries. In general this class should not be\n directly instantiated, but rather used as a resource in an op or asset via the\n :py:func:`snowflake_resource`.\n\n Note that the SnowflakeConnection is only used by the snowflake_resource. The Pythonic SnowflakeResource does\n not use this SnowflakeConnection class.\n """\n\n def __init__(\n self, config: Mapping[str, str], log, snowflake_connection_resource: SnowflakeResource\n ):\n self.snowflake_connection_resource = snowflake_connection_resource\n self.log = log\n\n
[docs] @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If using the execute_query, execute_queries, or load_table_from_local_parquet methods,\n you do not need to create a connection using this context manager.\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. code-block:: python\n\n @op(\n required_resource_keys={"snowflake"}\n )\n def get_query_status(query_id):\n with context.resources.snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n with self.snowflake_connection_resource.get_connection(raw_conn=raw_conn) as conn:\n yield conn
\n\n
[docs] @public\n def execute_query(\n self,\n sql: str,\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ):\n """Execute a query in Snowflake.\n\n Args:\n sql (str): the query to be executed\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to the query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the result of the query. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as a Pandas DataFrame.\n use_pandas_result (bool): If True, will return the result of the query as a Pandas DataFrame.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The result of the query if fetch_results or use_pandas_result is True, otherwise returns None\n\n Examples:\n .. code-block:: python\n\n @op\n def drop_database(snowflake: SnowflakeResource):\n snowflake.execute_query(\n "DROP DATABASE IF EXISTS MY_DATABASE"\n )\n """\n check.str_param(sql, "sql")\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n return cursor.fetch_pandas_all()\n if fetch_results:\n return cursor.fetchall()
\n\n
[docs] @public\n def execute_queries(\n self,\n sql_queries: Sequence[str],\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ) -> Optional[Sequence[Any]]:\n """Execute multiple queries in Snowflake.\n\n Args:\n sql_queries (str): List of queries to be executed in series\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to every query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the results of the queries as a list. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as Pandas DataFrames.\n use_pandas_result (bool): If True, will return the results of the queries as a list of a Pandas DataFrames.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The results of the queries as a list if fetch_results or use_pandas_result is True,\n otherwise returns None\n\n Examples:\n .. 
code-block:: python\n\n @op\n def create_fresh_database(snowflake: SnowflakeResource):\n queries = ["DROP DATABASE IF EXISTS MY_DATABASE", "CREATE DATABASE MY_DATABASE"]\n snowflake.execute_queries(\n sql_queries=queries\n )\n\n """\n check.sequence_param(sql_queries, "sql_queries", of_type=str)\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n results: List[Any] = []\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n for raw_sql in sql_queries:\n sql = raw_sql.encode("utf-8") if sys.version_info[0] < 3 else raw_sql\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n results = results.append(cursor.fetch_pandas_all()) # type: ignore\n elif fetch_results:\n results.append(cursor.fetchall())\n\n return results if len(results) > 0 else None
\n\n
[docs] @public\n def load_table_from_local_parquet(self, src: str, table: str):\n """Stores the content of a parquet file to a Snowflake table.\n\n Args:\n src (str): the name of the file to store in Snowflake\n table (str): the name of the table to store the data. If the table does not exist, it will\n be created. Otherwise the contents of the table will be replaced with the data in src\n\n Examples:\n .. code-block:: python\n\n import pandas as pd\n import pyarrow as pa\n import pyarrow.parquet as pq\n\n @op\n def write_parquet_file(snowflake: SnowflakeResource):\n df = pd.DataFrame({"one": [1, 2, 3], "ten": [11, 12, 13]})\n table = pa.Table.from_pandas(df)\n pq.write_table(table, "example.parquet')\n snowflake.load_table_from_local_parquet(\n src="example.parquet",\n table="MY_TABLE"\n )\n\n """\n check.str_param(src, "src")\n check.str_param(table, "table")\n\n sql_queries = [\n f"CREATE OR REPLACE TABLE {table} ( data VARIANT DEFAULT NULL);",\n "CREATE OR REPLACE FILE FORMAT parquet_format TYPE = 'parquet';",\n f"PUT {src} @%{table};",\n f"COPY INTO {table} FROM @%{table} FILE_FORMAT = (FORMAT_NAME = 'parquet_format');",\n ]\n\n self.execute_queries(sql_queries)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SnowflakeResource.to_config_schema(),\n description="This resource is for connecting to the Snowflake data warehouse",\n)\ndef snowflake_resource(context) -> SnowflakeConnection:\n """A resource for connecting to the Snowflake data warehouse. The returned resource object is an\n instance of :py:class:`SnowflakeConnection`.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import snowflake_resource\n\n @op(required_resource_keys={'snowflake'})\n def get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n @job(resource_defs={'snowflake': snowflake_resource})\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n run_config={\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n }\n )\n """\n snowflake_resource = SnowflakeResource.from_resource_context(context)\n return SnowflakeConnection(\n config=context, log=context.log, snowflake_connection_resource=snowflake_resource\n )
\n
", "current_page_name": "_modules/dagster_snowflake/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.resources"}, "snowflake_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.snowflake_io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom pydantic import Field\nfrom sqlalchemy.exc import ProgrammingError\n\nfrom .resources import SnowflakeResource\n\nSNOWFLAKE_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_snowflake_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import build_snowflake_io_manager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n snowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. 
code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=SnowflakeIOManager.to_config_schema())\n def snowflake_io_manager(init_context):\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return snowflake_io_manager
\n\n\n
[docs]class SnowflakeIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n database: str = Field(description="Name of the database to use.")\n account: str = Field(\n description=(\n "Your Snowflake account name. For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n user: str = Field(description="User login name.")\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n password: Optional[str] = Field(default=None, description="User password.")\n warehouse: Optional[str] = Field(default=None, description="Name of the warehouse to use.")\n role: Optional[str] = Field(default=None, description="Name of the role to use.")\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details. To"\n " avoid issues with newlines in the keys, you can base64 encode the key. You can"\n " retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64"\n ),\n )\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Path to the private key. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n ),\n )\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "The password of the private key. 
See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both private_key and private_key_path if the private key is encrypted."\n " For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n store_timestamps_as_strings: bool = Field(\n default=False,\n description=(\n "If using Pandas DataFrames, whether to convert time data to strings. If True, time"\n " data will be converted to strings when storing the DataFrame and converted back to"\n " time data when loading the DataFrame. If False, time data without a timezone will be"\n " set to UTC timezone to avoid a Snowflake bug. Defaults to False."\n ),\n )\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n """type_handlers should return a list of the TypeHandlers that the I/O manager can use.\n\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n """\n ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n """If an asset or op is not annotated with an return type, default_load_type will be used to\n determine which TypeHandler to use to store and load the output.\n\n If left unimplemented, default_load_type will return None. In that case, if there is only\n one TypeHandler, the I/O manager will default to loading unannotated outputs with that\n TypeHandler.\n\n .. 
code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n import pandas as pd\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame\n """\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )
\n\n\nclass SnowflakeDbClient(DbClient):\n @staticmethod\n @contextmanager\n def connect(context, table_slice):\n no_schema_config = (\n {k: v for k, v in context.resource_config.items() if k != "schema"}\n if context.resource_config\n else {}\n )\n with SnowflakeResource(\n schema=table_slice.schema, connector="sqlalchemy", **no_schema_config\n ).get_connection(raw_conn=False) as conn:\n yield conn\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n schemas = connection.execute(\n f"show schemas like '{table_slice.schema}' in database {table_slice.database}"\n ).fetchall()\n if len(schemas) == 0:\n connection.execute(f"create schema {table_slice.schema};")\n\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except ProgrammingError:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"""\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n 
return f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n # Snowflake BETWEEN is inclusive; start <= partition expr <= end. We don't want to remove the next partition so we instead\n # write this as start <= partition expr < end.\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_snowflake/snowflake_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.snowflake_io_manager"}}, "dagster_snowflake_pandas": {"snowflake_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pandas.snowflake_pandas_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport pandas as pd\nimport pandas.core.dtypes.common as pd_core_dtypes_common\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient, SnowflakeIOManager\nfrom snowflake.connector.pandas_tools import pd_writer\n\n\ndef _table_exists(table_slice: TableSlice, connection):\n    tables = connection.execute(\n        f"SHOW TABLES LIKE '{table_slice.table}' IN SCHEMA"\n        f" {table_slice.database}.{table_slice.schema}"\n    ).fetchall()\n    return len(tables) > 0\n\n\ndef _get_table_column_types(table_slice: TableSlice, connection) -> Optional[Mapping[str, str]]:\n    if _table_exists(table_slice, connection):\n        schema_list = connection.execute(f"DESCRIBE TABLE {table_slice.table}").fetchall()\n        return {item[0]: item[1] for item in schema_list}\n\n\ndef _convert_timestamp_to_string(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    """Converts columns of data of type pd.Timestamp to string so that it can be stored in\n    snowflake.\n    """\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" not in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    "Snowflake I/O manager: Snowflake I/O manager configured to convert time data"\n                    f" in DataFrame column {column_name} to strings, but the corresponding"\n                    f" {column_name.upper()} column in table {table_name} is not of type 
VARCHAR,"\n                    f" it is of type {column_types[column_name]}. Please set"\n                    " store_timestamps_as_strings=False in the Snowflake I/O manager configuration"\n                    " to store time data as TIMESTAMP types."\n                )\n        return s.dt.strftime("%Y-%m-%d %H:%M:%S.%f %z")\n    else:\n        return s\n\n\ndef _convert_string_to_timestamp(s: pd.Series) -> pd.Series:\n    """Converts columns of strings in Timestamp format to pd.Timestamp to undo the conversion in\n    _convert_timestamp_to_string.\n\n    This will not convert non-timestamp strings into timestamps (pd.to_datetime will raise an\n    exception if the string cannot be converted)\n    """\n    if isinstance(s[0], str):\n        try:\n            return pd.to_datetime(s.values)  # type: ignore  # (bad stubs)\n        except ValueError:\n            return s\n    else:\n        return s\n\n\ndef _add_missing_timezone(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    f"Snowflake I/O manager: The Snowflake column {column_name.upper()} in table"\n                    f" {table_name} is of type {column_types[column_name]} and should be of type"\n                    f" TIMESTAMP to store the time data in dataframe column {column_name}. Please"\n                    " migrate this column to be of time TIMESTAMP_NTZ(9) to store time data."\n                )\n        return s.dt.tz_localize("UTC")\n    return s\n\n\n
[docs]class SnowflakePandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load Pandas DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ) -> Mapping[str, RawMetadataValue]:\n from snowflake import connector\n\n connector.paramstyle = "pyformat"\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n column_types = _get_table_column_types(table_slice, connection)\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _convert_timestamp_to_string(x, column_types, table_slice.table),\n axis="index",\n )\n else:\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _add_missing_timezone(x, column_types, table_slice.table), axis="index"\n )\n with_uppercase_cols.to_sql(\n table_slice.table,\n con=connection.engine,\n if_exists="append",\n index=False,\n method=pd_writer,\n )\n\n return {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n 
columns=[\n TableColumn(name=str(name), type=str(dtype))\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = pd.read_sql(\n sql=SnowflakeDbClient.get_select_statement(table_slice), con=connection\n )\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n result = result.apply(_convert_string_to_timestamp, axis="index")\n result.columns = map(str.lower, result.columns) # type: ignore # (bad stubs)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nsnowflake_pandas_io_manager = build_snowflake_io_manager(\n [SnowflakePandasTypeHandler()], default_load_type=pd.DataFrame\n)\nsnowflake_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the snowflake_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pandas import snowflake_pandas_io_manager\n from dagster import asset, Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pandas_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePandasIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\n using the SnowflakePandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pandas import SnowflakePandasIOManager\n from dagster import asset, Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePandasIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pandas/snowflake_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pandas.snowflake_pandas_type_handler"}}, "dagster_snowflake_pyspark": {"snowflake_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pyspark.snowflake_pyspark_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport dagster._check as check\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import SnowflakeIOManager, build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\nSNOWFLAKE_CONNECTOR = "net.snowflake.spark.snowflake"\n\n\ndef _get_snowflake_options(config, table_slice: TableSlice) -> Mapping[str, str]:\n    check.invariant(\n        config.get("warehouse", None) is not None,\n        "Missing config: Warehouse is required when using PySpark with the Snowflake I/O manager.",\n    )\n\n    conf = {\n        "sfURL": f"{config['account']}.snowflakecomputing.com",\n        "sfUser": config["user"],\n        "sfPassword": config["password"],\n        "sfDatabase": config["database"],\n        "sfSchema": table_slice.schema,\n        "sfWarehouse": config["warehouse"],\n    }\n\n    return conf\n\n\n
[docs]class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).option(\n "dbtable", table_slice.table\n ).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = 
(\n spark.read.format(SNOWFLAKE_CONNECTOR)\n .options(**options)\n .option("query", SnowflakeDbClient.get_select_statement(table_slice))\n .load()\n )\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return [DataFrame]
\n\n\nsnowflake_pyspark_io_manager = build_snowflake_io_manager(\n [SnowflakePySparkTypeHandler()], default_load_type=DataFrame\n)\nsnowflake_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pyspark import snowflake_pyspark_io_manager\n from pyspark.sql import DataFrame\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pyspark_io_manager.configured({\n "database": "my_database",\n "warehouse": "my_warehouse", # required for snowflake_pyspark_io_manager\n "account" : {"env": "SNOWFLAKE_ACCOUNT"},\n "password": {"env": "SNOWFLAKE_PASSWORD"},\n ...\n })\n }\n )\n\n Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePySparkIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\n using the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pyspark import SnowflakePySparkIOManager\n from pyspark.sql import DataFrame\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePySparkIOManager(\n database="my_database",\n warehouse="my_warehouse", # required for SnowflakePySparkIOManager\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n password=EnvVar("SNOWFLAKE_PASSWORD"),\n ...\n )\n }\n )\n\n Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pyspark/snowflake_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pyspark.snowflake_pyspark_type_handler"}}, "dagster_spark": {"configs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.configs

\n"""Spark Configuration.\n\nIn this file we define the key configuration parameters for submitting Spark jobs. Spark can be run\nin a variety of deployment contexts. See the Spark documentation at\nhttps://spark.apache.org/docs/latest/submitting-applications.html for a more in-depth summary of\nSpark deployment contexts and configuration.\n"""\nfrom dagster import Field, StringSource\n\nfrom .configs_spark import spark_config\nfrom .types import SparkDeployMode\n\n\n
[docs]def define_spark_config():\n """Spark configuration.\n\n See the Spark documentation for reference:\n https://spark.apache.org/docs/latest/submitting-applications.html\n """\n master_url = Field(\n StringSource,\n description="The master URL for the cluster (e.g. spark://23.195.26.187:7077)",\n is_required=True,\n )\n\n deploy_mode = Field(\n SparkDeployMode,\n description="""Whether to deploy your driver on the worker nodes (cluster) or locally as an\n external client (client) (default: client). A common deployment strategy is to submit your\n application from a gateway machine that is physically co-located with your worker machines\n (e.g. Master node in a standalone EC2 cluster). In this setup, client mode is appropriate.\n In client mode, the driver is launched directly within the spark-submit process which acts\n as a client to the cluster. The input and output of the application is attached to the\n console. Thus, this mode is especially suitable for applications that involve the REPL (e.g.\n Spark shell).""",\n is_required=False,\n )\n\n application_jar = Field(\n StringSource,\n description="""Path to a bundled jar including your application and all\n dependencies. The URL must be globally visible inside of your cluster, for\n instance, an hdfs:// path or a file:// path that is present on all nodes.\n """,\n is_required=True,\n )\n\n application_arguments = Field(\n StringSource,\n description="Arguments passed to the main method of your main class, if any",\n is_required=False,\n )\n\n spark_home = Field(\n StringSource,\n description=(\n "The path to your spark installation. Defaults to $SPARK_HOME at runtime if not"\n " provided."\n ),\n is_required=False,\n )\n\n return {\n "master_url": master_url,\n "deploy_mode": deploy_mode,\n "application_jar": application_jar,\n "spark_conf": spark_config(),\n "spark_home": spark_home,\n "application_arguments": application_arguments,\n }
\n
", "current_page_name": "_modules/dagster_spark/configs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.configs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.ops

\nfrom dagster import (\n    In,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\n\nfrom .configs import define_spark_config\n\n\n
[docs]def create_spark_op(\n name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n check.str_param(name, "name")\n check.str_param(main_class, "main_class")\n check.opt_str_param(description, "description", "A parameterized Spark job.")\n check.set_param(required_resource_keys, "required_resource_keys")\n\n @op(\n name=name,\n description=description,\n config_schema=define_spark_config(),\n ins={"start": In(Nothing)},\n out=Out(Nothing),\n tags={"kind": "spark", "main_class": main_class},\n required_resource_keys=required_resource_keys,\n )\n def spark_op(context):\n context.resources.spark.run_spark_job(context.op_config, main_class)\n\n return spark_op
\n
", "current_page_name": "_modules/dagster_spark/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.resources

\nimport os\nimport subprocess\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.log_manager import DagsterLogManager\n\nfrom .types import SparkOpError\nfrom .utils import construct_spark_shell_command\n\n\nclass SparkResource:\n    def __init__(self, logger):\n        self.logger = check.inst_param(logger, "logger", DagsterLogManager)\n\n    def run_spark_job(self, config, main_class):\n        check.dict_param(config, "config")\n        check.str_param(main_class, "main_class")\n\n        # Extract parameters from config\n        (\n            master_url,\n            deploy_mode,\n            application_jar,\n            spark_conf,\n            application_arguments,\n            spark_home,\n        ) = [\n            config.get(k)\n            for k in (\n                "master_url",\n                "deploy_mode",\n                "application_jar",\n                "spark_conf",\n                "application_arguments",\n                "spark_home",\n            )\n        ]\n\n        if not os.path.exists(application_jar):\n            raise SparkOpError(\n                f"Application jar {application_jar} does not exist. A valid jar must be "\n                "built before running this op."\n            )\n\n        spark_shell_cmd = construct_spark_shell_command(\n            application_jar=application_jar,\n            main_class=main_class,\n            master_url=master_url,\n            spark_conf=spark_conf,\n            deploy_mode=deploy_mode,\n            application_arguments=application_arguments,\n            spark_home=spark_home,\n        )\n        self.logger.info("Running spark-submit: " + " ".join(spark_shell_cmd))\n\n        retcode = subprocess.call(" ".join(spark_shell_cmd), shell=True)\n\n        if retcode != 0:\n            raise SparkOpError("Spark job failed. Please consult your logs.")\n\n\n
[docs]@dagster_maintained_resource\n@resource\ndef spark_resource(context):\n return SparkResource(context.log)
\n
", "current_page_name": "_modules/dagster_spark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.types

\nfrom dagster import Enum, EnumValue\n\nSparkDeployModeCluster = EnumValue("cluster")\nSparkDeployModeClient = EnumValue("client")\nSparkDeployMode = Enum(\n    name="SparkDeployMode", enum_values=[SparkDeployModeCluster, SparkDeployModeClient]\n)\n\n\n
[docs]class SparkOpError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_spark/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.utils

\nimport itertools\nimport os\n\nimport dagster._check as check\n\nfrom .types import SparkOpError\n\n\ndef flatten_dict(d):\n    def _flatten_dict(d, result, key_path=None):\n        """Iterates an arbitrarily nested dictionary and yield dot-notation key:value tuples.\n\n        {'foo': {'bar': 3, 'baz': 1}, {'other': {'key': 1}} =>\n            [('foo.bar', 3), ('foo.baz', 1), ('other.key', 1)]\n\n        """\n        for k, v in d.items():\n            new_key_path = (key_path or []) + [k]\n            if isinstance(v, dict):\n                _flatten_dict(v, result, new_key_path)\n            else:\n                result.append((".".join(new_key_path), v))\n\n    result = []\n    if d is not None:\n        _flatten_dict(d, result)\n    return result\n\n\ndef parse_spark_config(spark_conf):\n    """Convert spark conf dict to list of CLI arguments.\n\n    For each key-value pair in spark conf, we need to pass to CLI in format:\n\n    --conf "key=value"\n    """\n    spark_conf_list = flatten_dict(spark_conf)\n    return format_for_cli(spark_conf_list)\n\n\ndef format_for_cli(spark_conf_list):\n    return list(\n        itertools.chain.from_iterable([("--conf", "{}={}".format(*c)) for c in spark_conf_list])\n    )\n\n\n
[docs]def construct_spark_shell_command(\n application_jar,\n main_class,\n master_url=None,\n spark_conf=None,\n deploy_mode=None,\n application_arguments=None,\n spark_home=None,\n):\n """Constructs the spark-submit command for a Spark job."""\n check.opt_str_param(master_url, "master_url")\n check.str_param(application_jar, "application_jar")\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n check.opt_str_param(deploy_mode, "deploy_mode")\n check.opt_str_param(application_arguments, "application_arguments")\n check.opt_str_param(spark_home, "spark_home")\n\n spark_home = spark_home if spark_home else os.environ.get("SPARK_HOME")\n if spark_home is None:\n raise SparkOpError(\n "No spark home set. You must either pass spark_home in config or "\n "set $SPARK_HOME in your environment (got None)."\n )\n\n master_url = ["--master", master_url] if master_url else []\n deploy_mode = ["--deploy-mode", deploy_mode] if deploy_mode else []\n\n spark_shell_cmd = (\n [f"{spark_home}/bin/spark-submit", "--class", main_class]\n + master_url\n + deploy_mode\n + parse_spark_config(spark_conf)\n + [application_jar]\n + [application_arguments]\n )\n return spark_shell_cmd
\n
", "current_page_name": "_modules/dagster_spark/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.utils"}}, "dagster_ssh": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ssh.resources

\nimport getpass\nimport os\nfrom io import StringIO\n\nimport paramiko\nfrom dagster import (\n    BoolSource,\n    Field,\n    IntSource,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils import mkdir_p\nfrom dagster._utils.merger import merge_dicts\nfrom paramiko.config import SSH_PORT\nfrom sshtunnel import SSHTunnelForwarder\n\n\ndef key_from_str(key_str):\n    """Creates a paramiko SSH key from a string."""\n    check.str_param(key_str, "key_str")\n\n    # py2 StringIO doesn't support with\n    key_file = StringIO(key_str)\n    result = paramiko.RSAKey.from_private_key(key_file)\n    key_file.close()\n    return result\n\n\nclass SSHResource:\n    """Resource for ssh remote execution using Paramiko.\n\n    ref: https://github.com/paramiko/paramiko\n    """\n\n    def __init__(\n        self,\n        remote_host,\n        remote_port,\n        username=None,\n        password=None,\n        key_file=None,\n        key_string=None,\n        timeout=10,\n        keepalive_interval=30,\n        compress=True,\n        no_host_key_check=True,\n        allow_host_key_change=False,\n        logger=None,\n    ):\n        self.remote_host = check.str_param(remote_host, "remote_host")\n        self.remote_port = check.opt_int_param(remote_port, "remote_port")\n        self.username = check.opt_str_param(username, "username")\n        self.password = check.opt_str_param(password, "password")\n        self.key_file = check.opt_str_param(key_file, "key_file")\n        self.timeout = check.opt_int_param(timeout, "timeout")\n        self.keepalive_interval = check.opt_int_param(keepalive_interval, "keepalive_interval")\n        self.compress = check.opt_bool_param(compress, "compress")\n        self.no_host_key_check = check.opt_bool_param(no_host_key_check, "no_host_key_check")\n        self.log = logger\n\n        self.host_proxy = None\n\n        # Create 
RSAKey object from private key string\n        self.key_obj = key_from_str(key_string) if key_string is not None else None\n\n        # Auto detecting username values from system\n        if not self.username:\n            logger.debug(\n                "username to ssh to host: %s is not specified. Using system's default provided by"\n                " getpass.getuser()"\n                % self.remote_host\n            )\n            self.username = getpass.getuser()\n\n        user_ssh_config_filename = os.path.expanduser("~/.ssh/config")\n        if os.path.isfile(user_ssh_config_filename):\n            ssh_conf = paramiko.SSHConfig()\n            ssh_conf.parse(open(user_ssh_config_filename, encoding="utf8"))\n            host_info = ssh_conf.lookup(self.remote_host)\n            if host_info and host_info.get("proxycommand"):\n                self.host_proxy = paramiko.ProxyCommand(host_info.get("proxycommand"))\n\n            if not (self.password or self.key_file):\n                if host_info and host_info.get("identityfile"):\n                    self.key_file = host_info.get("identityfile")[0]\n\n    def get_connection(self):\n        """Opens a SSH connection to the remote host.\n\n        :rtype: paramiko.client.SSHClient\n        """\n        client = paramiko.SSHClient()\n        client.load_system_host_keys()\n        if self.no_host_key_check:\n            self.log.warning(\n                "No Host Key Verification. 
This won't protect against Man-In-The-Middle attacks"\n            )\n            # Default is RejectPolicy\n            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n\n        if self.password and self.password.strip():\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                password=self.password,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n                look_for_keys=False,\n            )\n        else:\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n            )\n\n        if self.keepalive_interval:\n            client.get_transport().set_keepalive(self.keepalive_interval)\n\n        return client\n\n    def get_tunnel(self, remote_port, remote_host="localhost", local_port=None):\n        check.int_param(remote_port, "remote_port")\n        check.str_param(remote_host, "remote_host")\n        check.opt_int_param(local_port, "local_port")\n\n        if local_port is not None:\n            local_bind_address = ("localhost", local_port)\n        else:\n            local_bind_address = ("localhost",)\n\n        # Will prefer key string if specified, otherwise use the key file\n        pkey = self.key_obj if self.key_obj else self.key_file\n\n        if self.password and self.password.strip():\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                
ssh_password=self.password,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                logger=self.log,\n            )\n        else:\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                host_pkey_directories=[],\n                logger=self.log,\n            )\n\n        return client\n\n    def sftp_get(self, remote_filepath, local_filepath):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            local_folder = os.path.dirname(local_filepath)\n\n            # Create intermediate directories if they don't exist\n            mkdir_p(local_folder)\n\n            self.log.info(f"Starting to transfer from {remote_filepath} to {local_filepath}")\n\n            sftp_client.get(remote_filepath, local_filepath)\n\n        conn.close()\n        return local_filepath\n\n    def sftp_put(self, remote_filepath, local_filepath, confirm=True):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            self.log.info(f"Starting to transfer file from {local_filepath} to {remote_filepath}")\n\n            sftp_client.put(local_filepath, remote_filepath, confirm=confirm)\n\n        conn.close()\n        return local_filepath\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "remote_host": Field(\n StringSource, description="remote host to connect to", is_required=True\n ),\n "remote_port": Field(\n IntSource,\n description="port of remote host to connect (Default is paramiko SSH_PORT)",\n is_required=False,\n default_value=SSH_PORT,\n ),\n "username": Field(\n StringSource, description="username to connect to the remote_host", is_required=False\n ),\n "password": Field(\n StringSource,\n description="password of the username to connect to the remote_host",\n is_required=False,\n ),\n "key_file": Field(\n StringSource,\n description="key file to use to connect to the remote_host.",\n is_required=False,\n ),\n "key_string": Field(\n StringSource,\n description="key string to use to connect to remote_host",\n is_required=False,\n ),\n "timeout": Field(\n IntSource,\n description="timeout for the attempt to connect to the remote_host.",\n is_required=False,\n default_value=10,\n ),\n "keepalive_interval": Field(\n IntSource,\n description="send a keepalive packet to remote host every keepalive_interval seconds",\n is_required=False,\n default_value=30,\n ),\n "compress": Field(BoolSource, is_required=False, default_value=True),\n "no_host_key_check": Field(BoolSource, is_required=False, default_value=True),\n "allow_host_key_change": Field(\n BoolSource, description="[Deprecated]", is_required=False, default_value=False\n ),\n }\n)\ndef ssh_resource(init_context):\n args = init_context.resource_config\n args = merge_dicts(init_context.resource_config, {"logger": init_context.log})\n return SSHResource(**args)
\n
", "current_page_name": "_modules/dagster_ssh/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ssh.resources"}}, "dagster_twilio": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_twilio.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom pydantic import Field\nfrom twilio.rest import Client\n\n\n
[docs]class TwilioResource(ConfigurableResource):\n """This resource is for connecting to Twilio."""\n\n account_sid: str = Field(\n description=(\n "Twilio Account SID, created with yout Twilio account. This can be found on your Twilio"\n " dashboard, see"\n " https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n auth_token: str = Field(\n description=(\n "Twilio Authentication Token, created with yout Twilio account. This can be found on"\n " your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_client(self) -> Client:\n return Client(self.account_sid, self.auth_token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=TwilioResource.to_config_schema(),\n description="This resource is for connecting to Twilio",\n)\ndef twilio_resource(context: InitResourceContext) -> Client:\n return TwilioResource.from_resource_context(context).create_client()
\n
", "current_page_name": "_modules/dagster_twilio/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_twilio.resources"}}, "dagster_wandb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.io_manager

\nimport datetime\nimport os\nimport pickle\nimport platform\nimport shutil\nimport sys\nimport time\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import List, Optional\n\nfrom dagster import (\n    Field,\n    InitResourceContext,\n    InputContext,\n    Int,\n    IOManager,\n    MetadataValue,\n    OutputContext,\n    String,\n    io_manager,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom wandb import Artifact\nfrom wandb.data_types import WBValue\n\nfrom .resources import WANDB_CLOUD_HOST\nfrom .utils.errors import (\n    WandbArtifactsIOManagerError,\n    raise_on_empty_configuration,\n    raise_on_unknown_partition_keys,\n    raise_on_unknown_read_configuration_keys,\n    raise_on_unknown_write_configuration_keys,\n)\nfrom .utils.pickling import (\n    ACCEPTED_SERIALIZATION_MODULES,\n    pickle_artifact_content,\n    unpickle_artifact_content,\n)\nfrom .version import __version__\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\n\nclass Config(TypedDict):\n    dagster_run_id: str\n    wandb_host: str\n    wandb_entity: str\n    wandb_project: str\n    wandb_run_name: Optional[str]\n    wandb_run_id: Optional[str]\n    wandb_run_tags: Optional[List[str]]\n    base_dir: str\n    cache_duration_in_minutes: Optional[int]\n\n\nclass ArtifactsIOManager(IOManager):\n    """IO Manager to handle Artifacts in Weights & Biases (W&B) .\n\n    It handles 3 different inputs:\n    - Pickable objects (the serialization module is configurable)\n    - W&B Objects (Audio, Table, Image, etc)\n    - W&B Artifacts\n    """\n\n    def __init__(self, wandb_client, config: Config):\n        self.wandb = wandb_client\n\n        dagster_run_id = config["dagster_run_id"]\n        self.dagster_run_id = dagster_run_id\n        self.wandb_host = config["wandb_host"]\n        self.wandb_entity = config["wandb_entity"]\n        self.wandb_project = 
config["wandb_project"]\n        self.wandb_run_id = config.get("wandb_run_id") or dagster_run_id\n        self.wandb_run_name = config.get("wandb_run_name") or f"dagster-run-{dagster_run_id[0:8]}"\n        # augments the run tags\n        wandb_run_tags = config["wandb_run_tags"] or []\n        if "dagster_wandb" not in wandb_run_tags:\n            wandb_run_tags = [*wandb_run_tags, "dagster_wandb"]\n        self.wandb_run_tags = wandb_run_tags\n\n        self.base_dir = config["base_dir"]\n        cache_duration_in_minutes = config["cache_duration_in_minutes"]\n        default_cache_expiration_in_minutes = 60 * 24 * 30  # 60 minutes * 24 hours * 30 days\n        self.cache_duration_in_minutes = (\n            cache_duration_in_minutes\n            if cache_duration_in_minutes is not None\n            else default_cache_expiration_in_minutes\n        )\n\n    def _get_local_storage_path(self):\n        path = self.base_dir\n        if os.path.basename(path) != "storage":\n            path = os.path.join(path, "storage")\n        path = os.path.join(path, "wandb_artifacts_manager")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_artifacts_path(self, name, version):\n        local_storage_path = self._get_local_storage_path()\n        path = os.path.join(local_storage_path, "artifacts", f"{name}.{version}")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_wandb_logs_path(self):\n        local_storage_path = self._get_local_storage_path()\n        # Adding a random uuid to avoid collisions in multi-process context\n        path = os.path.join(local_storage_path, "runs", self.dagster_run_id, str(uuid.uuid4()))\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _clean_local_storage_path(self):\n        local_storage_path = self._get_local_storage_path()\n        cache_duration_in_minutes = self.cache_duration_in_minutes\n        current_timestamp = int(time.time())\n        
expiration_timestamp = current_timestamp - (\n            cache_duration_in_minutes * 60  # convert to seconds\n        )\n\n        for root, dirs, files in os.walk(local_storage_path, topdown=False):\n            for name in files:\n                current_file_path = os.path.join(root, name)\n                most_recent_access = os.lstat(current_file_path).st_atime\n                if most_recent_access <= expiration_timestamp or cache_duration_in_minutes == 0:\n                    os.remove(current_file_path)\n            for name in dirs:\n                current_dir_path = os.path.join(root, name)\n                if not os.path.islink(current_dir_path):\n                    if len(os.listdir(current_dir_path)) == 0 or cache_duration_in_minutes == 0:\n                        shutil.rmtree(current_dir_path)\n\n    @contextmanager\n    def wandb_run(self):\n        self.wandb.init(\n            id=self.wandb_run_id,\n            name=self.wandb_run_name,\n            project=self.wandb_project,\n            entity=self.wandb_entity,\n            dir=self._get_wandb_logs_path(),\n            tags=self.wandb_run_tags,\n            anonymous="never",\n            resume="allow",\n        )\n        try:\n            yield self.wandb.run\n        finally:\n            self.wandb.finish()\n            self._clean_local_storage_path()\n\n    def _upload_artifact(self, context: OutputContext, obj):\n        if not context.has_partition_key and context.has_asset_partitions:\n            raise WandbArtifactsIOManagerError(\n                "Sorry, but the Weights & Biases (W&B) IO Manager can't handle processing several"\n                " partitions at the same time within a single run. Please process each partition"\n                " separately. 
If you think this might be an error, don't hesitate to reach out to"\n                " Weights & Biases Support."\n            )\n\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_write_configuration_keys(parameters)\n\n            serialization_module = parameters.get("serialization_module", {})\n            serialization_module_name = serialization_module.get("name", "pickle")\n\n            if serialization_module_name not in ACCEPTED_SERIALIZATION_MODULES:\n                raise WandbArtifactsIOManagerError(\n                    f"Oops! It looks like the value you provided, '{serialization_module_name}',"\n                    " isn't recognized as a valid serialization module. Here are the ones we do"\n                    f" support: {ACCEPTED_SERIALIZATION_MODULES}."\n                )\n\n            serialization_module_parameters = serialization_module.get("parameters", {})\n            serialization_module_parameters_with_protocol = {\n                "protocol": (\n                    pickle.HIGHEST_PROTOCOL\n                ),  # we use the highest available protocol if we don't pass one\n                **serialization_module_parameters,\n            }\n\n            artifact_type = parameters.get("type", "artifact")\n            artifact_description = parameters.get("description")\n            artifact_metadata = {\n                "source_integration": "dagster_wandb",\n                "source_integration_version": __version__,\n                "source_dagster_run_id": self.dagster_run_id,\n                "source_created_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),\n                "source_python_version": platform.python_version(),\n            }\n            if isinstance(obj, Artifact):\n                if parameters.get("name") is not None:\n       
             raise WandbArtifactsIOManagerError(\n                        "You've provided a 'name' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'name' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if parameters.get("type") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've provided a 'type' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'type' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if obj.name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The Weights & Biases (W&B) Artifact you provided is missing a name."\n                        " Please, assign a name to your Artifact."\n                    )\n\n                if context.has_asset_key and obj.name != context.get_asset_identifier()[0]:\n                    asset_identifier = context.get_asset_identifier()[0]\n                    context.log.warning(\n                        f"Please note, the name '{obj.name}' of your Artifact is overwritten by the"\n                        f" name derived from the AssetKey '{asset_identifier}'. 
For consistency and"\n                        " to avoid confusion, we advise sharing a constant for both your asset's"\n                        " name and the artifact's name."\n                    )\n                    obj._name = asset_identifier  # noqa: SLF001\n\n                if context.has_partition_key:\n                    artifact_name = f"{obj.name}.{context.partition_key}"\n                    # The Artifact provided is produced in a partitioned execution we add the\n                    # partition as a suffix to the Artifact name\n                    obj._name = artifact_name  # noqa: SLF001\n\n                if len(serialization_module) != 0:  # not an empty dict\n                    context.log.warning(\n                        "You've included a 'serialization_module' in the"\n                        " 'wandb_artifact_configuration' settings. However, this doesn't have any"\n                        " impact when the output is already an Artifact object."\n                    )\n\n                # The obj is already an Artifact we augment its metadata\n                artifact = obj\n\n                artifact.metadata = {**artifact.metadata, **artifact_metadata}\n\n                if artifact.description is not None and artifact_description is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a 'description' in the 'wandb_artifact_configuration'"\n                        " settings for an existing Artifact that already has a description. 
Please,"\n                        " either set the description using 'wandb_artifact_argument' or when"\n                        " creating your Artifact."\n                    )\n                if artifact_description is not None:\n                    artifact.description = artifact_description\n            else:\n                if context.has_asset_key:\n                    if parameters.get("name") is not None:\n                        raise WandbArtifactsIOManagerError(\n                            "You've included a 'name' property in the"\n                            " 'wandb_artifact_configuration' settings. But, a 'name' is only needed"\n                            " when there's no 'AssetKey'. When an Artifact is created from an"\n                            " @asset, it uses the asset name. When it's created from an @op with an"\n                            " 'asset_key' for the output, that value is used. Please remove the"\n                            " 'name' property."\n                        )\n                    artifact_name = context.get_asset_identifier()[0]  # name of asset\n                else:\n                    name_parameter = parameters.get("name")\n                    if name_parameter is None:\n                        raise WandbArtifactsIOManagerError(\n                            "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                            " settings. For Artifacts created from an @op, a 'name' property is"\n                            " needed. 
You could also use an @asset as an alternative."\n                        )\n                    assert name_parameter is not None\n                    artifact_name = name_parameter\n\n                if context.has_partition_key:\n                    artifact_name = f"{artifact_name}.{context.partition_key}"\n\n                # We replace the | character with - because it is not allowed in artifact names\n                # The | character is used in multi-dimensional partition keys\n                artifact_name = str(artifact_name).replace("|", "-")\n\n                # Creates an artifact to hold the obj\n                artifact = self.wandb.Artifact(\n                    name=artifact_name,\n                    type=artifact_type,\n                    description=artifact_description,\n                    metadata=artifact_metadata,\n                )\n                if isinstance(obj, WBValue):\n                    if len(serialization_module) != 0:  # not an empty dict\n                        context.log.warning(\n                            "You've included a 'serialization_module' in the"\n                            " 'wandb_artifact_configuration' settings. 
However, this doesn't have"\n                            " any impact when the output is already an W&B object like e.g Table or"\n                            " Image."\n                        )\n                    # Adds the WBValue object using the class name as the name for the file\n                    artifact.add(obj, obj.__class__.__name__)\n                elif obj is not None:\n                    # The output is not a native wandb Object, we serialize it\n                    pickle_artifact_content(\n                        context,\n                        serialization_module_name,\n                        serialization_module_parameters_with_protocol,\n                        artifact,\n                        obj,\n                    )\n\n            # Add any files: https://docs.wandb.ai/ref/python/artifact#add_file\n            add_files = parameters.get("add_files")\n            if add_files is not None and len(add_files) > 0:\n                for add_file in add_files:\n                    artifact.add_file(**add_file)\n\n            # Add any dirs: https://docs.wandb.ai/ref/python/artifact#add_dir\n            add_dirs = parameters.get("add_dirs")\n            if add_dirs is not None and len(add_dirs) > 0:\n                for add_dir in add_dirs:\n                    artifact.add_dir(**add_dir)\n\n            # Add any reference: https://docs.wandb.ai/ref/python/artifact#add_reference\n            add_references = parameters.get("add_references")\n            if add_references is not None and len(add_references) > 0:\n                for add_reference in add_references:\n                    artifact.add_reference(**add_reference)\n\n            # Augments the aliases\n            aliases = parameters.get("aliases", [])\n            aliases.append(f"dagster-run-{self.dagster_run_id[0:8]}")\n            if "latest" not in aliases:\n                aliases.append("latest")\n\n            # Logs the artifact\n            
self.wandb.log_artifact(artifact, aliases=aliases)\n            artifact.wait()\n\n            # Adds useful metadata to the output or Asset\n            artifacts_base_url = (\n                "https://wandb.ai"\n                if self.wandb_host == WANDB_CLOUD_HOST\n                else self.wandb_host.rstrip("/")\n            )\n            assert artifact.id is not None\n            output_metadata = {\n                "dagster_run_id": MetadataValue.dagster_run(self.dagster_run_id),\n                "wandb_artifact_id": MetadataValue.text(artifact.id),\n                "wandb_artifact_type": MetadataValue.text(artifact.type),\n                "wandb_artifact_version": MetadataValue.text(artifact.version),\n                "wandb_artifact_size": MetadataValue.int(artifact.size),\n                "wandb_artifact_url": MetadataValue.url(\n                    f"{artifacts_base_url}/{run.entity}/{run.project}/artifacts/{artifact.type}/{'/'.join(artifact.name.rsplit(':', 1))}"\n                ),\n                "wandb_entity": MetadataValue.text(run.entity),\n                "wandb_project": MetadataValue.text(run.project),\n                "wandb_run_id": MetadataValue.text(run.id),\n                "wandb_run_name": MetadataValue.text(run.name),\n                "wandb_run_path": MetadataValue.text(run.path),\n                "wandb_run_url": MetadataValue.url(run.url),\n            }\n            context.add_output_metadata(output_metadata)\n\n    def _download_artifact(self, context: InputContext):\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_read_configuration_keys(parameters)\n\n            partitions_configuration = parameters.get("partitions", {})\n\n            if not context.has_asset_partitions and len(partitions_configuration) > 0:\n                raise 
WandbArtifactsIOManagerError(\n                    "You've included a 'partitions' value in the 'wandb_artifact_configuration'"\n                    " settings but it's not within a partitioned execution. Please only use"\n                    " 'partitions' within a partitioned context."\n                )\n\n            if context.has_asset_partitions:\n                # Note: this is currently impossible to unit test with current Dagster APIs but was\n                # tested thoroughly manually\n                name = parameters.get("get")\n                path = parameters.get("get_path")\n                if name is not None or path is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a value for 'get' and/or 'get_path' in the"\n                        " 'wandb_artifact_configuration' settings during a partitioned execution."\n                        " Please use the 'partitions' property to set 'get' or 'get_path' for each"\n                        " individual partition. 
To set a default value for all partitions, use '*'."\n                    )\n\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    artifact_name = context.asset_key[0][0]  # name of asset\n\n                partitions = [\n                    (key, f"{artifact_name}.{ str(key).replace('|', '-')}")\n                    for key in context.asset_partition_keys\n                ]\n\n                output = {}\n\n                for key, artifact_name in partitions:\n                    context.log.info(f"Handling partition with key '{key}'")\n                    partition_configuration = partitions_configuration.get(\n                        key, partitions_configuration.get("*")\n                    )\n\n                    raise_on_empty_configuration(key, partition_configuration)\n                    raise_on_unknown_partition_keys(key, partition_configuration)\n\n                    partition_version = None\n                    partition_alias = None\n                    if partition_configuration and partition_configuration is not None:\n                        partition_version = partition_configuration.get("version")\n                        partition_alias = partition_configuration.get("alias")\n                        if partition_version is not None and partition_alias is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'version' and 'alias' for the partition with"\n                                " key '{key}'. You should only use one of these properties at a"\n                                " time. If you choose not to use any, the latest version will be"\n                                " used by default. 
If this partition is configured with the '*'"\n                                " key, please correct the wildcard configuration."\n                            )\n                    partition_identifier = partition_version or partition_alias or "latest"\n\n                    artifact_uri = (\n                        f"{run.entity}/{run.project}/{artifact_name}:{partition_identifier}"\n                    )\n                    try:\n                        api = self.wandb.Api()\n                        api.artifact(artifact_uri)\n                    except Exception as exception:\n                        raise WandbArtifactsIOManagerError(\n                            "The artifact you're attempting to download might not exist, or you"\n                            " might have forgotten to include the 'name' property in the"\n                            " 'wandb_artifact_configuration' settings."\n                        ) from exception\n\n                    artifact = run.use_artifact(artifact_uri)\n\n                    artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n                    if partition_configuration and partition_configuration is not None:\n                        partition_name = partition_configuration.get("get")\n                        partition_path = partition_configuration.get("get_path")\n                        if partition_name is not None and partition_path is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'get' and 'get_path' in the"\n                                " 'wandb_artifact_configuration' settings for the partition with"\n                                " key '{key}'. Only one of these properties should be used. If you"\n                                " choose not to use any, the whole Artifact will be returned. 
If"\n                                " this partition is configured with the '*' key, please correct the"\n                                " wildcard configuration."\n                            )\n\n                        if partition_name is not None:\n                            wandb_object = artifact.get(partition_name)\n                            if wandb_object is not None:\n                                output[key] = wandb_object\n                                continue\n\n                        if partition_path is not None:\n                            path = artifact.get_path(partition_path)\n                            download_path = path.download(root=artifacts_path)\n                            if download_path is not None:\n                                output[key] = download_path\n                                continue\n\n                    artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n                    unpickled_content = unpickle_artifact_content(artifact_dir)\n                    if unpickled_content is not None:\n                        output[key] = unpickled_content\n                        continue\n\n                    artifact.verify(root=artifacts_path)\n                    output[key] = artifact\n\n                if len(output) == 1:\n                    # If there's only one partition, return the value directly\n                    return next(iter(output.values()))\n\n                return output\n\n            elif context.has_asset_key:\n                # Input is an asset\n                if parameters.get("name") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "A conflict has been detected in the provided configuration settings. 
The"\n                        " 'name' parameter appears to be specified twice - once in the"\n                        " 'wandb_artifact_configuration' metadata dictionary, and again as an"\n                        " AssetKey. Kindly avoid setting the name directly, since the AssetKey will"\n                        " be used for this purpose."\n                    )\n                artifact_name = context.get_asset_identifier()[0]  # name of asset\n            else:\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                        " settings. For Artifacts used in an @op, a 'name' property is required."\n                        " You could use an @asset as an alternative."\n                    )\n\n            if context.has_partition_key:\n                artifact_name = f"{artifact_name}.{context.partition_key}"\n\n            artifact_alias = parameters.get("alias")\n            artifact_version = parameters.get("version")\n\n            if artifact_alias is not None and artifact_version is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'version' and 'alias' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the latest version will be applied"\n                    " automatically."\n                )\n\n            artifact_identifier = artifact_alias or artifact_version or "latest"\n            artifact_uri = f"{run.entity}/{run.project}/{artifact_name}:{artifact_identifier}"\n\n            # This try/except block is a workaround for a bug in the W&B SDK, this should be removed\n            # once the bug is fixed.\n            try:\n                artifact = run.use_artifact(artifact_uri)\n            except Exception:\n                api = self.wandb.Api()\n                artifact = api.artifact(artifact_uri)\n\n            name = parameters.get("get")\n            path = parameters.get("get_path")\n            if name is not None and path is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'get' and 'get_path' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the entire Artifact will be returned."\n                )\n\n            if name is not None:\n                return artifact.get(name)\n\n            artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n            if path is not None:\n                path = artifact.get_path(path)\n                return path.download(root=artifacts_path)\n\n            artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n\n            unpickled_content = unpickle_artifact_content(artifact_dir)\n            if unpickled_content is not None:\n                return unpickled_content\n\n            artifact.verify(root=artifacts_path)\n            return artifact\n\n    def handle_output(self, context: OutputContext, obj) -> None:\n        if obj is None:\n            context.log.warning(\n                "The output value given to the Weights & Biases (W&B) IO Manager is empty. If this"\n                " was intended, you can disregard this warning."\n            )\n        else:\n            try:\n                self._upload_artifact(context, obj)\n            except WandbArtifactsIOManagerError as exception:\n                raise exception\n            except Exception as exception:\n                raise WandbArtifactsIOManagerError() from exception\n\n    def load_input(self, context: InputContext):\n        try:\n            return self._download_artifact(context)\n        except WandbArtifactsIOManagerError as exception:\n            raise exception\n        except Exception as exception:\n            raise WandbArtifactsIOManagerError() from exception\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n required_resource_keys={"wandb_resource", "wandb_config"},\n description="IO manager to read and write W&B Artifacts",\n config_schema={\n "run_name": Field(\n String,\n is_required=False,\n description=(\n "Short display name for this run, which is how you'll identify this run in the UI."\n " By default, it`s set to a string with the following format dagster-run-[8 first"\n " characters of the Dagster Run ID] e.g. dagster-run-7e4df022."\n ),\n ),\n "run_id": Field(\n String,\n is_required=False,\n description=(\n "Unique ID for this run, used for resuming. It must be unique in the project, and"\n " if you delete a run you can't reuse the ID. Use the name field for a short"\n " descriptive name, or config for saving hyperparameters to compare across runs."\n r" The ID cannot contain the following special characters: /\\#?%:.. You need to set"\n " the Run ID when you are doing experiment tracking inside Dagster to allow the IO"\n " Manager to resume the run. By default it`s set to the Dagster Run ID e.g "\n " 7e4df022-1bf2-44b5-a383-bb852df4077e."\n ),\n ),\n "run_tags": Field(\n [String],\n is_required=False,\n description=(\n "A list of strings, which will populate the list of tags on this run in the UI."\n " Tags are useful for organizing runs together, or applying temporary labels like"\n " 'baseline' or 'production'. It's easy to add and remove tags in the UI, or filter"\n " down to just runs with a specific tag. Any W&B Run used by the integration will"\n " have the dagster_wandb tag."\n ),\n ),\n "base_dir": Field(\n String,\n is_required=False,\n description=(\n "Base directory used for local storage and caching. W&B Artifacts and W&B Run logs"\n " will be written and read from that directory. 
By default, it`s using the"\n " DAGSTER_HOME directory."\n ),\n ),\n "cache_duration_in_minutes": Field(\n Int,\n is_required=False,\n description=(\n "Defines the amount of time W&B Artifacts and W&B Run logs should be kept in the"\n " local storage. Only files and directories that were not opened for that amount of"\n " time are removed from the cache. Cache purging happens at the end of an IO"\n " Manager execution. You can set it to 0, if you want to disable caching"\n " completely. Caching improves speed when an Artifact is reused between jobs"\n " running on the same machine. It defaults to 30 days."\n ),\n ),\n },\n)\ndef wandb_artifacts_io_manager(context: InitResourceContext):\n """Dagster IO Manager to create and consume W&B Artifacts.\n\n It allows any Dagster @op or @asset to create and consume W&B Artifacts natively.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n **Example:**\n\n .. code-block:: python\n\n @repository\n def my_repository():\n return [\n *with_resources(\n load_assets_from_current_module(),\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n "wandb_artifacts_manager": wandb_artifacts_io_manager.configured(\n {"cache_duration_in_minutes": 60} # only cache files for one hour\n ),\n },\n resource_config_by_key={\n "wandb_config": {\n "config": {\n "entity": "my_entity",\n "project": "my_project"\n }\n }\n },\n ),\n ]\n\n\n @asset(\n name="my_artifact",\n metadata={\n "wandb_artifact_configuration": {\n "type": "dataset",\n }\n },\n io_manager_key="wandb_artifacts_manager",\n )\n def create_dataset():\n return [1, 2, 3]\n\n """\n wandb_client = context.resources.wandb_resource["sdk"]\n wandb_host = context.resources.wandb_resource["host"]\n wandb_entity = context.resources.wandb_config["entity"]\n wandb_project = 
context.resources.wandb_config["project"]\n\n wandb_run_name = None\n wandb_run_id = None\n wandb_run_tags = None\n base_dir = (\n context.instance.storage_directory() if context.instance else os.environ["DAGSTER_HOME"]\n )\n cache_duration_in_minutes = None\n if context.resource_config is not None:\n wandb_run_name = context.resource_config.get("run_name")\n wandb_run_id = context.resource_config.get("run_id")\n wandb_run_tags = context.resource_config.get("run_tags")\n base_dir = context.resource_config.get("base_dir", base_dir)\n cache_duration_in_minutes = context.resource_config.get("cache_duration_in_minutes")\n\n if "PYTEST_CURRENT_TEST" in os.environ:\n dagster_run_id = "unit-testing"\n else:\n dagster_run_id = context.run_id\n\n assert dagster_run_id is not None\n\n config: Config = {\n "dagster_run_id": dagster_run_id,\n "wandb_host": wandb_host,\n "wandb_entity": wandb_entity,\n "wandb_project": wandb_project,\n "wandb_run_name": wandb_run_name,\n "wandb_run_id": wandb_run_id,\n "wandb_run_tags": wandb_run_tags,\n "base_dir": base_dir,\n "cache_duration_in_minutes": cache_duration_in_minutes,\n }\n return ArtifactsIOManager(wandb_client, config)
\n
", "current_page_name": "_modules/dagster_wandb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.io_manager"}, "launch": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.launch.ops

\nfrom dagster import OpExecutionContext, op\nfrom wandb.sdk.launch import launch\nfrom wandb.sdk.launch.launch_add import launch_add\n\nfrom .configs import launch_agent_config, launch_config\n\n\ndef raise_on_invalid_config(context: OpExecutionContext):\n    entity = context.resources.wandb_config["entity"]\n    if entity == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'entity' property of the"\n            " 'wandb_config'."\n        )\n\n    project = context.resources.wandb_config["project"]\n    if project == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'project' property of the"\n            " 'wandb_config'."\n        )\n\n\n
[docs]@op(\n required_resource_keys={"wandb_resource", "wandb_config"},\n config_schema=launch_agent_config(),\n)\ndef run_launch_agent(context: OpExecutionContext):\n """It starts a Launch Agent and runs it as a long running process until stopped manually.\n\n Agents are processes that poll launch queues and execute the jobs (or dispatch them to external\n services to be executed) in order.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n run_launch_agent:\n config:\n max_jobs: -1\n queues:\n - my_dagster_queue\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_agent\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_agent_example():\n run_launch_agent()\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch agent configuration: {config}")\n context.log.info("Running Launch agent...")\n launch.create_and_run_agent(api=context.resources.wandb_resource["api"], config=config)
\n\n\n
[docs]@op(\n required_resource_keys={\n "wandb_resource",\n "wandb_config",\n },\n config_schema=launch_config(),\n)\ndef run_launch_job(context: OpExecutionContext):\n """Executes a Launch job.\n\n A Launch job is assigned to a queue in order to be executed. You can create a queue or use the\n default one. Make sure you have an active agent listening to that queue. You can run an agent\n inside your Dagster instance but can also consider using a deployable agent in Kubernetes.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n my_launched_job:\n config:\n entry_point:\n - python\n - train.py\n queue: my_dagster_queue\n uri: https://github.com/wandb/example-dagster-integration-with-launch\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_job\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_job_example():\n run_launch_job.alias("my_launched_job")() # we rename the job with an alias\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch job configuration: {config}")\n\n queue = context.op_config.get("queue")\n if queue is None:\n context.log.info("No queue provided, running Launch job locally")\n launch.run(api=context.resources.wandb_resource["api"], config=config)\n else:\n synchronous = config.get("synchronous", True)\n config.pop("synchronous", None)\n queued_run = launch_add(**config)\n if synchronous is True:\n context.log.info(\n f"Synchronous Launch job added to queue with name={queue}. 
Waiting for"\n " completion..."\n )\n queued_run.wait_until_finished()\n else:\n context.log.info(f"Asynchronous Launch job added to queue with name={queue}")
\n
", "current_page_name": "_modules/dagster_wandb/launch/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.launch.ops"}}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.resources

\nfrom typing import Any, Dict\n\nimport wandb\nfrom dagster import Field, InitResourceContext, String, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom wandb.sdk.internal.internal_api import Api\n\nWANDB_CLOUD_HOST: str = "https://api.wandb.ai"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n description="W&B API key necessary to communicate with the W&B API.",\n is_required=True,\n ),\n "host": Field(\n String,\n description=(\n "API host server you wish to use. Only required if you are using W&B Server."\n ),\n is_required=False,\n default_value=WANDB_CLOUD_HOST,\n ),\n },\n description="Resource for interacting with Weights & Biases",\n)\ndef wandb_resource(context: InitResourceContext) -> Dict[str, Any]:\n """Dagster resource used to communicate with the W&B API. It's useful when you want to use the\n wandb client within your ops and assets. It's a required resources if you are using the W&B IO\n Manager.\n\n It automatically authenticates using the provided API key.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_wandb import wandb_resource\n\n my_wandb_resource = wandb_resource.configured({"api_key": {"env": "WANDB_API_KEY"}})\n\n @job(resource_defs={"wandb_resource": my_wandb_resource})\n def my_wandb_job():\n ...\n\n """\n api_key = context.resource_config["api_key"]\n host = context.resource_config["host"]\n wandb.login(\n key=api_key,\n host=host,\n anonymous="never",\n )\n client_settings = wandb.Settings(\n api_key=api_key,\n base_url=host,\n anonymous="never",\n launch=True,\n )\n api = Api(default_settings=client_settings, load_settings=False)\n return {"sdk": wandb, "api": api, "host": host}
\n
", "current_page_name": "_modules/dagster_wandb/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.types

\nimport sys\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\nfrom typing import Any, Dict, List\n\n\n
[docs]class SerializationModule(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration of the serialization module. Useful for type checking."""\n\n name: str\n parameters: Dict[str, Any]
\n\n\n
[docs]class WandbArtifactConfiguration(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration. Useful for type checking."""\n\n name: str\n type: str\n description: str\n aliases: List[str]\n add_dirs: List[Dict[str, Any]]\n add_files: List[Dict[str, Any]]\n add_references: List[Dict[str, Any]]\n serialization_module: SerializationModule\n partitions: Dict[str, Dict[str, Any]]
\n
", "current_page_name": "_modules/dagster_wandb/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.types"}, "utils": {"errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.utils.errors

\n
[docs]class WandbArtifactsIOManagerError(Exception):\n """Represents an execution error of the W&B Artifacts IO Manager."""\n\n def __init__(self, message="A W&B Artifacts IO Manager error occurred."):\n self.message = message\n super().__init__(self.message)
\n\n\nSUPPORTED_READ_CONFIG_KEYS = [\n "alias",\n "get_path",\n "get",\n "name",\n "partitions",\n "version",\n]\nSUPPORTED_WRITE_CONFIG_KEYS = [\n "add_dirs",\n "add_files",\n "add_references",\n "aliases",\n "description",\n "name",\n "partitions",\n "serialization_module",\n "type",\n]\nSUPPORTED_PARTITION_CONFIG_KEYS = ["get", "get_path", "version", "alias"]\n\n\ndef raise_on_empty_configuration(partition_key, dictionary):\n if dictionary is not None and len(dictionary) == 0:\n raise WandbArtifactsIOManagerError(\n f"The configuration is empty for the partition identified by the key '{partition_key}'."\n " This happened within the 'wandb_artifact_configuration' metadata dictionary."\n )\n\n\ndef raise_on_unknown_keys(supported_config_keys, dictionary, is_read_config):\n if dictionary is None:\n return\n\n unsupported_keys = [key for key in dictionary.keys() if key not in supported_config_keys]\n if len(unsupported_keys) > 0:\n if is_read_config:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " reading an Artifact."\n )\n else:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " writing an Artifact."\n )\n\n\ndef raise_on_unknown_write_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_WRITE_CONFIG_KEYS, dictionary, False)\n\n\ndef raise_on_unknown_read_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_READ_CONFIG_KEYS, dictionary, True)\n\n\ndef raise_on_unknown_partition_keys(partition_key, dictionary):\n if dictionary is None:\n return\n\n unsupported_keys = [\n key for key in dictionary.keys() if key not in SUPPORTED_PARTITION_CONFIG_KEYS\n ]\n if len(unsupported_keys) > 0:\n raise WandbArtifactsIOManagerError(\n f"The 
configuration keys '{unsupported_keys}' you are trying to use are not supported"\n f" for the partition identified by the key '{partition_key}'. This happened within the"\n " 'wandb_artifact_configuration' metadata dictionary."\n )\n
", "current_page_name": "_modules/dagster_wandb/utils/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.utils.errors"}}}, "dagstermill": {"asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.asset_factory

\nimport pickle\nimport tempfile\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Set, Type, Union, cast\n\nimport dagster._check as check\nfrom dagster import (\n    AssetIn,\n    AssetKey,\n    AssetsDefinition,\n    Failure,\n    Output,\n    PartitionsDefinition,\n    ResourceDefinition,\n    RetryPolicy,\n    RetryRequested,\n    SourceAsset,\n    asset,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.inheritance_utils import safe_is_subclass\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\n\nfrom dagstermill.factory import _clean_path_for_windows, execute_notebook\n\n\ndef _make_dagstermill_asset_compute_fn(\n    name: str,\n    notebook_path: str,\n    save_notebook_on_failure: bool,\n) -> Callable:\n    def _t_fn(context: OpExecutionContext, **inputs) -> Iterable:\n        check.param_invariant(\n            isinstance(context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                context.get_step_execution_context(),\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            with open(executed_notebook_path, "rb") as fd:\n                yield Output(fd.read())\n\n            # deferred import for perf\n            import scrapbook\n\n            output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n            for key, value in output_nb.scraps.items():\n                if 
key.startswith("event-"):\n                    with open(value.data, "rb") as fd:\n                        event = pickle.loads(fd.read())\n                        if isinstance(event, (Failure, RetryRequested)):\n                            raise event\n                        else:\n                            yield event\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_asset(\n name: str,\n notebook_path: str,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n save_notebook_on_failure: bool = False,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> AssetsDefinition:\n """Creates a Dagster asset for a Jupyter notebook.\n\n Arguments:\n name (str): The name for the asset\n notebook_path (str): Path to the backing notebook\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]): The assets\n that are upstream dependencies, but do not pass an input value to the notebook.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the notebook.\n description (Optional[str]): Description of the asset to display in the Dagster UI.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]):\n (Experimental) A mapping of resource keys to resource definitions. These resources\n will be initialized during execution, and can be accessed from the\n context within the notebook.\n io_manager_key (Optional[str]): A string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are\n upstream dependencies, but do not pass an input to the asset.\n\n Examples:\n .. 
code-block:: python\n\n from dagstermill import define_dagstermill_asset\n from dagster import asset, AssetIn, AssetKey\n from sklearn import datasets\n import pandas as pd\n import numpy as np\n\n @asset\n def iris_dataset():\n sk_iris = datasets.load_iris()\n return pd.DataFrame(\n data=np.c_[sk_iris["data"], sk_iris["target"]],\n columns=sk_iris["feature_names"] + ["target"],\n )\n\n iris_kmeans_notebook = define_dagstermill_asset(\n name="iris_kmeans_notebook",\n notebook_path="/path/to/iris_kmeans.ipynb",\n ins={\n "iris": AssetIn(key=AssetKey("iris_dataset"))\n }\n )\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=AssetIn)\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n key_prefix = check.opt_list_param(key_prefix, "key_prefix", of_type=str)\n\n default_description = f"This asset is backed by the notebook at {notebook_path}"\n description = check.opt_str_param(description, "description", default=default_description)\n\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n\n user_tags = validate_tags(op_tags)\n if op_tags is not None:\n check.invariant(\n "notebook_path" not in op_tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in op_tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], 
config_schema))\n\n return asset(\n name=name,\n key_prefix=key_prefix,\n ins=ins,\n deps=deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n partitions_def=partitions_def,\n op_tags={**user_tags, **default_tags},\n group_name=group_name,\n output_required=False,\n io_manager_key=io_mgr_key,\n retry_policy=retry_policy,\n non_argument_deps=non_argument_deps,\n )(\n _make_dagstermill_asset_compute_fn(\n name=name,\n notebook_path=notebook_path,\n save_notebook_on_failure=save_notebook_on_failure,\n )\n )
\n
", "current_page_name": "_modules/dagstermill/asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.asset_factory"}, "context": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.context

\nfrom typing import AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    DagsterRun,\n    JobDefinition,\n    OpDefinition,\n    _check as check,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.execution.context.compute import AbstractComputeExecutionContext\nfrom dagster._core.execution.context.system import PlanExecutionContext, StepExecutionContext\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]class DagstermillExecutionContext(AbstractComputeExecutionContext):\n """Dagstermill-specific execution context.\n\n Do not initialize directly: use :func:`dagstermill.get_context`.\n """\n\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._job_context = check.inst_param(job_context, "job_context", PlanExecutionContext)\n self._job_def = check.inst_param(job_def, "job_def", JobDefinition)\n self._resource_keys_to_init = check.set_param(\n resource_keys_to_init, "resource_keys_to_init", of_type=str\n )\n self.op_name = check.str_param(op_name, "op_name")\n self.node_handle = check.inst_param(node_handle, "node_handle", NodeHandle)\n self._op_config = op_config\n\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is defined on the context.\n\n Args:\n key (str): The key to check.\n\n Returns:\n bool\n """\n check.str_param(key, "key")\n return self._job_context.has_tag(key)\n\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag defined on the context.\n\n Args:\n key (str): The key to get.\n\n Returns:\n str\n """\n check.str_param(key, "key")\n return self._job_context.get_tag(key)\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run_id for the context."""\n return self._job_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, Any]:\n """dict: The run_config for the context."""\n return self._job_context.run_config\n\n @property\n def resolved_run_config(self) -> ResolvedRunConfig:\n """:class:`dagster.ResolvedRunConfig`: The resolved_run_config for the context."""\n return self._job_context.resolved_run_config\n\n @public\n @property\n def logging_tags(self) -> Mapping[str, str]:\n """dict: The logging tags for the context."""\n return self._job_context.logging_tags\n\n @public\n @property\n def job_name(self) -> str:\n """str: 
The name of the executing job."""\n return self._job_context.job_name\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """:class:`dagster.JobDefinition`: The job definition for the context.\n\n This will be a dagstermill-specific shim.\n """\n return self._job_def\n\n @property\n def resources(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n resources.\n """\n return self._job_context.scoped_resources_builder.build(\n required_resource_keys=self._resource_keys_to_init,\n )\n\n @public\n @property\n def run(self) -> DagsterRun:\n """:class:`dagster.DagsterRun`: The job run for the context."""\n return cast(DagsterRun, self._job_context.dagster_run)\n\n @property\n def log(self) -> DagsterLogManager:\n """:class:`dagster.DagsterLogManager`: The log manager for the context.\n\n Call, e.g., ``log.info()`` to log messages through the Dagster machinery.\n """\n return self._job_context.log\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """:class:`dagster.OpDefinition`: The op definition for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return cast(OpDefinition, self._job_def.node_def_named(self.op_name))\n\n @property\n def node(self) -> Node:\n """:class:`dagster.Node`: The node for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return self.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_config(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n op-specific config.\n """\n if self._op_config:\n return self._op_config\n\n op_config = self.resolved_run_config.ops.get(self.op_name)\n return op_config.config if op_config else None
\n\n\nclass DagstermillRuntimeExecutionContext(DagstermillExecutionContext):\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n step_context: StepExecutionContext,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._step_context = check.inst_param(step_context, "step_context", StepExecutionContext)\n super().__init__(\n job_context,\n job_def,\n resource_keys_to_init,\n op_name,\n node_handle,\n op_config,\n )\n\n @property\n def step_context(self) -> StepExecutionContext:\n return self._step_context\n
", "current_page_name": "_modules/dagstermill/context", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.context"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.errors

\nfrom dagster._core.errors import DagsterError\n\n\n
[docs]class DagstermillError(DagsterError):\n """Base class for errors raised by dagstermill."""
\n
", "current_page_name": "_modules/dagstermill/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.errors"}, "factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.factory

\nimport copy\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport uuid\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Set, Type, Union, cast\n\nimport nbformat\nimport papermill\nfrom dagster import (\n    In,\n    OpDefinition,\n    Out,\n    Output,\n    _check as check,\n    _seven,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.inheritance_utils import safe_is_subclass\nfrom dagster._core.definitions.events import AssetMaterialization, Failure, RetryRequested\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._serdes import pack_value\nfrom dagster._seven import get_system_temp_directory\nfrom dagster._utils import mkdir_p, safe_tempfile_path\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom papermill.engines import papermill_engines\nfrom papermill.iorw import load_notebook_node, write_ipynb\n\nfrom .compat import ExecutionError\nfrom .engine import DagstermillEngine\nfrom .errors import DagstermillError\nfrom .translator import DagsterTranslator\n\n\ndef _clean_path_for_windows(notebook_path: str) -> str:\n    """In windows, the notebook can't render in the Dagster UI unless the C: prefix is removed.\n    os.path.splitdrive will split the path into (drive, tail), so just return the tail.\n    """\n    return os.path.splitdrive(notebook_path)[1]\n\n\n# https://github.com/nteract/papermill/blob/17d4bbb3960c30c263bca835e48baf34322a3530/papermill/parameterize.py\ndef 
_find_first_tagged_cell_index(nb, tag):\n    parameters_indices = []\n    for idx, cell in enumerate(nb.cells):\n        if tag in cell.metadata.tags:\n            parameters_indices.append(idx)\n    if not parameters_indices:\n        return -1\n    return parameters_indices[0]\n\n\n# This is based on papermill.parameterize.parameterize_notebook\n# Typically, papermill injects the injected-parameters cell *below* the parameters cell\n# but we want to *replace* the parameters cell, which is what this function does.\ndef replace_parameters(context, nb, parameters):\n    """Assigned parameters into the appropriate place in the input notebook.\n\n    Args:\n        nb (NotebookNode): Executable notebook object\n        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.\n    """\n    check.dict_param(parameters, "parameters")\n\n    # Copy the nb object to avoid polluting the input\n    nb = copy.deepcopy(nb)\n\n    # papermill method chooses translator based on kernel_name and language, but we just call the\n    # DagsterTranslator to generate parameter content based on the kernel_name\n    param_content = DagsterTranslator.codify(parameters)\n\n    newcell = nbformat.v4.new_code_cell(source=param_content)\n    newcell.metadata["tags"] = ["injected-parameters"]\n\n    param_cell_index = _find_first_tagged_cell_index(nb, "parameters")\n    injected_cell_index = _find_first_tagged_cell_index(nb, "injected-parameters")\n    if injected_cell_index >= 0:\n        # Replace the injected cell with a new version\n        before = nb.cells[:injected_cell_index]\n        after = nb.cells[injected_cell_index + 1 :]\n        check.int_value_param(param_cell_index, -1, "param_cell_index")\n        # We should have blown away the parameters cell if there is an injected-parameters cell\n    elif param_cell_index >= 0:\n        # Replace the parameter cell with the injected-parameters cell\n        before = nb.cells[:param_cell_index]\n        after = 
nb.cells[param_cell_index + 1 :]\n    else:\n        # Inject to the top of the notebook, presumably first cell includes dagstermill import\n        context.log.debug(\n            "Executing notebook with no tagged parameters cell: injecting boilerplate in first "\n            "cell."\n        )\n        before = []\n        after = nb.cells\n\n    nb.cells = before + [newcell] + after\n    nb.metadata.papermill["parameters"] = _seven.json.dumps(parameters)\n\n    return nb\n\n\ndef get_papermill_parameters(\n    step_context: StepExecutionContext,\n    inputs: Mapping[str, object],\n    output_log_path: str,\n    compute_descriptor: str,\n) -> Mapping[str, object]:\n    check.param_invariant(\n        isinstance(step_context.run_config, dict),\n        "step_context",\n        "StepExecutionContext must have valid run_config",\n    )\n\n    run_id = step_context.run_id\n    temp_dir = get_system_temp_directory()\n    marshal_dir = os.path.normpath(os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))\n    mkdir_p(marshal_dir)\n\n    if not isinstance(step_context.job, ReconstructableJob):\n        if compute_descriptor == "asset":\n            raise DagstermillError(\n                "Can't execute a dagstermill asset that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n        else:\n            raise DagstermillError(\n                "Can't execute a dagstermill op from a job that is not reconstructable. 
"\n                "Use the reconstructable() function if executing from python"\n            )\n\n    dm_executable_dict = step_context.job.to_dict()\n\n    dm_context_dict = {\n        "output_log_path": output_log_path,\n        "marshal_dir": marshal_dir,\n        "run_config": step_context.run_config,\n    }\n\n    dm_node_handle_kwargs = step_context.node_handle._asdict()\n    dm_step_key = step_context.step.key\n\n    parameters = {}\n\n    parameters["__dm_context"] = dm_context_dict\n    parameters["__dm_executable_dict"] = dm_executable_dict\n    parameters["__dm_pipeline_run_dict"] = pack_value(step_context.dagster_run)\n    parameters["__dm_node_handle_kwargs"] = dm_node_handle_kwargs\n    parameters["__dm_instance_ref_dict"] = pack_value(step_context.instance.get_ref())\n    parameters["__dm_step_key"] = dm_step_key\n    parameters["__dm_input_names"] = list(inputs.keys())\n\n    return parameters\n\n\ndef execute_notebook(\n    step_context: StepExecutionContext,\n    name: str,\n    save_notebook_on_failure: bool,\n    notebook_path: str,\n    output_notebook_dir: str,\n    inputs: Mapping[str, object],\n) -> str:\n    with safe_tempfile_path() as output_log_path:\n        prefix = str(uuid.uuid4())\n        parameterized_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-inter.ipynb")\n\n        executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")\n\n        # Scaffold the registration here\n        nb = load_notebook_node(notebook_path)\n        compute_descriptor = "op"\n        nb_no_parameters = replace_parameters(\n            step_context,\n            nb,\n            get_papermill_parameters(\n                step_context,\n                inputs,\n                output_log_path,\n                compute_descriptor,\n            ),\n        )\n        write_ipynb(nb_no_parameters, parameterized_notebook_path)\n\n        try:\n            papermill_engines.register("dagstermill", DagstermillEngine)\n    
        papermill.execute_notebook(\n                input_path=parameterized_notebook_path,\n                output_path=executed_notebook_path,\n                engine_name="dagstermill",\n                log_output=True,\n            )\n\n        except Exception as ex:\n            step_context.log.warn(\n                "Error when attempting to materialize executed notebook: {exc}".format(\n                    exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n                )\n            )\n\n            if isinstance(ex, ExecutionError):\n                exception_name = ex.ename  # type: ignore\n                if exception_name in ["RetryRequested", "Failure"]:\n                    step_context.log.warn(\n                        f"Encountered raised {exception_name} in notebook. Use"\n                        " dagstermill.yield_event with RetryRequested or Failure to trigger"\n                        " their behavior."\n                    )\n\n            if save_notebook_on_failure:\n                storage_dir = step_context.instance.storage_directory()\n                storage_path = os.path.join(storage_dir, f"{prefix}-out.ipynb")\n                with open(storage_path, "wb") as dest_file_obj:\n                    with open(executed_notebook_path, "rb") as obj:\n                        dest_file_obj.write(obj.read())\n\n                step_context.log.info(f"Failed notebook written to {storage_path}")\n\n            raise\n\n    step_context.log.debug(f"Notebook execution complete for {name} at {executed_notebook_path}.")\n\n    return executed_notebook_path\n\n\ndef _handle_events_from_notebook(\n    step_context: StepExecutionContext, executed_notebook_path: str\n) -> Iterable:\n    # deferred import for perf\n    import scrapbook\n\n    output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n    for output_name in step_context.op_def.output_dict.keys():\n        data_dict = output_nb.scraps.data_dict\n        if output_name 
in data_dict:\n            # read outputs that were passed out of process via io manager from `yield_result`\n            step_output_handle = StepOutputHandle(\n                step_key=step_context.step.key,\n                output_name=output_name,\n            )\n            output_context = step_context.get_output_context(step_output_handle)\n            io_manager = step_context.get_io_manager(step_output_handle)\n            value = io_manager.load_input(\n                build_input_context(\n                    upstream_output=output_context, dagster_type=output_context.dagster_type\n                )\n            )\n\n            yield Output(value, output_name)\n\n    for key, value in output_nb.scraps.items():\n        if key.startswith("event-"):\n            with open(value.data, "rb") as fd:\n                event = pickle.loads(fd.read())\n                if isinstance(event, (Failure, RetryRequested)):\n                    raise event\n                else:\n                    yield event\n\n\ndef _make_dagstermill_compute_fn(\n    dagster_factory_name: str,\n    name: str,\n    notebook_path: str,\n    output_notebook_name: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    output_notebook: Optional[str] = None,\n    save_notebook_on_failure: bool = False,\n) -> Callable:\n    def _t_fn(op_context: OpExecutionContext, inputs: Mapping[str, object]) -> Iterable:\n        check.param_invariant(\n            isinstance(op_context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        step_context = op_context.get_step_execution_context()\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                step_context,\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                
notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            if output_notebook_name is not None:\n                # yield output notebook binary stream as an op output\n                with open(executed_notebook_path, "rb") as fd:\n                    yield Output(fd.read(), output_notebook_name)\n\n            else:\n                # backcompat\n                executed_notebook_file_handle = None\n                try:\n                    # use binary mode when when moving the file since certain file_managers such as S3\n                    # may try to hash the contents\n                    with open(executed_notebook_path, "rb") as fd:\n                        executed_notebook_file_handle = op_context.resources.file_manager.write(\n                            fd, mode="wb", ext="ipynb"\n                        )\n                        executed_notebook_materialization_path = (\n                            executed_notebook_file_handle.path_desc\n                        )\n\n                    yield AssetMaterialization(\n                        asset_key=[*(asset_key_prefix or []), f"{name}_output_notebook"],\n                        description="Location of output notebook in file manager",\n                        metadata={\n                            "path": MetadataValue.path(executed_notebook_materialization_path),\n                        },\n                    )\n\n                except Exception:\n                    # if file manager writing errors, e.g. 
file manager is not provided, we throw a warning\n                    # and fall back to the previously stored temp executed notebook.\n                    op_context.log.warning(\n                        "Error when attempting to materialize executed notebook using file"\n                        " manager:"\n                        f" {serializable_error_info_from_exc_info(sys.exc_info())}\\nNow"\n                        " falling back to local: notebook execution was temporarily materialized"\n                        f" at {executed_notebook_path}\\nIf you have supplied a file manager and"\n                        " expect to use it for materializing the notebook, please include"\n                        ' "file_manager" in the `required_resource_keys` argument to'\n                        f" `{dagster_factory_name}`"\n                    )\n\n                if output_notebook is not None:\n                    yield Output(executed_notebook_file_handle, output_notebook)\n\n            yield from _handle_events_from_notebook(step_context, executed_notebook_path)\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_op(\n name: str,\n notebook_path: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n output_notebook_name: Optional[str] = None,\n asset_key_prefix: Optional[Union[Sequence[str], str]] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n io_manager_key: Optional[str] = None,\n save_notebook_on_failure: bool = False,\n) -> OpDefinition:\n """Wrap a Jupyter notebook in a op.\n\n Arguments:\n name (str): The name of the op.\n notebook_path (str): Path to the backing notebook.\n ins (Optional[Mapping[str, In]]): The op's inputs.\n outs (Optional[Mapping[str, Out]]): The op's outputs. Your notebook should\n call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output\n of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n created). 
It allows the downstream ops to access the executed notebook via a file\n object.\n asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n asset keys for materialized notebooks.\n description (Optional[str]): If set, description used for op.\n tags (Optional[Dict[str, str]]): If set, additional tags used to annotate op.\n Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n overwritten by the user.\n io_manager_key (Optional[str]): If using output_notebook_name, you can additionally provide\n a string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n\n Returns:\n :py:class:`~dagster.OpDefinition`\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n outs = check.opt_mapping_param(outs, "outs", key_type=str, value_type=Out)\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=In)\n\n if output_notebook_name is not None:\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n required_resource_keys.add(io_mgr_key)\n outs = {\n **outs,\n cast(str, output_notebook_name): Out(io_manager_key=io_mgr_key),\n }\n\n if isinstance(asset_key_prefix, str):\n asset_key_prefix = [asset_key_prefix]\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n default_description = f"This op is backed by the notebook at {notebook_path}"\n description = 
check.opt_str_param(description, "description", default=default_description)\n\n user_tags = validate_tags(tags)\n if tags is not None:\n check.invariant(\n "notebook_path" not in tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], config_schema))\n\n return OpDefinition(\n name=name,\n compute_fn=_make_dagstermill_compute_fn(\n "define_dagstermill_op",\n name,\n notebook_path,\n output_notebook_name,\n asset_key_prefix=asset_key_prefix,\n save_notebook_on_failure=save_notebook_on_failure,\n ),\n ins=ins,\n outs=outs,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n description=description,\n tags={**user_tags, **default_tags},\n )
\n
", "current_page_name": "_modules/dagstermill/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.factory"}, "io_managers": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.io_managers

\nimport os\nfrom pathlib import Path\nfrom typing import Any, List, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    ConfigurableIOManagerFactory,\n    InitResourceContext,\n    IOManager,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager, io_manager\nfrom dagster._utils import mkdir_p\nfrom pydantic import Field\n\nfrom dagstermill.factory import _clean_path_for_windows\n\n\nclass OutputNotebookIOManager(IOManager):\n    def __init__(self, asset_key_prefix: Optional[Sequence[str]] = None):\n        self.asset_key_prefix = asset_key_prefix if asset_key_prefix else []\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        raise NotImplementedError\n\n    def load_input(self, context: InputContext) -> Any:\n        raise NotImplementedError\n\n\nclass LocalOutputNotebookIOManager(OutputNotebookIOManager):\n    def __init__(self, base_dir: str, asset_key_prefix: Optional[Sequence[str]] = None):\n        super(LocalOutputNotebookIOManager, self).__init__(asset_key_prefix=asset_key_prefix)\n        self.base_dir = base_dir\n        self.write_mode = "wb"\n        self.read_mode = "rb"\n\n    def _get_path(self, context: OutputContext) -> str:\n        """Automatically construct filepath."""\n        if context.has_asset_key:\n            keys = context.get_asset_identifier()\n        else:\n            keys = context.get_run_scoped_output_identifier()\n        return str(Path(self.base_dir, *keys).with_suffix(".ipynb"))\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        """obj: bytes."""\n        check.inst_param(context, "context", OutputContext)\n\n        # the output notebook itself is stored at output_file_path\n        
output_notebook_path = self._get_path(context)\n        mkdir_p(os.path.dirname(output_notebook_path))\n        with open(output_notebook_path, self.write_mode) as dest_file_obj:\n            dest_file_obj.write(obj)\n\n        metadata = {\n            "Executed notebook": MetadataValue.notebook(\n                _clean_path_for_windows(output_notebook_path)\n            )\n        }\n\n        if context.has_asset_key:\n            context.add_output_metadata(metadata)\n        else:\n            context.log_event(\n                AssetMaterialization(\n                    asset_key=AssetKey(\n                        [*self.asset_key_prefix, f"{context.step_key}_output_notebook"]\n                    ),\n                    metadata=metadata,\n                )\n            )\n\n    def load_input(self, context: InputContext) -> bytes:\n        check.inst_param(context, "context", InputContext)\n        # pass output notebook to downstream ops as File Object\n        output_context = check.not_none(context.upstream_output)\n        with open(self._get_path(output_context), self.read_mode) as file_obj:\n            return file_obj.read()\n\n\n
[docs]class ConfigurableLocalOutputNotebookIOManager(ConfigurableIOManagerFactory):\n """Built-in IO Manager for handling output notebook."""\n\n base_dir: Optional[str] = Field(\n default=None,\n description=(\n "Base directory to use for output notebooks. Defaults to the Dagster instance storage"\n " directory if not provided."\n ),\n )\n asset_key_prefix: List[str] = Field(\n default=[],\n description=(\n "Asset key prefix to apply to assets materialized for output notebooks. Defaults to no"\n " prefix."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "LocalOutputNotebookIOManager":\n return LocalOutputNotebookIOManager(\n base_dir=self.base_dir or check.not_none(context.instance).storage_directory(),\n asset_key_prefix=self.asset_key_prefix,\n )
\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema=ConfigurableLocalOutputNotebookIOManager.to_config_schema())\ndef local_output_notebook_io_manager(init_context) -> LocalOutputNotebookIOManager:\n """Built-in IO Manager that handles output notebooks."""\n return ConfigurableLocalOutputNotebookIOManager.from_resource_context(init_context)\n
", "current_page_name": "_modules/dagstermill/io_managers", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.io_managers"}, "manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.manager

\nimport os\nimport pickle\nimport uuid\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Failure,\n    LoggerDefinition,\n    ResourceDefinition,\n    StepExecutionContext,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._core.definitions.dependency import NodeHandle\nfrom dagster._core.definitions.events import RetryRequested\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, scoped_job_context\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.resources_init import (\n    get_required_resource_keys_to_init,\n    resource_initialization_event_generator,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig, ResourceConfig\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._loggers import colored_console_logger\nfrom dagster._serdes import unpack_value\nfrom dagster._utils import EventGenerationManager\n\nfrom .context import DagstermillExecutionContext, 
DagstermillRuntimeExecutionContext\nfrom .errors import DagstermillError\nfrom .serialize import PICKLE_PROTOCOL\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n\n\nclass DagstermillResourceEventGenerationManager(EventGenerationManager):\n    """Utility class to explicitly manage setup/teardown of resource events. Overrides the default\n    `generate_teardown_events` method so that teardown is deferred until explicitly called by the\n    dagstermill Manager.\n    """\n\n    def generate_teardown_events(self):\n        return iter(())\n\n    def teardown(self):\n        return [\n            teardown_event\n            for teardown_event in super(\n                DagstermillResourceEventGenerationManager, self\n            ).generate_teardown_events()\n        ]\n\n\nclass Manager:\n    def __init__(self):\n        self.job = None\n        self.op_def: Optional[NodeDefinition] = None\n        self.in_job: bool = False\n        self.marshal_dir: Optional[str] = None\n        self.context = None\n        self.resource_manager = None\n\n    def _setup_resources(\n        self,\n        resource_defs: Mapping[str, ResourceDefinition],\n        resource_configs: Mapping[str, ResourceConfig],\n        log_manager: DagsterLogManager,\n        execution_plan: Optional[ExecutionPlan],\n        dagster_run: Optional[DagsterRun],\n        resource_keys_to_init: Optional[AbstractSet[str]],\n        instance: Optional[DagsterInstance],\n        emit_persistent_events: Optional[bool],\n    ):\n        """Drop-in replacement for\n        `dagster._core.execution.resources_init.resource_initialization_manager`.  
It uses a\n        `DagstermillResourceEventGenerationManager` and explicitly calls `teardown` on it.\n        """\n        generator = resource_initialization_event_generator(\n            resource_defs=resource_defs,\n            resource_configs=resource_configs,\n            log_manager=log_manager,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            resource_keys_to_init=resource_keys_to_init,\n            instance=instance,\n            emit_persistent_events=emit_persistent_events,\n        )\n        self.resource_manager = DagstermillResourceEventGenerationManager(\n            generator, ScopedResourcesBuilder\n        )\n        return self.resource_manager\n\n    def reconstitute_job_context(\n        self,\n        executable_dict: Mapping[str, Any],\n        job_run_dict: Mapping[str, Any],\n        node_handle_kwargs: Mapping[str, Any],\n        instance_ref_dict: Mapping[str, Any],\n        step_key: str,\n        output_log_path: Optional[str] = None,\n        marshal_dir: Optional[str] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n    ):\n        """Reconstitutes a context for dagstermill-managed execution.\n\n        You'll see this function called to reconstruct a job context within the ``injected\n        parameters`` cell of a dagstermill output notebook. Users should not call this function\n        interactively except when debugging output notebooks.\n\n        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a\n        context for interactive exploration and development. 
This call will be replaced by one to\n        :func:`dagstermill.reconstitute_job_context` when the notebook is executed by\n        dagstermill.\n        """\n        check.opt_str_param(output_log_path, "output_log_path")\n        check.opt_str_param(marshal_dir, "marshal_dir")\n        run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n        check.mapping_param(job_run_dict, "job_run_dict")\n        check.mapping_param(executable_dict, "executable_dict")\n        check.mapping_param(node_handle_kwargs, "node_handle_kwargs")\n        check.mapping_param(instance_ref_dict, "instance_ref_dict")\n        check.str_param(step_key, "step_key")\n\n        job = ReconstructableJob.from_dict(executable_dict)\n        job_def = job.get_definition()\n\n        try:\n            instance_ref = unpack_value(instance_ref_dict, InstanceRef)\n            instance = DagsterInstance.from_ref(instance_ref)\n        except Exception as err:\n            raise DagstermillError(\n                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"\n            ) from err\n\n        dagster_run = unpack_value(job_run_dict, DagsterRun)\n\n        node_handle = NodeHandle.from_dict(node_handle_kwargs)\n        op = job_def.get_node(node_handle)\n        op_def = op.definition\n\n        self.marshal_dir = marshal_dir\n        self.in_job = True\n        self.op_def = op_def\n        self.job = job\n\n        ResolvedRunConfig.build(job_def, run_config)\n\n        execution_plan = create_execution_plan(\n            self.job,\n            run_config,\n            step_keys_to_execute=dagster_run.step_keys_to_execute,\n        )\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            instance,\n            scoped_resources_builder_cm=self._setup_resources,\n            # Set this flag even though we're not in test for clearer error reporting\n   
         raise_on_error=True,\n        ) as job_context:\n            known_state = None\n            if dagster_run.parent_run_id:\n                known_state = KnownExecutionState.build_for_reexecution(\n                    instance=instance,\n                    parent_run=check.not_none(instance.get_run_by_id(dagster_run.parent_run_id)),\n                )\n            self.context = DagstermillRuntimeExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=run_config.get("ops", {}).get(op.name, {}).get("config"),\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op.name,\n                node_handle=node_handle,\n                step_context=cast(\n                    StepExecutionContext,\n                    job_context.for_step(\n                        cast(ExecutionStep, execution_plan.get_step_by_key(step_key)),\n                        known_state=known_state,\n                    ),\n                ),\n            )\n\n        return self.context\n\n    def get_context(\n        self,\n        op_config: Any = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        run_config: Optional[dict] = None,\n    ) -> DagstermillExecutionContext:\n        """Get a dagstermill execution context for interactive exploration and development.\n\n        Args:\n            op_config (Optional[Any]): If specified, this value will be made available on the\n                context as its ``op_config`` property.\n            resource_defs (Optional[Mapping[str, ResourceDefinition]]): Specifies resources to provide to context.\n            logger_defs (Optional[Mapping[str, LoggerDefinition]]): Specifies loggers to provide to context.\n            
run_config(Optional[dict]): The config dict with which to construct\n                the context.\n\n        Returns:\n            :py:class:`~dagstermill.DagstermillExecutionContext`\n        """\n        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n\n        # If we are running non-interactively, and there is already a context reconstituted, return\n        # that context rather than overwriting it.\n        if self.context is not None and isinstance(\n            self.context, DagstermillRuntimeExecutionContext\n        ):\n            return self.context\n\n        if not logger_defs:\n            logger_defs = {"dagstermill": colored_console_logger}\n            run_config["loggers"] = {"dagstermill": {}}\n        logger_defs = check.opt_mapping_param(logger_defs, "logger_defs")\n        resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n\n        op_def = OpDefinition(\n            name="this_op",\n            compute_fn=lambda *args, **kwargs: None,\n            description="Ephemeral op constructed by dagstermill.get_context()",\n            required_resource_keys=set(resource_defs.keys()),\n        )\n\n        job_def = JobDefinition(\n            graph_def=GraphDefinition(name="ephemeral_dagstermill_pipeline", node_defs=[op_def]),\n            logger_defs=logger_defs,\n            resource_defs=resource_defs,\n        )\n\n        run_id = make_new_run_id()\n\n        # construct stubbed DagsterRun for notebook exploration...\n        # The actual dagster run during job execution will be serialized and reconstituted\n        # in the `reconstitute_job_context` call\n        dagster_run = DagsterRun(\n            job_name=job_def.name,\n            run_id=run_id,\n            run_config=run_config,\n            step_keys_to_execute=None,\n            status=DagsterRunStatus.NOT_STARTED,\n            tags=None,\n        )\n\n        self.in_job = False\n        self.op_def = op_def\n        self.job = 
job_def\n\n        job = InMemoryJob(job_def)\n        execution_plan = create_execution_plan(job, run_config)\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            DagsterInstance.ephemeral(),\n            scoped_resources_builder_cm=self._setup_resources,\n        ) as job_context:\n            self.context = DagstermillExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=op_config,\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op_def.name,\n                node_handle=NodeHandle(op_def.name, parent=None),\n            )\n\n        return self.context\n\n    def yield_result(self, value, output_name="result"):\n        """Yield a result directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            value (Any): The value to yield.\n            output_name (Optional[str]): The name of the result to yield (default: ``'result'``).\n        """\n        if not self.in_job:\n            return value\n\n        # deferred import for perf\n        import scrapbook\n\n        if not self.op_def.has_output(output_name):\n            raise DagstermillError(\n                f"Op {self.op_def.name} does not have output named {output_name}.Expected one of"\n                f" {[str(output_def.name) for output_def in self.op_def.output_defs]}"\n            )\n\n        # pass output value cross process boundary using io manager\n        step_context = self.context._step_context  # noqa: SLF001\n        # Note: yield_result currently does not support DynamicOutput\n\n        # dagstermill assets do not support yielding additional results within the notebook:\n        if 
len(step_context.job_def.asset_layer.asset_keys) > 0:\n            raise DagstermillError(\n                "dagstermill assets do not currently support dagstermill.yield_result"\n            )\n\n        step_output_handle = StepOutputHandle(\n            step_key=step_context.step.key, output_name=output_name\n        )\n        output_context = step_context.get_output_context(step_output_handle)\n        io_manager = step_context.get_io_manager(step_output_handle)\n\n        # Note that we assume io manager is symmetric, i.e handle_input(handle_output(X)) == X\n        io_manager.handle_output(output_context, value)\n\n        # record that the output has been yielded\n        scrapbook.glue(output_name, "")\n\n    def yield_event(self, dagster_event):\n        """Yield a dagster event directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            dagster_event (Union[:class:`dagster.AssetMaterialization`, :class:`dagster.ExpectationResult`, :class:`dagster.TypeCheck`, :class:`dagster.Failure`, :class:`dagster.RetryRequested`]):\n                An event to yield back to Dagster.\n        """\n        valid_types = (\n            AssetMaterialization,\n            AssetObservation,\n            ExpectationResult,\n            TypeCheck,\n            Failure,\n            RetryRequested,\n        )\n        if not isinstance(dagster_event, valid_types):\n            raise DagstermillError(\n                f"Received invalid type {dagster_event} in yield_event. 
Expected a Dagster event"\n                f" type, one of {valid_types}."\n            )\n\n        if not self.in_job:\n            return dagster_event\n\n        # deferred import for perf\n        import scrapbook\n\n        event_id = f"event-{uuid.uuid4()}"\n        out_file_path = os.path.join(self.marshal_dir, event_id)\n        with open(out_file_path, "wb") as fd:\n            fd.write(pickle.dumps(dagster_event, PICKLE_PROTOCOL))\n\n        scrapbook.glue(event_id, out_file_path)\n\n    def teardown_resources(self):\n        if self.resource_manager is not None:\n            self.resource_manager.teardown()\n\n    def load_input_parameter(self, input_name: str):\n        # load input from source\n        dm_context = check.not_none(self.context)\n        if not isinstance(dm_context, DagstermillRuntimeExecutionContext):\n            check.failed("Expected DagstermillRuntimeExecutionContext")\n        step_context = dm_context.step_context\n        step_input = step_context.step.step_input_named(input_name)\n        input_def = step_context.op_def.input_def_named(input_name)\n        for event_or_input_value in step_input.source.load_input_object(step_context, input_def):\n            if isinstance(event_or_input_value, DagsterEvent):\n                continue\n            else:\n                return event_or_input_value\n\n\nMANAGER_FOR_NOTEBOOK_INSTANCE = Manager()\n
", "current_page_name": "_modules/dagstermill/manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.manager"}}} \ No newline at end of file +{"": {"dagster_pandera": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandera

\nimport itertools\nimport re\nfrom typing import TYPE_CHECKING, Callable, Sequence, Type, Union\n\nimport dagster._check as check\nimport pandas as pd\nimport pandera as pa\nfrom dagster import (\n    DagsterType,\n    TableColumn,\n    TableColumnConstraints,\n    TableConstraints,\n    TableSchema,\n    TypeCheck,\n    TypeCheckContext,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.libraries import DagsterLibraryRegistry\n\nfrom .version import __version__\n\n# NOTE: Pandera supports multiple dataframe libraries. Most of the alternatives\n# to pandas implement a pandas-like API wrapper around an underlying library\n# that can handle big data (a weakness of pandas). Typically this means the\n# data is only partly loaded into memory, or is distributed across multiple\n# nodes. Because Dagster types perform runtime validation within a single\n# Python process, it's not clear at present how to interface the more complex\n# validation computations on distributed dataframes with Dagster Types.\n\n# Therefore, for the time being dagster-pandera only supports pandas dataframes.\n# However, some commented-out scaffolding has been left in place for support of\n# alternatives in the future. These sections are marked with "TODO: pending\n# alternative dataframe support".\n\nif TYPE_CHECKING:\n    ValidatableDataFrame = pd.DataFrame\n\nDagsterLibraryRegistry.register("dagster-pandera", __version__)\n\n# ########################\n# ##### VALID DATAFRAME CLASSES\n# ########################\n\n# This layer of indirection is used because we may support alternative dataframe classes in the\n# future.\nVALID_DATAFRAME_CLASSES = (pd.DataFrame,)\n\n\n# ########################\n# ##### PANDERA SCHEMA TO DAGSTER TYPE\n# ########################\n\n\n
[docs]def pandera_schema_to_dagster_type(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> DagsterType:\n """Convert a Pandera dataframe schema to a `DagsterType`.\n\n The generated Dagster type will be given an automatically generated `name`. The schema's `title`\n property, `name` property, or class name (in that order) will be used. If neither `title` or\n `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.\n\n Additional metadata is also extracted from the Pandera schema and attached to the returned\n `DagsterType` as a metadata dictionary. The extracted metadata includes:\n\n - Descriptions on the schema and constituent columns and checks.\n - Data types for each column.\n - String representations of all column-wise checks.\n - String representations of all row-wise (i.e. "wide") checks.\n\n The returned `DagsterType` type will call the Pandera schema's `validate()` method in its type\n check function. Validation is done in `lazy` mode, i.e. 
pandera will attempt to validate all\n values in the dataframe, rather than stopping on the first error.\n\n If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:\n\n - `num_failures` total number of validation errors.\n - `failure_sample` a table containing up to the first 10 validation errors.\n\n Args:\n schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]):\n\n Returns:\n DagsterType: Dagster Type constructed from the Pandera schema.\n\n """\n if not (\n isinstance(schema, pa.DataFrameSchema)\n or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))\n ):\n raise TypeError(\n "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"\n )\n\n name = _extract_name_from_pandera_schema(schema)\n norm_schema = (\n schema.to_schema()\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)\n else schema\n )\n tschema = _pandera_schema_to_table_schema(norm_schema)\n type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)\n\n return DagsterType(\n type_check_fn=type_check_fn,\n name=name,\n description=norm_schema.description,\n metadata={\n "schema": MetadataValue.table_schema(tschema),\n },\n typing_type=pd.DataFrame,\n )
\n\n\n# call next() on this to generate next unique Dagster Type name for anonymous schemas\n_anonymous_schema_name_generator = (f"DagsterPanderaDataframe{i}" for i in itertools.count(start=1))\n\n\ndef _extract_name_from_pandera_schema(\n schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],\n) -> str:\n if isinstance(schema, type) and issubclass(schema, pa.SchemaModel):\n return (\n getattr(schema.Config, "title", None)\n or getattr(schema.Config, "name", None)\n or schema.__name__\n )\n elif isinstance(schema, pa.DataFrameSchema):\n return schema.title or schema.name or next(_anonymous_schema_name_generator)\n\n\ndef _pandera_schema_to_type_check_fn(\n schema: pa.DataFrameSchema,\n table_schema: TableSchema,\n) -> Callable[[TypeCheckContext, object], TypeCheck]:\n def type_check_fn(_context, value: object) -> TypeCheck:\n if isinstance(value, VALID_DATAFRAME_CLASSES):\n try:\n # `lazy` instructs pandera to capture every (not just the first) validation error\n schema.validate(value, lazy=True)\n except pa.errors.SchemaErrors as e:\n return _pandera_errors_to_type_check(e, table_schema)\n except Exception as e:\n return TypeCheck(\n success=False,\n description=f"Unexpected error during validation: {e}",\n )\n else:\n return TypeCheck(\n success=False,\n description=(\n f"Must be one of {VALID_DATAFRAME_CLASSES}, not {type(value).__name__}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check_fn\n\n\nPANDERA_FAILURE_CASES_SCHEMA = TableSchema(\n columns=[\n TableColumn(\n name="schema_context",\n type="string",\n description="`Column` for column-wise checks, or `DataFrameSchema`",\n ),\n TableColumn(\n name="column",\n type="string",\n description="Column of value that failed the check, or `None` for wide checks.",\n ),\n TableColumn(\n name="check", type="string", description="Description of the failed Pandera check."\n ),\n TableColumn(name="check_number", description="Index of the failed check."),\n TableColumn(\n name="failure_case", 
type="number | string", description="Value that failed a check."\n ),\n TableColumn(\n name="index",\n type="number | string",\n description="Index (row) of value that failed a check.",\n ),\n ]\n)\n\n\ndef _pandera_errors_to_type_check(\n error: pa.errors.SchemaErrors, _table_schema: TableSchema\n) -> TypeCheck:\n return TypeCheck(\n success=False,\n description=str(error),\n )\n\n\ndef _pandera_schema_to_table_schema(schema: pa.DataFrameSchema) -> TableSchema:\n df_constraints = _pandera_schema_wide_checks_to_table_constraints(schema.checks)\n columns = [_pandera_column_to_table_column(col) for k, col in schema.columns.items()]\n return TableSchema(columns=columns, constraints=df_constraints)\n\n\ndef _pandera_schema_wide_checks_to_table_constraints(\n checks: Sequence[Union[pa.Check, pa.Hypothesis]]\n) -> TableConstraints:\n return TableConstraints(other=[_pandera_check_to_table_constraint(check) for check in checks])\n\n\ndef _pandera_check_to_table_constraint(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _pandera_column_to_table_column(pa_column: pa.Column) -> TableColumn:\n constraints = TableColumnConstraints(\n nullable=pa_column.nullable,\n unique=pa_column.unique,\n other=[_pandera_check_to_column_constraint(pa_check) for pa_check in pa_column.checks],\n )\n name = check.not_none(pa_column.name, "name")\n name = name if isinstance(name, str) else "/".join(name)\n return TableColumn(\n name=name,\n type=str(pa_column.dtype),\n description=pa_column.description,\n constraints=constraints,\n )\n\n\nCHECK_OPERATORS = {\n "equal_to": "==",\n "not_equal_to": "!=",\n "less_than": "<",\n "less_than_or_equal_to": "<=",\n "greater_than": ">",\n "greater_than_or_equal_to": ">=",\n}\n\n\ndef _extract_operand(error_str: str) -> str:\n match = re.search(r"(?<=\\().+(?=\\))", error_str)\n return match.group(0) if match else ""\n\n\ndef _pandera_check_to_column_constraint(pa_check: pa.Check) -> str:\n if 
pa_check.description:\n return pa_check.description\n elif pa_check.name in CHECK_OPERATORS:\n assert isinstance(\n pa_check.error, str\n ), "Expected pandera check to have string `error` attr."\n return f"{CHECK_OPERATORS[pa_check.name]} {_extract_operand(pa_check.error)}"\n else:\n return _get_pandera_check_identifier(pa_check)\n\n\ndef _get_pandera_check_identifier(pa_check: Union[pa.Check, pa.Hypothesis]) -> str:\n return pa_check.description or pa_check.error or pa_check.name or str(pa_check)\n\n\n__all__ = [\n "pandera_schema_to_dagster_type",\n]\n
", "current_page_name": "_modules/dagster_pandera", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandera"}, "index": {"alabaster_version": "0.7.13", "body": "

All modules for which code is available

\n", "current_page_name": "_modules/index", "customsidebar": null, "favicon_url": null, "logo_url": null, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "Overview: module code"}}, "dagster": {"_config": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_schema

\nfrom typing import TYPE_CHECKING, Any, Dict, List, Mapping, Sequence, Type, Union\n\nfrom typing_extensions import TypeAlias\n\nif TYPE_CHECKING:\n    from dagster._config import ConfigType, Field\n\n# Eventually, the below `UserConfigSchema` should be renamed to `ConfigSchema` and the class\n# definition should be dropped. The reason we don't do this now is that sphinx autodoc doesn't\n# support type aliases, so there is no good way to gracefully attach a docstring to this and have it\n# show up in the docs. See: https://github.com/sphinx-doc/sphinx/issues/8934\n#\n# Unfortunately mypy doesn't support recursive types, which would be used to properly define the\n# List/Dict elements of this union: `Dict[str, ConfigSchema]`, `List[ConfigSchema]`.\nUserConfigSchema: TypeAlias = Union[\n    Type[Union[bool, float, int, str]],\n    Type[Union[Dict[Any, Any], List[Any]]],\n    "ConfigType",\n    "Field",\n    Mapping[str, Any],\n    Sequence[Any],\n]\n\n\n
[docs]class ConfigSchema:\n """Placeholder type for config schemas.\n\n Any time that it appears in documentation, it means that any of the following types are\n acceptable:\n\n #. A Python scalar type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`). For example:\n\n * ``@op(config_schema=int)``\n * ``@op(config_schema=str)``\n\n #. A built-in python collection (:py:class:`~python:list`, or :py:class:`~python:dict`).\n :py:class:`~python:list` is exactly equivalent to :py:class:`~dagster.Array` [\n :py:class:`~dagster.Any` ] and :py:class:`~python:dict` is equivalent to\n :py:class:`~dagster.Permissive`. For example:\n\n * ``@op(config_schema=list)``\n * ``@op(config_schema=dict)``\n\n #. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.Map`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n\n #. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules. For example:\n\n * ``{'some_config': str}`` is equivalent to ``Shape({'some_config: str})``.\n\n * ``{'some_config1': {'some_config2': str}}`` is equivalent to\n ``Shape({'some_config1: Shape({'some_config2: str})})``.\n\n #. A bare python list of length one, whose single element will be wrapped in a\n :py:class:`~dagster.Array` is resolved recursively according to the same\n rules. 
For example:\n\n * ``[str]`` is equivalent to ``Array[str]``.\n\n * ``[[str]]`` is equivalent to ``Array[Array[str]]``.\n\n * ``[{'some_config': str}]`` is equivalent to ``Array(Shape({'some_config: str}))``.\n\n #. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self):\n raise NotImplementedError(\n "ConfigSchema is a placeholder type and should not be instantiated."\n )
\n
", "current_page_name": "_modules/dagster/_config/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_schema"}, "config_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.config_type

\nimport typing\nfrom enum import Enum as PythonEnum\nfrom typing import TYPE_CHECKING, Dict, Iterator, Optional, Sequence, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from .snap import ConfigSchemaSnapshot, ConfigTypeSnap\n\n\n@whitelist_for_serdes\nclass ConfigTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    ENUM = "ENUM"\n\n    SELECTOR = "SELECTOR"\n    STRICT_SHAPE = "STRICT_SHAPE"\n    PERMISSIVE_SHAPE = "PERMISSIVE_SHAPE"\n    SCALAR_UNION = "SCALAR_UNION"\n\n    MAP = "MAP"\n\n    # Closed generic types\n    ARRAY = "ARRAY"\n    NONEABLE = "NONEABLE"\n\n    @staticmethod\n    def has_fields(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR or ConfigTypeKind.is_shape(kind)\n\n    @staticmethod\n    def is_closed_generic(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return (\n            kind == ConfigTypeKind.ARRAY\n            or kind == ConfigTypeKind.NONEABLE\n            or kind == ConfigTypeKind.SCALAR_UNION\n            or kind == ConfigTypeKind.MAP\n        )\n\n    @staticmethod\n    def is_shape(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.STRICT_SHAPE or kind == ConfigTypeKind.PERMISSIVE_SHAPE\n\n    @staticmethod\n    def is_selector(kind: "ConfigTypeKind") -> bool:\n        check.inst_param(kind, "kind", ConfigTypeKind)\n        return kind == ConfigTypeKind.SELECTOR\n\n\nclass ConfigType:\n    """The class backing DagsterTypes as they are used processing configuration data."""\n\n    def __init__(\n        self,\n        key: str,\n        kind: ConfigTypeKind,\n        given_name: Optional[str] = None,\n        
description: Optional[str] = None,\n        type_params: Optional[Sequence["ConfigType"]] = None,\n    ):\n        self.key: str = check.str_param(key, "key")\n        self.kind: ConfigTypeKind = check.inst_param(kind, "kind", ConfigTypeKind)\n        self.given_name: Optional[str] = check.opt_str_param(given_name, "given_name")\n        self._description: Optional[str] = check.opt_str_param(description, "description")\n        self.type_params: Optional[Sequence[ConfigType]] = (\n            check.sequence_param(type_params, "type_params", of_type=ConfigType)\n            if type_params\n            else None\n        )\n\n        # memoized snap representation\n        self._snap: Optional["ConfigTypeSnap"] = None\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @staticmethod\n    def from_builtin_enum(builtin_enum: typing.Any) -> "ConfigType":\n        check.invariant(BuiltinEnum.contains(builtin_enum), "param must be member of BuiltinEnum")\n        return _CONFIG_MAP[builtin_enum]\n\n    def post_process(self, value):\n        """Implement this in order to take a value provided by the user\n        and perform computation on it. This can be done to coerce data types,\n        fetch things from the environment (e.g. environment variables), or\n        to do custom validation. If the value is not valid, throw a\n        PostProcessingError. 
Otherwise return the coerced value.\n        """\n        return value\n\n    def get_snapshot(self) -> "ConfigTypeSnap":\n        from .snap import snap_from_config_type\n\n        if self._snap is None:\n            self._snap = snap_from_config_type(self)\n\n        return self._snap\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        yield self\n\n    def get_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n        from .snap import ConfigSchemaSnapshot\n\n        return ConfigSchemaSnapshot({ct.key: ct.get_snapshot() for ct in self.type_iterator()})\n\n\n@whitelist_for_serdes\nclass ConfigScalarKind(PythonEnum):\n    INT = "INT"\n    STRING = "STRING"\n    FLOAT = "FLOAT"\n    BOOL = "BOOL"\n\n\n# Scalars, Composites, Selectors, Lists, Optional, Any\n\n\nclass ConfigScalar(ConfigType):\n    def __init__(\n        self,\n        key: str,\n        given_name: Optional[str],\n        scalar_kind: ConfigScalarKind,\n        **kwargs: typing.Any,\n    ):\n        self.scalar_kind = check.inst_param(scalar_kind, "scalar_kind", ConfigScalarKind)\n        super(ConfigScalar, self).__init__(\n            key, kind=ConfigTypeKind.SCALAR, given_name=given_name, **kwargs\n        )\n\n\nclass BuiltinConfigScalar(ConfigScalar):\n    def __init__(self, scalar_kind, description=None):\n        super(BuiltinConfigScalar, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=scalar_kind,\n            description=description,\n        )\n\n\nclass Int(BuiltinConfigScalar):\n    def __init__(self):\n        super(Int, self).__init__(scalar_kind=ConfigScalarKind.INT, description="")\n\n\nclass String(BuiltinConfigScalar):\n    def __init__(self):\n        super(String, self).__init__(scalar_kind=ConfigScalarKind.STRING, description="")\n\n\nclass Bool(BuiltinConfigScalar):\n    def __init__(self):\n        super(Bool, self).__init__(scalar_kind=ConfigScalarKind.BOOL, description="")\n\n\nclass 
Float(BuiltinConfigScalar):\n    def __init__(self):\n        super(Float, self).__init__(scalar_kind=ConfigScalarKind.FLOAT, description="")\n\n    def post_process(self, value):\n        return float(value)\n\n\nclass Any(ConfigType):\n    def __init__(self):\n        super(Any, self).__init__(\n            key="Any",\n            given_name="Any",\n            kind=ConfigTypeKind.ANY,\n        )\n\n\n
[docs]class Noneable(ConfigType):\n """Defines a configuration type that is the union of ``NoneType`` and the type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n\n **Examples:**\n\n .. code-block:: python\n\n config_schema={"name": Noneable(str)}\n\n config={"name": "Hello"} # Ok\n config={"name": None} # Ok\n config={} # Error\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Noneable, self).__init__(\n key=f"Noneable.{self.inner_type.key}",\n kind=ConfigTypeKind.NONEABLE,\n type_params=[self.inner_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class Array(ConfigType):\n """Defines an array (list) configuration type that contains values of type ``inner_type``.\n\n Args:\n inner_type (type):\n The type of the values that this configuration type can contain.\n """\n\n def __init__(self, inner_type: object):\n from .field import resolve_to_config_type\n\n self.inner_type = cast(ConfigType, resolve_to_config_type(inner_type))\n super(Array, self).__init__(\n key=f"Array.{self.inner_type.key}",\n type_params=[self.inner_type],\n kind=ConfigTypeKind.ARRAY,\n )\n\n @public\n @property\n def description(self) -> str:\n """A human-readable description of this Array type."""\n return f"List of {self.key}"\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\n
[docs]class EnumValue:\n """Define an entry in a :py:class:`Enum`.\n\n Args:\n config_value (str):\n The string representation of the config to accept when passed.\n python_value (Optional[Any]):\n The python value to convert the enum entry in to. Defaults to the ``config_value``.\n description (Optional[str]):\n A human-readable description of the enum entry.\n\n """\n\n def __init__(\n self,\n config_value: str,\n python_value: Optional[object] = None,\n description: Optional[str] = None,\n ):\n self.config_value = check.str_param(config_value, "config_value")\n self.python_value = config_value if python_value is None else python_value\n self.description = check.opt_str_param(description, "description")
\n\n\n
[docs]class Enum(ConfigType):\n """Defines a enum configuration type that allows one of a defined set of possible values.\n\n Args:\n name (str):\n The name of the enum configuration type.\n enum_values (List[EnumValue]):\n The set of possible values for the enum configuration type.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Enum(\n 'CowboyType',\n [\n EnumValue('good'),\n EnumValue('bad'),\n EnumValue('ugly'),\n ]\n )\n )\n )\n def resolve_standoff(context):\n # ...\n """\n\n def __init__(self, name: str, enum_values: Sequence[EnumValue]):\n check.str_param(name, "name")\n super(Enum, self).__init__(key=name, given_name=name, kind=ConfigTypeKind.ENUM)\n self.enum_values = check.sequence_param(enum_values, "enum_values", of_type=EnumValue)\n self._valid_python_values = {ev.python_value for ev in enum_values}\n check.invariant(len(self._valid_python_values) == len(enum_values))\n self._valid_config_values = {ev.config_value for ev in enum_values}\n check.invariant(len(self._valid_config_values) == len(enum_values))\n\n @property\n def config_values(self):\n return [ev.config_value for ev in self.enum_values]\n\n def is_valid_config_enum_value(self, config_value):\n return config_value in self._valid_config_values\n\n def post_process(self, value: typing.Any) -> typing.Any:\n if isinstance(value, PythonEnum):\n value = value.name\n\n for ev in self.enum_values:\n if ev.config_value == value:\n return ev.python_value\n\n check.failed(f"Should never reach this. config_value should be pre-validated. Got {value}")\n\n @classmethod\n def from_python_enum(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. 
code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v) for v in enum])\n\n @classmethod\n def from_python_enum_direct_values(cls, enum, name=None):\n """Create a Dagster enum corresponding to an existing Python enum, where the direct values are passed instead of symbolic values (IE, enum.symbol.value as opposed to enum.symbol).\n\n This is necessary for internal usage, as the symbolic values are not serializable.\n\n Args:\n enum (enum.EnumMeta):\n The class representing the enum.\n name (Optional[str]):\n The name for the enum. If not present, `enum.__name__` will be used.\n\n Example:\n .. code-block:: python\n\n class Color(enum.Enum):\n RED = enum.auto()\n GREEN = enum.auto()\n BLUE = enum.auto()\n\n @op(\n config_schema={"color": Field(Enum.from_python_enum(Color))}\n )\n def select_color(context):\n assert context.op_config["color"] == Color.RED.value\n """\n if name is None:\n name = enum.__name__\n return cls(name, [EnumValue(v.name, python_value=v.value) for v in enum])
\n\n\n
[docs]class ScalarUnion(ConfigType):\n """Defines a configuration type that accepts a scalar value OR a non-scalar value like a\n :py:class:`~dagster.List`, :py:class:`~dagster.Dict`, or :py:class:`~dagster.Selector`.\n\n This allows runtime scalars to be configured without a dictionary with the key ``value`` and\n instead just use the scalar value directly. However this still leaves the option to\n load scalars from a json or pickle file.\n\n Args:\n scalar_type (type):\n The scalar type of values that this configuration type can hold. For example,\n :py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n or :py:class:`~python:str`.\n non_scalar_schema (ConfigSchema):\n The schema of a non-scalar Dagster configuration type. For example, :py:class:`List`,\n :py:class:`Dict`, or :py:class:`~dagster.Selector`.\n key (Optional[str]):\n The configuation type's unique key. If not set, then the key will be set to\n ``ScalarUnion.{scalar_type}-{non_scalar_schema}``.\n\n **Examples:**\n\n .. code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word:\n value: foobar\n\n\n becomes, optionally,\n\n\n .. 
code-block:: yaml\n\n graph:\n transform_word:\n inputs:\n word: foobar\n """\n\n def __init__(\n self,\n scalar_type: typing.Any,\n non_scalar_schema: UserConfigSchema,\n _key: Optional[str] = None,\n ):\n from .field import resolve_to_config_type\n\n self.scalar_type = check.inst(\n cast(ConfigType, resolve_to_config_type(scalar_type)), ConfigType\n )\n self.non_scalar_type = resolve_to_config_type(non_scalar_schema)\n\n check.param_invariant(self.scalar_type.kind == ConfigTypeKind.SCALAR, "scalar_type")\n check.param_invariant(\n self.non_scalar_type.kind\n in {ConfigTypeKind.STRICT_SHAPE, ConfigTypeKind.SELECTOR, ConfigTypeKind.ARRAY},\n "non_scalar_type",\n )\n\n # https://github.com/dagster-io/dagster/issues/2133\n key = check.opt_str_param(\n _key, "_key", f"ScalarUnion.{self.scalar_type.key}-{self.non_scalar_type.key}"\n )\n\n super(ScalarUnion, self).__init__(\n key=key,\n kind=ConfigTypeKind.SCALAR_UNION,\n type_params=[self.scalar_type, self.non_scalar_type],\n )\n\n def type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.scalar_type.type_iterator()\n yield from self.non_scalar_type.type_iterator()\n yield from super().type_iterator()
\n\n\nConfigAnyInstance: Any = Any()\nConfigBoolInstance: Bool = Bool()\nConfigFloatInstance: Float = Float()\nConfigIntInstance: Int = Int()\nConfigStringInstance: String = String()\n\n_CONFIG_MAP: Dict[check.TypeOrTupleOfTypes, ConfigType] = {\n BuiltinEnum.ANY: ConfigAnyInstance,\n BuiltinEnum.BOOL: ConfigBoolInstance,\n BuiltinEnum.FLOAT: ConfigFloatInstance,\n BuiltinEnum.INT: ConfigIntInstance,\n BuiltinEnum.STRING: ConfigStringInstance,\n}\n\n\n_CONFIG_MAP_BY_NAME: Dict[str, ConfigType] = {\n "Any": ConfigAnyInstance,\n "Bool": ConfigBoolInstance,\n "Float": ConfigFloatInstance,\n "Int": ConfigIntInstance,\n "String": ConfigStringInstance,\n}\n\nALL_CONFIG_BUILTINS = set(_CONFIG_MAP.values())\n\n\ndef get_builtin_scalar_by_name(type_name: str):\n if type_name not in _CONFIG_MAP_BY_NAME:\n check.failed(f"Scalar {type_name} is not supported")\n return _CONFIG_MAP_BY_NAME[type_name]\n
", "current_page_name": "_modules/dagster/_config/config_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.config_type"}, "field": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field

\nfrom typing import Any, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError, DagsterInvalidDefinitionError\nfrom dagster._serdes import serialize_value\nfrom dagster._seven import is_subclass\nfrom dagster._utils import is_enum_value\nfrom dagster._utils.typing_api import is_closed_python_optional_type, is_typing_type\n\nfrom .config_type import Array, ConfigAnyInstance, ConfigType, ConfigTypeKind\nfrom .field_utils import FIELD_NO_DEFAULT_PROVIDED, Map, all_optional_type\n\n\ndef _is_config_type_class(obj) -> bool:\n    return isinstance(obj, type) and is_subclass(obj, ConfigType)\n\n\ndef helpful_list_error_string() -> str:\n    return "Please use a python list (e.g. [int]) or dagster.Array (e.g. Array(int)) instead."\n\n\nVALID_CONFIG_DESC = """\n1. A Python primitive type that resolve to dagster config\n   types: int, float, bool, str.\n\n2. A dagster config type: Int, Float, Bool, String, StringSource, Path, Any,\n   Array, Noneable, Selector, Shape, Permissive, etc.\n\n3. A bare python dictionary, which is wrapped in Shape. Any\n   values in the dictionary get resolved by the same rules, recursively.\n\n4. 
A bare python list of length one which itself is config type.\n   Becomes Array with list element as an argument.\n"""\n\n\n@overload\ndef resolve_to_config_type(obj: Union[ConfigType, UserConfigSchema]) -> ConfigType:\n    pass\n\n\n@overload\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    pass\n\n\ndef resolve_to_config_type(obj: object) -> Union[ConfigType, bool]:\n    from .field_utils import convert_fields_to_dict_type\n\n    # Short circuit if it's already a Config Type\n    if isinstance(obj, ConfigType):\n        return obj\n\n    if isinstance(obj, dict):\n        # Dicts of the special form {type: value} are treated as Maps\n        # mapping from the type to value type, otherwise treat as dict type\n        if len(obj) == 1:\n            key = next(iter(obj.keys()))\n            key_type = resolve_to_config_type(key)\n            if not isinstance(key, str):\n                if not key_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid key in map specification: {key!r} in map {obj}"\n                    )\n\n                if not key_type.kind == ConfigTypeKind.SCALAR:  # type: ignore\n                    raise DagsterInvalidDefinitionError(\n                        f"Non-scalar key in map specification: {key!r} in map {obj}"\n                    )\n\n                inner_type = resolve_to_config_type(obj[key])\n\n                if not inner_type:\n                    raise DagsterInvalidDefinitionError(\n                        f"Invalid value in map specification: {obj[str]!r} in map {obj}"\n                    )\n                return Map(key_type, inner_type)\n        return convert_fields_to_dict_type(obj)\n\n    if isinstance(obj, list):\n        if len(obj) != 1:\n            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")\n\n        inner_type = resolve_to_config_type(obj[0])\n\n        if not inner_type:\n            raise 
DagsterInvalidDefinitionError(\n                f"Invalid member of array specification: {obj[0]!r} in list {obj}"\n            )\n        return Array(inner_type)\n\n    if BuiltinEnum.contains(obj):\n        return ConfigType.from_builtin_enum(obj)\n\n    from .primitive_mapping import (\n        is_supported_config_python_builtin,\n        remap_python_builtin_for_config,\n    )\n\n    if is_supported_config_python_builtin(obj):\n        return remap_python_builtin_for_config(obj)\n\n    if obj is None:\n        return ConfigAnyInstance\n\n    # Special error messages for passing a DagsterType\n    from dagster._core.types.dagster_type import DagsterType, List, ListType\n    from dagster._core.types.python_set import Set, _TypedPythonSet\n    from dagster._core.types.python_tuple import Tuple, _TypedPythonTuple\n\n    if _is_config_type_class(obj):\n        check.param_invariant(\n            False,\n            "dagster_type",\n            f"Cannot pass config type class {obj} to resolve_to_config_type. This error usually"\n            " occurs when you pass a dagster config type class instead of a class instance into"\n            ' another dagster config type. E.g. "Noneable(Permissive)" should instead be'\n            ' "Noneable(Permissive())".',\n        )\n\n    if isinstance(obj, type) and is_subclass(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed a DagsterType class {obj!r} to the config system. "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}"\n        )\n\n    if is_closed_python_optional_type(obj):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use typing.Optional as a config type. 
If you want this field to be "\n            "optional, please use Field(<type>, is_required=False), and if you want this field to "\n            "be required, but accept a value of None, use dagster.Noneable(<type>)."\n        )\n\n    if is_typing_type(obj):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed in {obj} to the config system. Types from "\n            "the typing module in python are not allowed in the config system. "\n            "You must use types that are imported from dagster or primitive types "\n            "such as bool, int, etc."\n        )\n\n    if obj is List or isinstance(obj, ListType):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use List in the context of config. " + helpful_list_error_string()\n        )\n\n    if obj is Set or isinstance(obj, _TypedPythonSet):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Set in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if obj is Tuple or isinstance(obj, _TypedPythonTuple):\n        raise DagsterInvalidDefinitionError(\n            "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()\n        )\n\n    if isinstance(obj, DagsterType):\n        raise DagsterInvalidDefinitionError(\n            f"You have passed an instance of DagsterType {obj.display_name} to the config "\n            f"system (Repr of type: {obj!r}). "\n            "The DagsterType and config schema systems are separate. "\n            f"Valid config values are:\\n{VALID_CONFIG_DESC}",\n        )\n\n    # This means that this is an error and we are return False to a callsite\n    # We do the error reporting there because those callsites have more context\n    return False\n\n\ndef has_implicit_default(config_type):\n    if config_type.kind == ConfigTypeKind.NONEABLE:\n        return True\n\n    return all_optional_type(config_type)\n\n\n
[docs]class Field:\n """Defines the schema for a configuration field.\n\n Fields are used in config schema instead of bare types when one wants to add a description,\n a default value, or to mark it as not required.\n\n Config fields are parsed according to their schemas in order to yield values available at\n job execution time through the config system. Config fields can be set on ops, on\n loaders for custom, and on other pluggable components of the system, such as resources, loggers,\n and executors.\n\n\n Args:\n config (Any): The schema for the config. This value can be any of:\n\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type:\n\n * :py:data:`~dagster.Any`\n * :py:class:`~dagster.Array`\n * :py:data:`~dagster.Bool`\n * :py:data:`~dagster.Enum`\n * :py:data:`~dagster.Float`\n * :py:data:`~dagster.Int`\n * :py:data:`~dagster.IntSource`\n * :py:data:`~dagster.Noneable`\n * :py:class:`~dagster.Permissive`\n * :py:class:`~dagster.ScalarUnion`\n * :py:class:`~dagster.Selector`\n * :py:class:`~dagster.Shape`\n * :py:data:`~dagster.String`\n * :py:data:`~dagster.StringSource`\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n default_value (Any):\n A default value for this field, conformant to the schema set by the ``dagster_type``\n argument. 
If a default value is provided, ``is_required`` should be ``False``.\n\n Note: for config types that do post processing such as Enum, this value must be\n the pre processed version, ie use ``ExampleEnum.VALUE.name`` instead of\n ``ExampleEnum.VALUE``\n\n is_required (bool):\n Whether the presence of this field is required. Defaults to true. If ``is_required``\n is ``True``, no default value should be provided.\n\n description (str):\n A human-readable description of this config field.\n\n Examples:\n .. code-block:: python\n\n @op(\n config_schema={\n 'word': Field(str, description='I am a word.'),\n 'repeats': Field(Int, default_value=1, is_required=False),\n }\n )\n def repeat_word(context):\n return context.op_config['word'] * context.op_config['repeats']\n """\n\n def _resolve_config_arg(self, config):\n if isinstance(config, ConfigType):\n return config\n\n config_type = resolve_to_config_type(config)\n if not config_type:\n raise DagsterInvalidDefinitionError(\n f"Attempted to pass {config!r} to a Field that expects a valid "\n "dagster type usable in config (e.g. 
Dict, Int, String et al)."\n )\n return config_type\n\n def __init__(\n self,\n config: Any,\n default_value: Any = FIELD_NO_DEFAULT_PROVIDED,\n is_required: Optional[bool] = None,\n description: Optional[str] = None,\n ):\n from .post_process import resolve_defaults\n from .validate import validate_config\n\n self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)\n\n self._description = check.opt_str_param(description, "description")\n\n check.opt_bool_param(is_required, "is_required")\n\n if default_value != FIELD_NO_DEFAULT_PROVIDED:\n check.param_invariant(\n not (callable(default_value)), "default_value", "default_value cannot be a callable"\n )\n\n if is_required is True:\n check.param_invariant(\n default_value == FIELD_NO_DEFAULT_PROVIDED,\n "default_value",\n "required arguments should not specify default values",\n )\n\n self._default_value = default_value\n\n # check explicit default value\n if self.default_provided:\n if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):\n raise DagsterInvalidDefinitionError(\n (\n "You have passed into a python enum value as the default value "\n "into of a config enum type {name}. You must pass in the underlying "\n "string represention as the default value. 
One of {value_set}."\n ).format(\n value_set=[ev.config_value for ev in self.config_type.enum_values],\n name=self.config_type.given_name,\n )\n )\n\n evr = validate_config(self.config_type, default_value)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Invalid default_value for Field.",\n evr.errors,\n default_value,\n )\n\n if is_required is None:\n is_optional = has_implicit_default(self.config_type) or self.default_provided\n is_required = not is_optional\n\n # on implicitly optional - set the default value\n # by resolving the defaults of the type\n if is_optional and not self.default_provided:\n evr = resolve_defaults(self.config_type, None)\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Unable to resolve implicit default_value for Field.",\n evr.errors,\n None,\n )\n self._default_value = evr.value\n self._is_required = is_required\n\n @public\n @property\n def is_required(self) -> bool:\n """Whether a value for this field must be provided at runtime.\n\n Cannot be True if a default value is provided.\n """\n return self._is_required\n\n @public\n @property\n def default_provided(self) -> bool:\n """Was a default value provided.\n\n Returns:\n bool: Yes or no\n """\n return self._default_value != FIELD_NO_DEFAULT_PROVIDED\n\n @public\n @property\n def default_value(self) -> Any:\n """The default value for the field.\n\n Raises an exception if no default value was provided.\n """\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return self._default_value\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of this config field, if provided."""\n return self._description\n\n @property\n def default_value_as_json_str(self) -> str:\n check.invariant(self.default_provided, "Asking for default value when none was provided")\n return serialize_value(self.default_value)\n\n def __repr__(self) -> str:\n return ("Field({config_type}, default={default}, 
is_required={is_required})").format(\n config_type=self.config_type,\n default=(\n "@" if self._default_value == FIELD_NO_DEFAULT_PROVIDED else self._default_value\n ),\n is_required=self.is_required,\n )
\n\n\ndef check_opt_field_param(obj: object, param_name: str) -> Optional[Field]:\n return check.opt_inst_param(cast(Optional[Field], obj), param_name, Field)\n
", "current_page_name": "_modules/dagster/_config/field", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field"}, "field_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.field_utils

\n# encoding: utf-8\nimport hashlib\nfrom typing import TYPE_CHECKING, Any, Dict, Iterator, List, Mapping, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidConfigDefinitionError\n\nfrom .config_type import Array, ConfigType, ConfigTypeKind\n\nif TYPE_CHECKING:\n    from dagster._config import Field\n\n\ndef all_optional_type(config_type: ConfigType) -> bool:\n    check.inst_param(config_type, "config_type", ConfigType)\n\n    if ConfigTypeKind.is_shape(config_type.kind):\n        for field in config_type.fields.values():  # type: ignore\n            if field.is_required:\n                return False\n        return True\n\n    if ConfigTypeKind.is_selector(config_type.kind):\n        if len(config_type.fields) == 1:  # type: ignore\n            for field in config_type.fields.values():  # type: ignore\n                if field.is_required:\n                    return False\n            return True\n\n    return False\n\n\nclass __FieldValueSentinel:\n    pass\n\n\nclass __InferOptionalCompositeFieldSentinel:\n    pass\n\n\nFIELD_NO_DEFAULT_PROVIDED = __FieldValueSentinel\n\nINFER_OPTIONAL_COMPOSITE_FIELD = __InferOptionalCompositeFieldSentinel\n\n\nclass _ConfigHasFields(ConfigType):\n    def __init__(self, fields, **kwargs):\n        self.fields = expand_fields_dict(fields)\n        super(_ConfigHasFields, self).__init__(**kwargs)\n\n    def type_iterator(self) -> Iterator["ConfigType"]:\n        for field in self.fields.values():\n            yield from field.config_type.type_iterator()\n        yield from super().type_iterator()\n\n\nFIELD_HASH_CACHE: Dict[str, Any] = {}\n\n\ndef _memoize_inst_in_field_cache(passed_cls, defined_cls, key):\n    if key in FIELD_HASH_CACHE:\n        return FIELD_HASH_CACHE[key]\n\n    defined_cls_inst = super(defined_cls, passed_cls).__new__(defined_cls)\n    defined_cls_inst._initialized = False  # noqa: SLF001\n    FIELD_HASH_CACHE[key] 
= defined_cls_inst\n    return defined_cls_inst\n\n\ndef _add_hash(m, string):\n    m.update(string.encode("utf-8"))\n\n\ndef compute_fields_hash(fields, description, field_aliases=None):\n    m = hashlib.sha1()  # so that hexdigest is 40, not 64 bytes\n    if description:\n        _add_hash(m, ":description: " + description)\n\n    for field_name in sorted(list(fields.keys())):\n        field = fields[field_name]\n        _add_hash(m, ":fieldname:" + field_name)\n        if field.default_provided:\n            _add_hash(m, ":default_value: " + field.default_value_as_json_str)\n        _add_hash(m, ":is_required: " + str(field.is_required))\n        _add_hash(m, ":type_key: " + field.config_type.key)\n        if field.description:\n            _add_hash(m, ":description: " + field.description)\n\n    field_aliases = check.opt_dict_param(\n        field_aliases, "field_aliases", key_type=str, value_type=str\n    )\n    for field_name in sorted(list(field_aliases.keys())):\n        field_alias = field_aliases[field_name]\n        _add_hash(m, ":fieldname: " + field_name)\n        _add_hash(m, ":fieldalias: " + field_alias)\n\n    return m.hexdigest()\n\n\ndef _define_shape_key_hash(fields, description, field_aliases):\n    return "Shape." + compute_fields_hash(fields, description, field_aliases=field_aliases)\n\n\n
[docs]class Shape(_ConfigHasFields):\n """Schema for configuration data with string keys and typed values via :py:class:`Field`.\n\n Unlike :py:class:`Permissive`, unspecified fields are not allowed and will throw a\n :py:class:`~dagster.DagsterInvalidConfigError`.\n\n Args:\n fields (Dict[str, Field]):\n The specification of the config dict.\n field_aliases (Dict[str, str]):\n Maps a string key to an alias that can be used instead of the original key. For example,\n an entry {"foo": "bar"} means that someone could use "bar" instead of "foo" as a\n top level string key.\n """\n\n def __new__(\n cls,\n fields,\n description=None,\n field_aliases=None,\n ):\n return _memoize_inst_in_field_cache(\n cls,\n Shape,\n _define_shape_key_hash(expand_fields_dict(fields), description, field_aliases),\n )\n\n def __init__(\n self,\n fields,\n description=None,\n field_aliases=None,\n ):\n # if we hit in the field cache - skip double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Shape, self).__init__(\n kind=ConfigTypeKind.STRICT_SHAPE,\n key=_define_shape_key_hash(fields, description, field_aliases),\n description=description,\n fields=fields,\n )\n self.field_aliases = check.opt_dict_param(\n field_aliases, "field_aliases", key_type=str, value_type=str\n )\n self._initialized = True
\n\n\n
[docs]class Map(ConfigType):\n """Defines a config dict with arbitrary scalar keys and typed values.\n\n A map can contrain arbitrary keys of the specified scalar type, each of which has\n type checked values. Unlike :py:class:`Shape` and :py:class:`Permissive`, scalar\n keys other than strings can be used, and unlike :py:class:`Permissive`, all\n values are type checked.\n\n Args:\n key_type (type):\n The type of keys this map can contain. Must be a scalar type.\n inner_type (type):\n The type of the values that this map type can contain.\n key_label_name (string):\n Optional name which describes the role of keys in the map.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Map({str: int})))\n def partially_specified_config(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __init__(self, key_type, inner_type, key_label_name=None):\n from .field import resolve_to_config_type\n\n self.key_type = resolve_to_config_type(key_type)\n self.inner_type = resolve_to_config_type(inner_type)\n self.given_name = key_label_name\n\n check.inst_param(self.key_type, "key_type", ConfigType)\n check.inst_param(self.inner_type, "inner_type", ConfigType)\n check.param_invariant(\n self.key_type.kind == ConfigTypeKind.SCALAR, "key_type", "Key type must be a scalar"\n )\n check.opt_str_param(self.given_name, "name")\n\n super(Map, self).__init__(\n key="Map.{key_type}.{inner_type}{name_key}".format(\n key_type=self.key_type.key,\n inner_type=self.inner_type.key,\n name_key=f":name: {key_label_name}" if key_label_name else "",\n ),\n # We use the given name field to store the key label name\n # this is used elsewhere to give custom types names\n given_name=key_label_name,\n type_params=[self.key_type, self.inner_type],\n kind=ConfigTypeKind.MAP,\n )\n\n @public\n @property\n def key_label_name(self) -> Optional[str]:\n """Name which describes the role of keys in the map, if provided."""\n return self.given_name\n\n def 
type_iterator(self) -> Iterator["ConfigType"]:\n yield from self.key_type.type_iterator()\n yield from self.inner_type.type_iterator()\n yield from super().type_iterator()
\n\n\ndef _define_permissive_dict_key(fields, description):\n return (\n "Permissive." + compute_fields_hash(fields, description=description)\n if fields\n else "Permissive"\n )\n\n\n
[docs]class Permissive(_ConfigHasFields):\n """Defines a config dict with a partially specified schema.\n\n A permissive dict allows partial specification of the config schema. Any fields with a\n specified schema will be type checked. Other fields will be allowed, but will be ignored by\n the type checker.\n\n Args:\n fields (Dict[str, Field]): The partial specification of the config dict.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(config_schema=Field(Permissive({'required': Field(String)})))\n def map_config_op(context) -> List:\n return sorted(list(context.op_config.items()))\n """\n\n def __new__(cls, fields=None, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Permissive,\n _define_permissive_dict_key(\n expand_fields_dict(fields) if fields else None, description\n ),\n )\n\n def __init__(self, fields=None, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields) if fields else None\n super(Permissive, self).__init__(\n key=_define_permissive_dict_key(fields, description),\n kind=ConfigTypeKind.PERMISSIVE_SHAPE,\n fields=fields or dict(),\n description=description,\n )\n self._initialized = True
\n\n\ndef _define_selector_key(fields, description):\n return "Selector." + compute_fields_hash(fields, description=description)\n\n\n
[docs]class Selector(_ConfigHasFields):\n """Define a config field requiring the user to select one option.\n\n Selectors are used when you want to be able to present several different options in config but\n allow only one to be selected. For example, a single input might be read in from either a csv\n file or a parquet file, but not both at once.\n\n Note that in some other type systems this might be called an 'input union'.\n\n Functionally, a selector is like a :py:class:`Dict`, except that only one key from the dict can\n be specified in valid config.\n\n Args:\n fields (Dict[str, Field]): The fields from which the user must select.\n\n **Examples:**\n\n .. code-block:: python\n\n @op(\n config_schema=Field(\n Selector(\n {\n 'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n 'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n 'en': {'whom': Field(String, default_value='world', is_required=False)},\n }\n ),\n is_required=False,\n default_value={'en': {'whom': 'world'}},\n )\n )\n def hello_world_with_default(context):\n if 'haw' in context.op_config:\n return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n if 'cn' in context.op_config:\n return '\u4f60\u597d, {whom}!'.format(whom=context.op_config['cn']['whom'])\n if 'en' in context.op_config:\n return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n """\n\n def __new__(cls, fields, description=None):\n return _memoize_inst_in_field_cache(\n cls,\n Selector,\n _define_selector_key(expand_fields_dict(fields), description),\n )\n\n def __init__(self, fields, description=None):\n # if we hit in field cache avoid double init\n if self._initialized:\n return\n\n fields = expand_fields_dict(fields)\n super(Selector, self).__init__(\n key=_define_selector_key(fields, description),\n kind=ConfigTypeKind.SELECTOR,\n fields=fields,\n description=description,\n )\n self._initialized = True
\n\n\n# Config syntax expansion code below\n\n\ndef is_potential_field(potential_field: object) -> bool:\n from .field import Field, resolve_to_config_type\n\n return isinstance(potential_field, (Field, dict, list)) or bool(\n resolve_to_config_type(potential_field)\n )\n\n\ndef convert_fields_to_dict_type(fields: Mapping[str, object]):\n return _convert_fields_to_dict_type(fields, fields, [])\n\n\ndef _convert_fields_to_dict_type(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Shape:\n return Shape(_expand_fields_dict(original_root, fields, stack))\n\n\ndef expand_fields_dict(fields: Mapping[str, object]) -> Mapping[str, "Field"]:\n return _expand_fields_dict(fields, fields, [])\n\n\ndef _expand_fields_dict(\n original_root: object, fields: Mapping[str, object], stack: List[str]\n) -> Mapping[str, "Field"]:\n check.mapping_param(fields, "fields")\n return {\n name: _convert_potential_field(original_root, value, stack + [name])\n for name, value in fields.items()\n }\n\n\ndef expand_list(original_root: object, the_list: Sequence[object], stack: List[str]) -> Array:\n if len(the_list) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_list, stack, "List must be of length 1"\n )\n\n inner_type = _convert_potential_type(original_root, the_list[0], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_list,\n stack,\n "List have a single item and contain a valid type i.e. [int]. 
Got item {}".format(\n repr(the_list[0])\n ),\n )\n\n return Array(inner_type)\n\n\ndef expand_map(original_root: object, the_dict: Mapping[object, object], stack: List[str]) -> Map:\n if len(the_dict) != 1:\n raise DagsterInvalidConfigDefinitionError(\n original_root, the_dict, stack, "Map dict must be of length 1"\n )\n\n key = next(iter(the_dict.keys()))\n key_type = _convert_potential_type(original_root, key, stack)\n if not key_type or not key_type.kind == ConfigTypeKind.SCALAR:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n f"Map dict must have a scalar type as its only key. Got key {key!r}",\n )\n\n inner_type = _convert_potential_type(original_root, the_dict[key], stack)\n if not inner_type:\n raise DagsterInvalidConfigDefinitionError(\n original_root,\n the_dict,\n stack,\n "Map must have a single value and contain a valid type i.e. {{str: int}}. Got item {}"\n .format(repr(the_dict[key])),\n )\n\n return Map(key_type, inner_type)\n\n\ndef convert_potential_field(potential_field: object) -> "Field":\n return _convert_potential_field(potential_field, potential_field, [])\n\n\ndef _convert_potential_type(original_root: object, potential_type, stack: List[str]):\n from .field import resolve_to_config_type\n\n if isinstance(potential_type, Mapping):\n # A dictionary, containing a single key which is a type (int, str, etc) and not a string is interpreted as a Map\n if len(potential_type) == 1:\n key = next(iter(potential_type.keys()))\n if not isinstance(key, str) and _convert_potential_type(original_root, key, stack):\n return expand_map(original_root, potential_type, stack)\n\n # Otherwise, the dictionary is interpreted as a Shape\n return Shape(_expand_fields_dict(original_root, potential_type, stack))\n\n if isinstance(potential_type, list):\n return expand_list(original_root, potential_type, stack)\n\n return resolve_to_config_type(potential_type)\n\n\ndef _convert_potential_field(\n original_root: object, 
potential_field: object, stack: List[str]\n) -> "Field":\n from .field import Field\n\n if potential_field is None:\n raise DagsterInvalidConfigDefinitionError(\n original_root, potential_field, stack, reason="Fields cannot be None"\n )\n\n if not is_potential_field(potential_field):\n raise DagsterInvalidConfigDefinitionError(original_root, potential_field, stack)\n\n if isinstance(potential_field, Field):\n return potential_field\n\n return Field(_convert_potential_type(original_root, potential_field, stack))\n\n\ndef config_dictionary_from_values(\n values: Mapping[str, Any], config_field: "Field"\n) -> Dict[str, Any]:\n """Converts a set of config values into a dictionary representation,\n in particular converting EnvVar objects into Dagster config inputs\n and processing data structures such as dicts, lists, and structured Config classes.\n """\n assert ConfigTypeKind.is_shape(config_field.config_type.kind)\n\n from dagster._config.pythonic_config import _config_value_to_dict_representation\n\n return check.is_dict(_config_value_to_dict_representation(None, values))\n\n\nclass IntEnvVar(int):\n """Class used to represent an environment variable in the Dagster config system.\n\n The environment variable will be resolved to an int value when the config is\n loaded.\n """\n\n name: str\n\n @classmethod\n def create(cls, name: str) -> "IntEnvVar":\n var = IntEnvVar(0)\n var.name = name\n return var\n\n\nclass EnvVar(str):\n """Class used to represent an environment variable in the Dagster config system.\n\n The environment variable will be resolved to a string value when the config is\n loaded.\n """\n\n @classmethod\n def int(cls, name: str) -> "IntEnvVar":\n return IntEnvVar.create(name=name)\n
", "current_page_name": "_modules/dagster/_config/field_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.field_utils"}, "pythonic_config": {"config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.config

\nimport re\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Dict,\n    Mapping,\n    Optional,\n    Set,\n    Type,\n    cast,\n)\n\nfrom pydantic import BaseModel, Extra\nfrom pydantic.fields import (\n    ModelField,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    Field,\n    Field as DagsterField,\n    Shape,\n)\nfrom dagster._config.field_utils import Permissive\nfrom dagster._core.definitions.definition_config_schema import (\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidConfigDefinitionError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidPythonicConfigDefinitionError,\n)\nfrom dagster._utils.cached_method import CACHED_METHOD_FIELD_SUFFIX\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .conversion_utils import _convert_pydantic_field, _is_pydantic_field_required, safe_is_subclass\nfrom .typing_utils import BaseConfigMeta\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nINTERNAL_MARKER = "__internal__"\n\n# ensure that this ends with the internal marker so we can do a single check\nassert CACHED_METHOD_FIELD_SUFFIX.endswith(INTERNAL_MARKER)\n\n\nclass MakeConfigCacheable(BaseModel):\n    """This class centralizes and implements all the chicanery we need in order\n    to support caching decorators. 
If we decide this is a bad idea we can remove it\n    all in one go.\n    """\n\n    # Pydantic config for this class\n    # Cannot use kwargs for base class as this is not support for pydnatic<1.8\n    class Config:\n        # Various pydantic model config (https://docs.pydantic.dev/usage/model_config/)\n        # Necessary to allow for caching decorators\n        arbitrary_types_allowed = True\n        # Avoid pydantic reading a cached property class as part of the schema\n        keep_untouched = (cached_property,)\n        # Ensure the class is serializable, for caching purposes\n        frozen = True\n\n    def __setattr__(self, name: str, value: Any):\n        from .resource import ConfigurableResourceFactory\n\n        # This is a hack to allow us to set attributes on the class that are not part of the\n        # config schema. Pydantic will normally raise an error if you try to set an attribute\n        # that is not part of the schema.\n\n        if self._is_field_internal(name):\n            object.__setattr__(self, name, value)\n            return\n\n        try:\n            return super().__setattr__(name, value)\n        except (TypeError, ValueError) as e:\n            clsname = self.__class__.__name__\n            if "is immutable and does not support item assignment" in str(e):\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support item assignment,"\n                        " as it inherits from 'pydantic.BaseModel' with frozen=True. 
If trying to"\n                        " maintain state on this resource, consider building a separate, stateful"\n                        " client class, and provide a method on the resource to construct and"\n                        " return the stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support item"\n                        " assignment, as it inherits from 'pydantic.BaseModel' with frozen=True."\n                    ) from e\n            elif "object has no field" in str(e):\n                field_name = check.not_none(\n                    re.search(r"object has no field \\"(.*)\\"", str(e))\n                ).group(1)\n                if isinstance(self, ConfigurableResourceFactory):\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic resource and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\". If trying to maintain"\n                        " state on this resource, consider building a separate, stateful client"\n                        " class, and provide a method on the resource to construct and return the"\n                        " stateful client."\n                    ) from e\n                else:\n                    raise DagsterInvalidInvocationError(\n                        f"'{clsname}' is a Pythonic config class and does not support manipulating"\n                        f" undeclared attribute '{field_name}' as it inherits from"\n                        " 'pydantic.BaseModel' without extra=\\"allow\\"."\n                    ) from e\n            else:\n                raise\n\n    def _is_field_internal(self, name: str) -> bool:\n        return name.endswith(INTERNAL_MARKER)\n\n\n
[docs]class Config(MakeConfigCacheable, metaclass=BaseConfigMeta):\n """Base class for Dagster configuration models, used to specify config schema for\n ops and assets. Subclasses :py:class:`pydantic.BaseModel`.\n\n Example definition:\n\n .. code-block:: python\n\n from pydantic import Field\n\n class MyAssetConfig(Config):\n my_str: str = "my_default_string"\n my_int_list: List[int]\n my_bool_with_metadata: bool = Field(default=False, description="A bool field")\n\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_with_config(config: MyAssetConfig):\n assert config.my_str == "my_default_string"\n assert config.my_int_list == [1, 2, 3]\n assert config.my_bool_with_metadata == False\n\n asset_with_config(MyAssetConfig(my_int_list=[1, 2, 3], my_bool_with_metadata=True))\n\n """\n\n def __init__(self, **config_dict) -> None:\n """This constructor is overridden to handle any remapping of raw config dicts to\n the appropriate config classes. For example, discriminated unions are represented\n in Dagster config as dicts with a single key, which is the discriminator value.\n """\n modified_data = {}\n for key, value in config_dict.items():\n field = self.__fields__.get(key)\n if field and field.field_info.discriminator:\n nested_dict = value\n\n discriminator_key = check.not_none(field.discriminator_key)\n if isinstance(value, Config):\n nested_dict = _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key,\n value._get_non_none_public_field_values(), # noqa: SLF001\n )\n\n nested_items = list(check.is_dict(nested_dict).items())\n check.invariant(\n len(nested_items) == 1,\n "Discriminated union must have exactly one key",\n )\n discriminated_value, nested_values = nested_items[0]\n\n modified_data[key] = {\n **nested_values,\n discriminator_key: discriminated_value,\n }\n else:\n modified_data[key] = value\n super().__init__(**modified_data)\n\n def _convert_to_config_dictionary(self) -> Mapping[str, Any]:\n """Converts this 
Config object to a Dagster config dictionary, in the same format as the dictionary\n accepted as run config or as YAML in the launchpad.\n\n Inner fields are recursively converted to dictionaries, meaning nested config objects\n or EnvVars will be converted to the appropriate dictionary representation.\n """\n public_fields = self._get_non_none_public_field_values()\n return {\n k: _config_value_to_dict_representation(self.__fields__.get(k), v)\n for k, v in public_fields.items()\n }\n\n def _get_non_none_public_field_values(self) -> Mapping[str, Any]:\n """Returns a dictionary representation of this config object,\n ignoring any private fields, and any optional fields that are None.\n\n Inner fields are returned as-is in the dictionary,\n meaning any nested config objects will be returned as config objects, not dictionaries.\n """\n output = {}\n for key, value in self.__dict__.items():\n if self._is_field_internal(key):\n continue\n field = self.__fields__.get(key)\n if field and value is None and not _is_pydantic_field_required(field):\n continue\n\n if field:\n output[field.alias] = value\n else:\n output[key] = value\n return output\n\n @classmethod\n def to_config_schema(cls) -> DefinitionConfigSchema:\n """Converts the config structure represented by this class into a DefinitionConfigSchema."""\n return DefinitionConfigSchema(infer_schema_from_config_class(cls))\n\n @classmethod\n def to_fields_dict(cls) -> Dict[str, DagsterField]:\n """Converts the config structure represented by this class into a dictionary of dagster.Fields.\n This is useful when interacting with legacy code that expects a dictionary of fields but you\n want the source of truth to be a config class.\n """\n return cast(Shape, cls.to_config_schema().as_field().config_type).fields
\n\n\ndef _discriminated_union_config_dict_to_selector_config_dict(\n discriminator_key: str, config_dict: Mapping[str, Any]\n):\n """Remaps a config dictionary which is a member of a discriminated union to\n the appropriate structure for a Dagster config selector.\n\n A discriminated union with key "my_key" and value "my_value" will be represented\n as {"my_key": "my_value", "my_field": "my_field_value"}. When converted to a selector,\n this should be represented as {"my_value": {"my_field": "my_field_value"}}.\n """\n updated_dict = dict(config_dict)\n discriminator_value = updated_dict.pop(discriminator_key)\n wrapped_dict = {discriminator_value: updated_dict}\n return wrapped_dict\n\n\ndef _config_value_to_dict_representation(field: Optional[ModelField], value: Any):\n """Converts a config value to a dictionary representation. If a field is provided, it will be used\n to determine the appropriate dictionary representation in the case of discriminated unions.\n """\n from dagster._config.field_utils import EnvVar, IntEnvVar\n\n if isinstance(value, dict):\n return {k: _config_value_to_dict_representation(None, v) for k, v in value.items()}\n elif isinstance(value, list):\n return [_config_value_to_dict_representation(None, v) for v in value]\n elif isinstance(value, EnvVar):\n return {"env": str(value)}\n elif isinstance(value, IntEnvVar):\n return {"env": value.name}\n if isinstance(value, Config):\n if field and field.discriminator_key:\n return {\n k: v\n for k, v in _discriminated_union_config_dict_to_selector_config_dict(\n field.discriminator_key,\n value._convert_to_config_dictionary(), # noqa: SLF001\n ).items()\n }\n else:\n return {k: v for k, v in value._convert_to_config_dictionary().items()} # noqa: SLF001\n elif isinstance(value, Enum):\n return value.name\n\n return value\n\n\n
[docs]class PermissiveConfig(Config):\n """Subclass of :py:class:`Config` that allows arbitrary extra fields. This is useful for\n config classes which may have open-ended inputs.\n\n Example definition:\n\n .. code-block:: python\n\n class MyPermissiveOpConfig(PermissiveConfig):\n my_explicit_parameter: bool\n my_other_explicit_parameter: str\n\n\n Example usage:\n\n .. code-block:: python\n\n @op\n def op_with_config(config: MyPermissiveOpConfig):\n assert config.my_explicit_parameter == True\n assert config.my_other_explicit_parameter == "foo"\n assert config.dict().get("my_implicit_parameter") == "bar"\n\n op_with_config(\n MyPermissiveOpConfig(\n my_explicit_parameter=True,\n my_other_explicit_parameter="foo",\n my_implicit_parameter="bar"\n )\n )\n\n """\n\n # Pydantic config for this class\n # Cannot use kwargs for base class as this is not support for pydantic<1.8\n class Config:\n extra = "allow"
\n\n\ndef infer_schema_from_config_class(\n model_cls: Type["Config"],\n description: Optional[str] = None,\n fields_to_omit: Optional[Set[str]] = None,\n) -> Field:\n from .config import Config\n from .resource import ConfigurableResourceFactory\n\n """Parses a structured config class and returns a corresponding Dagster config Field."""\n fields_to_omit = fields_to_omit or set()\n\n check.param_invariant(\n safe_is_subclass(model_cls, Config),\n "Config type annotation must inherit from dagster.Config",\n )\n\n fields: Dict[str, Field] = {}\n for pydantic_field in model_cls.__fields__.values():\n if pydantic_field.name not in fields_to_omit:\n if isinstance(pydantic_field.default, Field):\n raise DagsterInvalidDefinitionError(\n "Using 'dagster.Field' is not supported within a Pythonic config or resource"\n " definition. 'dagster.Field' should only be used in legacy Dagster config"\n " schemas. Did you mean to use 'pydantic.Field' instead?"\n )\n\n try:\n fields[pydantic_field.alias] = _convert_pydantic_field(\n pydantic_field,\n )\n except DagsterInvalidConfigDefinitionError as e:\n raise DagsterInvalidPythonicConfigDefinitionError(\n config_class=model_cls,\n field_name=pydantic_field.name,\n invalid_type=e.current_value,\n is_resource=model_cls is not None\n and safe_is_subclass(model_cls, ConfigurableResourceFactory),\n )\n\n shape_cls = Permissive if model_cls.__config__.extra == Extra.allow else Shape\n\n docstring = model_cls.__doc__.strip() if model_cls.__doc__ else None\n\n return Field(config=shape_cls(fields), description=description or docstring)\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.config"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.io_manager

\nfrom abc import abstractmethod\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Mapping,\n    Optional,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeVar\n\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n)\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n)\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\nfrom .config import Config\nfrom .conversion_utils import TResValue\nfrom .inheritance_utils import safe_is_subclass\nfrom .resource import (\n    AllowDelayedDependencies,\n    ConfigurableResourceFactory,\n    PartialResource,\n    ResourceId,\n    ResourceWithKeyMapping,\n    Self,\n)\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nTIOManagerValue = TypeVar("TIOManagerValue", bound=IOManager)\n\n\nclass ConfigurableIOManagerFactoryResourceDefinition(IOManagerDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        input_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        output_config_schema: Optional[Union[CoercableToConfigSchema, Type[Config]]] = None,\n        dagster_maintained: bool = False,\n    ):\n        input_config_schema_resolved: CoercableToConfigSchema = 
(\n            cast(Type[Config], input_config_schema).to_config_schema()\n            if safe_is_subclass(input_config_schema, Config)\n            else cast(CoercableToConfigSchema, input_config_schema)\n        )\n        output_config_schema_resolved: CoercableToConfigSchema = (\n            cast(Type[Config], output_config_schema).to_config_schema()\n            if safe_is_subclass(output_config_schema, Config)\n            else cast(CoercableToConfigSchema, output_config_schema)\n        )\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n            input_config_schema=input_config_schema_resolved,\n            output_config_schema=output_config_schema_resolved,\n        )\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._configurable_resource_cls = configurable_resource_cls\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n\nclass IOManagerWithKeyMapping(ResourceWithKeyMapping, IOManagerDefinition):\n    """Version of ResourceWithKeyMapping wrapper that also implements IOManagerDefinition."""\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        ResourceWithKeyMapping.__init__(self, resource, resource_id_to_key_mapping)\n        IOManagerDefinition.__init__(\n            self, resource_fn=self.resource_fn, config_schema=resource.config_schema\n        )\n\n\n
[docs]class ConfigurableIOManagerFactory(ConfigurableResourceFactory[TIOManagerValue]):\n """Base class for Dagster IO managers that utilize structured config. This base class\n is useful for cases in which the returned IO manager is not the same as the class itself\n (e.g. when it is a wrapper around the actual IO manager implementation).\n\n This class is a subclass of both :py:class:`IOManagerDefinition` and :py:class:`Config`.\n Implementers should provide an implementation of the :py:meth:`resource_function` method,\n which should return an instance of :py:class:`IOManager`.\n\n\n Example definition:\n\n .. code-block:: python\n\n class ExternalIOManager(IOManager):\n\n def __init__(self, connection):\n self._connection = connection\n\n def handle_output(self, context, obj):\n ...\n\n def load_input(self, context):\n ...\n\n class ConfigurableExternalIOManager(ConfigurableIOManagerFactory):\n username: str\n password: str\n\n def create_io_manager(self, context) -> IOManager:\n with database.connect(username, password) as connection:\n return MyExternalIOManager(connection)\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": ConfigurableExternalIOManager(\n username="dagster",\n password=EnvVar("DB_PASSWORD")\n )\n }\n )\n\n """\n\n def __init__(self, **data: Any):\n ConfigurableResourceFactory.__init__(self, **data)\n\n @abstractmethod\n def create_io_manager(self, context) -> TIOManagerValue:\n """Implement as one would implement a @io_manager decorator function."""\n raise NotImplementedError()\n\n def create_resource(self, context: InitResourceContext) -> TIOManagerValue:\n return self.create_io_manager(context)\n\n @classmethod\n def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialIOManager[Self]":\n """Returns a partially initialized copy of the IO manager, with remaining config fields\n set at runtime.\n """\n return PartialIOManager(cls, data=kwargs)\n\n @cached_method\n def get_resource_definition(self) -> 
ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self._get_initialize_and_run_fn(),\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n input_config_schema=self.__class__.input_config_schema(),\n output_config_schema=self.__class__.output_config_schema(),\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n @classmethod\n def input_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None\n\n @classmethod\n def output_config_schema(\n cls,\n ) -> Optional[Union[CoercableToConfigSchema, Type[Config]]]:\n return None
\n\n\nclass PartialIOManager(Generic[TResValue], PartialResource[TResValue]):\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n PartialResource.__init__(self, resource_cls, data)\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n input_config_schema = None\n output_config_schema = None\n if safe_is_subclass(self.resource_cls, ConfigurableIOManagerFactory):\n factory_cls: Type[ConfigurableIOManagerFactory] = cast(\n Type[ConfigurableIOManagerFactory], self.resource_cls\n )\n input_config_schema = factory_cls.input_config_schema()\n output_config_schema = factory_cls.output_config_schema()\n\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self._state__internal__.nested_resources,\n input_config_schema=input_config_schema,\n output_config_schema=output_config_schema,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\n
[docs]class ConfigurableIOManager(ConfigurableIOManagerFactory, IOManager):\n """Base class for Dagster IO managers that utilize structured config.\n\n This class is a subclass of both :py:class:`IOManagerDefinition`, :py:class:`Config`,\n and :py:class:`IOManager`. Implementers must provide an implementation of the\n :py:meth:`handle_output` and :py:meth:`load_input` methods.\n\n Example definition:\n\n .. code-block:: python\n\n class MyIOManager(ConfigurableIOManager):\n path_prefix: List[str]\n\n def _get_path(self, context) -> str:\n return "/".join(context.asset_key.path)\n\n def handle_output(self, context, obj):\n write_csv(self._get_path(context), obj)\n\n def load_input(self, context):\n return read_csv(self._get_path(context))\n\n defs = Definitions(\n ...,\n resources={\n "io_manager": MyIOManager(path_prefix=["my", "prefix"])\n }\n )\n\n """\n\n def create_io_manager(self, context) -> IOManager:\n return self
\n\n\nclass ConfigurableLegacyIOManagerAdapter(ConfigurableIOManagerFactory):\n """Adapter base class for wrapping a decorated, function-style I/O manager\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_io_manager`` method.\n\n Example:\n .. code-block:: python\n\n class OldIOManager(IOManager):\n def __init__(self, base_path: str):\n ...\n\n @io_manager(config_schema={"base_path": str})\n def old_io_manager(context):\n base_path = context.resource_config["base_path"]\n\n return OldIOManager(base_path)\n\n class MyIOManager(ConfigurableLegacyIOManagerAdapter):\n base_path: str\n\n @property\n def wrapped_io_manager(self) -> IOManagerDefinition:\n return old_io_manager\n """\n\n @property\n @abstractmethod\n def wrapped_io_manager(self) -> IOManagerDefinition:\n raise NotImplementedError()\n\n def create_io_manager(self, context) -> IOManager:\n raise NotImplementedError(\n "Because we override resource_fn in the adapter, this is never called."\n )\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableIOManagerFactoryResourceDefinition:\n return ConfigurableIOManagerFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_io_manager.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.pythonic_config.resource

\nimport contextlib\nimport inspect\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generator,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeGuard, get_args, get_origin\n\nfrom dagster import (\n    Field as DagsterField,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.field_utils import config_dictionary_from_values\nfrom dagster._config.pythonic_config.typing_utils import (\n    TypecheckAllowPartialResourceInitParams,\n)\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.definition_config_schema import (\n    ConfiguredDefinitionConfigSchema,\n    DefinitionConfigSchema,\n)\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.context.init import InitResourceContext, build_init_resource_context\nfrom dagster._utils.cached_method import cached_method\n\nfrom .attach_other_object_to_context import (\n    IAttachDifferentObjectToOpContext as IAttachDifferentObjectToOpContext,\n)\n\ntry:\n    from functools import cached_property  # type: ignore  # (py37 compat)\nexcept ImportError:\n\n    class cached_property:\n        pass\n\n\nfrom abc import ABC, abstractmethod\n\nfrom pydantic import BaseModel\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    ResourceFunction,\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n    has_at_least_one_parameter,\n)\nfrom dagster._core.storage.io_manager import IOManagerDefinition\n\nfrom .config import Config, MakeConfigCacheable, infer_schema_from_config_class\nfrom .conversion_utils import (\n    TResValue,\n    _curry_config_schema,\n)\nfrom .typing_utils import BaseResourceMeta, 
LateBoundTypesForResourceTypeChecking\n\nSelf = TypeVar("Self", bound="ConfigurableResourceFactory")\nResourceId: TypeAlias = int\n\n\nclass AllowDelayedDependencies:\n    _nested_partial_resources: Mapping[str, ResourceDefinition] = {}\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        # All dependent resources which are not fully configured\n        # must be specified to the Definitions object so that the\n        # resource can be configured at runtime by the user\n        nested_partial_resource_keys = {\n            attr_name: resource_mapping.get(id(resource_def))\n            for attr_name, resource_def in self._nested_partial_resources.items()\n        }\n        check.invariant(\n            all(pointer_key is not None for pointer_key in nested_partial_resource_keys.values()),\n            "Any partially configured, nested resources must be provided to Definitions"\n            f" object: {nested_partial_resource_keys}",\n        )\n\n        # Recursively get all nested resource keys\n        nested_resource_required_keys: Set[str] = set()\n        for v in self._nested_partial_resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(v, resource_mapping)\n            )\n\n        resources, _ = separate_resource_params(\n            cast(Type[BaseModel], self.__class__), self.__dict__\n        )\n        for v in resources.values():\n            nested_resource_required_keys.update(\n                _resolve_required_resource_keys_for_resource(\n                    wrap_resource_for_execution(v), resource_mapping\n                )\n            )\n\n        out = set(cast(Set[str], nested_partial_resource_keys.values())).union(\n            nested_resource_required_keys\n        )\n        return out\n\n\nclass 
InitResourceContextWithKeyMapping(InitResourceContext):\n    """Passes along a mapping from ResourceDefinition id to resource key alongside the\n    InitResourceContext. This is used to resolve the required resource keys for\n    resources which may hold nested partial resources.\n    """\n\n    def __init__(\n        self,\n        context: InitResourceContext,\n        resource_id_to_key_mapping: Mapping[ResourceId, str],\n    ):\n        super().__init__(\n            resource_config=context.resource_config,\n            resources=context.resources,\n            instance=context.instance,\n            resource_def=context.resource_def,\n            dagster_run=context.dagster_run,\n            log_manager=context.log,\n        )\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n        self._resources_by_id = {\n            resource_id: getattr(context.resources, resource_key, None)\n            for resource_id, resource_key in resource_id_to_key_mapping.items()\n        }\n\n    @property\n    def resources_by_id(self) -> Mapping[ResourceId, Any]:\n        return self._resources_by_id\n\n    def replace_config(self, config: Any) -> "InitResourceContext":\n        return InitResourceContextWithKeyMapping(\n            super().replace_config(config), self._resource_id_to_key_mapping\n        )\n\n\nclass ResourceWithKeyMapping(ResourceDefinition):\n    """Wrapper around a ResourceDefinition which helps the inner resource resolve its required\n    resource keys. This is useful for resources which may hold nested resources. 
At construction\n    time, they are unaware of the resource keys of their nested resources - the resource id to\n    key mapping is used to resolve this.\n    """\n\n    def __init__(\n        self,\n        resource: ResourceDefinition,\n        resource_id_to_key_mapping: Dict[ResourceId, str],\n    ):\n        self._resource = resource\n        self._resource_id_to_key_mapping = resource_id_to_key_mapping\n\n        ResourceDefinition.__init__(\n            self,\n            resource_fn=self.setup_context_resources_and_call,\n            config_schema=resource.config_schema,\n            description=resource.description,\n            version=resource.version,\n        )\n\n    def setup_context_resources_and_call(self, context: InitResourceContext):\n        """Wrapper around the wrapped resource's resource_fn which attaches its\n        resource id to key mapping to the context, and then calls the nested resource's resource_fn.\n        """\n        context_with_key_mapping = InitResourceContextWithKeyMapping(\n            context, self._resource_id_to_key_mapping\n        )\n\n        if has_at_least_one_parameter(self._resource.resource_fn):\n            return self._resource.resource_fn(context_with_key_mapping)\n        else:\n            return cast(ResourceFunctionWithoutContext, self._resource.resource_fn)()\n\n    @property\n    def required_resource_keys(self) -> AbstractSet[str]:\n        return _resolve_required_resource_keys_for_resource(\n            self._resource, self._resource_id_to_key_mapping\n        )\n\n    @property\n    def wrapped_resource(self) -> ResourceDefinition:\n        return self._resource\n\n    @property\n    def inner_resource(self):\n        return self._resource\n\n\ndef attach_resource_id_to_key_mapping(\n    resource_def: Any, resource_id_to_key_mapping: Dict[ResourceId, str]\n) -> Any:\n    from .io_manager import IOManagerWithKeyMapping\n\n    if isinstance(resource_def, (ConfigurableResourceFactory, 
PartialResource)):\n        defn = resource_def.get_resource_definition()\n        return (\n            IOManagerWithKeyMapping(defn, resource_id_to_key_mapping)\n            if isinstance(defn, IOManagerDefinition)\n            else ResourceWithKeyMapping(defn, resource_id_to_key_mapping)\n        )\n    return resource_def\n\n\nCoercibleToResource: TypeAlias = Union[\n    ResourceDefinition, "ConfigurableResourceFactory", "PartialResource"\n]\n\n\ndef is_coercible_to_resource(val: Any) -> TypeGuard[CoercibleToResource]:\n    return isinstance(val, (ResourceDefinition, ConfigurableResourceFactory, PartialResource))\n\n\nclass ConfigurableResourceFactoryResourceDefinition(ResourceDefinition, AllowDelayedDependencies):\n    def __init__(\n        self,\n        configurable_resource_cls: Type,\n        resource_fn: ResourceFunction,\n        config_schema: Any,\n        description: Optional[str],\n        resolve_resource_keys: Callable[[Mapping[int, str]], AbstractSet[str]],\n        nested_resources: Mapping[str, Any],\n        dagster_maintained: bool = False,\n    ):\n        super().__init__(\n            resource_fn=resource_fn,\n            config_schema=config_schema,\n            description=description,\n        )\n        self._configurable_resource_cls = configurable_resource_cls\n        self._resolve_resource_keys = resolve_resource_keys\n        self._nested_resources = nested_resources\n        self._dagster_maintained = dagster_maintained\n\n    @property\n    def configurable_resource_cls(self) -> Type:\n        return self._configurable_resource_cls\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    def _resolve_required_resource_keys(\n        self, resource_mapping: Mapping[int, str]\n    ) -> AbstractSet[str]:\n        return self._resolve_resource_keys(resource_mapping)\n\n    def _is_dagster_maintained(self) -> bool:\n        return 
self._dagster_maintained\n\n\nclass ConfigurableResourceFactoryState(NamedTuple):\n    nested_partial_resources: Mapping[str, Any]\n    resolved_config_dict: Dict[str, Any]\n    config_schema: DefinitionConfigSchema\n    schema: DagsterField\n    nested_resources: Dict[str, Any]\n    resource_context: Optional[InitResourceContext]\n\n\nclass ConfigurableResourceFactory(\n    Generic[TResValue],\n    Config,\n    TypecheckAllowPartialResourceInitParams,\n    AllowDelayedDependencies,\n    ABC,\n    metaclass=BaseResourceMeta,\n):\n    """Base class for creating and managing the lifecycle of Dagster resources that utilize structured config.\n\n    Users should directly inherit from this class when they want the object passed to user-defined\n    code (such as an asset or op) to be different than the object that defines the configuration\n    schema and is passed to the :py:class:`Definitions` object. Cases where this is useful include is\n    when the object passed to user code is:\n\n    * An existing class from a third-party library that the user does not control.\n    * A complex class that requires substantial internal state management or itself requires arguments beyond its config values.\n    * A class with expensive initialization that should not be invoked on code location load, but rather lazily on first use in an op or asset during a run.\n    * A class that you desire to be a plain Python class, rather than a Pydantic class, for whatever reason.\n\n    This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`, and\n    must implement ``create_resource``, which creates the resource to pass to user code.\n\n    Example definition:\n\n    .. 
code-block:: python\n\n        class DatabaseResource(ConfigurableResourceFactory[Database]):\n            connection_uri: str\n\n            def create_resource(self, _init_context) -> Database:\n                # For example Database could be from a third-party library or require expensive setup.\n                # Or you could just prefer to separate the concerns of configuration and runtime representation\n                return Database(self.connection_uri)\n\n    To use a resource created by a factory in a job, you must use the Resource type annotation.\n\n    Example usage:\n\n\n    .. code-block:: python\n\n        @asset\n        def asset_that_uses_database(database: ResourceParam[Database]):\n            # Database used directly in user code\n            database.query("SELECT * FROM table")\n\n        defs = Definitions(\n            assets=[asset_that_uses_database],\n            resources={"database": DatabaseResource(connection_uri="some_uri")},\n        )\n\n    """\n\n    def __init__(self, **data: Any):\n        resource_pointers, data_without_resources = separate_resource_params(self.__class__, data)\n\n        schema = infer_schema_from_config_class(\n            self.__class__, fields_to_omit=set(resource_pointers.keys())\n        )\n\n        # Populate config values\n        Config.__init__(self, **{**data_without_resources, **resource_pointers})\n\n        # We pull the values from the Pydantic config object, which may cast values\n        # to the correct type under the hood - useful in particular for enums\n        casted_data_without_resources = {\n            k: v\n            for k, v in self._convert_to_config_dictionary().items()\n            if k in data_without_resources\n        }\n        resolved_config_dict = config_dictionary_from_values(casted_data_without_resources, schema)\n\n        self._state__internal__ = ConfigurableResourceFactoryState(\n            # We keep track of any resources we depend on which are not fully 
configured\n            # so that we can retrieve them at runtime\n            nested_partial_resources={\n                k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n            },\n            resolved_config_dict=resolved_config_dict,\n            # These are unfortunately named very similarily\n            config_schema=_curry_config_schema(schema, resolved_config_dict),\n            schema=schema,\n            nested_resources={k: v for k, v in resource_pointers.items()},\n            resource_context=None,\n        )\n\n    @property\n    def _schema(self):\n        return self._state__internal__.schema\n\n    @property\n    def _config_schema(self):\n        return self._state__internal__.config_schema\n\n    @property\n    def _nested_partial_resources(self):\n        return self._state__internal__.nested_partial_resources\n\n    @property\n    def _nested_resources(self):\n        return self._state__internal__.nested_resources\n\n    @property\n    def _resolved_config_dict(self):\n        return self._state__internal__.resolved_config_dict\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        """This should be overridden to return True by all dagster maintained resources and IO managers."""\n        return False\n\n    @classmethod\n    def _is_cm_resource_cls(cls: Type["ConfigurableResourceFactory"]) -> bool:\n        return (\n            cls.yield_for_execution != ConfigurableResourceFactory.yield_for_execution\n            or cls.teardown_after_execution != ConfigurableResourceFactory.teardown_after_execution\n        )\n\n    @property\n    def _is_cm_resource(self) -> bool:\n        return self.__class__._is_cm_resource_cls()  # noqa: SLF001\n\n    def _get_initialize_and_run_fn(self) -> Callable:\n        return self._initialize_and_run_cm if self._is_cm_resource else self._initialize_and_run\n\n    @cached_method\n    def get_resource_definition(self) -> 
ConfigurableResourceFactoryResourceDefinition:\n        return ConfigurableResourceFactoryResourceDefinition(\n            self.__class__,\n            resource_fn=self._get_initialize_and_run_fn(),\n            config_schema=self._config_schema,\n            description=self.__doc__,\n            resolve_resource_keys=self._resolve_required_resource_keys,\n            nested_resources=self.nested_resources,\n            dagster_maintained=self._is_dagster_maintained(),\n        )\n\n    @abstractmethod\n    def create_resource(self, context: InitResourceContext) -> TResValue:\n        """Returns the object that this resource hands to user code, accessible by ops or assets\n        through the context or resource parameters. This works like the function decorated\n        with @resource when using function-based resources.\n        """\n        raise NotImplementedError()\n\n    @property\n    def nested_resources(\n        self,\n    ) -> Mapping[str, Any]:\n        return self._nested_resources\n\n    @classmethod\n    def configure_at_launch(cls: "Type[Self]", **kwargs) -> "PartialResource[Self]":\n        """Returns a partially initialized copy of the resource, with remaining config fields\n        set at runtime.\n        """\n        return PartialResource(cls, data=kwargs)\n\n    def _with_updated_values(\n        self, values: Optional[Mapping[str, Any]]\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given values.\n        Used when initializing a resource at runtime.\n        """\n        values = check.opt_mapping_param(values, "values", key_type=str)\n        # Since Resource extends BaseModel and is a dataclass, we know that the\n        # signature of any __init__ method will always consist of the fields\n        # of this class. 
We can therefore safely pass in the values as kwargs.\n        out = self.__class__(**{**self._get_non_none_public_field_values(), **values})\n        out._state__internal__ = out._state__internal__._replace(  # noqa: SLF001\n            resource_context=self._state__internal__.resource_context\n        )\n        return out\n\n    @contextlib.contextmanager\n    def _resolve_and_update_nested_resources(\n        self, context: InitResourceContext\n    ) -> Generator["ConfigurableResourceFactory[TResValue]", None, None]:\n        """Updates any nested resources with the resource values from the context.\n        In this case, populating partially configured resources or\n        resources that return plain Python types.\n\n        Returns a new instance of the resource.\n        """\n        from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n        partial_resources_to_update: Dict[str, Any] = {}\n        if self._nested_partial_resources:\n            context_with_mapping = cast(\n                InitResourceContextWithKeyMapping,\n                check.inst(\n                    context,\n                    InitResourceContextWithKeyMapping,\n                    "This ConfiguredResource contains unresolved partially-specified nested"\n                    " resources, and so can only be initialized using a"\n                    " InitResourceContextWithKeyMapping",\n                ),\n            )\n            partial_resources_to_update = {\n                attr_name: context_with_mapping.resources_by_id[id(resource)]\n                for attr_name, resource in self._nested_partial_resources.items()\n            }\n\n        # Also evaluate any resources that are not partial\n        with contextlib.ExitStack() as stack:\n            resources_to_update, _ = separate_resource_params(self.__class__, self.__dict__)\n            resources_to_update = {\n                attr_name: _call_resource_fn_with_default(\n                   
 stack, wrap_resource_for_execution(resource), context\n                )\n                for attr_name, resource in resources_to_update.items()\n                if attr_name not in partial_resources_to_update\n            }\n\n            to_update = {**resources_to_update, **partial_resources_to_update}\n            yield self._with_updated_values(to_update)\n\n    @deprecated(\n        breaking_version="2.0", additional_warn_text="Use `with_replaced_resource_context` instead"\n    )\n    def with_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        return self.with_replaced_resource_context(resource_context)\n\n    def with_replaced_resource_context(\n        self, resource_context: InitResourceContext\n    ) -> "ConfigurableResourceFactory[TResValue]":\n        """Returns a new instance of the resource with the given resource init context bound."""\n        # This utility is used to create a copy of this resource, without adjusting\n        # any values in this case\n        copy = self._with_updated_values({})\n        copy._state__internal__ = copy._state__internal__._replace(  # noqa: SLF001\n            resource_context=resource_context\n        )\n        return copy\n\n    def _initialize_and_run(self, context: InitResourceContext) -> TResValue:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n            updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            updated_resource.setup_for_execution(context)\n            return updated_resource.create_resource(context)\n\n    @contextlib.contextmanager\n    def _initialize_and_run_cm(\n        self, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        with self._resolve_and_update_nested_resources(context) as has_nested_resource:\n 
           updated_resource = has_nested_resource.with_replaced_resource_context(  # noqa: SLF001\n                context\n            )._with_updated_values(context.resource_config)\n\n            with updated_resource.yield_for_execution(context) as value:\n                yield value\n\n    def setup_for_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any pre-execution steps\n        needed before the resource is used in execution.\n        """\n        pass\n\n    def teardown_after_execution(self, context: InitResourceContext) -> None:\n        """Optionally override this method to perform any post-execution steps\n        needed after the resource is used in execution.\n\n        teardown_after_execution will be called even if any part of the run fails.\n        It will not be called if setup_for_execution fails.\n        """\n        pass\n\n    @contextlib.contextmanager\n    def yield_for_execution(self, context: InitResourceContext) -> Generator[TResValue, None, None]:\n        """Optionally override this method to perform any lifecycle steps\n        before or after the resource is used in execution. 
By default, calls\n        setup_for_execution before yielding, and teardown_after_execution after yielding.\n\n        Note that if you override this method and want setup_for_execution or\n        teardown_after_execution to be called, you must invoke them yourself.\n        """\n        self.setup_for_execution(context)\n        try:\n            yield self.create_resource(context)\n        finally:\n            self.teardown_after_execution(context)\n\n    def get_resource_context(self) -> InitResourceContext:\n        """Returns the context that this resource was initialized with."""\n        return check.not_none(\n            self._state__internal__.resource_context,\n            additional_message="Attempted to get context before resource was initialized.",\n        )\n\n    def process_config_and_initialize(self) -> TResValue:\n        """Initializes this resource, fully processing its config and returning the prepared\n        resource value.\n        """\n        from dagster._config.post_process import post_process_config\n\n        return self.from_resource_context(\n            build_init_resource_context(\n                config=post_process_config(\n                    self._config_schema.config_type, self._convert_to_config_dictionary()\n                ).value\n            )\n        )\n\n    @classmethod\n    def from_resource_context(cls, context: InitResourceContext) -> TResValue:\n        """Creates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes.\n\n        For resources that have custom teardown behavior, use from_resource_context_cm instead.\n\n        Example usage:\n\n        .. 
code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> MyResource:\n                return MyResource.from_resource_context(context)\n\n        """\n        check.invariant(\n            not cls._is_cm_resource_cls(),\n            "Use from_resource_context_cm for resources which have custom teardown behavior,"\n            " e.g. overriding yield_for_execution or teardown_after_execution",\n        )\n        return cls(**context.resource_config or {})._initialize_and_run(context)  # noqa: SLF001\n\n    @classmethod\n    @contextlib.contextmanager\n    def from_resource_context_cm(\n        cls, context: InitResourceContext\n    ) -> Generator[TResValue, None, None]:\n        """Context which generates a new instance of this resource from a populated InitResourceContext.\n        Useful when creating a resource from a function-based resource, for backwards\n        compatibility purposes. Handles custom teardown behavior.\n\n        Example usage:\n\n        .. code-block:: python\n\n            class MyResource(ConfigurableResource):\n                my_str: str\n\n            @resource(config_schema=MyResource.to_config_schema())\n            def my_resource(context: InitResourceContext) -> Generator[MyResource, None, None]:\n                with MyResource.from_resource_context_cm(context) as my_resource:\n                    yield my_resource\n\n        """\n        with cls(**context.resource_config or {})._initialize_and_run_cm(  # noqa: SLF001\n            context\n        ) as value:\n            yield value\n\n\n
[docs]class ConfigurableResource(ConfigurableResourceFactory[TResValue]):\n """Base class for Dagster resources that utilize structured config.\n\n This class is a subclass of both :py:class:`ResourceDefinition` and :py:class:`Config`.\n\n Example definition:\n\n .. code-block:: python\n\n class WriterResource(ConfigurableResource):\n prefix: str\n\n def output(self, text: str) -> None:\n print(f"{self.prefix}{text}")\n\n Example usage:\n\n .. code-block:: python\n\n @asset\n def asset_that_uses_writer(writer: WriterResource):\n writer.output("text")\n\n defs = Definitions(\n assets=[asset_that_uses_writer],\n resources={"writer": WriterResource(prefix="a_prefix")},\n )\n\n """\n\n def create_resource(self, context: InitResourceContext) -> TResValue:\n """Returns the object that this resource hands to user code, accessible by ops or assets\n through the context or resource parameters. This works like the function decorated\n with @resource when using function-based resources.\n\n For ConfigurableResource, this function will return itself, passing\n the actual ConfigurableResource object to user code.\n """\n return cast(TResValue, self)
\n\n\ndef _is_fully_configured(resource: CoercibleToResource) -> bool:\n from dagster._core.execution.build_resources import wrap_resource_for_execution\n\n actual_resource = wrap_resource_for_execution(resource)\n res = (\n validate_config(\n actual_resource.config_schema.config_type,\n (\n actual_resource.config_schema.default_value\n if actual_resource.config_schema.default_provided\n else {}\n ),\n ).success\n is True\n )\n\n return res\n\n\nclass PartialResourceState(NamedTuple):\n nested_partial_resources: Dict[str, Any]\n config_schema: DagsterField\n resource_fn: Callable[[InitResourceContext], Any]\n description: Optional[str]\n nested_resources: Dict[str, Any]\n\n\nclass PartialResource(Generic[TResValue], AllowDelayedDependencies, MakeConfigCacheable):\n data: Dict[str, Any]\n resource_cls: Type[ConfigurableResourceFactory[TResValue]]\n\n def __init__(\n self,\n resource_cls: Type[ConfigurableResourceFactory[TResValue]],\n data: Dict[str, Any],\n ):\n resource_pointers, _data_without_resources = separate_resource_params(resource_cls, data)\n\n MakeConfigCacheable.__init__(self, data=data, resource_cls=resource_cls) # type: ignore # extends BaseModel, takes kwargs\n\n def resource_fn(context: InitResourceContext):\n instantiated = resource_cls(\n **{**data, **context.resource_config}\n ) # So that collisions are resolved in favor of the latest provided run config\n return instantiated._get_initialize_and_run_fn()(context) # noqa: SLF001\n\n self._state__internal__ = PartialResourceState(\n # We keep track of any resources we depend on which are not fully configured\n # so that we can retrieve them at runtime\n nested_partial_resources={\n k: v for k, v in resource_pointers.items() if (not _is_fully_configured(v))\n },\n config_schema=infer_schema_from_config_class(\n resource_cls, fields_to_omit=set(resource_pointers.keys())\n ),\n resource_fn=resource_fn,\n description=resource_cls.__doc__,\n nested_resources={k: v for k, v in 
resource_pointers.items()},\n )\n\n # to make AllowDelayedDependencies work\n @property\n def _nested_partial_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_partial_resources\n\n @property\n def nested_resources(\n self,\n ) -> Mapping[str, Any]:\n return self._state__internal__.nested_resources\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.resource_cls,\n resource_fn=self._state__internal__.resource_fn,\n config_schema=self._state__internal__.config_schema,\n description=self._state__internal__.description,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self.resource_cls._is_dagster_maintained(), # noqa: SLF001\n )\n\n\nResourceOrPartial: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue], PartialResource[TResValue]\n]\nResourceOrPartialOrValue: TypeAlias = Union[\n ConfigurableResourceFactory[TResValue],\n PartialResource[TResValue],\n ResourceDefinition,\n TResValue,\n]\n\n\nV = TypeVar("V")\n\n\nclass ResourceDependency(Generic[V]):\n def __set_name__(self, _owner, name):\n self._name = name\n\n def __get__(self, obj: "ConfigurableResourceFactory", __owner: Any) -> V:\n return getattr(obj, self._name)\n\n def __set__(self, obj: Optional[object], value: ResourceOrPartialOrValue[V]) -> None:\n setattr(obj, self._name, value)\n\n\nclass ConfigurableLegacyResourceAdapter(ConfigurableResource, ABC):\n """Adapter base class for wrapping a decorated, function-style resource\n with structured config.\n\n To use this class, subclass it, define config schema fields using Pydantic,\n and implement the ``wrapped_resource`` method.\n\n Example:\n .. 
code-block:: python\n\n @resource(config_schema={"prefix": str})\n def writer_resource(context):\n prefix = context.resource_config["prefix"]\n\n def output(text: str) -> None:\n out_txt.append(f"{prefix}{text}")\n\n return output\n\n class WriterResource(ConfigurableLegacyResourceAdapter):\n prefix: str\n\n @property\n def wrapped_resource(self) -> ResourceDefinition:\n return writer_resource\n """\n\n @property\n @abstractmethod\n def wrapped_resource(self) -> ResourceDefinition:\n raise NotImplementedError()\n\n @cached_method\n def get_resource_definition(self) -> ConfigurableResourceFactoryResourceDefinition:\n return ConfigurableResourceFactoryResourceDefinition(\n self.__class__,\n resource_fn=self.wrapped_resource.resource_fn,\n config_schema=self._config_schema,\n description=self.__doc__,\n resolve_resource_keys=self._resolve_required_resource_keys,\n nested_resources=self.nested_resources,\n dagster_maintained=self._is_dagster_maintained(),\n )\n\n def __call__(self, *args, **kwargs):\n return self.wrapped_resource(*args, **kwargs)\n\n\nclass SeparatedResourceParams(NamedTuple):\n resources: Dict[str, Any]\n non_resources: Dict[str, Any]\n\n\ndef _is_annotated_as_resource_type(annotation: Type) -> bool:\n """Determines if a field in a structured config class is annotated as a resource type or not."""\n from .inheritance_utils import safe_is_subclass\n\n is_annotated_as_resource_dependency = get_origin(annotation) == ResourceDependency or getattr(\n annotation, "__metadata__", None\n ) == ("resource_dependency",)\n\n return is_annotated_as_resource_dependency or safe_is_subclass(\n annotation, (ResourceDefinition, ConfigurableResourceFactory)\n )\n\n\ndef separate_resource_params(cls: Type[BaseModel], data: Dict[str, Any]) -> SeparatedResourceParams:\n """Separates out the key/value inputs of fields in a structured config Resource class which\n are marked as resources (ie, using ResourceDependency) from those which are not.\n """\n keys_by_alias = 
{field.alias: field for field in cls.__fields__.values()}\n data_with_annotation: List[Tuple[str, Any, Type]] = [\n # No longer exists in Pydantic 2.x, will need to be updated when we upgrade\n (k, v, keys_by_alias[k].outer_type_)\n for k, v in data.items()\n if k in keys_by_alias\n ]\n out = SeparatedResourceParams(\n resources={k: v for k, v, t in data_with_annotation if _is_annotated_as_resource_type(t)},\n non_resources={\n k: v for k, v, t in data_with_annotation if not _is_annotated_as_resource_type(t)\n },\n )\n return out\n\n\ndef _call_resource_fn_with_default(\n stack: contextlib.ExitStack, obj: ResourceDefinition, context: InitResourceContext\n) -> Any:\n from dagster._config.validate import process_config\n\n if isinstance(obj.config_schema, ConfiguredDefinitionConfigSchema):\n value = cast(Dict[str, Any], obj.config_schema.resolve_config({}).value)\n context = context.replace_config(value["config"])\n elif obj.config_schema.default_provided:\n # To explain why we need to process config here;\n # - The resource available on the init context (context.resource_config) has already been processed\n # - The nested resource's config has also already been processed, but is only available in the broader run config dictionary.\n # - The only information we have access to here is the unprocessed default value, so we need to process it a second time.\n unprocessed_config = obj.config_schema.default_value\n evr = process_config(\n {"config": obj.config_schema.config_type}, {"config": unprocessed_config}\n )\n if not evr.success:\n raise DagsterInvalidConfigError(\n "Error in config for nested resource ",\n evr.errors,\n unprocessed_config,\n )\n context = context.replace_config(cast(dict, evr.value)["config"])\n\n if has_at_least_one_parameter(obj.resource_fn):\n result = cast(ResourceFunctionWithContext, obj.resource_fn)(context)\n else:\n result = cast(ResourceFunctionWithoutContext, obj.resource_fn)()\n\n is_fn_generator = inspect.isgenerator(obj.resource_fn) or 
isinstance(\n obj.resource_fn, contextlib.ContextDecorator\n )\n if is_fn_generator:\n return stack.enter_context(cast(contextlib.AbstractContextManager, result))\n else:\n return result\n\n\nLateBoundTypesForResourceTypeChecking.set_actual_types_for_type_checking(\n resource_dep_type=ResourceDependency,\n resource_type=ConfigurableResourceFactory,\n partial_resource_type=PartialResource,\n)\n\n\ndef validate_resource_annotated_function(fn) -> None:\n """Validates any parameters on the decorated function that are annotated with\n :py:class:`dagster.ResourceDefinition`, raising a :py:class:`dagster.DagsterInvalidDefinitionError`\n if any are not also instances of :py:class:`dagster.ConfigurableResource` (these resources should\n instead be wrapped in the :py:func:`dagster.Resource` Annotation).\n """\n from dagster import DagsterInvalidDefinitionError\n from dagster._config.pythonic_config.resource import (\n ConfigurableResource,\n ConfigurableResourceFactory,\n TResValue,\n )\n\n from .inheritance_utils import safe_is_subclass\n\n malformed_params = [\n param\n for param in get_function_params(fn)\n if safe_is_subclass(param.annotation, (ResourceDefinition, ConfigurableResourceFactory))\n and not safe_is_subclass(param.annotation, ConfigurableResource)\n ]\n if len(malformed_params) > 0:\n malformed_param = malformed_params[0]\n output_type = None\n if safe_is_subclass(malformed_param.annotation, ConfigurableResourceFactory):\n orig_bases = getattr(malformed_param.annotation, "__orig_bases__", None)\n output_type = get_args(orig_bases[0])[0] if orig_bases and len(orig_bases) > 0 else None\n if output_type == TResValue:\n output_type = None\n\n output_type_name = getattr(output_type, "__name__", str(output_type))\n raise DagsterInvalidDefinitionError(\n """Resource param '{param_name}' is annotated as '{annotation_type}', but '{annotation_type}' outputs {value_message} value to user code such as @ops and @assets. 
This annotation should instead be {annotation_suggestion}""".format(\n param_name=malformed_param.name,\n annotation_type=malformed_param.annotation,\n value_message=f"a '{output_type}'" if output_type else "an unknown",\n annotation_suggestion=(\n f"'ResourceParam[{output_type_name}]'"\n if output_type\n else "'ResourceParam[Any]' or 'ResourceParam[<output type>]'"\n ),\n )\n )\n\n\ndef _resolve_required_resource_keys_for_resource(\n resource: ResourceDefinition, resource_id_to_key_mapping: Mapping[ResourceId, str]\n) -> AbstractSet[str]:\n """Gets the required resource keys for the provided resource, with the assistance of the passed\n resource-id-to-key mapping. For resources which may hold nested partial resources,\n this mapping is used to obtain the top-level resource keys to depend on.\n """\n if isinstance(resource, AllowDelayedDependencies):\n return resource._resolve_required_resource_keys(resource_id_to_key_mapping) # noqa: SLF001\n return resource.required_resource_keys\n
", "current_page_name": "_modules/dagster/_config/pythonic_config/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.pythonic_config.resource"}}, "source": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._config.source

\nimport os\n\nimport dagster._check as check\n\nfrom .config_type import ScalarUnion\nfrom .errors import PostProcessingError\nfrom .field_utils import Selector\n\nVALID_STRING_SOURCE_TYPES = (str, dict)\n\n\ndef _ensure_env_variable(var):\n    check.str_param(var, "var")\n    value = os.getenv(var)\n    if value is None:\n        raise PostProcessingError(\n            f'You have attempted to fetch the environment variable "{var}" '\n            "which is not set. In order for this execution to succeed it "\n            "must be set in this environment."\n        )\n    return value\n\n\nclass StringSourceType(ScalarUnion):\n    def __init__(self):\n        super(StringSourceType, self).__init__(\n            scalar_type=str,\n            non_scalar_schema=Selector({"env": str}),\n            _key="StringSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, VALID_STRING_SOURCE_TYPES), "value")\n\n        if not isinstance(value, dict):\n            return value\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        return str(_ensure_env_variable(cfg))\n\n\nclass IntSourceType(ScalarUnion):\n    def __init__(self):\n        super(IntSourceType, self).__init__(\n            scalar_type=int,\n            non_scalar_schema=Selector({"env": str}),\n            _key="IntSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, int)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return int(value)\n        except ValueError as e:\n            raise PostProcessingError(\n       
         f'Value "{value}" stored in env variable "{cfg}" cannot be coerced into an int.'\n            ) from e\n\n\nclass BoolSourceType(ScalarUnion):\n    def __init__(self):\n        super(BoolSourceType, self).__init__(\n            scalar_type=bool,\n            non_scalar_schema=Selector({"env": str}),\n            _key="BoolSourceType",\n        )\n\n    def post_process(self, value):\n        check.param_invariant(isinstance(value, (dict, bool)), "value", "Should be pre-validated")\n\n        if not isinstance(value, dict):\n            return value\n\n        check.invariant(len(value) == 1, "Selector should have one entry")\n\n        key, cfg = next(iter(value.items()))\n        check.invariant(key == "env", "Only valid key is env")\n        value = _ensure_env_variable(cfg)\n        try:\n            return bool(value)\n        except ValueError as e:\n            raise PostProcessingError(\n                (\n                    'Value "{value}" stored in env variable "{var}" cannot be coerced into an bool.'\n                ).format(value=value, var=cfg)\n            ) from e\n\n\nStringSource: StringSourceType = StringSourceType()\nIntSource: IntSourceType = IntSourceType()\nBoolSource: BoolSourceType = BoolSourceType()\n
", "current_page_name": "_modules/dagster/_config/source", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._config.source"}}, "_core": {"definitions": {"asset_check_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_result

\nfrom typing import TYPE_CHECKING, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationTargetMaterializationData,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSeverity\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.compute import StepExecutionContext\n\n\n
[docs]@experimental\nclass AssetCheckResult(\n NamedTuple(\n "_AssetCheckResult",\n [\n ("passed", PublicAttr[bool]),\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("check_name", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("severity", PublicAttr[AssetCheckSeverity]),\n ],\n )\n):\n """The result of an asset check.\n\n Attributes:\n asset_key (Optional[AssetKey]):\n The asset key that was checked.\n check_name (Optional[str]):\n The name of the check.\n passed (bool):\n The pass/fail result of the check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n severity (AssetCheckSeverity):\n Severity of the check. Defaults to ERROR.\n\n """\n\n def __new__(\n cls,\n *,\n passed: bool,\n asset_key: Optional[CoercibleToAssetKey] = None,\n check_name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n severity: AssetCheckSeverity = AssetCheckSeverity.ERROR,\n ):\n normalized_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n return super().__new__(\n cls,\n asset_key=AssetKey.from_coercible(asset_key) if asset_key is not None else None,\n check_name=check.opt_str_param(check_name, "check_name"),\n passed=check.bool_param(passed, "passed"),\n metadata=normalized_metadata,\n severity=check.inst_param(severity, "severity", AssetCheckSeverity),\n )\n\n def to_asset_check_evaluation(\n self, step_context: "StepExecutionContext"\n ) -> AssetCheckEvaluation:\n spec_check_names_by_asset_key = (\n step_context.job_def.asset_layer.get_check_names_by_asset_key_for_node_handle(\n step_context.node_handle.root\n )\n )\n\n asset_keys_with_specs = spec_check_names_by_asset_key.keys()\n\n if self.asset_key is 
not None:\n if self.asset_key not in asset_keys_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. It targets asset"\n f" '{self.asset_key.to_user_string()}' which is not targeted by any of the"\n " checks currently being evaluated. Targeted assets:"\n f" {[asset_key.to_user_string() for asset_key in asset_keys_with_specs]}."\n )\n\n resolved_asset_key = self.asset_key\n\n else:\n if len(spec_check_names_by_asset_key) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult didn't specify an asset key, but there are multiple assets"\n " to choose from:"\n f" {[asset_key.to_user_string() for asset_key in spec_check_names_by_asset_key.keys()]}"\n )\n\n resolved_asset_key = next(iter(asset_keys_with_specs))\n\n check_names_with_specs = spec_check_names_by_asset_key[resolved_asset_key]\n if self.check_name is not None:\n if self.check_name not in check_names_with_specs:\n raise DagsterInvariantViolationError(\n "Received unexpected AssetCheckResult. No checks currently being evaluated"\n f" target asset '{resolved_asset_key.to_user_string()}' and have name"\n f" '{self.check_name}'. 
Checks being evaluated for this asset:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = self.check_name\n else:\n if len(check_names_with_specs) > 1:\n raise DagsterInvariantViolationError(\n "AssetCheckResult result didn't specify a check name, but there are multiple"\n " checks to choose from for the this asset key:"\n f" {check_names_with_specs}"\n )\n\n resolved_check_name = next(iter(check_names_with_specs))\n\n input_asset_info = step_context.get_input_asset_version_info(resolved_asset_key)\n if input_asset_info is not None:\n target_materialization_data = AssetCheckEvaluationTargetMaterializationData(\n run_id=input_asset_info.run_id,\n storage_id=input_asset_info.storage_id,\n timestamp=input_asset_info.timestamp,\n )\n else:\n target_materialization_data = None\n\n return AssetCheckEvaluation(\n check_name=resolved_check_name,\n asset_key=resolved_asset_key,\n passed=self.passed,\n metadata=self.metadata,\n target_materialization_data=target_materialization_data,\n severity=self.severity,\n )\n\n def get_spec_python_identifier(\n self, *, asset_key: Optional[AssetKey] = None, check_name: Optional[str] = None\n ) -> str:\n """Returns a string uniquely identifying the asset check spec associated with this result.\n This is used for the output name associated with an `AssetCheckResult`.\n """\n asset_key = asset_key or self.asset_key\n check_name = check_name or self.check_name\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n assert asset_key is not None, "Asset key must be provided if not set on spec"\n return f"{asset_key.to_python_identifier()}_{self.check_name}"
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_result"}, "asset_check_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_check_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._serdes.serdes import whitelist_for_serdes\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass AssetCheckSeverity(Enum):\n """Severity level for an asset check.\n\n Severities:\n\n - WARN: If the check fails, don't fail the step.\n - ERROR: If the check fails, fail the step and, within the run, skip materialization of any\n assets that are downstream of the asset being checked.\n """\n\n WARN = "WARN"\n ERROR = "ERROR"
\n\n\n
[docs]@experimental\n@whitelist_for_serdes(old_storage_names={"AssetCheckHandle"})\nclass AssetCheckKey(NamedTuple):\n """Check names are expected to be unique per-asset. Thus, this combination of asset key and\n check name uniquely identifies an asset check within a deployment.\n """\n\n asset_key: PublicAttr[AssetKey]\n name: PublicAttr[str]\n\n @staticmethod\n def from_graphql_input(graphql_input: Mapping[str, Any]) -> "AssetCheckKey":\n return AssetCheckKey(\n asset_key=AssetKey.from_graphql_input(graphql_input["assetKey"]),\n name=graphql_input["name"],\n )
\n\n\n
[docs]@experimental\nclass AssetCheckSpec(\n NamedTuple(\n "_AssetCheckSpec",\n [\n ("name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines information about an check, except how to execute it.\n\n AssetCheckSpec is often used as an argument to decorators that decorator a function that can\n execute multiple checks - e.g. `@asset`, and `@multi_asset`. It defines one of the checks that\n will be executed inside that function.\n\n Args:\n name (str): Name of the check.\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The asset that\n the check applies to.\n description (Optional[str]): Description for the check.\n """\n\n def __new__(\n cls,\n name: str,\n *,\n asset: Union[CoercibleToAssetKey, "AssetsDefinition", "SourceAsset"],\n description: Optional[str] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n asset_key=AssetKey.from_coercible_or_definition(asset),\n description=check.opt_str_param(description, "description"),\n )\n\n def get_python_identifier(self) -> str:\n """Returns a string uniquely identifying the asset check, that uses only the characters\n allowed in a Python identifier.\n """\n return f"{self.asset_key.to_python_identifier()}_{self.name}"\n\n @property\n def key(self) -> AssetCheckKey:\n return AssetCheckKey(self.asset_key, self.name)
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_check_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_check_spec"}, "asset_dep": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_dep

\nfrom typing import NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_spec import AssetSpec\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\n\nCoercibleToAssetDep = Union[\n    CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset, "AssetDep"\n]\n\n\n
[docs]@experimental\nclass AssetDep(\n NamedTuple(\n "_AssetDep",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ],\n )\n):\n """Specifies a dependency on an upstream asset.\n\n Attributes:\n asset (Union[AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset]): The upstream asset to depend on.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. If not provided and the upstream asset is partitioned, defaults to\n the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n\n Examples:\n .. code-block:: python\n\n upstream_asset = AssetSpec("upstream_asset")\n downstream_asset = AssetSpec(\n "downstream_asset",\n deps=[\n AssetDep(\n upstream_asset,\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1)\n )\n ]\n )\n """\n\n def __new__(\n cls,\n asset: Union[CoercibleToAssetKey, AssetSpec, AssetsDefinition, SourceAsset],\n *,\n partition_mapping: Optional[PartitionMapping] = None,\n ):\n if isinstance(asset, list):\n check.list_param(asset, "asset", of_type=str)\n else:\n check.inst_param(\n asset, "asset", (AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset)\n )\n if isinstance(asset, AssetsDefinition) and len(asset.keys) > 1:\n # Only AssetsDefinition with a single asset can be passed\n raise DagsterInvalidDefinitionError(\n "Cannot create an AssetDep from a multi_asset AssetsDefinition."\n " Instead, specify dependencies on the assets created by the multi_asset"\n f" via AssetKeys or strings. 
For the multi_asset {asset.node_def.name}, the"\n f" available keys are: {asset.keys}."\n )\n\n asset_key = _get_asset_key(asset)\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n partition_mapping=check.opt_inst_param(\n partition_mapping,\n "partition_mapping",\n PartitionMapping,\n ),\n )\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetDep") -> "AssetDep":\n # if arg is AssetDep, return the original object to retain partition_mapping\n return arg if isinstance(arg, AssetDep) else AssetDep(asset=arg)
\n\n\ndef _get_asset_key(arg: "CoercibleToAssetDep") -> AssetKey:\n if isinstance(arg, (AssetsDefinition, SourceAsset, AssetSpec)):\n return arg.key\n elif isinstance(arg, AssetDep):\n return arg.asset_key\n else:\n return AssetKey.from_coercible(arg)\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_dep", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_dep"}, "asset_in": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_in

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\nfrom .partition_mapping import PartitionMapping\n\n\n
[docs]class AssetIn(\n NamedTuple(\n "_AssetIn",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[ArbitraryMetadataMapping]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ("partition_mapping", PublicAttr[Optional[PartitionMapping]]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ],\n )\n):\n """Defines an asset dependency.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the input name. Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the input.\n For example, if you only need a subset of columns from an upstream table, you could\n include that in metadata and the IO manager that loads the upstream table could use the\n metadata to determine which columns to load.\n partition_mapping (Optional[PartitionMapping]): Defines what partitions to depend on in\n the upstream asset. 
If not provided, defaults to the default partition mapping for the\n partitions definition, which is typically maps partition keys to the same partition keys\n in upstream assets.\n dagster_type (DagsterType): Allows specifying type validation functions that\n will be executed on the input of the decorated function before it runs.\n """\n\n def __new__(\n cls,\n key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n input_manager_key: Optional[str] = None,\n partition_mapping: Optional[PartitionMapping] = None,\n dagster_type: Union[DagsterType, Type[NoValueSentinel]] = NoValueSentinel,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n check.invariant(\n not (key and key_prefix), "key and key_prefix cannot both be set on AssetIn"\n )\n\n return super(AssetIn, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n metadata=check.opt_inst_param(metadata, "metadata", Mapping),\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n partition_mapping=check.opt_inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_in", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_in"}, "asset_out": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_out

\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence, Type, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n)\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.input import NoValueSentinel\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.types.dagster_type import DagsterType, resolve_dagster_type\n\n\n
[docs]class AssetOut(\n NamedTuple(\n "_AssetOut",\n [\n ("key", PublicAttr[Optional[AssetKey]]),\n ("key_prefix", PublicAttr[Optional[Sequence[str]]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("io_manager_key", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ("backfill_policy", PublicAttr[Optional[BackfillPolicy]]),\n ],\n )\n):\n """Defines one of the assets produced by a :py:func:`@multi_asset <multi_asset>`.\n\n Attributes:\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name. When using ``@multi_asset``, the\n asset name defaults to the key of the "outs" dictionary Only one of the "key_prefix" and\n "key" arguments should be provided.\n key (Optional[Union[str, Sequence[str], AssetKey]]): The asset's key. Only one of the\n "key_prefix" and "key" arguments should be provided.\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. 
(default: True)\n io_manager_key (Optional[str]): The resource key of the IO manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code that generates this asset.\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n key: Optional[CoercibleToAssetKey] = None,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n ):\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n return super(AssetOut, cls).__new__(\n cls,\n key=AssetKey.from_coercible(key) if key is not None else None,\n key_prefix=check.opt_list_param(key_prefix, "key_prefix", of_type=str),\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n 
is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy, "freshness_policy", FreshnessPolicy\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n ),\n backfill_policy=check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n ),\n )\n\n def to_out(self) -> Out:\n return Out(\n dagster_type=self.dagster_type,\n description=self.description,\n metadata=self.metadata,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n code_version=self.code_version,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_out", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_out"}, "asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_selection

\nimport collections.abc\nimport operator\nfrom abc import ABC, abstractmethod\nfrom functools import reduce\nfrom typing import AbstractSet, Iterable, Optional, Sequence, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, public\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._core.selector.subset_selector import (\n    fetch_connected,\n    fetch_sinks,\n    fetch_sources,\n    parse_clause,\n)\n\nfrom .asset_check_spec import AssetCheckKey\nfrom .asset_graph import AssetGraph, InternalAssetGraph\nfrom .assets import AssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n    CoercibleToAssetKeyPrefix,\n    key_prefix_from_coercible,\n)\nfrom .source_asset import SourceAsset\n\nCoercibleToAssetSelection: TypeAlias = Union[\n    str,\n    Sequence[str],\n    Sequence[AssetKey],\n    Sequence[Union["AssetsDefinition", "SourceAsset"]],\n    "AssetSelection",\n]\n\n\n
[docs]class AssetSelection(ABC):\n """An AssetSelection defines a query over a set of assets and asset checks, normally all that are defined in a code location.\n\n You can use the "|", "&", and "-" operators to create unions, intersections, and differences of selections, respectively.\n\n AssetSelections are typically used with :py:func:`define_asset_job`.\n\n By default, selecting assets will also select all of the asset checks that target those assets.\n\n Examples:\n .. code-block:: python\n\n # Select all assets in group "marketing":\n AssetSelection.groups("marketing")\n\n # Select all assets in group "marketing", as well as the asset with key "promotion":\n AssetSelection.groups("marketing") | AssetSelection.keys("promotion")\n\n # Select all assets in group "marketing" that are downstream of asset "leads":\n AssetSelection.groups("marketing") & AssetSelection.keys("leads").downstream()\n\n # Select a list of assets:\n AssetSelection.assets(*my_assets_list)\n\n # Select all assets except for those in group "marketing"\n AssetSelection.all() - AssetSelection.groups("marketing")\n\n # Select all assets which are materialized by the same op as "projections":\n AssetSelection.keys("projections").required_multi_asset_neighbors()\n\n # Select all assets in group "marketing" and exclude their asset checks:\n AssetSelection.groups("marketing") - AssetSelection.all_asset_checks()\n\n # Select all asset checks that target a list of assets:\n AssetSelection.checks_for_assets(*my_assets_list)\n\n # Select a specific asset check:\n AssetSelection.checks(my_asset_check)\n\n """\n\n
[docs] @public\n @staticmethod\n def all() -> "AllSelection":\n """Returns a selection that includes all assets and asset checks."""\n return AllSelection()
\n\n
[docs] @public\n @staticmethod\n def all_asset_checks() -> "AllAssetCheckSelection":\n """Returns a selection that includes all asset checks."""\n return AllAssetCheckSelection()
\n\n
[docs] @public\n @staticmethod\n def assets(*assets_defs: AssetsDefinition) -> "KeysAssetSelection":\n """Returns a selection that includes all of the provided assets and asset checks that target them."""\n return KeysAssetSelection(*(key for assets_def in assets_defs for key in assets_def.keys))
\n\n
[docs] @public\n @staticmethod\n def keys(*asset_keys: CoercibleToAssetKey) -> "KeysAssetSelection":\n """Returns a selection that includes assets with any of the provided keys and all asset checks that target them.\n\n Examples:\n .. code-block:: python\n\n AssetSelection.keys(AssetKey(["a"]))\n\n AssetSelection.keys("a")\n\n AssetSelection.keys(AssetKey(["a"]), AssetKey(["b"]))\n\n AssetSelection.keys("a", "b")\n\n asset_key_list = [AssetKey(["a"]), AssetKey(["b"])]\n AssetSelection.keys(*asset_key_list)\n """\n _asset_keys = [\n AssetKey.from_user_string(key) if isinstance(key, str) else AssetKey.from_coercible(key)\n for key in asset_keys\n ]\n return KeysAssetSelection(*_asset_keys)
\n\n
[docs] @public\n @staticmethod\n def key_prefixes(\n *key_prefixes: CoercibleToAssetKeyPrefix, include_sources: bool = False\n ) -> "KeyPrefixesAssetSelection":\n """Returns a selection that includes assets that match any of the provided key prefixes and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the key prefix(es)\n in the selection.\n\n Examples:\n .. code-block:: python\n\n # match any asset key where the first segment is equal to "a" or "b"\n # e.g. AssetKey(["a", "b", "c"]) would match, but AssetKey(["abc"]) would not.\n AssetSelection.key_prefixes("a", "b")\n\n # match any asset key where the first two segments are ["a", "b"] or ["a", "c"]\n AssetSelection.key_prefixes(["a", "b"], ["a", "c"])\n """\n _asset_key_prefixes = [key_prefix_from_coercible(key_prefix) for key_prefix in key_prefixes]\n return KeyPrefixesAssetSelection(*_asset_key_prefixes, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def groups(*group_strs, include_sources: bool = False) -> "GroupsAssetSelection":\n """Returns a selection that includes materializable assets that belong to any of the\n provided groups and all the asset checks that target them.\n\n Args:\n include_sources (bool): If True, then include source assets matching the group in the\n selection.\n """\n check.tuple_param(group_strs, "group_strs", of_type=str)\n return GroupsAssetSelection(*group_strs, include_sources=include_sources)
\n\n
[docs] @public\n @staticmethod\n def checks_for_assets(*assets_defs: AssetsDefinition) -> "AssetChecksForAssetKeys":\n """Returns a selection with the asset checks that target the provided assets."""\n return AssetChecksForAssetKeys(\n [key for assets_def in assets_defs for key in assets_def.keys]\n )
\n\n
[docs] @public\n @staticmethod\n def checks(*asset_checks: AssetChecksDefinition) -> "AssetChecksForHandles":\n """Returns a selection that includes all of the provided asset checks."""\n return AssetChecksForHandles(\n [\n AssetCheckKey(asset_key=AssetKey.from_coercible(spec.asset_key), name=spec.name)\n for checks_def in asset_checks\n for spec in checks_def.specs\n ]\n )
\n\n
[docs] @public\n def downstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "DownstreamAssetSelection":\n """Returns a selection that includes all assets that are downstream of any of the assets in\n this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates through each\n asset in this selection and returns the union of all downstream assets.\n\n Args:\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are children or grandchildren of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each downstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return DownstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def upstream(\n self, depth: Optional[int] = None, include_self: bool = True\n ) -> "UpstreamAssetSelection":\n """Returns a selection that includes all materializable assets that are upstream of any of\n the assets in this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates\n through each asset in this selection and returns the union of all upstream assets.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as upstream of regular assets.\n\n Args:\n depth (Optional[int]): If provided, then only include assets to the given depth. A depth\n of 2 means all assets that are parents or grandparents of the assets in this\n selection.\n include_self (bool): If True, then include the assets in this selection in the result.\n If the include_self flag is False, return each upstream asset that is not part of the\n original selection. By default, set to True.\n """\n check.opt_int_param(depth, "depth")\n check.opt_bool_param(include_self, "include_self")\n return UpstreamAssetSelection(self, depth=depth, include_self=include_self)
\n\n
[docs] @public\n def sinks(self) -> "SinkAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the sink\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A sink asset is an asset that has no downstream dependencies within the asset selection.\n The sink asset can have downstream dependencies outside of the asset selection.\n """\n return SinkAssetSelection(self)
\n\n
[docs] @public\n def required_multi_asset_neighbors(self) -> "RequiredNeighborsAssetSelection":\n """Given an asset selection in which some assets are output from a multi-asset compute op\n which cannot be subset, returns a new asset selection that contains all of the assets\n required to execute the original asset selection. Includes the asset checks targeting the returned assets.\n """\n return RequiredNeighborsAssetSelection(self)
\n\n
[docs] @public\n def roots(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is an asset that has no upstream dependencies within the asset selection.\n The root asset can have upstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return RootAssetSelection(self)
\n\n
[docs] @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use AssetSelection.roots instead.")\n def sources(self) -> "RootAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the root\n assets within the original asset selection. Includes the asset checks targeting the returned assets.\n\n A root asset is a materializable asset that has no upstream dependencies within the asset\n selection. The root asset can have upstream dependencies outside of the asset selection.\n\n Because mixed selections of source and materializable assets are currently not supported,\n keys corresponding to `SourceAssets` will not be included as roots. To select source assets,\n use the `upstream_source_assets` method.\n """\n return self.roots()
\n\n
[docs] @public\n def upstream_source_assets(self) -> "SourceAssetSelection":\n """Given an asset selection, returns a new asset selection that contains all of the source\n assets upstream of assets in the original selection. Includes the asset checks targeting the returned assets.\n """\n return SourceAssetSelection(self)
\n\n
[docs] @public\n def without_checks(self) -> "AssetSelection":\n """Removes all asset checks in the selection."""\n return self - AssetSelection.all_asset_checks()
\n\n def __or__(self, other: "AssetSelection") -> "OrAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return OrAssetSelection(self, other)\n\n def __and__(self, other: "AssetSelection") -> "AndAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return AndAssetSelection(self, other)\n\n def __sub__(self, other: "AssetSelection") -> "SubAssetSelection":\n check.inst_param(other, "other", AssetSelection)\n return SubAssetSelection(self, other)\n\n def resolve(\n self, all_assets: Union[Iterable[Union[AssetsDefinition, SourceAsset]], AssetGraph]\n ) -> AbstractSet[AssetKey]:\n if isinstance(all_assets, AssetGraph):\n asset_graph = all_assets\n else:\n check.iterable_param(all_assets, "all_assets", (AssetsDefinition, SourceAsset))\n asset_graph = AssetGraph.from_assets(all_assets)\n\n resolved = self.resolve_inner(asset_graph)\n resolved_source_assets = asset_graph.source_asset_keys & resolved\n resolved_regular_assets = resolved - asset_graph.source_asset_keys\n check.invariant(\n not (len(resolved_source_assets) > 0 and len(resolved_regular_assets) > 0),\n "Asset selection specified both regular assets and source assets. This is not"\n " currently supported. Selections must be all regular assets or all source assets.",\n )\n return resolved\n\n @abstractmethod\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n raise NotImplementedError()\n\n def resolve_checks(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """We don't need this method currently, but it makes things consistent with resolve_inner. Currently\n we don't store checks in the ExternalAssetGraph, so we only support InternalAssetGraph.\n """\n return self.resolve_checks_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n """By default, resolve to checks that target the selected assets. 
This is overridden for particular selections."""\n asset_keys = self.resolve(asset_graph)\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in asset_keys}\n\n @staticmethod\n def _selection_from_string(string: str) -> "AssetSelection":\n from dagster._core.definitions import AssetSelection\n\n if string == "*":\n return AssetSelection.all()\n\n parts = parse_clause(string)\n if not parts:\n check.failed(f"Invalid selection string: {string}")\n u, item, d = parts\n\n selection: AssetSelection = AssetSelection.keys(item)\n if u:\n selection = selection.upstream(u)\n if d:\n selection = selection.downstream(d)\n return selection\n\n @classmethod\n def from_coercible(cls, selection: CoercibleToAssetSelection) -> "AssetSelection":\n if isinstance(selection, str):\n return cls._selection_from_string(selection)\n elif isinstance(selection, AssetSelection):\n return selection\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, str) for el in selection\n ):\n return reduce(\n operator.or_, [cls._selection_from_string(cast(str, s)) for s in selection]\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, (AssetsDefinition, SourceAsset)) for el in selection\n ):\n return AssetSelection.keys(\n *(\n key\n for el in selection\n for key in (\n el.keys if isinstance(el, AssetsDefinition) else [cast(SourceAsset, el).key]\n )\n )\n )\n elif isinstance(selection, collections.abc.Sequence) and all(\n isinstance(el, AssetKey) for el in selection\n ):\n return cls.keys(*cast(Sequence[AssetKey], selection))\n else:\n check.failed(\n "selection argument must be one of str, Sequence[str], Sequence[AssetKey],"\n " Sequence[AssetsDefinition], Sequence[SourceAsset], AssetSelection. Was"\n f" {type(selection)}."\n )
\n\n\nclass AllSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return asset_graph.materializable_asset_keys\n\n\nclass AllAssetCheckSelection(AssetSelection):\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return asset_graph.asset_check_keys\n\n\nclass AssetChecksForAssetKeys(AssetSelection):\n def __init__(self, keys: Sequence[AssetKey]):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {handle for handle in asset_graph.asset_check_keys if handle.asset_key in self._keys}\n\n\nclass AssetChecksForHandles(AssetSelection):\n def __init__(self, asset_check_keys: Sequence[AssetCheckKey]):\n self._asset_check_keys = asset_check_keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return set()\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return {\n handle for handle in asset_graph.asset_check_keys if handle in self._asset_check_keys\n }\n\n\nclass AndAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) & self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) & self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SubAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n 
def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) - self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) - self._right.resolve_checks_inner(\n asset_graph\n )\n\n\nclass SinkAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sinks(asset_graph.asset_dep_graph, selection)\n\n\nclass RequiredNeighborsAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n output = set(selection)\n for asset_key in selection:\n output.update(asset_graph.get_required_multi_asset_keys(asset_key))\n return output\n\n\nclass RootAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return fetch_sources(asset_graph.asset_dep_graph, selection)\n\n\nclass DownstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: Optional[bool] = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="downstream",\n depth=self.depth,\n )\n for asset_key in 
selection\n ],\n ),\n selection if not self.include_self else set(),\n )\n\n\nclass GroupsAssetSelection(AssetSelection):\n def __init__(self, *groups: str, include_sources: bool):\n self._groups = groups\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n asset_key\n for asset_key, group in asset_graph.group_names_by_key.items()\n if group in self._groups and asset_key in base_set\n }\n\n\nclass KeysAssetSelection(AssetSelection):\n def __init__(self, *keys: AssetKey):\n self._keys = keys\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n specified_keys = set(self._keys)\n invalid_keys = {key for key in specified_keys if key not in asset_graph.all_asset_keys}\n if invalid_keys:\n raise DagsterInvalidSubsetError(\n f"AssetKey(s) {invalid_keys} were selected, but no AssetsDefinition objects supply "\n "these keys. 
Make sure all keys are spelled correctly, and all AssetsDefinitions "\n "are correctly added to the `Definitions`."\n )\n return specified_keys\n\n\nclass KeyPrefixesAssetSelection(AssetSelection):\n def __init__(self, *key_prefixes: Sequence[str], include_sources: bool):\n self._key_prefixes = key_prefixes\n self._include_sources = include_sources\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n base_set = (\n asset_graph.all_asset_keys\n if self._include_sources\n else asset_graph.materializable_asset_keys\n )\n return {\n key for key in base_set if any(key.has_prefix(prefix) for prefix in self._key_prefixes)\n }\n\n\nclass OrAssetSelection(AssetSelection):\n def __init__(self, left: AssetSelection, right: AssetSelection):\n self._left = left\n self._right = right\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n return self._left.resolve_inner(asset_graph) | self._right.resolve_inner(asset_graph)\n\n def resolve_checks_inner(self, asset_graph: InternalAssetGraph) -> AbstractSet[AssetCheckKey]:\n return self._left.resolve_checks_inner(asset_graph) | self._right.resolve_checks_inner(\n asset_graph\n )\n\n\ndef _fetch_all_upstream(\n selection: AbstractSet[AssetKey],\n asset_graph: AssetGraph,\n depth: Optional[int] = None,\n include_self: bool = True,\n) -> AbstractSet[AssetKey]:\n return operator.sub(\n reduce(\n operator.or_,\n [\n {asset_key}\n | fetch_connected(\n item=asset_key,\n graph=asset_graph.asset_dep_graph,\n direction="upstream",\n depth=depth,\n )\n for asset_key in selection\n ],\n set(),\n ),\n selection if not include_self else set(),\n )\n\n\nclass UpstreamAssetSelection(AssetSelection):\n def __init__(\n self,\n child: AssetSelection,\n *,\n depth: Optional[int] = None,\n include_self: bool = True,\n ):\n self._child = child\n self.depth = depth\n self.include_self = include_self\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = 
self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph, self.depth, self.include_self)\n return {key for key in all_upstream if key not in asset_graph.source_asset_keys}\n\n\nclass SourceAssetSelection(AssetSelection):\n def __init__(self, child: AssetSelection):\n self._child = child\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n selection = self._child.resolve_inner(asset_graph)\n if len(selection) == 0:\n return selection\n all_upstream = _fetch_all_upstream(selection, asset_graph)\n return {key for key in all_upstream if key in asset_graph.source_asset_keys}\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_selection"}, "asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_sensor_definition

\nimport inspect\nfrom typing import Any, Callable, NamedTuple, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.resource_annotation import get_resource_args\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    SensorDefinition,\n    SensorType,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\n\nclass AssetSensorParamNames(NamedTuple):\n    context_param_name: Optional[str]\n    event_log_entry_param_name: Optional[str]\n\n\ndef get_asset_sensor_param_names(fn: Callable) -> AssetSensorParamNames:\n    """Determines the names of the context and event log entry parameters for an asset sensor function.\n    These are assumed to be the first two non-resource params, in order (context param before event log entry).\n    """\n    resource_params = {param.name for param in get_resource_args(fn)}\n\n    non_resource_params = [\n        param.name for param in get_function_params(fn) if param.name not in resource_params\n    ]\n\n    context_param_name = non_resource_params[0] if len(non_resource_params) > 0 else None\n    event_log_entry_param_name = non_resource_params[1] if len(non_resource_params) > 1 else None\n\n    return AssetSensorParamNames(\n        context_param_name=context_param_name, event_log_entry_param_name=event_log_entry_param_name\n    )\n\n\n
[docs]class AssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a given\n asset.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n Args:\n name (str): The name of the sensor to create.\n asset_key (AssetKey): The asset_key this sensor monitors.\n asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.SensorEvaluationContext` and\n an EventLogEntry corresponding to an AssetMaterialization event.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_key: AssetKey,\n job_name: Optional[str],\n asset_materialization_fn: Callable[\n ...,\n RawSensorEvaluationFunctionReturn,\n ],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn) -> Any:\n def _fn(context) -> Any:\n after_cursor = None\n if context.cursor:\n try:\n after_cursor = int(context.cursor)\n except ValueError:\n after_cursor = None\n\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=self._asset_key,\n after_cursor=after_cursor,\n ),\n ascending=False,\n limit=1,\n )\n\n if not event_records:\n yield SkipReason(\n f"No new materialization events found for asset key {self._asset_key}"\n )\n return\n\n event_record = event_records[0]\n\n (\n context_param_name,\n event_log_entry_param_name,\n ) = get_asset_sensor_param_names(materialization_fn)\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n # Build asset sensor function args, which can include any subset of\n # context arg, event log entry arg, and any resource args\n 
args = resource_args_populated\n if context_param_name:\n args[context_param_name] = context\n if event_log_entry_param_name:\n args[event_log_entry_param_name] = event_record.event_log_entry\n\n result = materialization_fn(**args)\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (SkipReason, RunRequest)):\n yield result\n context.update_cursor(str(event_record.storage_id))\n\n return _fn\n\n super(AssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn"),\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """AssetKey: The key of the asset targeted by this sensor."""\n return self._asset_key\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_sensor_definition"}, "asset_spec": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.asset_spec

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, Any, Iterable, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .auto_materialize_policy import AutoMaterializePolicy\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .freshness_policy import FreshnessPolicy\nfrom .metadata import MetadataUserInput\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\n\n# SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE lives on the metadata of an asset\n# (which currently ends up on the Output associated with the asset key)\n# whih encodes the execution type the of asset. "Unexecutable" assets are assets\n# that cannot be materialized in Dagster, but can have events in the event\n# log keyed off of them, making Dagster usable as a observability and lineage tool\n# for externally materialized assets.\nSYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE = "dagster/asset_execution_type"\n\n\nclass AssetExecutionType(Enum):\n    UNEXECUTABLE = "UNEXECUTABLE"\n    MATERIALIZATION = "MATERIALIZATION"\n\n    @staticmethod\n    def is_executable(varietal_str: Optional[str]) -> bool:\n        return AssetExecutionType.str_to_enum(varietal_str) in {AssetExecutionType.MATERIALIZATION}\n\n    @staticmethod\n    def str_to_enum(varietal_str: Optional[str]) -> "AssetExecutionType":\n        return (\n            AssetExecutionType.MATERIALIZATION\n            if varietal_str is None\n            else AssetExecutionType(varietal_str)\n        )\n\n\n
[docs]@experimental\nclass AssetSpec(\n NamedTuple(\n "_AssetSpec",\n [\n ("key", PublicAttr[AssetKey]),\n ("deps", PublicAttr[Iterable["AssetDep"]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n ("group_name", PublicAttr[Optional[str]]),\n ("skippable", PublicAttr[bool]),\n ("code_version", PublicAttr[Optional[str]]),\n ("freshness_policy", PublicAttr[Optional[FreshnessPolicy]]),\n ("auto_materialize_policy", PublicAttr[Optional[AutoMaterializePolicy]]),\n ],\n )\n):\n """Specifies the core attributes of an asset. This object is attached to the decorated\n function that defines how it materialized.\n\n Attributes:\n key (AssetKey): The unique identifier for this asset.\n deps (Optional[AbstractSet[AssetKey]]): The asset keys for the upstream assets that\n materializing this asset depends on.\n description (Optional[str]): Human-readable description of this asset.\n metadata (Optional[Dict[str, Any]]): A dict of static metadata for this asset.\n For example, users can provide information about the database table this\n asset corresponds to.\n skippable (bool): Whether this asset can be omitted during materialization, causing downstream\n dependencies to skip.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. 
If\n not provided, the name "default" is used.\n code_version (Optional[str]): The version of the code for this specific asset,\n overriding the code version of the materialization function\n freshness_policy (Optional[FreshnessPolicy]): A policy which indicates how up to date this\n asset is intended to be.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply to\n the specified asset.\n backfill_policy (Optional[BackfillPolicy]): BackfillPolicy to apply to the specified asset.\n """\n\n def __new__(\n cls,\n key: CoercibleToAssetKey,\n *,\n deps: Optional[Iterable["CoercibleToAssetDep"]] = None,\n description: Optional[str] = None,\n metadata: Optional[MetadataUserInput] = None,\n skippable: bool = False,\n group_name: Optional[str] = None,\n code_version: Optional[str] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n ):\n from dagster._core.definitions.asset_dep import AssetDep\n\n dep_set = {}\n if deps:\n for dep in deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys.\n if asset_dep.asset_key in dep_set.keys():\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once for"\n f" AssetSpec {key}"\n )\n dep_set[asset_dep.asset_key] = asset_dep\n\n return super().__new__(\n cls,\n key=AssetKey.from_coercible(key),\n deps=list(dep_set.values()),\n description=check.opt_str_param(description, "description"),\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n skippable=check.bool_param(skippable, "skippable"),\n group_name=check.opt_str_param(group_name, "group_name"),\n code_version=check.opt_str_param(code_version, "code_version"),\n freshness_policy=check.opt_inst_param(\n freshness_policy,\n "freshness_policy",\n FreshnessPolicy,\n ),\n auto_materialize_policy=check.opt_inst_param(\n auto_materialize_policy,\n "auto_materialize_policy",\n AutoMaterializePolicy,\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/asset_spec", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.asset_spec"}, "assets": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.assets

\nimport hashlib\nimport json\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_layer import get_dep_node_handles_of_graph_backed_asset\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.op_selection import get_graph_subset\nfrom dagster._core.definitions.partition_mapping import MultiPartitionMapping\nfrom dagster._core.definitions.resource_requirement import (\n    RequiresResources,\n    ResourceAddable,\n    ResourceRequirement,\n    merge_resource_defs,\n)\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom .dependency import NodeHandle\nfrom .events import AssetKey, CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom .node_definition import NodeDefinition\nfrom 
.op_definition import OpDefinition\nfrom .partition import PartitionsDefinition\nfrom .partition_mapping import (\n    PartitionMapping,\n    get_builtin_partition_mapping_types,\n    infer_partition_mapping,\n)\nfrom .resource_definition import ResourceDefinition\nfrom .source_asset import SourceAsset\nfrom .utils import DEFAULT_GROUP_NAME, validate_group_name\n\nif TYPE_CHECKING:\n    from .graph_definition import GraphDefinition\n\n\n
[docs]class AssetsDefinition(ResourceAddable, RequiresResources, IHasInternalInit):\n """Defines a set of assets that are produced by the same op or graph.\n\n AssetsDefinitions are typically not instantiated directly, but rather produced using the\n :py:func:`@asset <asset>` or :py:func:`@multi_asset <multi_asset>` decorators.\n """\n\n _node_def: NodeDefinition\n _keys_by_input_name: Mapping[str, AssetKey]\n _keys_by_output_name: Mapping[str, AssetKey]\n _partitions_def: Optional[PartitionsDefinition]\n _partition_mappings: Mapping[AssetKey, PartitionMapping]\n _asset_deps: Mapping[AssetKey, AbstractSet[AssetKey]]\n _resource_defs: Mapping[str, ResourceDefinition]\n _group_names_by_key: Mapping[AssetKey, str]\n _selected_asset_keys: AbstractSet[AssetKey]\n _can_subset: bool\n _metadata_by_key: Mapping[AssetKey, ArbitraryMetadataMapping]\n _freshness_policies_by_key: Mapping[AssetKey, FreshnessPolicy]\n _auto_materialize_policies_by_key: Mapping[AssetKey, AutoMaterializePolicy]\n _backfill_policy: Optional[BackfillPolicy]\n _code_versions_by_key: Mapping[AssetKey, Optional[str]]\n _descriptions_by_key: Mapping[AssetKey, str]\n _selected_asset_check_keys: AbstractSet[AssetCheckKey]\n\n def __init__(\n self,\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]] = None,\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]] = None,\n selected_asset_keys: Optional[AbstractSet[AssetKey]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_names_by_key: Optional[Mapping[AssetKey, str]] = None,\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]] = None,\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, 
AutoMaterializePolicy]] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]] = None,\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]] = None,\n # if adding new fields, make sure to handle them in the with_attributes, from_graph, and\n # get_attributes_dict methods\n ):\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .graph_definition import GraphDefinition\n\n if isinstance(node_def, GraphDefinition):\n _validate_graph_def(node_def)\n\n self._node_def = node_def\n self._keys_by_input_name = check.mapping_param(\n keys_by_input_name,\n "keys_by_input_name",\n key_type=str,\n value_type=AssetKey,\n )\n self._keys_by_output_name = check.mapping_param(\n keys_by_output_name,\n "keys_by_output_name",\n key_type=str,\n value_type=AssetKey,\n )\n\n check.opt_mapping_param(\n check_specs_by_output_name,\n "check_specs_by_output_name",\n key_type=str,\n value_type=AssetCheckSpec,\n )\n\n # if not specified assume all output assets depend on all input assets\n all_asset_keys = set(keys_by_output_name.values())\n input_asset_keys = set(keys_by_input_name.values())\n\n self._partitions_def = partitions_def\n self._partition_mappings = partition_mappings or {}\n builtin_partition_mappings = get_builtin_partition_mapping_types()\n for asset_key, partition_mapping in self._partition_mappings.items():\n if not isinstance(partition_mapping, builtin_partition_mappings):\n warnings.warn(\n f"Non-built-in PartitionMappings, such as {type(partition_mapping).__name__} "\n "are deprecated and will not work with asset reconciliation. 
The built-in "\n "partition mappings are "\n + ", ".join(\n builtin_partition_mapping.__name__\n for builtin_partition_mapping in builtin_partition_mappings\n )\n + ".",\n category=DeprecationWarning,\n )\n\n if asset_key not in input_asset_keys:\n check.failed(\n f"While constructing AssetsDefinition outputting {all_asset_keys}, received a"\n f" partition mapping for {asset_key} that is not defined in the set of upstream"\n f" assets: {input_asset_keys}"\n )\n\n self._asset_deps = asset_deps or {\n out_asset_key: set(keys_by_input_name.values()) for out_asset_key in all_asset_keys\n }\n check.invariant(\n set(self._asset_deps.keys()) == all_asset_keys,\n "The set of asset keys with dependencies specified in the asset_deps argument must "\n "equal the set of asset keys produced by this AssetsDefinition. \\n"\n f"asset_deps keys: {set(self._asset_deps.keys())} \\n"\n f"expected keys: {all_asset_keys}",\n )\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs")\n )\n\n group_names_by_key = (\n check.mapping_param(group_names_by_key, "group_names_by_key")\n if group_names_by_key\n else {}\n )\n self._group_names_by_key = {}\n # assets that don't have a group name get a DEFAULT_GROUP_NAME\n for key in all_asset_keys:\n group_name = group_names_by_key.get(key)\n self._group_names_by_key[key] = validate_group_name(group_name)\n\n if selected_asset_keys is not None:\n self._selected_asset_keys = selected_asset_keys\n else:\n self._selected_asset_keys = all_asset_keys\n self._can_subset = can_subset\n\n self._code_versions_by_key = {}\n self._metadata_by_key = dict(\n check.opt_mapping_param(\n metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict\n )\n )\n self._descriptions_by_key = dict(\n check.opt_mapping_param(\n descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str\n )\n )\n for output_name, asset_key in keys_by_output_name.items():\n output_def, _ = 
node_def.resolve_output_to_origin(output_name, None)\n self._metadata_by_key[asset_key] = merge_dicts(\n output_def.metadata,\n self._metadata_by_key.get(asset_key, {}),\n )\n # We construct description from three sources of truth here. This\n # highly unfortunate. See commentary in @multi_asset's call to dagster_internal_init.\n description = (\n self._descriptions_by_key.get(asset_key, output_def.description)\n or node_def.description\n )\n if description:\n self._descriptions_by_key[asset_key] = description\n self._code_versions_by_key[asset_key] = output_def.code_version\n\n for key, freshness_policy in (freshness_policies_by_key or {}).items():\n check.param_invariant(\n not (\n freshness_policy\n and self._partitions_def is not None\n and not isinstance(self._partitions_def, TimeWindowPartitionsDefinition)\n ),\n "freshness_policies_by_key",\n "FreshnessPolicies are currently unsupported for assets with partitions of type"\n f" {type(self._partitions_def)}.",\n )\n\n self._freshness_policies_by_key = check.opt_mapping_param(\n freshness_policies_by_key,\n "freshness_policies_by_key",\n key_type=AssetKey,\n value_type=FreshnessPolicy,\n )\n\n self._auto_materialize_policies_by_key = check.opt_mapping_param(\n auto_materialize_policies_by_key,\n "auto_materialize_policies_by_key",\n key_type=AssetKey,\n value_type=AutoMaterializePolicy,\n )\n\n self._backfill_policy = check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n )\n\n if selected_asset_check_keys is None:\n self._check_specs_by_output_name = check_specs_by_output_name or {}\n else:\n self._check_specs_by_output_name = {\n output_name: check_spec\n for output_name, check_spec in (check_specs_by_output_name or {}).items()\n if check_spec.key in selected_asset_check_keys\n }\n\n self._check_specs_by_handle = {\n spec.key: spec for spec in self._check_specs_by_output_name.values()\n }\n if selected_asset_check_keys is not None:\n self._selected_asset_check_keys = 
selected_asset_check_keys\n else:\n self._selected_asset_check_keys = self._check_specs_by_handle.keys()\n\n if self._partitions_def is None:\n # check if backfill policy is BackfillPolicyType.SINGLE_RUN if asset is not partitioned\n check.param_invariant(\n (\n backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n _validate_self_deps(\n input_keys=self._keys_by_input_name.values(),\n output_keys=self._selected_asset_keys,\n partition_mappings=self._partition_mappings,\n partitions_def=self._partitions_def,\n )\n\n @staticmethod\n def dagster_internal_init(\n *,\n keys_by_input_name: Mapping[str, AssetKey],\n keys_by_output_name: Mapping[str, AssetKey],\n node_def: NodeDefinition,\n partitions_def: Optional[PartitionsDefinition],\n partition_mappings: Optional[Mapping[AssetKey, PartitionMapping]],\n asset_deps: Optional[Mapping[AssetKey, AbstractSet[AssetKey]]],\n selected_asset_keys: Optional[AbstractSet[AssetKey]],\n can_subset: bool,\n resource_defs: Optional[Mapping[str, object]],\n group_names_by_key: Optional[Mapping[AssetKey, str]],\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]],\n freshness_policies_by_key: Optional[Mapping[AssetKey, FreshnessPolicy]],\n auto_materialize_policies_by_key: Optional[Mapping[AssetKey, AutoMaterializePolicy]],\n backfill_policy: Optional[BackfillPolicy],\n descriptions_by_key: Optional[Mapping[AssetKey, str]],\n check_specs_by_output_name: Optional[Mapping[str, AssetCheckSpec]],\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],\n ) -> "AssetsDefinition":\n return AssetsDefinition(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=node_def,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n asset_deps=asset_deps,\n selected_asset_keys=selected_asset_keys,\n 
can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n metadata_by_key=metadata_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n descriptions_by_key=descriptions_by_key,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=selected_asset_check_keys,\n )\n\n def __call__(self, *args: object, **kwargs: object) -> object:\n from .composition import is_in_composition\n from .graph_definition import GraphDefinition\n\n # defer to GraphDefinition.__call__ for graph backed assets, or if invoked in composition\n if isinstance(self.node_def, GraphDefinition) or is_in_composition():\n return self._node_def(*args, **kwargs)\n\n # invoke against self to allow assets def information to be used\n return direct_invocation_result(self, *args, **kwargs)\n\n
[docs] @public\n @experimental_param(param="resource_defs")\n @staticmethod\n def from_graph(\n graph_def: "GraphDefinition",\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ) -> "AssetsDefinition":\n """Constructs an AssetsDefinition from a GraphDefinition.\n\n Args:\n graph_def (GraphDefinition): The GraphDefinition that is an asset.\n keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input\n names of the decorated graph to their corresponding asset keys. If not provided,\n the input asset keys will be created from the graph input names.\n keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output\n names of the decorated graph to their corresponding asset keys. If not provided,\n the output asset keys will be created from the graph output names.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be prepended\n to each key in keys_by_output_name. 
Each item in key_prefix must be a valid name in\n dagster (ie only contains letters, numbers, and _) and may not contain python\n reserved keywords.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by the graph depend on all assets that are consumed by that\n graph. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\n either used as input to the asset or produced within the graph.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n correponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]):\n (Experimental) A mapping of resource keys to resource definitions. These resources\n will be initialized during execution, and can be accessed from the\n body of ops in the graph during execution.\n group_name (Optional[str]): A group name for the constructed asset. Assets without a\n group name are assigned to a group called "default".\n group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group name to be\n associated with some or all of the output assets for this node. Keys are names of the\n outputs, and values are the group name. 
Cannot be used with the group_name argument.\n descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a description to be\n associated with each of the output asstes for this graph.\n metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines metadata to\n be associated with each of the output assets for this node. Keys are names of the\n outputs, and values are dictionaries of metadata to be associated with the related\n asset.\n freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]): Defines a\n FreshnessPolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the FreshnessPolicies to be attached\n to the associated asset.\n auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]): Defines an\n AutoMaterializePolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\n to the associated asset.\n backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy\n """\n return AssetsDefinition._from_node(\n node_def=graph_def,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n key_prefix=key_prefix,\n internal_asset_deps=internal_asset_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n resource_defs=resource_defs,\n group_name=group_name,\n group_names_by_output_name=group_names_by_output_name,\n descriptions_by_output_name=descriptions_by_output_name,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n can_subset=can_subset,\n check_specs=check_specs,\n )
\n\n
[docs] @public\n @staticmethod\n def from_op(\n op_def: OpDefinition,\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n ) -> "AssetsDefinition":\n """Constructs an AssetsDefinition from an OpDefinition.\n\n Args:\n op_def (OpDefinition): The OpDefinition that is an asset.\n keys_by_input_name (Optional[Mapping[str, AssetKey]]): A mapping of the input\n names of the decorated op to their corresponding asset keys. If not provided,\n the input asset keys will be created from the op input names.\n keys_by_output_name (Optional[Mapping[str, AssetKey]]): A mapping of the output\n names of the decorated op to their corresponding asset keys. If not provided,\n the output asset keys will be created from the op output names.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, key_prefix will be prepended\n to each key in keys_by_output_name. 
Each item in key_prefix must be a valid name in\n dagster (ie only contains letters, numbers, and _) and may not contain python\n reserved keywords.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by the op depend on all assets that are consumed by that\n op. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\n either used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n partition_mappings (Optional[Mapping[str, PartitionMapping]]): Defines how to map partition\n keys for this asset to partition keys of upstream assets. Each key in the dictionary\n correponds to one of the input assets, and each value is a PartitionMapping.\n If no entry is provided for a particular asset dependency, the partition mapping defaults\n to the default partition mapping for the partitions definition, which is typically maps\n partition keys to the same partition keys in upstream assets.\n group_name (Optional[str]): A group name for the constructed asset. Assets without a\n group name are assigned to a group called "default".\n group_names_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a group name to be\n associated with some or all of the output assets for this node. Keys are names of the\n outputs, and values are the group name. Cannot be used with the group_name argument.\n descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]): Defines a description to be\n associated with each of the output asstes for this graph.\n metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]): Defines metadata to\n be associated with each of the output assets for this node. 
Keys are names of the\n outputs, and values are dictionaries of metadata to be associated with the related\n asset.\n freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]): Defines a\n FreshnessPolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the FreshnessPolicies to be attached\n to the associated asset.\n auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]): Defines an\n AutoMaterializePolicy to be associated with some or all of the output assets for this node.\n Keys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\n to the associated asset.\n backfill_policy (Optional[BackfillPolicy]): Defines this asset's BackfillPolicy\n """\n return AssetsDefinition._from_node(\n node_def=op_def,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n key_prefix=key_prefix,\n internal_asset_deps=internal_asset_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings,\n group_name=group_name,\n group_names_by_output_name=group_names_by_output_name,\n descriptions_by_output_name=descriptions_by_output_name,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n can_subset=can_subset,\n )
\n\n @staticmethod\n def _from_node(\n node_def: Union[OpDefinition, "GraphDefinition"],\n *,\n keys_by_input_name: Optional[Mapping[str, AssetKey]] = None,\n keys_by_output_name: Optional[Mapping[str, AssetKey]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_mappings: Optional[Mapping[str, PartitionMapping]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n group_name: Optional[str] = None,\n group_names_by_output_name: Optional[Mapping[str, Optional[str]]] = None,\n descriptions_by_output_name: Optional[Mapping[str, str]] = None,\n metadata_by_output_name: Optional[Mapping[str, Optional[ArbitraryMetadataMapping]]] = None,\n freshness_policies_by_output_name: Optional[Mapping[str, Optional[FreshnessPolicy]]] = None,\n auto_materialize_policies_by_output_name: Optional[\n Mapping[str, Optional[AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n can_subset: bool = False,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ) -> "AssetsDefinition":\n from dagster._core.definitions.decorators.asset_decorator import (\n _validate_and_assign_output_names_to_check_specs,\n )\n\n node_def = check.inst_param(node_def, "node_def", NodeDefinition)\n keys_by_input_name = _infer_keys_by_input_names(\n node_def,\n check.opt_mapping_param(\n keys_by_input_name, "keys_by_input_name", key_type=str, value_type=AssetKey\n ),\n )\n keys_by_output_name = check.opt_mapping_param(\n keys_by_output_name,\n "keys_by_output_name",\n key_type=str,\n value_type=AssetKey,\n )\n internal_asset_deps = check.opt_mapping_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n transformed_internal_asset_deps: 
Dict[AssetKey, AbstractSet[AssetKey]] = {}\n if internal_asset_deps:\n for output_name, asset_keys in internal_asset_deps.items():\n check.invariant(\n output_name in keys_by_output_name,\n f"output_name {output_name} specified in internal_asset_deps does not exist"\n " in the decorated function",\n )\n transformed_internal_asset_deps[keys_by_output_name[output_name]] = asset_keys\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(keys_by_output_name.values())\n )\n\n keys_by_output_name = _infer_keys_by_output_names(\n node_def, keys_by_output_name or {}, check_specs_by_output_name\n )\n\n keys_by_output_name_with_prefix: Dict[str, AssetKey] = {}\n key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix\n for output_name, key in keys_by_output_name.items():\n # add key_prefix to the beginning of each asset key\n key_with_key_prefix = AssetKey(\n list(filter(None, [*(key_prefix_list or []), *key.path]))\n )\n keys_by_output_name_with_prefix[output_name] = key_with_key_prefix\n\n check.param_invariant(\n group_name is None or group_names_by_output_name is None,\n "group_name",\n "Cannot use both group_name and group_names_by_output_name",\n )\n\n if group_name:\n group_names_by_key = {\n asset_key: group_name for asset_key in keys_by_output_name_with_prefix.values()\n }\n elif group_names_by_output_name:\n group_names_by_key = {\n keys_by_output_name_with_prefix[output_name]: group_name\n for output_name, group_name in group_names_by_output_name.items()\n if group_name is not None\n }\n else:\n group_names_by_key = None\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name_with_prefix,\n node_def=node_def,\n asset_deps=transformed_internal_asset_deps or None,\n partitions_def=check.opt_inst_param(\n partitions_def,\n "partitions_def",\n PartitionsDefinition,\n ),\n group_names_by_key=group_names_by_key,\n 
resource_defs=resource_defs,\n partition_mappings=(\n {\n keys_by_input_name[input_name]: partition_mapping\n for input_name, partition_mapping in partition_mappings.items()\n }\n if partition_mappings\n else None\n ),\n metadata_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: metadata\n for output_name, metadata in metadata_by_output_name.items()\n if metadata is not None\n }\n if metadata_by_output_name\n else None\n ),\n freshness_policies_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: freshness_policy\n for output_name, freshness_policy in freshness_policies_by_output_name.items()\n if freshness_policy is not None\n }\n if freshness_policies_by_output_name\n else None\n ),\n auto_materialize_policies_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: auto_materialize_policy\n for output_name, auto_materialize_policy in auto_materialize_policies_by_output_name.items()\n if auto_materialize_policy is not None\n }\n if auto_materialize_policies_by_output_name\n else None\n ),\n backfill_policy=check.opt_inst_param(\n backfill_policy, "backfill_policy", BackfillPolicy\n ),\n descriptions_by_key=(\n {\n keys_by_output_name_with_prefix[output_name]: description\n for output_name, description in descriptions_by_output_name.items()\n if description is not None\n }\n if descriptions_by_output_name\n else None\n ),\n can_subset=can_subset,\n selected_asset_keys=None, # node has no subselection info\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None,\n )\n\n @public\n @property\n def can_subset(self) -> bool:\n """bool: If True, indicates that this AssetsDefinition may materialize any subset of its\n asset keys in a given computation (as opposed to being required to materialize all asset\n keys).\n """\n return self._can_subset\n\n @public\n @property\n def group_names_by_key(self) -> Mapping[AssetKey, str]:\n """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this 
AssetsDefinition\n to the group names assigned to them. If there is no assigned group name for a given AssetKey,\n it will not be present in this dictionary.\n """\n return self._group_names_by_key\n\n @public\n @property\n def descriptions_by_key(self) -> Mapping[AssetKey, str]:\n """Mapping[AssetKey, str]: Returns a mapping from the asset keys in this AssetsDefinition\n to the descriptions assigned to them. If there is no assigned description for a given AssetKey,\n it will not be present in this dictionary.\n """\n return self._descriptions_by_key\n\n @public\n @property\n def op(self) -> OpDefinition:\n """OpDefinition: Returns the OpDefinition that is used to materialize the assets in this\n AssetsDefinition.\n """\n check.invariant(\n isinstance(self._node_def, OpDefinition),\n "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",\n )\n return cast(OpDefinition, self._node_def)\n\n @public\n @property\n def node_def(self) -> NodeDefinition:\n """NodeDefinition: Returns the OpDefinition or GraphDefinition that is used to materialize\n the assets in this AssetsDefinition.\n """\n return self._node_def\n\n @public\n @property\n def asset_deps(self) -> Mapping[AssetKey, AbstractSet[AssetKey]]:\n """Maps assets that are produced by this definition to assets that they depend on. The\n dependencies can be either "internal", meaning that they refer to other assets that are\n produced by this definition, or "external", meaning that they refer to assets that aren't\n produced by this definition.\n """\n return self._asset_deps\n\n @property\n def input_names(self) -> Iterable[str]:\n """Iterable[str]: The set of input names of the underlying NodeDefinition for this\n AssetsDefinition.\n """\n return self.keys_by_input_name.keys()\n\n @public\n @property\n def key(self) -> AssetKey:\n """AssetKey: The asset key associated with this AssetsDefinition. 
If this AssetsDefinition\n has more than one asset key, this will produce an error.\n """\n check.invariant(\n len(self.keys) == 1,\n "Tried to retrieve asset key from an assets definition with multiple asset keys: "\n + ", ".join([str(ak.to_string()) for ak in self._keys_by_output_name.values()]),\n )\n\n return next(iter(self.keys))\n\n @public\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n """Mapping[str, ResourceDefinition]: A mapping from resource name to ResourceDefinition for\n the resources bound to this AssetsDefinition.\n """\n return dict(self._resource_defs)\n\n @public\n @property\n def keys(self) -> AbstractSet[AssetKey]:\n """AbstractSet[AssetKey]: The asset keys associated with this AssetsDefinition."""\n return self._selected_asset_keys\n\n @public\n @property\n def dependency_keys(self) -> Iterable[AssetKey]:\n """Iterable[AssetKey]: The asset keys which are upstream of any asset included in this\n AssetsDefinition.\n """\n # the input asset keys that are directly upstream of a selected asset key\n upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}\n input_keys = set(self._keys_by_input_name.values())\n return upstream_keys.intersection(input_keys)\n\n @property\n def node_keys_by_output_name(self) -> Mapping[str, AssetKey]:\n """AssetKey for each output on the underlying NodeDefinition."""\n return self._keys_by_output_name\n\n @property\n def node_keys_by_input_name(self) -> Mapping[str, AssetKey]:\n """AssetKey for each input on the underlying NodeDefinition."""\n return self._keys_by_input_name\n\n @property\n def check_specs_by_output_name(self) -> Mapping[str, AssetCheckSpec]:\n return self._check_specs_by_output_name\n\n def get_spec_for_check_key(self, asset_check_key: AssetCheckKey) -> AssetCheckSpec:\n return self._check_specs_by_handle[asset_check_key]\n\n @property\n def keys_by_output_name(self) -> Mapping[str, AssetKey]:\n return {\n name: key for name, key in 
self.node_keys_by_output_name.items() if key in self.keys\n }\n\n @property\n def keys_by_input_name(self) -> Mapping[str, AssetKey]:\n upstream_keys = {dep_key for key in self.keys for dep_key in self.asset_deps[key]}\n return {\n name: key for name, key in self.node_keys_by_input_name.items() if key in upstream_keys\n }\n\n @property\n def freshness_policies_by_key(self) -> Mapping[AssetKey, FreshnessPolicy]:\n return self._freshness_policies_by_key\n\n @property\n def auto_materialize_policies_by_key(self) -> Mapping[AssetKey, AutoMaterializePolicy]:\n return self._auto_materialize_policies_by_key\n\n @property\n def backfill_policy(self) -> Optional[BackfillPolicy]:\n return self._backfill_policy\n\n @public\n @property\n def partitions_def(self) -> Optional[PartitionsDefinition]:\n """Optional[PartitionsDefinition]: The PartitionsDefinition for this AssetsDefinition (if any)."""\n return self._partitions_def\n\n @property\n def metadata_by_key(self) -> Mapping[AssetKey, ArbitraryMetadataMapping]:\n return self._metadata_by_key\n\n @property\n def code_versions_by_key(self) -> Mapping[AssetKey, Optional[str]]:\n return self._code_versions_by_key\n\n @property\n def partition_mappings(self) -> Mapping[AssetKey, PartitionMapping]:\n return self._partition_mappings\n\n
[docs] @public\n def get_partition_mapping(self, in_asset_key: AssetKey) -> Optional[PartitionMapping]:\n """Returns the partition mapping between keys in this AssetsDefinition and a given input\n asset key (if any).\n """\n return self._partition_mappings.get(in_asset_key)
\n\n @public\n @property\n def check_specs(self) -> Iterable[AssetCheckSpec]:\n """Returns the asset check specs defined on this AssetsDefinition, i.e. the checks that can\n be executed while materializing the assets.\n\n Returns:\n Iterable[AssetsCheckSpec]:\n """\n return self._check_specs_by_output_name.values()\n\n @property\n def check_keys(self) -> AbstractSet[AssetCheckKey]:\n """Returns the selected asset checks associated by this AssetsDefinition.\n\n Returns:\n AbstractSet[Tuple[AssetKey, str]]: The selected asset checks. An asset check is\n identified by the asset key and the name of the check.\n """\n return self._selected_asset_check_keys\n\n def is_asset_executable(self, asset_key: AssetKey) -> bool:\n """Returns True if the asset key is materializable by this AssetsDefinition.\n\n Args:\n asset_key (AssetKey): The asset key to check.\n\n Returns:\n bool: True if the asset key is materializable by this AssetsDefinition.\n """\n from dagster._core.definitions.asset_spec import (\n SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE,\n AssetExecutionType,\n )\n\n return AssetExecutionType.is_executable(\n self._metadata_by_key.get(asset_key, {}).get(SYSTEM_METADATA_KEY_ASSET_EXECUTION_TYPE)\n )\n\n def get_partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:\n return self._partition_mappings.get(self._keys_by_input_name[input_name])\n\n def infer_partition_mapping(\n self, upstream_asset_key: AssetKey, upstream_partitions_def: Optional[PartitionsDefinition]\n ) -> PartitionMapping:\n with disable_dagster_warnings():\n partition_mapping = self._partition_mappings.get(upstream_asset_key)\n return infer_partition_mapping(\n partition_mapping, self._partitions_def, upstream_partitions_def\n )\n\n def get_output_name_for_asset_key(self, key: AssetKey) -> str:\n for output_name, asset_key in self.keys_by_output_name.items():\n if key == asset_key:\n return output_name\n\n raise DagsterInvariantViolationError(\n f"Asset key 
{key.to_user_string()} not found in AssetsDefinition"\n )\n\n def get_op_def_for_asset_key(self, key: AssetKey) -> OpDefinition:\n """If this is an op-backed asset, returns the op def. If it's a graph-backed asset,\n returns the op def within the graph that produces the given asset key.\n """\n output_name = self.get_output_name_for_asset_key(key)\n return self.node_def.resolve_output_to_origin_op_def(output_name)\n\n def with_attributes(\n self,\n *,\n output_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,\n input_asset_key_replacements: Optional[Mapping[AssetKey, AssetKey]] = None,\n group_names_by_key: Optional[Mapping[AssetKey, str]] = None,\n descriptions_by_key: Optional[Mapping[AssetKey, str]] = None,\n metadata_by_key: Optional[Mapping[AssetKey, ArbitraryMetadataMapping]] = None,\n freshness_policy: Optional[\n Union[FreshnessPolicy, Mapping[AssetKey, FreshnessPolicy]]\n ] = None,\n auto_materialize_policy: Optional[\n Union[AutoMaterializePolicy, Mapping[AssetKey, AutoMaterializePolicy]]\n ] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n ) -> "AssetsDefinition":\n output_asset_key_replacements = check.opt_mapping_param(\n output_asset_key_replacements,\n "output_asset_key_replacements",\n key_type=AssetKey,\n value_type=AssetKey,\n )\n input_asset_key_replacements = check.opt_mapping_param(\n input_asset_key_replacements,\n "input_asset_key_replacements",\n key_type=AssetKey,\n value_type=AssetKey,\n )\n group_names_by_key = check.opt_mapping_param(\n group_names_by_key, "group_names_by_key", key_type=AssetKey, value_type=str\n )\n descriptions_by_key = check.opt_mapping_param(\n descriptions_by_key, "descriptions_by_key", key_type=AssetKey, value_type=str\n )\n metadata_by_key = check.opt_mapping_param(\n metadata_by_key, "metadata_by_key", key_type=AssetKey, value_type=dict\n )\n\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n if group_names_by_key:\n 
group_name_conflicts = [\n asset_key\n for asset_key in group_names_by_key\n if asset_key in self.group_names_by_key\n and self.group_names_by_key[asset_key] != DEFAULT_GROUP_NAME\n ]\n if group_name_conflicts:\n raise DagsterInvalidDefinitionError(\n "Group name already exists on assets"\n f" {', '.join(asset_key.to_user_string() for asset_key in group_name_conflicts)}"\n )\n\n replaced_group_names_by_key = {\n output_asset_key_replacements.get(key, key): group_name\n for key, group_name in self.group_names_by_key.items()\n }\n\n if freshness_policy:\n freshness_policy_conflicts = (\n self.freshness_policies_by_key.keys()\n if isinstance(freshness_policy, FreshnessPolicy)\n else (freshness_policy.keys() & self.freshness_policies_by_key.keys())\n )\n if freshness_policy_conflicts:\n raise DagsterInvalidDefinitionError(\n "FreshnessPolicy already exists on assets"\n f" {', '.join(key.to_string() for key in freshness_policy_conflicts)}"\n )\n\n replaced_freshness_policies_by_key = {}\n for key in self.keys:\n if isinstance(freshness_policy, FreshnessPolicy):\n replaced_freshness_policy = freshness_policy\n elif freshness_policy:\n replaced_freshness_policy = freshness_policy.get(key)\n else:\n replaced_freshness_policy = self.freshness_policies_by_key.get(key)\n\n if replaced_freshness_policy:\n replaced_freshness_policies_by_key[output_asset_key_replacements.get(key, key)] = (\n replaced_freshness_policy\n )\n\n if auto_materialize_policy:\n auto_materialize_policy_conflicts = (\n self.auto_materialize_policies_by_key.keys()\n if isinstance(auto_materialize_policy, AutoMaterializePolicy)\n else (auto_materialize_policy.keys() & self.auto_materialize_policies_by_key.keys())\n )\n if auto_materialize_policy_conflicts:\n raise DagsterInvalidDefinitionError(\n "AutoMaterializePolicy already exists on assets"\n f" {', '.join(key.to_string() for key in auto_materialize_policy_conflicts)}"\n )\n\n replaced_auto_materialize_policies_by_key = {}\n for key in self.keys:\n if 
isinstance(auto_materialize_policy, AutoMaterializePolicy):\n replaced_auto_materialize_policy = auto_materialize_policy\n elif auto_materialize_policy:\n replaced_auto_materialize_policy = auto_materialize_policy.get(key)\n else:\n replaced_auto_materialize_policy = self.auto_materialize_policies_by_key.get(key)\n\n if replaced_auto_materialize_policy:\n replaced_auto_materialize_policies_by_key[\n output_asset_key_replacements.get(key, key)\n ] = replaced_auto_materialize_policy\n\n replaced_descriptions_by_key = {\n output_asset_key_replacements.get(key, key): description\n for key, description in descriptions_by_key.items()\n }\n\n if not metadata_by_key:\n metadata_by_key = self.metadata_by_key\n\n replaced_metadata_by_key = {\n output_asset_key_replacements.get(key, key): metadata\n for key, metadata in metadata_by_key.items()\n }\n\n replaced_attributes = dict(\n keys_by_input_name={\n input_name: input_asset_key_replacements.get(key, key)\n for input_name, key in self._keys_by_input_name.items()\n },\n keys_by_output_name={\n output_name: output_asset_key_replacements.get(key, key)\n for output_name, key in self._keys_by_output_name.items()\n },\n partition_mappings={\n input_asset_key_replacements.get(key, key): partition_mapping\n for key, partition_mapping in self._partition_mappings.items()\n },\n asset_deps={\n # replace both the keys and the values in this mapping\n output_asset_key_replacements.get(key, key): {\n input_asset_key_replacements.get(\n upstream_key,\n output_asset_key_replacements.get(upstream_key, upstream_key),\n )\n for upstream_key in value\n }\n for key, value in self.asset_deps.items()\n },\n selected_asset_keys={\n output_asset_key_replacements.get(key, key) for key in self._selected_asset_keys\n },\n group_names_by_key={\n **replaced_group_names_by_key,\n **group_names_by_key,\n },\n metadata_by_key=replaced_metadata_by_key,\n freshness_policies_by_key=replaced_freshness_policies_by_key,\n 
auto_materialize_policies_by_key=replaced_auto_materialize_policies_by_key,\n backfill_policy=backfill_policy if backfill_policy else self.backfill_policy,\n descriptions_by_key=replaced_descriptions_by_key,\n )\n\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n\n def _subset_graph_backed_asset(\n self,\n selected_asset_keys: AbstractSet[AssetKey],\n ):\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n if not isinstance(self.node_def, GraphDefinition):\n raise DagsterInvalidInvocationError(\n "Method _subset_graph_backed_asset cannot subset an asset that is not a graph"\n )\n\n # All asset keys in selected_asset_keys are outputted from the same top-level graph backed asset\n dep_node_handles_by_asset_key = get_dep_node_handles_of_graph_backed_asset(\n self.node_def, self\n )\n op_selection: List[str] = []\n for asset_key in selected_asset_keys:\n dep_node_handles = dep_node_handles_by_asset_key[asset_key]\n for dep_node_handle in dep_node_handles:\n op_selection.append(".".join(dep_node_handle.path[1:]))\n\n return get_graph_subset(self.node_def, op_selection)\n\n def subset_for(\n self,\n selected_asset_keys: AbstractSet[AssetKey],\n selected_asset_check_keys: Optional[AbstractSet[AssetCheckKey]],\n ) -> "AssetsDefinition":\n """Create a subset of this AssetsDefinition that will only materialize the assets and checks\n in the selected set.\n\n Args:\n selected_asset_keys (AbstractSet[AssetKey]): The total set of asset keys\n selected_asset_check_keys (AbstractSet[AssetCheckKey]): The selected asset checks\n """\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n check.invariant(\n self.can_subset,\n f"Attempted to subset AssetsDefinition for {self.node_def.name}, but can_subset=False.",\n )\n\n # Set of assets within selected_asset_keys which are outputted by this AssetDefinition\n asset_subselection = selected_asset_keys & self.keys\n if selected_asset_check_keys is 
None:\n # filter to checks that target selected asset keys\n asset_check_subselection = {\n key for key in self.check_keys if key.asset_key in asset_subselection\n }\n else:\n asset_check_subselection = selected_asset_check_keys & self.check_keys\n\n # Early escape if all assets in AssetsDefinition are selected\n if asset_subselection == self.keys and asset_check_subselection == self.check_keys:\n return self\n elif isinstance(self.node_def, GraphDefinition): # Node is graph-backed asset\n check.invariant(\n selected_asset_check_keys == self.check_keys,\n "Subsetting graph-backed assets with checks is not yet supported",\n )\n\n subsetted_node = self._subset_graph_backed_asset(\n asset_subselection,\n )\n\n # The subsetted node should only include asset inputs that are dependencies of the\n # selected set of assets.\n subsetted_input_names = [input_def.name for input_def in subsetted_node.input_defs]\n subsetted_keys_by_input_name = {\n key: value\n for key, value in self.node_keys_by_input_name.items()\n if key in subsetted_input_names\n }\n\n subsetted_output_names = [output_def.name for output_def in subsetted_node.output_defs]\n subsetted_keys_by_output_name = {\n key: value\n for key, value in self.node_keys_by_output_name.items()\n if key in subsetted_output_names\n }\n\n # An op within the graph-backed asset that yields multiple assets will be run\n # any time any of its output assets are selected. Thus, if an op yields multiple assets\n # and only one of them is selected, the op will still run and potentially unexpectedly\n # materialize the unselected asset.\n #\n # Thus, we include unselected assets that may be accidentally materialized in\n # keys_by_output_name and asset_deps so that the webserver can populate an warning when\n # this occurs. 
This is the same behavior as multi-asset subsetting.\n\n subsetted_asset_deps = {\n out_asset_key: set(self._keys_by_input_name.values())\n for out_asset_key in subsetted_keys_by_output_name.values()\n }\n\n replaced_attributes = dict(\n keys_by_input_name=subsetted_keys_by_input_name,\n keys_by_output_name=subsetted_keys_by_output_name,\n node_def=subsetted_node,\n asset_deps=subsetted_asset_deps,\n selected_asset_keys=selected_asset_keys & self.keys,\n )\n\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n else:\n # multi_asset subsetting\n replaced_attributes = {\n "selected_asset_keys": asset_subselection,\n "selected_asset_check_keys": asset_check_subselection,\n }\n return self.__class__(**merge_dicts(self.get_attributes_dict(), replaced_attributes))\n\n
[docs] @public\n def to_source_assets(self) -> Sequence[SourceAsset]:\n """Returns a SourceAsset for each asset in this definition.\n\n Each produced SourceAsset will have the same key, metadata, io_manager_key, etc. as the\n corresponding asset\n """\n return [\n self._output_to_source_asset(output_name)\n for output_name in self.keys_by_output_name.keys()\n ]
\n\n
[docs] @public\n def to_source_asset(self, key: Optional[CoercibleToAssetKey] = None) -> SourceAsset:\n """Returns a representation of this asset as a :py:class:`SourceAsset`.\n\n If this is a multi-asset, the "key" argument allows selecting which asset to return a\n SourceAsset representation of.\n\n Args:\n key (Optional[Union[str, Sequence[str], AssetKey]]]): If this is a multi-asset, select\n which asset to return a SourceAsset representation of. If not a multi-asset, this\n can be left as None.\n\n Returns:\n SourceAsset\n """\n if len(self.keys) > 1:\n check.invariant(\n key is not None,\n "The 'key' argument is required when there are multiple assets to choose from",\n )\n\n if key is not None:\n resolved_key = AssetKey.from_coercible(key)\n check.invariant(\n resolved_key in self.keys, f"Key {resolved_key} not found in AssetsDefinition"\n )\n else:\n resolved_key = self.key\n\n output_names = [\n output_name\n for output_name, ak in self.keys_by_output_name.items()\n if ak == resolved_key\n ]\n check.invariant(len(output_names) == 1)\n return self._output_to_source_asset(output_names[0])
\n\n def _output_to_source_asset(self, output_name: str) -> SourceAsset:\n with disable_dagster_warnings():\n output_def = self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0]\n key = self._keys_by_output_name[output_name]\n\n return SourceAsset(\n key=key,\n metadata=output_def.metadata,\n io_manager_key=output_def.io_manager_key,\n description=output_def.description,\n resource_defs=self.resource_defs,\n partitions_def=self.partitions_def,\n group_name=self.group_names_by_key[key],\n )\n\n def get_io_manager_key_for_asset_key(self, key: AssetKey) -> str:\n output_name = self.get_output_name_for_asset_key(key)\n return self.node_def.resolve_output_to_origin(\n output_name, NodeHandle(self.node_def.name, parent=None)\n )[0].io_manager_key\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n yield from self.node_def.get_resource_requirements() # type: ignore[attr-defined]\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this AssetsDefinition."""\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n def __str__(self):\n if len(self.keys) == 1:\n return f"AssetsDefinition with key {self.key.to_string()}"\n else:\n asset_keys = ", ".join(sorted(([asset_key.to_string() for asset_key in self.keys])))\n return f"AssetsDefinition with keys {asset_keys}"\n\n @property\n def unique_id(self) -> str:\n """A unique identifier for the AssetsDefinition that's stable across processes."""\n return hashlib.md5((json.dumps(sorted(self.keys))).encode("utf-8")).hexdigest()\n\n def with_resources(self, resource_defs: Mapping[str, ResourceDefinition]) -> "AssetsDefinition":\n attributes_dict = self.get_attributes_dict()\n attributes_dict["resource_defs"] = 
merge_resource_defs(\n old_resource_defs=self.resource_defs,\n resource_defs_to_merge_in=resource_defs,\n requires_resources=self,\n )\n return self.__class__(**attributes_dict)\n\n def get_attributes_dict(self) -> Dict[str, Any]:\n return dict(\n keys_by_input_name=self._keys_by_input_name,\n keys_by_output_name=self._keys_by_output_name,\n node_def=self._node_def,\n partitions_def=self._partitions_def,\n partition_mappings=self._partition_mappings,\n asset_deps=self.asset_deps,\n selected_asset_keys=self._selected_asset_keys,\n can_subset=self._can_subset,\n resource_defs=self._resource_defs,\n group_names_by_key=self._group_names_by_key,\n metadata_by_key=self._metadata_by_key,\n freshness_policies_by_key=self._freshness_policies_by_key,\n auto_materialize_policies_by_key=self._auto_materialize_policies_by_key,\n backfill_policy=self._backfill_policy,\n descriptions_by_key=self._descriptions_by_key,\n check_specs_by_output_name=self._check_specs_by_output_name,\n selected_asset_check_keys=self._selected_asset_check_keys,\n )
\n\n\ndef _infer_keys_by_input_names(\n node_def: Union["GraphDefinition", OpDefinition], keys_by_input_name: Mapping[str, AssetKey]\n) -> Mapping[str, AssetKey]:\n all_input_names = [input_def.name for input_def in node_def.input_defs]\n if keys_by_input_name:\n check.invariant(\n set(keys_by_input_name.keys()) == set(all_input_names),\n "The set of input names keys specified in the keys_by_input_name argument must "\n f"equal the set of asset keys inputted by '{node_def.name}'. \\n"\n f"keys_by_input_name keys: {set(keys_by_input_name.keys())} \\n"\n f"expected keys: {all_input_names}",\n )\n\n # If asset key is not supplied in keys_by_input_name, create asset key\n # from input name\n inferred_input_names_by_asset_key: Dict[str, AssetKey] = {\n input_name: keys_by_input_name.get(input_name, AssetKey([input_name]))\n for input_name in all_input_names\n }\n\n return inferred_input_names_by_asset_key\n\n\ndef _infer_keys_by_output_names(\n node_def: Union["GraphDefinition", OpDefinition],\n keys_by_output_name: Mapping[str, AssetKey],\n check_specs_by_output_name: Mapping[str, AssetCheckSpec],\n) -> Mapping[str, AssetKey]:\n output_names = [output_def.name for output_def in node_def.output_defs]\n if keys_by_output_name:\n overlapping_asset_and_check_outputs = set(keys_by_output_name.keys()) & set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n not overlapping_asset_and_check_outputs,\n "The set of output names associated with asset keys and checks overlap:"\n f" {overlapping_asset_and_check_outputs}",\n )\n\n union_asset_and_check_outputs = set(keys_by_output_name.keys()) | set(\n check_specs_by_output_name.keys()\n )\n check.invariant(\n union_asset_and_check_outputs == set(output_names),\n "The union of the set of output names keys specified in the keys_by_output_name and"\n " check_specs_by_output_name arguments must equal the set of asset keys outputted by"\n f" {node_def.name}. 
union keys:"\n f" {union_asset_and_check_outputs} \\nexpected keys: {set(output_names)}",\n )\n\n inferred_keys_by_output_names: Dict[str, AssetKey] = {\n output_name: asset_key for output_name, asset_key in keys_by_output_name.items()\n }\n\n if (\n len(output_names) == 1\n and output_names[0] not in keys_by_output_name\n and output_names[0] not in check_specs_by_output_name\n and output_names[0] == "result"\n ):\n # If there is only one output and the name is the default "result", generate asset key\n # from the name of the node\n inferred_keys_by_output_names[output_names[0]] = AssetKey([node_def.name])\n\n for output_name in output_names:\n if (\n output_name not in inferred_keys_by_output_names\n and output_name not in check_specs_by_output_name\n ):\n inferred_keys_by_output_names[output_name] = AssetKey([output_name])\n return inferred_keys_by_output_names\n\n\ndef _validate_graph_def(graph_def: "GraphDefinition", prefix: Optional[Sequence[str]] = None):\n """Ensure that all leaf nodes are mapped to graph outputs."""\n from dagster._core.definitions.graph_definition import GraphDefinition, create_adjacency_lists\n\n prefix = check.opt_sequence_param(prefix, "prefix")\n\n # recursively validate any sub-graphs\n for inner_node_def in graph_def.node_defs:\n if isinstance(inner_node_def, GraphDefinition):\n _validate_graph_def(inner_node_def, prefix=[*prefix, graph_def.name])\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph_def.nodes, graph_def.dependency_structure)\n leaf_nodes = {\n node_name for node_name, downstream_nodes in forward_edges.items() if not downstream_nodes\n }\n\n # set of nodes that have outputs mapped to a graph output\n mapped_output_nodes = {\n output_mapping.maps_from.node_name for output_mapping in graph_def.output_mappings\n }\n\n # leaf nodes which do not have an associated mapped output\n unmapped_leaf_nodes = {".".join([*prefix, node]) for node in leaf_nodes - mapped_output_nodes}\n\n 
check.invariant(\n not unmapped_leaf_nodes,\n f"All leaf nodes within graph '{graph_def.name}' must generate outputs which are mapped"\n " to outputs of the graph, and produce assets. The following leaf node(s) are"\n f" non-asset producing ops: {unmapped_leaf_nodes}. This behavior is not currently"\n " supported because these ops are not required for the creation of the associated"\n " asset(s).",\n )\n\n\ndef _validate_self_deps(\n input_keys: Iterable[AssetKey],\n output_keys: Iterable[AssetKey],\n partition_mappings: Mapping[AssetKey, PartitionMapping],\n partitions_def: Optional[PartitionsDefinition],\n) -> None:\n output_keys_set = set(output_keys)\n for input_key in input_keys:\n if input_key in output_keys_set:\n if input_key in partition_mappings:\n partition_mapping = partition_mappings[input_key]\n time_window_partition_mapping = get_self_dep_time_window_partition_mapping(\n partition_mapping, partitions_def\n )\n if (\n time_window_partition_mapping is not None\n and (time_window_partition_mapping.start_offset or 0) < 0\n and (time_window_partition_mapping.end_offset or 0) < 0\n ):\n continue\n\n raise DagsterInvalidDefinitionError(\n f'Asset "{input_key.to_user_string()}" depends on itself. 
Assets can only depend'\n " on themselves if they are:\\n(a) time-partitioned and each partition depends on"\n " earlier partitions\\n(b) multipartitioned, with one time dimension that depends"\n " on earlier time partitions"\n )\n\n\ndef get_self_dep_time_window_partition_mapping(\n partition_mapping: Optional[PartitionMapping], partitions_def: Optional[PartitionsDefinition]\n) -> Optional[TimeWindowPartitionMapping]:\n """Returns a time window partition mapping dimension of the provided partition mapping,\n if exists.\n """\n if isinstance(partition_mapping, TimeWindowPartitionMapping):\n return partition_mapping\n elif isinstance(partition_mapping, MultiPartitionMapping):\n if not isinstance(partitions_def, MultiPartitionsDefinition):\n return None\n\n time_partition_mapping = partition_mapping.downstream_mappings_by_upstream_dimension.get(\n partitions_def.time_window_dimension.name\n )\n\n if time_partition_mapping is None or not isinstance(\n time_partition_mapping.partition_mapping, TimeWindowPartitionMapping\n ):\n return None\n\n return time_partition_mapping.partition_mapping\n return None\n
", "current_page_name": "_modules/dagster/_core/definitions/assets", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.assets"}, "auto_materialize_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_policy

\nfrom enum import Enum\nfrom typing import TYPE_CHECKING, AbstractSet, Dict, FrozenSet, NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.auto_materialize_rule import (\n        AutoMaterializeRule,\n        AutoMaterializeRuleSnapshot,\n    )\n\n\nclass AutoMaterializePolicySerializer(NamedTupleSerializer):\n    def before_unpack(\n        self, context: UnpackContext, unpacked_dict: Dict[str, UnpackedValue]\n    ) -> Dict[str, UnpackedValue]:\n        from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n        backcompat_map = {\n            "on_missing": AutoMaterializeRule.materialize_on_missing(),\n            "on_new_parent_data": AutoMaterializeRule.materialize_on_parent_updated(),\n            "for_freshness": AutoMaterializeRule.materialize_on_required_for_freshness(),\n        }\n\n        # determine if this namedtuple was serialized with the old format (booleans for rules)\n        if any(backcompat_key in unpacked_dict for backcompat_key in backcompat_map):\n            # all old policies had these rules by default\n            rules = {\n                AutoMaterializeRule.skip_on_parent_outdated(),\n                AutoMaterializeRule.skip_on_parent_missing(),\n            }\n            for backcompat_key, rule in backcompat_map.items():\n                if unpacked_dict.get(backcompat_key):\n                    rules.add(rule)\n            unpacked_dict["rules"] = frozenset(rules)\n\n        return unpacked_dict\n\n\nclass AutoMaterializePolicyType(Enum):\n    EAGER = "EAGER"\n    LAZY = "LAZY"\n\n\n
[docs]@experimental\n@whitelist_for_serdes(\n old_fields={"time_window_partition_scope_minutes": 1e-6},\n serializer=AutoMaterializePolicySerializer,\n)\nclass AutoMaterializePolicy(\n NamedTuple(\n "_AutoMaterializePolicy",\n [\n ("rules", FrozenSet["AutoMaterializeRule"]),\n ("max_materializations_per_minute", Optional[int]),\n ],\n )\n):\n """An AutoMaterializePolicy specifies how Dagster should attempt to keep an asset up-to-date.\n\n Each policy consists of a set of AutoMaterializeRules, which are used to determine whether an\n asset or a partition of an asset should or should not be auto-materialized.\n\n The most common policy is `AutoMaterializePolicy.eager()`, which consists of the following rules:\n\n - `AutoMaterializeRule.materialize_on_missing()`\n Materialize an asset or a partition if it has never been materialized.\n - `AutoMaterializeRule.materialize_on_parent_updated()`\n Materialize an asset or a partition if one of its parents have been updated more recently\n than it has.\n - `AutoMaterializeRule.materialize_on_required_for_freshness()`\n Materialize an asset or a partition if it is required to satisfy a freshness policy.\n - `AutoMaterializeRule.skip_on_parent_outdated()`\n Skip materializing an asset or partition if any of its parents have ancestors that have\n been materialized more recently.\n - `AutoMaterializeRule.skip_on_parent_missing()`\n Skip materializing an asset or a partition if any parent has never been materialized or\n observed.\n\n Policies can be customized by adding or removing rules. For example, if you'd like to allow\n an asset to be materialized even if some of its parent partitions are missing:\n\n .. code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().without_rules(\n AutoMaterializeRule.skip_on_parent_missing(),\n )\n\n If you'd like an asset to wait for all of its parents to be updated before materializing:\n\n .. 
code-block:: python\n\n from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\n my_policy = AutoMaterializePolicy.eager().with_rules(\n AutoMaterializeRule.skip_on_all_parents_not_updated(),\n )\n\n Lastly, the `max_materializations_per_minute` parameter, which is set to 1 by default,\n rate-limits the number of auto-materializations that can occur for a particular asset within\n a short time interval. This mainly matters for partitioned assets. Its purpose is to provide a\n safeguard against "surprise backfills", where user-error causes auto-materialize to be\n accidentally triggered for large numbers of partitions at once.\n\n **Warning:**\n\n Constructing an AutoMaterializePolicy directly is not recommended as the API is subject to change.\n AutoMaterializePolicy.eager() and AutoMaterializePolicy.lazy() are the recommended API.\n\n """\n\n def __new__(\n cls,\n rules: AbstractSet["AutoMaterializeRule"],\n max_materializations_per_minute: Optional[int] = 1,\n ):\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n check.invariant(\n max_materializations_per_minute is None or max_materializations_per_minute > 0,\n "max_materializations_per_minute must be positive. To disable rate-limiting, set it"\n " to None. 
To disable auto materializing, remove the policy.",\n )\n\n return super(AutoMaterializePolicy, cls).__new__(\n cls,\n rules=frozenset(check.set_param(rules, "rules", of_type=AutoMaterializeRule)),\n max_materializations_per_minute=max_materializations_per_minute,\n )\n\n @property\n def materialize_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule\n for rule in self.rules\n if rule.decision_type == AutoMaterializeDecisionType.MATERIALIZE\n }\n\n @property\n def skip_rules(self) -> AbstractSet["AutoMaterializeRule"]:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeDecisionType\n\n return {\n rule for rule in self.rules if rule.decision_type == AutoMaterializeDecisionType.SKIP\n }\n\n
[docs] @public\n @staticmethod\n def eager(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs an eager AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_missing(),\n AutoMaterializeRule.materialize_on_parent_updated(),\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n @staticmethod\n def lazy(max_materializations_per_minute: Optional[int] = 1) -> "AutoMaterializePolicy":\n """Constructs a lazy AutoMaterializePolicy.\n\n Args:\n max_materializations_per_minute (Optional[int]): The maximum number of\n auto-materializations for this asset that may be initiated per minute. If this limit\n is exceeded, the partitions which would have been materialized will be discarded,\n and will require manual materialization in order to be updated. Defaults to 1.\n """\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n return AutoMaterializePolicy(\n rules={\n AutoMaterializeRule.materialize_on_required_for_freshness(),\n AutoMaterializeRule.skip_on_parent_outdated(),\n AutoMaterializeRule.skip_on_parent_missing(),\n },\n max_materializations_per_minute=check.opt_int_param(\n max_materializations_per_minute, "max_materializations_per_minute"\n ),\n )
\n\n
[docs] @public\n def without_rules(self, *rules_to_remove: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules removed. Raises an error\n if any of the arguments are not rules in this policy.\n """\n non_matching_rules = set(rules_to_remove).difference(self.rules)\n check.param_invariant(\n not non_matching_rules,\n "rules_to_remove",\n f"Rules {[rule for rule in rules_to_remove if rule in non_matching_rules]} do not"\n " exist in this policy.",\n )\n return self._replace(\n rules=self.rules.difference(set(rules_to_remove)),\n )
\n\n
[docs] @public\n def with_rules(self, *rules_to_add: "AutoMaterializeRule") -> "AutoMaterializePolicy":\n """Constructs a copy of this policy with the specified rules added."""\n return self._replace(rules=self.rules.union(set(rules_to_add)))
\n\n @property\n def policy_type(self) -> AutoMaterializePolicyType:\n from dagster._core.definitions.auto_materialize_rule import AutoMaterializeRule\n\n if AutoMaterializeRule.materialize_on_parent_updated() in self.rules:\n return AutoMaterializePolicyType.EAGER\n return AutoMaterializePolicyType.LAZY\n\n @property\n def rule_snapshots(self) -> Sequence["AutoMaterializeRuleSnapshot"]:\n return [rule.to_snapshot() for rule in self.rules]
\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_policy"}, "auto_materialize_rule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.auto_materialize_rule

\nimport datetime\nfrom abc import ABC, abstractmethod, abstractproperty\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Dict,\n    FrozenSet,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey, AssetKeyPartitionKey\nfrom dagster._core.definitions.freshness_based_auto_materialize import (\n    freshness_evaluation_results_for_asset_key,\n)\nfrom dagster._core.definitions.partition_mapping import IdentityPartitionMapping\nfrom dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    UnpackContext,\n    UnpackedValue,\n    WhitelistMap,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.caching_instance_queryer import CachingInstanceQueryer\n\nfrom .asset_graph import AssetGraph, sort_key_for_asset_partition\nfrom .partition import SerializedPartitionsSubset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_daemon_context import AssetDaemonContext\n    from dagster._core.definitions.asset_daemon_cursor import AssetDaemonCursor\n    from dagster._core.instance import DynamicPartitionsStore\n\n\n@whitelist_for_serdes\nclass AutoMaterializeDecisionType(Enum):\n    """Represents the set of results of the auto-materialize logic.\n\n    MATERIALIZE: The asset should be materialized by a run kicked off on this tick\n    SKIP: The asset should not be materialized by a run kicked off on this tick, because future\n        ticks are expected to materialize it.\n    DISCARD: The asset should not be materialized by a run kicked off on this tick, but future\n        ticks are not expected to materialize it.\n    """\n\n    MATERIALIZE = 
"MATERIALIZE"\n    SKIP = "SKIP"\n    DISCARD = "DISCARD"\n\n\nclass AutoMaterializeRuleEvaluationData(ABC):\n    pass\n\n\n@whitelist_for_serdes\nclass TextRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple("_TextRuleEvaluationData", [("text", str)]),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass ParentUpdatedRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_ParentUpdatedRuleEvaluationData",\n        [\n            ("updated_asset_keys", FrozenSet[AssetKey]),\n            ("will_update_asset_keys", FrozenSet[AssetKey]),\n        ],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass WaitingOnAssetsRuleEvaluationData(\n    AutoMaterializeRuleEvaluationData,\n    NamedTuple(\n        "_WaitingOnParentRuleEvaluationData",\n        [("waiting_on_asset_keys", FrozenSet[AssetKey])],\n    ),\n):\n    pass\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleSnapshot(NamedTuple):\n    """A serializable snapshot of an AutoMaterializeRule for historical evaluations."""\n\n    class_name: str\n    description: str\n    decision_type: AutoMaterializeDecisionType\n\n    @staticmethod\n    def from_rule(rule: "AutoMaterializeRule") -> "AutoMaterializeRuleSnapshot":\n        return AutoMaterializeRuleSnapshot(\n            class_name=rule.__class__.__name__,\n            description=rule.description,\n            decision_type=rule.decision_type,\n        )\n\n\n@whitelist_for_serdes\nclass AutoMaterializeRuleEvaluation(NamedTuple):\n    rule_snapshot: AutoMaterializeRuleSnapshot\n    evaluation_data: Optional[AutoMaterializeRuleEvaluationData]\n\n\nclass RuleEvaluationContext(NamedTuple):\n    asset_key: AssetKey\n    cursor: "AssetDaemonCursor"\n    instance_queryer: CachingInstanceQueryer\n    data_time_resolver: CachingDataTimeResolver\n    will_materialize_mapping: Mapping[AssetKey, AbstractSet[AssetKeyPartitionKey]]\n    expected_data_time_mapping: Mapping[AssetKey, Optional[datetime.datetime]]\n    
candidates: AbstractSet[AssetKeyPartitionKey]\n    daemon_context: "AssetDaemonContext"\n\n    @property\n    def asset_graph(self) -> AssetGraph:\n        return self.instance_queryer.asset_graph\n\n    def materializable_in_same_run(self, child_key: AssetKey, parent_key: AssetKey) -> bool:\n        """Returns whether a child asset can be materialized in the same run as a parent asset."""\n        from dagster._core.definitions.external_asset_graph import ExternalAssetGraph\n\n        return (\n            # both assets must be materializable\n            child_key in self.asset_graph.materializable_asset_keys\n            and parent_key in self.asset_graph.materializable_asset_keys\n            # the parent must have the same partitioning\n            and self.asset_graph.have_same_partitioning(child_key, parent_key)\n            # the parent must have a simple partition mapping to the child\n            and (\n                not self.asset_graph.is_partitioned(parent_key)\n                or isinstance(\n                    self.asset_graph.get_partition_mapping(child_key, parent_key),\n                    (TimeWindowPartitionMapping, IdentityPartitionMapping),\n                )\n            )\n            # the parent must be in the same repository to be materialized alongside the candidate\n            and (\n                not isinstance(self.asset_graph, ExternalAssetGraph)\n                or self.asset_graph.get_repository_handle(child_key)\n                == self.asset_graph.get_repository_handle(parent_key)\n            )\n        )\n\n    def get_parents_that_will_not_be_materialized_on_current_tick(\n        self, *, asset_partition: AssetKeyPartitionKey\n    ) -> AbstractSet[AssetKeyPartitionKey]:\n        """Returns the set of parent asset partitions that will not be updated in the same run of\n        this asset partition if we launch a run of this asset partition on this tick.\n        """\n        return {\n            parent\n            for 
parent in self.asset_graph.get_parents_partitions(\n                dynamic_partitions_store=self.instance_queryer,\n                current_time=self.instance_queryer.evaluation_time,\n                asset_key=asset_partition.asset_key,\n                partition_key=asset_partition.partition_key,\n            ).parent_partitions\n            if parent not in self.will_materialize_mapping.get(parent.asset_key, set())\n            or not self.materializable_in_same_run(asset_partition.asset_key, parent.asset_key)\n        }\n\n    def get_asset_partitions_by_asset_key(\n        self,\n        asset_partitions: AbstractSet[AssetKeyPartitionKey],\n    ) -> Mapping[AssetKey, Set[AssetKeyPartitionKey]]:\n        asset_partitions_by_asset_key: Dict[AssetKey, Set[AssetKeyPartitionKey]] = defaultdict(set)\n        for parent in asset_partitions:\n            asset_partitions_by_asset_key[parent.asset_key].add(parent)\n\n        return asset_partitions_by_asset_key\n\n\nRuleEvaluationResults = Sequence[Tuple[Optional[AutoMaterializeRuleEvaluationData], AbstractSet]]\n\n\n
[docs]class AutoMaterializeRule(ABC):\n """An AutoMaterializeRule defines a bit of logic which helps determine if a materialization\n should be kicked off for a given asset partition.\n\n Each rule can have one of two decision types, `MATERIALIZE` (indicating that an asset partition\n should be materialized) or `SKIP` (indicating that the asset partition should not be\n materialized).\n\n Materialize rules are evaluated first, and skip rules operate over the set of candidates that\n are produced by the materialize rules. Other than that, there is no ordering between rules.\n """\n\n @abstractproperty\n def decision_type(self) -> AutoMaterializeDecisionType:\n """The decision type of the rule (either `MATERIALIZE` or `SKIP`)."""\n ...\n\n @abstractproperty\n def description(self) -> str:\n """A human-readable description of this rule. As a basic guideline, this string should\n complete the sentence: 'Indicates an asset should be (materialize/skipped) when ____'.\n """\n ...\n\n @abstractmethod\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """The core evaluation function for the rule. This function takes in a context object and\n returns a mapping from evaluated rules to the set of asset partitions that the rule applies\n to.\n """\n ...\n\n
[docs] @public\n @staticmethod\n def materialize_on_required_for_freshness() -> "MaterializeOnRequiredForFreshnessRule":\n """Materialize an asset partition if it is required to satisfy a freshness policy of this\n asset or one of its downstream assets.\n\n Note: This rule has no effect on partitioned assets.\n """\n return MaterializeOnRequiredForFreshnessRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_parent_updated() -> "MaterializeOnParentUpdatedRule":\n """Materialize an asset partition if one of its parents has been updated more recently\n than it has.\n\n Note: For time-partitioned or dynamic-partitioned assets downstream of an unpartitioned\n asset, this rule will only fire for the most recent partition of the downstream.\n """\n return MaterializeOnParentUpdatedRule()
\n\n
[docs] @public\n @staticmethod\n def materialize_on_missing() -> "MaterializeOnMissingRule":\n """Materialize an asset partition if it has never been materialized before. This rule will\n not fire for non-root assets unless that asset's parents have been updated.\n """\n return MaterializeOnMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_missing() -> "SkipOnParentMissingRule":\n """Skip materializing an asset partition if one of its parent asset partitions has never\n been materialized (for regular assets) or observed (for observable source assets).\n """\n return SkipOnParentMissingRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_parent_outdated() -> "SkipOnParentOutdatedRule":\n """Skip materializing an asset partition if any of its parents has not incorporated the\n latest data from its ancestors.\n """\n return SkipOnParentOutdatedRule()
\n\n
[docs] @public\n @staticmethod\n def skip_on_not_all_parents_updated(\n require_update_for_all_parent_partitions: bool = False,\n ) -> "SkipOnNotAllParentsUpdatedRule":\n """Skip materializing an asset partition if any of its parents have not been updated since\n the asset's last materialization.\n\n Attributes:\n require_update_for_all_parent_partitions (Optional[bool]): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n return SkipOnNotAllParentsUpdatedRule(require_update_for_all_parent_partitions)
\n\n def to_snapshot(self) -> AutoMaterializeRuleSnapshot:\n """Returns a serializable snapshot of this rule for historical evaluations."""\n return AutoMaterializeRuleSnapshot.from_rule(self)\n\n def __eq__(self, other) -> bool:\n # override the default NamedTuple __eq__ method to factor in types\n return type(self) == type(other) and super().__eq__(other)\n\n def __hash__(self) -> int:\n # override the default NamedTuple __hash__ method to factor in types\n return hash(hash(type(self)) + super().__hash__())
\n\n\n@whitelist_for_serdes\nclass MaterializeOnRequiredForFreshnessRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnRequiredForFreshnessRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "required to meet this or downstream asset's freshness policy"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n freshness_conditions = freshness_evaluation_results_for_asset_key(\n asset_key=context.asset_key,\n data_time_resolver=context.data_time_resolver,\n asset_graph=context.asset_graph,\n current_time=context.instance_queryer.evaluation_time,\n will_materialize_mapping=context.will_materialize_mapping,\n expected_data_time_mapping=context.expected_data_time_mapping,\n )\n return freshness_conditions\n\n\n@whitelist_for_serdes\nclass MaterializeOnParentUpdatedRule(\n AutoMaterializeRule, NamedTuple("_MaterializeOnParentUpdatedRule", [])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "upstream data has changed since latest materialization"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions of this asset whose parents have been updated,\n or will update on this tick.\n """\n conditions = defaultdict(set)\n has_parents_that_will_update = set()\n\n # first, get the set of parents that will be materialized this tick, and see if we\n # can materialize this asset with those parents\n will_update_parents_by_asset_partition = defaultdict(set)\n for parent_key in context.asset_graph.get_parents(context.asset_key):\n if not context.materializable_in_same_run(context.asset_key, parent_key):\n continue\n for parent_partition in context.will_materialize_mapping.get(parent_key, set()):\n 
asset_partition = AssetKeyPartitionKey(\n context.asset_key, parent_partition.partition_key\n )\n will_update_parents_by_asset_partition[asset_partition].add(parent_key)\n has_parents_that_will_update.add(asset_partition)\n\n # next, for each asset partition of this asset which has newly-updated parents, or\n # has a parent that will update, create a ParentUpdatedRuleEvaluationData\n has_or_will_update = (\n context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n )\n | has_parents_that_will_update\n )\n for asset_partition in has_or_will_update:\n parent_asset_partitions = context.asset_graph.get_parents_partitions(\n dynamic_partitions_store=context.instance_queryer,\n current_time=context.instance_queryer.evaluation_time,\n asset_key=asset_partition.asset_key,\n partition_key=asset_partition.partition_key,\n ).parent_partitions\n\n updated_parent_asset_partitions = context.instance_queryer.get_updated_parent_asset_partitions(\n asset_partition,\n parent_asset_partitions,\n # do a precise check for updated parents, factoring in data versions, as long as\n # we're within reasonable limits on the number of partitions to check\n respect_materialization_data_versions=context.daemon_context.respect_materialization_data_versions\n and len(parent_asset_partitions | has_or_will_update) < 100,\n # ignore self-dependencies when checking for updated parents, to avoid historical\n # rematerializations from causing a chain of materializations to be kicked off\n ignored_parent_keys={context.asset_key},\n )\n updated_parents = {parent.asset_key for parent in updated_parent_asset_partitions}\n will_update_parents = will_update_parents_by_asset_partition[asset_partition]\n\n if updated_parents or will_update_parents:\n conditions[\n ParentUpdatedRuleEvaluationData(\n updated_asset_keys=frozenset(updated_parents),\n will_update_asset_keys=frozenset(will_update_parents),\n )\n ].add(asset_partition)\n if conditions:\n return [(k, v) for 
k, v in conditions.items()]\n return []\n\n\n@whitelist_for_serdes\nclass MaterializeOnMissingRule(AutoMaterializeRule, NamedTuple("_MaterializeOnMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.MATERIALIZE\n\n @property\n def description(self) -> str:\n return "materialization is missing"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n """Evaluates the set of asset partitions for this asset which are missing and were not\n previously discarded. Currently only applies to root asset partitions and asset partitions\n with updated parents.\n """\n missing_asset_partitions = (\n context.daemon_context.get_never_handled_root_asset_partitions_for_key(\n context.asset_key\n )\n )\n # in addition to missing root asset partitions, check any asset partitions with updated\n # parents to see if they're missing\n for (\n candidate\n ) in context.daemon_context.get_asset_partitions_with_newly_updated_parents_for_key(\n context.asset_key\n ):\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n candidate\n ):\n missing_asset_partitions |= {candidate}\n if missing_asset_partitions:\n return [(None, missing_asset_partitions)]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentOutdatedRule(AutoMaterializeRule, NamedTuple("_SkipOnParentOutdatedRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be up to date"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n unreconciled_ancestors = set()\n # find the root cause of why this asset partition's parents are outdated (if any)\n for parent in 
context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n unreconciled_ancestors.update(\n context.instance_queryer.get_root_unreconciled_ancestors(\n asset_partition=parent,\n )\n )\n if unreconciled_ancestors:\n asset_partitions_by_waiting_on_asset_keys[frozenset(unreconciled_ancestors)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass SkipOnParentMissingRule(AutoMaterializeRule, NamedTuple("_SkipOnParentMissingRule", [])):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n return "waiting on upstream data to be present"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n missing_parent_asset_keys = set()\n for parent in context.get_parents_that_will_not_be_materialized_on_current_tick(\n asset_partition=candidate\n ):\n # ignore non-observable sources, which will never have a materialization or observation\n if context.asset_graph.is_source(\n parent.asset_key\n ) and not context.asset_graph.is_observable(parent.asset_key):\n continue\n if not context.instance_queryer.asset_partition_has_materialization_or_observation(\n parent\n ):\n missing_parent_asset_keys.add(parent.asset_key)\n if missing_parent_asset_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(missing_parent_asset_keys)].add(\n candidate\n )\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass 
SkipOnNotAllParentsUpdatedRule(\n AutoMaterializeRule,\n NamedTuple(\n "_SkipOnNotAllParentsUpdatedRule", [("require_update_for_all_parent_partitions", bool)]\n ),\n):\n """An auto-materialize rule that enforces that an asset can only be materialized if all parents\n have been materialized since the asset's last materialization.\n\n Attributes:\n require_update_for_all_parent_partitions (Optional[bool]): Applies only to an unpartitioned\n asset or an asset partition that depends on more than one partition in any upstream asset.\n If true, requires all upstream partitions in each upstream asset to be materialized since\n the downstream asset's last materialization in order to update it. If false, requires at\n least one upstream partition in each upstream asset to be materialized since the downstream\n asset's last materialization in order to update it. Defaults to false.\n """\n\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.SKIP\n\n @property\n def description(self) -> str:\n if self.require_update_for_all_parent_partitions is False:\n return "waiting on upstream data to be updated"\n else:\n return "waiting until all upstream partitions are updated"\n\n def evaluate_for_asset(\n self,\n context: RuleEvaluationContext,\n ) -> RuleEvaluationResults:\n asset_partitions_by_waiting_on_asset_keys = defaultdict(set)\n for candidate in context.candidates:\n parent_partitions = context.asset_graph.get_parents_partitions(\n context.instance_queryer,\n context.instance_queryer.evaluation_time,\n context.asset_key,\n candidate.partition_key,\n ).parent_partitions\n\n updated_parent_partitions = (\n context.instance_queryer.get_updated_parent_asset_partitions(\n candidate,\n parent_partitions,\n context.daemon_context.respect_materialization_data_versions,\n ignored_parent_keys=set(),\n )\n | set().union(\n *[\n context.will_materialize_mapping.get(parent, set())\n for parent in 
context.asset_graph.get_parents(context.asset_key)\n ]\n )\n )\n\n if self.require_update_for_all_parent_partitions:\n # All upstream partitions must be updated in order for the candidate to be updated\n non_updated_parent_keys = {\n parent.asset_key for parent in parent_partitions - updated_parent_partitions\n }\n else:\n # At least one upstream partition in each upstream asset must be updated in order\n # for the candidate to be updated\n parent_asset_keys = context.asset_graph.get_parents(context.asset_key)\n updated_parent_partitions_by_asset_key = context.get_asset_partitions_by_asset_key(\n updated_parent_partitions\n )\n non_updated_parent_keys = {\n parent\n for parent in parent_asset_keys\n if not updated_parent_partitions_by_asset_key.get(parent)\n }\n\n # do not require past partitions of this asset to be updated\n non_updated_parent_keys -= {context.asset_key}\n\n if non_updated_parent_keys:\n asset_partitions_by_waiting_on_asset_keys[frozenset(non_updated_parent_keys)].add(\n candidate\n )\n\n if asset_partitions_by_waiting_on_asset_keys:\n return [\n (WaitingOnAssetsRuleEvaluationData(waiting_on_asset_keys=k), v)\n for k, v in asset_partitions_by_waiting_on_asset_keys.items()\n ]\n return []\n\n\n@whitelist_for_serdes\nclass DiscardOnMaxMaterializationsExceededRule(\n AutoMaterializeRule, NamedTuple("_DiscardOnMaxMaterializationsExceededRule", [("limit", int)])\n):\n @property\n def decision_type(self) -> AutoMaterializeDecisionType:\n return AutoMaterializeDecisionType.DISCARD\n\n @property\n def description(self) -> str:\n return f"exceeds {self.limit} materialization(s) per minute"\n\n def evaluate_for_asset(self, context: RuleEvaluationContext) -> RuleEvaluationResults:\n # the set of asset partitions which exceed the limit\n rate_limited_asset_partitions = set(\n sorted(\n context.candidates,\n key=lambda x: sort_key_for_asset_partition(context.asset_graph, x),\n )[self.limit :]\n )\n if rate_limited_asset_partitions:\n return [(None, 
rate_limited_asset_partitions)]\n return []\n\n\n@whitelist_for_serdes\nclass AutoMaterializeAssetEvaluation(NamedTuple):\n """Represents the results of the auto-materialize logic for a single asset.\n\n Properties:\n asset_key (AssetKey): The asset key that was evaluated.\n partition_subsets_by_condition: The rule evaluations that impact if the asset should be\n materialized, skipped, or discarded. If the asset is partitioned, this will be a list of\n tuples, where the first element is the condition and the second element is the\n serialized subset of partitions that the condition applies to. If it's not partitioned,\n the second element will be None.\n """\n\n asset_key: AssetKey\n partition_subsets_by_condition: Sequence[\n Tuple["AutoMaterializeRuleEvaluation", Optional[SerializedPartitionsSubset]]\n ]\n num_requested: int\n num_skipped: int\n num_discarded: int\n run_ids: Set[str] = set()\n rule_snapshots: Optional[Sequence[AutoMaterializeRuleSnapshot]] = None\n\n @staticmethod\n def from_rule_evaluation_results(\n asset_graph: AssetGraph,\n asset_key: AssetKey,\n asset_partitions_by_rule_evaluation: Sequence[\n Tuple[AutoMaterializeRuleEvaluation, AbstractSet[AssetKeyPartitionKey]]\n ],\n num_requested: int,\n num_skipped: int,\n num_discarded: int,\n dynamic_partitions_store: "DynamicPartitionsStore",\n ) -> "AutoMaterializeAssetEvaluation":\n auto_materialize_policy = asset_graph.auto_materialize_policies_by_key.get(asset_key)\n\n if not auto_materialize_policy:\n check.failed(f"Expected auto materialize policy on asset {asset_key}")\n\n partitions_def = asset_graph.get_partitions_def(asset_key)\n if partitions_def is None:\n return AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (rule_evaluation, None)\n for rule_evaluation, _ in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n 
rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n else:\n return AutoMaterializeAssetEvaluation(\n asset_key=asset_key,\n partition_subsets_by_condition=[\n (\n rule_evaluation,\n SerializedPartitionsSubset.from_subset(\n subset=partitions_def.empty_subset().with_partition_keys(\n check.not_none(ap.partition_key) for ap in asset_partitions\n ),\n partitions_def=partitions_def,\n dynamic_partitions_store=dynamic_partitions_store,\n ),\n )\n for rule_evaluation, asset_partitions in asset_partitions_by_rule_evaluation\n ],\n num_requested=num_requested,\n num_skipped=num_skipped,\n num_discarded=num_discarded,\n rule_snapshots=auto_materialize_policy.rule_snapshots,\n )\n\n\n# BACKCOMPAT GRAVEYARD\n\n\nclass BackcompatAutoMaterializeConditionSerializer(NamedTupleSerializer):\n """This handles backcompat for the old AutoMaterializeCondition objects, turning them into the\n proper AutoMaterializeRuleEvaluation objects. This is necessary because old\n AutoMaterializeAssetEvaluation objects will have serialized AutoMaterializeCondition objects,\n and we need to be able to deserialize them.\n\n In theory, as these serialized objects happen to be purged periodically, we can remove this\n backcompat logic at some point in the future.\n """\n\n def unpack(\n self,\n unpacked_dict: Dict[str, UnpackedValue],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> AutoMaterializeRuleEvaluation:\n if self.klass in (\n FreshnessAutoMaterializeCondition,\n DownstreamFreshnessAutoMaterializeCondition,\n ):\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_required_for_freshness().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == MissingAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_missing().to_snapshot(),\n evaluation_data=None,\n )\n elif self.klass == ParentMaterializedAutoMaterializeCondition:\n updated_asset_keys = 
unpacked_dict.get("updated_asset_keys")\n if isinstance(updated_asset_keys, set):\n updated_asset_keys = cast(FrozenSet[AssetKey], frozenset(updated_asset_keys))\n else:\n updated_asset_keys = frozenset()\n will_update_asset_keys = unpacked_dict.get("will_update_asset_keys")\n if isinstance(will_update_asset_keys, set):\n will_update_asset_keys = cast(\n FrozenSet[AssetKey], frozenset(will_update_asset_keys)\n )\n else:\n will_update_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.materialize_on_parent_updated().to_snapshot(),\n evaluation_data=ParentUpdatedRuleEvaluationData(\n updated_asset_keys=updated_asset_keys,\n will_update_asset_keys=will_update_asset_keys,\n ),\n )\n elif self.klass == ParentOutdatedAutoMaterializeCondition:\n waiting_on_asset_keys = unpacked_dict.get("waiting_on_asset_keys")\n if isinstance(waiting_on_asset_keys, set):\n waiting_on_asset_keys = cast(FrozenSet[AssetKey], frozenset(waiting_on_asset_keys))\n else:\n waiting_on_asset_keys = frozenset()\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=AutoMaterializeRule.skip_on_parent_outdated().to_snapshot(),\n evaluation_data=WaitingOnAssetsRuleEvaluationData(\n waiting_on_asset_keys=waiting_on_asset_keys\n ),\n )\n elif self.klass == MaxMaterializationsExceededAutoMaterializeCondition:\n return AutoMaterializeRuleEvaluation(\n rule_snapshot=DiscardOnMaxMaterializationsExceededRule(limit=1).to_snapshot(),\n evaluation_data=None,\n )\n check.failed(f"Unexpected class {self.klass}")\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass FreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass DownstreamFreshnessAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentMaterializedAutoMaterializeCondition(NamedTuple): 
...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass MissingAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass ParentOutdatedAutoMaterializeCondition(NamedTuple): ...\n\n\n@whitelist_for_serdes(serializer=BackcompatAutoMaterializeConditionSerializer)\nclass MaxMaterializationsExceededAutoMaterializeCondition(NamedTuple): ...\n
", "current_page_name": "_modules/dagster/_core/definitions/auto_materialize_rule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.auto_materialize_rule"}, "backfill_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.backfill_policy

\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass BackfillPolicyType(Enum):\n    SINGLE_RUN = "SINGLE_RUN"\n    MULTI_RUN = "MULTI_RUN"\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass BackfillPolicy(\n NamedTuple(\n "_BackfillPolicy",\n [\n ("max_partitions_per_run", Optional[int]),\n ],\n )\n):\n """A BackfillPolicy specifies how Dagster should attempt to backfill a partitioned asset.\n\n There are two main kinds of backfill policies: single-run and multi-run.\n\n An asset with a single-run backfill policy will take a single run to backfill all of its\n partitions at once.\n\n An asset with a multi-run backfill policy will take multiple runs to backfill all of its\n partitions. Each run will backfill a subset of the partitions. The number of partitions to\n backfill in each run is controlled by the `max_partitions_per_run` parameter.\n\n For example:\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 10, then it will\n be backfilled in 10 runs; each run will backfill 10 partitions.\n\n - If an asset has 100 partitions, and the `max_partitions_per_run` is set to 11, then it will\n be backfilled in 10 runs; the first 9 runs will backfill 11 partitions each, and the last run\n will backfill the remaining 1 partition.\n\n **Warning:**\n\n Constructing a BackfillPolicy directly is not recommended as the API is subject to change.\n BackfillPolicy.single_run() and BackfillPolicy.multi_run(max_partitions_per_run=x) are the\n recommended APIs.\n """\n\n def __new__(cls, max_partitions_per_run: Optional[int] = 1):\n return super(BackfillPolicy, cls).__new__(\n cls,\n max_partitions_per_run=max_partitions_per_run,\n )\n\n
[docs] @public\n @staticmethod\n def single_run() -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in a single run."""\n return BackfillPolicy(max_partitions_per_run=None)
\n\n
[docs] @public\n @staticmethod\n def multi_run(max_partitions_per_run: int = 1) -> "BackfillPolicy":\n """Creates a BackfillPolicy that executes the entire backfill in multiple runs.\n Each run will backfill [max_partitions_per_run] number of partitions.\n\n Args:\n max_partitions_per_run (Optional[int]): The maximum number of partitions in each run of\n the multiple runs. Defaults to 1.\n """\n return BackfillPolicy(\n max_partitions_per_run=check.int_param(max_partitions_per_run, "max_partitions_per_run")\n )
\n\n @property\n def policy_type(self) -> BackfillPolicyType:\n if self.max_partitions_per_run:\n return BackfillPolicyType.MULTI_RUN\n else:\n return BackfillPolicyType.SINGLE_RUN
\n
", "current_page_name": "_modules/dagster/_core/definitions/backfill_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.backfill_policy"}, "config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.config

\nfrom typing import Any, Callable, Mapping, NamedTuple, Optional, Union, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    ConfigType,\n    is_supported_config_python_builtin,\n    process_config,\n    resolve_defaults,\n    validate_config,\n)\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.errors import DagsterInvalidConfigError\n\nfrom .definition_config_schema import convert_user_facing_definition_config_schema\n\nConfigMappingFn: TypeAlias = Callable[[Any], Any]\n\n\ndef is_callable_valid_config_arg(config: Union[Callable[..., Any], Mapping[str, object]]) -> bool:\n    return BuiltinEnum.contains(config) or is_supported_config_python_builtin(config)\n\n\n
[docs]class ConfigMapping(\n NamedTuple(\n "_ConfigMapping",\n [\n ("config_fn", Callable[[Any], Any]),\n ("config_schema", IDefinitionConfigSchema),\n ("receive_processed_config_values", Optional[bool]),\n ],\n )\n):\n """Defines a config mapping for a graph (or job).\n\n By specifying a config mapping function, you can override the configuration for the child\n ops and graphs contained within a graph.\n\n Config mappings require the configuration schema to be specified as ``config_schema``, which will\n be exposed as the configuration schema for the graph, as well as a configuration mapping\n function, ``config_fn``, which maps the config provided to the graph to the config\n that will be provided to the child nodes.\n\n Args:\n config_fn (Callable[[dict], dict]): The function that will be called\n to map the graph config to a config appropriate for the child nodes.\n config_schema (ConfigSchema): The schema of the graph config.\n receive_processed_config_values (Optional[bool]): If true, config values provided to the config_fn\n will be converted to their dagster types before being passed in. 
For example, if this\n value is true, enum config passed to config_fn will be actual enums, while if false,\n then enum config passed to config_fn will be strings.\n """\n\n def __new__(\n cls,\n config_fn: ConfigMappingFn,\n config_schema: Optional[Any] = None,\n receive_processed_config_values: Optional[bool] = None,\n ):\n return super(ConfigMapping, cls).__new__(\n cls,\n config_fn=check.callable_param(config_fn, "config_fn"),\n config_schema=convert_user_facing_definition_config_schema(config_schema),\n receive_processed_config_values=check.opt_bool_param(\n receive_processed_config_values, "receive_processed_config_values"\n ),\n )\n\n def resolve_from_unvalidated_config(self, config: Any) -> Any:\n """Validates config against outer config schema, and calls mapping against validated config."""\n receive_processed_config_values = check.opt_bool_param(\n self.receive_processed_config_values, "receive_processed_config_values", default=True\n )\n if receive_processed_config_values:\n outer_evr = process_config(\n self.config_schema.config_type,\n config,\n )\n else:\n outer_evr = validate_config(\n self.config_schema.config_type,\n config,\n )\n if not outer_evr.success:\n raise DagsterInvalidConfigError(\n "Error in config mapping ",\n outer_evr.errors,\n config,\n )\n\n outer_config = outer_evr.value\n if not receive_processed_config_values:\n outer_config = resolve_defaults(\n cast(ConfigType, self.config_schema.config_type),\n outer_config,\n ).value\n\n return self.config_fn(outer_config)\n\n def resolve_from_validated_config(self, config: Any) -> Any:\n if self.receive_processed_config_values is not None:\n check.failed(\n "`receive_processed_config_values` parameter has been set, but only applies to "\n "unvalidated config."\n )\n\n return self.config_fn(config)
\n
", "current_page_name": "_modules/dagster/_core/definitions/config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.config"}, "configurable": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.configurable

\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Callable, NamedTuple, Optional, Type, TypeVar, Union, cast\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    Field,\n    _check as check,\n)\nfrom dagster._config import EvaluateValueResult\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params\n\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    ConfiguredDefinitionConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\n\nclass ConfigurableDefinition(ABC):\n    @property\n    @abstractmethod\n    def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n        raise NotImplementedError()\n\n    @property\n    def has_config_field(self) -> bool:\n        return self.config_schema is not None and bool(self.config_schema.as_field())\n\n    @property\n    def config_field(self) -> Optional[Field]:\n        return None if not self.config_schema else self.config_schema.as_field()\n\n    # getter for typed access\n    def get_config_field(self) -> Field:\n        field = self.config_field\n        if field is None:\n            check.failed("Must check has_config_Field before calling get_config_field")\n        return field\n\n    def apply_config_mapping(self, config: Any) -> EvaluateValueResult:\n        """Applies user-provided config mapping functions to the given configuration and validates the\n        results against the respective config schema.\n\n        Expects incoming config to be validated and have fully-resolved values (StringSource values\n        resolved, Enum types hydrated, etc.) 
via process_config() during ResolvedRunConfig\n        construction and Graph config mapping.\n\n        Args:\n            config (Any): A validated and resolved configuration dictionary matching this object's\n            config_schema\n\n        Returns (EvaluateValueResult):\n            If successful, the value is a validated and resolved configuration dictionary for the\n            innermost wrapped object after applying the config mapping transformation function.\n        """\n        # If schema is on a mapped schema this is the innermost resource (base case),\n        # so we aren't responsible for validating against anything farther down.\n        # Returns an EVR for type consistency with config_mapping_fn.\n        return (\n            self.config_schema.resolve_config(config)\n            if isinstance(self.config_schema, ConfiguredDefinitionConfigSchema)\n            else EvaluateValueResult.for_value(config)\n        )\n\n\nclass AnonymousConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method not accept a name argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        config_schema: CoercableToConfigSchema = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this object's\n                config schema.  
In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self:\n        raise NotImplementedError()\n\n\nclass NamedConfigurableDefinition(ConfigurableDefinition):\n    """An interface that makes the `configured` method require a positional `name` argument."""\n\n    def configured(\n        self,\n        config_or_config_fn: Any,\n        name: str,\n        config_schema: Optional[UserConfigSchema] = None,\n        description: Optional[str] = None,\n    ) -> Self:\n        """Wraps this object in an object of the same type that provides configuration to the inner\n        object.\n\n        Using ``configured`` may result in config values being displayed in\n        the Dagster UI, so it is not recommended to use this API with sensitive values,\n        such as secrets.\n\n        Args:\n            config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n                that fully satisfies this object's config schema or (2) A function that accepts run\n                configuration and returns run configuration that fully satisfies this 
object's\n                config schema.  In the latter case, config_schema must be specified.  When\n                passing a function, it's easiest to use :py:func:`configured`.\n            name (str): Name of the new definition. This is a required argument, as this definition\n                type has a name uniqueness constraint.\n            config_schema (ConfigSchema): If config_or_config_fn is a function, the config schema\n                that its input must satisfy.\n            description (Optional[str]): Description of the new definition. If not specified,\n                inherits the description of the definition being configured.\n\n        Returns (ConfigurableDefinition): A configured version of this object.\n        """\n        name = check.str_param(name, "name")\n\n        new_config_schema = ConfiguredDefinitionConfigSchema(\n            self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n        )\n\n        return self.copy_for_configured(name, description, new_config_schema)\n\n    @abstractmethod\n    def copy_for_configured(\n        self,\n        name: str,\n        description: Optional[str],\n        config_schema: IDefinitionConfigSchema,\n    ) -> Self: ...\n\n\ndef _check_configurable_param(configurable: ConfigurableDefinition) -> None:\n    from dagster._core.definitions.composition import PendingNodeInvocation\n\n    check.param_invariant(\n        not isinstance(configurable, PendingNodeInvocation),\n        "configurable",\n        "You have invoked `configured` on a PendingNodeInvocation (an intermediate type), which"\n        " is produced by aliasing or tagging a node definition. To configure a node, you must"\n        " call `configured` on either an OpDefinition and GraphDefinition. To fix"\n        " this error, make sure to call `configured` on the definition object *before* using"\n        " the `tag` or `alias` methods. 
For usage examples, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n    check.inst_param(\n        configurable,\n        "configurable",\n        ConfigurableDefinition,\n        "Only the following types can be used with the `configured` method: ResourceDefinition,"\n        " ExecutorDefinition, GraphDefinition, NodeDefinition, and LoggerDefinition."\n        " For usage examples of `configured`, see"\n        " https://docs.dagster.io/concepts/configuration/configured",\n    )\n\n\nT_Configurable = TypeVar(\n    "T_Configurable", bound=Union["AnonymousConfigurableDefinition", "NamedConfigurableDefinition"]\n)\n\n\nclass FunctionAndConfigSchema(NamedTuple):\n    function: Callable[[Any], Any]\n    config_schema: Optional[UserConfigSchema]\n\n\ndef _wrap_user_fn_if_pythonic_config(\n    user_fn: Any, config_schema: Optional[UserConfigSchema]\n) -> FunctionAndConfigSchema:\n    """Helper function which allows users to provide a Pythonic config object to a @configurable\n    function. Detects if the function has a single parameter annotated with a Config class.\n    If so, wraps the function to convert the config dictionary into the appropriate Config object.\n    """\n    from dagster._config.pythonic_config import (\n        Config,\n        infer_schema_from_config_annotation,\n        safe_is_subclass,\n    )\n\n    if not isinstance(user_fn, Callable):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    config_fn_params = get_function_params(user_fn)\n    check.invariant(\n        len(config_fn_params) == 1, "@configured function should have exactly one parameter"\n    )\n\n    param = config_fn_params[0]\n\n    # If the parameter is a subclass of Config, we can infer the config schema from the\n    # type annotation. 
We'll also wrap the config mapping function to convert the config\n    # dictionary into the appropriate Config object.\n    if not safe_is_subclass(param.annotation, Config):\n        return FunctionAndConfigSchema(function=user_fn, config_schema=config_schema)\n\n    check.invariant(\n        config_schema is None,\n        "Cannot provide config_schema to @configured function with Config-annotated param",\n    )\n\n    config_schema_from_class = infer_schema_from_config_annotation(param.annotation, param.default)\n    config_cls = cast(Type[Config], param.annotation)\n\n    param_name = param.name\n\n    def wrapped_fn(config_as_dict) -> Any:\n        config_input = config_cls(**config_as_dict)\n        output = user_fn(**{param_name: config_input})\n\n        if isinstance(output, Config):\n            return output._convert_to_config_dictionary()  # noqa: SLF001\n        else:\n            return output\n\n    return FunctionAndConfigSchema(function=wrapped_fn, config_schema=config_schema_from_class)\n\n\n
[docs]def configured(\n configurable: T_Configurable,\n config_schema: Optional[UserConfigSchema] = None,\n **kwargs: Any,\n) -> Callable[[object], T_Configurable]:\n """A decorator that makes it easy to create a function-configured version of an object.\n\n The following definition types can be configured using this function:\n\n * :py:class:`GraphDefinition`\n * :py:class:`ExecutorDefinition`\n * :py:class:`LoggerDefinition`\n * :py:class:`ResourceDefinition`\n * :py:class:`OpDefinition`\n\n Using ``configured`` may result in config values being displayed in the Dagster UI,\n so it is not recommended to use this API with sensitive values, such as\n secrets.\n\n If the config that will be supplied to the object is constant, you may alternatively invoke this\n and call the result with a dict of config values to be curried. Examples of both strategies\n below.\n\n Args:\n configurable (ConfigurableDefinition): An object that can be configured.\n config_schema (ConfigSchema): The config schema that the inputs to the decorated function\n must satisfy. Alternatively, annotate the config parameter to the decorated function\n with a subclass of :py:class:`Config` and omit this argument.\n **kwargs: Arbitrary keyword arguments that will be passed to the initializer of the returned\n object.\n\n Returns:\n (Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])\n\n **Examples:**\n\n .. code-block:: python\n\n class GreetingConfig(Config):\n message: str\n\n @op\n def greeting_op(config: GreetingConfig):\n print(config.message)\n\n class HelloConfig(Config):\n name: str\n\n @configured(greeting_op)\n def hello_op(config: HelloConfig):\n return GreetingConfig(message=f"Hello, {config.name}!")\n\n .. 
code-block:: python\n\n dev_s3 = configured(S3Resource, name="dev_s3")({'bucket': 'dev'})\n\n @configured(S3Resource)\n def dev_s3(_):\n return {'bucket': 'dev'}\n\n @configured(S3Resource, {'bucket_prefix', str})\n def dev_s3(config):\n return {'bucket': config['bucket_prefix'] + 'dev'}\n\n """\n _check_configurable_param(configurable)\n\n if isinstance(configurable, NamedConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n fn_name = (\n getattr(config_or_config_fn, "__name__", None)\n if callable(config_or_config_fn)\n else None\n )\n name: str = check.not_none(kwargs.get("name") or fn_name)\n\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_or_config_fn=updated_fn,\n name=name,\n config_schema=new_config_schema,\n **{k: v for k, v in kwargs.items() if k != "name"},\n )\n\n return _configured\n elif isinstance(configurable, AnonymousConfigurableDefinition):\n\n def _configured(config_or_config_fn: object) -> T_Configurable:\n updated_fn, new_config_schema = _wrap_user_fn_if_pythonic_config(\n config_or_config_fn, config_schema\n )\n return configurable.configured(\n config_schema=new_config_schema, config_or_config_fn=updated_fn, **kwargs\n )\n\n return _configured\n else:\n check.failed(f"Invalid configurable definition type: {type(configurable)}")
\n
", "current_page_name": "_modules/dagster/_core/definitions/configurable", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.configurable"}, "decorators": {"asset_check_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_check_decorator

\nfrom typing import Any, Callable, Mapping, Optional, Set, Tuple, Union, cast\n\nfrom dagster import _check as check\nfrom dagster._annotations import experimental\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.asset_check_spec import AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import (\n    AssetChecksDefinition,\n    AssetChecksDefinitionInputOutputProps,\n)\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.output import Out\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import NoValueSentinel\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..input import In\nfrom .asset_decorator import (\n    get_function_params_without_context_or_config_or_resources,\n    stringify_asset_key_to_input_name,\n)\nfrom .op_decorator import _Op\n\nAssetCheckFunctionReturn = AssetCheckResult\nAssetCheckFunction = Callable[..., AssetCheckFunctionReturn]\n\n\ndef _build_asset_check_input(\n    name: str, asset_key: AssetKey, fn: Callable\n) -> Mapping[AssetKey, Tuple[str, In]]:\n    asset_params = get_function_params_without_context_or_config_or_resources(fn)\n\n    if len(asset_params) == 0:\n        input_name = stringify_asset_key_to_input_name(asset_key)\n        in_def = In(cast(type, Nothing))\n    elif len(asset_params) == 1:\n        input_name = asset_params[0].name\n        in_def = In(metadata={}, input_manager_key=None, dagster_type=NoValueSentinel)\n    else:\n        raise DagsterInvalidDefinitionError(\n            f"When defining check '{name}', multiple target assets provided as parameters:"\n            f" {[param.name for param in asset_params]}. 
Only one"\n            " is allowed."\n        )\n\n    return {\n        asset_key: (\n            input_name,\n            in_def,\n        )\n    }\n\n\n
[docs]@experimental\ndef asset_check(\n *,\n asset: Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset],\n name: Optional[str] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n compute_kind: Optional[str] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n) -> Callable[[AssetCheckFunction], AssetChecksDefinition]:\n """Create a definition for how to execute an asset check.\n\n Args:\n asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]): The\n asset that the check applies to.\n name (Optional[str]): The name of the check. If not specified, the name of the decorated\n function will be used. Checks for the same asset must have unique names.\n description (Optional[str]): The description of the check.\n required_resource_keys (Optional[Set[str]]): A set of keys for resources that are required\n by the function that execute the check. These can alternatively be specified by\n including resource-typed parameters in the function signature.\n config_schema (Optional[ConfigSchema): The configuration schema for the check's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that executes the check.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n compute_kind (Optional[str]): A string to represent the kind of computation that executes\n the check, e.g. 
"dbt" or "spark".\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that executes the check.\n\n\n Produces an :py:class:`AssetChecksDefinition` object.\n\n\n Example:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n\n @asset\n def my_asset() -> None:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows() -> AssetCheckResult:\n num_rows = ...\n return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n\n\n Example with a DataFrame Output:\n .. code-block:: python\n\n from dagster import asset, asset_check, AssetCheckResult\n from pandas import DataFrame\n\n @asset\n def my_asset() -> DataFrame:\n ...\n\n @asset_check(asset=my_asset, description="Check that my asset has enough rows")\n def my_asset_has_enough_rows(my_asset: DataFrame) -> AssetCheckResult:\n num_rows = my_asset.shape[0]\n return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n """\n\n def inner(fn: AssetCheckFunction) -> AssetChecksDefinition:\n check.callable_param(fn, "fn")\n resolved_name = name or fn.__name__\n asset_key = AssetKey.from_coercible_or_definition(asset)\n\n out = Out(dagster_type=None)\n input_tuples_by_asset_key = _build_asset_check_input(resolved_name, asset_key, fn)\n if len(input_tuples_by_asset_key) == 0:\n raise DagsterInvalidDefinitionError(\n f"No target asset provided when defining check '{resolved_name}'"\n )\n\n if len(input_tuples_by_asset_key) > 1:\n raise DagsterInvalidDefinitionError(\n f"When defining check '{resolved_name}', Multiple target assets provided:"\n f" {[key.to_user_string() for key in input_tuples_by_asset_key.keys()]}. 
Only one"\n " is allowed."\n )\n\n resolved_asset_key = next(iter(input_tuples_by_asset_key.keys()))\n spec = AssetCheckSpec(\n name=resolved_name,\n description=description,\n asset=resolved_asset_key,\n )\n\n op_def = _Op(\n name=spec.get_python_identifier(),\n ins=dict(input_tuples_by_asset_key.values()),\n out=out,\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=config_schema,\n retry_policy=retry_policy,\n )(fn)\n\n checks_def = AssetChecksDefinition(\n node_def=op_def,\n resource_defs={},\n specs=[spec],\n input_output_props=AssetChecksDefinitionInputOutputProps(\n asset_keys_by_input_name={\n input_tuples_by_asset_key[resolved_asset_key][0]: resolved_asset_key\n },\n asset_check_keys_by_output_name={op_def.output_defs[0].name: spec.key},\n ),\n )\n\n return checks_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_check_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_check_decorator"}, "asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.asset_decorator

\nfrom collections import Counter\nfrom inspect import Parameter\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, experimental_param\nfrom dagster._builtins import Nothing\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import get_function_params, get_valid_name_permutations\nfrom dagster._core.definitions.asset_dep import AssetDep, CoercibleToAssetDep\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.metadata import ArbitraryMetadataMapping, MetadataUserInput\nfrom dagster._core.definitions.partition_mapping import PartitionMapping\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.types.dagster_type import DagsterType\nfrom dagster._utils.warnings import (\n    disable_dagster_warnings,\n)\n\nfrom ..asset_check_spec import AssetCheckSpec\nfrom ..asset_in import AssetIn\nfrom ..asset_out import AssetOut\nfrom ..asset_spec import AssetSpec\nfrom ..assets import AssetsDefinition\nfrom ..backfill_policy import BackfillPolicy, BackfillPolicyType\nfrom ..decorators.graph_decorator import graph\nfrom ..decorators.op_decorator import _Op\nfrom ..events import AssetKey, CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom ..input import GraphIn, In\nfrom ..output import GraphOut, Out\nfrom ..partition import PartitionsDefinition\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..utils import 
DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, NoValueSentinel\n\n\n@overload\ndef asset(\n    compute_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef asset(\n    *,\n    name: Optional[str] = ...,\n    key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n    ins: Optional[Mapping[str, AssetIn]] = ...,\n    deps: Optional[Iterable[CoercibleToAssetDep]] = ...,\n    metadata: Optional[Mapping[str, Any]] = ...,\n    description: Optional[str] = ...,\n    config_schema: Optional[UserConfigSchema] = None,\n    required_resource_keys: Optional[Set[str]] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    io_manager_def: Optional[object] = ...,\n    io_manager_key: Optional[str] = ...,\n    compute_kind: Optional[str] = ...,\n    dagster_type: Optional[DagsterType] = ...,\n    partitions_def: Optional[PartitionsDefinition] = ...,\n    op_tags: Optional[Mapping[str, Any]] = ...,\n    group_name: Optional[str] = ...,\n    output_required: bool = ...,\n    freshness_policy: Optional[FreshnessPolicy] = ...,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n    backfill_policy: Optional[BackfillPolicy] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n    key: Optional[CoercibleToAssetKey] = None,\n    non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = ...,\n    check_specs: Optional[Sequence[AssetCheckSpec]] = ...,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\n@experimental_param(param="auto_materialize_policy")\n@experimental_param(param="backfill_policy")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef asset(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_def: Optional[object] = None,\n io_manager_key: Optional[str] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: Optional[CoercibleToAssetKey] = None,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Create a definition for how to compute an asset.\n\n A software-defined asset is the combination of:\n 1. An asset key, e.g. the name of a table.\n 2. A function, which can be run to compute the contents of the asset.\n 3. 
A set of upstream assets that are provided as inputs to the function when computing the asset.\n\n Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\n about the upstream assets it depends on. The upstream assets are inferred from the arguments\n to the decorated function. The name of the argument designates the name of the upstream asset.\n\n An asset has an op inside it to represent the function that computes it. The name of the op\n will be the segments of the asset key, separated by double-underscores.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in dagster (ie only contains\n letters, numbers, and _) and may not contain python reserved keywords.\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetDep, AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the op.\n io_manager_key (Optional[str]): The resource key of the IOManager used\n for storing the output of the op as an asset, and for loading it in downstream ops\n (default: "io_manager"). Only one of io_manager_key and io_manager_def can be provided.\n io_manager_def (Optional[object]): (Experimental) The IOManager used for\n storing the output of the op as an asset, and for loading it in\n downstream ops. Only one of io_manager_def and io_manager_key can be provided.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n dagster_type (Optional[DagsterType]): Allows specifying type validation functions that\n will be executed on the output of the decorated function after it runs.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n output_required (bool): Whether the decorated function will always materialize an asset.\n Defaults to True. 
If False, the function can return None, which will not be materialized to\n storage and will halt execution of downstream assets.\n freshness_policy (FreshnessPolicy): A constraint telling Dagster how often this asset is intended to be updated\n with respect to its root data.\n auto_materialize_policy (AutoMaterializePolicy): (Experimental) Configure Dagster to automatically materialize\n this asset according to its FreshnessPolicy and when upstream dependencies change.\n backfill_policy (BackfillPolicy): (Experimental) Configure Dagster to backfill this asset according to its\n BackfillPolicy.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code that generates this asset. In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the asset.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead.\n Set of asset keys that are upstream dependencies, but do not pass an input to the asset.\n key (Optional[CoeercibleToAssetKey]): The key for this asset. If provided, cannot specify key_prefix or name.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def my_asset(my_upstream_asset: int) -> int:\n return my_upstream_asset + 1\n """\n\n def create_asset():\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n return _Asset(\n name=cast(Optional[str], name), # (mypy bug that it can't infer name is Optional[str])\n key_prefix=key_prefix,\n ins=ins,\n deps=upstream_asset_deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n io_manager_key=io_manager_key,\n io_manager_def=io_manager_def,\n compute_kind=check.opt_str_param(compute_kind, "compute_kind"),\n dagster_type=dagster_type,\n partitions_def=partitions_def,\n op_tags=op_tags,\n group_name=group_name,\n output_required=output_required,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n retry_policy=retry_policy,\n code_version=code_version,\n check_specs=check_specs,\n key=key,\n )\n\n if compute_fn is not None:\n return create_asset()(compute_fn)\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n check.invariant(\n not (io_manager_key and io_manager_def),\n "Both io_manager_key and io_manager_def were provided to `@asset` decorator. Please"\n " provide one or the other. ",\n )\n return create_asset()(fn)\n\n return inner
\n\n\ndef _resolve_key_and_name(\n *,\n key: Optional[CoercibleToAssetKey],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n name: Optional[str],\n decorator: str,\n fn: Callable[..., Any],\n) -> Tuple[AssetKey, str]:\n if (name or key_prefix) and key:\n raise DagsterInvalidDefinitionError(\n f"Cannot specify a name or key prefix for {decorator} when the key"\n " argument is provided."\n )\n key_prefix_list = [key_prefix] if isinstance(key_prefix, str) else key_prefix\n key = AssetKey.from_coercible(key) if key else None\n assigned_name = name or fn.__name__\n return (\n (\n # the filter here appears unnecessary per typing, but this exists\n # historically so keeping it here to be conservative in case users\n # can get Nones into the key_prefix_list somehow\n AssetKey(list(filter(None, [*(key_prefix_list or []), assigned_name])))\n if not key\n else key\n ),\n assigned_name,\n )\n\n\nclass _Asset:\n def __init__(\n self,\n name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[AssetDep]] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: Optional[object] = None,\n compute_kind: Optional[str] = None,\n dagster_type: Optional[DagsterType] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n output_required: bool = True,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n key: 
Optional[CoercibleToAssetKey] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n ):\n self.name = name\n self.key_prefix = key_prefix\n self.ins = ins or {}\n self.deps = deps or []\n self.metadata = metadata\n self.description = description\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self.io_manager_key = io_manager_key\n self.io_manager_def = io_manager_def\n self.config_schema = config_schema\n self.compute_kind = compute_kind\n self.dagster_type = dagster_type\n self.partitions_def = partitions_def\n self.op_tags = op_tags\n self.resource_defs = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n self.group_name = group_name\n self.output_required = output_required\n self.freshness_policy = freshness_policy\n self.retry_policy = retry_policy\n self.auto_materialize_policy = auto_materialize_policy\n self.backfill_policy = backfill_policy\n self.code_version = code_version\n self.check_specs = check_specs\n self.key = key\n\n def __call__(self, fn: Callable) -> AssetsDefinition:\n from dagster._config.pythonic_config import (\n validate_resource_annotated_function,\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n validate_resource_annotated_function(fn)\n\n asset_ins = build_asset_ins(fn, self.ins or {}, {dep.asset_key for dep in self.deps})\n\n out_asset_key, asset_name = _resolve_key_and_name(\n key=self.key,\n key_prefix=self.key_prefix,\n name=self.name,\n fn=fn,\n decorator="@asset",\n )\n\n with disable_dagster_warnings():\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n\n bare_required_resource_keys = set(self.required_resource_keys)\n\n resource_defs_dict = self.resource_defs\n resource_defs_keys = set(resource_defs_dict.keys())\n decorator_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n io_manager_key = self.io_manager_key\n if self.io_manager_def:\n if not io_manager_key:\n 
io_manager_key = out_asset_key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in self.resource_defs\n and self.resource_defs[io_manager_key] != self.io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = self.io_manager_def\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n check.param_invariant(\n len(bare_required_resource_keys) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @asset decorator and as arguments"\n " to the decorated function",\n )\n\n io_manager_key = cast(str, io_manager_key) if io_manager_key else DEFAULT_IO_MANAGER_KEY\n\n out = Out(\n metadata=self.metadata or {},\n io_manager_key=io_manager_key,\n dagster_type=self.dagster_type if self.dagster_type else NoValueSentinel,\n description=self.description,\n is_required=self.output_required,\n code_version=self.code_version,\n )\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n self.check_specs, [out_asset_key]\n )\n check_outs: Mapping[str, Out] = {\n output_name: Out(dagster_type=None)\n for output_name in check_specs_by_output_name.keys()\n }\n\n op_required_resource_keys = decorator_resource_keys - arg_resource_keys\n\n op = _Op(\n name=out_asset_key.to_python_identifier(),\n description=self.description,\n ins=dict(asset_ins.values()),\n out={DEFAULT_OUTPUT: out, **check_outs},\n # Any resource requirements specified as arguments will be identified as\n # part of the Op definition instantiation\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": self.compute_kind} if self.compute_kind else {}),\n **(self.op_tags or {}),\n },\n config_schema=self.config_schema,\n retry_policy=self.retry_policy,\n code_version=self.code_version,\n )(fn)\n\n # check backfill policy is 
BackfillPolicyType.SINGLE_RUN for non-partitioned asset\n if self.partitions_def is None:\n check.param_invariant(\n (\n self.backfill_policy.policy_type is BackfillPolicyType.SINGLE_RUN\n if self.backfill_policy\n else True\n ),\n "backfill_policy",\n "Non partitioned asset can only have single run backfill policy",\n )\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in self.ins.items()\n if asset_in.partition_mapping is not None\n }\n\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=self.deps, asset_name=asset_name\n )\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n node_def=op,\n partitions_def=self.partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n resource_defs=wrapped_resource_defs,\n group_names_by_key={out_asset_key: self.group_name} if self.group_name else None,\n freshness_policies_by_key=(\n {out_asset_key: self.freshness_policy} if self.freshness_policy else None\n ),\n auto_materialize_policies_by_key=(\n {out_asset_key: self.auto_materialize_policy}\n if self.auto_materialize_policy\n else None\n ),\n backfill_policy=self.backfill_policy,\n asset_deps=None, # no asset deps in single-asset decorator\n selected_asset_keys=None, # no subselection in decorator\n can_subset=False,\n metadata_by_key={out_asset_key: self.metadata} if self.metadata else None,\n # see comment in @multi_asset's call to dagster_internal_init for the gory details\n # this is best understood as an _override_ which @asset does not support\n descriptions_by_key=None,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n\n
[docs]@experimental_param(param="resource_defs")\n@deprecated_param(\n param="non_argument_deps", breaking_version="2.0.0", additional_warn_text="use `deps` instead."\n)\ndef multi_asset(\n *,\n outs: Optional[Mapping[str, AssetOut]] = None,\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[CoercibleToAssetDep]] = None,\n description: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[Set[str]] = None,\n compute_kind: Optional[str] = None,\n internal_asset_deps: Optional[Mapping[str, Set[AssetKey]]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, object]] = None,\n group_name: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n specs: Optional[Sequence[AssetSpec]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n # deprecated\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same op and same\n upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n You can set I/O managers keys, auto-materialize policies, freshness policies, group names, etc.\n on an individual asset within the multi-asset by attaching them to the :py:class:`AssetOut`\n corresponding to that asset in the `outs` parameter.\n\n Args:\n name (Optional[str]): The name of the op.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the assets materialized by\n this function. 
AssetOuts detail the output, IO management, and core asset properties.\n This argument is required except when AssetSpecs are used.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]):\n The assets that are upstream dependencies, but do not correspond to a parameter of the\n decorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\n all assets created by the multi_asset will be created.\n config_schema (Optional[ConfigSchema]): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. If not set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the underlying op.\n compute_kind (Optional[str]): A string to represent the kind of computation that produces\n the asset, e.g. "dbt" or "spark". It will be displayed in the Dagster UI as a badge on the asset.\n internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]): By default, it is assumed\n that all assets produced by a multi_asset depend on all assets that are consumed by that\n multi asset. If this default is not correct, you pass in a map of output names to a\n corrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\n used as input to the asset or produced within the op.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the op that computes the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to an op. 
Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n can_subset (bool): If this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. Defaults to False.\n resource_defs (Optional[Mapping[str, object]]):\n (Experimental) A mapping of resource keys to resources. These resources\n will be initialized during execution, and can be accessed from the\n context within the body of the function.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the multi-asset. If set,\n this is used as a default code version for all defined assets.\n specs (Optional[Sequence[AssetSpec]]): (Experimental) The specifications for the assets materialized\n by this function.\n check_specs (Optional[Sequence[AssetCheckSpec]]): (Experimental) Specs for asset checks that\n execute in the decorated function after materializing the assets.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are upstream\n dependencies, but do not pass an input to the multi_asset.\n\n Examples:\n .. 
code-block:: python\n\n # Use IO managers to handle I/O:\n @multi_asset(\n outs={\n "my_string_asset": AssetOut(),\n "my_int_asset": AssetOut(),\n }\n )\n def my_function(upstream_asset: int):\n result = upstream_asset + 1\n return str(result), result\n\n # Handle I/O on your own:\n @multi_asset(\n outs={\n "asset1": AssetOut(),\n "asset2": AssetOut(),\n },\n deps=["asset0"],\n )\n def my_function():\n asset0_value = load(path="asset0")\n asset1_result, asset2_result = do_some_transformation(asset0_value)\n write(asset1_result, path="asset1")\n write(asset2_result, path="asset2")\n return None, None\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n specs = check.opt_list_param(specs, "specs", of_type=AssetSpec)\n\n upstream_asset_deps = _deps_and_non_argument_deps_to_asset_deps(\n deps=deps, non_argument_deps=non_argument_deps\n )\n\n asset_deps = check.opt_mapping_param(\n internal_asset_deps, "internal_asset_deps", key_type=str, value_type=set\n )\n required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resource_defs, "resource_defs", key_type=str)\n )\n\n _config_schema = check.opt_mapping_param(\n config_schema, # type: ignore\n "config_schema",\n additional_message="Only dicts are supported for asset config_schema.",\n )\n\n bare_required_resource_keys = set(required_resource_keys)\n resource_defs_keys = set(resource_defs.keys())\n required_resource_keys = bare_required_resource_keys | resource_defs_keys\n\n asset_out_map: Mapping[str, AssetOut] = {} if outs is None else outs\n\n def inner(fn: Callable[..., Any]) -> AssetsDefinition:\n op_name = name or fn.__name__\n\n if asset_out_map and specs:\n raise DagsterInvalidDefinitionError("Must specify only outs or specs but not both.")\n elif specs:\n output_tuples_by_asset_key = {}\n for asset_spec in specs:\n # output names are asset keys 
joined with _\n output_name = "_".join(asset_spec.key.path)\n output_tuples_by_asset_key[asset_spec.key] = (\n output_name,\n Out(\n Nothing,\n is_required=not (can_subset or asset_spec.skippable),\n description=asset_spec.description,\n ),\n )\n if upstream_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass deps and specs to @multi_asset, specify deps on the AssetSpecs"\n " directly."\n )\n if internal_asset_deps:\n raise DagsterInvalidDefinitionError(\n "Can not pass internal_asset_deps and specs to @multi_asset, specify deps on"\n " the AssetSpecs directly."\n )\n\n upstream_keys = set()\n for spec in specs:\n for dep in spec.deps:\n if dep.asset_key not in output_tuples_by_asset_key:\n upstream_keys.add(dep.asset_key)\n if (\n dep.asset_key in output_tuples_by_asset_key\n and dep.partition_mapping is not None\n ):\n # self-dependent asset also needs to be considered an upstream_key\n upstream_keys.add(dep.asset_key)\n\n explicit_ins = ins or {}\n # get which asset keys have inputs set\n loaded_upstreams = build_asset_ins(fn, explicit_ins, deps=set())\n unexpected_upstreams = {\n key for key in loaded_upstreams.keys() if key not in upstream_keys\n }\n if unexpected_upstreams:\n raise DagsterInvalidDefinitionError(\n f"Asset inputs {unexpected_upstreams} do not have dependencies on the passed"\n " AssetSpec(s). 
Set the deps on the appropriate AssetSpec(s)."\n )\n remaining_upstream_keys = {key for key in upstream_keys if key not in loaded_upstreams}\n asset_ins = build_asset_ins(fn, explicit_ins, deps=remaining_upstream_keys)\n else:\n asset_ins = build_asset_ins(\n fn,\n ins or {},\n deps=(\n {dep.asset_key for dep in upstream_asset_deps} if upstream_asset_deps else set()\n ),\n )\n output_tuples_by_asset_key = build_asset_outs(asset_out_map)\n # validate that the asset_deps make sense\n valid_asset_deps = set(asset_ins.keys()) | set(output_tuples_by_asset_key.keys())\n for out_name, asset_keys in asset_deps.items():\n if asset_out_map and out_name not in asset_out_map:\n check.failed(\n f"Invalid out key '{out_name}' supplied to `internal_asset_deps` argument"\n f" for multi-asset {op_name}. Must be one of the outs for this multi-asset"\n f" {list(asset_out_map.keys())[:20]}.",\n )\n invalid_asset_deps = asset_keys.difference(valid_asset_deps)\n check.invariant(\n not invalid_asset_deps,\n f"Invalid asset dependencies: {invalid_asset_deps} specified in"\n f" `internal_asset_deps` argument for multi-asset '{op_name}' on key"\n f" '{out_name}'. Each specified asset key must be associated with an input to"\n " the asset or produced by this asset. 
Valid keys:"\n f" {list(valid_asset_deps)[:20]}",\n )\n\n arg_resource_keys = {arg.name for arg in get_resource_args(fn)}\n check.param_invariant(\n len(bare_required_resource_keys or []) == 0 or len(arg_resource_keys) == 0,\n "Cannot specify resource requirements in both @multi_asset decorator and as"\n " arguments to the decorated function",\n )\n\n asset_outs_by_output_name: Mapping[str, Out] = dict(output_tuples_by_asset_key.values())\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(output_tuples_by_asset_key.keys())\n )\n check_outs_by_output_name: Mapping[str, Out] = {\n output_name: Out(dagster_type=None, is_required=not can_subset)\n for output_name in check_specs_by_output_name.keys()\n }\n overlapping_output_names = (\n asset_outs_by_output_name.keys() & check_outs_by_output_name.keys()\n )\n check.invariant(\n len(overlapping_output_names) == 0,\n f"Check output names overlap with asset output names: {overlapping_output_names}",\n )\n combined_outs_by_output_name: Mapping[str, Out] = {\n **asset_outs_by_output_name,\n **check_outs_by_output_name,\n }\n\n with disable_dagster_warnings():\n op_required_resource_keys = required_resource_keys - arg_resource_keys\n\n op = _Op(\n name=op_name,\n description=description,\n ins=dict(asset_ins.values()),\n out=combined_outs_by_output_name,\n required_resource_keys=op_required_resource_keys,\n tags={\n **({"kind": compute_kind} if compute_kind else {}),\n **(op_tags or {}),\n },\n config_schema=_config_schema,\n retry_policy=retry_policy,\n code_version=code_version,\n )(fn)\n\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n keys_by_output_name = {\n output_name: asset_key\n for asset_key, (output_name, _) in output_tuples_by_asset_key.items()\n }\n partition_mappings = {\n keys_by_input_name[input_name]: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if 
asset_in.partition_mapping is not None\n }\n\n if upstream_asset_deps:\n partition_mappings = _get_partition_mappings_from_deps(\n partition_mappings=partition_mappings, deps=upstream_asset_deps, asset_name=op_name\n )\n\n if specs:\n internal_deps = {\n spec.key: {dep.asset_key for dep in spec.deps}\n for spec in specs\n if spec.deps is not None\n }\n props_by_asset_key: Mapping[AssetKey, Union[AssetSpec, AssetOut]] = {\n spec.key: spec for spec in specs\n }\n # Add PartitionMappings specified via AssetSpec.deps to partition_mappings dictionary. Error on duplicates\n for spec in specs:\n for dep in spec.deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" multi_asset {op_name}. 
Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n else:\n internal_deps = {keys_by_output_name[name]: asset_deps[name] for name in asset_deps}\n props_by_asset_key = {\n keys_by_output_name[output_name]: asset_out\n for output_name, asset_out in asset_out_map.items()\n }\n\n # handle properties defined on AssetSpecs or AssetOuts\n group_names_by_key = {\n asset_key: props.group_name\n for asset_key, props in props_by_asset_key.items()\n if props.group_name is not None\n }\n if group_name:\n check.invariant(\n not group_names_by_key,\n "Cannot set group_name parameter on multi_asset if one or more of the"\n " AssetSpecs/AssetOuts supplied to this multi_asset have a group_name defined.",\n )\n group_names_by_key = {asset_key: group_name for asset_key in props_by_asset_key}\n\n freshness_policies_by_key = {\n asset_key: props.freshness_policy\n for asset_key, props in props_by_asset_key.items()\n if props.freshness_policy is not None\n }\n auto_materialize_policies_by_key = {\n asset_key: props.auto_materialize_policy\n for asset_key, props in props_by_asset_key.items()\n if props.auto_materialize_policy is not None\n }\n metadata_by_key = {\n asset_key: props.metadata\n for asset_key, props in props_by_asset_key.items()\n if props.metadata is not None\n }\n\n return AssetsDefinition.dagster_internal_init(\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name=keys_by_output_name,\n node_def=op,\n asset_deps=internal_deps,\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n can_subset=can_subset,\n resource_defs=resource_defs,\n group_names_by_key=group_names_by_key,\n freshness_policies_by_key=freshness_policies_by_key,\n auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n backfill_policy=backfill_policy,\n selected_asset_keys=None, # no subselection in decorator\n # descriptions by key is more accurately understood as _overriding_ the descriptions\n # by key that 
are in the OutputDefinitions associated with the asset key.\n # This is a dangerous construction liable for bugs. Instead there should be a\n # canonical source of asset descriptions in AssetsDefinition and if we need\n # to create a memoized cached dictionary of asset keys for perf or something we do\n # that in the `__init__` or on demand.\n #\n # This is actually an override. We do not override descriptions\n # in OutputDefinitions in @multi_asset\n descriptions_by_key=None,\n metadata_by_key=metadata_by_key,\n check_specs_by_output_name=check_specs_by_output_name,\n selected_asset_check_keys=None, # no subselection in decorator\n )\n\n return inner
\n\n\ndef get_function_params_without_context_or_config_or_resources(fn: Callable) -> List[Parameter]:\n params = get_function_params(fn)\n is_context_provided = len(params) > 0 and params[0].name in get_valid_name_permutations(\n "context"\n )\n input_params = params[1:] if is_context_provided else params\n\n resource_arg_names = {arg.name for arg in get_resource_args(fn)}\n\n new_input_args = []\n for input_arg in input_params:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n\n return new_input_args\n\n\ndef stringify_asset_key_to_input_name(asset_key: AssetKey) -> str:\n return "_".join(asset_key.path).replace("-", "_")\n\n\ndef build_asset_ins(\n fn: Callable,\n asset_ins: Mapping[str, AssetIn],\n deps: Optional[AbstractSet[AssetKey]],\n) -> Mapping[AssetKey, Tuple[str, In]]:\n """Creates a mapping from AssetKey to (name of input, In object)."""\n deps = check.opt_set_param(deps, "deps", AssetKey)\n\n new_input_args = get_function_params_without_context_or_config_or_resources(fn)\n\n non_var_input_param_names = [\n param.name for param in new_input_args if param.kind == Parameter.POSITIONAL_OR_KEYWORD\n ]\n has_kwargs = any(param.kind == Parameter.VAR_KEYWORD for param in new_input_args)\n\n all_input_names = set(non_var_input_param_names) | asset_ins.keys()\n\n if not has_kwargs:\n for in_key, asset_in in asset_ins.items():\n if in_key not in non_var_input_param_names and (\n not isinstance(asset_in.dagster_type, DagsterType)\n or not asset_in.dagster_type.is_nothing\n ):\n raise DagsterInvalidDefinitionError(\n f"Key '{in_key}' in provided ins dict does not correspond to any of the names "\n "of the arguments to the decorated function"\n )\n\n ins_by_asset_key: Dict[AssetKey, Tuple[str, In]] = {}\n for input_name in all_input_names:\n asset_key = None\n\n if input_name in asset_ins:\n asset_key = asset_ins[input_name].key\n metadata = asset_ins[input_name].metadata or {}\n key_prefix = 
asset_ins[input_name].key_prefix\n input_manager_key = asset_ins[input_name].input_manager_key\n dagster_type = asset_ins[input_name].dagster_type\n else:\n metadata = {}\n key_prefix = None\n input_manager_key = None\n dagster_type = NoValueSentinel\n\n asset_key = asset_key or AssetKey(list(filter(None, [*(key_prefix or []), input_name])))\n\n ins_by_asset_key[asset_key] = (\n input_name.replace("-", "_"),\n In(metadata=metadata, input_manager_key=input_manager_key, dagster_type=dagster_type),\n )\n\n for asset_key in deps:\n if asset_key in ins_by_asset_key:\n raise DagsterInvalidDefinitionError(\n f"deps value {asset_key} also declared as input/AssetIn"\n )\n # mypy doesn't realize that Nothing is a valid type here\n ins_by_asset_key[asset_key] = (\n stringify_asset_key_to_input_name(asset_key),\n In(cast(type, Nothing)),\n )\n\n return ins_by_asset_key\n\n\n@overload\ndef graph_asset(\n compose_fn: Callable,\n) -> AssetsDefinition: ...\n\n\n@overload\ndef graph_asset(\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = ...,\n freshness_policy: Optional[FreshnessPolicy] = ...,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = ...,\n backfill_policy: Optional[BackfillPolicy] = ...,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = ...,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]: ...\n\n\n
[docs]def graph_asset(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n group_name: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n metadata: Optional[MetadataUserInput] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n key: Optional[CoercibleToAssetKey] = None,\n) -> Union[AssetsDefinition, Callable[[Callable[..., Any]], AssetsDefinition]]:\n """Creates a software-defined asset that's computed using a graph of ops.\n\n This decorator is meant to decorate a function that composes a set of ops or graphs to define\n the dependencies between them.\n\n Args:\n name (Optional[str]): The name of the asset. If not provided, defaults to the name of the\n decorated function. The asset's name must be a valid name in Dagster (ie only contains\n letters, numbers, and underscores) and may not contain Python reserved keywords.\n description (Optional[str]):\n A human-readable description of the asset.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n config (Optional[Union[ConfigMapping, Mapping[str, Any]]]):\n Describes how the graph underlying the asset is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. 
This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in Dagster (ie only\n contains letters, numbers, and underscores) and may not contain Python reserved keywords.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If\n not provided, the name "default" is used.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n metadata (Optional[MetadataUserInput]): Dictionary of metadata to be associated with\n the asset.\n freshness_policy (Optional[FreshnessPolicy]): A constraint telling Dagster how often this asset is\n intended to be updated with respect to its root data.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): The AutoMaterializePolicy to use\n for this asset.\n backfill_policy (Optional[BackfillPolicy]): The BackfillPolicy to use for this asset.\n key (Optional[CoercibleToAssetKey]): The key for this asset. If provided, cannot specify key_prefix or name.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def fetch_files_from_slack(context) -> pd.DataFrame:\n ...\n\n @op\n def store_files_in_table(files) -> None:\n files.to_sql(name="slack_files", con=create_db_connection())\n\n @graph_asset\n def slack_files_table():\n return store_files_in_table(fetch_files_from_slack())\n """\n if compose_fn is None:\n return lambda fn: graph_asset( # type: ignore # (decorator pattern)\n fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )\n else:\n return graph_asset_no_defaults(\n compose_fn=compose_fn,\n name=name,\n description=description,\n ins=ins,\n config=config,\n key_prefix=key_prefix,\n group_name=group_name,\n partitions_def=partitions_def,\n metadata=metadata,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n resource_defs=resource_defs,\n check_specs=check_specs,\n key=key,\n )
\n\n\ndef graph_asset_no_defaults(\n *,\n compose_fn: Callable,\n name: Optional[str],\n description: Optional[str],\n ins: Optional[Mapping[str, AssetIn]],\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n group_name: Optional[str],\n partitions_def: Optional[PartitionsDefinition],\n metadata: Optional[MetadataUserInput],\n freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n resource_defs: Optional[Mapping[str, ResourceDefinition]],\n check_specs: Optional[Sequence[AssetCheckSpec]],\n key: Optional[CoercibleToAssetKey],\n) -> AssetsDefinition:\n ins = ins or {}\n asset_ins = build_asset_ins(compose_fn, ins or {}, set())\n out_asset_key, _asset_name = _resolve_key_and_name(\n key=key,\n key_prefix=key_prefix,\n name=name,\n decorator="@graph_asset",\n fn=compose_fn,\n )\n\n keys_by_input_name = {input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()}\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in ins.items()\n if asset_in.partition_mapping\n }\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, [out_asset_key]\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name: Mapping = {\n "result": GraphOut(),\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=out_asset_key.to_python_identifier(),\n description=description,\n config=config,\n ins={input_name: GraphIn() for _, (input_name, _) in asset_ins.items()},\n out=combined_outs_by_output_name,\n )(compose_fn)\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n keys_by_output_name={"result": out_asset_key},\n partitions_def=partitions_def,\n 
partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n metadata_by_output_name={"result": metadata} if metadata else None,\n freshness_policies_by_output_name=(\n {"result": freshness_policy} if freshness_policy else None\n ),\n auto_materialize_policies_by_output_name=(\n {"result": auto_materialize_policy} if auto_materialize_policy else None\n ),\n backfill_policy=backfill_policy,\n descriptions_by_output_name={"result": description} if description else None,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n\n
[docs]def graph_multi_asset(\n *,\n outs: Mapping[str, AssetOut],\n name: Optional[str] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n group_name: Optional[str] = None,\n can_subset: bool = False,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n check_specs: Optional[Sequence[AssetCheckSpec]] = None,\n) -> Callable[[Callable[..., Any]], AssetsDefinition]:\n """Create a combined definition of multiple assets that are computed using the same graph of\n ops, and the same upstream assets.\n\n Each argument to the decorated function references an upstream asset that this asset depends on.\n The name of the argument designates the name of the upstream asset.\n\n Args:\n name (Optional[str]): The name of the graph.\n outs: (Optional[Dict[str, AssetOut]]): The AssetOuts representing the produced assets.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the assets.\n backfill_policy (Optional[BackfillPolicy]): The backfill policy for the asset.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n can_subset (bool): Whether this asset's computation can emit a subset of the asset\n keys based on the context.selected_assets argument. 
Defaults to False.\n """\n\n def inner(fn: Callable) -> AssetsDefinition:\n partition_mappings = {\n input_name: asset_in.partition_mapping\n for input_name, asset_in in (ins or {}).items()\n if asset_in.partition_mapping\n }\n\n asset_ins = build_asset_ins(fn, ins or {}, set())\n keys_by_input_name = {\n input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n }\n asset_outs = build_asset_outs(outs)\n\n check_specs_by_output_name = _validate_and_assign_output_names_to_check_specs(\n check_specs, list(asset_outs.keys())\n )\n check_outs_by_output_name: Mapping[str, GraphOut] = {\n output_name: GraphOut() for output_name in check_specs_by_output_name.keys()\n }\n\n combined_outs_by_output_name = {\n **{output_name: GraphOut() for output_name, _ in asset_outs.values()},\n **check_outs_by_output_name,\n }\n\n op_graph = graph(\n name=name or fn.__name__,\n out=combined_outs_by_output_name,\n )(fn)\n\n # source metadata from the AssetOuts (if any)\n metadata_by_output_name = {\n output_name: out.metadata\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.metadata is not None\n }\n\n # source freshness policies from the AssetOuts (if any)\n freshness_policies_by_output_name = {\n output_name: out.freshness_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.freshness_policy is not None\n }\n\n # source auto materialize policies from the AssetOuts (if any)\n auto_materialize_policies_by_output_name = {\n output_name: out.auto_materialize_policy\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.auto_materialize_policy is not None\n }\n\n # source descriptions from the AssetOuts (if any)\n descriptions_by_output_name = {\n output_name: out.description\n for output_name, out in outs.items()\n if isinstance(out, AssetOut) and out.description is not None\n }\n\n return AssetsDefinition.from_graph(\n op_graph,\n keys_by_input_name=keys_by_input_name,\n 
keys_by_output_name={\n output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n },\n partitions_def=partitions_def,\n partition_mappings=partition_mappings if partition_mappings else None,\n group_name=group_name,\n can_subset=can_subset,\n metadata_by_output_name=metadata_by_output_name,\n freshness_policies_by_output_name=freshness_policies_by_output_name,\n auto_materialize_policies_by_output_name=auto_materialize_policies_by_output_name,\n backfill_policy=backfill_policy,\n descriptions_by_output_name=descriptions_by_output_name,\n resource_defs=resource_defs,\n check_specs=check_specs,\n )\n\n return inner
\n\n\ndef build_asset_outs(asset_outs: Mapping[str, AssetOut]) -> Mapping[AssetKey, Tuple[str, Out]]:\n """Creates a mapping from AssetKey to (name of output, Out object)."""\n outs_by_asset_key: Dict[AssetKey, Tuple[str, Out]] = {}\n for output_name, asset_out in asset_outs.items():\n out = asset_out.to_out()\n asset_key = asset_out.key or AssetKey(\n list(filter(None, [*(asset_out.key_prefix or []), output_name]))\n )\n\n outs_by_asset_key[asset_key] = (output_name.replace("-", "_"), out)\n\n return outs_by_asset_key\n\n\ndef _deps_and_non_argument_deps_to_asset_deps(\n deps: Optional[Iterable[CoercibleToAssetDep]],\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]],\n) -> Optional[Iterable[AssetDep]]:\n """Helper function for managing deps and non_argument_deps while non_argument_deps is still an accepted parameter.\n Ensures only one of deps and non_argument_deps is provided, then converts the deps to AssetDeps.\n """\n if non_argument_deps is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and non_argument_deps to @asset. Use only deps instead."\n )\n\n if deps is not None:\n return _make_asset_deps(deps)\n\n if non_argument_deps is not None:\n check.set_param(non_argument_deps, "non_argument_deps", of_type=(AssetKey, str))\n return _make_asset_deps(non_argument_deps)\n\n\ndef _make_asset_deps(deps: Optional[Iterable[CoercibleToAssetDep]]) -> Optional[Iterable[AssetDep]]:\n if deps is None:\n return None\n\n # expand any multi_assets into a list of keys\n all_deps = []\n for dep in deps:\n if isinstance(dep, AssetsDefinition) and len(dep.keys) > 1:\n all_deps.extend(dep.keys)\n else:\n all_deps.append(dep)\n\n with disable_dagster_warnings():\n dep_dict = {}\n for dep in all_deps:\n asset_dep = AssetDep.from_coercible(dep)\n\n # we cannot do deduplication via a set because MultiPartitionMappings have an internal\n # dictionary that cannot be hashed. 
Instead deduplicate by making a dictionary and checking\n # for existing keys. If an asset is specified as a dependency more than once, only error if the\n # dependency is different (ie has a different PartitionMapping)\n if (\n asset_dep.asset_key in dep_dict.keys()\n and asset_dep != dep_dict[asset_dep.asset_key]\n ):\n raise DagsterInvariantViolationError(\n f"Cannot set a dependency on asset {asset_dep.asset_key} more than once per"\n " asset."\n )\n dep_dict[asset_dep.asset_key] = asset_dep\n\n return list(dep_dict.values())\n\n\ndef _validate_and_assign_output_names_to_check_specs(\n check_specs: Optional[Sequence[AssetCheckSpec]], valid_asset_keys: Sequence[AssetKey]\n) -> Mapping[str, AssetCheckSpec]:\n check_specs_by_output_name = {spec.get_python_identifier(): spec for spec in check_specs or []}\n if check_specs and len(check_specs_by_output_name) != len(check_specs):\n duplicates = {\n item: count\n for item, count in Counter(\n [(spec.asset_key, spec.name) for spec in check_specs]\n ).items()\n if count > 1\n }\n\n raise DagsterInvalidDefinitionError(f"Duplicate check specs: {duplicates}")\n\n for spec in check_specs_by_output_name.values():\n if spec.asset_key not in valid_asset_keys:\n raise DagsterInvalidDefinitionError(\n f"Invalid asset key {spec.asset_key} in check spec {spec.name}. Must be one of"\n f" {valid_asset_keys}"\n )\n\n return check_specs_by_output_name\n\n\ndef _get_partition_mappings_from_deps(\n partition_mappings: Dict[AssetKey, PartitionMapping], deps: Iterable[AssetDep], asset_name: str\n):\n # Add PartitionMappings specified via AssetDeps to partition_mappings dictionary. 
Error on duplicates\n for dep in deps:\n if dep.partition_mapping is None:\n continue\n if partition_mappings.get(dep.asset_key, None) is None:\n partition_mappings[dep.asset_key] = dep.partition_mapping\n continue\n if partition_mappings[dep.asset_key] == dep.partition_mapping:\n continue\n else:\n raise DagsterInvalidDefinitionError(\n f"Two different PartitionMappings for {dep.asset_key} provided for"\n f" asset {asset_name}. Please use the same PartitionMapping for"\n f" {dep.asset_key}."\n )\n\n return partition_mappings\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.asset_decorator"}, "graph_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.graph_decorator

\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Mapping, Optional, Sequence, Union, overload\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..input import GraphIn, InputDefinition\nfrom ..output import GraphOut, OutputDefinition\n\n\nclass _Graph:\n    name: Optional[str]\n    description: Optional[str]\n    input_defs: Sequence[InputDefinition]\n    output_defs: Optional[Sequence[OutputDefinition]]\n    ins: Optional[Mapping[str, GraphIn]]\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]]\n    tags: Optional[Mapping[str, str]]\n    config_mapping: Optional[ConfigMapping]\n\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        input_defs: Optional[Sequence[InputDefinition]] = None,\n        output_defs: Optional[Sequence[OutputDefinition]] = None,\n        ins: Optional[Mapping[str, GraphIn]] = None,\n        out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        config_mapping: Optional[ConfigMapping] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.input_defs = check.opt_sequence_param(\n            input_defs, "input_defs", of_type=InputDefinition\n        )\n        self.did_pass_outputs = output_defs is not None or out is not None\n        self.output_defs = check.opt_nullable_sequence_param(\n            output_defs, "output_defs", of_type=OutputDefinition\n        )\n        self.ins = ins\n        self.out = out\n        self.tags = tags\n        self.config_mapping = check.opt_inst_param(config_mapping, "config_mapping", ConfigMapping)\n\n    def __call__(self, fn: Callable[..., Any]) -> GraphDefinition:\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        if self.ins is not None:\n            input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]\n        else:\n            input_defs = check.opt_list_param(\n                self.input_defs, "input_defs", of_type=InputDefinition\n            )\n\n        if self.out is None:\n            output_defs = self.output_defs\n        elif isinstance(self.out, GraphOut):\n            output_defs = [self.out.to_definition(name=None)]\n        else:\n            check.dict_param(self.out, "out", key_type=str, value_type=GraphOut)\n            output_defs = [out.to_definition(name=name) for name, out in self.out.items()]\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@graph",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=input_defs,\n            provided_output_defs=output_defs,\n            ignore_output_from_composition_fn=False,\n            config_mapping=self.config_mapping,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n        update_wrapper(graph_def, fn)\n        return graph_def\n\n\n@overload\ndef graph(compose_fn: Callable) -> 
GraphDefinition: ...\n\n\n@overload\ndef graph(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    input_defs: Optional[Sequence[InputDefinition]] = ...,\n    output_defs: Optional[Sequence[OutputDefinition]] = ...,\n    ins: Optional[Mapping[str, GraphIn]] = ...,\n    out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = ...,\n) -> _Graph: ...\n\n\n
[docs]def graph(\n compose_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_defs: Optional[Sequence[InputDefinition]] = None,\n output_defs: Optional[Sequence[OutputDefinition]] = None,\n ins: Optional[Mapping[str, GraphIn]] = None,\n out: Optional[Union[GraphOut, Mapping[str, GraphOut]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n config: Optional[Union[ConfigMapping, Mapping[str, Any]]] = None,\n) -> Union[GraphDefinition, _Graph]:\n """Create an op graph with the specified parameters from the decorated composition function.\n\n Using this decorator allows you to build up a dependency graph by writing a\n function that invokes ops (or other graphs) and passes the output to subsequent invocations.\n\n Args:\n name (Optional[str]):\n The name of the op graph. Must be unique within any :py:class:`RepositoryDefinition` containing the graph.\n description (Optional[str]):\n A human-readable description of the graph.\n input_defs (Optional[List[InputDefinition]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit InputDefinitions taking precedence.\n\n Uses of inputs in the body of the decorated composition function will determine\n the :py:class:`InputMappings <InputMapping>` passed to the underlying\n :py:class:`GraphDefinition`.\n output_defs (Optional[List[OutputDefinition]]):\n Output definitions for the graph. 
If not provided explicitly, these will be inferred from typehints.\n\n Uses of these outputs in the body of the decorated composition function, as well as the\n return value of the decorated function, will be used to infer the appropriate set of\n :py:class:`OutputMappings <OutputMapping>` for the underlying\n :py:class:`GraphDefinition`.\n\n To map multiple outputs, return a dictionary from the composition function.\n ins (Optional[Dict[str, GraphIn]]):\n Information about the inputs that this graph maps. Information provided here\n will be combined with what can be inferred from the function signature, with these\n explicit GraphIn taking precedence.\n out (Optional[Union[GraphOut, Dict[str, GraphOut]]]):\n Information about the outputs that this graph maps. Information provided here will be\n combined with what can be inferred from the return type signature if the function does\n not use yield.\n\n To map multiple outputs, return a dictionary from the composition function.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution run of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n config (Optional[Union[ConfigMapping], Mapping[str, Any]):\n Describes how the graph is configured at runtime.\n\n If a :py:class:`ConfigMapping` object is provided, then the graph takes on the config\n schema of this object. The mapping will be applied at runtime to generate the config for\n the graph's constituent nodes.\n\n If a dictionary is provided, then it will be used as the default run config for the\n graph. This means it must conform to the config schema of the underlying nodes. Note\n that the values provided will be viewable and editable in the Dagster UI, so be careful\n with secrets. 
its constituent nodes.\n\n If no value is provided, then the config schema for the graph is the default (derived\n from the underlying nodes).\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Graph()(compose_fn)\n\n config_mapping = None\n # Case 1: a dictionary of config is provided, convert to config mapping.\n if config is not None and not isinstance(config, ConfigMapping):\n config = check.dict_param(config, "config", key_type=str)\n config_mapping = ConfigMapping(config_fn=lambda _: config, config_schema=None)\n # Case 2: actual config mapping is provided.\n else:\n config_mapping = config\n\n return _Graph(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n ins=ins,\n out=out,\n tags=tags,\n config_mapping=config_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/graph_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.graph_decorator"}, "hook_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.hook_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ...decorator_utils import get_function_params, validate_expected_params\nfrom ..events import HookExecutionResult\nfrom ..hook_definition import HookDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.hook import HookContext\n\n\ndef _validate_hook_fn_params(fn, expected_positionals):\n    params = get_function_params(fn)\n    missing_positional = validate_expected_params(params, expected_positionals)\n    if missing_positional:\n        raise DagsterInvalidDefinitionError(\n            f"'{fn.__name__}' decorated function does not have required positional "\n            f"parameter '{missing_positional}'. Hook functions should only have keyword arguments "\n            "that match input names and a first positional parameter named 'context' and "\n            "a second positional parameter named 'event_list'."\n        )\n\n\nclass _Hook:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        decorated_fn: Optional[Callable[..., Any]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.required_resource_keys = check.opt_set_param(\n            required_resource_keys, "required_resource_keys"\n        )\n        self.decorated_fn = check.opt_callable_param(decorated_fn, "decorated_fn")\n\n    def __call__(self, fn) -> HookDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        expected_positionals = ["context", "event_list"]\n\n        _validate_hook_fn_params(fn, expected_positionals)\n\n        
hook_def = HookDefinition(\n            name=self.name or "",\n            hook_fn=fn,\n            required_resource_keys=self.required_resource_keys,\n            decorated_fn=self.decorated_fn or fn,\n        )\n        update_wrapper(cast(Callable[..., Any], hook_def), fn)\n        return hook_def\n\n\n@overload\ndef event_list_hook(\n    hook_fn: Callable,\n) -> HookDefinition:\n    pass\n\n\n@overload\ndef event_list_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    decorated_fn: Optional[Callable[..., Any]] = ...,\n) -> _Hook:\n    pass\n\n\ndef event_list_hook(\n    hook_fn: Optional[Callable] = None,\n    *,\n    name: Optional[str] = None,\n    required_resource_keys: Optional[AbstractSet[str]] = None,\n    decorated_fn: Optional[Callable[..., Any]] = None,\n) -> Union[HookDefinition, _Hook]:\n    """Create a generic hook with the specified parameters from the decorated function.\n\n    This decorator is currently used internally by Dagster machinery to support success_hook and\n    failure_hook.\n\n    The user-defined hook function requires two parameters:\n    - A `context` object is passed as the first parameter. The context is an instance of\n        :py:class:`context <HookContext>`, and provides access to system\n        information, such as loggers (context.log), resources (context.resources), the op\n        (context.op) and its execution step (context.step) which triggers this hook.\n    - An `event_list` object is passed as the second paramter. It provides the full event list of the\n        associated execution step.\n\n    Args:\n        name (Optional[str]): The name of this hook.\n        required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n            hook.\n\n    Examples:\n        .. 
code-block:: python\n\n            @event_list_hook(required_resource_keys={'slack'})\n            def slack_on_materializations(context, event_list):\n                for event in event_list:\n                    if event.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n                        message = f'{context.op_name} has materialized an asset {event.asset_key}.'\n                        # send a slack message every time a materialization event occurs\n                        context.resources.slack.send_message(message)\n\n\n    """\n    # This case is for when decorator is used bare, without arguments.\n    # e.g. @event_list_hook versus @event_list_hook()\n    if hook_fn is not None:\n        check.invariant(required_resource_keys is None)\n        return _Hook()(hook_fn)\n\n    return _Hook(\n        name=name, required_resource_keys=required_resource_keys, decorated_fn=decorated_fn\n    )\n\n\nSuccessOrFailureHookFn = Callable[["HookContext"], Any]\n\n\n@overload\ndef success_hook(hook_fn: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef success_hook(\n    *,\n    name: Optional[str] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def success_hook(\n hook_fn: Optional[SuccessOrFailureHookFn] = None,\n *,\n name: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step success events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @success_hook(required_resource_keys={'slack'})\n def slack_message_on_success(context):\n message = 'op {} succeeded'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @success_hook\n def do_something_on_success(context):\n do_something()\n\n\n """\n\n def wrapper(fn: SuccessOrFailureHookFn) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _success_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_success:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _success_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @success_hook\n if hook_fn is not None:\n check.invariant(required_resource_keys is None)\n return wrapper(hook_fn)\n\n return wrapper
\n\n\n@overload\ndef failure_hook(name: SuccessOrFailureHookFn) -> HookDefinition: ...\n\n\n@overload\ndef failure_hook(\n name: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n) -> Callable[[SuccessOrFailureHookFn], HookDefinition]: ...\n\n\n
[docs]def failure_hook(\n name: Optional[Union[SuccessOrFailureHookFn, str]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n) -> Union[HookDefinition, Callable[[SuccessOrFailureHookFn], HookDefinition]]:\n """Create a hook on step failure events with the specified parameters from the decorated function.\n\n Args:\n name (Optional[str]): The name of this hook.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n\n Examples:\n .. code-block:: python\n\n @failure_hook(required_resource_keys={'slack'})\n def slack_message_on_failure(context):\n message = 'op {} failed'.format(context.op.name)\n context.resources.slack.send_message(message)\n\n @failure_hook\n def do_something_on_failure(context):\n do_something()\n\n\n """\n\n def wrapper(fn: Callable[["HookContext"], Any]) -> HookDefinition:\n check.callable_param(fn, "fn")\n\n expected_positionals = ["context"]\n _validate_hook_fn_params(fn, expected_positionals)\n\n if name is None or callable(name):\n _name = fn.__name__\n else:\n _name = name\n\n @event_list_hook(name=_name, required_resource_keys=required_resource_keys, decorated_fn=fn)\n def _failure_hook(\n context: "HookContext", event_list: Sequence["DagsterEvent"]\n ) -> HookExecutionResult:\n for event in event_list:\n if event.is_step_failure:\n fn(context)\n return HookExecutionResult(hook_name=_name, is_skipped=False)\n\n # hook is skipped when fn didn't run\n return HookExecutionResult(hook_name=_name, is_skipped=True)\n\n return _failure_hook\n\n # This case is for when decorator is used bare, without arguments, i.e. @failure_hook\n if callable(name):\n check.invariant(required_resource_keys is None)\n return wrapper(name)\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/hook_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.hook_decorator"}, "job_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.job_decorator

\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Mapping, Optional, Union, overload\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.decorator_utils import format_docstring_for_description\n\nfrom ..config import ConfigMapping\nfrom ..graph_definition import GraphDefinition\nfrom ..hook_definition import HookDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..metadata import RawMetadataValue\nfrom ..policy import RetryPolicy\nfrom ..resource_definition import ResourceDefinition\nfrom ..version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from ..executor_definition import ExecutorDefinition\n    from ..partition import PartitionedConfig, PartitionsDefinition\n    from ..run_config import RunConfig\n\n\nclass _Job:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n        ] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet[HookDefinition]] = None,\n        op_retry_policy: Optional[RetryPolicy] = None,\n        version_strategy: Optional[VersionStrategy] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        input_values: Optional[Mapping[str, object]] = None,\n    ):\n        from dagster._core.definitions.run_config import convert_config_input\n\n        self.name = name\n        self.description = description\n        self.tags = tags\n        self.metadata = 
metadata\n        self.resource_defs = resource_defs\n        self.config = convert_config_input(config)\n        self.logger_defs = logger_defs\n        self.executor_def = executor_def\n        self.hooks = hooks\n        self.op_retry_policy = op_retry_policy\n        self.version_strategy = version_strategy\n        self.partitions_def = partitions_def\n        self.input_values = input_values\n\n    def __call__(self, fn: Callable[..., Any]) -> JobDefinition:\n        check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        from dagster._core.definitions.composition import do_composition\n\n        (\n            input_mappings,\n            output_mappings,\n            dependencies,\n            node_defs,\n            config_mapping,\n            positional_inputs,\n            node_input_source_assets,\n        ) = do_composition(\n            decorator_name="@job",\n            graph_name=self.name,\n            fn=fn,\n            provided_input_defs=[],\n            provided_output_defs=[],\n            ignore_output_from_composition_fn=False,\n            config_mapping=None,\n        )\n\n        graph_def = GraphDefinition(\n            name=self.name,\n            dependencies=dependencies,\n            node_defs=node_defs,\n            description=self.description or format_docstring_for_description(fn),\n            input_mappings=input_mappings,\n            output_mappings=output_mappings,\n            config=config_mapping,\n            positional_inputs=positional_inputs,\n            tags=self.tags,\n            node_input_source_assets=node_input_source_assets,\n        )\n\n        job_def = graph_def.to_job(\n            description=self.description or format_docstring_for_description(fn),\n            resource_defs=self.resource_defs,\n            config=self.config,\n            tags=self.tags,\n            metadata=self.metadata,\n            logger_defs=self.logger_defs,\n            
executor_def=self.executor_def,\n            hooks=self.hooks,\n            op_retry_policy=self.op_retry_policy,\n            version_strategy=self.version_strategy,\n            partitions_def=self.partitions_def,\n            input_values=self.input_values,\n        )\n        update_wrapper(job_def, fn)\n        return job_def\n\n\n@overload\ndef job(compose_fn: Callable[..., Any]) -> JobDefinition: ...\n\n\n@overload\ndef job(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    resource_defs: Optional[Mapping[str, object]] = ...,\n    config: Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    metadata: Optional[Mapping[str, RawMetadataValue]] = ...,\n    logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    executor_def: Optional["ExecutorDefinition"] = ...,\n    hooks: Optional[AbstractSet[HookDefinition]] = ...,\n    op_retry_policy: Optional[RetryPolicy] = ...,\n    version_strategy: Optional[VersionStrategy] = ...,\n    partitions_def: Optional["PartitionsDefinition"] = ...,\n    input_values: Optional[Mapping[str, object]] = ...,\n) -> _Job: ...\n\n\n
[docs]@deprecated_param(\n param="version_strategy",\n breaking_version="2.0",\n additional_warn_text="Use asset versioning instead.",\n)\ndef job(\n compose_fn: Optional[Callable[..., Any]] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "RunConfig", "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n) -> Union[JobDefinition, _Job]:\n """Creates a job with the specified parameters from the decorated graph/op invocation function.\n\n Using this decorator allows you to build an executable job by writing a function that invokes\n ops (or graphs).\n\n Args:\n compose_fn (Callable[..., Any]:\n The decorated function. The body should contain op or graph invocations. Unlike op\n functions, does not accept a context argument.\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of the this graph.\n resource_defs (Optional[Mapping[str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`RunConfig` object is provided, then it will be used directly as the run config\n for the job whenever the job is executed, similar to providing a dictionary.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Dict[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types\n logger_defs (Optional[Dict[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multiprocess_executor` .\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition keys\n that can parameterize the job. If this argument is supplied, the config argument\n can't also be supplied.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(in1):\n return in1 + 1\n\n @job\n def job1():\n add_one(return_one())\n """\n if compose_fn is not None:\n check.invariant(description is None)\n return _Job()(compose_fn)\n\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return _Job(\n name=name,\n description=description,\n resource_defs=wrap_resources_for_execution(resource_defs),\n config=config,\n tags=tags,\n metadata=metadata,\n logger_defs=logger_defs,\n executor_def=executor_def,\n hooks=hooks,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n partitions_def=partitions_def,\n input_values=input_values,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/job_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.job_decorator"}, "op_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.op_decorator

\nfrom functools import lru_cache, update_wrapper\nfrom inspect import Parameter\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n    overload,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.decorator_utils import (\n    format_docstring_for_description,\n    get_function_params,\n    get_valid_name_permutations,\n    param_is_var_keyword,\n    positional_arg_name_list,\n)\nfrom dagster._core.definitions.inference import infer_input_props\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import DagsterTypeKind\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom ..input import In, InputDefinition\nfrom ..output import Out\nfrom ..policy import RetryPolicy\nfrom ..utils import DEFAULT_OUTPUT\n\nif TYPE_CHECKING:\n    from ..op_definition import OpDefinition\n\n\nclass _Op:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        required_resource_keys: Optional[AbstractSet[str]] = None,\n        config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        code_version: Optional[str] = None,\n        decorator_takes_context: Optional[bool] = True,\n        retry_policy: Optional[RetryPolicy] = None,\n        ins: Optional[Mapping[str, In]] = None,\n        out: Optional[Union[Out, Mapping[str, Out]]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.decorator_takes_context = check.bool_param(\n            decorator_takes_context, "decorator_takes_context"\n        )\n\n        self.description = 
check.opt_str_param(description, "description")\n\n        # these will be checked within OpDefinition\n        self.required_resource_keys = required_resource_keys\n        self.tags = tags\n        self.code_version = code_version\n        self.retry_policy = retry_policy\n\n        # config will be checked within OpDefinition\n        self.config_schema = config_schema\n\n        self.ins = check.opt_nullable_mapping_param(ins, "ins", key_type=str, value_type=In)\n        self.out = out\n\n    def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":\n        from dagster._config.pythonic_config import validate_resource_annotated_function\n\n        from ..op_definition import OpDefinition\n\n        validate_resource_annotated_function(fn)\n\n        if not self.name:\n            self.name = fn.__name__\n\n        compute_fn = (\n            DecoratedOpFunction(decorated_fn=fn)\n            if self.decorator_takes_context\n            else NoContextDecoratedOpFunction(decorated_fn=fn)\n        )\n\n        if compute_fn.has_config_arg():\n            check.param_invariant(\n                self.config_schema is None or self.config_schema == {},\n                "If the @op has a config arg, you cannot specify a config schema",\n            )\n\n            from dagster._config.pythonic_config import infer_schema_from_config_annotation\n\n            # Parse schema from the type annotation of the config arg\n            config_arg = compute_fn.get_config_arg()\n            config_arg_type = config_arg.annotation\n            config_arg_default = config_arg.default\n            self.config_schema = infer_schema_from_config_annotation(\n                config_arg_type, config_arg_default\n            )\n\n        outs: Optional[Mapping[str, Out]] = None\n        if self.out is not None and isinstance(self.out, Out):\n            outs = {DEFAULT_OUTPUT: self.out}\n        elif self.out is not None:\n            outs = check.mapping_param(self.out, "out", 
key_type=str, value_type=Out)\n\n        arg_resource_keys = {arg.name for arg in compute_fn.get_resource_args()}\n        decorator_resource_keys = set(self.required_resource_keys or [])\n        check.param_invariant(\n            len(decorator_resource_keys) == 0 or len(arg_resource_keys) == 0,\n            "Cannot specify resource requirements in both @op decorator and as arguments to the"\n            " decorated function",\n        )\n        resolved_resource_keys = decorator_resource_keys.union(arg_resource_keys)\n\n        op_def = OpDefinition.dagster_internal_init(\n            name=self.name,\n            ins=self.ins,\n            outs=outs,\n            compute_fn=compute_fn,\n            config_schema=self.config_schema,\n            description=self.description or format_docstring_for_description(fn),\n            required_resource_keys=resolved_resource_keys,\n            tags=self.tags,\n            code_version=self.code_version,\n            retry_policy=self.retry_policy,\n            version=None,  # code_version has replaced version\n        )\n        update_wrapper(op_def, compute_fn.decorated_fn)\n        return op_def\n\n\n@overload\ndef op(compute_fn: Callable[..., Any]) -> "OpDefinition": ...\n\n\n@overload\ndef op(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    ins: Optional[Mapping[str, In]] = ...,\n    out: Optional[Union[Out, Mapping[str, Out]]] = ...,\n    config_schema: Optional[UserConfigSchema] = ...,\n    required_resource_keys: Optional[AbstractSet[str]] = ...,\n    tags: Optional[Mapping[str, Any]] = ...,\n    version: Optional[str] = ...,\n    retry_policy: Optional[RetryPolicy] = ...,\n    code_version: Optional[str] = ...,\n) -> _Op: ...\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead"\n)\ndef op(\n compute_fn: Optional[Callable] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n ins: Optional[Mapping[str, In]] = None,\n out: Optional[Union[Out, Mapping[str, Out]]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n) -> Union["OpDefinition", _Op]:\n """Create an op with the specified parameters from the decorated function.\n\n Ins and outs will be inferred from the type signature of the decorated function\n if not explicitly provided.\n\n The decorated function will be used as the op's compute function. The signature of the\n decorated function is more flexible than that of the ``compute_fn`` in the core API; it may:\n\n 1. Return a value. This value will be wrapped in an :py:class:`Output` and yielded by the compute function.\n 2. Return an :py:class:`Output`. This output will be yielded by the compute function.\n 3. Yield :py:class:`Output` or other :ref:`event objects <events>`. Same as default compute behavior.\n\n Note that options 1) and 2) are incompatible with yielding other events -- if you would like\n to decorate a function that yields events, it must also wrap its eventual output in an\n :py:class:`Output` and yield it.\n\n @op supports ``async def`` functions as well, including async generators when yielding multiple\n events or outputs. Note that async ops will generally be run on their own unless using a custom\n :py:class:`Executor` implementation that supports running them together.\n\n Args:\n name (Optional[str]): Name of op. 
Must be unique within any :py:class:`GraphDefinition`\n using the op.\n description (Optional[str]): Human-readable description of this op. If not provided, and\n the decorated function has docstring, that docstring will be used as the description.\n ins (Optional[Dict[str, In]]):\n Information about the inputs to the op. Information provided here will be combined\n with what can be inferred from the function signature.\n out (Optional[Union[Out, Dict[str, Out]]]):\n Information about the op outputs. Information provided here will be combined with\n what can be inferred from the return type signature if the function does not use yield.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the op matches this schema and fail if it does not. If not\n set, Dagster will accept any config provided for the op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Values that are not strings\n will be json encoded and must meet the criteria that `json.loads(json.dumps(value)) == value`.\n code_version (Optional[str]): (Experimental) Version of the logic encapsulated by the op. If set,\n this is used as a default version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def hello_world():\n print('hello')\n\n @op\n def echo(msg: str) -> str:\n return msg\n\n @op(\n ins={'msg': In(str)},\n out=Out(str)\n )\n def echo_2(msg): # same as above\n return msg\n\n @op(\n out={'word': Out(), 'num': Out()}\n )\n def multi_out() -> Tuple[str, int]:\n return 'cool', 4\n """\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n\n if compute_fn is not None:\n check.invariant(description is None)\n check.invariant(config_schema is None)\n check.invariant(required_resource_keys is None)\n check.invariant(tags is None)\n check.invariant(version is None)\n\n return _Op()(compute_fn)\n\n return _Op(\n name=name,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n code_version=code_version,\n retry_policy=retry_policy,\n ins=ins,\n out=out,\n )
\n\n\nclass DecoratedOpFunction(NamedTuple):\n """Wrapper around the decorated op function to provide commonly used util methods."""\n\n decorated_fn: Callable[..., Any]\n\n @property\n def name(self):\n return self.decorated_fn.__name__\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return is_context_provided(get_function_params(self.decorated_fn))\n\n def get_context_arg(self) -> Parameter:\n if self.has_context_arg():\n return get_function_params(self.decorated_fn)[0]\n check.failed("Requested context arg on function that does not have one")\n\n @lru_cache(maxsize=1)\n def _get_function_params(self) -> Sequence[Parameter]:\n return get_function_params(self.decorated_fn)\n\n def has_config_arg(self) -> bool:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return True\n\n return False\n\n def get_config_arg(self) -> Parameter:\n for param in get_function_params(self.decorated_fn):\n if param.name == "config":\n return param\n\n check.failed("Requested config arg on function that does not have one")\n\n def get_resource_args(self) -> Sequence[Parameter]:\n return get_resource_args(self.decorated_fn)\n\n def positional_inputs(self) -> Sequence[str]:\n params = self._get_function_params()\n input_args = params[1:] if self.has_context_arg() else params\n resource_arg_names = [arg.name for arg in self.get_resource_args()]\n input_args_filtered = [\n input_arg\n for input_arg in input_args\n if input_arg.name != "config" and input_arg.name not in resource_arg_names\n ]\n return positional_arg_name_list(input_args_filtered)\n\n def has_var_kwargs(self) -> bool:\n params = self._get_function_params()\n # var keyword arg has to be the last argument\n return len(params) > 0 and param_is_var_keyword(params[-1])\n\n def get_output_annotation(self) -> Any:\n from ..inference import infer_output_props\n\n return infer_output_props(self.decorated_fn).annotation\n\n\nclass NoContextDecoratedOpFunction(DecoratedOpFunction):\n 
"""Wrapper around a decorated op function, when the decorator does not permit a context\n parameter.\n """\n\n @lru_cache(maxsize=1)\n def has_context_arg(self) -> bool:\n return False\n\n\ndef is_context_provided(params: Sequence[Parameter]) -> bool:\n if len(params) == 0:\n return False\n return params[0].name in get_valid_name_permutations("context")\n\n\ndef resolve_checked_op_fn_inputs(\n decorator_name: str,\n fn_name: str,\n compute_fn: DecoratedOpFunction,\n explicit_input_defs: Sequence[InputDefinition],\n exclude_nothing: bool,\n) -> Sequence[InputDefinition]:\n """Validate provided input definitions and infer the remaining from the type signature of the compute_fn.\n Returns the resolved set of InputDefinitions.\n\n Args:\n decorator_name (str): Name of the decorator that is wrapping the op function.\n fn_name (str): Name of the decorated function.\n compute_fn (DecoratedOpFunction): The decorated function, wrapped in the\n DecoratedOpFunction wrapper.\n explicit_input_defs (List[InputDefinition]): The input definitions that were explicitly\n provided in the decorator.\n exclude_nothing (bool): True if Nothing type inputs should be excluded from compute_fn\n arguments.\n """\n explicit_names = set()\n if exclude_nothing:\n explicit_names = set(\n inp.name\n for inp in explicit_input_defs\n if not inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n nothing_names = set(\n inp.name\n for inp in explicit_input_defs\n if inp.dagster_type.kind == DagsterTypeKind.NOTHING\n )\n else:\n explicit_names = set(inp.name for inp in explicit_input_defs)\n nothing_names = set()\n\n params = get_function_params(compute_fn.decorated_fn)\n\n input_args = params[1:] if compute_fn.has_context_arg() else params\n\n # filter out config arg\n resource_arg_names = {arg.name for arg in compute_fn.get_resource_args()}\n explicit_names = explicit_names - resource_arg_names\n\n if compute_fn.has_config_arg() or resource_arg_names:\n new_input_args = []\n for input_arg in 
input_args:\n if input_arg.name != "config" and input_arg.name not in resource_arg_names:\n new_input_args.append(input_arg)\n input_args = new_input_args\n\n # Validate input arguments\n used_inputs = set()\n inputs_to_infer = set()\n has_kwargs = False\n\n for param in cast(List[Parameter], input_args):\n if param.kind == Parameter.VAR_KEYWORD:\n has_kwargs = True\n elif param.kind == Parameter.VAR_POSITIONAL:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has positional vararg parameter "\n f"'{param}'. {decorator_name} decorated functions should only have keyword "\n "arguments that match input names and, if system information is required, a first "\n "positional parameter named 'context'."\n )\n\n else:\n if param.name not in explicit_names:\n if param.name in nothing_names:\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function has parameter"\n f" '{param.name}' that is one of the input_defs of type 'Nothing' which"\n " should not be included since no data will be passed for it. "\n )\n else:\n inputs_to_infer.add(param.name)\n\n else:\n used_inputs.add(param.name)\n\n undeclared_inputs = explicit_names - used_inputs\n if not has_kwargs and undeclared_inputs:\n undeclared_inputs_printed = ", '".join(undeclared_inputs)\n raise DagsterInvalidDefinitionError(\n f"{decorator_name} '{fn_name}' decorated function does not have argument(s)"\n f" '{undeclared_inputs_printed}'. {decorator_name}-decorated functions should have a"\n " keyword argument for each of their Ins, except for Ins that have the Nothing"\n " dagster_type. 
Alternatively, they can accept **kwargs."\n )\n\n inferred_props = {\n inferred.name: inferred\n for inferred in infer_input_props(compute_fn.decorated_fn, compute_fn.has_context_arg())\n }\n input_defs = []\n for input_def in explicit_input_defs:\n if input_def.name in inferred_props:\n # combine any information missing on the explicit def that can be inferred\n input_defs.append(input_def.combine_with_inferred(inferred_props[input_def.name]))\n else:\n # pass through those that don't have any inference info, such as Nothing type inputs\n input_defs.append(input_def)\n\n # build defs from the inferred props for those without explicit entries\n inferred_input_defs = [\n InputDefinition.create_from_inferred(inferred)\n for inferred in inferred_props.values()\n if inferred.name in inputs_to_infer\n ]\n\n if exclude_nothing:\n for in_def in inferred_input_defs:\n if in_def.dagster_type.is_nothing:\n raise DagsterInvalidDefinitionError(\n f"Input parameter {in_def.name} is annotated with"\n f" {in_def.dagster_type.display_name} which is a type that represents passing"\n " no data. This type must be used via In() and no parameter should be included"\n f" in the {decorator_name} decorated function."\n )\n\n input_defs.extend(inferred_input_defs)\n\n return input_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/op_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.op_decorator"}, "repository_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.repository_decorator

\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.metadata import (\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..asset_checks import AssetChecksDefinition\nfrom ..executor_definition import ExecutorDefinition\nfrom ..graph_definition import GraphDefinition\nfrom ..job_definition import JobDefinition\nfrom ..logger_definition import LoggerDefinition\nfrom ..partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom ..repository_definition import (\n    VALID_REPOSITORY_DATA_DICT_KEYS,\n    CachingRepositoryData,\n    PendingRepositoryDefinition,\n    PendingRepositoryListDefinition,\n    RepositoryData,\n    RepositoryDefinition,\n    RepositoryListDefinition,\n)\nfrom ..schedule_definition import ScheduleDefinition\nfrom ..sensor_definition import SensorDefinition\nfrom ..unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nT = TypeVar("T")\n\nRepositoryDictSpec: TypeAlias = Dict[str, Dict[str, RepositoryListDefinition]]\n\n\ndef _flatten(items: Iterable[Union[T, List[T]]]) -> Iterator[T]:\n    for x in items:\n        if isinstance(x, List):\n            # switch to `yield from _flatten(x)` to support multiple layers of nesting\n            yield from x\n        else:\n            yield x\n\n\nclass _Repository:\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        metadata: Optional[Dict[str, RawMetadataValue]] = None,\n        default_executor_def: 
Optional[ExecutorDefinition] = None,\n        default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n        resource_key_mapping: Optional[Mapping[int, str]] = None,\n    ):\n        self.name = check.opt_str_param(name, "name")\n        self.description = check.opt_str_param(description, "description")\n        self.metadata = normalize_metadata(\n            check.opt_mapping_param(metadata, "metadata", key_type=str)\n        )\n        self.default_executor_def = check.opt_inst_param(\n            default_executor_def, "default_executor_def", ExecutorDefinition\n        )\n        self.default_logger_defs = check.opt_mapping_param(\n            default_logger_defs, "default_logger_defs", key_type=str, value_type=LoggerDefinition\n        )\n        self.top_level_resources = check.opt_mapping_param(\n            top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n        )\n        self.resource_key_mapping = check.opt_mapping_param(\n            resource_key_mapping, "resource_key_mapping", key_type=int, value_type=str\n        )\n\n    @overload\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[RepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> RepositoryDefinition: ...\n\n    @overload\n    def __call__(\n        self, fn: Callable[[], Sequence[PendingRepositoryListDefinition]]\n    ) -> PendingRepositoryDefinition: ...\n\n    def __call__(\n        self,\n        fn: Union[\n            Callable[[], Sequence[PendingRepositoryListDefinition]],\n            Callable[[], RepositoryDictSpec],\n        ],\n    ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n        from dagster._core.definitions import AssetsDefinition, SourceAsset\n        from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n        
check.callable_param(fn, "fn")\n\n        if not self.name:\n            self.name = fn.__name__\n\n        repository_definitions = fn()\n\n        repository_data: Optional[Union[CachingRepositoryData, RepositoryData]]\n        if isinstance(repository_definitions, list):\n            bad_defns = []\n            repository_defns = []\n            defer_repository_data = False\n            for i, definition in enumerate(_flatten(repository_definitions)):\n                if isinstance(definition, CacheableAssetsDefinition):\n                    defer_repository_data = True\n                elif not isinstance(\n                    definition,\n                    (\n                        JobDefinition,\n                        ScheduleDefinition,\n                        UnresolvedPartitionedAssetScheduleDefinition,\n                        SensorDefinition,\n                        GraphDefinition,\n                        AssetsDefinition,\n                        SourceAsset,\n                        UnresolvedAssetJobDefinition,\n                        AssetChecksDefinition,\n                    ),\n                ):\n                    bad_defns.append((i, type(definition)))\n                else:\n                    repository_defns.append(definition)\n\n            if bad_defns:\n                bad_definitions_str = ", ".join(\n                    [f"value of type {type_} at index {i}" for i, type_ in bad_defns]\n                )\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: all elements of list "\n                    "must be of type JobDefinition, GraphDefinition, "\n                    "ScheduleDefinition, SensorDefinition, "\n                    "AssetsDefinition, SourceAsset, or AssetChecksDefinition."\n                    f"Got {bad_definitions_str}."\n                )\n\n            repository_data = (\n                None\n                if 
defer_repository_data\n                else CachingRepositoryData.from_list(\n                    repository_defns,\n                    default_executor_def=self.default_executor_def,\n                    default_logger_defs=self.default_logger_defs,\n                    top_level_resources=self.top_level_resources,\n                    resource_key_mapping=self.resource_key_mapping,\n                )\n            )\n\n        elif isinstance(repository_definitions, dict):\n            if not set(repository_definitions.keys()).issubset(VALID_REPOSITORY_DATA_DICT_KEYS):\n                raise DagsterInvalidDefinitionError(\n                    "Bad return value from repository construction function: dict must not contain "\n                    "keys other than {{'schedules', 'sensors', 'jobs'}}: found "\n                    "{bad_keys}".format(\n                        bad_keys=", ".join(\n                            [\n                                f"'{key}'"\n                                for key in repository_definitions.keys()\n                                if key not in VALID_REPOSITORY_DATA_DICT_KEYS\n                            ]\n                        )\n                    )\n                )\n            repository_data = CachingRepositoryData.from_dict(repository_definitions)\n        elif isinstance(repository_definitions, RepositoryData):\n            repository_data = repository_definitions\n        else:\n            raise DagsterInvalidDefinitionError(\n                "Bad return value of type {type_} from repository construction function: must "\n                "return list, dict, or RepositoryData. 
See the @repository decorator docstring for "\n                "details and examples".format(type_=type(repository_definitions)),\n            )\n\n        if isinstance(repository_definitions, list) and repository_data is None:\n            return PendingRepositoryDefinition(\n                self.name,\n                repository_definitions=list(_flatten(repository_definitions)),\n                description=self.description,\n                metadata=self.metadata,\n                default_executor_def=self.default_executor_def,\n                default_logger_defs=self.default_logger_defs,\n                _top_level_resources=self.top_level_resources,\n            )\n        else:\n            repository_def = RepositoryDefinition(\n                name=self.name,\n                description=self.description,\n                metadata=self.metadata,\n                repository_data=repository_data,\n            )\n\n            update_wrapper(repository_def, fn)\n            return repository_def\n\n\n@overload\ndef repository(\n    definitions_fn: Union[\n        Callable[[], Sequence[RepositoryListDefinition]], Callable[[], RepositoryDictSpec]\n    ],\n) -> RepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    definitions_fn: Callable[..., Sequence[PendingRepositoryListDefinition]]\n) -> PendingRepositoryDefinition: ...\n\n\n@overload\ndef repository(\n    *,\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    metadata: Optional[Dict[str, RawMetadataValue]] = ...,\n    default_executor_def: Optional[ExecutorDefinition] = ...,\n    default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = ...,\n    _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = ...,\n    _resource_key_mapping: Optional[Mapping[int, str]] = ...,\n) -> _Repository: ...\n\n\n
[docs]def repository(\n definitions_fn: Optional[\n Union[\n Callable[[], Sequence[PendingRepositoryListDefinition]],\n Callable[[], RepositoryDictSpec],\n ]\n ] = None,\n *,\n name: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Dict[str, RawMetadataValue]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition, _Repository]:\n """Create a repository from the decorated function.\n\n The decorated function should take no arguments and its return value should one of:\n\n 1. ``List[Union[JobDefinition, ScheduleDefinition, SensorDefinition]]``.\n Use this form when you have no need to lazy load jobs or other definitions. This is the\n typical use case.\n\n 2. A dict of the form:\n\n .. code-block:: python\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n 'sensors': Dict[str, Callable[[], SensorDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n\n 3. A :py:class:`RepositoryData`. Return this object if you need fine-grained\n control over the construction and indexing of definitions within the repository, e.g., to\n create definitions dynamically from .yaml files in a directory.\n\n Args:\n name (Optional[str]): The name of the repository. 
Defaults to the name of the decorated\n function.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[Dict[str, RawMetadataValue]]): Arbitrary metadata for the repository.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n\n Example:\n .. code-block:: python\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n ######################################################################\n\n @op(config_schema={n: Field(Int)})\n def return_n(context):\n return context.op_config['n']\n\n @job\n def simple_job():\n return_n()\n\n @job\n def some_job():\n ...\n\n @sensor(job=some_job)\n def some_sensor():\n if foo():\n yield RunRequest(\n run_key= ...,\n run_config={\n 'ops': {'return_n': {'config': {'n': bar()}}}\n }\n )\n\n @job\n def my_job():\n ...\n\n my_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n @repository\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A simple repository using the first form of the decorated function\n # and custom metadata that will be displayed in the UI\n ######################################################################\n\n ...\n\n @repository(\n name='my_repo',\n metadata={\n 'team': 'Team A',\n 'repository_version': '1.2.3',\n 'environment': 'production',\n })\n def simple_repository():\n return [simple_job, some_sensor, my_schedule]\n\n ######################################################################\n # A lazy-loaded repository\n ######################################################################\n\n def make_expensive_job():\n @job\n def expensive_job():\n for i in range(10000):\n return_n.alias(f'return_n_{i}')()\n\n return expensive_job\n\n 
def make_expensive_schedule():\n @job\n def other_expensive_job():\n for i in range(11000):\n return_n.alias(f'my_return_n_{i}')()\n\n return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n @repository\n def lazy_loaded_repository():\n return {\n 'jobs': {'expensive_job': make_expensive_job},\n 'schedules': {'expensive_schedule': make_expensive_schedule}\n }\n\n\n ######################################################################\n # A complex repository that lazily constructs jobs from a directory\n # of files in a bespoke YAML format\n ######################################################################\n\n class ComplexRepositoryData(RepositoryData):\n def __init__(self, yaml_directory):\n self._yaml_directory = yaml_directory\n\n def get_all_jobs(self):\n return [\n self._construct_job_def_from_yaml_file(\n self._yaml_file_for_job_name(file_name)\n )\n for file_name in os.listdir(self._yaml_directory)\n ]\n\n ...\n\n @repository\n def complex_repository():\n return ComplexRepositoryData('some_directory')\n """\n if definitions_fn is not None:\n check.invariant(description is None)\n check.invariant(len(get_function_params(definitions_fn)) == 0)\n\n return _Repository()(definitions_fn)\n\n return _Repository(\n name=name,\n description=description,\n metadata=metadata,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=_top_level_resources,\n resource_key_mapping=_resource_key_mapping,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/repository_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.repository_decorator"}, "schedule_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.schedule_decorator

\nimport copy\nfrom functools import update_wrapper\nfrom typing import (\n    Callable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.sensor_definition import get_context_param_name\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._utils import ensure_gen\n\nfrom ..run_request import RunRequest, SkipReason\nfrom ..schedule_definition import (\n    DecoratedScheduleFunction,\n    DefaultScheduleStatus,\n    RawScheduleEvaluationFunction,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n    has_at_least_one_parameter,\n    validate_and_get_schedule_resource_dict,\n)\nfrom ..target import ExecutableDefinition\nfrom ..utils import validate_tags\n\n\n
[docs]def schedule(\n cron_schedule: Union[str, Sequence[str]],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]] = None,\n should_execute: Optional[Callable[[ScheduleEvaluationContext], bool]] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawScheduleEvaluationFunction], ScheduleDefinition]:\n """Creates a schedule following the provided cron schedule and requests runs for the provided job.\n\n The decorated function takes in a :py:class:`~dagster.ScheduleEvaluationContext` as its only\n argument, and does one of the following:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Return a run config dictionary.\n 6. Yield a `SkipReason` or yield one ore more `RunRequest` objects.\n\n Returns a :py:class:`~dagster.ScheduleDefinition`.\n\n Args:\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. 
If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n name (Optional[str]): The name of the schedule to create.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]): A function\n that generates tags to attach to the schedules runs. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags`` and ``tags_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs at\n schedule execution time to determine whether a schedule should execute or skip. Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n that should execute when this schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def inner(fn: RawScheduleEvaluationFunction) -> ScheduleDefinition:\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n check.callable_param(fn, "fn")\n validate_resource_annotated_function(fn)\n\n schedule_name = name or fn.__name__\n\n validated_tags = None\n\n # perform upfront validation of schedule tags\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n validated_tags = validate_tags(tags, allow_reserved_tags=False)\n\n context_param_name = get_context_param_name(fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n if should_execute:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n "Error occurred during the execution of should_execute for schedule"\n f" {schedule_name}"\n ),\n ):\n if not should_execute(context):\n yield SkipReason(\n f"should_execute function for {schedule_name} returned false."\n )\n return\n resources = validate_and_get_schedule_resource_dict(\n context.resources, schedule_name, resource_arg_names\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the evaluation of schedule {schedule_name}",\n ):\n context_param = {context_param_name: context} if context_param_name else {}\n result = fn(**context_param, **resources)\n\n if isinstance(result, dict):\n # this is the run-config based decorated function, wrap the evaluated run config\n # and tags in a RunRequest\n evaluated_run_config = copy.deepcopy(result)\n evaluated_tags = (\n validated_tags\n or (tags_fn and validate_tags(tags_fn(context), 
allow_reserved_tags=False))\n or None\n )\n yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n elif isinstance(result, list):\n yield from cast(List[RunRequest], result)\n else:\n # this is a run-request based decorated function\n yield from cast(RunRequestIterator, ensure_gen(result))\n\n has_context_arg = has_at_least_one_parameter(fn)\n evaluation_fn = DecoratedScheduleFunction(\n decorated_fn=fn,\n wrapped_fn=_wrapped_fn,\n has_context_arg=has_context_arg,\n )\n\n schedule_def = ScheduleDefinition.dagster_internal_init(\n name=schedule_name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n description=description,\n execution_fn=evaluation_fn,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n run_config=None, # cannot supply run_config or run_config_fn to decorator\n run_config_fn=None,\n tags=None, # cannot supply tags or tags_fn to decorator\n tags_fn=None,\n should_execute=None, # already encompassed in evaluation_fn\n )\n\n update_wrapper(schedule_def, wrapped=fn)\n\n return schedule_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/schedule_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.schedule_decorator"}, "sensor_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.decorators.sensor_decorator

\nimport collections.abc\nimport inspect\nfrom functools import update_wrapper\nfrom typing import Any, Callable, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\n\nfrom ...errors import DagsterInvariantViolationError\nfrom ..asset_sensor_definition import AssetSensorDefinition\nfrom ..events import AssetKey\nfrom ..multi_asset_sensor_definition import (\n    AssetMaterializationFunction,\n    MultiAssetMaterializationFunction,\n    MultiAssetSensorDefinition,\n)\nfrom ..run_request import SensorResult\nfrom ..sensor_definition import (\n    DefaultSensorStatus,\n    RawSensorEvaluationFunction,\n    RunRequest,\n    SensorDefinition,\n    SkipReason,\n)\nfrom ..target import ExecutableDefinition\n\n\n
[docs]def sensor(\n job_name: Optional[str] = None,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[RawSensorEvaluationFunction], SensorDefinition]:\n """Creates a sensor where the decorated function is used as the sensor's evaluation function.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n Args:\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]):\n The job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: RawSensorEvaluationFunction) -> SensorDefinition:\n check.callable_param(fn, "fn")\n\n sensor_def = SensorDefinition.dagster_internal_init(\n name=name,\n job_name=job_name,\n evaluation_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n update_wrapper(sensor_def, wrapped=fn)\n\n return sensor_def\n\n return inner
\n\n\n
[docs]def asset_sensor(\n asset_key: AssetKey,\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[AssetMaterializationFunction,], AssetSensorDefinition,]:\n """Creates an asset sensor where the decorated function is used as the asset sensor's evaluation\n function.\n\n If the asset has been materialized multiple times between since the last sensor tick, the\n evaluation function will only be invoked once, with the latest materialization.\n\n The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.SensorEvaluationContext` and an EventLogEntry corresponding to an\n AssetMaterialization event.\n\n Args:\n asset_key (AssetKey): The asset_key this sensor monitors.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n\n Example:\n .. code-block:: python\n\n from dagster import AssetKey, EventLogEntry, SensorEvaluationContext, asset_sensor\n\n\n @asset_sensor(asset_key=AssetKey("my_table"), job=my_job)\n def my_asset_sensor(context: SensorEvaluationContext, asset_event: EventLogEntry):\n return RunRequest(\n run_key=context.cursor,\n run_config={\n "ops": {\n "read_materialization": {\n "config": {\n "asset_key": asset_event.dagster_event.asset_key.path,\n }\n }\n }\n },\n )\n """\n check.opt_str_param(name, "name")\n\n def inner(fn: AssetMaterializationFunction) -> AssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n def _wrapped_fn(*args, **kwargs) -> Any:\n result = fn(*args, **kwargs)\n\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n yield item\n elif isinstance(result, (RunRequest, SkipReason)):\n yield result\n\n elif isinstance(result, SensorResult):\n if result.cursor:\n raise DagsterInvariantViolationError(\n f"Error in asset sensor {sensor_name}: Sensor returned a SensorResult"\n " with a cursor value. 
The cursor is managed by the asset sensor and"\n " should not be modified by a user."\n )\n yield result\n\n elif result is not None:\n raise DagsterInvariantViolationError(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{result} of type {type(result)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n # Preserve any resource arguments from the underlying function, for when we inspect the\n # wrapped function later on\n _wrapped_fn = update_wrapper(_wrapped_fn, wrapped=fn)\n\n return AssetSensorDefinition(\n name=sensor_name,\n asset_key=asset_key,\n job_name=job_name,\n asset_materialization_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n return inner
\n\n\n
[docs]@experimental\ndef multi_asset_sensor(\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n *,\n job_name: Optional[str] = None,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n) -> Callable[[MultiAssetMaterializationFunction,], MultiAssetSensorDefinition,]:\n """Creates an asset sensor that can monitor multiple assets.\n\n The decorated function is used as the asset sensor's evaluation\n function. The decorated function may:\n\n 1. Return a `RunRequest` object.\n 2. Return a list of `RunRequest` objects.\n 3. Return a `SkipReason` object, providing a descriptive message of why no runs were requested.\n 4. Return nothing (skipping without providing a reason)\n 5. Yield a `SkipReason` or yield one or more `RunRequest` objects.\n\n Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n Args:\n monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets this\n sensor monitors. If an AssetSelection object is provided, it will only apply to assets\n within the Definitions that this sensor is part of.\n name (Optional[str]): The name of the sensor. 
Defaults to the name of the decorated\n function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The\n job to be executed when the sensor fires.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. This can be provided instead of specifying a job.\n """\n check.opt_str_param(name, "name")\n\n if not isinstance(monitored_assets, AssetSelection) and not (\n isinstance(monitored_assets, collections.abc.Sequence)\n and all(isinstance(el, AssetKey) for el in monitored_assets)\n ):\n check.failed(\n "The value passed to monitored_assets param must be either an AssetSelection"\n f" or a Sequence of AssetKeys, but was a {type(monitored_assets)}"\n )\n\n def inner(fn: MultiAssetMaterializationFunction) -> MultiAssetSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n sensor_def = MultiAssetSensorDefinition(\n name=sensor_name,\n monitored_assets=monitored_assets,\n job_name=job_name,\n asset_materialization_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n request_assets=request_assets,\n required_resource_keys=required_resource_keys,\n )\n update_wrapper(sensor_def, wrapped=fn)\n return sensor_def\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/decorators/sensor_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.decorators.sensor_decorator"}}, "definitions_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.definitions_class

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._config.pythonic_config import (\n    attach_resource_id_to_key_mapping,\n)\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.asset_graph import InternalAssetGraph\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._core.execution.with_resources import with_resources\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils.cached_method import cached_method\n\nfrom .assets import AssetsDefinition, SourceAsset\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .decorators import repository\nfrom .job_definition import JobDefinition, default_job_io_manager\nfrom .partitioned_schedule import UnresolvedPartitionedAssetScheduleDefinition\nfrom .repository_definition import (\n    SINGLETON_REPOSITORY_NAME,\n    PendingRepositoryDefinition,\n    RepositoryDefinition,\n)\nfrom .schedule_definition import ScheduleDefinition\nfrom .sensor_definition import SensorDefinition\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n
[docs]@public\n@experimental\ndef create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """Create a named repository using the same arguments as :py:class:`Definitions`. In older\n versions of Dagster, repositories were the mechanism for organizing assets, schedules, sensors,\n and jobs. There could be many repositories per code location. This was a complicated ontology but\n gave users a way to organize code locations that contained large numbers of heterogenous definitions.\n\n As a stopgap for those who both want to 1) use the new :py:class:`Definitions` API and 2) but still\n want multiple logical groups of assets in the same code location, we have introduced this function.\n\n Example usage:\n\n .. code-block:: python\n\n named_repo = create_repository_using_definitions_args(\n name="a_repo",\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n }\n )\n\n """\n return _create_repository_using_definitions_args(\n name=name,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )
\n\n\nclass _AttachedObjects(NamedTuple):\n jobs: Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]\n schedules: Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n sensors: Iterable[SensorDefinition]\n\n\ndef _io_manager_needs_replacement(job: JobDefinition, resource_defs: Mapping[str, Any]) -> bool:\n """Explicitly replace the default IO manager in jobs that don't specify one, if a top-level\n I/O manager is provided to Definitions.\n """\n return (\n job.resource_defs.get("io_manager") == default_job_io_manager\n and "io_manager" in resource_defs\n )\n\n\ndef _jobs_which_will_have_io_manager_replaced(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n resource_defs: Mapping[str, Any],\n) -> List[Union[JobDefinition, UnresolvedAssetJobDefinition]]:\n """Returns whether any jobs will have their I/O manager replaced by an `io_manager` override from\n the top-level `resource_defs` provided to `Definitions` in 1.3. We will warn users if this is\n the case.\n """\n jobs = jobs or []\n return [\n job\n for job in jobs\n if isinstance(job, JobDefinition) and _io_manager_needs_replacement(job, resource_defs)\n ]\n\n\ndef _attach_resources_to_jobs_and_instigator_jobs(\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]],\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ],\n sensors: Optional[Iterable[SensorDefinition]],\n resource_defs: Mapping[str, Any],\n) -> _AttachedObjects:\n """Given a list of jobs, schedules, and sensors along with top-level resource definitions,\n attach the resource definitions to the jobs, schedules, and sensors which require them.\n """\n jobs = jobs or []\n schedules = schedules or []\n sensors = sensors or []\n\n # Add jobs in schedules and sensors as well\n jobs = [\n *jobs,\n *[\n schedule.job\n for schedule in schedules\n if isinstance(schedule, ScheduleDefinition)\n and 
schedule.has_loadable_target()\n and isinstance(schedule.job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n *[\n job\n for sensor in sensors\n if sensor.has_loadable_targets()\n for job in sensor.jobs\n if isinstance(job, (JobDefinition, UnresolvedAssetJobDefinition))\n ],\n ]\n # Dedupe\n jobs = list({id(job): job for job in jobs}.values())\n\n # Find unsatisfied jobs\n unsatisfied_jobs = [\n job\n for job in jobs\n if isinstance(job, JobDefinition)\n and (\n job.is_missing_required_resources() or _io_manager_needs_replacement(job, resource_defs)\n )\n ]\n\n # Create a mapping of job id to a version of the job with the resource defs bound\n unsatisfied_job_to_resource_bound_job = {\n id(job): job.with_top_level_resources(\n {\n **resource_defs,\n **job.resource_defs,\n # special case for IO manager - the job-level IO manager does not take precedence\n # if it is the default and a top-level IO manager is provided\n **(\n {"io_manager": resource_defs["io_manager"]}\n if _io_manager_needs_replacement(job, resource_defs)\n else {}\n ),\n }\n )\n for job in jobs\n if job in unsatisfied_jobs\n }\n\n # Update all jobs to use the resource bound version\n jobs_with_resources = [\n unsatisfied_job_to_resource_bound_job[id(job)] if job in unsatisfied_jobs else job\n for job in jobs\n ]\n\n # Update all schedules and sensors to use the resource bound version\n updated_schedules = [\n (\n schedule.with_updated_job(unsatisfied_job_to_resource_bound_job[id(schedule.job)])\n if (\n isinstance(schedule, ScheduleDefinition)\n and schedule.has_loadable_target()\n and schedule.job in unsatisfied_jobs\n )\n else schedule\n )\n for schedule in schedules\n ]\n updated_sensors = [\n (\n sensor.with_updated_jobs(\n [\n (\n unsatisfied_job_to_resource_bound_job[id(job)]\n if job in unsatisfied_jobs\n else job\n )\n for job in sensor.jobs\n ]\n )\n if sensor.has_loadable_targets() and any(job in unsatisfied_jobs for job in sensor.jobs)\n else sensor\n )\n for sensor in sensors\n 
]\n\n return _AttachedObjects(jobs_with_resources, updated_schedules, updated_sensors)\n\n\ndef _create_repository_using_definitions_args(\n name: str,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n):\n check.opt_iterable_param(\n assets, "assets", (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)\n )\n check.opt_iterable_param(\n schedules, "schedules", (ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition)\n )\n check.opt_iterable_param(sensors, "sensors", SensorDefinition)\n check.opt_iterable_param(jobs, "jobs", (JobDefinition, UnresolvedAssetJobDefinition))\n\n check.opt_inst_param(executor, "executor", (ExecutorDefinition, Executor))\n executor_def = (\n executor\n if isinstance(executor, ExecutorDefinition) or executor is None\n else ExecutorDefinition.hardcoded_executor(executor)\n )\n\n # Generate a mapping from each top-level resource instance ID to its resource key\n resource_key_mapping = {id(v): k for k, v in resources.items()} if resources else {}\n\n # Provide this mapping to each resource instance so that it can be used to resolve\n # nested resources\n resources_with_key_mapping = (\n {\n k: attach_resource_id_to_key_mapping(v, resource_key_mapping)\n for k, v in resources.items()\n }\n if resources\n else {}\n )\n\n resource_defs = wrap_resources_for_execution(resources_with_key_mapping)\n\n check.opt_mapping_param(loggers, "loggers", key_type=str, 
value_type=LoggerDefinition)\n\n # Binds top-level resources to jobs and any jobs attached to schedules or sensors\n (\n jobs_with_resources,\n schedules_with_resources,\n sensors_with_resources,\n ) = _attach_resources_to_jobs_and_instigator_jobs(jobs, schedules, sensors, resource_defs)\n\n @repository(\n name=name,\n default_executor_def=executor_def,\n default_logger_defs=loggers,\n _top_level_resources=resource_defs,\n _resource_key_mapping=resource_key_mapping,\n )\n def created_repo():\n return [\n *with_resources(assets or [], resource_defs),\n *with_resources(asset_checks or [], resource_defs),\n *(schedules_with_resources),\n *(sensors_with_resources),\n *(jobs_with_resources),\n ]\n\n return created_repo\n\n\n@deprecated(\n breaking_version="2.0",\n additional_warn_text=(\n "Instantiations can be removed. Since it's behavior is now the default, this class is now a"\n " no-op."\n ),\n)\nclass BindResourcesToJobs(list):\n """Used to instruct Dagster to bind top-level resources to jobs and any jobs attached to schedules\n and sensors. Now deprecated since this behavior is the default.\n """\n\n\n
[docs]class Definitions:\n """A set of definitions explicitly available and loadable by Dagster tools.\n\n Parameters:\n assets (Optional[Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]]):\n A list of assets. Assets can be created by annotating\n a function with :py:func:`@asset <asset>` or\n :py:func:`@observable_source_asset <observable_source_asset>`.\n Or they can by directly instantiating :py:class:`AssetsDefinition`,\n :py:class:`SourceAsset`, or :py:class:`CacheableAssetsDefinition`.\n\n asset_checks (Optional[Iterable[AssetChecksDefinition]]):\n A list of asset checks.\n\n schedules (Optional[Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]]):\n List of schedules.\n\n sensors (Optional[Iterable[SensorDefinition]]):\n List of sensors, typically created with :py:func:`@sensor <sensor>`.\n\n jobs (Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]]):\n List of jobs. Typically created with :py:func:`define_asset_job <define_asset_job>`\n or with :py:func:`@job <job>` for jobs defined in terms of ops directly.\n Jobs created with :py:func:`@job <job>` must already have resources bound\n at job creation time. They do not respect the `resources` argument here.\n\n resources (Optional[Mapping[str, Any]]): Dictionary of resources to bind to assets.\n The resources dictionary takes raw Python objects,\n not just instances of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n These resources will be automatically bound\n to any assets passed to this Definitions instance using\n :py:func:`with_resources <with_resources>`. 
Assets passed to Definitions with\n resources already bound using :py:func:`with_resources <with_resources>` will\n override this dictionary.\n\n executor (Optional[Union[ExecutorDefinition, Executor]]):\n Default executor for jobs. Individual jobs can override this and define their own executors\n by setting the executor on :py:func:`@job <job>` or :py:func:`define_asset_job <define_asset_job>`\n explicitly. This executor will also be used for materializing assets directly\n outside of the context of jobs. If an :py:class:`Executor` is passed, it is coerced into\n an :py:class:`ExecutorDefinition`.\n\n loggers (Optional[Mapping[str, LoggerDefinition]):\n Default loggers for jobs. Individual jobs\n can define their own loggers by setting them explictly.\n\n Example usage:\n\n .. code-block:: python\n\n defs = Definitions(\n assets=[asset_one, asset_two],\n schedules=[a_schedule],\n sensors=[a_sensor],\n jobs=[a_job],\n resources={\n "a_resource": some_resource,\n },\n asset_checks=[asset_one_check_one]\n )\n\n Dagster separates user-defined code from system tools such the web server and\n the daemon. Rather than loading code directly into process, a tool such as the\n webserver interacts with user-defined code over a serialization boundary.\n\n These tools must be able to locate and load this code when they start. 
Via CLI\n arguments or config, they specify a Python module to inspect.\n\n A Python module is loadable by Dagster tools if there is a top-level variable\n that is an instance of :py:class:`Definitions`.\n\n Before the introduction of :py:class:`Definitions`,\n :py:func:`@repository <repository>` was the API for organizing defintions.\n :py:class:`Definitions` provides a few conveniences for dealing with resources\n that do not apply to old-style :py:func:`@repository <repository>` declarations:\n\n * It takes a dictionary of top-level resources which are automatically bound\n (via :py:func:`with_resources <with_resources>`) to any asset passed to it.\n If you need to apply different resources to different assets, use legacy\n :py:func:`@repository <repository>` and use\n :py:func:`with_resources <with_resources>` as before.\n * The resources dictionary takes raw Python objects, not just instances\n of :py:class:`ResourceDefinition`. If that raw object inherits from\n :py:class:`IOManager`, it gets coerced to an :py:class:`IOManagerDefinition`.\n Any other object is coerced to a :py:class:`ResourceDefinition`.\n """\n\n def __init__(\n self,\n assets: Optional[\n Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]\n ] = None,\n schedules: Optional[\n Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]\n ] = None,\n sensors: Optional[Iterable[SensorDefinition]] = None,\n jobs: Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n executor: Optional[Union[ExecutorDefinition, Executor]] = None,\n loggers: Optional[Mapping[str, LoggerDefinition]] = None,\n asset_checks: Optional[Iterable[AssetChecksDefinition]] = None,\n ):\n self._created_pending_or_normal_repo = _create_repository_using_definitions_args(\n name=SINGLETON_REPOSITORY_NAME,\n assets=assets,\n schedules=schedules,\n sensors=sensors,\n jobs=jobs,\n resources=resources,\n 
executor=executor,\n loggers=loggers,\n asset_checks=asset_checks,\n )\n\n
[docs] @public\n def get_job_def(self, name: str) -> JobDefinition:\n """Get a job definition by name. If you passed in a an :py:class:`UnresolvedAssetJobDefinition`\n (return value of :py:func:`define_asset_job`) it will be resolved to a :py:class:`JobDefinition` when returned\n from this function.\n """\n check.str_param(name, "name")\n return self.get_repository_def().get_job(name)
\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_sensor_def(name)
\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name."""\n check.str_param(name, "name")\n return self.get_repository_def().get_schedule_def(name)
\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.Definitions.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n\n Returns:\n The contents of an asset as a Python object.\n """\n return self.get_repository_def().load_asset_value(\n asset_key=asset_key,\n python_type=python_type,\n instance=instance,\n partition_key=partition_key,\n metadata=metadata,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with defs.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n """\n return self.get_repository_def().get_asset_value_loader(\n instance=instance,\n )
\n\n def get_all_job_defs(self) -> Sequence[JobDefinition]:\n """Get all the Job definitions in the code location."""\n return self.get_repository_def().get_all_jobs()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n return self.get_repository_def().has_implicit_global_asset_job_def()\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method when there is a single defined global asset job.\n This occurs when all assets in the code location use a single partitioning scheme.\n If there are multiple partitioning schemes you must use get_implicit_job_def_for_assets\n instead to access to the correct implicit asset one.\n """\n return self.get_repository_def().get_implicit_global_asset_job_def()\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n return self.get_repository_def().get_implicit_job_def_for_assets(asset_keys)\n\n def get_assets_def(self, key: CoercibleToAssetKey) -> AssetsDefinition:\n asset_key = AssetKey.from_coercible(key)\n for assets_def in self.get_asset_graph().assets:\n if asset_key in assets_def.keys:\n return assets_def\n\n raise DagsterInvariantViolationError(f"Could not find asset {asset_key}")\n\n @cached_method\n def get_repository_def(self) -> RepositoryDefinition:\n """Definitions is implemented by wrapping RepositoryDefinition. Get that underlying object\n in order to access an functionality which is not exposed on Definitions. 
This method\n also resolves a PendingRepositoryDefinition to a RepositoryDefinition.\n """\n return (\n self._created_pending_or_normal_repo.compute_repository_definition()\n if isinstance(self._created_pending_or_normal_repo, PendingRepositoryDefinition)\n else self._created_pending_or_normal_repo\n )\n\n def get_inner_repository_for_loading_process(\n self,\n ) -> Union[RepositoryDefinition, PendingRepositoryDefinition]:\n """This method is used internally to access the inner repository during the loading process\n at CLI entry points. We explicitly do not want to resolve the pending repo because the entire\n point is to defer that resolution until later.\n """\n return self._created_pending_or_normal_repo\n\n def get_asset_graph(self) -> InternalAssetGraph:\n """Get the AssetGraph for this set of definitions."""\n return self.get_repository_def().asset_graph
\n
", "current_page_name": "_modules/dagster/_core/definitions/definitions_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.definitions_class"}, "dependency": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.dependency

\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    DefaultDict,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\nfrom dagster._utils import hash_collection\n\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .output import OutputDefinition\nfrom .utils import DEFAULT_OUTPUT, struct_to_string, validate_tags\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.op_definition import OpDefinition\n\n    from .asset_layer import AssetLayer\n    from .composition import MappedInputPlaceholder\n    from .graph_definition import GraphDefinition\n    from .node_definition import NodeDefinition\n    from .resource_requirement import ResourceRequirement\n\nT_DependencyKey = TypeVar("T_DependencyKey", str, "NodeInvocation")\nDependencyMapping: TypeAlias = Mapping[T_DependencyKey, Mapping[str, "IDependencyDefinition"]]\n\n\n
[docs]class NodeInvocation(\n NamedTuple(\n "Node",\n [\n ("name", PublicAttr[str]),\n ("alias", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, Any]]),\n ("hook_defs", PublicAttr[AbstractSet[HookDefinition]]),\n ("retry_policy", PublicAttr[Optional[RetryPolicy]]),\n ],\n )\n):\n """Identifies an instance of a node in a graph dependency structure.\n\n Args:\n name (str): Name of the node of which this is an instance.\n alias (Optional[str]): Name specific to this instance of the node. Necessary when there are\n multiple instances of the same node.\n tags (Optional[Dict[str, Any]]): Optional tags values to extend or override those\n set on the node definition.\n hook_defs (Optional[AbstractSet[HookDefinition]]): A set of hook definitions applied to the\n node instance.\n\n Examples:\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n from dagster import job\n\n @job\n def my_job():\n other_name = some_op.alias('other_name')\n some_graph(other_name(some_op))\n\n """\n\n def __new__(\n cls,\n name: str,\n alias: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n alias=check.opt_str_param(alias, "alias"),\n tags=check.opt_mapping_param(tags, "tags", value_type=str, key_type=str),\n hook_defs=check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition),\n retry_policy=check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy),\n )\n\n # Needs to be hashable because this class is used as a key in dependencies dicts\n def __hash__(self) -> int:\n if not hasattr(self, "_hash"):\n self._hash = hash_collection(self)\n return self._hash
\n\n\nclass Node(ABC):\n """Node invocation within a graph. Identified by its name inside the graph."""\n\n name: str\n definition: "NodeDefinition"\n graph_definition: "GraphDefinition"\n _additional_tags: Mapping[str, str]\n _hook_defs: AbstractSet[HookDefinition]\n _retry_policy: Optional[RetryPolicy]\n _inputs: Mapping[str, "NodeInput"]\n _outputs: Mapping[str, "NodeOutput"]\n\n def __init__(\n self,\n name: str,\n definition: "NodeDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n from .node_definition import NodeDefinition\n\n self.name = check.str_param(name, "name")\n self.definition = check.inst_param(definition, "definition", NodeDefinition)\n self.graph_definition = check.inst_param(\n graph_definition,\n "graph_definition",\n GraphDefinition,\n )\n self._additional_tags = validate_tags(tags)\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n self._inputs = {\n name: NodeInput(self, input_def)\n for name, input_def in self.definition.input_dict.items()\n }\n self._outputs = {\n name: NodeOutput(self, output_def)\n for name, output_def in self.definition.output_dict.items()\n }\n\n def inputs(self) -> Iterable["NodeInput"]:\n return self._inputs.values()\n\n def outputs(self) -> Iterable["NodeOutput"]:\n return self._outputs.values()\n\n def get_input(self, name: str) -> "NodeInput":\n check.str_param(name, "name")\n return self._inputs[name]\n\n def get_output(self, name: str) -> "NodeOutput":\n check.str_param(name, "name")\n return self._outputs[name]\n\n def has_input(self, name: str) -> bool:\n return self.definition.has_input(name)\n\n def input_def_named(self, name: str) -> InputDefinition:\n return 
self.definition.input_def_named(name)\n\n def has_output(self, name: str) -> bool:\n return self.definition.has_output(name)\n\n def output_def_named(self, name: str) -> OutputDefinition:\n return self.definition.output_def_named(name)\n\n @property\n def input_dict(self) -> Mapping[str, InputDefinition]:\n return self.definition.input_dict\n\n @property\n def output_dict(self) -> Mapping[str, OutputDefinition]:\n return self.definition.output_dict\n\n @property\n def tags(self) -> Mapping[str, str]:\n return {**self.definition.tags, **self._additional_tags}\n\n def container_maps_input(self, input_name: str) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(InputPointer(self.name, input_name))\n is not None\n )\n\n def container_mapped_input(self, input_name: str) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n InputPointer(self.name, input_name)\n )\n if mapping is None:\n check.failed(\n f"container does not map input {input_name}, check container_maps_input first"\n )\n return mapping\n\n def container_maps_fan_in_input(self, input_name: str, fan_in_index: int) -> bool:\n return (\n self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n is not None\n )\n\n def container_mapped_fan_in_input(self, input_name: str, fan_in_index: int) -> InputMapping:\n mapping = self.graph_definition.input_mapping_for_pointer(\n FanInInputPointer(self.name, input_name, fan_in_index)\n )\n if mapping is None:\n check.failed(\n f"container does not map fan-in {input_name} idx {fan_in_index}, check "\n "container_maps_fan_in_input first"\n )\n\n return mapping\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n return self._retry_policy\n\n @abstractmethod\n def describe_node(self) -> str: ...\n\n @abstractmethod\n def get_resource_requirements(\n self,\n 
outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]: ...\n\n\nclass GraphNode(Node):\n definition: "GraphDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "GraphDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .graph_definition import GraphDefinition\n\n check.inst_param(definition, "definition", GraphDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for node in self.definition.node_dict.values():\n yield from node.get_resource_requirements(\n asset_layer=asset_layer,\n outer_container=self.definition,\n parent_handle=cur_node_handle,\n )\n\n def describe_node(self) -> str:\n return f"graph '{self.name}'"\n\n\nclass OpNode(Node):\n definition: "OpDefinition"\n\n def __init__(\n self,\n name: str,\n definition: "OpDefinition",\n graph_definition: "GraphDefinition",\n tags: Optional[Mapping[str, str]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n retry_policy: Optional[RetryPolicy] = None,\n ):\n from .op_definition import OpDefinition\n\n check.inst_param(definition, "definition", OpDefinition)\n super().__init__(name, definition, graph_definition, tags, hook_defs, retry_policy)\n\n def get_resource_requirements(\n self,\n outer_container: "GraphDefinition",\n parent_handle: Optional["NodeHandle"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n ) -> Iterator["ResourceRequirement"]:\n from .resource_requirement import 
InputManagerRequirement\n\n cur_node_handle = NodeHandle(self.name, parent_handle)\n\n for requirement in self.definition.get_resource_requirements(\n (cur_node_handle, asset_layer)\n ):\n # If requirement is a root input manager requirement, but the corresponding node has an upstream output, then ignore the requirement.\n if (\n isinstance(requirement, InputManagerRequirement)\n and outer_container.dependency_structure.has_deps(\n NodeInput(self, self.definition.input_def_named(requirement.input_name))\n )\n and requirement.root_input\n ):\n continue\n yield requirement\n for hook_def in self.hook_defs:\n yield from hook_def.get_resource_requirements(self.describe_node())\n\n def describe_node(self) -> str:\n return f"op '{self.name}'"\n\n\n@whitelist_for_serdes(storage_name="SolidHandle")\nclass NodeHandle(NamedTuple("_NodeHandle", [("name", str), ("parent", Optional["NodeHandle"])])):\n """A structured object to identify nodes in the potentially recursive graph structure."""\n\n def __new__(cls, name: str, parent: Optional["NodeHandle"]):\n return super(NodeHandle, cls).__new__(\n cls,\n check.str_param(name, "name"),\n check.opt_inst_param(parent, "parent", NodeHandle),\n )\n\n def __str__(self):\n return self.to_string()\n\n @property\n def root(self):\n if self.parent:\n return self.parent.root\n else:\n return self\n\n @property\n def path(self) -> Sequence[str]:\n """Return a list representation of the handle.\n\n Inverse of NodeHandle.from_path.\n\n Returns:\n List[str]:\n """\n path: List[str] = []\n cur = self\n while cur:\n path.append(cur.name)\n cur = cur.parent\n path.reverse()\n return path\n\n def to_string(self) -> str:\n """Return a unique string representation of the handle.\n\n Inverse of NodeHandle.from_string.\n """\n return self.parent.to_string() + "." 
+ self.name if self.parent else self.name\n\n def is_or_descends_from(self, handle: "NodeHandle") -> bool:\n """Check if the handle is or descends from another handle.\n\n Args:\n handle (NodeHandle): The handle to check against.\n\n Returns:\n bool:\n """\n check.inst_param(handle, "handle", NodeHandle)\n\n for idx in range(len(handle.path)):\n if idx >= len(self.path):\n return False\n if self.path[idx] != handle.path[idx]:\n return False\n return True\n\n def pop(self, ancestor: "NodeHandle") -> Optional["NodeHandle"]:\n """Return a copy of the handle with some of its ancestors pruned.\n\n Args:\n ancestor (NodeHandle): Handle to an ancestor of the current handle.\n\n Returns:\n NodeHandle:\n\n Example:\n .. code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('bar', NodeHandle('foo', None))\n assert handle.pop(ancestor) == NodeHandle('baz', None)\n """\n check.inst_param(ancestor, "ancestor", NodeHandle)\n check.invariant(\n self.is_or_descends_from(ancestor),\n f"Handle {self.to_string()} does not descend from {ancestor.to_string()}",\n )\n\n return NodeHandle.from_path(self.path[len(ancestor.path) :])\n\n def with_ancestor(self, ancestor: Optional["NodeHandle"]) -> "NodeHandle":\n """Returns a copy of the handle with an ancestor grafted on.\n\n Args:\n ancestor (NodeHandle): Handle to the new ancestor.\n\n Returns:\n NodeHandle:\n\n Example:\n .. 
code-block:: python\n\n handle = NodeHandle('baz', NodeHandle('bar', NodeHandle('foo', None)))\n ancestor = NodeHandle('quux' None)\n assert handle.with_ancestor(ancestor) == NodeHandle(\n 'baz', NodeHandle('bar', NodeHandle('foo', NodeHandle('quux', None)))\n )\n """\n check.opt_inst_param(ancestor, "ancestor", NodeHandle)\n\n return NodeHandle.from_path([*(ancestor.path if ancestor else []), *self.path])\n\n @staticmethod\n def from_path(path: Sequence[str]) -> "NodeHandle":\n check.sequence_param(path, "path", of_type=str)\n\n cur: Optional["NodeHandle"] = None\n _path = list(path)\n while len(_path) > 0:\n cur = NodeHandle(name=_path.pop(0), parent=cur)\n\n if cur is None:\n check.failed(f"Invalid handle path {path}")\n\n return cur\n\n @staticmethod\n def from_string(handle_str: str) -> "NodeHandle":\n check.str_param(handle_str, "handle_str")\n\n path = handle_str.split(".")\n return NodeHandle.from_path(path)\n\n @classmethod\n def from_dict(cls, dict_repr: Mapping[str, Any]) -> "NodeHandle":\n """This method makes it possible to load a potentially nested NodeHandle after a\n roundtrip through json.loads(json.dumps(NodeHandle._asdict())).\n """\n check.dict_param(dict_repr, "dict_repr", key_type=str)\n check.invariant(\n "name" in dict_repr, "Dict representation of NodeHandle must have a 'name' key"\n )\n check.invariant(\n "parent" in dict_repr, "Dict representation of NodeHandle must have a 'parent' key"\n )\n\n if isinstance(dict_repr["parent"], (list, tuple)):\n parent = NodeHandle.from_dict(\n {\n "name": dict_repr["parent"][0],\n "parent": dict_repr["parent"][1],\n }\n )\n else:\n parent = dict_repr["parent"]\n\n return NodeHandle(name=dict_repr["name"], parent=parent)\n\n\nclass NodeInputHandle(\n NamedTuple("_NodeInputHandle", [("node_handle", NodeHandle), ("input_name", str)])\n):\n """A structured object to uniquely identify inputs in the potentially recursive graph structure."""\n\n\nclass NodeOutputHandle(\n NamedTuple("_NodeOutputHandle", 
[("node_handle", NodeHandle), ("output_name", str)])\n):\n """A structured object to uniquely identify outputs in the potentially recursive graph structure."""\n\n\nclass NodeInput(NamedTuple("_NodeInput", [("node", Node), ("input_def", InputDefinition)])):\n def __new__(cls, node: Node, input_def: InputDefinition):\n return super(NodeInput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(input_def, "input_def", InputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeInput",\n node_name=self.node.name,\n input_name=self.input_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self):\n return hash((self.node.name, self.input_def.name))\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, NodeInput)\n and self.node.name == other.node.name\n and self.input_def.name == other.input_def.name\n )\n\n @property\n def node_name(self) -> str:\n return self.node.name\n\n @property\n def input_name(self) -> str:\n return self.input_def.name\n\n\nclass NodeOutput(NamedTuple("_NodeOutput", [("node", Node), ("output_def", OutputDefinition)])):\n def __new__(cls, node: Node, output_def: OutputDefinition):\n return super(NodeOutput, cls).__new__(\n cls,\n check.inst_param(node, "node", Node),\n check.inst_param(output_def, "output_def", OutputDefinition),\n )\n\n def _inner_str(self) -> str:\n return struct_to_string(\n "NodeOutput",\n node_name=self.node.name,\n output_name=self.output_def.name,\n )\n\n def __str__(self):\n return self._inner_str()\n\n def __repr__(self):\n return self._inner_str()\n\n def __hash__(self) -> int:\n return hash((self.node.name, self.output_def.name))\n\n def __eq__(self, other: Any) -> bool:\n return self.node.name == other.node.name and self.output_def.name == other.output_def.name\n\n def describe(self) -> str:\n return f"{self.node_name}:{self.output_def.name}"\n\n @property\n def 
node_name(self) -> str:\n return self.node.name\n\n @property\n def is_dynamic(self) -> bool:\n return self.output_def.is_dynamic\n\n @property\n def output_name(self) -> str:\n return self.output_def.name\n\n\nclass DependencyType(Enum):\n DIRECT = "DIRECT"\n FAN_IN = "FAN_IN"\n DYNAMIC_COLLECT = "DYNAMIC_COLLECT"\n\n\nclass IDependencyDefinition(ABC):\n @abstractmethod\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n pass\n\n @abstractmethod\n def is_fan_in(self) -> bool:\n """The result passed to the corresponding input will be a List made from different node outputs."""\n\n\n
[docs]class DependencyDefinition(\n NamedTuple(\n "_DependencyDefinition", [("node", str), ("output", str), ("description", Optional[str])]\n ),\n IDependencyDefinition,\n):\n """Represents an edge in the DAG of nodes (ops or graphs) forming a job.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent node and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_b depends on the output named 'result' of\n op_a, and the output named 'other_result' of graph_a, the structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_op', 'result')\n }\n 'my_downstream_op': {\n 'input': DependencyDefinition('my_upstream_graph', 'result')\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. code-block:: python\n\n @job\n def the_job():\n node_b(node_a())\n\n\n Args:\n node (str): The name of the node (op or graph) that is depended on, that is, from which the value\n passed between the two nodes originates.\n output (Optional[str]): The name of the output that is depended on. (default: "result")\n description (Optional[str]): Human-readable description of this dependency.\n """\n\n def __new__(\n cls,\n node: str,\n output: str = DEFAULT_OUTPUT,\n description: Optional[str] = None,\n ):\n return super(DependencyDefinition, cls).__new__(\n cls,\n check.str_param(node, "node"),\n check.str_param(output, "output"),\n check.opt_str_param(description, "description"),\n )\n\n def get_node_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return True if the dependency is fan-in (always False for DependencyDefinition)."""\n return False
\n\n def get_op_dependencies(self) -> Sequence["DependencyDefinition"]:\n return [self]
\n\n\n
[docs]class MultiDependencyDefinition(\n NamedTuple(\n "_MultiDependencyDefinition",\n [\n (\n "dependencies",\n PublicAttr[Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]],\n )\n ],\n ),\n IDependencyDefinition,\n):\n """Represents a fan-in edge in the DAG of op instances forming a job.\n\n This object is used only when an input of type ``List[T]`` is assembled by fanning-in multiple\n upstream outputs of type ``T``.\n\n This object is used at the leaves of a dictionary structure that represents the complete\n dependency structure of a job whose keys represent the dependent ops or graphs and dependent\n input, so this object only contains information about the dependee.\n\n Concretely, if the input named 'input' of op_c depends on the outputs named 'result' of\n op_a and op_b, this structure will look as follows:\n\n .. code-block:: python\n\n dependency_structure = {\n 'op_c': {\n 'input': MultiDependencyDefinition(\n [\n DependencyDefinition('op_a', 'result'),\n DependencyDefinition('op_b', 'result')\n ]\n )\n }\n }\n\n In general, users should prefer not to construct this class directly or use the\n :py:class:`JobDefinition` API that requires instances of this class. Instead, use the\n :py:func:`@job <job>` API:\n\n .. 
code-block:: python\n\n @job\n def the_job():\n op_c(op_a(), op_b())\n\n Args:\n dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]): List of\n upstream dependencies fanned in to this input.\n """\n\n def __new__(\n cls,\n dependencies: Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]],\n ):\n from .composition import MappedInputPlaceholder\n\n deps = check.sequence_param(dependencies, "dependencies")\n seen = {}\n for dep in deps:\n if isinstance(dep, DependencyDefinition):\n key = dep.node + ":" + dep.output\n if key in seen:\n raise DagsterInvalidDefinitionError(\n f'Duplicate dependencies on node "{dep.node}" output "{dep.output}" '\n "used in the same MultiDependencyDefinition."\n )\n seen[key] = True\n elif dep is MappedInputPlaceholder:\n pass\n else:\n check.failed(f"Unexpected dependencies entry {dep}")\n\n return super(MultiDependencyDefinition, cls).__new__(cls, deps)\n\n
[docs] @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n return [dep for dep in self.dependencies if isinstance(dep, DependencyDefinition)]
\n\n
[docs] @public\n def is_fan_in(self) -> bool:\n """Return `True` if the dependency is fan-in (always True for MultiDependencyDefinition)."""\n return True
\n\n
[docs] @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n """Return the combined list of dependencies contained by this object, including :py:class:`DependencyDefinition` and :py:class:`MappedInputPlaceholder` objects."""\n return self.dependencies
\n\n\nclass BlockingAssetChecksDependencyDefinition(\n IDependencyDefinition,\n NamedTuple(\n "_BlockingAssetChecksDependencyDefinition",\n [\n (\n "asset_check_dependencies",\n Sequence[DependencyDefinition],\n ),\n ("other_dependency", Optional[DependencyDefinition]),\n ],\n ),\n):\n """An input that depends on a set of outputs that correspond to upstream asset checks, and also\n optionally depends on a single upstream output that does not correspond to an asset check.\n\n We model this with a different kind of DependencyDefinition than MultiDependencyDefinition,\n because we treat the value that's passed to the input parameter differently: we ignore the asset\n check dependencies and only pass a single value, instead of a fanned-in list.\n """\n\n @public\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n """Return the list of :py:class:`DependencyDefinition` contained by this object."""\n if self.other_dependency:\n return [*self.asset_check_dependencies, self.other_dependency]\n else:\n return self.asset_check_dependencies\n\n @public\n def is_fan_in(self) -> bool:\n return False\n\n @public\n def get_dependencies_and_mappings(\n self,\n ) -> Sequence[Union[DependencyDefinition, Type["MappedInputPlaceholder"]]]:\n return self.get_node_dependencies()\n\n\nclass DynamicCollectDependencyDefinition(\n NamedTuple("_DynamicCollectDependencyDefinition", [("node_name", str), ("output_name", str)]),\n IDependencyDefinition,\n):\n def get_node_dependencies(self) -> Sequence[DependencyDefinition]:\n return [DependencyDefinition(self.node_name, self.output_name)]\n\n def is_fan_in(self) -> bool:\n return True\n\n\nDepTypeAndOutputs: TypeAlias = Tuple[\n DependencyType,\n Union[NodeOutput, List[Union[NodeOutput, Type["MappedInputPlaceholder"]]]],\n]\n\nInputToOutputMap: TypeAlias = Dict[NodeInput, DepTypeAndOutputs]\n\n\ndef _create_handle_dict(\n node_dict: Mapping[str, Node],\n dep_dict: DependencyMapping[str],\n) -> InputToOutputMap:\n from 
.composition import MappedInputPlaceholder\n\n check.mapping_param(node_dict, "node_dict", key_type=str, value_type=Node)\n check.two_dim_mapping_param(dep_dict, "dep_dict", value_type=IDependencyDefinition)\n\n handle_dict: InputToOutputMap = {}\n\n for node_name, input_dict in dep_dict.items():\n from_node = node_dict[node_name]\n for input_name, dep_def in input_dict.items():\n if isinstance(\n dep_def, (MultiDependencyDefinition, BlockingAssetChecksDependencyDefinition)\n ):\n handles: List[Union[NodeOutput, Type[MappedInputPlaceholder]]] = []\n for inner_dep in dep_def.get_dependencies_and_mappings():\n if isinstance(inner_dep, DependencyDefinition):\n handles.append(node_dict[inner_dep.node].get_output(inner_dep.output))\n elif inner_dep is MappedInputPlaceholder:\n handles.append(inner_dep)\n else:\n check.failed(\n f"Unexpected MultiDependencyDefinition dependencies type {inner_dep}"\n )\n\n handle_dict[from_node.get_input(input_name)] = (DependencyType.FAN_IN, handles)\n\n elif isinstance(dep_def, DependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DIRECT,\n node_dict[dep_def.node].get_output(dep_def.output),\n )\n elif isinstance(dep_def, DynamicCollectDependencyDefinition):\n handle_dict[from_node.get_input(input_name)] = (\n DependencyType.DYNAMIC_COLLECT,\n node_dict[dep_def.node_name].get_output(dep_def.output_name),\n )\n\n else:\n check.failed(f"Unknown dependency type {dep_def}")\n\n return handle_dict\n\n\nclass DependencyStructure:\n @staticmethod\n def from_definitions(\n nodes: Mapping[str, Node], dep_dict: DependencyMapping[str]\n ) -> "DependencyStructure":\n return DependencyStructure(\n list(dep_dict.keys()),\n _create_handle_dict(nodes, dep_dict),\n dep_dict,\n )\n\n _node_input_index: DefaultDict[str, Dict[NodeInput, List[NodeOutput]]]\n _node_output_index: Dict[str, DefaultDict[NodeOutput, List[NodeInput]]]\n _dynamic_fan_out_index: Dict[str, NodeOutput]\n _collect_index: Dict[str, 
Set[NodeOutput]]\n _deps_by_node_name: DependencyMapping[str]\n\n def __init__(\n self,\n node_names: Sequence[str],\n input_to_output_map: InputToOutputMap,\n deps_by_node_name: DependencyMapping[str],\n ):\n self._node_names = node_names\n self._input_to_output_map = input_to_output_map\n self._deps_by_node_name = deps_by_node_name\n\n # Building up a couple indexes here so that one can look up all the upstream output handles\n # or downstream input handles in O(1). Without this, this can become O(N^2) where N is node\n # count during the GraphQL query in particular\n\n # node_name => input_handle => list[output_handle]\n self._node_input_index = defaultdict(dict)\n\n # node_name => output_handle => list[input_handle]\n self._node_output_index = defaultdict(lambda: defaultdict(list))\n\n # node_name => dynamic output_handle that this node will dupe for\n self._dynamic_fan_out_index = {}\n\n # node_name => set of dynamic output_handle this collects over\n self._collect_index = defaultdict(set)\n\n for node_input, (dep_type, node_output_or_list) in self._input_to_output_map.items():\n if dep_type == DependencyType.FAN_IN:\n node_output_list: List[NodeOutput] = []\n for node_output in node_output_or_list:\n if not isinstance(node_output, NodeOutput):\n continue\n\n if node_output.is_dynamic:\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. Problematic dependency on dynamic output"\n f' "{node_output.describe()}".'\n )\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n "Currently, items in a fan-in dependency cannot be downstream of"\n " dynamic outputs. 
Problematic dependency on output"\n f' "{node_output.describe()}", downstream of'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}".'\n )\n\n node_output_list.append(node_output)\n elif dep_type == DependencyType.DIRECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_fan_out(node_input, node_output)\n\n if self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_fan_out(\n node_input, self._dynamic_fan_out_index[node_output.node_name]\n )\n\n node_output_list = [node_output]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n node_output = cast(NodeOutput, node_output_or_list)\n\n if node_output.is_dynamic:\n self._validate_and_set_collect(node_input, node_output)\n\n elif self._dynamic_fan_out_index.get(node_output.node_name):\n self._validate_and_set_collect(\n node_input,\n self._dynamic_fan_out_index[node_output.node_name],\n )\n else:\n check.failed(\n f"Unexpected dynamic fan in dep created {node_output} -> {node_input}"\n )\n\n node_output_list = [node_output]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n self._node_input_index[node_input.node.name][node_input] = node_output_list\n for node_output in node_output_list:\n self._node_output_index[node_output.node.name][node_output].append(node_input)\n\n def _validate_and_set_fan_out(self, node_input: NodeInput, node_output: NodeOutput) -> None:\n """Helper function for populating _dynamic_fan_out_index."""\n if not node_input.node.definition.input_supports_dynamic_output_dep(node_input.input_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of dynamic output"\n f' "{node_output.describe()}" since input "{node_input.input_name}" maps to a'\n " node that is already downstream of another dynamic output. 
Nodes cannot be"\n " downstream of more than one dynamic output"\n )\n\n if self._collect_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be both downstream of dynamic output "\n f"{node_output.describe()} and collect over dynamic output "\n f"{next(iter(self._collect_index[node_input.node_name])).describe()}."\n )\n\n if self._dynamic_fan_out_index.get(node_input.node_name) is None:\n self._dynamic_fan_out_index[node_input.node_name] = node_output\n return\n\n if self._dynamic_fan_out_index[node_input.node_name] != node_output:\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_input.node_name].describe()}"'\n )\n\n def _validate_and_set_collect(\n self,\n node_input: NodeInput,\n node_output: NodeOutput,\n ) -> None:\n if self._dynamic_fan_out_index.get(node_input.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot both collect over dynamic output "\n f"{node_output.describe()} and be downstream of the dynamic output "\n f"{self._dynamic_fan_out_index[node_input.node_name].describe()}."\n )\n\n self._collect_index[node_input.node_name].add(node_output)\n\n # if the output is already fanned out\n if self._dynamic_fan_out_index.get(node_output.node_name):\n raise DagsterInvalidDefinitionError(\n f"{node_input.node.describe_node()} cannot be downstream of more than one dynamic"\n f' output. 
It is downstream of both "{node_output.describe()}" and'\n f' "{self._dynamic_fan_out_index[node_output.node_name].describe()}"'\n )\n\n def all_upstream_outputs_from_node(self, node_name: str) -> Sequence[NodeOutput]:\n check.str_param(node_name, "node_name")\n\n # flatten out all outputs that feed into the inputs of this node\n return [\n output_handle\n for output_handle_list in self._node_input_index[node_name].values()\n for output_handle in output_handle_list\n ]\n\n def input_to_upstream_outputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeInput, Sequence[NodeOutput]]:\n """Returns a Dict[NodeInput, List[NodeOutput]] that encodes\n where all the inputs are sourced from upstream. Usually the\n List[NodeOutput] will be a list of one, except for the\n multi-dependency case.\n """\n check.str_param(node_name, "node_name")\n return self._node_input_index[node_name]\n\n def output_to_downstream_inputs_for_node(\n self, node_name: str\n ) -> Mapping[NodeOutput, Sequence[NodeInput]]:\n """Returns a Dict[NodeOutput, List[NodeInput]] that\n represents all the downstream inputs for each output in the\n dictionary.\n """\n check.str_param(node_name, "node_name")\n return self._node_output_index[node_name]\n\n def has_direct_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DIRECT\n\n def get_direct_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DIRECT,\n f"Cannot call get_direct_dep when dep is not singular, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def get_dependency_definition(self, node_input: NodeInput) -> Optional[IDependencyDefinition]:\n return 
self._deps_by_node_name[node_input.node_name].get(node_input.input_name)\n\n def has_fan_in_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.FAN_IN\n\n def get_fan_in_deps(\n self, node_input: NodeInput\n ) -> Sequence[Union[NodeOutput, Type["MappedInputPlaceholder"]]]:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, deps = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.FAN_IN,\n f"Cannot call get_multi_dep when dep is not fan in, got {dep_type}",\n )\n return cast(List[Union[NodeOutput, Type["MappedInputPlaceholder"]]], deps)\n\n def has_dynamic_fan_in_dep(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n if node_input not in self._input_to_output_map:\n return False\n dep_type, _ = self._input_to_output_map[node_input]\n return dep_type == DependencyType.DYNAMIC_COLLECT\n\n def get_dynamic_fan_in_dep(self, node_input: NodeInput) -> NodeOutput:\n check.inst_param(node_input, "node_input", NodeInput)\n dep_type, dep = self._input_to_output_map[node_input]\n check.invariant(\n dep_type == DependencyType.DYNAMIC_COLLECT,\n f"Cannot call get_dynamic_fan_in_dep when dep is not, got {dep_type}",\n )\n return cast(NodeOutput, dep)\n\n def has_deps(self, node_input: NodeInput) -> bool:\n check.inst_param(node_input, "node_input", NodeInput)\n return node_input in self._input_to_output_map\n\n def get_deps_list(self, node_input: NodeInput) -> Sequence[NodeOutput]:\n check.inst_param(node_input, "node_input", NodeInput)\n check.invariant(self.has_deps(node_input))\n dep_type, handle_or_list = self._input_to_output_map[node_input]\n if dep_type == DependencyType.DIRECT:\n return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.DYNAMIC_COLLECT:\n 
return [cast(NodeOutput, handle_or_list)]\n elif dep_type == DependencyType.FAN_IN:\n return [handle for handle in handle_or_list if isinstance(handle, NodeOutput)]\n else:\n check.failed(f"Unexpected dep type {dep_type}")\n\n def inputs(self) -> Sequence[NodeInput]:\n return list(self._input_to_output_map.keys())\n\n def get_upstream_dynamic_output_for_node(self, node_name: str) -> Optional[NodeOutput]:\n return self._dynamic_fan_out_index.get(node_name)\n\n def get_dependency_type(self, node_input: NodeInput) -> Optional[DependencyType]:\n result = self._input_to_output_map.get(node_input)\n if result is None:\n return None\n dep_type, _ = result\n return dep_type\n\n def is_dynamic_mapped(self, node_name: str) -> bool:\n return node_name in self._dynamic_fan_out_index\n\n def has_dynamic_downstreams(self, node_name: str) -> bool:\n for node_output in self._dynamic_fan_out_index.values():\n if node_output.node_name == node_name:\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/definitions/dependency", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.dependency"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.events

\nimport re\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, experimental_param, public\nfrom dagster._core.definitions.data_version import DataVersion\nfrom dagster._core.storage.tags import MULTIDIMENSIONAL_PARTITION_PREFIX, SYSTEM_TAG_PREFIX\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\n\nfrom .metadata import (\n    MetadataFieldSerializer,\n    MetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom .utils import DEFAULT_OUTPUT, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.source_asset import SourceAsset\n    from dagster._core.execution.context.output import OutputContext\n\n\nASSET_KEY_SPLIT_REGEX = re.compile("[^a-zA-Z0-9_]")\nASSET_KEY_DELIMITER = "/"\n\n\ndef parse_asset_key_string(s: str) -> Sequence[str]:\n    return list(filter(lambda x: x, re.split(ASSET_KEY_SPLIT_REGEX, s)))\n\n\n
[docs]@whitelist_for_serdes\nclass AssetKey(NamedTuple("_AssetKey", [("path", PublicAttr[Sequence[str]])])):\n """Object representing the structure of an asset key. Takes in a sanitized string, list of\n strings, or tuple of strings.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import op\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey('flat_asset_key'),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(['parent', 'child', 'grandchild']),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n @op\n def structured_asset_key_2(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey(('parent', 'child', 'grandchild')),\n metadata={"text_metadata": "Text-based metadata for this event"},\n )\n\n Args:\n path (Sequence[str]): String, list of strings, or tuple of strings. A list of strings\n represent the hierarchical structure of the asset_key.\n """\n\n def __new__(cls, path: Sequence[str]):\n if isinstance(path, str):\n path = [path]\n else:\n path = list(check.sequence_param(path, "path", of_type=str))\n\n return super(AssetKey, cls).__new__(cls, path=path)\n\n def __str__(self):\n return f"AssetKey({self.path})"\n\n def __repr__(self):\n return f"AssetKey({self.path})"\n\n def __hash__(self):\n return hash(tuple(self.path))\n\n def __eq__(self, other):\n if not isinstance(other, AssetKey):\n return False\n if len(self.path) != len(other.path):\n return False\n for i in range(0, len(self.path)):\n if self.path[i] != other.path[i]:\n return False\n return True\n\n def to_string(self) -> str:\n """E.g. '["first_component", "second_component"]'."""\n return seven.json.dumps(self.path)\n\n def to_user_string(self) -> str:\n """E.g. 
"first_component/second_component"."""\n return ASSET_KEY_DELIMITER.join(self.path)\n\n def to_python_identifier(self, suffix: Optional[str] = None) -> str:\n """Build a valid Python identifier based on the asset key that can be used for\n operation names or I/O manager keys.\n """\n path = list(self.path)\n\n if suffix is not None:\n path.append(suffix)\n\n return "__".join(path).replace("-", "_")\n\n @staticmethod\n def from_user_string(asset_key_string: str) -> "AssetKey":\n return AssetKey(asset_key_string.split(ASSET_KEY_DELIMITER))\n\n @staticmethod\n def from_db_string(asset_key_string: Optional[str]) -> Optional["AssetKey"]:\n if not asset_key_string:\n return None\n if asset_key_string[0] == "[":\n # is a json string\n try:\n path = seven.json.loads(asset_key_string)\n except seven.JSONDecodeError:\n path = parse_asset_key_string(asset_key_string)\n else:\n path = parse_asset_key_string(asset_key_string)\n return AssetKey(path)\n\n @staticmethod\n def get_db_prefix(path: Sequence[str]):\n check.sequence_param(path, "path", of_type=str)\n return seven.json.dumps(path)[:-2] # strip trailing '"]' from json string\n\n @staticmethod\n def from_graphql_input(graphql_input_asset_key: Mapping[str, Sequence[str]]) -> "AssetKey":\n return AssetKey(graphql_input_asset_key["path"])\n\n def to_graphql_input(self) -> Mapping[str, Sequence[str]]:\n return {"path": self.path}\n\n @staticmethod\n def from_coercible(arg: "CoercibleToAssetKey") -> "AssetKey":\n if isinstance(arg, AssetKey):\n return check.inst_param(arg, "arg", AssetKey)\n elif isinstance(arg, str):\n return AssetKey([arg])\n elif isinstance(arg, list):\n check.list_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n elif isinstance(arg, tuple):\n check.tuple_param(arg, "arg", of_type=str)\n return AssetKey(arg)\n else:\n check.failed(f"Unexpected type for AssetKey: {type(arg)}")\n\n @staticmethod\n def from_coercible_or_definition(\n arg: Union["CoercibleToAssetKey", "AssetsDefinition", "SourceAsset"]\n 
) -> "AssetKey":\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n if isinstance(arg, AssetsDefinition):\n return arg.key\n elif isinstance(arg, SourceAsset):\n return arg.key\n else:\n return AssetKey.from_coercible(arg)\n\n # @staticmethod\n # def from_coercible_to_asset_dep(arg: "CoercibleToAssetDep") -> "AssetKey":\n # from dagster._core.definitions.asset_dep import AssetDep\n # from dagster._core.definitions.asset_spec import AssetSpec\n # from dagster._core.definitions.assets import AssetsDefinition\n # from dagster._core.definitions.source_asset import SourceAsset\n\n # if isinstance(arg, AssetsDefinition):\n # if len(arg.keys) > 1:\n # # Only AssetsDefinition with a single asset can be passed\n # raise DagsterInvalidDefinitionError(\n # "Cannot pass a multi_asset AssetsDefinition as an argument to deps."\n # " Instead, specify dependencies on the assets created by the multi_asset"\n # f" via AssetKeys or strings. For the multi_asset {arg.node_def.name}, the"\n # f" available keys are: {arg.keys}."\n # )\n # return arg.key\n # elif isinstance(arg, SourceAsset):\n # return arg.key\n # elif isinstance(arg, AssetDep):\n # return arg.asset_key\n # elif isinstance(arg, AssetSpec):\n # return arg.asset_key\n # else:\n # return AssetKey.from_coercible(arg)\n\n def has_prefix(self, prefix: Sequence[str]) -> bool:\n return len(self.path) >= len(prefix) and self.path[: len(prefix)] == prefix\n\n def with_prefix(self, prefix: "CoercibleToAssetKeyPrefix") -> "AssetKey":\n prefix = key_prefix_from_coercible(prefix)\n return AssetKey(list(prefix) + list(self.path))
\n\n\nclass AssetKeyPartitionKey(NamedTuple):\n """An AssetKey with an (optional) partition key. Refers either to a non-partitioned asset or a\n partition of a partitioned asset.\n """\n\n asset_key: AssetKey\n partition_key: Optional[str] = None\n\n\nCoercibleToAssetKey = Union[AssetKey, str, Sequence[str]]\nCoercibleToAssetKeyPrefix = Union[str, Sequence[str]]\n\n\ndef check_opt_coercible_to_asset_key_prefix_param(\n prefix: Optional[CoercibleToAssetKeyPrefix], param_name: str\n) -> Optional[Sequence[str]]:\n try:\n return key_prefix_from_coercible(prefix) if prefix is not None else None\n except check.CheckError:\n raise check.ParameterCheckError(\n f'Param "{param_name}" is not a string or a sequence of strings'\n )\n\n\ndef key_prefix_from_coercible(key_prefix: CoercibleToAssetKeyPrefix) -> Sequence[str]:\n if isinstance(key_prefix, str):\n return [key_prefix]\n elif isinstance(key_prefix, list):\n return key_prefix\n else:\n check.failed(f"Unexpected type for key_prefix: {type(key_prefix)}")\n\n\nDynamicAssetKey = Callable[["OutputContext"], Optional[AssetKey]]\n\n\n@whitelist_for_serdes\nclass AssetLineageInfo(\n NamedTuple("_AssetLineageInfo", [("asset_key", AssetKey), ("partitions", AbstractSet[str])])\n):\n def __new__(cls, asset_key: AssetKey, partitions: Optional[AbstractSet[str]] = None):\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partitions = check.opt_set_param(partitions, "partitions", str)\n return super(AssetLineageInfo, cls).__new__(cls, asset_key=asset_key, partitions=partitions)\n\n\nT = TypeVar("T")\n\n\n
[docs]@experimental_param(param="data_version")\nclass Output(Generic[T]):\n """Event corresponding to one of an op's outputs.\n\n Op compute functions must explicitly yield events of this type when they have more than\n one output, or when they also yield events of other types, or when defining an op using the\n :py:class:`OpDefinition` API directly.\n\n Outputs are values produced by ops that will be consumed by downstream ops in a job.\n They are type-checked at op boundaries when their corresponding :py:class:`Out`\n or the downstream :py:class:`In` is typed.\n\n Args:\n value (Any): The value returned by the compute function.\n output_name (Optional[str]): Name of the corresponding out. (default:\n "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the output. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n data_version (Optional[DataVersion]): (Experimental) A data version to manually set\n for the asset.\n """\n\n def __init__(\n self,\n value: T,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n self._value = value\n self._output_name = check.str_param(output_name, "output_name")\n self._data_version = check.opt_inst_param(data_version, "data_version", DataVersion)\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> MetadataMapping:\n return self._metadata\n\n @public\n @property\n def value(self) -> Any:\n """Any: The value returned by the compute function."""\n return self._value\n\n @public\n @property\n def output_name(self) -> str:\n """str: Name of the corresponding :py:class:`Out`."""\n return self._output_name\n\n @public\n 
@property\n def data_version(self) -> Optional[DataVersion]:\n """Optional[DataVersion]: A data version that was manually set on the `Output`."""\n return self._data_version\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, Output)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.metadata == other.metadata\n )
\n\n\n
[docs]class DynamicOutput(Generic[T]):\n """Variant of :py:class:`Output <dagster.Output>` used to support\n dynamic mapping & collect. Each ``DynamicOutput`` produced by an op represents\n one item in a set that can be processed individually with ``map`` or gathered\n with ``collect``.\n\n Each ``DynamicOutput`` must have a unique ``mapping_key`` to distinguish it with it's set.\n\n Args:\n value (Any):\n The value returned by the compute function.\n mapping_key (str):\n The key that uniquely identifies this dynamic value relative to its peers.\n This key will be used to identify the downstream ops when mapped, ie\n ``mapped_op[example_mapping_key]``\n output_name (Optional[str]):\n Name of the corresponding :py:class:`DynamicOut` defined on the op.\n (default: "result")\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __init__(\n self,\n value: T,\n mapping_key: str,\n output_name: Optional[str] = DEFAULT_OUTPUT,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n self._mapping_key = check_valid_name(check.str_param(mapping_key, "mapping_key"))\n self._output_name = check.str_param(output_name, "output_name")\n self._value = value\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n @property\n def metadata(self) -> Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> str:\n """The mapping_key that was set for this DynamicOutput at instantiation."""\n return self._mapping_key\n\n @public\n @property\n def value(self) -> T:\n """The value that is returned by the compute function for this DynamicOut."""\n return self._value\n\n @public\n @property\n def 
output_name(self) -> str:\n """Name of the :py:class:`DynamicOut` defined on the op that this DynamicOut is associated with."""\n return self._output_name\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, DynamicOutput)\n and self.value == other.value\n and self.output_name == other.output_name\n and self.mapping_key == other.mapping_key\n and self.metadata == other.metadata\n )
\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetObservation(\n NamedTuple(\n "_AssetObservation",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ],\n )\n):\n """Event that captures metadata about an asset at a point in time.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the asset.\n partition (Optional[str]): The name of a partition of the asset that the metadata\n corresponds to.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n observation. Users should not pass values into this argument.\n metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]):\n Arbitrary metadata about the asset. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n if any([not tag.startswith(SYSTEM_TAG_PREFIX) for tag in tags or {}]):\n check.failed(\n "Users should not pass values into the tags argument for AssetMaterializations. 
"\n "The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(AssetObservation, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=check.opt_str_param(partition, "partition"),\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n\nUNDEFINED_ASSET_KEY_PATH = ["__undefined__"]\n\n\nclass AssetMaterializationSerializer(NamedTupleSerializer):\n # There are old `Materialization` objects in storage. We set the default value for asset key to\n # be `AssetKey(["__undefined__"])` to ensure that we can load these objects, without needing to\n # allow for the construction of new `AssetMaterialization` objects with no defined AssetKey.\n def before_unpack(self, context, unpacked_dict: Any) -> Any:\n # cover both the case where "asset_key" is not present at all and where it is None\n if unpacked_dict.get("asset_key") is None:\n unpacked_dict["asset_key"] = AssetKey(UNDEFINED_ASSET_KEY_PATH)\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n old_storage_names={"Materialization"},\n serializer=AssetMaterializationSerializer,\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass AssetMaterialization(\n NamedTuple(\n "_AssetMaterialization",\n [\n ("asset_key", PublicAttr[AssetKey]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ("partition", PublicAttr[Optional[str]]),\n ("tags", Optional[Mapping[str, str]]),\n ],\n )\n):\n """Event indicating that an op has materialized an asset.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that they have produced a materialized value as a\n side effect of computation. Unlike outputs, asset materializations can not be passed to other\n ops, and their persistence is controlled by op logic, rather than by the Dagster\n framework.\n\n Op authors should use these events to organize metadata about the side effects of their\n computations, enabling tooling like the Assets dashboard in the Dagster UI.\n\n Args:\n asset_key (Union[str, List[str], AssetKey]): A key to identify the materialized asset across\n job runs\n description (Optional[str]): A longer human-readable description of the materialized value.\n partition (Optional[str]): The name of the partition\n that was materialized.\n tags (Optional[Mapping[str, str]]): A mapping containing system-populated tags for the\n materialization. Users should not pass values into this argument.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the asset. 
Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n asset_key: CoercibleToAssetKey,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partition: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n ):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionKey\n\n if isinstance(asset_key, AssetKey):\n check.inst_param(asset_key, "asset_key", AssetKey)\n elif isinstance(asset_key, str):\n asset_key = AssetKey(parse_asset_key_string(asset_key))\n else:\n check.sequence_param(asset_key, "asset_key", of_type=str)\n asset_key = AssetKey(asset_key)\n\n check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n invalid_tags = [tag for tag in tags or {} if not tag.startswith(SYSTEM_TAG_PREFIX)]\n if len(invalid_tags) > 0:\n check.failed(\n f"Invalid tags: {tags} Users should not pass values into the tags argument for"\n " AssetMaterializations. 
The tags argument is reserved for system-populated tags."\n )\n\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n partition = check.opt_str_param(partition, "partition")\n\n if not isinstance(partition, MultiPartitionKey):\n # When event log records are unpacked from storage, cast the partition key as a\n # MultiPartitionKey if multi-dimensional partition tags exist\n multi_dimensional_partitions = {\n dimension[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]: partition_key\n for dimension, partition_key in (tags or {}).items()\n if dimension.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX)\n }\n if multi_dimensional_partitions:\n partition = MultiPartitionKey(multi_dimensional_partitions)\n\n return super(AssetMaterialization, cls).__new__(\n cls,\n asset_key=asset_key,\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n tags=tags,\n partition=partition,\n )\n\n @property\n def label(self) -> str:\n return " ".join(self.asset_key.path)\n\n
[docs] @public\n @staticmethod\n def file(\n path: str,\n description: Optional[str] = None,\n asset_key: Optional[Union[str, Sequence[str], AssetKey]] = None,\n ) -> "AssetMaterialization":\n """Static constructor for standard materializations corresponding to files on disk.\n\n Args:\n path (str): The path to the file.\n description (Optional[str]): A human-readable description of the materialization.\n """\n if not asset_key:\n asset_key = path\n\n return AssetMaterialization(\n asset_key=cast(Union[str, AssetKey, List[str]], asset_key),\n description=description,\n metadata={"path": MetadataValue.path(path)},\n )
\n\n\n
[docs]@deprecated(\n breaking_version="1.7",\n additional_warn_text="Please use AssetCheckResult and @asset_check instead.",\n)\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ExpectationResult(\n NamedTuple(\n "_ExpectationResult",\n [\n ("success", PublicAttr[bool]),\n ("label", PublicAttr[Optional[str]]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a data quality test.\n\n Op compute functions may yield events of this type whenever they wish to indicate to the\n Dagster framework (and the end user) that a data quality test has produced a (positive or\n negative) result.\n\n Args:\n success (bool): Whether the expectation passed or not.\n label (Optional[str]): Short display name for expectation. Defaults to "result".\n description (Optional[str]): A longer human-readable description of the expectation.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n label: Optional[str] = None,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(ExpectationResult, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n label=check.opt_str_param(label, "label", "result"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\n@whitelist_for_serdes\nclass TypeCheck(\n NamedTuple(\n "_TypeCheck",\n [\n ("success", PublicAttr[bool]),\n ("description", PublicAttr[Optional[str]]),\n ("metadata", PublicAttr[Mapping[str, MetadataValue]]),\n ],\n )\n):\n """Event corresponding to a successful typecheck.\n\n Events of this type should be returned by user-defined type checks when they need to encapsulate\n additional metadata about a type check's success or failure. (i.e., when using\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or the underlying\n :py:func:`PythonObjectDagsterType` API.)\n\n Op compute functions should generally avoid yielding events of this type to avoid confusion.\n\n Args:\n success (bool): ``True`` if the type check succeeded, ``False`` otherwise.\n description (Optional[str]): A human-readable description of the type check.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n """\n\n def __new__(\n cls,\n success: bool,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n ):\n normed_metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n return super(TypeCheck, cls).__new__(\n cls,\n success=check.bool_param(success, "success"),\n description=check.opt_str_param(description, "description"),\n metadata=normed_metadata,\n )
\n\n\n
[docs]class Failure(Exception):\n """Event indicating op failure.\n\n Raise events of this type from within op compute functions or custom type checks in order to\n indicate an unrecoverable failure in user code to the Dagster machinery and return\n structured metadata about the failure.\n\n Args:\n description (Optional[str]): A human-readable description of the failure.\n metadata (Optional[Dict[str, RawMetadataValue]]):\n Arbitrary metadata about the failure. Keys are displayed string labels, and values are\n one of the following: string, float, int, JSON-serializable dict, JSON-serializable\n list, and one of the data classes returned by a MetadataValue static method.\n allow_retries (Optional[bool]):\n Whether this Failure should respect the retry policy or bypass it and immediately fail.\n Defaults to True, respecting the retry policy and allowing retries.\n """\n\n def __init__(\n self,\n description: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n allow_retries: Optional[bool] = None,\n ):\n super(Failure, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n self.allow_retries = check.opt_bool_param(allow_retries, "allow_retries", True)
\n\n\n
[docs]class RetryRequested(Exception):\n """An exception to raise from an op to indicate that it should be retried.\n\n Args:\n max_retries (Optional[int]):\n The max number of retries this step should attempt before failing\n seconds_to_wait (Optional[Union[float,int]]):\n Seconds to wait before restarting the step after putting the step in\n to the up_for_retry state\n\n Example:\n .. code-block:: python\n\n @op\n def flakes():\n try:\n flakey_operation()\n except Exception as e:\n raise RetryRequested(max_retries=3) from e\n """\n\n def __init__(\n self, max_retries: Optional[int] = 1, seconds_to_wait: Optional[Union[float, int]] = None\n ):\n super(RetryRequested, self).__init__()\n self.max_retries = check.int_param(max_retries, "max_retries")\n self.seconds_to_wait = check.opt_numeric_param(seconds_to_wait, "seconds_to_wait")
\n\n\nclass ObjectStoreOperationType(Enum):\n SET_OBJECT = "SET_OBJECT"\n GET_OBJECT = "GET_OBJECT"\n RM_OBJECT = "RM_OBJECT"\n CP_OBJECT = "CP_OBJECT"\n\n\nclass ObjectStoreOperation(\n NamedTuple(\n "_ObjectStoreOperation",\n [\n ("op", ObjectStoreOperationType),\n ("key", str),\n ("dest_key", Optional[str]),\n ("obj", Any),\n ("serialization_strategy_name", Optional[str]),\n ("object_store_name", Optional[str]),\n ("value_name", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n """This event is used internally by Dagster machinery when values are written to and read from\n an ObjectStore.\n\n Users should not import this class or yield events of this type from user code.\n\n Args:\n op (ObjectStoreOperationType): The type of the operation on the object store.\n key (str): The key of the object on which the operation was performed.\n dest_key (Optional[str]): The destination key, if any, to which the object was copied.\n obj (Any): The object, if any, retrieved by the operation.\n serialization_strategy_name (Optional[str]): The name of the serialization strategy, if any,\n employed by the operation\n object_store_name (Optional[str]): The name of the object store that performed the\n operation.\n value_name (Optional[str]): The name of the input/output\n version (Optional[str]): (Experimental) The version of the stored data.\n mapping_key (Optional[str]): The mapping key when a dynamic output is used.\n """\n\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n key: str,\n dest_key: Optional[str] = None,\n obj: Any = None,\n serialization_strategy_name: Optional[str] = None,\n object_store_name: Optional[str] = None,\n value_name: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperation, cls).__new__(\n cls,\n op=op,\n key=check.str_param(key, "key"),\n dest_key=check.opt_str_param(dest_key, "dest_key"),\n obj=obj,\n 
serialization_strategy_name=check.opt_str_param(\n serialization_strategy_name, "serialization_strategy_name"\n ),\n object_store_name=check.opt_str_param(object_store_name, "object_store_name"),\n value_name=check.opt_str_param(value_name, "value_name"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n @classmethod\n def serializable(cls, inst, **kwargs):\n return cls(\n **dict(\n {\n "op": inst.op.value,\n "key": inst.key,\n "dest_key": inst.dest_key,\n "obj": None,\n "serialization_strategy_name": inst.serialization_strategy_name,\n "object_store_name": inst.object_store_name,\n "value_name": inst.value_name,\n "version": inst.version,\n },\n **kwargs,\n )\n )\n\n\nclass HookExecutionResult(\n NamedTuple("_HookExecutionResult", [("hook_name", str), ("is_skipped", bool)])\n):\n """This event is used internally to indicate the execution result of a hook, e.g. whether the\n user-defined hook function is skipped.\n\n Args:\n hook_name (str): The name of the hook.\n is_skipped (bool): ``False`` if the hook_fn is executed, ``True`` otheriwse.\n """\n\n def __new__(cls, hook_name: str, is_skipped: Optional[bool] = None):\n return super(HookExecutionResult, cls).__new__(\n cls,\n hook_name=check.str_param(hook_name, "hook_name"),\n is_skipped=cast(bool, check.opt_bool_param(is_skipped, "is_skipped", default=False)),\n )\n\n\nUserEvent = Union[AssetMaterialization, AssetObservation, ExpectationResult]\n
", "current_page_name": "_modules/dagster/_core/definitions/events", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.events"}, "executor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.executor_definition

\nfrom enum import Enum as PyEnum\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, Sequence, Union, overload\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import Int\nfrom dagster._config import Field, Noneable, Selector, UserConfigSchema\nfrom dagster._core.definitions.configurable import (\n    ConfiguredDefinitionConfigSchema,\n    NamedConfigurableDefinition,\n)\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.executor.base import Executor\n    from dagster._core.executor.in_process import InProcessExecutor\n    from dagster._core.executor.init import InitExecutorContext\n    from dagster._core.executor.multiprocess import MultiprocessExecutor\n    from dagster._core.instance import DagsterInstance\n\n\nclass ExecutorRequirement(PyEnum):\n    """An ExecutorDefinition can include a list of requirements that the system uses to\n    check whether the executor will be able to work for a particular job execution.\n    """\n\n    # The passed in IJob must be reconstructable across process boundaries\n    RECONSTRUCTABLE_PIPELINE = (  # This needs to still exist for folks who may have written their own executor\n        "RECONSTRUCTABLE_PIPELINE"\n    )\n    RECONSTRUCTABLE_JOB = "RECONSTRUCTABLE_PIPELINE"\n\n    # The DagsterInstance must be loadable in a different process\n    NON_EPHEMERAL_INSTANCE = "NON_EPHEMERAL_INSTANCE"\n\n    # Any op 
outputs on the job must be persisted\n    PERSISTENT_OUTPUTS = "PERSISTENT_OUTPUTS"\n\n\ndef multiple_process_executor_requirements() -> Sequence[ExecutorRequirement]:\n    return [\n        ExecutorRequirement.RECONSTRUCTABLE_JOB,\n        ExecutorRequirement.NON_EPHEMERAL_INSTANCE,\n        ExecutorRequirement.PERSISTENT_OUTPUTS,\n    ]\n\n\nExecutorConfig = Mapping[str, object]\nExecutorCreationFunction: TypeAlias = Callable[["InitExecutorContext"], "Executor"]\nExecutorRequirementsFunction: TypeAlias = Callable[[ExecutorConfig], Sequence[ExecutorRequirement]]\n\n\n
[docs]class ExecutorDefinition(NamedConfigurableDefinition):\n """An executor is responsible for executing the steps of a job.\n\n Args:\n name (str): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data\n available in `init_context.executor_config`. If not set, Dagster will accept any config\n provided.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n executor_creation_fn(Optional[Callable]): Should accept an :py:class:`InitExecutorContext`\n and return an instance of :py:class:`Executor`\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the\n executor.\n description (Optional[str]): A description of the executor.\n """\n\n def __init__(\n self,\n name: str,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Union[\n ExecutorRequirementsFunction, Optional[Sequence[ExecutorRequirement]]\n ] = None,\n executor_creation_fn: Optional[ExecutorCreationFunction] = None,\n description: Optional[str] = None,\n ):\n self._name = check.str_param(name, "name")\n self._requirements_fn: ExecutorRequirementsFunction\n if callable(requirements):\n self._requirements_fn = requirements\n else:\n requirements_lst = check.opt_list_param(\n requirements, "requirements", of_type=ExecutorRequirement\n )\n self._requirements_fn = lambda _: requirements_lst\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._executor_creation_fn = check.opt_callable_param(\n executor_creation_fn, "executor_creation_fn"\n )\n self._description = check.opt_str_param(description, "description")\n\n @public\n @property\n def name(self) -> str:\n """Name of the executor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Description of executor, if provided."""\n return self._description\n\n @property\n def 
config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n def get_requirements(\n self, executor_config: Mapping[str, object]\n ) -> Sequence[ExecutorRequirement]:\n return self._requirements_fn(executor_config)\n\n @public\n @property\n def executor_creation_fn(self) -> Optional[ExecutorCreationFunction]:\n """Callable that takes an :py:class:`InitExecutorContext` and returns an instance of\n :py:class:`Executor`.\n """\n return self._executor_creation_fn\n\n def copy_for_configured(self, name, description, config_schema) -> "ExecutorDefinition":\n return ExecutorDefinition(\n name=name,\n config_schema=config_schema, # type: ignore\n executor_creation_fn=self.executor_creation_fn,\n description=description or self.description,\n requirements=self._requirements_fn,\n )\n\n @staticmethod\n def hardcoded_executor(executor: "Executor"):\n return ExecutorDefinition(\n # Executor name was only relevant in the pipeline/solid/mode world, so we\n # can put a dummy value\n name="__executor__",\n executor_creation_fn=lambda _init_context: executor,\n )\n\n # Backcompat: Overrides configured method to provide name as a keyword argument.\n # If no name is provided, the name is pulled off of this ExecutorDefinition.\n
[docs] @public\n def configured(\n self,\n config_or_config_fn: Any,\n name: Optional[str] = None,\n config_schema: Optional[UserConfigSchema] = None,\n description: Optional[str] = None,\n ) -> Self:\n """Wraps this object in an object of the same type that provides configuration to the inner\n object.\n\n Using ``configured`` may result in config values being displayed in\n the Dagster UI, so it is not recommended to use this API with sensitive values,\n such as secrets.\n\n Args:\n config_or_config_fn (Union[Any, Callable[[Any], Any]]): Either (1) Run configuration\n that fully satisfies this object's config schema or (2) A function that accepts run\n configuration and returns run configuration that fully satisfies this object's\n config schema. In the latter case, config_schema must be specified. When\n passing a function, it's easiest to use :py:func:`configured`.\n name (Optional[str]): Name of the new definition. If not provided, the emitted\n definition will inherit the name of the `ExecutorDefinition` upon which this\n function is called.\n config_schema (Optional[ConfigSchema]): If config_or_config_fn is a function, the config\n schema that its input must satisfy. If not set, Dagster will accept any config\n provided.\n description (Optional[str]): Description of the new definition. If not specified,\n inherits the description of the definition being configured.\n\n Returns (ConfigurableDefinition): A configured version of this object.\n """\n name = check.opt_str_param(name, "name")\n\n new_config_schema = ConfiguredDefinitionConfigSchema(\n self, convert_user_facing_definition_config_schema(config_schema), config_or_config_fn\n )\n\n return self.copy_for_configured(name or self.name, description, new_config_schema)
\n\n\n@overload\ndef executor(name: ExecutorCreationFunction) -> ExecutorDefinition: ...\n\n\n@overload\ndef executor(\n name: Optional[str] = ...,\n config_schema: Optional[UserConfigSchema] = ...,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = ...,\n) -> "_ExecutorDecoratorCallable": ...\n\n\n
[docs]def executor(\n name: Union[ExecutorCreationFunction, Optional[str]] = None,\n config_schema: Optional[UserConfigSchema] = None,\n requirements: Optional[\n Union[ExecutorRequirementsFunction, Sequence[ExecutorRequirement]]\n ] = None,\n) -> Union[ExecutorDefinition, "_ExecutorDecoratorCallable"]:\n """Define an executor.\n\n The decorated function should accept an :py:class:`InitExecutorContext` and return an instance\n of :py:class:`Executor`.\n\n Args:\n name (Optional[str]): The name of the executor.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.executor_config`. If not set, Dagster will accept any config provided for.\n requirements (Optional[List[ExecutorRequirement]]): Any requirements that must\n be met in order for the executor to be usable for a particular job execution.\n """\n if callable(name):\n check.invariant(config_schema is None)\n check.invariant(requirements is None)\n return _ExecutorDecoratorCallable()(name)\n\n return _ExecutorDecoratorCallable(\n name=name, config_schema=config_schema, requirements=requirements\n )
\n\n\nclass _ExecutorDecoratorCallable:\n def __init__(self, name=None, config_schema=None, requirements=None):\n self.name = check.opt_str_param(name, "name")\n self.config_schema = config_schema # type check in definition\n self.requirements = requirements\n\n def __call__(self, fn: ExecutorCreationFunction) -> ExecutorDefinition:\n check.callable_param(fn, "fn")\n\n if not self.name:\n self.name = fn.__name__\n\n executor_def = ExecutorDefinition(\n name=self.name,\n config_schema=self.config_schema,\n executor_creation_fn=fn,\n requirements=self.requirements,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(executor_def, wrapped=fn) # type: ignore\n\n return executor_def\n\n\ndef _core_in_process_executor_creation(config: ExecutorConfig) -> "InProcessExecutor":\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n # shouldn't need to .get() here - issue with defaults in config setup\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore # (possible none)\n marker_to_close=config.get("marker_to_close"), # type: ignore # (should be str)\n )\n\n\nIN_PROC_CONFIG = Field(\n {\n "retries": get_retries_config(),\n "marker_to_close": Field(\n str,\n is_required=False,\n description="[DEPRECATED]",\n ),\n },\n description="Execute all steps in a single process.",\n)\n\n\n
[docs]@executor(\n name="in_process",\n config_schema=IN_PROC_CONFIG,\n)\ndef in_process_executor(init_context):\n """The in-process executor executes all steps in a single process.\n\n To select it, include the following top-level fragment in config:\n\n .. code-block:: yaml\n\n execution:\n in_process:\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_in_process_executor_creation(init_context.executor_config)
\n\n\n@executor(name="execute_in_process_executor")\ndef execute_in_process_executor(_) -> "InProcessExecutor":\n """Executor used by execute_in_process.\n\n Use of this executor triggers special behavior in the config system that ignores all incoming\n executor config. This is because someone might set executor config on a job, and when we foist\n this executor onto the job for `execute_in_process`, that config becomes nonsensical.\n """\n from dagster._core.executor.in_process import InProcessExecutor\n\n return InProcessExecutor(\n retries=RetryMode.ENABLED,\n marker_to_close=None,\n )\n\n\ndef _core_multiprocess_executor_creation(config: ExecutorConfig) -> "MultiprocessExecutor":\n from dagster._core.executor.multiprocess import MultiprocessExecutor\n\n # unpack optional selector\n start_method = None\n start_cfg: Dict[str, object] = {}\n start_selector = check.opt_dict_elem(config, "start_method")\n if start_selector:\n start_method, start_cfg = next(iter(start_selector.items()))\n\n return MultiprocessExecutor(\n max_concurrent=check.opt_int_elem(config, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(config, "tag_concurrency_limits"),\n retries=RetryMode.from_config(check.dict_elem(config, "retries")), # type: ignore\n start_method=start_method,\n explicit_forkserver_preload=check.opt_list_elem(start_cfg, "preload_modules", of_type=str),\n )\n\n\nMULTI_PROC_CONFIG = Field(\n {\n "max_concurrent": Field(\n Noneable(Int),\n default_value=None,\n description=(\n "The number of processes that may run concurrently. 
"\n "By default, this is set to be the return value of `multiprocessing.cpu_count()`."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n "start_method": Field(\n Selector(\n fields={\n "spawn": Field(\n {},\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `spawn`."\n ),\n ),\n "forkserver": Field(\n {\n "preload_modules": Field(\n [str],\n is_required=False,\n description=(\n "Explicitly specify the modules to preload in the forkserver."\n " Otherwise, there are two cases for default values if modules"\n " are not specified. If the Dagster job was loaded from a"\n " module, the same module will be preloaded. If not, the"\n " `dagster` module is preloaded."\n ),\n ),\n },\n description=(\n "Configure the multiprocess executor to start subprocesses "\n "using `forkserver`."\n ),\n ),\n # fork currently unsupported due to threads usage\n }\n ),\n is_required=False,\n description=(\n "Select how subprocesses are created. By default, `spawn` is selected. See "\n "https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods."\n ),\n ),\n "retries": get_retries_config(),\n },\n description="Execute each step in an individual process.",\n)\n\n\n
[docs]@executor(\n name="multiprocess",\n config_schema=MULTI_PROC_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef multiprocess_executor(init_context):\n """The multiprocess executor executes each step in an individual process.\n\n Any job that does not specify custom executors will use the multiprocess_executor by default.\n To configure the multiprocess executor, include a fragment such as the following in your run\n config:\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be None or 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n return _core_multiprocess_executor_creation(init_context.executor_config)
\n\n\ndef check_cross_process_constraints(init_context: "InitExecutorContext") -> None:\n from dagster._core.executor.init import InitExecutorContext\n\n check.inst_param(init_context, "init_context", InitExecutorContext)\n requirements_lst = init_context.executor_def.get_requirements(init_context.executor_config)\n\n if ExecutorRequirement.RECONSTRUCTABLE_JOB in requirements_lst:\n _check_intra_process_job(init_context.job)\n\n if ExecutorRequirement.NON_EPHEMERAL_INSTANCE in requirements_lst:\n _check_non_ephemeral_instance(init_context.instance)\n\n\ndef _check_intra_process_job(job: IJob) -> None:\n if not isinstance(job, ReconstructableJob):\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with the job"\n f' "{job.get_definition().name}" that is not reconstructable. Job must be loaded in a'\n " way that allows dagster to reconstruct them in a new process. This means: \\n *"\n " using the file, module, or workspace.yaml arguments of"\n " dagster-webserver/dagster-graphql/dagster\\n * loading the job through the"\n " reconstructable() function\\n"\n )\n\n\ndef _check_non_ephemeral_instance(instance: "DagsterInstance") -> None:\n if instance.is_ephemeral:\n raise DagsterUnmetExecutorRequirementsError(\n "You have attempted to use an executor that uses multiple processes with an ephemeral"\n " DagsterInstance. A non-ephemeral instance is needed to coordinate execution between"\n " multiple processes. You can configure your default instance via $DAGSTER_HOME or"\n " ensure a valid one is passed when invoking the python APIs. 
You can learn more about"\n " setting up a persistent DagsterInstance from the DagsterInstance docs here:"\n " https://docs.dagster.io/deployment/dagster-instance#default-local-behavior"\n )\n\n\ndef _get_default_executor_requirements(\n executor_config: ExecutorConfig,\n) -> Sequence[ExecutorRequirement]:\n return multiple_process_executor_requirements() if "multiprocess" in executor_config else []\n\n\n
[docs]@executor(\n name="multi_or_in_process_executor",\n config_schema=Field(\n Selector(\n {"multiprocess": MULTI_PROC_CONFIG, "in_process": IN_PROC_CONFIG},\n ),\n default_value={"multiprocess": {}},\n ),\n requirements=_get_default_executor_requirements,\n)\ndef multi_or_in_process_executor(init_context: "InitExecutorContext") -> "Executor":\n """The default executor for a job.\n\n This is the executor available by default on a :py:class:`JobDefinition`\n that does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\n single-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\n mode and in-process mode can be achieved via config.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n\n\n execution:\n config:\n in_process:\n\n When using the multiprocess mode, ``max_concurrent`` and ``retries`` can also be configured.\n\n .. code-block:: yaml\n\n execution:\n config:\n multiprocess:\n max_concurrent: 4\n retries:\n enabled:\n\n The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run\n concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of\n :py:func:`python:multiprocessing.cpu_count`.\n\n When using the in_process mode, then only retries can be configured.\n\n Execution priority can be configured using the ``dagster/priority`` tag via op metadata,\n where the higher the number the higher the priority. 0 is the default and both positive\n and negative numbers can be used.\n """\n if "multiprocess" in init_context.executor_config:\n return _core_multiprocess_executor_creation(\n check.dict_elem(init_context.executor_config, "multiprocess")\n )\n else:\n return _core_in_process_executor_creation(\n check.dict_elem(init_context.executor_config, "in_process")\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/executor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.executor_definition"}, "freshness_policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy

\nimport datetime\nfrom typing import AbstractSet, NamedTuple, Optional\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import experimental\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.schedules import (\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom .events import AssetKey\n\n\nclass FreshnessConstraint(NamedTuple):\n    asset_keys: AbstractSet[AssetKey]\n    required_data_time: datetime.datetime\n    required_by_time: datetime.datetime\n\n\nclass FreshnessMinutes(NamedTuple):\n    overdue_minutes: float\n    lag_minutes: float\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass FreshnessPolicy(\n NamedTuple(\n "_FreshnessPolicy",\n [\n ("maximum_lag_minutes", float),\n ("cron_schedule", Optional[str]),\n ("cron_schedule_timezone", Optional[str]),\n ],\n )\n):\n """A FreshnessPolicy specifies how up-to-date you want a given asset to be.\n\n Attaching a FreshnessPolicy to an asset definition encodes an expectation on the upstream data\n that you expect to be incorporated into the current state of that asset at certain points in time.\n How this is calculated differs depending on if the asset is unpartitioned or time-partitioned\n (other partitioning schemes are not supported).\n\n For time-partitioned assets, the current data time for the asset is simple to calculate. The\n upstream data that is incorporated into the asset is exactly the set of materialized partitions\n for that asset. Thus, the current data time for the asset is simply the time up to which all\n partitions have been materialized.\n\n For unpartitioned assets, the current data time is based on the upstream materialization records\n that were read to generate the current state of the asset. More specifically,\n imagine you have two assets, where A depends on B. If `B` has a FreshnessPolicy defined, this\n means that at time T, the most recent materialization of `B` should have come after a\n materialization of `A` which was no more than `maximum_lag_minutes` ago. This calculation is\n recursive: any given asset is expected to incorporate up-to-date data from all of its upstream\n assets.\n\n It is assumed that all asset definitions with no upstream asset definitions consume from some\n always-updating source. That is, if you materialize that asset at time T, it will incorporate\n all data up to time T.\n\n If `cron_schedule` is not defined, the given asset will be expected to incorporate upstream\n data from no more than `maximum_lag_minutes` ago at all points in time. 
For example, "The events\n table should always have data from at most 1 hour ago".\n\n If `cron_schedule` is defined, the given asset will be expected to incorporate upstream data\n from no more than `maximum_lag_minutes` ago at each cron schedule tick. For example, "By 9AM,\n the signups table should contain all of yesterday's data".\n\n The freshness status of assets with policies defined will be visible in the UI. If you are using\n an asset reconciliation sensor, this sensor will kick off runs to help keep your assets up to\n date with respect to their FreshnessPolicy.\n\n Args:\n maximum_lag_minutes (float): An upper bound for how old the data contained within this\n asset may be.\n cron_schedule (Optional[str]): A cron schedule string (e.g. ``"0 1 * * *"``) specifying a\n series of times by which the `maximum_lag_minutes` constraint must be satisfied. If\n no cron schedule is provided, then this constraint must be satisfied at all times.\n cron_schedule_timezone (Optional[str]): Timezone in which the cron schedule should be evaluated.\n If not specified, defaults to UTC. Supported strings for timezones are the ones provided\n by the `IANA time zone database <https://www.iana.org/time-zones>` - e.g.\n "America/Los_Angeles".\n\n .. 
code-block:: python\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def fresh_asset():\n ...\n\n # At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n @asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\n def cron_up_to_date_asset():\n ...\n\n """\n\n def __new__(\n cls,\n *,\n maximum_lag_minutes: float,\n cron_schedule: Optional[str] = None,\n cron_schedule_timezone: Optional[str] = None,\n ):\n if cron_schedule is not None:\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(f"Invalid cron schedule '{cron_schedule}'.")\n check.param_invariant(\n is_valid_cron_schedule(cron_schedule),\n "cron_schedule",\n f"Invalid cron schedule '{cron_schedule}'.",\n )\n if cron_schedule_timezone is not None:\n check.param_invariant(\n cron_schedule is not None,\n "cron_schedule_timezone",\n "Cannot specify cron_schedule_timezone without a cron_schedule.",\n )\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(cron_schedule_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n "Invalid cron schedule timezone '{cron_schedule_timezone}'. "\n ) from e\n return super(FreshnessPolicy, cls).__new__(\n cls,\n maximum_lag_minutes=float(\n check.numeric_param(maximum_lag_minutes, "maximum_lag_minutes")\n ),\n cron_schedule=check.opt_str_param(cron_schedule, "cron_schedule"),\n cron_schedule_timezone=check.opt_str_param(\n cron_schedule_timezone, "cron_schedule_timezone"\n ),\n )\n\n @classmethod\n def _create(cls, *args):\n """Pickle requires a method with positional arguments to construct\n instances of a class. 
Since the constructor for this class has\n keyword arguments only, we define this method to be used by pickle.\n """\n return cls(maximum_lag_minutes=args[0], cron_schedule=args[1])\n\n def __reduce__(self):\n return (self._create, (self.maximum_lag_minutes, self.cron_schedule))\n\n @property\n def maximum_lag_delta(self) -> datetime.timedelta:\n return datetime.timedelta(minutes=self.maximum_lag_minutes)\n\n def get_evaluation_tick(\n self,\n evaluation_time: datetime.datetime,\n ) -> Optional[datetime.datetime]:\n if self.cron_schedule:\n # most recent cron schedule tick\n schedule_ticks = reverse_cron_string_iterator(\n end_timestamp=evaluation_time.timestamp(),\n cron_string=self.cron_schedule,\n execution_timezone=self.cron_schedule_timezone,\n )\n return next(schedule_ticks)\n else:\n return evaluation_time\n\n def minutes_overdue(\n self,\n data_time: Optional[datetime.datetime],\n evaluation_time: datetime.datetime,\n ) -> Optional[FreshnessMinutes]:\n """Returns a number of minutes past the specified freshness policy that this asset currently\n is. If the asset is missing upstream data, or is not materialized at all, then it is unknown\n how overdue it is, and this will return None.\n\n Args:\n data_time (Optional[datetime]): The timestamp of the data that was used to create the\n current version of this asset.\n evaluation_time (datetime): The time at which we're evaluating the overdueness of this\n asset. Generally, this is the current time.\n """\n if data_time is None:\n return None\n evaluation_tick = self.get_evaluation_tick(evaluation_time)\n if evaluation_tick is None:\n return None\n required_time = evaluation_tick - self.maximum_lag_delta\n\n return FreshnessMinutes(\n lag_minutes=max(0.0, (evaluation_tick - data_time).total_seconds() / 60),\n overdue_minutes=max(0.0, (required_time - data_time).total_seconds() / 60),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy"}, "freshness_policy_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.freshness_policy_sensor_definition

\nfrom typing import Callable, Dict, Mapping, NamedTuple, Optional, Set, cast\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.data_time import CachingDataTimeResolver\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    FreshnessPolicySensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\n\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\n\n\n@whitelist_for_serdes\nclass FreshnessPolicySensorCursor(\n    NamedTuple(\n        "_FreshnessPolicySensorCursor",\n        [("minutes_late_by_key_str", Mapping[str, Optional[float]])],\n    )\n):\n    def __new__(cls, minutes_late_by_key_str: Mapping[str, Optional[float]]):\n        return super(FreshnessPolicySensorCursor, cls).__new__(\n            cls,\n            minutes_late_by_key_str=check.mapping_param(\n                minutes_late_by_key_str, "minutes_late_by_key_str", key_type=str\n            ),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            
deserialize_value(json_str, FreshnessPolicySensorCursor)\n            return True\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    @staticmethod\n    def from_dict(\n        minutes_late_by_key: Mapping[AssetKey, Optional[float]]\n    ) -> "FreshnessPolicySensorCursor":\n        return FreshnessPolicySensorCursor(\n            minutes_late_by_key_str={k.to_user_string(): v for k, v in minutes_late_by_key.items()}\n        )\n\n    @property\n    def minutes_late_by_key(self) -> Mapping[AssetKey, Optional[float]]:\n        return {AssetKey.from_user_string(k): v for k, v in self.minutes_late_by_key_str.items()}\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "FreshnessPolicySensorCursor":\n        return deserialize_value(json_str, FreshnessPolicySensorCursor)\n\n\n
[docs]class FreshnessPolicySensorContext(\n NamedTuple(\n "_FreshnessPolicySensorContext",\n [\n ("sensor_name", PublicAttr[str]),\n ("asset_key", PublicAttr[AssetKey]),\n ("freshness_policy", PublicAttr[FreshnessPolicy]),\n ("minutes_overdue", PublicAttr[Optional[float]]),\n ("previous_minutes_overdue", PublicAttr[Optional[float]]),\n ("instance", PublicAttr[DagsterInstance]),\n ("resources", Resources),\n ],\n )\n):\n """The ``context`` object available to a decorated function of ``freshness_policy_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n asset_key (AssetKey): the key of the asset being monitored\n freshness_policy (FreshnessPolicy): the freshness policy of the asset being monitored\n minutes_overdue (Optional[float])\n previous_minutes_overdue (Optional[float]): the minutes_overdue value for this asset on the\n previous sensor tick.\n instance (DagsterInstance): the current instance.\n """\n\n def __new__(\n cls,\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float],\n instance: DagsterInstance,\n resources: Optional[Resources] = None,\n ):\n minutes_overdue = check.opt_numeric_param(minutes_overdue, "minutes_overdue")\n previous_minutes_overdue = check.opt_numeric_param(\n previous_minutes_overdue, "previous_minutes_overdue"\n )\n return super(FreshnessPolicySensorContext, cls).__new__(\n cls,\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n freshness_policy=check.inst_param(freshness_policy, "FreshnessPolicy", FreshnessPolicy),\n minutes_overdue=float(minutes_overdue) if minutes_overdue is not None else None,\n previous_minutes_overdue=(\n float(previous_minutes_overdue) if previous_minutes_overdue is not None else None\n ),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n resources=resources or 
ScopedResourcesBuilder.build_empty(),\n )
\n\n\n
[docs]@experimental\ndef build_freshness_policy_sensor_context(\n sensor_name: str,\n asset_key: AssetKey,\n freshness_policy: FreshnessPolicy,\n minutes_overdue: Optional[float],\n previous_minutes_overdue: Optional[float] = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Resources] = None,\n) -> FreshnessPolicySensorContext:\n """Builds freshness policy sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@freshness_policy_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n asset_key (AssetKey): The AssetKey for the monitored asset\n freshness_policy (FreshnessPolicy): The FreshnessPolicy for the monitored asset\n minutes_overdue (Optional[float]): How overdue the monitored asset currently is\n previous_minutes_overdue (Optional[float]): How overdue the monitored asset was on the\n previous tick.\n instance (DagsterInstance): The dagster instance configured for the context.\n\n Examples:\n .. code-block:: python\n\n context = build_freshness_policy_sensor_context(\n sensor_name="freshness_policy_sensor_to_invoke",\n asset_key=AssetKey("some_asset"),\n freshness_policy=FreshnessPolicy(maximum_lag_minutes=30)<\n minutes_overdue=10.0,\n )\n freshness_policy_sensor_to_invoke(context)\n """\n return FreshnessPolicySensorContext(\n sensor_name=sensor_name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_overdue,\n previous_minutes_overdue=previous_minutes_overdue,\n instance=instance or DagsterInstance.ephemeral(),\n resources=resources,\n )
\n\n\n
[docs]class FreshnessPolicySensorDefinition(SensorDefinition):\n """Define a sensor that reacts to the status of a given set of asset freshness policies,\n where the decorated function will be evaluated on every sensor tick.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def __init__(\n self,\n name: str,\n asset_selection: AssetSelection,\n freshness_policy_sensor_fn: Callable[..., None],\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n check.str_param(name, "name")\n check.inst_param(asset_selection, "asset_selection", AssetSelection)\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n self._freshness_policy_sensor_fn = check.callable_param(\n freshness_policy_sensor_fn, "freshness_policy_sensor_fn"\n )\n\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(freshness_policy_sensor_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def 
_wrapped_fn(context: SensorEvaluationContext):\n from dagster._utils.caching_instance_queryer import (\n CachingInstanceQueryer, # expensive import\n )\n\n if context.repository_def is None:\n raise DagsterInvalidInvocationError(\n "The `repository_def` property on the `SensorEvaluationContext` passed into a "\n "`FreshnessPolicySensorDefinition` must not be None."\n )\n\n if context.cursor is None or not FreshnessPolicySensorCursor.is_valid(context.cursor):\n new_cursor = FreshnessPolicySensorCursor({})\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initializing {name}.")\n return\n\n evaluation_time = pendulum.now("UTC")\n asset_graph = context.repository_def.asset_graph\n instance_queryer = CachingInstanceQueryer(\n context.instance, asset_graph, evaluation_time\n )\n data_time_resolver = CachingDataTimeResolver(instance_queryer=instance_queryer)\n monitored_keys = asset_selection.resolve(asset_graph)\n\n # get the previous status from the cursor\n previous_minutes_late_by_key = FreshnessPolicySensorCursor.from_json(\n context.cursor\n ).minutes_late_by_key\n\n minutes_late_by_key: Dict[AssetKey, Optional[float]] = {}\n for asset_key in monitored_keys:\n freshness_policy = asset_graph.freshness_policies_by_key.get(asset_key)\n if freshness_policy is None:\n continue\n\n # get the current minutes_overdue value for this asset\n result = data_time_resolver.get_minutes_overdue(\n evaluation_time=evaluation_time,\n asset_key=asset_key,\n )\n minutes_late_by_key[asset_key] = result.overdue_minutes if result else None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n context_param_name = get_context_param_name(freshness_policy_sensor_fn)\n freshness_context = FreshnessPolicySensorContext(\n sensor_name=name,\n asset_key=asset_key,\n freshness_policy=freshness_policy,\n minutes_overdue=minutes_late_by_key[asset_key],\n 
previous_minutes_overdue=previous_minutes_late_by_key.get(asset_key),\n instance=context.instance,\n resources=context.resources,\n )\n\n with user_code_error_boundary(\n FreshnessPolicySensorExecutionError,\n lambda: f'Error occurred during the execution of sensor "{name}".',\n ):\n context_param = (\n {context_param_name: freshness_context} if context_param_name else {}\n )\n result = freshness_policy_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is not None:\n raise DagsterInvalidDefinitionError(\n "Functions decorated by `@freshness_policy_sensor` may not return or yield"\n " a value."\n )\n\n context.update_cursor(\n FreshnessPolicySensorCursor.from_dict(minutes_late_by_key).to_json()\n )\n\n super(FreshnessPolicySensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> None:\n context_param_name = get_context_param_name(self._freshness_policy_sensor_fn)\n\n sensor_context = get_sensor_context_from_args_or_kwargs(\n self._freshness_policy_sensor_fn,\n args,\n kwargs,\n context_type=FreshnessPolicySensorContext,\n )\n context_param = (\n {context_param_name: sensor_context} if context_param_name and sensor_context else {}\n )\n\n resources = validate_and_get_resource_dict(\n sensor_context.resources if sensor_context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n return self._freshness_policy_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.FRESHNESS_POLICY
\n\n\n
[docs]@experimental\ndef freshness_policy_sensor(\n asset_selection: AssetSelection,\n *,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> Callable[[Callable[..., None]], FreshnessPolicySensorDefinition,]:\n """Define a sensor that reacts to the status of a given set of asset freshness policies, where the\n decorated function will be evaluated on every tick for each asset in the selection that has a\n FreshnessPolicy defined.\n\n Note: returning or yielding a value from the annotated function will result in an error.\n\n Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n\n Args:\n asset_selection (AssetSelection): The asset selection monitored by the sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.FreshnessPolicySensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n """\n\n def inner(fn: Callable[..., None]) -> FreshnessPolicySensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n return FreshnessPolicySensorDefinition(\n name=sensor_name,\n freshness_policy_sensor_fn=fn,\n asset_selection=asset_selection,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n )\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/freshness_policy_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.freshness_policy_sensor_definition"}, "graph_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.graph_definition

\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom toposort import CircularDependencyError, toposort_flatten\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.config import ConfigMapping\nfrom dagster._core.definitions.definition_config_schema import IDefinitionConfigSchema\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.selector.subset_selector import AssetSelectionData\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    DagsterTypeKind,\n    construct_dagster_type_dictionary,\n)\n\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    GraphNode,\n    Node,\n    NodeHandle,\n    NodeInput,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom .hook_definition import HookDefinition\nfrom .input import FanInInputPointer, InputDefinition, InputMapping, InputPointer\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import RawMetadataValue\nfrom .node_container import create_execution_structure, normalize_dependency_dict\nfrom .node_definition import NodeDefinition\nfrom .output import OutputDefinition, OutputMapping\nfrom .resource_requirement import ResourceRequirement\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.instance import DagsterInstance\n\n    from .asset_layer import AssetLayer\n    from .composition import PendingNodeInvocation\n    from .executor_definition import ExecutorDefinition\n    from .job_definition import 
JobDefinition\n    from .op_definition import OpDefinition\n    from .partition import PartitionedConfig, PartitionsDefinition\n    from .run_config import RunConfig\n    from .source_asset import SourceAsset\n\nT = TypeVar("T")\n\n\ndef _check_node_defs_arg(\n    graph_name: str, node_defs: Optional[Sequence[NodeDefinition]]\n) -> Sequence[NodeDefinition]:\n    node_defs = node_defs or []\n\n    _node_defs = check.opt_sequence_param(node_defs, "node_defs")\n    for node_def in _node_defs:\n        if isinstance(node_def, NodeDefinition):\n            continue\n        elif callable(node_def):\n            raise DagsterInvalidDefinitionError(\n                """You have passed a lambda or function {func} into {name} that is\n                not a node. You have likely forgetten to annotate this function with\n                the @op or @graph decorators.'\n                """.format(name=graph_name, func=node_def.__name__)\n            )\n        else:\n            raise DagsterInvalidDefinitionError(f"Invalid item in node list: {node_def!r}")\n\n    return node_defs\n\n\ndef create_adjacency_lists(\n    nodes: Sequence[Node],\n    dep_structure: DependencyStructure,\n) -> Tuple[Mapping[str, Set[str]], Mapping[str, Set[str]]]:\n    visit_dict = {s.name: False for s in nodes}\n    forward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n    backward_edges: Dict[str, Set[str]] = {s.name: set() for s in nodes}\n\n    def visit(node_name: str) -> None:\n        if visit_dict[node_name]:\n            return\n\n        visit_dict[node_name] = True\n\n        for node_output in dep_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = node_output.node.name\n            backward_node = node_name\n            if forward_node in forward_edges:\n                forward_edges[forward_node].add(backward_node)\n                backward_edges[backward_node].add(forward_node)\n                visit(forward_node)\n\n    for s in nodes:\n      
  visit(s.name)\n\n    return (forward_edges, backward_edges)\n\n\n
[docs]class GraphDefinition(NodeDefinition):\n """Defines a Dagster op graph.\n\n An op graph is made up of\n\n - Nodes, which can either be an op (the functional unit of computation), or another graph.\n - Dependencies, which determine how the values produced by nodes as outputs flow from\n one node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n (DAG) of compute.\n\n End users should prefer the :func:`@graph <graph>` decorator. GraphDefinition is generally\n intended to be used by framework authors or for programmatically generated graphs.\n\n Args:\n name (str): The name of the graph. Must be unique within any :py:class:`GraphDefinition`\n or :py:class:`JobDefinition` containing the graph.\n description (Optional[str]): A human-readable description of the graph.\n node_defs (Optional[Sequence[NodeDefinition]]): The set of ops / graphs used in this graph.\n dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the graph. 
Keys of the top level dict are either the string names of ops in the\n graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Defines the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Defines the outputs of the nested graph,\n and how they map from the outputs of its constituent ops.\n config (Optional[ConfigMapping]): Defines the config of the graph, and how its schema maps\n to the config of its constituent ops.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for any execution of the graph.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def return_one():\n return 1\n\n @op\n def add_one(num):\n return num + 1\n\n graph_def = GraphDefinition(\n name='basic',\n node_defs=[return_one, add_one],\n dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n )\n """\n\n _node_defs: Sequence[NodeDefinition]\n _dagster_type_dict: Mapping[str, DagsterType]\n _dependencies: DependencyMapping[NodeInvocation]\n _dependency_structure: DependencyStructure\n _node_dict: Mapping[str, Node]\n _input_mappings: Sequence[InputMapping]\n _output_mappings: Sequence[OutputMapping]\n _config_mapping: Optional[ConfigMapping]\n _nodes_in_topological_order: Sequence[Node]\n\n # (node name within the graph -> (input name -> SourceAsset to load that input from))\n # Does NOT include keys for:\n # - Inputs to the graph itself\n # - Inputs to nodes within sub-graphs of the graph\n _node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]]\n\n def __init__(\n self,\n name: str,\n *,\n description: Optional[str] = None,\n node_defs: Optional[Sequence[NodeDefinition]] = None,\n dependencies: Optional[\n Union[DependencyMapping[str], DependencyMapping[NodeInvocation]]\n ] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n **kwargs: Any,\n ):\n self._node_defs = _check_node_defs_arg(name, node_defs)\n\n # `dependencies` will be converted to `dependency_structure` and `node_dict`, which may\n # alternatively be passed directly (useful when copying)\n self._dependencies = normalize_dependency_dict(dependencies)\n self._dependency_structure, self._node_dict = create_execution_structure(\n self._node_defs, self._dependencies, graph_definition=self\n )\n\n # Sequence[InputMapping]\n self._input_mappings = 
check.opt_sequence_param(input_mappings, "input_mappings")\n input_defs = _validate_in_mappings(\n self._input_mappings,\n self._node_dict,\n self._dependency_structure,\n name,\n class_name=type(self).__name__,\n )\n\n # Sequence[OutputMapping]\n self._output_mappings, output_defs = _validate_out_mappings(\n check.opt_sequence_param(output_mappings, "output_mappings"),\n self._node_dict,\n name,\n class_name=type(self).__name__,\n )\n\n self._config_mapping = check.opt_inst_param(config, "config", ConfigMapping)\n\n super(GraphDefinition, self).__init__(\n name=name,\n description=description,\n input_defs=input_defs,\n output_defs=output_defs,\n tags=tags,\n **kwargs,\n )\n\n # must happen after base class construction as properties are assumed to be there\n # eager computation to detect cycles\n self._nodes_in_topological_order = self._get_nodes_in_topological_order()\n self._dagster_type_dict = construct_dagster_type_dictionary([self])\n self._node_input_source_assets = check.opt_mapping_param(\n node_input_source_assets, "node_input_source_assets", key_type=str, value_type=dict\n )\n\n def _get_nodes_in_topological_order(self) -> Sequence[Node]:\n _forward_edges, backward_edges = create_adjacency_lists(\n self.nodes, self.dependency_structure\n )\n\n try:\n order = toposort_flatten(backward_edges)\n except CircularDependencyError as err:\n raise DagsterInvalidDefinitionError(str(err)) from err\n\n return [self.node_named(node_name) for node_name in order]\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n unresolveable_input_defs: List[InputDefinition] = []\n for node in self.node_dict.values():\n cur_handle = NodeHandle(node.name, handle)\n for input_def in node.definition.get_inputs_must_be_resolved_top_level(\n asset_layer, cur_handle\n ):\n if self.dependency_structure.has_deps(NodeInput(node, input_def)):\n continue\n elif not 
node.container_maps_input(input_def.name):\n raise DagsterInvalidDefinitionError(\n f"Input '{input_def.name}' of {node.describe_node()} "\n "has no way of being resolved. Must provide a resolution to this "\n "input via another op/graph, or via a direct input value mapped from the "\n "top-level graph. To "\n "learn more, see the docs for unconnected inputs: "\n "https://docs.dagster.io/concepts/io-management/unconnected-inputs#unconnected-inputs."\n )\n else:\n mapped_input = node.container_mapped_input(input_def.name)\n unresolveable_input_defs.append(mapped_input.get_definition())\n return unresolveable_input_defs\n\n @property\n def node_type_str(self) -> str:\n return "graph"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @property\n def nodes(self) -> Sequence[Node]:\n return list(set(self._node_dict.values()))\n\n @property\n def node_dict(self) -> Mapping[str, Node]:\n return self._node_dict\n\n @property\n def node_defs(self) -> Sequence[NodeDefinition]:\n return self._node_defs\n\n @property\n def nodes_in_topological_order(self) -> Sequence[Node]:\n return self._nodes_in_topological_order\n\n @property\n def node_input_source_assets(self) -> Mapping[str, Mapping[str, "SourceAsset"]]:\n return self._node_input_source_assets\n\n def has_node_named(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._node_dict\n\n def node_named(self, name: str) -> Node:\n check.str_param(name, "name")\n if name not in self._node_dict:\n raise DagsterInvariantViolationError(f"{self._name} has no op named {name}.")\n\n return self._node_dict[name]\n\n def get_node(self, handle: NodeHandle) -> Node:\n check.inst_param(handle, "handle", NodeHandle)\n current = handle\n lineage: List[str] = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n name = lineage.pop()\n node = self.node_named(name)\n while lineage:\n name = lineage.pop()\n # We know that this is a current node is a graph while 
ascending lineage\n definition = cast(GraphDefinition, node.definition)\n node = definition.node_named(name)\n\n return node\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_node_defs()\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n for outer_node_def in self._node_defs:\n yield from outer_node_def.iterate_op_defs()\n\n def iterate_node_handles(\n self, parent_node_handle: Optional[NodeHandle] = None\n ) -> Iterator[NodeHandle]:\n for node in self.node_dict.values():\n cur_node_handle = NodeHandle(node.name, parent_node_handle)\n if isinstance(node, GraphNode):\n yield from node.definition.iterate_node_handles(cur_node_handle)\n yield cur_node_handle\n\n @public\n @property\n def input_mappings(self) -> Sequence[InputMapping]:\n """Input mappings for the graph.\n\n An input mapping is a mapping from an input of the graph to an input of a child node.\n """\n return self._input_mappings\n\n @public\n @property\n def output_mappings(self) -> Sequence[OutputMapping]:\n """Output mappings for the graph.\n\n An output mapping is a mapping from an output of the graph to an output of a child node.\n """\n return self._output_mappings\n\n @public\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n """The config mapping for the graph, if present.\n\n By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.\n """\n return self._config_mapping\n\n @property\n def has_config_mapping(self) -> bool:\n return self._config_mapping is not None\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._dagster_type_dict.values()\n\n def has_dagster_type(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._dagster_type_dict\n\n def dagster_type_named(self, name: str) -> DagsterType:\n check.str_param(name, "name")\n return 
self._dagster_type_dict[name]\n\n def get_input_mapping(self, input_name: str) -> InputMapping:\n check.str_param(input_name, "input_name")\n for mapping in self._input_mappings:\n if mapping.graph_input_name == input_name:\n return mapping\n check.failed(f"Could not find input mapping {input_name}")\n\n def input_mapping_for_pointer(\n self, pointer: Union[InputPointer, FanInInputPointer]\n ) -> Optional[InputMapping]:\n check.inst_param(pointer, "pointer", (InputPointer, FanInInputPointer))\n\n for mapping in self._input_mappings:\n if mapping.maps_to == pointer:\n return mapping\n return None\n\n def get_output_mapping(self, output_name: str) -> OutputMapping:\n check.str_param(output_name, "output_name")\n for mapping in self._output_mappings:\n if mapping.graph_output_name == output_name:\n return mapping\n check.failed(f"Could not find output mapping {output_name}")\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: Optional[NodeHandle]\n ) -> Tuple[OutputDefinition, Optional[NodeHandle]]:\n check.str_param(output_name, "output_name")\n check.opt_inst_param(handle, "handle", NodeHandle)\n\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n mapped_node = self.node_named(mapping.maps_from.node_name)\n return mapped_node.definition.resolve_output_to_origin(\n mapping.maps_from.output_name,\n NodeHandle(mapped_node.name, handle),\n )\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n mapping = self.get_output_mapping(output_name)\n check.invariant(mapping, "Can only resolve outputs for valid output names")\n return self.node_named(\n mapping.maps_from.node_name\n ).definition.resolve_output_to_origin_op_def(output_name)\n\n def default_value_for_input(self, input_name: str) -> object:\n check.str_param(input_name, "input_name")\n\n # base case\n if 
self.input_def_named(input_name).has_default_value:\n return self.input_def_named(input_name).default_value\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.default_value_for_input(mapping.maps_to.input_name)\n\n def input_has_default(self, input_name: str) -> bool:\n check.str_param(input_name, "input_name")\n\n # base case\n if self.input_def_named(input_name).has_default_value:\n return True\n\n mapping = self.get_input_mapping(input_name)\n check.invariant(mapping, "Can only resolve inputs for valid input names")\n mapped_node = self.node_named(mapping.maps_to.node_name)\n\n return mapped_node.definition.input_has_default(mapping.maps_to.input_name)\n\n @property\n def dependencies(self) -> DependencyMapping[NodeInvocation]:\n return self._dependencies\n\n @property\n def dependency_structure(self) -> DependencyStructure:\n return self._dependency_structure\n\n @property\n def config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self.config_mapping.config_schema if self.config_mapping is not None else None\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n mapping = self.get_input_mapping(input_name)\n target_node = mapping.maps_to.node_name\n # check if input mapped to node which is downstream of another dynamic output within\n if self.dependency_structure.is_dynamic_mapped(target_node):\n return False\n\n # check if input mapped to node which starts new dynamic downstream\n if self.dependency_structure.has_dynamic_downstreams(target_node):\n return False\n\n return self.node_named(target_node).definition.input_supports_dynamic_output_dep(\n mapping.maps_to.input_name\n )\n\n def copy(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n input_mappings: Optional[Sequence[InputMapping]] = None,\n output_mappings: 
Optional[Sequence[OutputMapping]] = None,\n config: Optional[ConfigMapping] = None,\n tags: Optional[Mapping[str, str]] = None,\n node_input_source_assets: Optional[Mapping[str, Mapping[str, "SourceAsset"]]] = None,\n ) -> Self:\n return GraphDefinition(\n node_defs=self.node_defs,\n dependencies=self.dependencies,\n name=name or self.name,\n description=description or self.description,\n input_mappings=input_mappings or self._input_mappings,\n output_mappings=output_mappings or self._output_mappings,\n config=config or self.config_mapping,\n tags=tags or self.tags,\n node_input_source_assets=node_input_source_assets or self.node_input_source_assets,\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: Any,\n ) -> "GraphDefinition":\n if not self.has_config_mapping:\n raise DagsterInvalidDefinitionError(\n "Only graphs utilizing config mapping can be pre-configured. The graph "\n f'"{self.name}" does not have a config mapping, and thus has nothing to be '\n "configured."\n )\n config_mapping = cast(ConfigMapping, self.config_mapping)\n return self.copy(\n name=name,\n description=check.opt_str_param(description, "description", default=self.description),\n config=ConfigMapping(\n config_mapping.config_fn,\n config_schema=config_schema,\n receive_processed_config_values=config_mapping.receive_processed_config_values,\n ),\n )\n\n def node_names(self) -> Sequence[str]:\n return list(self._node_dict.keys())\n\n
[docs] @public\n def to_job(\n self,\n name: Optional[str] = None,\n description: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n config: Optional[\n Union["RunConfig", ConfigMapping, Mapping[str, object], "PartitionedConfig"]\n ] = None,\n tags: Optional[Mapping[str, str]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n op_selection: Optional[Sequence[str]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n asset_layer: Optional["AssetLayer"] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _asset_selection_data: Optional[AssetSelectionData] = None,\n ) -> "JobDefinition":\n """Make this graph in to an executable Job by providing remaining components required for execution.\n\n Args:\n name (Optional[str]):\n The name for the Job. 
Defaults to the name of this graph.\n resource_defs (Optional[Mapping[str, object]]):\n Resources that are required by this graph for execution.\n If not defined, `io_manager` will default to filesystem.\n config:\n Describes how the job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, which should return\n configuration in the standard format to configure the job.\n\n If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config\n values that can parameterize the job, as well as a function for mapping those\n values to the base config. The values provided will be viewable and editable in the\n Dagster UI, so be careful with secrets.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. 
These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]):\n Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\n Keys must be strings, and values must be python primitive types or one of the provided\n MetadataValue types.\n logger_defs (Optional[Mapping[str, LoggerDefinition]]):\n A dictionary of string logger identifiers to their implementations.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.\n Only used if retry policy is not defined on the op definition or op invocation.\n version_strategy (Optional[VersionStrategy]):\n Defines how each op (and optionally, resource) in the job can be versioned. If\n provided, memoization will be enabled for this job.\n partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition\n keys that can parameterize the job. If this argument is supplied, the config\n argument can't also be supplied.\n asset_layer (Optional[AssetLayer]): Top level information about the assets this job\n will produce. 
Generally should not be set manually.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of a job.\n\n Returns:\n JobDefinition\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n from .job_definition import JobDefinition\n\n wrapped_resource_defs = wrap_resources_for_execution(resource_defs)\n\n return JobDefinition.dagster_internal_init(\n name=name,\n description=description or self.description,\n graph_def=self,\n resource_defs=wrapped_resource_defs,\n logger_defs=logger_defs,\n executor_def=executor_def,\n config=config,\n partitions_def=partitions_def,\n tags=tags,\n metadata=metadata,\n hook_defs=hooks,\n version_strategy=version_strategy,\n op_retry_policy=op_retry_policy,\n asset_layer=asset_layer,\n input_values=input_values,\n _subset_selection_data=_asset_selection_data,\n _was_explicitly_provided_resources=None, # None means this is determined by whether resource_defs contains any explicitly provided resources\n ).get_subset(op_selection=op_selection)
\n\n def coerce_to_job(self) -> "JobDefinition":\n # attempt to coerce a Graph in to a Job, raising a useful error if it doesn't work\n try:\n return self.to_job()\n except DagsterInvalidDefinitionError as err:\n raise DagsterInvalidDefinitionError(\n f"Failed attempting to coerce Graph {self.name} in to a Job. "\n "Use to_job instead, passing the required information."\n ) from err\n\n
[docs] @public\n def execute_in_process(\n self,\n run_config: Any = None,\n instance: Optional["DagsterInstance"] = None,\n resources: Optional[Mapping[str, object]] = None,\n raise_on_error: bool = True,\n op_selection: Optional[Sequence[str]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """Execute this graph in-process, collecting results in-memory.\n\n Args:\n run_config (Optional[Mapping[str, Any]]):\n Run config to provide to execution. The configuration for the underlying graph\n should exist under the "ops" key.\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n resources (Optional[Mapping[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single op\n names) to execute. 
For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the graph.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n from dagster._core.instance import DagsterInstance\n\n from .executor_definition import execute_in_process_executor\n from .job_definition import JobDefinition\n\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n resource_defs = wrap_resources_for_execution(resources)\n\n ephemeral_job = JobDefinition(\n name=self._name,\n graph_def=self,\n executor_def=execute_in_process_executor,\n resource_defs=resource_defs,\n input_values=input_values,\n ).get_subset(op_selection=op_selection)\n\n run_config = run_config if run_config is not None else {}\n op_selection = check.opt_sequence_param(op_selection, "op_selection", str)\n\n return ephemeral_job.execute_in_process(\n run_config=run_config,\n instance=instance,\n raise_on_error=raise_on_error,\n run_id=run_id,\n )
\n\n @property\n def parent_graph_def(self) -> Optional["GraphDefinition"]:\n return None\n\n @property\n def is_subselected(self) -> bool:\n return False\n\n def get_resource_requirements(\n self, asset_layer: Optional["AssetLayer"] = None\n ) -> Iterator[ResourceRequirement]:\n for node in self.node_dict.values():\n yield from node.get_resource_requirements(outer_container=self, asset_layer=asset_layer)\n\n for dagster_type in self.all_dagster_types():\n yield from dagster_type.get_resource_requirements()\n\n @public\n @property\n def name(self) -> str:\n """The name of the graph."""\n return super(GraphDefinition, self).name\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """The tags associated with the graph."""\n return super(GraphDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Aliases the graph with a new name.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.alias("my_graph_alias")\n """\n return super(GraphDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Attaches the provided tags to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.tag({"my_tag": "my_value"})\n """\n return super(GraphDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Attaches the provided hooks to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_hooks({my_hook})\n """\n return super(GraphDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Attaches the provided retry policy to the graph immutably.\n\n Can only be used in the context of a :py:func:`@graph <graph>`, :py:func:`@job <job>`, or :py:func:`@asset_graph <asset_graph>` decorated function.\n\n **Examples:**\n .. code-block:: python\n\n @job\n def do_it_all():\n my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n """\n return super(GraphDefinition, self).with_retry_policy(retry_policy)
\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n all_destinations: List[NodeInputHandle] = []\n for mapping in self.input_mappings:\n if mapping.graph_input_name != input_handle.input_name:\n continue\n # recurse into graph structure\n all_destinations += self.node_named(\n mapping.maps_to.node_name\n ).definition.resolve_input_to_destinations(\n NodeInputHandle(\n NodeHandle(mapping.maps_to.node_name, parent=input_handle.node_handle),\n mapping.maps_to.input_name,\n ),\n )\n\n return all_destinations
\n\n\nclass SubselectedGraphDefinition(GraphDefinition):\n """Defines a subselected graph.\n\n Args:\n parent_graph_def (GraphDefinition): The parent graph that this current graph is subselected\n from. This is used for tracking where the subselected graph originally comes from.\n Note that we allow subselecting a subselected graph, and this field refers to the direct\n parent graph of the current subselection, rather than the original root graph.\n node_defs (Optional[Sequence[NodeDefinition]]): A list of all top level nodes in the graph. A\n node can be an op or a graph that contains other nodes.\n dependencies (Optional[Mapping[Union[str, NodeInvocation], Mapping[str, IDependencyDefinition]]]):\n A structure that declares the dependencies of each op's inputs on the outputs of other\n ops in the subselected graph. Keys of the top level dict are either the string names of\n ops in the graph or, in the case of aliased ops, :py:class:`NodeInvocations <NodeInvocation>`.\n Values of the top level dict are themselves dicts, which map input names belonging to\n the op or aliased op to :py:class:`DependencyDefinitions <DependencyDefinition>`.\n input_mappings (Optional[Sequence[InputMapping]]): Define the inputs to the nested graph, and\n how they map to the inputs of its constituent ops.\n output_mappings (Optional[Sequence[OutputMapping]]): Define the outputs of the nested graph, and\n how they map from the outputs of its constituent ops.\n """\n\n def __init__(\n self,\n parent_graph_def: GraphDefinition,\n node_defs: Optional[Sequence[NodeDefinition]],\n dependencies: Optional[\n Union[\n DependencyMapping[str],\n DependencyMapping[NodeInvocation],\n ]\n ],\n input_mappings: Optional[Sequence[InputMapping]],\n output_mappings: Optional[Sequence[OutputMapping]],\n ):\n self._parent_graph_def = check.inst_param(\n parent_graph_def, "parent_graph_def", GraphDefinition\n )\n super(SubselectedGraphDefinition, self).__init__(\n name=parent_graph_def.name, # should we 
create special name for subselected graphs\n node_defs=node_defs,\n dependencies=dependencies,\n input_mappings=input_mappings,\n output_mappings=output_mappings,\n config=parent_graph_def.config_mapping,\n tags=parent_graph_def.tags,\n )\n\n @property\n def parent_graph_def(self) -> GraphDefinition:\n return self._parent_graph_def\n\n def get_top_level_omitted_nodes(self) -> Sequence[Node]:\n return [node for node in self.parent_graph_def.nodes if not self.has_node_named(node.name)]\n\n @property\n def is_subselected(self) -> bool:\n return True\n\n\ndef _validate_in_mappings(\n input_mappings: Sequence[InputMapping],\n nodes_by_name: Mapping[str, Node],\n dependency_structure: DependencyStructure,\n name: str,\n class_name: str,\n) -> Sequence[InputDefinition]:\n from .composition import MappedInputPlaceholder\n\n input_defs_by_name: Dict[str, InputDefinition] = OrderedDict()\n mapping_keys: Set[str] = set()\n\n target_input_types_by_graph_input_name: Dict[str, Set[DagsterType]] = defaultdict(set)\n\n for mapping in input_mappings:\n # handle incorrect objects passed in as mappings\n if not isinstance(mapping, InputMapping):\n if isinstance(mapping, InputDefinition):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' you passed an InputDefinition "\n f"named '{mapping.name}' directly in to input_mappings. Return "\n "an InputMapping by calling mapping_to on the InputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' received unexpected type '{type(mapping)}' in"\n " input_mappings. 
Provide an InputMapping using InputMapping(...)"\n )\n\n input_defs_by_name[mapping.graph_input_name] = mapping.get_definition()\n\n target_node = nodes_by_name.get(mapping.maps_to.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping references node "\n f"'{mapping.maps_to.node_name}' which it does not contain."\n )\n if not target_node.has_input(mapping.maps_to.input_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping to node '{mapping.maps_to.node_name}' "\n f"which contains no input named '{mapping.maps_to.input_name}'"\n )\n\n target_input_def = target_node.input_def_named(mapping.maps_to.input_name)\n node_input = NodeInput(target_node, target_input_def)\n\n if mapping.maps_to_fan_in:\n maps_to = cast(FanInInputPointer, mapping.maps_to)\n if not dependency_structure.has_fan_in_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target"\n f' "{maps_to.node_name}.{maps_to.input_name}" (index'\n f" {maps_to.fan_in_index} of fan-in) is not a MultiDependencyDefinition."\n )\n inner_deps = dependency_structure.get_fan_in_deps(node_input)\n if (maps_to.fan_in_index >= len(inner_deps)) or (\n inner_deps[maps_to.fan_in_index] is not MappedInputPlaceholder\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n f'"{maps_to.node_name}.{maps_to.input_name}" index {maps_to.fan_in_index} in '\n "the MultiDependencyDefinition is not a MappedInputPlaceholder"\n )\n mapping_keys.add(f"{maps_to.node_name}.{maps_to.input_name}.{maps_to.fan_in_index}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type.get_inner_type_for_fan_in()\n )\n else:\n if dependency_structure.has_deps(node_input):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' input mapping target "\n 
f'"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}" '\n "is already satisfied by output"\n )\n\n mapping_keys.add(f"{mapping.maps_to.node_name}.{mapping.maps_to.input_name}")\n target_input_types_by_graph_input_name[mapping.graph_input_name].add(\n target_input_def.dagster_type\n )\n\n for node_input in dependency_structure.inputs():\n if dependency_structure.has_fan_in_deps(node_input):\n for idx, dep in enumerate(dependency_structure.get_fan_in_deps(node_input)):\n if dep is MappedInputPlaceholder:\n mapping_str = f"{node_input.node_name}.{node_input.input_name}.{idx}"\n if mapping_str not in mapping_keys:\n raise DagsterInvalidDefinitionError(\n f"Unsatisfied MappedInputPlaceholder at index {idx} in"\n " MultiDependencyDefinition for"\n f" '{node_input.node_name}.{node_input.input_name}'"\n )\n\n # if the dagster type on a graph input is Any and all its target inputs have the\n # same dagster type, then use that dagster type for the graph input\n for graph_input_name, graph_input_def in input_defs_by_name.items():\n if graph_input_def.dagster_type.kind == DagsterTypeKind.ANY:\n target_input_types = target_input_types_by_graph_input_name[graph_input_name]\n if len(target_input_types) == 1:\n input_defs_by_name[graph_input_name] = graph_input_def.with_dagster_type(\n next(iter(target_input_types))\n )\n\n return list(input_defs_by_name.values())\n\n\ndef _validate_out_mappings(\n output_mappings: Sequence[OutputMapping],\n node_dict: Mapping[str, Node],\n name: str,\n class_name: str,\n) -> Tuple[Sequence[OutputMapping], Sequence[OutputDefinition]]:\n output_defs: List[OutputDefinition] = []\n for mapping in output_mappings:\n if isinstance(mapping, OutputMapping):\n target_node = node_dict.get(mapping.maps_from.node_name)\n if target_node is None:\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output mapping references node "\n f"'{mapping.maps_from.node_name}' which it does not contain."\n )\n if not 
target_node.has_output(mapping.maps_from.output_name):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} {name} output mapping from {target_node.describe_node()} "\n f"which contains no output named '{mapping.maps_from.output_name}'"\n )\n\n target_output = target_node.output_def_named(mapping.maps_from.output_name)\n output_def = mapping.get_definition(is_dynamic=target_output.is_dynamic)\n output_defs.append(output_def)\n\n if (\n mapping.dagster_type\n and mapping.dagster_type.kind != DagsterTypeKind.ANY\n and (target_output.dagster_type != mapping.dagster_type)\n and class_name != "GraphDefinition"\n ):\n raise DagsterInvalidDefinitionError(\n f"In {class_name} '{name}' output '{mapping.graph_output_name}' of type"\n f" {mapping.dagster_type.display_name} maps from"\n f" {mapping.maps_from.node_name}.{mapping.maps_from.output_name} of different"\n f" type {target_output.dagster_type.display_name}. OutputMapping source and"\n " destination must have the same type."\n )\n\n elif isinstance(mapping, OutputDefinition):\n raise DagsterInvalidDefinitionError(\n f"You passed an OutputDefinition named '{mapping.name}' directly "\n "in to output_mappings. Return an OutputMapping by calling "\n "mapping_from on the OutputDefinition."\n )\n else:\n raise DagsterInvalidDefinitionError(\n f"Received unexpected type '{type(mapping)}' in output_mappings. "\n "Provide an OutputMapping using OutputDefinition(...).mapping_from(...)"\n )\n return output_mappings, output_defs\n
", "current_page_name": "_modules/dagster/_core/definitions/graph_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.graph_definition"}, "hook_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.hook_definition

\nfrom typing import AbstractSet, Any, Callable, Iterator, NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\n\nfrom ..decorator_utils import get_function_params\nfrom ..errors import DagsterInvalidInvocationError\nfrom .resource_requirement import HookResourceRequirement, RequiresResources, ResourceRequirement\nfrom .utils import check_valid_name\n\n\n
[docs]class HookDefinition(\n NamedTuple(\n "_HookDefinition",\n [\n ("name", PublicAttr[str]),\n ("hook_fn", PublicAttr[Callable]),\n ("required_resource_keys", PublicAttr[AbstractSet[str]]),\n ("decorated_fn", PublicAttr[Optional[Callable]]),\n ],\n ),\n RequiresResources,\n):\n """Define a hook which can be triggered during a op execution (e.g. a callback on the step\n execution failure event during a op execution).\n\n Args:\n name (str): The name of this hook.\n hook_fn (Callable): The callback function that will be triggered.\n required_resource_keys (Optional[AbstractSet[str]]): Keys for the resources required by the\n hook.\n """\n\n def __new__(\n cls,\n *,\n name: str,\n hook_fn: Callable[..., Any],\n required_resource_keys: Optional[AbstractSet[str]] = None,\n decorated_fn: Optional[Callable[..., Any]] = None,\n ):\n return super(HookDefinition, cls).__new__(\n cls,\n name=check_valid_name(name),\n hook_fn=check.callable_param(hook_fn, "hook_fn"),\n required_resource_keys=frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n ),\n decorated_fn=check.opt_callable_param(decorated_fn, "decorated_fn"),\n )\n\n def __call__(self, *args, **kwargs):\n """This is invoked when the hook is used as a decorator.\n\n We currently support hooks to decorate the following:\n\n - JobDefinition: when the hook decorates a job definition, it will be added to\n all the op invocations within the job.\n\n Example:\n .. 
code-block:: python\n\n @success_hook\n def slack_message_on_success(_):\n ...\n\n @slack_message_on_success\n @job\n def a_job():\n foo(bar())\n\n """\n from ..execution.context.hook import HookContext\n from .graph_definition import GraphDefinition\n from .hook_invocation import hook_invocation_result\n from .job_definition import JobDefinition\n\n if len(args) > 0 and isinstance(args[0], (JobDefinition, GraphDefinition)):\n # when it decorates a job, we apply this hook to all the op invocations within\n # the job.\n return args[0].with_hooks({self})\n else:\n if not self.decorated_fn:\n raise DagsterInvalidInvocationError(\n "Only hook definitions created using one of the hook decorators can be invoked."\n )\n fxn_args = get_function_params(self.decorated_fn)\n # If decorated fxn has two arguments, then this is an event list hook fxn, and parameter\n # names are always context and event_list\n if len(fxn_args) == 2:\n context_arg_name = fxn_args[0].name\n event_list_arg_name = fxn_args[1].name\n if len(args) + len(kwargs) != 2:\n raise DagsterInvalidInvocationError(\n "Decorated function expects two parameters, context and event_list, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], "context", HookContext)\n event_list = check.opt_list_param(\n args[1] if len(args) > 1 else kwargs[event_list_arg_name],\n event_list_arg_name,\n )\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n if event_list_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{event_list_arg_name}'. 
Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n event_list = check.opt_list_param(\n kwargs[event_list_arg_name], event_list_arg_name\n )\n return hook_invocation_result(self, context, event_list)\n else:\n context_arg_name = fxn_args[0].name\n if len(args) + len(kwargs) != 1:\n raise DagsterInvalidInvocationError(\n f"Decorated function expects one parameter, {context_arg_name}, but "\n f"{len(args) + len(kwargs)} were provided."\n )\n if args:\n context = check.opt_inst_param(args[0], context_arg_name, HookContext)\n else:\n if context_arg_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Could not find expected argument '{context_arg_name}'. Provided "\n f"kwargs: {list(kwargs.keys())}"\n )\n context = check.opt_inst_param(\n kwargs[context_arg_name], context_arg_name, HookContext\n )\n return hook_invocation_result(self, context)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n # outer_context in this case is a string of (job, job name) or (node, node name)\n attached_to = cast(Optional[str], outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield HookResourceRequirement(\n key=resource_key, attached_to=attached_to, hook_name=self.name\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/hook_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.hook_definition"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.input

\nimport inspect\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Set,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param, experimental_param\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (  # BuiltinScalarDagsterType,\n    DagsterType,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredInputProps\nfrom .utils import NoValueSentinel, check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nT = TypeVar("T")\n\n\n# unfortunately since type_check functions need TypeCheckContext which is only available\n# at runtime, we can only check basic types before runtime\ndef _check_default_value(input_name: str, dagster_type: DagsterType, default_value: T) -> T:\n    from dagster._core.types.dagster_type import BuiltinScalarDagsterType\n\n    if default_value is not NoValueSentinel:\n        if dagster_type.is_nothing:\n            raise DagsterInvalidDefinitionError(\n                "Setting a default_value is invalid on InputDefinitions of type Nothing"\n            )\n\n        if isinstance(dagster_type, BuiltinScalarDagsterType):\n            type_check = dagster_type.type_check_scalar_value(default_value)\n            if not type_check.success:\n                raise DagsterInvalidDefinitionError(\n                    "Type check failed for the default_value of InputDefinition "\n                    f"{input_name} of type {dagster_type.display_name}. 
"\n                    f"Received value {default_value} of type {type(default_value)}",\n                )\n\n    return default_value\n\n\n@experimental_param(param="asset_key")\n@experimental_param(param="asset_partitions")\nclass InputDefinition:\n    """Defines an argument to an op's compute function.\n\n    Inputs may flow from previous op outputs, or be stubbed using config. They may optionally\n    be typed using the Dagster type system.\n\n    Args:\n        name (str): Name of the input.\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this input.\n            Users should provide the Python type of the objects that they expect to be passed for\n            this input, or a :py:class:`DagsterType` that defines a runtime check that they want\n            to be run on this input. Defaults to :py:class:`Any`.\n        description (Optional[str]): Human-readable description of the input.\n        default_value (Optional[Any]): The default value to use if no input is provided.\n        metadata (Optional[Dict[str, Any]]): A dict of metadata for the input.\n        asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n            (or function that produces an AssetKey from the InputContext) which should be associated\n            with this InputDefinition. 
Used for tracking lineage information through Dagster.\n        asset_partitions (Optional[Union[AbstractSet[str], InputContext -> AbstractSet[str]]]): (Experimental) A\n            set of partitions of the given asset_key (or a function that produces this list of\n            partitions from the InputContext) which should be associated with this InputDefinition.\n        input_manager_key (Optional[str]): (Experimental) The resource key for the\n            :py:class:`InputManager` used for loading this input when it is not connected to an\n            upstream output.\n    """\n\n    _name: str\n    _type_not_set: bool\n    _dagster_type: DagsterType\n    _description: Optional[str]\n    _default_value: Any\n    _input_manager_key: Optional[str]\n    _raw_metadata: ArbitraryMetadataMapping\n    _metadata: Mapping[str, MetadataValue]\n    _asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]\n    _asset_partitions_fn: Optional[Callable[["InputContext"], Set[str]]]\n\n    def __init__(\n        self,\n        name: str,\n        dagster_type: object = None,\n        description: Optional[str] = None,\n        default_value: object = NoValueSentinel,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n        asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n        input_manager_key: Optional[str] = None,\n        # when adding new params, make sure to update combine_with_inferred and with_dagster_type below\n    ):\n        self._name = check_valid_name(name, allow_list=["config"])\n\n        self._type_not_set = dagster_type is None\n        self._dagster_type = check.inst(resolve_dagster_type(dagster_type), DagsterType)\n\n        self._description = check.opt_str_param(description, "description")\n\n        self._default_value = _check_default_value(self._name, self._dagster_type, 
default_value)\n\n        self._input_manager_key = check.opt_str_param(input_manager_key, "input_manager_key")\n\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n        if not callable(asset_key):\n            check.opt_inst_param(asset_key, "asset_key", AssetKey)\n\n        self._asset_key = asset_key\n\n        if asset_partitions:\n            check.param_invariant(\n                asset_key is not None,\n                "asset_partitions",\n                'Cannot specify "asset_partitions" argument without also specifying "asset_key"',\n            )\n        if callable(asset_partitions):\n            self._asset_partitions_fn = asset_partitions\n        elif asset_partitions is not None:\n            _asset_partitions = check.set_param(asset_partitions, "asset_partitions", of_type=str)\n            self._asset_partitions_fn = lambda _: _asset_partitions\n        else:\n            self._asset_partitions_fn = None\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def has_default_value(self) -> bool:\n        return self._default_value is not NoValueSentinel\n\n    @property\n    def default_value(self) -> Any:\n        check.invariant(self.has_default_value, "Can only fetch default_value if has_default_value")\n        return self._default_value\n\n    @property\n    def input_manager_key(self) -> Optional[str]:\n        return self._input_manager_key\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_asset(self) -> bool:\n        return self._asset_key is not None\n\n    @property\n    def hardcoded_asset_key(self) 
-> Optional[AssetKey]:\n        if not callable(self._asset_key):\n            return self._asset_key\n        else:\n            return None\n\n    def get_asset_key(self, context: "InputContext") -> Optional[AssetKey]:\n        """Get the AssetKey associated with this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if callable(self._asset_key):\n            return self._asset_key(context)\n        else:\n            return self.hardcoded_asset_key\n\n    def get_asset_partitions(self, context: "InputContext") -> Optional[Set[str]]:\n        """Get the set of partitions that this op will read from this InputDefinition for the given\n        :py:class:`InputContext` (if any).\n\n        Args:\n            context (InputContext): The InputContext that this InputDefinition is being evaluated\n                in\n        """\n        if self._asset_partitions_fn is None:\n            return None\n\n        return self._asset_partitions_fn(context)\n\n    def mapping_to(\n        self, node_name: str, input_name: str, fan_in_index: Optional[int] = None\n    ) -> "InputMapping":\n        """Create an input mapping to an input of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`InputMapping` to the input of a child node.\n\n        Args:\n            node_name (str): The name of the child node to which to map this input.\n            input_name (str): The name of the child node' input to which to map this input.\n            fan_in_index (Optional[int]): The index in to a fanned in input, else None\n\n        Examples:\n            .. 
code-block:: python\n\n                input_mapping = InputDefinition('composite_input', Int).mapping_to(\n                    'child_node', 'int_input'\n                )\n        """\n        check.str_param(node_name, "node_name")\n        check.str_param(input_name, "input_name")\n        check.opt_int_param(fan_in_index, "fan_in_index")\n\n        return InputMapping(\n            graph_input_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_input_name=input_name,\n            fan_in_index=fan_in_index,\n            graph_input_description=self.description,\n            dagster_type=self.dagster_type,\n        )\n\n    @staticmethod\n    def create_from_inferred(inferred: InferredInputProps) -> "InputDefinition":\n        return InputDefinition(\n            name=inferred.name,\n            dagster_type=_checked_inferred_type(inferred),\n            description=inferred.description,\n            default_value=inferred.default_value,\n        )\n\n    def combine_with_inferred(self, inferred: InferredInputProps) -> "InputDefinition":\n        """Return a new InputDefinition that merges this ones properties with those inferred from type signature.\n        This can update: dagster_type, description, and default_value if they are not set.\n        """\n        check.invariant(\n            self.name == inferred.name,\n            f"InferredInputProps name {inferred.name} did not align with InputDefinition name"\n            f" {self.name}",\n        )\n\n        dagster_type = self._dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred)\n\n        description = self._description\n        if description is None and inferred.description is not None:\n            description = inferred.description\n\n        default_value = self._default_value\n        if not self.has_default_value:\n            default_value = inferred.default_value\n\n        return InputDefinition(\n            
name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            default_value=default_value,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n    def with_dagster_type(self, dagster_type: DagsterType) -> "InputDefinition":\n        return InputDefinition(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=self.description,\n            default_value=self.default_value if self.has_default_value else NoValueSentinel,\n            metadata=self.metadata,\n            asset_key=self._asset_key,\n            asset_partitions=self._asset_partitions_fn,\n            input_manager_key=self._input_manager_key,\n        )\n\n\ndef _checked_inferred_type(inferred: InferredInputProps) -> DagsterType:\n    try:\n        if inferred.annotation == inspect.Parameter.empty:\n            resolved_type = resolve_dagster_type(None)\n        elif inferred.annotation is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            resolved_type = resolve_dagster_type(type(None))\n        else:\n            resolved_type = resolve_dagster_type(inferred.annotation)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred.annotation}' from type annotation for argument "\n            f"'{inferred.name}', correct the issue or explicitly set the dagster_type "\n            "via In()."\n        ) from e\n\n    return resolved_type\n\n\nclass InputPointer(NamedTuple("_InputPointer", [("node_name", str), ("input_name", str)])):\n    def __new__(cls, node_name: str, input_name: str):\n        return super(InputPointer, cls).__new__(\n            cls,\n  
          check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n        )\n\n\nclass FanInInputPointer(\n    NamedTuple(\n        "_FanInInputPointer", [("node_name", str), ("input_name", str), ("fan_in_index", int)]\n    )\n):\n    def __new__(cls, node_name: str, input_name: str, fan_in_index: int):\n        return super(FanInInputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.str_param(input_name, "input_name"),\n            check.int_param(fan_in_index, "fan_in_index"),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the upstream op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass InputMapping(NamedTuple):\n """Defines an input mapping for a graph.\n\n Args:\n graph_input_name (str): Name of the input in the graph being mapped from.\n mapped_node_name (str): Named of the node (op/graph) that the input is being mapped to.\n mapped_node_input_name (str): Name of the input in the node (op/graph) that is being mapped to.\n fan_in_index (Optional[int]): The index in to a fanned input, otherwise None.\n graph_input_description (Optional[str]): A description of the input in the graph being mapped from.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's input\n being mapped from.\n\n Examples:\n .. code-block:: python\n\n from dagster import InputMapping, GraphDefinition, op, graph\n\n @op\n def needs_input(x):\n return x + 1\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[needs_input],\n input_mappings=[\n InputMapping(\n graph_input_name="maps_x", mapped_node_name="needs_input",\n mapped_node_input_name="x"\n )\n ]\n )\n\n @graph\n def the_graph(maps_x):\n needs_input(maps_x)\n """\n\n graph_input_name: str\n mapped_node_name: str\n mapped_node_input_name: str\n fan_in_index: Optional[int] = None\n graph_input_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n\n @property\n def maps_to(self) -> Union[InputPointer, FanInInputPointer]:\n if self.fan_in_index is not None:\n return FanInInputPointer(\n self.mapped_node_name, self.mapped_node_input_name, self.fan_in_index\n )\n return InputPointer(self.mapped_node_name, self.mapped_node_input_name)\n\n @property\n def maps_to_fan_in(self) -> bool:\n return 
isinstance(self.maps_to, FanInInputPointer)\n\n def describe(self) -> str:\n idx = self.maps_to.fan_in_index if isinstance(self.maps_to, FanInInputPointer) else ""\n return f"{self.graph_input_name} -> {self.maps_to.node_name}:{self.maps_to.input_name}{idx}"\n\n def get_definition(self) -> "InputDefinition":\n return InputDefinition(\n name=self.graph_input_name,\n description=self.graph_input_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class In(\n NamedTuple(\n "_In",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("default_value", PublicAttr[Any]),\n ("metadata", PublicAttr[Optional[Mapping[str, Any]]]),\n (\n "asset_key",\n PublicAttr[Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]]],\n ),\n (\n "asset_partitions",\n PublicAttr[Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]]],\n ),\n ("input_manager_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an argument to an op's compute function.\n\n Inputs may flow from previous op's outputs, or be stubbed using config. They may optionally\n be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this input. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the input.\n default_value (Optional[Any]): The default value to use if no input is provided.\n metadata (Optional[Dict[str, RawMetadataValue]]): A dict of metadata for the input.\n asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]): (Experimental) An AssetKey\n (or function that produces an AssetKey from the InputContext) which should be associated\n with this In. 
Used for tracking lineage information through Dagster.\n asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]): (Experimental) A\n set of partitions of the given asset_key (or a function that produces this list of\n partitions from the InputContext) which should be associated with this In.\n input_manager_key (Optional[str]): (Experimental) The resource key for the\n :py:class:`InputManager` used for loading this input when it is not connected to an\n upstream output.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n default_value: Any = NoValueSentinel,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n asset_key: Optional[Union[AssetKey, Callable[["InputContext"], AssetKey]]] = None,\n asset_partitions: Optional[Union[Set[str], Callable[["InputContext"], Set[str]]]] = None,\n input_manager_key: Optional[str] = None,\n ):\n return super(In, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=check.opt_str_param(description, "description"),\n default_value=default_value,\n metadata=check.opt_mapping_param(metadata, "metadata", key_type=str),\n asset_key=check.opt_inst_param(asset_key, "asset_key", (AssetKey, FunctionType)),\n asset_partitions=asset_partitions,\n input_manager_key=check.opt_str_param(input_manager_key, "input_manager_key"),\n )\n\n @staticmethod\n def from_definition(input_def: InputDefinition) -> "In":\n return In(\n dagster_type=input_def.dagster_type,\n description=input_def.description,\n default_value=input_def._default_value, # noqa: SLF001\n metadata=input_def.metadata,\n asset_key=input_def._asset_key, # noqa: SLF001\n asset_partitions=input_def._asset_partitions_fn, # noqa: SLF001\n input_manager_key=input_def.input_manager_key,\n )\n\n def to_definition(self, name: str) -> InputDefinition:\n dagster_type = self.dagster_type if 
self.dagster_type is not NoValueSentinel else None\n return InputDefinition(\n name=name,\n dagster_type=dagster_type,\n description=self.description,\n default_value=self.default_value,\n metadata=self.metadata,\n asset_key=self.asset_key,\n asset_partitions=self.asset_partitions,\n input_manager_key=self.input_manager_key,\n )
\n\n\n
[docs]class GraphIn(NamedTuple("_GraphIn", [("description", PublicAttr[Optional[str]])])):\n """Represents information about an input that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the input.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphIn, cls).__new__(cls, description=description)\n\n def to_definition(self, name: str) -> InputDefinition:\n return InputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.input"}, "job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.job_definition

\nimport importlib\nimport os\nimport warnings\nfrom datetime import datetime\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental_param, public\nfrom dagster._config import Field, Shape, StringSource\nfrom dagster._config.config_type import ConfigType\nfrom dagster._config.validate import validate_config\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.dependency import (\n    Node,\n    NodeHandle,\n    NodeInputHandle,\n    NodeInvocation,\n)\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.op_selection import OpSelection, get_graph_subset\nfrom dagster._core.definitions.partition import DynamicPartitionsDefinition\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceRequirement,\n    ensure_requirements_satisfied,\n)\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import (\n    DagsterInvalidConfigError,\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.selector.subset_selector import (\n    AssetSelectionData,\n    OpSelectionData,\n)\nfrom dagster._core.storage.io_manager import (\n    IOManagerDefinition,\n    dagster_maintained_io_manager,\n    io_manager,\n)\nfrom dagster._core.storage.tags import MEMOIZED_RUN_TAG\nfrom dagster._core.types.dagster_type import DagsterType\nfrom 
dagster._core.utils import str_format_set\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.merger import merge_dicts\n\nfrom .asset_layer import AssetLayer, build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .dependency import (\n    DependencyMapping,\n    DependencyStructure,\n    OpNode,\n)\nfrom .executor_definition import ExecutorDefinition, multi_or_in_process_executor\nfrom .graph_definition import GraphDefinition, SubselectedGraphDefinition\nfrom .hook_definition import HookDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .metadata import MetadataValue, RawMetadataValue, normalize_metadata\nfrom .partition import PartitionedConfig, PartitionsDefinition\nfrom .resource_definition import ResourceDefinition\nfrom .run_request import RunRequest\nfrom .utils import DEFAULT_IO_MANAGER_KEY, validate_tags\nfrom .version_strategy import VersionStrategy\n\nif TYPE_CHECKING:\n    from dagster._config.snap import ConfigSchemaSnapshot\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.execution.execute_in_process_result import ExecuteInProcessResult\n    from dagster._core.execution.resources_init import InitResourceContext\n    from dagster._core.host_representation.job_index import JobIndex\n    from dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n    from dagster._core.snap import JobSnapshot\n\n    from .run_config_schema import RunConfigSchema\n\nDEFAULT_EXECUTOR_DEF = multi_or_in_process_executor\n\n\n
[docs]@experimental_param(param="version_strategy")\nclass JobDefinition(IHasInternalInit):\n """Defines a Dagster job."""\n\n _name: str\n _graph_def: GraphDefinition\n _description: Optional[str]\n _tags: Mapping[str, str]\n _metadata: Mapping[str, MetadataValue]\n _current_level_node_defs: Sequence[NodeDefinition]\n _hook_defs: AbstractSet[HookDefinition]\n _op_retry_policy: Optional[RetryPolicy]\n _asset_layer: AssetLayer\n _resource_requirements: Mapping[str, AbstractSet[str]]\n _all_node_defs: Mapping[str, NodeDefinition]\n _cached_run_config_schemas: Dict[str, "RunConfigSchema"]\n _version_strategy: VersionStrategy\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]]\n input_values: Mapping[str, object]\n\n def __init__(\n self,\n *,\n graph_def: GraphDefinition,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n executor_def: Optional[ExecutorDefinition] = None,\n logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n name: Optional[str] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]\n ] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n hook_defs: Optional[AbstractSet[HookDefinition]] = None,\n op_retry_policy: Optional[RetryPolicy] = None,\n version_strategy: Optional[VersionStrategy] = None,\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]] = None,\n asset_layer: Optional[AssetLayer] = None,\n input_values: Optional[Mapping[str, object]] = None,\n _was_explicitly_provided_resources: Optional[bool] = None,\n ):\n from dagster._core.definitions.run_config import RunConfig, convert_config_input\n\n self._graph_def = graph_def\n self._current_level_node_defs = self._graph_def.node_defs\n # Recursively explore all nodes in the this job\n self._all_node_defs = 
_build_all_node_defs(self._current_level_node_defs)\n self._asset_layer = check.opt_inst_param(\n asset_layer, "asset_layer", AssetLayer\n ) or _infer_asset_layer_from_source_asset_deps(graph_def)\n\n # validates\n self._graph_def.get_inputs_must_be_resolved_top_level(self._asset_layer)\n\n self._name = check_valid_name(check.str_param(name, "name")) if name else graph_def.name\n self._executor_def = check.opt_inst_param(executor_def, "executor_def", ExecutorDefinition)\n self._loggers = check.opt_nullable_mapping_param(\n logger_defs,\n "logger_defs",\n key_type=str,\n value_type=LoggerDefinition,\n )\n\n config = check.opt_inst_param(\n config, "config", (Mapping, ConfigMapping, PartitionedConfig, RunConfig)\n )\n config = convert_config_input(config)\n\n partitions_def = check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n )\n # tags and description can exist on graph as well, but since\n # same graph may be in multiple jobs, keep separate layer\n self._description = check.opt_str_param(description, "description")\n self._tags = validate_tags(tags)\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n )\n self._hook_defs = check.opt_set_param(hook_defs, "hook_defs")\n self._op_retry_policy = check.opt_inst_param(\n op_retry_policy, "op_retry_policy", RetryPolicy\n )\n self.version_strategy = check.opt_inst_param(\n version_strategy, "version_strategy", VersionStrategy\n )\n\n _subset_selection_data = check.opt_inst_param(\n _subset_selection_data, "_subset_selection_data", (OpSelectionData, AssetSelectionData)\n )\n input_values = check.opt_mapping_param(input_values, "input_values", key_type=str)\n\n resource_defs = check.opt_mapping_param(\n resource_defs, "resource_defs", key_type=str, value_type=ResourceDefinition\n )\n for key in resource_defs.keys():\n if not key.isidentifier():\n check.failed(f"Resource key '{key}' must be a valid Python identifier.")\n was_provided_resources = 
(\n bool(resource_defs)\n if _was_explicitly_provided_resources is None\n else _was_explicitly_provided_resources\n )\n self._resource_defs = {\n DEFAULT_IO_MANAGER_KEY: default_job_io_manager,\n **resource_defs,\n }\n self._required_resource_keys = self._get_required_resource_keys(was_provided_resources)\n\n self._config_mapping = None\n self._partitioned_config = None\n self._run_config = None\n self._run_config_schema = None\n self._original_config_argument = config\n\n if partitions_def:\n self._partitioned_config = PartitionedConfig.from_flexible_config(\n config, partitions_def\n )\n else:\n if isinstance(config, ConfigMapping):\n self._config_mapping = config\n elif isinstance(config, PartitionedConfig):\n self._partitioned_config = config\n elif isinstance(config, dict):\n self._run_config = config\n # Using config mapping here is a trick to make it so that the preset will be used even\n # when no config is supplied for the job.\n self._config_mapping = _config_mapping_with_default_value(\n get_run_config_schema_for_job(\n graph_def,\n self.resource_defs,\n self.executor_def,\n self.loggers,\n asset_layer,\n was_explicitly_provided_resources=was_provided_resources,\n ),\n config,\n self.name,\n )\n elif config is not None:\n check.failed(\n "config param must be a ConfigMapping, a PartitionedConfig, or a dictionary,"\n f" but is an object of type {type(config)}"\n )\n\n self._subset_selection_data = _subset_selection_data\n self.input_values = input_values\n for input_name in sorted(list(self.input_values.keys())):\n if not graph_def.has_input(input_name):\n raise DagsterInvalidDefinitionError(\n f"Error when constructing JobDefinition '{self.name}': Input value provided for"\n f" key '{input_name}', but job has no top-level input with that name."\n )\n\n def dagster_internal_init(\n *,\n graph_def: GraphDefinition,\n resource_defs: Optional[Mapping[str, ResourceDefinition]],\n executor_def: Optional[ExecutorDefinition],\n logger_defs: Optional[Mapping[str, 
LoggerDefinition]],\n name: Optional[str],\n config: Optional[\n Union[ConfigMapping, Mapping[str, object], PartitionedConfig, "RunConfig"]\n ],\n description: Optional[str],\n partitions_def: Optional[PartitionsDefinition],\n tags: Optional[Mapping[str, Any]],\n metadata: Optional[Mapping[str, RawMetadataValue]],\n hook_defs: Optional[AbstractSet[HookDefinition]],\n op_retry_policy: Optional[RetryPolicy],\n version_strategy: Optional[VersionStrategy],\n _subset_selection_data: Optional[Union[OpSelectionData, AssetSelectionData]],\n asset_layer: Optional[AssetLayer],\n input_values: Optional[Mapping[str, object]],\n _was_explicitly_provided_resources: Optional[bool],\n ) -> "JobDefinition":\n return JobDefinition(\n graph_def=graph_def,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n name=name,\n config=config,\n description=description,\n partitions_def=partitions_def,\n tags=tags,\n metadata=metadata,\n hook_defs=hook_defs,\n op_retry_policy=op_retry_policy,\n version_strategy=version_strategy,\n _subset_selection_data=_subset_selection_data,\n asset_layer=asset_layer,\n input_values=input_values,\n _was_explicitly_provided_resources=_was_explicitly_provided_resources,\n )\n\n @property\n def name(self) -> str:\n return self._name\n\n @property\n def tags(self) -> Mapping[str, str]:\n return merge_dicts(self._graph_def.tags, self._tags)\n\n @property\n def metadata(self) -> Mapping[str, MetadataValue]:\n return self._metadata\n\n @property\n def description(self) -> Optional[str]:\n return self._description\n\n @property\n def graph(self) -> GraphDefinition:\n return self._graph_def\n\n @property\n def dependency_structure(self) -> DependencyStructure:\n return self._graph_def.dependency_structure\n\n @property\n def dependencies(self) -> DependencyMapping[NodeInvocation]:\n return self._graph_def.dependencies\n\n @public\n @property\n def executor_def(self) -> ExecutorDefinition:\n """Returns the default 
:py:class:`ExecutorDefinition` for the job.\n\n If the user has not specified an executor definition, then this will default to the :py:func:`multi_or_in_process_executor`. If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.\n """\n return self._executor_def or DEFAULT_EXECUTOR_DEF\n\n @public\n @property\n def has_specified_executor(self) -> bool:\n """Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the :py:class:`Definitions` object the job was provided to."""\n return self._executor_def is not None\n\n @public\n @property\n def resource_defs(self) -> Mapping[str, ResourceDefinition]:\n """Returns the set of ResourceDefinition objects specified on the job.\n\n This may not be the complete set of resources required by the job, since those can also be provided on the :py:class:`Definitions` object the job may be provided to.\n """\n return self._resource_defs\n\n @public\n @property\n def partitioned_config(self) -> Optional[PartitionedConfig]:\n """The partitioned config for the job, if it has one.\n\n A partitioned config defines a way to map partition keys to run config for the job.\n """\n return self._partitioned_config\n\n @public\n @property\n def config_mapping(self) -> Optional[ConfigMapping]:\n """The config mapping for the job, if it has one.\n\n A config mapping defines a way to map a top-level config schema to run config for the job.\n """\n return self._config_mapping\n\n @public\n @property\n def loggers(self) -> Mapping[str, LoggerDefinition]:\n """Returns the set of LoggerDefinition objects specified on the job.\n\n If the user has not specified a mapping of :py:class:`LoggerDefinition` objects, then this will default to the :py:func:`colored_console_logger` under the key `console`. 
If a default is specified on the :py:class:`Definitions` object the job was provided to, then that will be used instead.\n """\n from dagster._loggers import default_loggers\n\n return self._loggers or default_loggers()\n\n @public\n @property\n def has_specified_loggers(self) -> bool:\n """Returns true if the job explicitly set loggers, and False if loggers were inherited through defaults or the :py:class:`Definitions` object the job was provided to."""\n return self._loggers is not None\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def run_config(self) -> Optional[Mapping[str, Any]]:\n return self._run_config\n\n @property\n def run_config_schema(self) -> "RunConfigSchema":\n if self._run_config_schema is None:\n self._run_config_schema = _create_run_config_schema(self, self.required_resource_keys)\n return self._run_config_schema\n\n @public\n @property\n def partitions_def(self) -> Optional[PartitionsDefinition]:\n """Returns the :py:class:`PartitionsDefinition` for the job, if it has one.\n\n A partitions definition defines the set of partition keys the job operates on.\n """\n return None if not self.partitioned_config else self.partitioned_config.partitions_def\n\n @property\n def hook_defs(self) -> AbstractSet[HookDefinition]:\n return self._hook_defs\n\n @property\n def asset_layer(self) -> AssetLayer:\n return self._asset_layer\n\n @property\n def all_node_defs(self) -> Sequence[NodeDefinition]:\n return list(self._all_node_defs.values())\n\n @property\n def top_level_node_defs(self) -> Sequence[NodeDefinition]:\n return self._current_level_node_defs\n\n def node_def_named(self, name: str) -> NodeDefinition:\n check.str_param(name, "name")\n\n check.invariant(name in self._all_node_defs, f"{name} not found")\n return self._all_node_defs[name]\n\n def has_node(self, name: str) -> bool:\n check.str_param(name, "name")\n return name in self._all_node_defs\n\n def get_node(self, 
handle: NodeHandle) -> Node:\n return self._graph_def.get_node(handle)\n\n def get_op(self, handle: NodeHandle) -> OpNode:\n node = self.get_node(handle)\n assert isinstance(\n node, OpNode\n ), f"Tried to retrieve node {handle} as op, but it represents a nested graph."\n return node\n\n def has_node_named(self, name: str) -> bool:\n return self._graph_def.has_node_named(name)\n\n def get_node_named(self, name: str) -> Node:\n return self._graph_def.node_named(name)\n\n @property\n def nodes(self) -> Sequence[Node]:\n return self._graph_def.nodes\n\n @property\n def nodes_in_topological_order(self) -> Sequence[Node]:\n return self._graph_def.nodes_in_topological_order\n\n def all_dagster_types(self) -> Iterable[DagsterType]:\n return self._graph_def.all_dagster_types()\n\n def has_dagster_type(self, name: str) -> bool:\n return self._graph_def.has_dagster_type(name)\n\n def dagster_type_named(self, name: str) -> DagsterType:\n return self._graph_def.dagster_type_named(name)\n\n def describe_target(self) -> str:\n return f"job '{self.name}'"\n\n def is_using_memoization(self, run_tags: Mapping[str, str]) -> bool:\n tags = merge_dicts(self.tags, run_tags)\n # If someone provides a false value for memoized run tag, then they are intentionally\n # switching off memoization.\n if tags.get(MEMOIZED_RUN_TAG) == "false":\n return False\n return (\n MEMOIZED_RUN_TAG in tags and tags.get(MEMOIZED_RUN_TAG) == "true"\n ) or self.version_strategy is not None\n\n def get_required_resource_defs(self) -> Mapping[str, ResourceDefinition]:\n return {\n resource_key: resource\n for resource_key, resource in self.resource_defs.items()\n if resource_key in self.required_resource_keys\n }\n\n def _get_required_resource_keys(self, validate_requirements: bool = False) -> AbstractSet[str]:\n from ..execution.resources_init import get_transitive_required_resource_keys\n\n requirements = self._get_resource_requirements()\n if validate_requirements:\n 
ensure_requirements_satisfied(self.resource_defs, requirements)\n required_keys = {req.key for req in requirements}\n if validate_requirements:\n return required_keys.union(\n get_transitive_required_resource_keys(required_keys, self.resource_defs)\n )\n else:\n return required_keys\n\n def _get_resource_requirements(self) -> Sequence[ResourceRequirement]:\n return [\n *self._graph_def.get_resource_requirements(self.asset_layer),\n *[\n req\n for hook_def in self._hook_defs\n for req in hook_def.get_resource_requirements(outer_context=f"job '{self._name}'")\n ],\n ]\n\n def validate_resource_requirements_satisfied(self) -> None:\n resource_requirements = self._get_resource_requirements()\n ensure_requirements_satisfied(self.resource_defs, resource_requirements)\n\n def is_missing_required_resources(self) -> bool:\n requirements = self._get_resource_requirements()\n for requirement in requirements:\n if not requirement.resources_contain_key(self.resource_defs):\n return True\n return False\n\n def get_all_hooks_for_handle(self, handle: NodeHandle) -> AbstractSet[HookDefinition]:\n """Gather all the hooks for the given node from all places possibly attached with a hook.\n\n A hook can be attached to any of the following objects\n * Node (node invocation)\n * JobDefinition\n\n Args:\n handle (NodeHandle): The node's handle\n\n Returns:\n FrozenSet[HookDefinition]\n """\n check.inst_param(handle, "handle", NodeHandle)\n hook_defs: Set[HookDefinition] = set()\n\n current = handle\n lineage = []\n while current:\n lineage.append(current.name)\n current = current.parent\n\n # hooks on top-level node\n name = lineage.pop()\n node = self._graph_def.node_named(name)\n hook_defs = hook_defs.union(node.hook_defs)\n\n # hooks on non-top-level nodes\n while lineage:\n name = lineage.pop()\n # While lineage is non-empty, definition is guaranteed to be a graph\n definition = cast(GraphDefinition, node.definition)\n node = definition.node_named(name)\n hook_defs = 
hook_defs.union(node.hook_defs)\n\n # hooks applied to a job definition will run on every node\n hook_defs = hook_defs.union(self.hook_defs)\n\n return frozenset(hook_defs)\n\n def get_retry_policy_for_handle(self, handle: NodeHandle) -> Optional[RetryPolicy]:\n node = self.get_node(handle)\n definition = node.definition\n\n if node.retry_policy:\n return node.retry_policy\n elif isinstance(definition, OpDefinition) and definition.retry_policy:\n return definition.retry_policy\n\n # could be expanded to look in graph containers\n else:\n return self._op_retry_policy\n\n # make Callable for decorator reference updates\n def __call__(self, *args, **kwargs):\n raise DagsterInvariantViolationError(\n f"Attempted to call job '{self.name}' directly. Jobs should be invoked by "\n "using an execution API function (e.g. `job.execute_in_process`)."\n )\n\n
[docs] @public\n def execute_in_process(\n self,\n run_config: Optional[Union[Mapping[str, Any], "RunConfig"]] = None,\n instance: Optional["DagsterInstance"] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n op_selection: Optional[Sequence[str]] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n run_id: Optional[str] = None,\n input_values: Optional[Mapping[str, object]] = None,\n tags: Optional[Mapping[str, str]] = None,\n resources: Optional[Mapping[str, object]] = None,\n ) -> "ExecuteInProcessResult":\n """Execute the Job in-process, gathering results in-memory.\n\n The `executor_def` on the Job will be ignored, and replaced with the in-process executor.\n If using the default `io_manager`, it will switch from filesystem to in-memory.\n\n\n Args:\n run_config (Optional[Mapping[str, Any]]:\n The configuration for the run\n instance (Optional[DagsterInstance]):\n The instance to execute against, an ephemeral one will be used if none provided.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for jobs with partitioned config.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``True``.\n op_selection (Optional[Sequence[str]]): A list of op selection queries (including single op\n names) to execute. For example:\n * ``['some_op']``: selects ``some_op`` itself.\n * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n input_values (Optional[Mapping[str, Any]]):\n A dictionary that maps python objects to the top-level inputs of the job. 
Input values provided here will override input values that have been provided to the job directly.\n resources (Optional[Mapping[str, Any]]):\n The resources needed if any are required. Can provide resource instances directly,\n or resource definitions.\n\n Returns:\n :py:class:`~dagster.ExecuteInProcessResult`\n\n """\n from dagster._core.definitions.executor_definition import execute_in_process_executor\n from dagster._core.definitions.run_config import convert_config_input\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n from dagster._core.execution.execute_in_process import core_execute_in_process\n\n run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")\n op_selection = check.opt_sequence_param(op_selection, "op_selection", str)\n asset_selection = check.opt_sequence_param(asset_selection, "asset_selection", AssetKey)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n\n resource_defs = wrap_resources_for_execution(resources)\n\n check.invariant(\n not (op_selection and asset_selection),\n "op_selection and asset_selection cannot both be provided as args to"\n " execute_in_process",\n )\n\n partition_key = check.opt_str_param(partition_key, "partition_key")\n input_values = check.opt_mapping_param(input_values, "input_values")\n\n # Combine provided input values at execute_in_process with input values\n # provided to the definition. 
Input values provided at\n # execute_in_process will override those provided on the definition.\n input_values = merge_dicts(self.input_values, input_values)\n\n bound_resource_defs = dict(self.resource_defs)\n ephemeral_job = JobDefinition.dagster_internal_init(\n name=self._name,\n graph_def=self._graph_def,\n resource_defs={**_swap_default_io_man(bound_resource_defs, self), **resource_defs},\n executor_def=execute_in_process_executor,\n logger_defs=self._loggers,\n hook_defs=self.hook_defs,\n config=self.config_mapping or self.partitioned_config or self.run_config,\n tags=self.tags,\n op_retry_policy=self._op_retry_policy,\n version_strategy=self.version_strategy,\n asset_layer=self.asset_layer,\n input_values=input_values,\n description=self.description,\n partitions_def=self.partitions_def,\n metadata=self.metadata,\n _subset_selection_data=None, # this is added below\n _was_explicitly_provided_resources=True,\n )\n\n ephemeral_job = ephemeral_job.get_subset(\n op_selection=op_selection,\n asset_selection=frozenset(asset_selection) if asset_selection else None,\n )\n\n merged_tags = merge_dicts(self.tags, tags or {})\n if partition_key:\n if not (self.partitions_def and self.partitioned_config):\n check.failed("Attempted to execute a partitioned run for a non-partitioned job")\n self.partitions_def.validate_partition_key(\n partition_key, dynamic_partitions_store=instance\n )\n\n run_config = (\n run_config\n if run_config\n else self.partitioned_config.get_run_config_for_partition_key(partition_key)\n )\n merged_tags.update(\n self.partitioned_config.get_tags_for_partition_key(\n partition_key, job_name=self.name\n )\n )\n\n return core_execute_in_process(\n ephemeral_job=ephemeral_job,\n run_config=run_config,\n instance=instance,\n output_capturing_enabled=True,\n raise_on_error=raise_on_error,\n run_tags=merged_tags,\n run_id=run_id,\n asset_selection=frozenset(asset_selection),\n )
\n\n @property\n def op_selection_data(self) -> Optional[OpSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, OpSelectionData)\n else None\n )\n\n @property\n def asset_selection_data(self) -> Optional[AssetSelectionData]:\n return (\n self._subset_selection_data\n if isinstance(self._subset_selection_data, AssetSelectionData)\n else None\n )\n\n @property\n def is_subset(self) -> bool:\n return bool(self._subset_selection_data)\n\n def get_subset(\n self,\n *,\n op_selection: Optional[Iterable[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ) -> Self:\n check.invariant(\n not (op_selection and (asset_selection or asset_check_selection)),\n "op_selection cannot be provided with asset_selection or asset_check_selection to"\n " execute_in_process",\n )\n if op_selection:\n return self._get_job_def_for_op_selection(op_selection)\n if asset_selection or asset_check_selection:\n return self._get_job_def_for_asset_selection(\n asset_selection=asset_selection, asset_check_selection=asset_check_selection\n )\n else:\n return self\n\n def _get_job_def_for_asset_selection(\n self,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ) -> Self:\n asset_selection = check.opt_set_param(asset_selection, "asset_selection", AssetKey)\n check.opt_set_param(asset_check_selection, "asset_check_selection", AssetCheckKey)\n\n nonexistent_assets = [\n asset\n for asset in asset_selection\n if asset not in self.asset_layer.asset_keys\n and asset not in self.asset_layer.source_assets_by_key\n ]\n nonexistent_asset_strings = [\n asset_str\n for asset_str in (asset.to_string() for asset in nonexistent_assets)\n if asset_str\n ]\n if nonexistent_assets:\n raise DagsterInvalidSubsetError(\n "Assets provided in asset_selection argument "\n f"{', 
'.join(nonexistent_asset_strings)} do not exist in parent asset group or job."\n )\n\n # Test that selected asset checks exist\n all_check_keys = self.asset_layer.node_output_handles_by_asset_check_key.keys()\n\n nonexistent_asset_checks = [\n asset_check\n for asset_check in asset_check_selection or set()\n if asset_check not in all_check_keys\n ]\n nonexistent_asset_check_strings = [\n str(asset_check) for asset_check in nonexistent_asset_checks\n ]\n if nonexistent_asset_checks:\n raise DagsterInvalidSubsetError(\n "Asset checks provided in asset_check_selection argument"\n f" {', '.join(nonexistent_asset_check_strings)} do not exist in parent asset group"\n " or job."\n )\n\n # Test that selected asset checks can be run individually. Currently this is only supported\n # on checks defined with @asset_check, which will have an AssetChecksDefinition.\n all_check_keys_in_checks_defs = set()\n for asset_checks_def in self.asset_layer.asset_checks_defs:\n for spec in asset_checks_def.specs:\n all_check_keys_in_checks_defs.add(spec.key)\n\n non_checks_defs_asset_checks = [\n asset_check\n for asset_check in asset_check_selection or set()\n if asset_check not in all_check_keys_in_checks_defs\n ]\n non_checks_defs_asset_check_strings = [\n asset_check.name for asset_check in non_checks_defs_asset_checks\n ]\n if non_checks_defs_asset_checks:\n raise DagsterInvalidSubsetError(\n f"Can't execute asset checks [{', '.join(non_checks_defs_asset_check_strings)}],"\n " because they weren't defined with @asset_check or AssetChecksDefinition. 
To"\n " execute these checks, materialize the asset."\n )\n\n asset_selection_data = AssetSelectionData(\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n parent_job_def=self,\n )\n\n check.invariant(\n self.asset_layer.assets_defs_by_key is not None,\n "Asset layer must have _asset_defs argument defined",\n )\n\n new_job = build_asset_selection_job(\n name=self.name,\n assets=set(self.asset_layer.assets_defs_by_key.values()),\n source_assets=self.asset_layer.source_assets_by_key.values(),\n executor_def=self.executor_def,\n resource_defs=self.resource_defs,\n description=self.description,\n tags=self.tags,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n asset_selection_data=asset_selection_data,\n config=self.config_mapping or self.partitioned_config,\n asset_checks=self.asset_layer.asset_checks_defs,\n )\n return new_job\n\n def _get_job_def_for_op_selection(self, op_selection: Iterable[str]) -> Self:\n try:\n sub_graph = get_graph_subset(self.graph, op_selection)\n\n # if explicit config was passed the config_mapping that resolves the defaults implicitly is\n # very unlikely to work. The job will still present the default config in the Dagster UI.\n config = (\n None\n if self.run_config is not None\n else self.config_mapping or self.partitioned_config\n )\n\n return self._copy(\n config=config,\n graph_def=sub_graph,\n _subset_selection_data=OpSelectionData(\n op_selection=list(op_selection),\n resolved_op_selection=OpSelection(op_selection).resolve(self.graph),\n parent_job_def=self, # used by job snapshot lineage\n ),\n # TODO: subset this structure.\n # https://github.com/dagster-io/dagster/issues/7541\n asset_layer=self.asset_layer,\n )\n except DagsterInvalidDefinitionError as exc:\n # This handles the case when you construct a subset such that an unsatisfied\n # input cannot be loaded from config. 
Instead of throwing a DagsterInvalidDefinitionError,\n # we re-raise a DagsterInvalidSubsetError.\n node_paths = OpSelection(op_selection).resolve(self.graph)\n raise DagsterInvalidSubsetError(\n f"The attempted subset {str_format_set(node_paths)} for graph "\n f"{self.graph.name} results in an invalid graph."\n ) from exc\n\n
[docs] @public\n @deprecated(\n breaking_version="2.0.0",\n additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",\n )\n def run_request_for_partition(\n self,\n partition_key: str,\n run_key: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n run_config: Optional[Mapping[str, Any]] = None,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional["DynamicPartitionsStore"] = None,\n ) -> RunRequest:\n """Creates a RunRequest object for a run that processes the given partition.\n\n Args:\n partition_key: The key of the partition to request a run for.\n run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n a :py:class:`PartitionedConfig`, this value will override replace the config\n provided by it.\n current_time (Optional[datetime]): Used to determine which time-partitions exist.\n Defaults to now.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when the\n partitions definition is a DynamicPartitionsDefinition with a name defined. 
Users\n can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n\n Returns:\n RunRequest: an object that requests a run to process the given partition.\n """\n if not (self.partitions_def and self.partitioned_config):\n check.failed("Called run_request_for_partition on a non-partitioned job")\n\n if (\n isinstance(self.partitions_def, DynamicPartitionsDefinition)\n and self.partitions_def.name\n ):\n # Do not support using run_request_for_partition with dynamic partitions,\n # since this requires querying the instance once per run request for the\n # existent dynamic partitions\n check.failed(\n "run_request_for_partition is not supported for dynamic partitions. Instead, use"\n " RunRequest(partition_key=...)"\n )\n\n self.partitions_def.validate_partition_key(\n partition_key,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n run_config = (\n run_config\n if run_config is not None\n else self.partitioned_config.get_run_config_for_partition_key(partition_key)\n )\n run_request_tags = {\n **(tags or {}),\n **self.partitioned_config.get_tags_for_partition_key(\n partition_key,\n job_name=self.name,\n ),\n }\n\n return RunRequest(\n run_key=run_key,\n run_config=run_config,\n tags=run_request_tags,\n job_name=self.name,\n asset_selection=asset_selection,\n partition_key=partition_key,\n )
\n\n def get_config_schema_snapshot(self) -> "ConfigSchemaSnapshot":\n return self.get_job_snapshot().config_schema_snapshot\n\n def get_job_snapshot(self) -> "JobSnapshot":\n return self.get_job_index().job_snapshot\n\n def get_job_index(self) -> "JobIndex":\n from dagster._core.host_representation import JobIndex\n from dagster._core.snap import JobSnapshot\n\n return JobIndex(JobSnapshot.from_job_def(self), self.get_parent_job_snapshot())\n\n def get_job_snapshot_id(self) -> str:\n return self.get_job_index().job_snapshot_id\n\n def get_parent_job_snapshot(self) -> Optional["JobSnapshot"]:\n if self.op_selection_data:\n return self.op_selection_data.parent_job_def.get_job_snapshot()\n elif self.asset_selection_data:\n return self.asset_selection_data.parent_job_def.get_job_snapshot()\n else:\n return None\n\n def has_direct_input_value(self, input_name: str) -> bool:\n return input_name in self.input_values\n\n def get_direct_input_value(self, input_name: str) -> object:\n if input_name not in self.input_values:\n raise DagsterInvalidInvocationError(\n f"On job '{self.name}', attempted to retrieve input value for input named"\n f" '{input_name}', but no value was provided. 
Provided input values:"\n f" {sorted(list(self.input_values.keys()))}"\n )\n return self.input_values[input_name]\n\n def _copy(self, **kwargs: Any) -> "JobDefinition":\n # dict() calls copy dict props\n base_kwargs = dict(\n graph_def=self.graph,\n resource_defs=dict(self.resource_defs),\n executor_def=self._executor_def,\n logger_defs=self._loggers,\n config=self._original_config_argument,\n name=self._name,\n description=self.description,\n tags=self.tags,\n metadata=self._metadata,\n hook_defs=self.hook_defs,\n op_retry_policy=self._op_retry_policy,\n version_strategy=self.version_strategy,\n _subset_selection_data=self._subset_selection_data,\n asset_layer=self.asset_layer,\n input_values=self.input_values,\n partitions_def=self.partitions_def,\n _was_explicitly_provided_resources=None,\n )\n resolved_kwargs = {**base_kwargs, **kwargs} # base kwargs overwritten for conflicts\n job_def = JobDefinition.dagster_internal_init(**resolved_kwargs)\n update_wrapper(job_def, self, updated=())\n return job_def\n\n
[docs] @public\n def with_top_level_resources(\n self, resource_defs: Mapping[str, ResourceDefinition]\n ) -> "JobDefinition":\n """Apply a set of resources to all op instances within the job."""\n resource_defs = check.mapping_param(resource_defs, "resource_defs", key_type=str)\n return self._copy(resource_defs=resource_defs)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "JobDefinition":\n """Apply a set of hooks to all op instances within the job."""\n hook_defs = check.set_param(hook_defs, "hook_defs", of_type=HookDefinition)\n return self._copy(hook_defs=(hook_defs | self.hook_defs))
\n\n def with_executor_def(self, executor_def: ExecutorDefinition) -> "JobDefinition":\n return self._copy(executor_def=executor_def)\n\n def with_logger_defs(self, logger_defs: Mapping[str, LoggerDefinition]) -> "JobDefinition":\n return self._copy(logger_defs=logger_defs)\n\n @property\n def op_selection(self) -> Optional[AbstractSet[str]]:\n return set(self.op_selection_data.op_selection) if self.op_selection_data else None\n\n @property\n def asset_selection(self) -> Optional[AbstractSet[AssetKey]]:\n return self.asset_selection_data.asset_selection if self.asset_selection_data else None\n\n @property\n def resolved_op_selection(self) -> Optional[AbstractSet[str]]:\n return self.op_selection_data.resolved_op_selection if self.op_selection_data else None
\n\n\ndef _swap_default_io_man(resources: Mapping[str, ResourceDefinition], job: JobDefinition):\n """Used to create the user facing experience of the default io_manager\n switching to in-memory when using execute_in_process.\n """\n from dagster._core.storage.mem_io_manager import mem_io_manager\n\n if (\n resources.get(DEFAULT_IO_MANAGER_KEY) in [default_job_io_manager]\n and job.version_strategy is None\n ):\n updated_resources = dict(resources)\n updated_resources[DEFAULT_IO_MANAGER_KEY] = mem_io_manager\n return updated_resources\n\n return resources\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling."\n)\ndef default_job_io_manager(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n\n # normally, default to the fs_io_manager\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n instance = check.not_none(init_context.instance)\n return 
PickledObjectFilesystemIOManager(base_dir=instance.storage_directory())\n\n\n@dagster_maintained_io_manager\n@io_manager(\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n config_schema={"base_dir": Field(StringSource, is_required=False)},\n)\ndef default_job_io_manager_with_fs_io_manager_schema(init_context: "InitResourceContext"):\n # support overriding the default io manager via environment variables\n module_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_MODULE")\n attribute_name = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE")\n silence_failures = os.getenv("DAGSTER_DEFAULT_IO_MANAGER_SILENCE_FAILURES")\n\n if module_name and attribute_name:\n from dagster._core.execution.build_resources import build_resources\n\n try:\n module = importlib.import_module(module_name)\n attr = getattr(module, attribute_name)\n check.invariant(\n isinstance(attr, IOManagerDefinition),\n "DAGSTER_DEFAULT_IO_MANAGER_MODULE and DAGSTER_DEFAULT_IO_MANAGER_ATTRIBUTE"\n " must specify an IOManagerDefinition",\n )\n with build_resources({"io_manager": attr}, instance=init_context.instance) as resources:\n return resources.io_manager\n except Exception as e:\n if not silence_failures:\n raise\n else:\n warnings.warn(\n f"Failed to load io manager override with module: {module_name} attribute:"\n f" {attribute_name}: {e}\\nFalling back to default io manager."\n )\n from dagster._core.storage.fs_io_manager import PickledObjectFilesystemIOManager\n\n # normally, default to the fs_io_manager\n base_dir = init_context.resource_config.get(\n "base_dir", init_context.instance.storage_directory() if init_context.instance else None\n )\n\n return PickledObjectFilesystemIOManager(base_dir=base_dir)\n\n\ndef _config_mapping_with_default_value(\n inner_schema: ConfigType,\n default_config: Mapping[str, Any],\n job_name: str,\n) -> ConfigMapping:\n if not isinstance(inner_schema, Shape):\n check.failed("Only Shape (dictionary) config_schema allowed on 
Job ConfigMapping")\n\n def config_fn(x):\n return x\n\n updated_fields = {}\n field_aliases = inner_schema.field_aliases\n for name, field in inner_schema.fields.items():\n if name in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[name],\n description=field.description,\n )\n elif name in field_aliases and field_aliases[name] in default_config:\n updated_fields[name] = Field(\n config=field.config_type,\n default_value=default_config[field_aliases[name]],\n description=field.description,\n )\n else:\n updated_fields[name] = field\n\n config_schema = Shape(\n fields=updated_fields,\n description=(\n "This run config schema was automatically populated with default values "\n "from `default_config`."\n ),\n field_aliases=inner_schema.field_aliases,\n )\n\n config_evr = validate_config(config_schema, default_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error in config when building job '{job_name}' ",\n config_evr.errors,\n default_config,\n )\n\n return ConfigMapping(\n config_fn=config_fn, config_schema=config_schema, receive_processed_config_values=False\n )\n\n\ndef get_run_config_schema_for_job(\n graph_def: GraphDefinition,\n resource_defs: Mapping[str, ResourceDefinition],\n executor_def: "ExecutorDefinition",\n logger_defs: Mapping[str, LoggerDefinition],\n asset_layer: Optional[AssetLayer],\n was_explicitly_provided_resources: bool = False,\n) -> ConfigType:\n return JobDefinition(\n name=graph_def.name,\n graph_def=graph_def,\n resource_defs=resource_defs,\n executor_def=executor_def,\n logger_defs=logger_defs,\n asset_layer=asset_layer,\n _was_explicitly_provided_resources=was_explicitly_provided_resources,\n ).run_config_schema.run_config_schema_type\n\n\ndef _infer_asset_layer_from_source_asset_deps(job_graph_def: GraphDefinition) -> AssetLayer:\n """For non-asset jobs that have some inputs that are fed from SourceAssets, constructs an\n AssetLayer that includes 
those SourceAssets.\n """\n asset_keys_by_node_input_handle: Dict[NodeInputHandle, AssetKey] = {}\n source_assets_list = []\n source_asset_keys_set = set()\n io_manager_keys_by_asset_key: Mapping[AssetKey, str] = {}\n\n # each entry is a graph definition and its handle relative to the job root\n stack: List[Tuple[GraphDefinition, Optional[NodeHandle]]] = [(job_graph_def, None)]\n\n while stack:\n graph_def, parent_node_handle = stack.pop()\n\n for node_name, input_source_assets in graph_def.node_input_source_assets.items():\n node_handle = NodeHandle(node_name, parent_node_handle)\n for input_name, source_asset in input_source_assets.items():\n if source_asset.key not in source_asset_keys_set:\n source_asset_keys_set.add(source_asset.key)\n source_assets_list.append(source_asset)\n\n input_handle = NodeInputHandle(node_handle, input_name)\n asset_keys_by_node_input_handle[input_handle] = source_asset.key\n for resolved_input_handle in graph_def.node_dict[\n node_name\n ].definition.resolve_input_to_destinations(input_handle):\n asset_keys_by_node_input_handle[resolved_input_handle] = source_asset.key\n\n if source_asset.io_manager_key:\n io_manager_keys_by_asset_key[source_asset.key] = source_asset.io_manager_key\n\n for node_name, node in graph_def.node_dict.items():\n if isinstance(node.definition, GraphDefinition):\n stack.append((node.definition, NodeHandle(node_name, parent_node_handle)))\n\n return AssetLayer(\n assets_defs_by_node_handle={},\n asset_keys_by_node_input_handle=asset_keys_by_node_input_handle,\n asset_info_by_node_output_handle={},\n asset_deps={},\n dependency_node_handles_by_asset_key={},\n assets_defs_by_key={},\n source_assets_by_key={\n source_asset.key: source_asset for source_asset in source_assets_list\n },\n io_manager_keys_by_asset_key=io_manager_keys_by_asset_key,\n dep_asset_keys_by_node_output_handle={},\n partition_mappings_by_asset_dep={},\n asset_checks_defs_by_node_handle={},\n node_output_handles_by_asset_check_key={},\n 
check_names_by_asset_key_by_node_handle={},\n check_key_by_node_output_handle={},\n )\n\n\ndef _build_all_node_defs(node_defs: Sequence[NodeDefinition]) -> Mapping[str, NodeDefinition]:\n all_defs: Dict[str, NodeDefinition] = {}\n for current_level_node_def in node_defs:\n for node_def in current_level_node_def.iterate_node_defs():\n if node_def.name in all_defs:\n if all_defs[node_def.name] != node_def:\n raise DagsterInvalidDefinitionError(\n 'Detected conflicting node definitions with the same name "{name}"'.format(\n name=node_def.name\n )\n )\n else:\n all_defs[node_def.name] = node_def\n\n return all_defs\n\n\ndef _create_run_config_schema(\n job_def: JobDefinition,\n required_resources: AbstractSet[str],\n) -> "RunConfigSchema":\n from .run_config import (\n RunConfigSchemaCreationData,\n construct_config_type_dictionary,\n define_run_config_schema_type,\n )\n from .run_config_schema import RunConfigSchema\n\n # When executing with a subset job, include the missing nodes\n # from the original job as ignored to allow execution with\n # run config that is valid for the original\n ignored_nodes: Sequence[Node] = []\n if job_def.is_subset:\n if isinstance(job_def.graph, SubselectedGraphDefinition): # op selection provided\n ignored_nodes = job_def.graph.get_top_level_omitted_nodes()\n elif job_def.asset_selection_data:\n parent_job = job_def\n while parent_job.asset_selection_data:\n parent_job = parent_job.asset_selection_data.parent_job_def\n\n ignored_nodes = [\n node for node in parent_job.graph.nodes if not job_def.has_node_named(node.name)\n ]\n else:\n ignored_nodes = []\n\n run_config_schema_type = define_run_config_schema_type(\n RunConfigSchemaCreationData(\n job_name=job_def.name,\n nodes=job_def.graph.nodes,\n graph_def=job_def.graph,\n dependency_structure=job_def.graph.dependency_structure,\n executor_def=job_def.executor_def,\n resource_defs=job_def.resource_defs,\n logger_defs=job_def.loggers,\n ignored_nodes=ignored_nodes,\n 
required_resources=required_resources,\n direct_inputs=job_def.input_values,\n asset_layer=job_def.asset_layer,\n )\n )\n\n if job_def.config_mapping:\n outer_config_type = job_def.config_mapping.config_schema.config_type\n else:\n outer_config_type = run_config_schema_type\n\n if outer_config_type is None:\n check.failed("Unexpected outer_config_type value of None")\n\n config_type_dict_by_name, config_type_dict_by_key = construct_config_type_dictionary(\n job_def.all_node_defs,\n outer_config_type,\n )\n\n return RunConfigSchema(\n run_config_schema_type=run_config_schema_type,\n config_type_dict_by_name=config_type_dict_by_name,\n config_type_dict_by_key=config_type_dict_by_key,\n config_mapping=job_def.config_mapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.job_definition"}, "load_assets_from_modules": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.load_assets_from_modules

\nimport inspect\nimport os\nimport pkgutil\nfrom importlib import import_module\nfrom types import ModuleType\nfrom typing import Dict, Generator, Iterable, List, Optional, Sequence, Set, Tuple, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_policy import AutoMaterializePolicy\nfrom dagster._core.definitions.backfill_policy import BackfillPolicy\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .assets import AssetsDefinition\nfrom .cacheable_assets import CacheableAssetsDefinition\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\nfrom .source_asset import SourceAsset\n\n\ndef _find_assets_in_module(\n    module: ModuleType,\n) -> Generator[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition], None, None]:\n    """Finds assets in the given module and adds them to the given sets of assets and source assets."""\n    for attr in dir(module):\n        value = getattr(module, attr)\n        if isinstance(value, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition)):\n            yield value\n        elif isinstance(value, list) and all(\n            isinstance(el, (AssetsDefinition, SourceAsset, CacheableAssetsDefinition))\n            for el in value\n        ):\n            yield from value\n\n\ndef assets_from_modules(\n    modules: Iterable[ModuleType], extra_source_assets: Optional[Sequence[SourceAsset]] = None\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n    """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable\n    assets from the given modules.\n\n    Args:\n        modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n        extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n     
       group in addition to the source assets found in the modules.\n\n    Returns:\n        Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]]:\n            A tuple containing a list of assets, a list of source assets, and a list of\n            cacheable assets defined in the given modules.\n    """\n    asset_ids: Set[int] = set()\n    asset_keys: Dict[AssetKey, ModuleType] = dict()\n    source_assets: List[SourceAsset] = list(\n        check.opt_sequence_param(extra_source_assets, "extra_source_assets", of_type=SourceAsset)\n    )\n    cacheable_assets: List[CacheableAssetsDefinition] = []\n    assets: Dict[AssetKey, AssetsDefinition] = {}\n    for module in modules:\n        for asset in _find_assets_in_module(module):\n            if id(asset) not in asset_ids:\n                asset_ids.add(id(asset))\n                if isinstance(asset, CacheableAssetsDefinition):\n                    cacheable_assets.append(asset)\n                else:\n                    keys = asset.keys if isinstance(asset, AssetsDefinition) else [asset.key]\n                    for key in keys:\n                        if key in asset_keys:\n                            modules_str = ", ".join(\n                                set([asset_keys[key].__name__, module.__name__])\n                            )\n                            error_str = (\n                                f"Asset key {key} is defined multiple times. Definitions found in"\n                                f" modules: {modules_str}. 
"\n                            )\n\n                            if key in assets and isinstance(asset, AssetsDefinition):\n                                if assets[key].node_def == asset.node_def:\n                                    error_str += (\n                                        "One possible cause of this bug is a call to with_resources"\n                                        " outside of a repository definition, causing a duplicate"\n                                        " asset definition."\n                                    )\n\n                            raise DagsterInvalidDefinitionError(error_str)\n                        else:\n                            asset_keys[key] = module\n                            if isinstance(asset, AssetsDefinition):\n                                assets[key] = asset\n                    if isinstance(asset, SourceAsset):\n                        source_assets.append(asset)\n    return list(set(assets.values())), source_assets, cacheable_assets\n\n\n
[docs]def load_assets_from_modules(\n modules: Iterable[ModuleType],\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets from the given modules.\n\n Args:\n modules (Iterable[ModuleType]): The Python modules to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset]]:\n A list containing assets and source assets defined in the given modules.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_modules(modules)\n\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_current_module(\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets from the module where\n this function is called.\n\n Args:\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CachableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n caller = inspect.stack()[1]\n module = inspect.getmodule(caller[0])\n if module is None:\n check.failed("Could not find a module for the caller")\n\n return load_assets_from_modules(\n [module],\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef assets_from_package_module(\n package_module: ModuleType,\n extra_source_assets: Optional[Sequence[SourceAsset]] = None,\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n """Constructs three lists, a list of assets, a list of source assets, and a list of cacheable assets\n from the given package module.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n extra_source_assets (Optional[Sequence[SourceAsset]]): Source assets to include in the\n group in addition to the source assets found in the modules.\n\n Returns:\n Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset], Sequence[CacheableAssetsDefinition]]:\n A tuple containing a list of assets, a list of source assets, and a list of cacheable assets\n defined in the given modules.\n """\n return assets_from_modules(\n _find_modules_in_package(package_module), extra_source_assets=extra_source_assets\n )\n\n\n
[docs]def load_assets_from_package_module(\n package_module: ModuleType,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets and source assets that includes all asset\n definitions, source assets, and cacheable assets in all sub-modules of the given package module.\n\n A package module is the result of importing a package.\n\n Args:\n package_module (ModuleType): The package module to looks for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n group_name = check.opt_str_param(group_name, "group_name")\n key_prefix = check_opt_coercible_to_asset_key_prefix_param(key_prefix, "key_prefix")\n freshness_policy = check.opt_inst_param(freshness_policy, "freshness_policy", FreshnessPolicy)\n auto_materialize_policy = check.opt_inst_param(\n auto_materialize_policy, "auto_materialize_policy", AutoMaterializePolicy\n )\n backfill_policy = check.opt_inst_param(backfill_policy, "backfill_policy", BackfillPolicy)\n\n (\n assets,\n source_assets,\n cacheable_assets,\n ) = assets_from_package_module(package_module)\n return assets_with_attributes(\n assets,\n source_assets,\n cacheable_assets,\n key_prefix=key_prefix,\n group_name=group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n source_key_prefix=source_key_prefix,\n )
\n\n\n
[docs]def load_assets_from_package_name(\n package_name: str,\n group_name: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n *,\n freshness_policy: Optional[FreshnessPolicy] = None,\n auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n backfill_policy: Optional[BackfillPolicy] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n """Constructs a list of assets, source assets, and cacheable assets that includes all asset\n definitions and source assets in all sub-modules of the given package.\n\n Args:\n package_name (str): The name of a Python package to look for assets inside.\n group_name (Optional[str]):\n Group name to apply to the loaded assets. The returned assets will be copies of the\n loaded objects, with the group name added.\n key_prefix (Optional[Union[str, Sequence[str]]]):\n Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\n of the loaded objects, with the prefix prepended.\n freshness_policy (Optional[FreshnessPolicy]): FreshnessPolicy to apply to all the loaded\n assets.\n auto_materialize_policy (Optional[AutoMaterializePolicy]): AutoMaterializePolicy to apply\n to all the loaded assets.\n backfill_policy (Optional[AutoMaterializePolicy]): BackfillPolicy to apply to all the loaded assets.\n source_key_prefix (bool): Prefix to prepend to the keys of loaded SourceAssets. 
The returned\n assets will be copies of the loaded objects, with the prefix prepended.\n\n Returns:\n Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n A list containing assets, source assets, and cacheable assets defined in the module.\n """\n package_module = import_module(package_name)\n return load_assets_from_package_module(\n package_module,\n group_name=group_name,\n key_prefix=key_prefix,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )
\n\n\ndef _find_modules_in_package(package_module: ModuleType) -> Iterable[ModuleType]:\n yield package_module\n package_path = package_module.__file__\n if package_path:\n for _, modname, is_pkg in pkgutil.walk_packages([os.path.dirname(package_path)]):\n submodule = import_module(f"{package_module.__name__}.{modname}")\n if is_pkg:\n yield from _find_modules_in_package(submodule)\n else:\n yield submodule\n else:\n raise ValueError(\n f"Tried to find modules in package {package_module}, but its __file__ is None"\n )\n\n\ndef prefix_assets(\n assets_defs: Sequence[AssetsDefinition],\n key_prefix: CoercibleToAssetKeyPrefix,\n source_assets: Sequence[SourceAsset],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n) -> Tuple[Sequence[AssetsDefinition], Sequence[SourceAsset]]:\n """Given a list of assets, prefix the input and output asset keys with key_prefix.\n The prefix is not added to source assets.\n\n Input asset keys that reference other assets within assets_defs are "brought along" -\n i.e. prefixed as well.\n\n Example with a single asset:\n\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n result = prefixed_asset_key_replacements([asset_1], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n\n Example with dependencies within the list of assets:\n\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n result = prefixed_asset_key_replacements([asset1, asset2], "my_prefix")\n assert result.assets[0].asset_key == AssetKey(["my_prefix", "asset1"])\n assert result.assets[1].asset_key == AssetKey(["my_prefix", "asset2"])\n assert result.assets[1].dependency_keys == {AssetKey(["my_prefix", "asset1"])}\n\n """\n asset_keys = {asset_key for assets_def in assets_defs for asset_key in assets_def.keys}\n source_asset_keys = {source_asset.key for source_asset in source_assets}\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.is_list(key_prefix, of_type=str)\n\n result_assets: List[AssetsDefinition] = []\n for assets_def in assets_defs:\n output_asset_key_replacements = {\n asset_key: AssetKey([*key_prefix, *asset_key.path]) for asset_key in assets_def.keys\n }\n input_asset_key_replacements = {}\n for dep_asset_key in assets_def.dependency_keys:\n if dep_asset_key in asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*key_prefix, *dep_asset_key.path]\n )\n elif source_key_prefix and dep_asset_key in source_asset_keys:\n input_asset_key_replacements[dep_asset_key] = AssetKey(\n [*source_key_prefix, *dep_asset_key.path]\n )\n\n result_assets.append(\n assets_def.with_attributes(\n output_asset_key_replacements=output_asset_key_replacements,\n input_asset_key_replacements=input_asset_key_replacements,\n )\n )\n\n if source_key_prefix:\n result_source_assets = [\n source_asset.with_attributes(key=AssetKey([*source_key_prefix, *source_asset.key.path]))\n for source_asset in source_assets\n ]\n else:\n result_source_assets = source_assets\n\n return result_assets, result_source_assets\n\n\ndef assets_with_attributes(\n assets_defs: Sequence[AssetsDefinition],\n source_assets: Sequence[SourceAsset],\n cacheable_assets: Sequence[CacheableAssetsDefinition],\n key_prefix: Optional[Sequence[str]],\n group_name: Optional[str],\n 
freshness_policy: Optional[FreshnessPolicy],\n auto_materialize_policy: Optional[AutoMaterializePolicy],\n backfill_policy: Optional[BackfillPolicy],\n source_key_prefix: Optional[Sequence[str]],\n) -> Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]:\n # There is a tricky edge case here where if a non-cacheable asset depends on a cacheable asset,\n # and the assets are prefixed, the non-cacheable asset's dependency will not be prefixed since\n # at prefix-time it is not known that its dependency is one of the cacheable assets.\n # https://github.com/dagster-io/dagster/pull/10389#pullrequestreview-1170913271\n if key_prefix:\n assets_defs, source_assets = prefix_assets(\n assets_defs, key_prefix, source_assets, source_key_prefix\n )\n cacheable_assets = [\n cached_asset.with_prefix_for_all(key_prefix) for cached_asset in cacheable_assets\n ]\n\n if group_name or freshness_policy or auto_materialize_policy or backfill_policy:\n assets_defs = [\n asset.with_attributes(\n group_names_by_key=(\n {asset_key: group_name for asset_key in asset.keys} if group_name else None\n ),\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for asset in assets_defs\n ]\n if group_name:\n source_assets = [\n source_asset.with_attributes(group_name=group_name)\n for source_asset in source_assets\n ]\n cacheable_assets = [\n cached_asset.with_attributes_for_all(\n group_name,\n freshness_policy=freshness_policy,\n auto_materialize_policy=auto_materialize_policy,\n backfill_policy=backfill_policy,\n )\n for cached_asset in cacheable_assets\n ]\n\n return [*assets_defs, *source_assets, *cacheable_assets]\n
", "current_page_name": "_modules/dagster/_core/definitions/load_assets_from_modules", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.load_assets_from_modules"}, "logger_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.logger_definition

\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast, overload\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidInvocationError\n\nfrom ..decorator_utils import get_function_params\nfrom .config import is_callable_valid_config_arg\nfrom .configurable import AnonymousConfigurableDefinition\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\n\nif TYPE_CHECKING:\n    import logging\n\n    from dagster._core.definitions import JobDefinition\n    from dagster._core.execution.context.logger import InitLoggerContext, UnboundInitLoggerContext\n\n    InitLoggerFunction = Callable[[InitLoggerContext], logging.Logger]\n\n\n
[docs]class LoggerDefinition(AnonymousConfigurableDefinition):\n """Core class for defining loggers.\n\n Loggers are job-scoped logging handlers, which will be automatically invoked whenever\n dagster messages are logged from within a job.\n\n Args:\n logger_fn (Callable[[InitLoggerContext], logging.Logger]): User-provided function to\n instantiate the logger. This logger will be automatically invoked whenever the methods\n on ``context.log`` are called from within job compute logic.\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of this logger.\n """\n\n def __init__(\n self,\n logger_fn: "InitLoggerFunction",\n config_schema: Any = None,\n description: Optional[str] = None,\n ):\n self._logger_fn = check.callable_param(logger_fn, "logger_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n from .logger_invocation import logger_invocation_result\n\n if len(args) == 0 and len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Logger initialization function has context argument, but no context argument was "\n "provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of logger received multiple arguments. 
Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.logger_fn)[0].name\n\n if args:\n context = check.opt_inst_param(\n args[0],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n return logger_invocation_result(self, context)\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Logger initialization expected argument '{context_param_name}'."\n )\n context = check.opt_inst_param(\n kwargs[context_param_name],\n context_param_name,\n UnboundInitLoggerContext,\n default=UnboundInitLoggerContext(logger_config=None, job_def=None),\n )\n\n return logger_invocation_result(self, context)\n\n @public\n @property\n def logger_fn(self) -> "InitLoggerFunction":\n """Callable[[InitLoggerContext], logging.Logger]: The function that will be invoked to\n instantiate the logger.\n """\n return self._logger_fn\n\n @public\n @property\n def config_schema(self) -> Any:\n """Any: The schema for the logger's config. Configuration data available in `init_context.logger_config`."""\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the logger."""\n return self._description\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: Any,\n ) -> "LoggerDefinition":\n return LoggerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n logger_fn=self.logger_fn,\n )
\n\n\n@overload\ndef logger(\n config_schema: CoercableToConfigSchema, description: Optional[str] = ...\n) -> Callable[["InitLoggerFunction"], "LoggerDefinition"]: ...\n\n\n@overload\ndef logger(\n config_schema: "InitLoggerFunction", description: Optional[str] = ...\n) -> "LoggerDefinition": ...\n\n\n
[docs]def logger(\n config_schema: Union[CoercableToConfigSchema, "InitLoggerFunction"] = None,\n description: Optional[str] = None,\n) -> Union["LoggerDefinition", Callable[["InitLoggerFunction"], "LoggerDefinition"]]:\n """Define a logger.\n\n The decorated function should accept an :py:class:`InitLoggerContext` and return an instance of\n :py:class:`python:logging.Logger`. This function will become the ``logger_fn`` of an underlying\n :py:class:`LoggerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.logger_config`. If not set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the logger.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. @logger versus @logger()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return LoggerDefinition(logger_fn=cast("InitLoggerFunction", config_schema))\n\n def _wrap(logger_fn: "InitLoggerFunction") -> "LoggerDefinition":\n return LoggerDefinition(\n logger_fn=logger_fn,\n config_schema=config_schema,\n description=description,\n )\n\n return _wrap
\n\n\n
[docs]def build_init_logger_context(\n logger_config: Any = None,\n job_def: Optional["JobDefinition"] = None,\n) -> "UnboundInitLoggerContext":\n """Builds logger initialization context from provided parameters.\n\n This function can be used to provide the context argument to the invocation of a logger\n definition.\n\n Note that you may only specify one of pipeline_def and job_def.\n\n Args:\n logger_config (Any): The config to provide during initialization of logger.\n job_def (Optional[JobDefinition]): The job definition that the logger will be used with.\n\n Examples:\n .. code-block:: python\n\n context = build_init_logger_context()\n logger_to_init(context)\n """\n from dagster._core.definitions import JobDefinition\n from dagster._core.execution.context.logger import UnboundInitLoggerContext\n\n check.opt_inst_param(job_def, "job_def", JobDefinition)\n\n return UnboundInitLoggerContext(logger_config=logger_config, job_def=job_def)
\n
", "current_page_name": "_modules/dagster/_core/definitions/logger_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.logger_definition"}, "materialize": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.materialize

\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Set, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions.unresolved_asset_job_definition import define_asset_job\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..errors import DagsterInvariantViolationError\nfrom ..instance import DagsterInstance\nfrom ..storage.io_manager import IOManagerDefinition\nfrom ..storage.mem_io_manager import mem_io_manager\nfrom .assets import AssetsDefinition\nfrom .source_asset import SourceAsset\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.events import AssetKey\n\n    from ..execution.execute_in_process_result import ExecuteInProcessResult\n\nEPHEMERAL_JOB_NAME = "__ephemeral_asset_job__"\n\n\n
[docs]def materialize(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets.\n\n By default, will materialize assets to the local filesystem.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize.\n\n Unless you're using `deps` or `non_argument_deps`, you must also include all assets that are\n upstream of the assets that you want to materialize. This is because those upstream\n asset definitions have information that is needed to load their contents while\n materializing the downstream assets.\n\n You can use the `selection` argument to distinguish between assets that you want to\n materialize and assets that are just present for loading.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. Note that if provided resources\n conflict with resources directly on assets, an error will be thrown.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. 
Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset2, loading its input from asset1\n materialize([asset1, asset2], selection=[asset2])\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n partition_key = check.opt_str_param(partition_key, "partition_key")\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n\n all_executable_keys: Set[AssetKey] = set()\n for asset in assets:\n if isinstance(asset, AssetsDefinition):\n all_executable_keys = all_executable_keys.union(set(asset.keys))\n\n defs = Definitions(\n jobs=[define_asset_job(name=EPHEMERAL_JOB_NAME, selection=selection)],\n assets=assets,\n resources=resources,\n )\n return check.not_none(\n defs.get_job_def(EPHEMERAL_JOB_NAME),\n "This should always return a job",\n ).execute_in_process(\n run_config=run_config,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n )
\n\n\n
[docs]def materialize_to_memory(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]],\n run_config: Any = None,\n instance: Optional[DagsterInstance] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n raise_on_error: bool = True,\n tags: Optional[Mapping[str, str]] = None,\n selection: Optional["CoercibleToAssetSelection"] = None,\n) -> "ExecuteInProcessResult":\n """Executes a single-threaded, in-process run which materializes provided assets in memory.\n\n Will explicitly use :py:func:`mem_io_manager` for all required io manager\n keys. If any io managers are directly provided using the `resources`\n argument, a :py:class:`DagsterInvariantViolationError` will be thrown.\n\n Args:\n assets (Sequence[Union[AssetsDefinition, SourceAsset]]):\n The assets to materialize. Can also provide :py:class:`SourceAsset` objects to fill dependencies for asset defs.\n run_config (Optional[Any]): The run config to use for the run that materializes the assets.\n resources (Optional[Mapping[str, object]]):\n The resources needed for execution. Can provide resource instances\n directly, or resource definitions. If provided resources\n conflict with resources directly on assets, an error will be thrown.\n partition_key: (Optional[str])\n The string partition key that specifies the run config to execute. Can only be used\n to select run config for assets with partitioned config.\n tags (Optional[Mapping[str, str]]): Tags for the run.\n selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]):\n A sub-selection of assets to materialize.\n\n If not provided, then all assets will be materialized.\n\n If providing a string or sequence of strings,\n https://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\n syntax.\n\n Returns:\n ExecuteInProcessResult: The result of the execution.\n\n Examples:\n .. 
code-block:: python\n\n @asset\n def asset1():\n ...\n\n @asset\n def asset2(asset1):\n ...\n\n # executes a run that materializes asset1 and then asset2\n materialize([asset1, asset2])\n\n # executes a run that materializes just asset1\n materialize([asset1, asset2], selection=[asset1])\n """\n assets = check.sequence_param(assets, "assets", of_type=(AssetsDefinition, SourceAsset))\n\n # Gather all resource defs for the purpose of checking io managers.\n resources_dict = resources or {}\n all_resource_keys = set(resources_dict.keys())\n for asset in assets:\n all_resource_keys = all_resource_keys.union(asset.resource_defs.keys())\n\n io_manager_keys = _get_required_io_manager_keys(assets)\n for io_manager_key in io_manager_keys:\n if io_manager_key in all_resource_keys:\n raise DagsterInvariantViolationError(\n "Attempted to call `materialize_to_memory` with a resource "\n f"provided for io manager key '{io_manager_key}'. Do not "\n "provide resources for io manager keys when calling "\n "`materialize_to_memory`, as it will override io management "\n "behavior for all keys."\n )\n\n resource_defs = merge_dicts({key: mem_io_manager for key in io_manager_keys}, resources_dict)\n\n return materialize(\n assets=assets,\n run_config=run_config,\n resources=resource_defs,\n instance=instance,\n partition_key=partition_key,\n raise_on_error=raise_on_error,\n tags=tags,\n selection=selection,\n )
\n\n\ndef _get_required_io_manager_keys(\n assets: Sequence[Union[AssetsDefinition, SourceAsset]]\n) -> Set[str]:\n io_manager_keys = set()\n for asset in assets:\n for requirement in asset.get_resource_requirements():\n if requirement.expected_type == IOManagerDefinition:\n io_manager_keys.add(requirement.key)\n return io_manager_keys\n
", "current_page_name": "_modules/dagster/_core/definitions/materialize", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.materialize"}, "metadata": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata

\nimport os\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import Self, TypeAlias, TypeVar\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, experimental, public\nfrom dagster._core.errors import DagsterInvalidMetadata\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._serdes.serdes import (\n    FieldSerializer,\n    PackableValue,\n    UnpackContext,\n    WhitelistMap,\n    pack_value,\n)\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom .table import (  # re-exported\n    TableColumn as TableColumn,\n    TableColumnConstraints as TableColumnConstraints,\n    TableConstraints as TableConstraints,\n    TableRecord as TableRecord,\n    TableSchema as TableSchema,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import AssetKey\n\nArbitraryMetadataMapping: TypeAlias = Mapping[str, Any]\n\nRawMetadataValue = Union[\n    "MetadataValue",\n    TableSchema,\n    "AssetKey",\n    os.PathLike,\n    Dict[Any, Any],\n    float,\n    int,\n    List[Any],\n    str,\n    None,\n]\n\nMetadataMapping: TypeAlias = Mapping[str, "MetadataValue"]\nMetadataUserInput: TypeAlias = Mapping[str, RawMetadataValue]\n\nT_Packable = TypeVar("T_Packable", bound=PackableValue, default=PackableValue, covariant=True)\n\n# ########################\n# ##### NORMALIZATION\n# ########################\n\n\ndef normalize_metadata(\n    metadata: Mapping[str, RawMetadataValue],\n    allow_invalid: bool = False,\n) -> Mapping[str, "MetadataValue"]:\n    # This is a stopgap measure to deal with unsupported metadata values, which occur when we try\n    # to convert arbitrary metadata (on e.g. 
OutputDefinition) to a MetadataValue, which is required\n    # for serialization. This will cause unsupported values to be silently replaced with a\n    # string placeholder.\n    normalized_metadata: Dict[str, MetadataValue] = {}\n    for k, v in metadata.items():\n        try:\n            normalized_value = normalize_metadata_value(v)\n        except DagsterInvalidMetadata as e:\n            if allow_invalid:\n                deprecation_warning(\n                    "Support for arbitrary metadata values",\n                    "2.0.0",\n                    additional_warn_text=(\n                        "In the future, all user-supplied metadata values must be one of"\n                        f" {RawMetadataValue}"\n                    ),\n                    stacklevel=4,  # to get the caller of `normalize_metadata`\n                )\n                normalized_value = TextMetadataValue(f"[{v.__class__.__name__}] (unserializable)")\n            else:\n                raise DagsterInvalidMetadata(\n                    f'Could not resolve the metadata value for "{k}" to a known type. 
{e}'\n                ) from None\n        normalized_metadata[k] = normalized_value\n\n    return normalized_metadata\n\n\ndef normalize_metadata_value(raw_value: RawMetadataValue) -> "MetadataValue[Any]":\n    from dagster._core.definitions.events import AssetKey\n\n    if isinstance(raw_value, MetadataValue):\n        return raw_value\n    elif isinstance(raw_value, str):\n        return MetadataValue.text(raw_value)\n    elif isinstance(raw_value, float):\n        return MetadataValue.float(raw_value)\n    elif isinstance(raw_value, bool):\n        return MetadataValue.bool(raw_value)\n    elif isinstance(raw_value, int):\n        return MetadataValue.int(raw_value)\n    elif isinstance(raw_value, (list, dict)):\n        return MetadataValue.json(raw_value)\n    elif isinstance(raw_value, os.PathLike):\n        return MetadataValue.path(raw_value)\n    elif isinstance(raw_value, AssetKey):\n        return MetadataValue.asset(raw_value)\n    elif isinstance(raw_value, TableSchema):\n        return MetadataValue.table_schema(raw_value)\n    elif raw_value is None:\n        return MetadataValue.null()\n\n    raise DagsterInvalidMetadata(\n        f"Its type was {type(raw_value)}. Consider wrapping the value with the appropriate "\n        "MetadataValue type."\n    )\n\n\n# ########################\n# ##### METADATA VALUE\n# ########################\n\n\n
[docs]class MetadataValue(ABC, Generic[T_Packable]):\n """Utility class to wrap metadata values passed into Dagster events so that they can be\n displayed in the Dagster UI and other tooling.\n\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": "hello",\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n "num_rows": 0,\n },\n )\n """\n\n @public\n @property\n @abstractmethod\n def value(self) -> T_Packable:\n """The wrapped value."""\n raise NotImplementedError()\n\n
[docs] @public\n @staticmethod\n def text(text: str) -> "TextMetadataValue":\n """Static constructor for a metadata value wrapping text as\n :py:class:`TextMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "my_text_label": MetadataValue.text("hello")\n },\n )\n\n Args:\n text (str): The text string for a metadata entry.\n """\n return TextMetadataValue(text)
\n\n
[docs] @public\n @staticmethod\n def url(url: str) -> "UrlMetadataValue":\n """Static constructor for a metadata value wrapping a URL as\n :py:class:`UrlMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dashboard",\n metadata={\n "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n }\n )\n\n Args:\n url (str): The URL for a metadata entry.\n """\n return UrlMetadataValue(url)
\n\n
[docs] @public\n @staticmethod\n def path(path: Union[str, os.PathLike]) -> "PathMetadataValue":\n """Static constructor for a metadata value wrapping a path as\n :py:class:`PathMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "filepath": MetadataValue.path("path/to/file"),\n }\n )\n\n Args:\n path (str): The path for a metadata entry.\n """\n return PathMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def notebook(path: Union[str, os.PathLike]) -> "NotebookMetadataValue":\n """Static constructor for a metadata value wrapping a notebook path as\n :py:class:`NotebookMetadataValue`.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "notebook_path": MetadataValue.notebook("path/to/notebook.ipynb"),\n }\n )\n\n Args:\n path (str): The path to a notebook for a metadata entry.\n """\n return NotebookMetadataValue(path)
\n\n
[docs] @public\n @staticmethod\n def json(data: Union[Sequence[Any], Mapping[str, Any]]) -> "JsonMetadataValue":\n """Static constructor for a metadata value wrapping a json-serializable list or dict\n as :py:class:`JsonMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not missing_things,\n label="is_present",\n metadata={\n "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n },\n )\n\n Args:\n data (Union[Sequence[Any], Mapping[str, Any]]): The JSON data for a metadata entry.\n """\n return JsonMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def md(data: str) -> "MarkdownMetadataValue":\n """Static constructor for a metadata value wrapping markdown data as\n :py:class:`MarkdownMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, md_str):\n yield AssetMaterialization(\n asset_key="info",\n metadata={\n 'Details': MetadataValue.md(md_str)\n },\n )\n\n Args:\n md_str (str): The markdown for a metadata entry.\n """\n return MarkdownMetadataValue(data)
\n\n
[docs] @public\n @staticmethod\n def python_artifact(python_artifact: Callable) -> "PythonArtifactMetadataValue":\n """Static constructor for a metadata value wrapping a python artifact as\n :py:class:`PythonArtifactMetadataValue`. Can be used as the value type for the\n `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "class": MetadataValue.python_artifact(MyClass),\n "function": MetadataValue.python_artifact(my_function),\n }\n )\n\n Args:\n value (Callable): The python class or function for a metadata entry.\n """\n check.callable_param(python_artifact, "python_artifact")\n return PythonArtifactMetadataValue(python_artifact.__module__, python_artifact.__name__)
\n\n
[docs] @public\n @staticmethod\n def float(value: float) -> "FloatMetadataValue":\n """Static constructor for a metadata value wrapping a float as\n :py:class:`FloatMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n }\n )\n\n Args:\n value (float): The float value for a metadata entry.\n """\n return FloatMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def int(value: int) -> "IntMetadataValue":\n """Static constructor for a metadata value wrapping an int as\n :py:class:`IntMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "number of rows": MetadataValue.int(len(df)),\n },\n )\n\n Args:\n value (int): The int value for a metadata entry.\n """\n return IntMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def bool(value: bool) -> "BoolMetadataValue":\n """Static constructor for a metadata value wrapping a bool as\n :py:class:`BoolMetadataValuye`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context, df):\n yield AssetMaterialization(\n asset_key="my_dataset",\n metadata={\n "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n },\n )\n\n Args:\n value (bool): The bool value for a metadata entry.\n """\n return BoolMetadataValue(value)
\n\n
[docs] @public\n @staticmethod\n def dagster_run(run_id: str) -> "DagsterRunMetadataValue":\n """Static constructor for a metadata value wrapping a reference to a Dagster run.\n\n Args:\n run_id (str): The ID of the run.\n """\n return DagsterRunMetadataValue(run_id)
\n\n
[docs] @public\n @staticmethod\n def asset(asset_key: "AssetKey") -> "DagsterAssetMetadataValue":\n """Static constructor for a metadata value referencing a Dagster asset, by key.\n\n For example:\n\n .. code-block:: python\n\n @op\n def validate_table(context, df):\n yield AssetMaterialization(\n asset_key=AssetKey("my_table"),\n metadata={\n "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n },\n )\n\n Args:\n asset_key (AssetKey): The asset key referencing the asset.\n """\n from dagster._core.definitions.events import AssetKey\n\n check.inst_param(asset_key, "asset_key", AssetKey)\n return DagsterAssetMetadataValue(asset_key)
\n\n
[docs] @public\n @staticmethod\n @experimental\n def table(\n records: Sequence[TableRecord], schema: Optional[TableSchema] = None\n ) -> "TableMetadataValue":\n """Static constructor for a metadata value wrapping arbitrary tabular data as\n :py:class:`TableMetadataValue`. Can be used as the value type for the `metadata`\n parameter for supported events.\n\n Example:\n .. code-block:: python\n\n @op\n def emit_metadata(context):\n yield ExpectationResult(\n success=not has_errors,\n label="is_valid",\n metadata={\n "errors": MetadataValue.table(\n records=[\n TableRecord(code="invalid-data-type", row=2, col="name"),\n ],\n schema=TableSchema(\n columns=[\n TableColumn(name="code", type="string"),\n TableColumn(name="row", type="int"),\n TableColumn(name="col", type="string"),\n ]\n )\n ),\n },\n )\n """\n return TableMetadataValue(records, schema)
\n\n
[docs] @public\n @staticmethod\n def table_schema(\n schema: TableSchema,\n ) -> "TableSchemaMetadataValue":\n """Static constructor for a metadata value wrapping a table schema as\n :py:class:`TableSchemaMetadataValue`. Can be used as the value type\n for the `metadata` parameter for supported events.\n\n Example:\n .. code-block:: python\n\n schema = TableSchema(\n columns = [\n TableColumn(name="id", type="int"),\n TableColumn(name="status", type="bool"),\n ]\n )\n\n DagsterType(\n type_check_fn=some_validation_fn,\n name='MyTable',\n metadata={\n 'my_table_schema': MetadataValue.table_schema(schema),\n }\n )\n\n Args:\n schema (TableSchema): The table schema for a metadata entry.\n """\n return TableSchemaMetadataValue(schema)
\n\n
[docs] @public\n @staticmethod\n def null() -> "NullMetadataValue":\n """Static constructor for a metadata value representing null. Can be used as the value type\n for the `metadata` parameter for supported events.\n """\n return NullMetadataValue()
\n\n\n# ########################\n# ##### METADATA VALUE TYPES\n# ########################\n\n# NOTE: We have `type: ignore` in a few places below because mypy complains about an instance method\n# (e.g. `text`) overriding a static method on the superclass of the same name. This is not a concern\n# for us because these static methods should never be called on instances.\n\n# NOTE: `XMetadataValue` classes are serialized with a storage name of `XMetadataEntryData` to\n# maintain backward compatibility. See docstring of `whitelist_for_serdes` for more info.\n\n\n
[docs]@whitelist_for_serdes(storage_name="TextMetadataEntryData")\nclass TextMetadataValue(\n NamedTuple(\n "_TextMetadataValue",\n [\n ("text", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for text metadata entry data.\n\n Args:\n text (Optional[str]): The text data.\n """\n\n def __new__(cls, text: Optional[str]):\n return super(TextMetadataValue, cls).__new__(\n cls, check.opt_str_param(text, "text", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped text data."""\n return self.text
\n\n\n
[docs]@whitelist_for_serdes(storage_name="UrlMetadataEntryData")\nclass UrlMetadataValue(\n NamedTuple(\n "_UrlMetadataValue",\n [\n ("url", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for URL metadata entry data.\n\n Args:\n url (Optional[str]): The URL as a string.\n """\n\n def __new__(cls, url: Optional[str]):\n return super(UrlMetadataValue, cls).__new__(\n cls, check.opt_str_param(url, "url", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped URL."""\n return self.url
\n\n\n
[docs]@whitelist_for_serdes(storage_name="PathMetadataEntryData")\nclass PathMetadataValue(\n NamedTuple("_PathMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for path metadata entry data.\n\n Args:\n path (Optional[str]): The path as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(PathMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="NotebookMetadataEntryData")\nclass NotebookMetadataValue(\n NamedTuple("_NotebookMetadataValue", [("path", PublicAttr[Optional[str]])]), MetadataValue[str]\n):\n """Container class for notebook metadata entry data.\n\n Args:\n path (Optional[str]): The path to the notebook as a string or conforming to os.PathLike.\n """\n\n def __new__(cls, path: Optional[Union[str, os.PathLike]]):\n return super(NotebookMetadataValue, cls).__new__(\n cls, check.opt_path_param(path, "path", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped path to the notebook as a string."""\n return self.path
\n\n\n
[docs]@whitelist_for_serdes(storage_name="JsonMetadataEntryData")\nclass JsonMetadataValue(\n NamedTuple(\n "_JsonMetadataValue",\n [\n ("data", PublicAttr[Optional[Union[Sequence[Any], Mapping[str, Any]]]]),\n ],\n ),\n MetadataValue[Union[Sequence[Any], Mapping[str, Any]]],\n):\n """Container class for JSON metadata entry data.\n\n Args:\n data (Union[Sequence[Any], Dict[str, Any]]): The JSON data.\n """\n\n def __new__(cls, data: Optional[Union[Sequence[Any], Mapping[str, Any]]]):\n data = check.opt_inst_param(data, "data", (Sequence, Mapping))\n try:\n # check that the value is JSON serializable\n seven.dumps(data)\n except TypeError:\n raise DagsterInvalidMetadata("Value is not JSON serializable.")\n return super(JsonMetadataValue, cls).__new__(cls, data)\n\n @public\n @property\n def value(self) -> Optional[Union[Sequence[Any], Mapping[str, Any]]]:\n """Optional[Union[Sequence[Any], Dict[str, Any]]]: The wrapped JSON data."""\n return self.data
\n\n\n
[docs]@whitelist_for_serdes(storage_name="MarkdownMetadataEntryData")\nclass MarkdownMetadataValue(\n NamedTuple(\n "_MarkdownMetadataValue",\n [\n ("md_str", PublicAttr[Optional[str]]),\n ],\n ),\n MetadataValue[str],\n):\n """Container class for markdown metadata entry data.\n\n Args:\n md_str (Optional[str]): The markdown as a string.\n """\n\n def __new__(cls, md_str: Optional[str]):\n return super(MarkdownMetadataValue, cls).__new__(\n cls, check.opt_str_param(md_str, "md_str", default="")\n )\n\n @public\n @property\n def value(self) -> Optional[str]:\n """Optional[str]: The wrapped markdown as a string."""\n return self.md_str
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@whitelist_for_serdes(storage_name="PythonArtifactMetadataEntryData")\nclass PythonArtifactMetadataValue(\n NamedTuple(\n "_PythonArtifactMetadataValue",\n [\n ("module", PublicAttr[str]),\n ("name", PublicAttr[str]),\n ],\n ),\n MetadataValue["PythonArtifactMetadataValue"],\n):\n """Container class for python artifact metadata entry data.\n\n Args:\n module (str): The module where the python artifact can be found\n name (str): The name of the python artifact\n """\n\n def __new__(cls, module: str, name: str):\n return super(PythonArtifactMetadataValue, cls).__new__(\n cls, check.str_param(module, "module"), check.str_param(name, "name")\n )\n\n @public\n @property\n def value(self) -> Self:\n """PythonArtifactMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="FloatMetadataEntryData")\nclass FloatMetadataValue(\n NamedTuple(\n "_FloatMetadataValue",\n [\n ("value", PublicAttr[Optional[float]]),\n ],\n ),\n MetadataValue[float],\n):\n """Container class for float metadata entry data.\n\n Args:\n value (Optional[float]): The float value.\n """\n\n def __new__(cls, value: Optional[float]):\n return super(FloatMetadataValue, cls).__new__(cls, check.opt_float_param(value, "value"))
\n\n\n
[docs]@whitelist_for_serdes(storage_name="IntMetadataEntryData")\nclass IntMetadataValue(\n NamedTuple(\n "_IntMetadataValue",\n [\n ("value", PublicAttr[Optional[int]]),\n ],\n ),\n MetadataValue[int],\n):\n """Container class for int metadata entry data.\n\n Args:\n value (Optional[int]): The int value.\n """\n\n def __new__(cls, value: Optional[int]):\n return super(IntMetadataValue, cls).__new__(cls, check.opt_int_param(value, "value"))
\n\n\n@whitelist_for_serdes(storage_name="BoolMetadataEntryData")\nclass BoolMetadataValue(\n NamedTuple("_BoolMetadataValue", [("value", PublicAttr[Optional[bool]])]),\n MetadataValue[bool],\n):\n """Container class for bool metadata entry data.\n\n Args:\n value (Optional[bool]): The bool value.\n """\n\n def __new__(cls, value: Optional[bool]):\n return super(BoolMetadataValue, cls).__new__(cls, check.opt_bool_param(value, "value"))\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterPipelineRunMetadataEntryData")\nclass DagsterRunMetadataValue(\n NamedTuple(\n "_DagsterRunMetadataValue",\n [\n ("run_id", PublicAttr[str]),\n ],\n ),\n MetadataValue[str],\n):\n """Representation of a dagster run.\n\n Args:\n run_id (str): The run id\n """\n\n def __new__(cls, run_id: str):\n return super(DagsterRunMetadataValue, cls).__new__(cls, check.str_param(run_id, "run_id"))\n\n @public\n @property\n def value(self) -> str:\n """str: The wrapped run id."""\n return self.run_id
\n\n\n
[docs]@whitelist_for_serdes(storage_name="DagsterAssetMetadataEntryData")\nclass DagsterAssetMetadataValue(\n NamedTuple("_DagsterAssetMetadataValue", [("asset_key", PublicAttr["AssetKey"])]),\n MetadataValue["AssetKey"],\n):\n """Representation of a dagster asset.\n\n Args:\n asset_key (AssetKey): The dagster asset key\n """\n\n def __new__(cls, asset_key: "AssetKey"):\n from dagster._core.definitions.events import AssetKey\n\n return super(DagsterAssetMetadataValue, cls).__new__(\n cls, check.inst_param(asset_key, "asset_key", AssetKey)\n )\n\n @public\n @property\n def value(self) -> "AssetKey":\n """AssetKey: The wrapped :py:class:`AssetKey`."""\n return self.asset_key
\n\n\n# This should be deprecated or fixed so that `value` does not return itself.\n
[docs]@experimental\n@whitelist_for_serdes(storage_name="TableMetadataEntryData")\nclass TableMetadataValue(\n NamedTuple(\n "_TableMetadataValue",\n [\n ("records", PublicAttr[Sequence[TableRecord]]),\n ("schema", PublicAttr[TableSchema]),\n ],\n ),\n MetadataValue["TableMetadataValue"],\n):\n """Container class for table metadata entry data.\n\n Args:\n records (TableRecord): The data as a list of records (i.e. rows).\n schema (Optional[TableSchema]): A schema for the table.\n """\n\n
[docs] @public\n @staticmethod\n def infer_column_type(value: object) -> str:\n """str: Infer the :py:class:`TableSchema` column type that will be used for a value."""\n if isinstance(value, bool):\n return "bool"\n elif isinstance(value, int):\n return "int"\n elif isinstance(value, float):\n return "float"\n else:\n return "string"
\n\n def __new__(cls, records: Sequence[TableRecord], schema: Optional[TableSchema]):\n check.sequence_param(records, "records", of_type=TableRecord)\n check.opt_inst_param(schema, "schema", TableSchema)\n\n if len(records) == 0:\n schema = check.not_none(schema, "schema must be provided if records is empty")\n else:\n columns = set(records[0].data.keys())\n for record in records[1:]:\n check.invariant(\n set(record.data.keys()) == columns, "All records must have the same fields"\n )\n schema = schema or TableSchema(\n columns=[\n TableColumn(name=k, type=TableMetadataValue.infer_column_type(v))\n for k, v in records[0].data.items()\n ]\n )\n\n return super(TableMetadataValue, cls).__new__(\n cls,\n records,\n schema,\n )\n\n @public\n @property\n def value(self) -> Self:\n """TableMetadataValue: Identity function."""\n return self
\n\n\n
[docs]@whitelist_for_serdes(storage_name="TableSchemaMetadataEntryData")\nclass TableSchemaMetadataValue(\n NamedTuple("_TableSchemaMetadataValue", [("schema", PublicAttr[TableSchema])]),\n MetadataValue[TableSchema],\n):\n """Representation of a schema for arbitrary tabular data.\n\n Args:\n schema (TableSchema): The dictionary containing the schema representation.\n """\n\n def __new__(cls, schema: TableSchema):\n return super(TableSchemaMetadataValue, cls).__new__(\n cls, check.inst_param(schema, "schema", TableSchema)\n )\n\n @public\n @property\n def value(self) -> TableSchema:\n """TableSchema: The wrapped :py:class:`TableSchema`."""\n return self.schema
\n\n\n@whitelist_for_serdes(storage_name="NullMetadataEntryData")\nclass NullMetadataValue(NamedTuple("_NullMetadataValue", []), MetadataValue[None]):\n """Representation of null."""\n\n @public\n @property\n def value(self) -> None:\n """None: The wrapped null value."""\n return None\n\n\n# ########################\n# ##### METADATA BACKCOMPAT\n# ########################\n\n# Metadata used to be represented as a `List[MetadataEntry]`, but that class has been deleted. But\n# we still serialize metadata dicts to the serialized representation of `List[MetadataEntry]` for\n# backcompat purposes.\n\n\nclass MetadataFieldSerializer(FieldSerializer):\n """Converts between metadata dict (new) and metadata entries list (old)."""\n\n storage_name = "metadata_entries"\n loaded_name = "metadata"\n\n def pack(\n self,\n metadata_dict: Mapping[str, MetadataValue],\n whitelist_map: WhitelistMap,\n descent_path: str,\n ) -> Sequence[Mapping[str, Any]]:\n return [\n {\n "__class__": "EventMetadataEntry",\n "label": k,\n # MetadataValue itself can't inherit from NamedTuple and so isn't a PackableValue,\n # but one of its subclasses will always be returned here.\n "entry_data": pack_value(v, whitelist_map, descent_path), # type: ignore\n "description": None,\n }\n for k, v in metadata_dict.items()\n ]\n\n def unpack(\n self,\n metadata_entries: List["MetadataEntry"],\n whitelist_map: WhitelistMap,\n context: UnpackContext,\n ) -> Mapping[str, MetadataValue]:\n return {e.label: e.entry_data for e in metadata_entries}\n\n\nT_MetadataValue = TypeVar("T_MetadataValue", bound=MetadataValue, covariant=True)\n\n\n# NOTE: MetadataEntry is no longer accessible via the public API-- all metadata APIs use metadata\n# dicts. This clas shas only been preserved to adhere strictly to our backcompat guarantees. It is\n# still instantiated in the above `MetadataFieldSerializer` but that can easily be changed.\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use a dict with `MetadataValue` values instead.",\n)\n@deprecated_param(\n param="entry_data", breaking_version="2.0", additional_warn_text="Use `value` instead."\n)\n@whitelist_for_serdes(storage_name="EventMetadataEntry")\nclass MetadataEntry(\n NamedTuple(\n "_MetadataEntry",\n [\n ("label", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("entry_data", PublicAttr[MetadataValue]),\n ],\n ),\n Generic[T_MetadataValue],\n):\n """A structure for describing metadata for Dagster events.\n\n .. note:: This class is no longer usable in any Dagster API, and will be completely removed in 2.0.\n\n Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\n in the Dagster UI and other tooling.\n\n Should be yielded from within an IO manager to append metadata for a given input/output event.\n For other event types, passing a dict with `MetadataValue` values to the `metadata` argument\n is preferred.\n\n Args:\n label (str): Short display label for this metadata entry.\n description (Optional[str]): A human-readable description of this metadata entry.\n value (MetadataValue): Typed metadata entry data. The different types allow\n for customized display in tools like the Dagster UI.\n """\n\n def __new__(\n cls,\n label: str,\n description: Optional[str] = None,\n entry_data: Optional["RawMetadataValue"] = None,\n value: Optional["RawMetadataValue"] = None,\n ):\n value = cast(\n RawMetadataValue,\n normalize_renamed_param(\n new_val=value,\n new_arg="value",\n old_val=entry_data,\n old_arg="entry_data",\n ),\n )\n value = normalize_metadata_value(value)\n\n return super(MetadataEntry, cls).__new__(\n cls,\n check.str_param(label, "label"),\n check.opt_str_param(description, "description"),\n check.inst_param(value, "value", MetadataValue),\n )\n\n @property\n def value(self):\n """Alias of `entry_data`."""\n return self.entry_data
\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "table": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.metadata.table

\nfrom typing import Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._serdes.serdes import (\n    whitelist_for_serdes,\n)\n\n# ########################\n# ##### TABLE RECORD\n# ########################\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass TableRecord(\n NamedTuple("TableRecord", [("data", PublicAttr[Mapping[str, Union[str, int, float, bool]]])])\n):\n """Represents one record in a table. Field keys are arbitrary strings-- field values must be\n strings, integers, floats, or bools.\n """\n\n def __new__(cls, data: Mapping[str, Union[str, int, float, bool]]):\n check.dict_param(\n data,\n "data",\n value_type=(str, float, int, bool, type(None)),\n additional_message="Record fields must be one of types: (str, float, int, bool)",\n )\n return super(TableRecord, cls).__new__(cls, data=data)
\n\n\n# ########################\n# ##### TABLE SCHEMA\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableSchema(\n NamedTuple(\n "TableSchema",\n [\n ("columns", PublicAttr[Sequence["TableColumn"]]),\n ("constraints", PublicAttr["TableConstraints"]),\n ],\n )\n):\n """Representation of a schema for tabular data.\n\n Schema is composed of two parts:\n\n - A required list of columns (`TableColumn`). Each column specifies a\n `name`, `type`, set of `constraints`, and (optional) `description`. `type`\n defaults to `string` if unspecified. Column constraints\n (`TableColumnConstraints`) consist of boolean properties `unique` and\n `nullable`, as well as a list of strings `other` containing string\n descriptions of all additional constraints (e.g. `"<= 5"`).\n - An optional list of table-level constraints (`TableConstraints`). A\n table-level constraint cannot be expressed in terms of a single column,\n e.g. col a > col b. Presently, all table-level constraints must be\n expressed as strings under the `other` attribute of a `TableConstraints`\n object.\n\n .. 
code-block:: python\n\n # example schema\n TableSchema(\n constraints = TableConstraints(\n other = [\n "foo > bar",\n ],\n ),\n columns = [\n TableColumn(\n name = "foo",\n type = "string",\n description = "Foo description",\n constraints = TableColumnConstraints(\n required = True,\n other = [\n "starts with the letter 'a'",\n ],\n ),\n ),\n TableColumn(\n name = "bar",\n type = "string",\n ),\n TableColumn(\n name = "baz",\n type = "custom_type",\n constraints = TableColumnConstraints(\n unique = True,\n )\n ),\n ],\n )\n\n Args:\n columns (List[TableColumn]): The columns of the table.\n constraints (Optional[TableConstraints]): The constraints of the table.\n """\n\n def __new__(\n cls,\n columns: Sequence["TableColumn"],\n constraints: Optional["TableConstraints"] = None,\n ):\n return super(TableSchema, cls).__new__(\n cls,\n columns=check.sequence_param(columns, "columns", of_type=TableColumn),\n constraints=check.opt_inst_param(\n constraints, "constraints", TableConstraints, default=_DEFAULT_TABLE_CONSTRAINTS\n ),\n )\n\n
[docs] @public\n @staticmethod\n def from_name_type_dict(name_type_dict: Mapping[str, str]):\n """Constructs a TableSchema from a dictionary whose keys are column names and values are the\n names of data types of those columns.\n """\n return TableSchema(\n columns=[\n TableColumn(name=name, type=type_str) for name, type_str in name_type_dict.items()\n ]\n )
\n\n\n# ########################\n# ##### TABLE CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableConstraints(\n NamedTuple(\n "TableConstraints",\n [\n ("other", PublicAttr[Sequence[str]]),\n ],\n )\n):\n """Descriptor for "table-level" constraints. Presently only one property,\n `other` is supported. This contains strings describing arbitrary\n table-level constraints. A table-level constraint is a constraint defined\n in terms of multiple columns (e.g. col_A > col_B) or in terms of rows.\n\n Args:\n other (List[str]): Descriptions of arbitrary table-level constraints.\n """\n\n def __new__(\n cls,\n other: Sequence[str],\n ):\n return super(TableConstraints, cls).__new__(\n cls,\n other=check.sequence_param(other, "other", of_type=str),\n )
\n\n\n_DEFAULT_TABLE_CONSTRAINTS = TableConstraints(other=[])\n\n# ########################\n# ##### TABLE COLUMN\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumn(\n NamedTuple(\n "TableColumn",\n [\n ("name", PublicAttr[str]),\n ("type", PublicAttr[str]),\n ("description", PublicAttr[Optional[str]]),\n ("constraints", PublicAttr["TableColumnConstraints"]),\n ],\n )\n):\n """Descriptor for a table column. The only property that must be specified\n by the user is `name`. If no `type` is specified, `string` is assumed. If\n no `constraints` are specified, the column is assumed to be nullable\n (i.e. `required = False`) and have no other constraints beyond the data type.\n\n Args:\n name (List[str]): Descriptions of arbitrary table-level constraints.\n type (Optional[str]): The type of the column. Can be an arbitrary\n string. Defaults to `"string"`.\n description (Optional[str]): Description of this column. Defaults to `None`.\n constraints (Optional[TableColumnConstraints]): Column-level constraints.\n If unspecified, column is nullable with no constraints.\n """\n\n def __new__(\n cls,\n name: str,\n type: str = "string", # noqa: A002\n description: Optional[str] = None,\n constraints: Optional["TableColumnConstraints"] = None,\n ):\n return super(TableColumn, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n type=check.str_param(type, "type"),\n description=check.opt_str_param(description, "description"),\n constraints=cast(\n "TableColumnConstraints",\n check.opt_inst_param(\n constraints,\n "constraints",\n TableColumnConstraints,\n default=_DEFAULT_TABLE_COLUMN_CONSTRAINTS,\n ),\n ),\n )
\n\n\n# ########################\n# ##### TABLE COLUMN CONSTRAINTS\n# ########################\n\n\n
[docs]@whitelist_for_serdes\nclass TableColumnConstraints(\n NamedTuple(\n "TableColumnConstraints",\n [\n ("nullable", PublicAttr[bool]),\n ("unique", PublicAttr[bool]),\n ("other", PublicAttr[Optional[Sequence[str]]]),\n ],\n )\n):\n """Descriptor for a table column's constraints. Nullability and uniqueness are specified with\n boolean properties. All other constraints are described using arbitrary strings under the\n `other` property.\n\n Args:\n nullable (Optional[bool]): If true, this column can hold null values.\n unique (Optional[bool]): If true, all values in this column must be unique.\n other (List[str]): Descriptions of arbitrary column-level constraints\n not expressible by the predefined properties.\n """\n\n def __new__(\n cls,\n nullable: bool = True,\n unique: bool = False,\n other: Optional[Sequence[str]] = None,\n ):\n return super(TableColumnConstraints, cls).__new__(\n cls,\n nullable=check.bool_param(nullable, "nullable"),\n unique=check.bool_param(unique, "unique"),\n other=check.opt_sequence_param(other, "other"),\n )
\n\n\n_DEFAULT_TABLE_COLUMN_CONSTRAINTS = TableColumnConstraints()\n
", "current_page_name": "_modules/dagster/_core/definitions/metadata/table", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.definitions.metadata"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.metadata.table"}, "title": "dagster._core.definitions.metadata"}, "multi_asset_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_asset_sensor_definition

\nimport inspect\nimport json\nfrom collections import OrderedDict, defaultdict\nfrom typing import (\n    TYPE_CHECKING,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_selection import AssetSelection\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._utils import normalize_to_repository\n\nfrom .events import AssetKey\nfrom .run_request import RunRequest, SensorResult, SkipReason\nfrom .sensor_definition import (\n    DefaultSensorStatus,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorType,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n    from dagster._core.storage.event_log.base import EventLogRecord\n\nMAX_NUM_UNCONSUMED_EVENTS = 25\n\n\nclass MultiAssetSensorAssetCursorComponent(\n    NamedTuple(\n        "_MultiAssetSensorAssetCursorComponent",\n        [\n            
("latest_consumed_event_partition", Optional[str]),\n            ("latest_consumed_event_id", Optional[int]),\n            ("trailing_unconsumed_partitioned_event_ids", Dict[str, int]),\n        ],\n    )\n):\n    """A cursor component that is used to track the cursor for a particular asset in a multi-asset\n    sensor.\n\n    Here's an illustration to help explain how this representation works:\n\n    partition_1  ---|----------a----\n    partition_2  -t-----|-x---------\n    partition_3  ----t------|---a---\n\n\n    The "|", "a", "t", and "x" characters represent materialization events.\n    The x-axis is storage_id, which is basically time. The cursor has been advanced to the "|" event\n    for each partition. latest_evaluated_event_partition would be "partition_3", and\n    "latest_evaluated_event_id" would be the storage_id of the "|" event for partition_3.\n\n    The "t" events aren't directly represented in the cursor, because they trail the event that the\n    the cursor for their partition has advanced to. The "a" events aren't directly represented\n    in the cursor, because they occurred after the "latest_evaluated_event_id".  
The "x" event is\n    included in "unevaluated_partitioned_event_ids", because it's after the event that the cursor\n    for its partition has advanced to, but trails "latest_evaluated_event_id".\n\n    Attributes:\n        latest_consumed_event_partition (Optional[str]): The partition of the latest consumed event\n            for this asset.\n        latest_consumed_event_id (Optional[int]): The event ID of the latest consumed event for\n            this asset.\n        trailing_unconsumed_partitioned_event_ids (Dict[str, int]): A mapping containing\n            the partition key mapped to the latest unconsumed materialization event for this\n            partition with an ID less than latest_consumed_event_id.\n    """\n\n    def __new__(\n        cls,\n        latest_consumed_event_partition,\n        latest_consumed_event_id,\n        trailing_unconsumed_partitioned_event_ids,\n    ):\n        return super(MultiAssetSensorAssetCursorComponent, cls).__new__(\n            cls,\n            latest_consumed_event_partition=check.opt_str_param(\n                latest_consumed_event_partition, "latest_consumed_event_partition"\n            ),\n            latest_consumed_event_id=check.opt_int_param(\n                latest_consumed_event_id, "latest_consumed_event_id"\n            ),\n            trailing_unconsumed_partitioned_event_ids=check.dict_param(\n                trailing_unconsumed_partitioned_event_ids,\n                "trailing_unconsumed_partitioned_event_ids",\n                key_type=str,\n                value_type=int,\n            ),\n        )\n\n\nclass MultiAssetSensorContextCursor:\n    # Tracks the state of the cursor within the tick, created for utility purposes.\n    # Must call MultiAssetSensorEvaluationContext._update_cursor_after_evaluation at end of tick\n    # to serialize the cursor.\n    def __init__(self, cursor: Optional[str], context: "MultiAssetSensorEvaluationContext"):\n        loaded_cursor = json.loads(cursor) if cursor else 
{}\n        self._cursor_component_by_asset_key: Dict[str, MultiAssetSensorAssetCursorComponent] = {}\n\n        # The initial latest consumed event ID at the beginning of the tick\n        self.initial_latest_consumed_event_ids_by_asset_key: Dict[str, Optional[int]] = {}\n\n        for str_asset_key, cursor_list in loaded_cursor.items():\n            if len(cursor_list) != 3:\n                # In this case, the cursor object is not a multi asset sensor asset cursor\n                # component. This cursor is maintained by the asset reconciliation sensor.\n                break\n            else:\n                partition_key, event_id, trailing_unconsumed_partitioned_event_ids = cursor_list\n                self._cursor_component_by_asset_key[str_asset_key] = (\n                    MultiAssetSensorAssetCursorComponent(\n                        latest_consumed_event_partition=partition_key,\n                        latest_consumed_event_id=event_id,\n                        trailing_unconsumed_partitioned_event_ids=trailing_unconsumed_partitioned_event_ids,\n                    )\n                )\n\n                self.initial_latest_consumed_event_ids_by_asset_key[str_asset_key] = event_id\n\n        check.dict_param(self._cursor_component_by_asset_key, "unpacked_cursor", key_type=str)\n        self._context = context\n\n    def get_cursor_for_asset(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:\n        return self._cursor_component_by_asset_key.get(\n            str(asset_key), MultiAssetSensorAssetCursorComponent(None, None, {})\n        )\n\n    def get_stringified_cursor(self) -> str:\n        return json.dumps(self._cursor_component_by_asset_key)\n\n\n
@experimental
class MultiAssetSensorEvaluationContext(SensorEvaluationContext):
    """The context object available as the argument to the evaluation function of a
    :py:class:`dagster.MultiAssetSensorDefinition`.

    Users should not instantiate this object directly. To construct a
    `MultiAssetSensorEvaluationContext` for testing purposes, use
    :py:func:`dagster.build_multi_asset_sensor_context`.

    The `MultiAssetSensorEvaluationContext` contains a cursor object that tracks the state of
    consumed event logs for each monitored asset. For each asset, the cursor stores the storage ID
    of the latest materialization that has been marked as "consumed" (via a call to
    `advance_cursor`) in a `latest_consumed_event_id` field.

    For each monitored asset, the cursor will store the latest unconsumed event ID for up to 25
    partitions. Each event ID must be before the `latest_consumed_event_id` field for the asset.

    Events that have NOT been marked as consumed via `advance_cursor` will be returned in future
    ticks until they are marked as consumed.

    To update the cursor to the latest materialization and clear the unconsumed events, call
    `advance_all_cursors`.

    Attributes:
        monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored
            by the sensor. If an AssetSelection object is provided, it will only apply to assets
            within the Definitions that this sensor is part of.
        repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.
            If needed by the sensor, top-level resource definitions will be pulled from this
            repository. You can provide either this or `definitions`.
        instance_ref (Optional[InstanceRef]): The serialized instance configured to run the sensor.
        cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via
            the cursor attribute of SkipReason and RunRequest. Must be a dictionary of asset key
            strings to a stringified tuple of (latest_event_partition, latest_event_storage_id,
            trailing_unconsumed_partitioned_event_ids).
        last_completion_time (float): DEPRECATED The last time that the sensor was consumed (UTC).
        last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this
            sensor. Use the preferred `cursor` attribute instead.
        repository_name (Optional[str]): The name of the repository that the sensor belongs to.
        instance (Optional[DagsterInstance]): The deserialized instance can also be passed in
            directly (primarily useful in testing contexts).
        definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.
            If needed by the sensor, top-level resource definitions will be pulled from these
            definitions. You can provide either this or `repository_def`.

    Example:
        .. code-block:: python

            from dagster import AssetKey, multi_asset_sensor, MultiAssetSensorEvaluationContext

            @multi_asset_sensor(monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")])
            def the_sensor(context: MultiAssetSensorEvaluationContext):
                ...
    """

    def __init__(
        self,
        instance_ref: Optional[InstanceRef],
        last_completion_time: Optional[float],
        last_run_key: Optional[str],
        cursor: Optional[str],
        repository_name: Optional[str],
        repository_def: Optional["RepositoryDefinition"],
        monitored_assets: Union[Sequence[AssetKey], AssetSelection],
        instance: Optional[DagsterInstance] = None,
        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,
        definitions: Optional["Definitions"] = None,
    ):
        from dagster._core.definitions.definitions_class import Definitions
        from dagster._core.definitions.repository_definition import RepositoryDefinition

        # Either `definitions` or `repository_def` may be given; both are folded into a single
        # repository object used for all asset lookups below.
        self._repository_def = normalize_to_repository(
            check.opt_inst_param(definitions, "definitions", Definitions),
            check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),
        )
        self._monitored_asset_keys: Sequence[AssetKey]
        if isinstance(monitored_assets, AssetSelection):
            # Selections are resolved against both regular and source assets of the repository.
            repo_assets = self._repository_def.assets_defs_by_key.values()
            repo_source_assets = self._repository_def.source_assets_by_key.values()
            self._monitored_asset_keys = list(
                monitored_assets.resolve([*repo_assets, *repo_source_assets])
            )
        else:
            self._monitored_asset_keys = monitored_assets

        # For every monitored key, remember its AssetsDefinition (None when the repository has no
        # AssetsDefinition for it) and its partitions definition (taken from the
        # AssetsDefinition first, then from a source asset, else None).
        self._assets_by_key: Dict[AssetKey, Optional[AssetsDefinition]] = {}
        self._partitions_def_by_asset_key: Dict[AssetKey, Optional[PartitionsDefinition]] = {}
        for asset_key in self._monitored_asset_keys:
            assets_def = self._repository_def.assets_defs_by_key.get(asset_key)
            self._assets_by_key[asset_key] = assets_def

            source_asset_def = self._repository_def.source_assets_by_key.get(asset_key)
            self._partitions_def_by_asset_key[asset_key] = (
                assets_def.partitions_def
                if assets_def
                else source_asset_def.partitions_def if source_asset_def else None
            )

        # Cursor object with utility methods for updating and retrieving cursor information.
        # At the end of each tick, must call update_cursor_after_evaluation to update the serialized
        # cursor.
        self._unpacked_cursor = MultiAssetSensorContextCursor(cursor, self)
        self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()

        # Lazily filled cache of the unconsumed events referenced by the cursor at tick start.
        self._initial_unconsumed_events_by_id: Dict[int, EventLogRecord] = {}
        self._fetched_initial_unconsumed_events = False

        # The explicit two-argument super() form is kept on purpose: the class is wrapped by
        # @experimental, so the module-level name is what the original code resolves against.
        super(MultiAssetSensorEvaluationContext, self).__init__(
            instance_ref=instance_ref,
            last_completion_time=last_completion_time,
            last_run_key=last_run_key,
            cursor=cursor,
            repository_name=repository_name,
            instance=instance,
            repository_def=repository_def,
            resources=resource_defs,
        )

    def _cache_initial_unconsumed_events(self) -> None:
        """Load, once per cursor state, the event records for every trailing unconsumed event ID
        stored in the cursor of each monitored asset.

        This caches the *initial* unconsumed events only. To generate the current unconsumed
        events, call get_trailing_unconsumed_events instead.
        """
        from dagster._core.events import DagsterEventType
        from dagster._core.storage.event_log.base import EventRecordsFilter

        if self._fetched_initial_unconsumed_events:
            return

        for asset_key in self._monitored_asset_keys:
            unconsumed_event_ids = list(
                self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values()
            )
            # Skip the storage round trip when the cursor holds no unconsumed IDs for this asset.
            if unconsumed_event_ids:
                event_records = self.instance.get_event_records(
                    EventRecordsFilter(
                        event_type=DagsterEventType.ASSET_MATERIALIZATION,
                        storage_ids=unconsumed_event_ids,
                    )
                )
                self._initial_unconsumed_events_by_id.update(
                    {event_record.storage_id: event_record for event_record in event_records}
                )

        self._fetched_initial_unconsumed_events = True

    def _get_unconsumed_events_with_ids(
        self, event_ids: Sequence[int]
    ) -> Sequence["EventLogRecord"]:
        """Return the cached unconsumed event records for `event_ids`, in ascending ID order.

        IDs that are not present in the cache are silently skipped.
        """
        self._cache_initial_unconsumed_events()
        cached_events = (
            self._initial_unconsumed_events_by_id.get(event_id) for event_id in sorted(event_ids)
        )
        return [event for event in cached_events if event]
[docs] @public\n def get_trailing_unconsumed_events(self, asset_key: AssetKey) -> Sequence["EventLogRecord"]:\n """Fetches the unconsumed events for a given asset key. Returns only events\n before the latest consumed event ID for the given asset. To mark an event as consumed,\n pass the event to `advance_cursor`. Returns events in ascending order by storage ID.\n\n Args:\n asset_key (AssetKey): The asset key to get unconsumed events for.\n\n Returns:\n Sequence[EventLogRecord]: The unconsumed events for the given asset key.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n return self._get_unconsumed_events_with_ids(\n list(self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values())\n )
\n\n def _get_partitions_after_cursor(self, asset_key: AssetKey) -> Sequence[str]:\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n partition_key = self._get_cursor(asset_key).latest_consumed_event_partition\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n\n if not isinstance(partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(f"No partitions defined for asset key {asset_key}")\n\n partitions_to_fetch = list(\n partitions_def.get_partition_keys(dynamic_partitions_store=self.instance)\n )\n\n if partition_key is not None:\n # Return partitions after the cursor partition, not including the cursor partition\n partitions_to_fetch = partitions_to_fetch[\n partitions_to_fetch.index(partition_key) + 1 :\n ]\n return partitions_to_fetch\n\n def update_cursor_after_evaluation(self) -> None:\n """Updates the cursor after the sensor evaluation function has been called. This method\n should be called at most once per evaluation.\n """\n new_cursor = self._cursor_advance_state_mutation.get_cursor_with_advances(\n self, self._unpacked_cursor\n )\n\n if new_cursor is not None:\n # Cursor was not updated by this context object, so we do not need to update it\n self._cursor = new_cursor\n self._unpacked_cursor = MultiAssetSensorContextCursor(new_cursor, self)\n self._cursor_advance_state_mutation = MultiAssetSensorCursorAdvances()\n self._fetched_initial_unconsumed_events = False\n\n
[docs] @public\n def latest_materialization_records_by_key(\n self,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n ) -> Mapping[AssetKey, Optional["EventLogRecord"]]:\n """Fetches the most recent materialization event record for each asset in asset_keys.\n Only fetches events after the latest consumed event ID for the given asset key.\n\n Args:\n asset_keys (Optional[Sequence[AssetKey]]): list of asset keys to fetch events for. If\n not specified, the latest materialization will be fetched for all assets the\n multi_asset_sensor monitors.\n\n Returns: Mapping of AssetKey to EventLogRecord where the EventLogRecord is the latest\n materialization event for the asset. If there is no materialization event for the asset,\n the value in the mapping will be None.\n """\n # Do not evaluate unconsumed events, only events newer than the cursor\n # if there are no new events after the cursor, the cursor points to the most\n # recent event.\n\n if asset_keys is None:\n asset_keys = self._monitored_asset_keys\n else:\n asset_keys = check.opt_sequence_param(asset_keys, "asset_keys", of_type=AssetKey)\n\n asset_records = self.instance.get_asset_records(asset_keys)\n\n asset_event_records: Dict[AssetKey, Optional[EventLogRecord]] = {\n asset_key: None for asset_key in asset_keys\n }\n for record in asset_records:\n if (\n record.asset_entry.last_materialization_record\n and record.asset_entry.last_materialization_record.storage_id\n > (self._get_cursor(record.asset_entry.asset_key).latest_consumed_event_id or 0)\n ):\n asset_event_records[record.asset_entry.asset_key] = (\n record.asset_entry.last_materialization_record\n )\n\n return asset_event_records
\n\n
[docs] @public\n def materialization_records_for_key(\n self, asset_key: AssetKey, limit: Optional[int] = None\n ) -> Iterable["EventLogRecord"]:\n """Fetches asset materialization event records for asset_key, with the earliest event first.\n\n Only fetches events after the latest consumed event ID for the given asset key.\n\n Args:\n asset_key (AssetKey): The asset to fetch materialization events for\n limit (Optional[int]): The number of events to fetch\n """\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n if asset_key not in self._assets_by_key:\n raise DagsterInvalidInvocationError(f"Asset key {asset_key} not monitored by sensor.")\n\n events = list(\n self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,\n ),\n ascending=True,\n limit=limit,\n )\n )\n\n return events
\n\n def _get_cursor(self, asset_key: AssetKey) -> MultiAssetSensorAssetCursorComponent:\n """Returns the MultiAssetSensorAssetCursorComponent for the asset key.\n\n For more information, view the docstring for the MultiAssetSensorAssetCursorComponent class.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n return self._unpacked_cursor.get_cursor_for_asset(asset_key)\n\n
[docs] @public\n def latest_materialization_records_by_partition(\n self,\n asset_key: AssetKey,\n after_cursor_partition: Optional[bool] = False,\n ) -> Mapping[str, "EventLogRecord"]:\n """Given an asset, returns a mapping of partition key to the latest materialization event\n for that partition. Fetches only materializations that have not been marked as "consumed"\n via a call to `advance_cursor`.\n\n Args:\n asset_key (AssetKey): The asset to fetch events for.\n after_cursor_partition (Optional[bool]): If True, only materializations with partitions\n after the cursor's current partition will be returned. By default, set to False.\n\n Returns:\n Mapping[str, EventLogRecord]:\n Mapping of AssetKey to a mapping of partitions to EventLogRecords where the\n EventLogRecord is the most recent materialization event for the partition.\n The mapping preserves the order that the materializations occurred.\n\n Example:\n .. code-block:: python\n\n @asset(partitions_def=DailyPartitionsDefinition("2022-07-01"))\n def july_asset():\n return 1\n\n @multi_asset_sensor(asset_keys=[july_asset.key])\n def my_sensor(context):\n context.latest_materialization_records_by_partition(july_asset.key)\n\n # After materializing july_asset for 2022-07-05, latest_materialization_by_partition\n # returns {"2022-07-05": EventLogRecord(...)}\n\n """\n from dagster._core.events import DagsterEventType\n from dagster._core.storage.event_log.base import EventLogRecord, EventRecordsFilter\n\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n\n if asset_key not in self._assets_by_key:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} not monitored in sensor definition"\n )\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n if not isinstance(partitions_def, PartitionsDefinition):\n raise DagsterInvariantViolationError(\n "Cannot get latest materialization by partition for assets with no partitions"\n )\n\n partitions_to_fetch = (\n 
self._get_partitions_after_cursor(asset_key)\n if after_cursor_partition\n else list(partitions_def.get_partition_keys(dynamic_partitions_store=self.instance))\n )\n\n # Retain ordering of materializations\n materialization_by_partition: Dict[str, EventLogRecord] = OrderedDict()\n\n # Add unconsumed events to the materialization by partition dictionary\n # These events came before the cursor, so should be inserted in storage ID ascending order\n for unconsumed_event in sorted(\n self._get_unconsumed_events_with_ids(\n list(self._get_cursor(asset_key).trailing_unconsumed_partitioned_event_ids.values())\n )\n ):\n partition = unconsumed_event.partition_key\n if isinstance(partition, str) and partition in partitions_to_fetch:\n if partition in materialization_by_partition:\n # Remove partition to ensure materialization_by_partition preserves\n # the order of materializations\n materialization_by_partition.pop(partition)\n # Add partition and materialization to the end of the OrderedDict\n materialization_by_partition[partition] = unconsumed_event\n\n partition_materializations = self.instance.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=asset_key,\n asset_partitions=partitions_to_fetch,\n after_cursor=self._get_cursor(asset_key).latest_consumed_event_id,\n ),\n ascending=True,\n )\n for materialization in partition_materializations:\n partition = materialization.partition_key\n\n if isinstance(partition, str):\n if partition in materialization_by_partition:\n # Remove partition to ensure materialization_by_partition preserves\n # the order of materializations\n materialization_by_partition.pop(partition)\n # Add partition and materialization to the end of the OrderedDict\n materialization_by_partition[partition] = materialization\n\n return materialization_by_partition
\n\n
[docs] @public\n def latest_materialization_records_by_partition_and_asset(\n self,\n ) -> Mapping[str, Mapping[AssetKey, "EventLogRecord"]]:\n """Finds the most recent unconsumed materialization for each partition for each asset\n monitored by the sensor. Aggregates all materializations into a mapping of partition key\n to a mapping of asset key to the materialization event for that partition.\n\n For example, if the sensor monitors two partitioned assets A and B that are materialized\n for partition_x after the cursor, this function returns:\n\n .. code-block:: python\n\n {\n "partition_x": {asset_a.key: EventLogRecord(...), asset_b.key: EventLogRecord(...)}\n }\n\n This method can only be called when all monitored assets are partitioned and share\n the same partition definition.\n """\n partitions_defs = list(self._partitions_def_by_asset_key.values())\n if not partitions_defs or not all(x == partitions_defs[0] for x in partitions_defs):\n raise DagsterInvalidInvocationError(\n "All assets must be partitioned and share the same partitions definition"\n )\n\n asset_and_materialization_tuple_by_partition: Dict[\n str, Dict[AssetKey, "EventLogRecord"]\n ] = defaultdict(dict)\n\n for asset_key in self._monitored_asset_keys:\n materialization_by_partition = self.latest_materialization_records_by_partition(\n asset_key\n )\n for partition, materialization in materialization_by_partition.items():\n asset_and_materialization_tuple_by_partition[partition][asset_key] = materialization\n\n return asset_and_materialization_tuple_by_partition
\n\n
[docs] @public\n def get_cursor_partition(self, asset_key: Optional[AssetKey]) -> Optional[str]:\n """A utility method to get the current partition the cursor is on."""\n asset_key = check.opt_inst_param(asset_key, "asset_key", AssetKey)\n if asset_key not in self._monitored_asset_keys:\n raise DagsterInvalidInvocationError(\n "Provided asset key must correspond to a provided asset"\n )\n if asset_key:\n partition_key = self._get_cursor(asset_key).latest_consumed_event_partition\n elif self._monitored_asset_keys is not None and len(self._monitored_asset_keys) == 1:\n partition_key = self._get_cursor(\n self._monitored_asset_keys[0]\n ).latest_consumed_event_partition\n else:\n raise DagsterInvalidInvocationError(\n "Asset key must be provided when multiple assets are defined"\n )\n\n return partition_key
\n\n
[docs] @public\n def all_partitions_materialized(\n self, asset_key: AssetKey, partitions: Optional[Sequence[str]] = None\n ) -> bool:\n """A utility method to check if a provided list of partitions have been materialized\n for a particular asset. This method ignores the cursor and checks all materializations\n for the asset.\n\n Args:\n asset_key (AssetKey): The asset to check partitions for.\n partitions (Optional[Sequence[str]]): A list of partitions to check. If not provided,\n all partitions for the asset will be checked.\n\n Returns:\n bool: True if all selected partitions have been materialized, False otherwise.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n if partitions is not None:\n check.sequence_param(partitions, "partitions", of_type=str)\n if len(partitions) == 0:\n raise DagsterInvalidInvocationError("Must provide at least one partition in list")\n\n materialized_partitions = self.instance.get_materialized_partitions(asset_key)\n if not partitions:\n if asset_key not in self._monitored_asset_keys:\n raise DagsterInvariantViolationError(\n f"Asset key {asset_key} not monitored by sensor"\n )\n\n partitions_def = self._partitions_def_by_asset_key.get(asset_key)\n if not partitions_def:\n raise DagsterInvariantViolationError(\n f"Asset key {asset_key} is not partitioned. Cannot check if partitions have"\n " been materialized."\n )\n partitions = partitions_def.get_partition_keys(dynamic_partitions_store=self.instance)\n\n return all([partition in materialized_partitions for partition in partitions])
\n\n def _get_asset(self, asset_key: AssetKey, fn_name: str) -> AssetsDefinition:\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n repo_def = cast(RepositoryDefinition, self._repository_def)\n repository_assets = repo_def.assets_defs_by_key\n if asset_key in self._assets_by_key:\n asset_def = self._assets_by_key[asset_key]\n if asset_def is None:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} does not have an AssetDefinition in this repository"\n f" (likely because it is a SourceAsset). fn context.{fn_name} can only be"\n " called for assets with AssetDefinitions in the repository."\n )\n else:\n return asset_def\n elif asset_key in repository_assets:\n return repository_assets[asset_key]\n else:\n raise DagsterInvalidInvocationError(\n f"Asset key {asset_key} not monitored in sensor and does not exist in target jobs"\n )\n\n
[docs] @public\n def get_downstream_partition_keys(\n self, partition_key: str, from_asset_key: AssetKey, to_asset_key: AssetKey\n ) -> Sequence[str]:\n """Converts a partition key from one asset to the corresponding partition key in a downstream\n asset. Uses the existing partition mapping between the upstream asset and the downstream\n asset if it exists, otherwise, uses the default partition mapping.\n\n Args:\n partition_key (str): The partition key to convert.\n from_asset_key (AssetKey): The asset key of the upstream asset, which the provided\n partition key belongs to.\n to_asset_key (AssetKey): The asset key of the downstream asset. The provided partition\n key will be mapped to partitions within this asset.\n\n Returns:\n Sequence[str]: A list of the corresponding downstream partitions in to_asset_key that\n partition_key maps to.\n """\n partition_key = check.str_param(partition_key, "partition_key")\n\n to_asset = self._get_asset(to_asset_key, fn_name="get_downstream_partition_keys")\n from_asset = self._get_asset(from_asset_key, fn_name="get_downstream_partition_keys")\n\n to_partitions_def = to_asset.partitions_def\n\n if not isinstance(to_partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(\n f"Asset key {to_asset_key} is not partitioned. Cannot get partition keys."\n )\n if not isinstance(from_asset.partitions_def, PartitionsDefinition):\n raise DagsterInvalidInvocationError(\n f"Asset key {from_asset_key} is not partitioned. Cannot get partition keys."\n )\n\n partition_mapping = to_asset.infer_partition_mapping(\n from_asset_key, from_asset.partitions_def\n )\n downstream_partition_key_subset = (\n partition_mapping.get_downstream_partitions_for_partitions(\n from_asset.partitions_def.empty_subset().with_partition_keys([partition_key]),\n downstream_partitions_def=to_partitions_def,\n dynamic_partitions_store=self.instance,\n )\n )\n\n return list(downstream_partition_key_subset.get_partition_keys())
\n\n
[docs] @public\n def advance_cursor(\n self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]\n ):\n """Marks the provided materialization records as having been consumed by the sensor.\n\n At the end of the tick, the cursor will be updated to advance past all materializations\n records provided via `advance_cursor`. In the next tick, records that have been consumed\n will no longer be returned.\n\n Passing a partitioned materialization record into this function will mark prior materializations\n with the same asset key and partition as having been consumed.\n\n Args:\n materialization_records_by_key (Mapping[AssetKey, Optional[EventLogRecord]]): Mapping of\n AssetKeys to EventLogRecord or None. If an EventLogRecord is provided, the cursor\n for the AssetKey will be updated and future calls to fetch asset materialization events\n will not fetch this event again. If None is provided, the cursor for the AssetKey\n will not be updated.\n """\n self._cursor_advance_state_mutation.add_advanced_records(materialization_records_by_key)\n self._cursor_updated = True
\n\n
[docs] @public\n def advance_all_cursors(self):\n """Updates the cursor to the most recent materialization event for all assets monitored by\n the multi_asset_sensor.\n\n Marks all materialization events as consumed by the sensor, including unconsumed events.\n """\n materializations_by_key = self.latest_materialization_records_by_key()\n\n self._cursor_advance_state_mutation.add_advanced_records(materializations_by_key)\n self._cursor_advance_state_mutation.advance_all_cursors_called = True\n self._cursor_updated = True
    @public
    @property
    def assets_defs_by_key(self) -> Mapping[AssetKey, Optional[AssetsDefinition]]:
        """Mapping[AssetKey, Optional[AssetsDefinition]]: A mapping from AssetKey to the
        AssetsDefinition object which produces it. If a given asset is monitored by this sensor, but
        is not produced within the same code location as this sensor, then the value will be None.
        """
        # Returns the internal mapping itself, not a copy.
        return self._assets_by_key

    @public
    @property
    def asset_keys(self) -> Sequence[AssetKey]:
        """Sequence[AssetKey]: The asset keys which are monitored by this sensor."""
        # Returns the internal sequence itself, not a copy.
        return self._monitored_asset_keys
class MultiAssetSensorCursorAdvances:
    """Collects the materialization records marked as consumed during one sensor tick and derives
    the serialized cursor to use for the next tick.
    """

    # Storage IDs of the records marked as consumed, grouped by asset key.
    _advanced_record_ids_by_key: Dict[AssetKey, Set[int]]
    # Partition key (None for unpartitioned events) of every consumed record, by storage ID.
    _partition_key_by_record_id: Dict[int, Optional[str]]
    # Set when the whole cursor should jump forward and trailing unconsumed events be dropped.
    advance_all_cursors_called: bool

    def __init__(self):
        self._advanced_record_ids_by_key = defaultdict(set)
        self._partition_key_by_record_id = {}
        self.advance_all_cursors_called = False

    def add_advanced_records(
        self, materialization_records_by_key: Mapping[AssetKey, Optional["EventLogRecord"]]
    ):
        """Register the given records as consumed. Entries whose record is None are ignored."""
        for asset_key, materialization in materialization_records_by_key.items():
            if materialization:
                self._advanced_record_ids_by_key[asset_key].add(materialization.storage_id)

                self._partition_key_by_record_id[materialization.storage_id] = (
                    materialization.partition_key
                )

    def get_cursor_with_advances(
        self,
        context: MultiAssetSensorEvaluationContext,
        initial_cursor: MultiAssetSensorContextCursor,
    ) -> Optional[str]:
        """Given the multi asset sensor context and the cursor at the start of the tick,
        returns the cursor that should be used in the next tick.

        If the cursor has not been updated, returns None
        """
        if not self._advanced_record_ids_by_key:
            # No events marked as advanced
            return None

        return json.dumps(
            {
                str(asset_key): self.get_asset_cursor_with_advances(
                    asset_key, context, initial_cursor
                )
                for asset_key in context.asset_keys
            }
        )

    def get_asset_cursor_with_advances(
        self,
        asset_key: AssetKey,
        context: MultiAssetSensorEvaluationContext,
        initial_cursor: MultiAssetSensorContextCursor,
    ) -> MultiAssetSensorAssetCursorComponent:
        """Compute the next-tick cursor component for a single asset.

        Args:
            asset_key: The monitored asset to compute the cursor for.
            context: The evaluation context of the current tick; used to read the trailing
                unconsumed events and to query the instance for events.
            initial_cursor: The unpacked cursor as it was at the start of the tick.

        Returns:
            The unchanged initial component when nothing was consumed for the asset; otherwise a
            new component holding the latest consumed partition/event ID and the latest
            unconsumed event ID per partition.

        Raises:
            DagsterInvariantViolationError: If the number of partitions with a trailing
                unconsumed event reaches MAX_NUM_UNCONSUMED_EVENTS.
        """
        from dagster._core.events import DagsterEventType
        from dagster._core.storage.event_log.base import EventRecordsFilter

        initial_asset_cursor = initial_cursor.get_cursor_for_asset(asset_key)

        # `.get` (rather than indexing) avoids inserting an empty set into the defaultdict.
        advanced_records: Set[int] = self._advanced_record_ids_by_key.get(asset_key, set())
        if not advanced_records:
            # No events marked as advanced for this asset key
            return initial_asset_cursor

        latest_consumed_event_id_at_tick_start = initial_asset_cursor.latest_consumed_event_id

        greatest_consumed_event_id_in_tick = max(advanced_records)
        latest_consumed_partition_in_tick = self._partition_key_by_record_id[
            greatest_consumed_event_id_in_tick
        ]
        # True when this tick consumed something newer than where the cursor started.
        cursor_moves_forward = greatest_consumed_event_id_in_tick > (
            latest_consumed_event_id_at_tick_start or 0
        )
        latest_unconsumed_record_by_partition: Dict[str, int] = {}

        if not self.advance_all_cursors_called:
            # Work on a copy: the mapping is edited below and must not leak those edits back into
            # the cursor component that was handed in as the tick's starting state.
            latest_unconsumed_record_by_partition = dict(
                initial_asset_cursor.trailing_unconsumed_partitioned_event_ids
            )
            # Candidates are the previously trailing events plus everything that happened between
            # the old cursor position and the newest event consumed in this tick.
            unconsumed_events = list(context.get_trailing_unconsumed_events(asset_key)) + list(
                context.instance.get_event_records(
                    EventRecordsFilter(
                        event_type=DagsterEventType.ASSET_MATERIALIZATION,
                        asset_key=asset_key,
                        after_cursor=latest_consumed_event_id_at_tick_start,
                        before_cursor=greatest_consumed_event_id_in_tick,
                    ),
                    ascending=True,
                )
                if cursor_moves_forward
                else []
            )

            # Iterate through events in ascending order, storing the latest unconsumed
            # event for each partition. If an advanced event exists for a partition, clear
            # the prior unconsumed event for that partition.
            for event in unconsumed_events:
                partition = event.partition_key
                if partition is not None:  # Ignore unpartitioned events
                    if event.storage_id not in advanced_records:
                        latest_unconsumed_record_by_partition[partition] = event.storage_id
                    elif partition in latest_unconsumed_record_by_partition:
                        latest_unconsumed_record_by_partition.pop(partition)

            # The partition of the newest consumed event is consumed by definition.
            if (
                latest_consumed_partition_in_tick is not None
                and latest_consumed_partition_in_tick in latest_unconsumed_record_by_partition
            ):
                latest_unconsumed_record_by_partition.pop(latest_consumed_partition_in_tick)

            if len(latest_unconsumed_record_by_partition.keys()) >= MAX_NUM_UNCONSUMED_EVENTS:
                raise DagsterInvariantViolationError(f"""
                    You have reached the maximum number of trailing unconsumed events
                    ({MAX_NUM_UNCONSUMED_EVENTS}) for asset {asset_key} and no more events can be
                    added. You can access the unconsumed events by calling the
                    `get_trailing_unconsumed_events` method on the sensor context, and
                    mark events as consumed by passing them to `advance_cursor`.

                    Otherwise, you can clear all unconsumed events and reset the cursor to the latest
                    materialization for each asset by calling `advance_all_cursors`.
                    """)

        # The cursor position never moves backwards: the old partition/ID are kept when only
        # older (trailing) events were consumed in this tick.
        return MultiAssetSensorAssetCursorComponent(
            latest_consumed_event_partition=(
                latest_consumed_partition_in_tick
                if cursor_moves_forward
                else initial_asset_cursor.latest_consumed_event_partition
            ),
            latest_consumed_event_id=(
                greatest_consumed_event_id_in_tick
                if cursor_moves_forward
                else latest_consumed_event_id_at_tick_start
            ),
            trailing_unconsumed_partitioned_event_ids=latest_unconsumed_record_by_partition,
        )


def get_cursor_from_latest_materializations(
    asset_keys: Sequence[AssetKey], instance: DagsterInstance
) -> str:
    """Build a serialized cursor that points at one materialization record per asset key.

    Assets for which the instance returns no materialization record are left out of the cursor.
    The resulting components carry no trailing unconsumed events.

    Args:
        asset_keys: The asset keys to create cursor entries for.
        instance: The instance whose event log is queried.

    Returns:
        A JSON string mapping `str(asset_key)` to the asset's cursor component.
    """
    from dagster._core.events import DagsterEventType
    from dagster._core.storage.event_log.base import EventRecordsFilter

    cursor_dict: Dict[str, MultiAssetSensorAssetCursorComponent] = {}

    for asset_key in asset_keys:
        # NOTE(review): with limit=1 this relies on get_event_records' default ordering
        # returning the newest record first -- confirm against the instance API.
        materializations = instance.get_event_records(
            EventRecordsFilter(
                DagsterEventType.ASSET_MATERIALIZATION,
                asset_key=asset_key,
            ),
            limit=1,
        )
        if materializations:
            last_materialization = list(materializations)[-1]

            cursor_dict[str(asset_key)] = MultiAssetSensorAssetCursorComponent(
                last_materialization.partition_key,
                last_materialization.storage_id,
                {},
            )

    return json.dumps(cursor_dict)
@experimental
def build_multi_asset_sensor_context(
    *,
    monitored_assets: Union[Sequence[AssetKey], AssetSelection],
    repository_def: Optional["RepositoryDefinition"] = None,
    instance: Optional[DagsterInstance] = None,
    cursor: Optional[str] = None,
    repository_name: Optional[str] = None,
    cursor_from_latest_materializations: bool = False,
    resources: Optional[Mapping[str, object]] = None,
    definitions: Optional["Definitions"] = None,
) -> MultiAssetSensorEvaluationContext:
    """Builds multi asset sensor execution context for testing purposes using the provided parameters.

    This function can be used to provide a context to the invocation of a multi asset sensor
    definition. If provided, the dagster instance must be persistent;
    DagsterInstance.ephemeral() will result in an error.

    Args:
        monitored_assets (Union[Sequence[AssetKey], AssetSelection]): The assets monitored
            by the sensor. If an AssetSelection object is provided, it will only apply to assets
            within the Definitions that this sensor is part of.
        repository_def (RepositoryDefinition): `RepositoryDefinition` object that
            the sensor is defined in. Must provide `definitions` if this is not provided.
        instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.
        cursor (Optional[str]): A string cursor to provide to the evaluation of the sensor. Must be
            a JSON string encoding a dictionary of asset key strings to that asset's cursor state
            (latest consumed partition, latest consumed event storage ID, trailing unconsumed
            event IDs by partition). Cannot be combined with
            `cursor_from_latest_materializations`.
        repository_name (Optional[str]): The name of the repository that the sensor belongs to.
        cursor_from_latest_materializations (bool): If True, the cursor will be set to the latest
            materialization for each monitored asset. Requires `instance`. By default, set to
            False.
        resources (Optional[Mapping[str, object]]): The resource definitions
            to provide to the sensor.
        definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.
            Must provide `repository_def` if this is not provided.

    Raises:
        DagsterInvalidInvocationError: If `cursor_from_latest_materializations` is set together
            with `cursor`, or without an `instance`.

    Examples:
        .. code-block:: python

            with instance_for_test() as instance:
                context = build_multi_asset_sensor_context(
                    monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")],
                    instance=instance,
                )
                my_asset_sensor(context)

    """
    from dagster._core.definitions import RepositoryDefinition
    from dagster._core.definitions.definitions_class import Definitions
    from dagster._core.execution.build_resources import wrap_resources_for_execution

    check.opt_inst_param(instance, "instance", DagsterInstance)
    check.opt_str_param(cursor, "cursor")
    check.opt_str_param(repository_name, "repository_name")
    # `definitions` and `repository_def` are alternatives; both collapse into one repository.
    repository_def = normalize_to_repository(
        check.opt_inst_param(definitions, "definitions", Definitions),
        check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),
    )

    check.bool_param(cursor_from_latest_materializations, "cursor_from_latest_materializations")

    if cursor_from_latest_materializations:
        if cursor:
            raise DagsterInvalidInvocationError(
                "Cannot provide both cursor and cursor_from_latest_materializations objects."
                " Dagster will override the provided cursor based on the"
                " cursor_from_latest_materializations object."
            )
        if not instance:
            raise DagsterInvalidInvocationError(
                "Cannot provide cursor_from_latest_materializations object without a Dagster"
                " instance."
            )

        # The cursor helper needs concrete keys, so a selection is resolved here first.
        asset_keys: Sequence[AssetKey]
        if isinstance(monitored_assets, AssetSelection):
            repository_assets = list(set(repository_def.assets_defs_by_key.values()))
            asset_keys = cast(
                List[AssetKey],
                list(monitored_assets.resolve(repository_assets)),
            )
        else:
            asset_keys = monitored_assets

        cursor = get_cursor_from_latest_materializations(asset_keys, instance)

    # The context receives the original `monitored_assets` and resolves selections itself.
    return MultiAssetSensorEvaluationContext(
        instance_ref=None,
        last_completion_time=None,
        last_run_key=None,
        cursor=cursor,
        repository_name=repository_name,
        instance=instance,
        monitored_assets=monitored_assets,
        repository_def=repository_def,
        resource_defs=wrap_resources_for_execution(resources),
    )
# Every shape an asset-sensor evaluation function may return: an iterator of
# RunRequest/SkipReason/SensorResult items, a sequence of RunRequests, a single RunRequest,
# SkipReason or SensorResult, or None.
AssetMaterializationFunctionReturn = Union[
    Iterator[Union[RunRequest, SkipReason, SensorResult]],
    Sequence[RunRequest],
    RunRequest,
    SkipReason,
    None,
    SensorResult,
]
# Callable alias with unconstrained parameters returning one of the shapes above.
AssetMaterializationFunction = Callable[
    ...,
    AssetMaterializationFunctionReturn,
]

# Structurally identical to AssetMaterializationFunction; this is the annotation used for the
# `asset_materialization_fn` parameter of MultiAssetSensorDefinition.
MultiAssetMaterializationFunction = Callable[
    ...,
    AssetMaterializationFunctionReturn,
]
[docs]@experimental\nclass MultiAssetSensorDefinition(SensorDefinition):\n """Define an asset sensor that initiates a set of runs based on the materialization of a list of\n assets.\n\n Users should not instantiate this object directly. To construct a\n `MultiAssetSensorDefinition`, use :py:func:`dagster.\n multi_asset_sensor`.\n\n Args:\n name (str): The name of the sensor to create.\n asset_keys (Sequence[AssetKey]): The asset_keys this sensor monitors.\n asset_materialization_fn (Callable[[MultiAssetSensorEvaluationContext], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]): The core\n evaluation function for the sensor, which is run at an interval to determine whether a\n run should be launched or not. Takes a :py:class:`~dagster.MultiAssetSensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job\n object to target with this sensor.\n jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]):\n (experimental) A list of jobs to be executed when the sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_assets (Optional[AssetSelection]): (Experimental) an asset selection to launch a run\n for if the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def __init__(\n self,\n name: str,\n monitored_assets: Union[Sequence[AssetKey], AssetSelection],\n job_name: Optional[str],\n asset_materialization_fn: MultiAssetMaterializationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_assets: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n resource_arg_names: Set[str] = {\n arg.name for arg in get_resource_args(asset_materialization_fn)\n }\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n def _wrap_asset_fn(materialization_fn):\n def _fn(context):\n def _check_cursor_not_set(sensor_result: SensorResult):\n if sensor_result.cursor:\n raise DagsterInvariantViolationError(\n "Cannot set cursor in a multi_asset_sensor. 
Cursor is set automatically"\n " based on the latest materialization for each monitored asset."\n )\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n with MultiAssetSensorEvaluationContext(\n instance_ref=context.instance_ref,\n last_completion_time=context.last_completion_time,\n last_run_key=context.last_run_key,\n cursor=context.cursor,\n repository_name=context.repository_def.name,\n repository_def=context.repository_def,\n monitored_assets=monitored_assets,\n instance=context.instance,\n resource_defs=context.resource_defs,\n ) as multi_asset_sensor_context:\n context_param_name = get_context_param_name(materialization_fn)\n context_param = (\n {context_param_name: multi_asset_sensor_context}\n if context_param_name\n else {}\n )\n result = materialization_fn(\n **context_param,\n **resource_args_populated,\n )\n if result is None:\n return\n\n # because the materialization_fn can yield results (see _wrapped_fn in multi_asset_sensor decorator),\n # even if you return None in a sensor, it will still cause in inspect.isgenerator(result) to be True.\n # So keep track to see if we actually return any values and should update the cursor\n runs_yielded = False\n if inspect.isgenerator(result) or isinstance(result, list):\n for item in result:\n if isinstance(item, RunRequest):\n runs_yielded = True\n if isinstance(item, SensorResult):\n raise DagsterInvariantViolationError(\n "Cannot yield a SensorResult from a multi_asset_sensor. 
Instead"\n " return the SensorResult."\n )\n yield item\n elif isinstance(result, RunRequest):\n runs_yielded = True\n yield result\n elif isinstance(result, SkipReason):\n # if result is a SkipReason, we don't update the cursor, so don't set runs_yielded = True\n yield result\n elif isinstance(result, SensorResult):\n _check_cursor_not_set(result)\n if result.run_requests:\n runs_yielded = True\n yield result\n\n if runs_yielded and not multi_asset_sensor_context.cursor_updated:\n raise DagsterInvalidDefinitionError(\n "Asset materializations have been handled in this sensor, but the cursor"\n " was not updated. This means the same materialization events will be"\n " handled in the next sensor tick. Use context.advance_cursor or"\n " context.advance_all_cursors to update the cursor."\n )\n\n multi_asset_sensor_context.update_cursor_after_evaluation()\n context.update_cursor(multi_asset_sensor_context.cursor)\n\n return _fn\n\n self._raw_asset_materialization_fn = asset_materialization_fn\n\n super(MultiAssetSensorDefinition, self).__init__(\n name=check_valid_name(name),\n job_name=job_name,\n evaluation_fn=_wrap_asset_fn(\n check.callable_param(asset_materialization_fn, "asset_materialization_fn")\n ),\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=request_assets,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> AssetMaterializationFunctionReturn:\n context_param_name = get_context_param_name(self._raw_asset_materialization_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._raw_asset_materialization_fn,\n args,\n kwargs,\n context_type=MultiAssetSensorEvaluationContext,\n )\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n\n context_param = {context_param_name: 
context} if context_param_name and context else {}\n result = self._raw_asset_materialization_fn(**context_param, **resources)\n\n if context:\n context.update_cursor_after_evaluation()\n return result\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.MULTI_ASSET
\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_asset_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_asset_sensor_definition"}, "multi_dimensional_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.multi_dimensional_partitions

\nimport hashlib\nimport itertools\nfrom datetime import datetime\nfrom functools import lru_cache, reduce\nfrom typing import (\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.tags import (\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    get_multidimensional_partition_tag,\n)\n\nfrom .partition import (\n    DefaultPartitionsSubset,\n    DynamicPartitionsDefinition,\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom .time_window_partitions import TimeWindow, TimeWindowPartitionsDefinition\n\nINVALID_STATIC_PARTITIONS_KEY_CHARACTERS = set(["|", ",", "[", "]"])\n\nMULTIPARTITION_KEY_DELIMITER = "|"\n\n\nclass PartitionDimensionKey(\n    NamedTuple("_PartitionDimensionKey", [("dimension_name", str), ("partition_key", str)])\n):\n    """Representation of a single dimension of a multi-dimensional partition key."""\n\n    def __new__(cls, dimension_name: str, partition_key: str):\n        return super(PartitionDimensionKey, cls).__new__(\n            cls,\n            dimension_name=check.str_param(dimension_name, "dimension_name"),\n            partition_key=check.str_param(partition_key, "partition_key"),\n        )\n\n\n
[docs]class MultiPartitionKey(str):\n """A multi-dimensional partition key stores the partition key for each dimension.\n Subclasses the string class to keep partition key type as a string.\n\n Contains additional methods to access the partition key for each dimension.\n Creates a string representation of the partition key for each dimension, separated by a pipe (|).\n Orders the dimensions by name, to ensure consistent string representation.\n """\n\n dimension_keys: List[PartitionDimensionKey] = []\n\n def __new__(cls, keys_by_dimension: Mapping[str, str]):\n check.mapping_param(\n keys_by_dimension, "partitions_by_dimension", key_type=str, value_type=str\n )\n\n dimension_keys: List[PartitionDimensionKey] = [\n PartitionDimensionKey(dimension, keys_by_dimension[dimension])\n for dimension in sorted(list(keys_by_dimension.keys()))\n ]\n\n str_key = super(MultiPartitionKey, cls).__new__(\n cls,\n MULTIPARTITION_KEY_DELIMITER.join(\n [dim_key.partition_key for dim_key in dimension_keys]\n ),\n )\n\n str_key.dimension_keys = dimension_keys\n\n return str_key\n\n def __getnewargs__(self):\n # When this instance is pickled, replace the argument to __new__ with the\n # dimension key mapping instead of the string representation.\n return ({dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys},)\n\n @property\n def keys_by_dimension(self) -> Mapping[str, str]:\n return {dim_key.dimension_name: dim_key.partition_key for dim_key in self.dimension_keys}
\n\n\nclass PartitionDimensionDefinition(\n NamedTuple(\n "_PartitionDimensionDefinition",\n [\n ("name", str),\n ("partitions_def", PartitionsDefinition),\n ],\n )\n):\n def __new__(\n cls,\n name: str,\n partitions_def: PartitionsDefinition,\n ):\n return super().__new__(\n cls,\n name=check.str_param(name, "name"),\n partitions_def=check.inst_param(partitions_def, "partitions_def", PartitionsDefinition),\n )\n\n def __eq__(self, other: object) -> bool:\n return (\n isinstance(other, PartitionDimensionDefinition)\n and self.name == other.name\n and self.partitions_def == other.partitions_def\n )\n\n\nALLOWED_PARTITION_DIMENSION_TYPES = (\n StaticPartitionsDefinition,\n TimeWindowPartitionsDefinition,\n DynamicPartitionsDefinition,\n)\n\n\ndef _check_valid_partitions_dimensions(\n partitions_dimensions: Mapping[str, PartitionsDefinition]\n) -> None:\n for dim_name, partitions_def in partitions_dimensions.items():\n if not any(isinstance(partitions_def, t) for t in ALLOWED_PARTITION_DIMENSION_TYPES):\n raise DagsterInvalidDefinitionError(\n f"Invalid partitions definition type {type(partitions_def)}. "\n "Only the following partitions definition types are supported: "\n f"{ALLOWED_PARTITION_DIMENSION_TYPES}."\n )\n if isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name is None:\n raise DagsterInvalidDefinitionError(\n "DynamicPartitionsDefinition must have a name to be used in a"\n " MultiPartitionsDefinition."\n )\n\n if isinstance(partitions_def, StaticPartitionsDefinition):\n if any(\n [\n INVALID_STATIC_PARTITIONS_KEY_CHARACTERS & set(key)\n for key in partitions_def.get_partition_keys()\n ]\n ):\n raise DagsterInvalidDefinitionError(\n f"Invalid character in partition key for dimension {dim_name}. "\n "A multi-partitions definition cannot contain partition keys with "\n "the following characters: |, [, ], ,"\n )\n\n\n
[docs]class MultiPartitionsDefinition(PartitionsDefinition[MultiPartitionKey]):\n """Takes the cross-product of partitions from two partitions definitions.\n\n For example, with a static partitions definition where the partitions are ["a", "b", "c"]\n and a daily partitions definition, this partitions definition will have the following\n partitions:\n\n 2020-01-01|a\n 2020-01-01|b\n 2020-01-01|c\n 2020-01-02|a\n 2020-01-02|b\n ...\n\n Args:\n partitions_defs (Mapping[str, PartitionsDefinition]):\n A mapping of dimension name to partitions definition. The total set of partitions will\n be the cross-product of the partitions from each PartitionsDefinition.\n\n Attributes:\n partitions_defs (Sequence[PartitionDimensionDefinition]):\n A sequence of PartitionDimensionDefinition objects, each of which contains a dimension\n name and a PartitionsDefinition. The total set of partitions will be the cross-product\n of the partitions from each PartitionsDefinition. This sequence is ordered by\n dimension name, to ensure consistent ordering of the partitions.\n """\n\n def __init__(self, partitions_defs: Mapping[str, PartitionsDefinition]):\n if not len(partitions_defs.keys()) == 2:\n raise DagsterInvalidInvocationError(\n "Dagster currently only supports multi-partitions definitions with 2 partitions"\n " definitions. 
Your multi-partitions definition has"\n f" {len(partitions_defs.keys())} partitions definitions."\n )\n check.mapping_param(\n partitions_defs, "partitions_defs", key_type=str, value_type=PartitionsDefinition\n )\n\n _check_valid_partitions_dimensions(partitions_defs)\n\n self._partitions_defs: List[PartitionDimensionDefinition] = sorted(\n [\n PartitionDimensionDefinition(name, partitions_def)\n for name, partitions_def in partitions_defs.items()\n ],\n key=lambda x: x.name,\n )\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return MultiPartitionsSubset\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(\n str(\n {\n dim_def.name: dim_def.partitions_def.get_serializable_unique_identifier(\n dynamic_partitions_store\n )\n for dim_def in self.partitions_defs\n }\n ).encode("utf-8")\n ).hexdigest()\n\n @property\n def partition_dimension_names(self) -> List[str]:\n return [dim_def.name for dim_def in self._partitions_defs]\n\n @property\n def partitions_defs(self) -> Sequence[PartitionDimensionDefinition]:\n return self._partitions_defs\n\n def get_partitions_def_for_dimension(self, dimension_name: str) -> PartitionsDefinition:\n for dim_def in self._partitions_defs:\n if dim_def.name == dimension_name:\n return dim_def.partitions_def\n check.failed(f"Invalid dimension name {dimension_name}")\n\n # We override the default implementation of `has_partition_key` for performance.\n def has_partition_key(\n self,\n partition_key: Union[MultiPartitionKey, str],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n partition_key = (\n partition_key\n if isinstance(partition_key, MultiPartitionKey)\n else self.get_partition_key_from_str(partition_key)\n )\n if partition_key.keys_by_dimension.keys() != set(self.partition_dimension_names):\n raise 
DagsterUnknownPartitionError(\n f"Invalid partition key {partition_key}. The dimensions of the partition key are"\n " not the dimensions of the partitions definition."\n )\n\n for dimension in self.partitions_defs:\n if not dimension.partitions_def.has_partition_key(\n partition_key.keys_by_dimension[dimension.name],\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ):\n return False\n return True\n\n # store results for repeated calls with the same current_time\n @lru_cache(maxsize=1)\n def _get_partition_keys(\n self, current_time: datetime, dynamic_partitions_store: Optional[DynamicPartitionsStore]\n ) -> Sequence[MultiPartitionKey]:\n partition_key_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in self._partitions_defs\n ]\n\n return [\n MultiPartitionKey(\n {self._partitions_defs[i].name: key for i, key in enumerate(partition_key_tuple)}\n )\n for partition_key_tuple in itertools.product(*partition_key_sequences)\n ]\n\n
[docs] @public\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[MultiPartitionKey]:\n """Returns a list of MultiPartitionKeys representing the partition keys of the\n PartitionsDefinition.\n\n Args:\n current_time (Optional[datetime]): A datetime object representing the current time, only\n applicable to time-based partition dimensions.\n dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n object that is responsible for fetching dynamic partitions. Required when a\n dimension is a DynamicPartitionsDefinition with a name defined. Users can pass the\n DagsterInstance fetched via `context.instance` to this argument.\n\n Returns:\n Sequence[MultiPartitionKey]\n """\n return self._get_partition_keys(\n current_time or pendulum.now("UTC"), dynamic_partitions_store\n )
\n\n def filter_valid_partition_keys(\n self, partition_keys: Set[str], dynamic_partitions_store: DynamicPartitionsStore\n ) -> Set[MultiPartitionKey]:\n partition_keys_by_dimension = {\n dim.name: dim.partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n }\n validated_partitions = set()\n for partition_key in partition_keys:\n partition_key_strs = partition_key.split(MULTIPARTITION_KEY_DELIMITER)\n if len(partition_key_strs) != len(self.partitions_defs):\n continue\n\n multipartition_key = MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n if all(\n key in partition_keys_by_dimension.get(dim, [])\n for dim, key in multipartition_key.keys_by_dimension.items()\n ):\n validated_partitions.add(partition_key)\n\n return validated_partitions\n\n def __eq__(self, other):\n return (\n isinstance(other, MultiPartitionsDefinition)\n and self.partitions_defs == other.partitions_defs\n )\n\n def __hash__(self):\n return hash(\n tuple(\n [\n (partitions_def.name, partitions_def.__repr__())\n for partitions_def in self.partitions_defs\n ]\n )\n )\n\n def __str__(self) -> str:\n dimension_1 = self._partitions_defs[0]\n dimension_2 = self._partitions_defs[1]\n partition_str = (\n "Multi-partitioned, with dimensions: \\n"\n f"{dimension_1.name.capitalize()}: {dimension_1.partitions_def} \\n"\n f"{dimension_2.name.capitalize()}: {dimension_2.partitions_def}"\n )\n return partition_str\n\n def __repr__(self) -> str:\n return f"{type(self).__name__}(dimensions={[str(dim) for dim in self.partitions_defs]}"\n\n def get_partition_key_from_str(self, partition_key_str: str) -> MultiPartitionKey:\n """Given a string representation of a partition key, returns a MultiPartitionKey object."""\n check.str_param(partition_key_str, "partition_key_str")\n\n partition_key_strs = partition_key_str.split(MULTIPARTITION_KEY_DELIMITER)\n check.invariant(\n 
len(partition_key_strs) == len(self.partitions_defs),\n f"Expected {len(self.partitions_defs)} partition keys in partition key string"\n f" {partition_key_str}, but got {len(partition_key_strs)}",\n )\n\n return MultiPartitionKey(\n {dim.name: partition_key_strs[i] for i, dim in enumerate(self._partitions_defs)}\n )\n\n def _get_primary_and_secondary_dimension(\n self,\n ) -> Tuple[PartitionDimensionDefinition, PartitionDimensionDefinition]:\n # Multipartitions subsets are serialized by primary dimension. If changing\n # the selection of primary/secondary dimension, will need to also update the\n # serialization of MultiPartitionsSubsets\n\n time_dimensions = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_dimensions) == 1:\n primary_dimension, secondary_dimension = time_dimensions[0], next(\n iter([dim for dim in self.partitions_defs if dim != time_dimensions[0]])\n )\n else:\n primary_dimension, secondary_dimension = (\n self.partitions_defs[0],\n self.partitions_defs[1],\n )\n\n return primary_dimension, secondary_dimension\n\n @property\n def primary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[0]\n\n @property\n def secondary_dimension(self) -> PartitionDimensionDefinition:\n return self._get_primary_and_secondary_dimension()[1]\n\n def get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:\n partition_key = cast(MultiPartitionKey, self.get_partition_key_from_str(partition_key))\n tags = {**super().get_tags_for_partition_key(partition_key)}\n tags.update(get_tags_from_multi_partition_key(partition_key))\n return tags\n\n @property\n def time_window_dimension(self) -> PartitionDimensionDefinition:\n time_window_dims = [\n dim\n for dim in self.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n check.invariant(\n len(time_window_dims) == 1, "Expected exactly one time 
window partitioned dimension"\n )\n return next(iter(time_window_dims))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n if not isinstance(partition_key, MultiPartitionKey):\n partition_key = self.get_partition_key_from_str(partition_key)\n\n time_window_dimension = self.time_window_dimension\n return cast(\n TimeWindowPartitionsDefinition, time_window_dimension.partitions_def\n ).time_window_for_partition_key(\n cast(MultiPartitionKey, partition_key).keys_by_dimension[time_window_dimension.name]\n )\n\n def get_multipartition_keys_with_dimension_value(\n self,\n dimension_name: str,\n dimension_partition_key: str,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Sequence[MultiPartitionKey]:\n check.str_param(dimension_name, "dimension_name")\n check.str_param(dimension_partition_key, "dimension_partition_key")\n\n matching_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name == dimension_name\n ]\n other_dimensions = [\n dimension for dimension in self.partitions_defs if dimension.name != dimension_name\n ]\n\n check.invariant(\n len(matching_dimensions) == 1,\n f"Dimension {dimension_name} not found in MultiPartitionsDefinition with dimensions"\n f" {[dim.name for dim in self.partitions_defs]}",\n )\n\n partition_sequences = [\n partition_dim.partitions_def.get_partition_keys(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for partition_dim in other_dimensions\n ] + [[dimension_partition_key]]\n\n # Names of partitions dimensions in the same order as partition_sequences\n partition_dim_names = [dim.name for dim in other_dimensions] + [dimension_name]\n\n return [\n MultiPartitionKey(\n {\n partition_dim_names[i]: partition_key\n for i, partition_key in enumerate(partitions_tuple)\n }\n )\n for partitions_tuple in itertools.product(*partition_sequences)\n ]\n\n def get_num_partitions(\n self,\n 
current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Static partitions definitions can contain duplicate keys (will throw error in 1.3.0)\n # In the meantime, relying on get_num_partitions to handle duplicates to display\n # correct counts in the Dagster UI.\n dimension_counts = [\n dim.partitions_def.get_num_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n for dim in self.partitions_defs\n ]\n return reduce(lambda x, y: x * y, dimension_counts, 1)
\n\n\nclass MultiPartitionsSubset(DefaultPartitionsSubset):\n def __init__(\n self,\n partitions_def: MultiPartitionsDefinition,\n subset: Optional[Set[str]] = None,\n ):\n check.inst_param(partitions_def, "partitions_def", MultiPartitionsDefinition)\n subset = (\n set(\n [\n partitions_def.get_partition_key_from_str(key)\n for key in subset\n if MULTIPARTITION_KEY_DELIMITER in key\n ]\n )\n if subset\n else set()\n )\n super(MultiPartitionsSubset, self).__init__(partitions_def, subset)\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "MultiPartitionsSubset":\n return MultiPartitionsSubset(\n cast(MultiPartitionsDefinition, self._partitions_def),\n self._subset | set(partition_keys),\n )\n\n\ndef get_tags_from_multi_partition_key(multi_partition_key: MultiPartitionKey) -> Mapping[str, str]:\n check.inst_param(multi_partition_key, "multi_partition_key", MultiPartitionKey)\n\n return {\n get_multidimensional_partition_tag(dimension.dimension_name): dimension.partition_key\n for dimension in multi_partition_key.dimension_keys\n }\n\n\ndef get_multipartition_key_from_tags(tags: Mapping[str, str]) -> str:\n partitions_by_dimension: Dict[str, str] = {}\n for tag in tags:\n if tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX):\n dimension = tag[len(MULTIDIMENSIONAL_PARTITION_PREFIX) :]\n partitions_by_dimension[dimension] = tags[tag]\n\n return MultiPartitionKey(partitions_by_dimension)\n
", "current_page_name": "_modules/dagster/_core/definitions/multi_dimensional_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.multi_dimensional_partitions"}, "op_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.op_definition

\nimport inspect\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias, get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, deprecated_param, public\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.dependency import NodeHandle, NodeInputHandle\nfrom dagster._core.definitions.node_definition import NodeDefinition\nfrom dagster._core.definitions.op_invocation import direct_invocation_result\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.resource_requirement import (\n    InputManagerRequirement,\n    OpDefinitionResourceRequirement,\n    OutputManagerRequirement,\n    ResourceRequirement,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.types.dagster_type import DagsterType, DagsterTypeKind\nfrom dagster._utils import IHasInternalInit\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom .definition_config_schema import (\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .hook_definition import HookDefinition\nfrom .inference import infer_output_props\nfrom .input import In, InputDefinition\nfrom .output import Out, OutputDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.asset_layer import AssetLayer\n\n    from .composition import PendingNodeInvocation\n    from .decorators.op_decorator import DecoratedOpFunction\n\nOpComputeFunction: TypeAlias = Callable[..., Any]\n\n\n
[docs]@deprecated_param(\n param="version", breaking_version="2.0", additional_warn_text="Use `code_version` instead."\n)\nclass OpDefinition(NodeDefinition, IHasInternalInit):\n """Defines an op, the functional unit of user-defined computation.\n\n For more details on what a op is, refer to the\n `Ops Overview <../../concepts/ops-jobs-graphs/ops>`_ .\n\n End users should prefer the :func:`@op <op>` decorator. OpDefinition is generally intended to be\n used by framework authors or for programatically generated ops.\n\n Args:\n name (str): Name of the op. Must be unique within any :py:class:`GraphDefinition` or\n :py:class:`JobDefinition` that contains the op.\n input_defs (List[InputDefinition]): Inputs of the op.\n compute_fn (Callable): The core of the op, the function that performs the actual\n computation. The signature of this function is determined by ``input_defs``, and\n optionally, an injected first argument, ``context``, a collection of information\n provided by the system.\n\n This function will be coerced into a generator or an async generator, which must yield\n one :py:class:`Output` for each of the op's ``output_defs``, and additionally may\n yield other types of Dagster events, including :py:class:`AssetMaterialization` and\n :py:class:`ExpectationResult`.\n output_defs (List[OutputDefinition]): Outputs of the op.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that the config provided for the op matches this schema and will fail if it does not. If\n not set, Dagster will accept any config provided for the op.\n description (Optional[str]): Human-readable description of the op.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n required_resource_keys (Optional[Set[str]]): Set of resources handles required by this op.\n code_version (Optional[str]): (Experimental) Version of the code encapsulated by the op. If set,\n this is used as a default code version for all outputs.\n retry_policy (Optional[RetryPolicy]): The retry policy for this op.\n\n\n Examples:\n .. code-block:: python\n\n def _add_one(_context, inputs):\n yield Output(inputs["num"] + 1)\n\n OpDefinition(\n name="add_one",\n ins={"num": In(int)},\n outs={"result": Out(int)},\n compute_fn=_add_one,\n )\n """\n\n _compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"]\n _config_schema: IDefinitionConfigSchema\n _required_resource_keys: AbstractSet[str]\n _version: Optional[str]\n _retry_policy: Optional[RetryPolicy]\n\n def __init__(\n self,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n description: Optional[str] = None,\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n version: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n code_version: Optional[str] = None,\n ):\n from .decorators.op_decorator import DecoratedOpFunction, resolve_checked_op_fn_inputs\n\n ins = check.opt_mapping_param(ins, "ins")\n input_defs = [\n inp.to_definition(name) for name, inp in sorted(ins.items(), key=lambda inp: inp[0])\n ] # sort so that input definition order is deterministic\n\n if isinstance(compute_fn, DecoratedOpFunction):\n resolved_input_defs: Sequence[InputDefinition] = resolve_checked_op_fn_inputs(\n decorator_name="@op",\n fn_name=name,\n compute_fn=cast(DecoratedOpFunction, compute_fn),\n explicit_input_defs=input_defs,\n 
exclude_nothing=True,\n )\n self._compute_fn = compute_fn\n _validate_context_type_hint(self._compute_fn.decorated_fn)\n else:\n resolved_input_defs = input_defs\n self._compute_fn = check.callable_param(compute_fn, "compute_fn")\n _validate_context_type_hint(self._compute_fn)\n\n code_version = normalize_renamed_param(\n code_version,\n "code_version",\n version,\n "version",\n )\n self._version = code_version\n\n check.opt_mapping_param(outs, "outs")\n output_defs = _resolve_output_defs_from_outs(\n compute_fn=compute_fn, outs=outs, default_code_version=code_version\n )\n\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._required_resource_keys = frozenset(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n self._retry_policy = check.opt_inst_param(retry_policy, "retry_policy", RetryPolicy)\n\n positional_inputs = (\n self._compute_fn.positional_inputs()\n if isinstance(self._compute_fn, DecoratedOpFunction)\n else None\n )\n\n super(OpDefinition, self).__init__(\n name=name,\n input_defs=check.sequence_param(resolved_input_defs, "input_defs", InputDefinition),\n output_defs=check.sequence_param(output_defs, "output_defs", OutputDefinition),\n description=description,\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n positional_inputs=positional_inputs,\n )\n\n def dagster_internal_init(\n *,\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n name: str,\n ins: Optional[Mapping[str, In]],\n outs: Optional[Mapping[str, Out]],\n description: Optional[str],\n config_schema: Optional[Union[UserConfigSchema, IDefinitionConfigSchema]],\n required_resource_keys: Optional[AbstractSet[str]],\n tags: Optional[Mapping[str, Any]],\n version: Optional[str],\n retry_policy: Optional[RetryPolicy],\n code_version: Optional[str],\n ) -> "OpDefinition":\n return OpDefinition(\n compute_fn=compute_fn,\n name=name,\n ins=ins,\n outs=outs,\n description=description,\n 
config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n tags=tags,\n version=version,\n retry_policy=retry_policy,\n code_version=code_version,\n )\n\n @property\n def node_type_str(self) -> str:\n return "op"\n\n @property\n def is_graph_job_op_node(self) -> bool:\n return True\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this op."""\n return super(OpDefinition, self).name\n\n @public\n @property\n def ins(self) -> Mapping[str, In]:\n """Mapping[str, In]: A mapping from input name to the In object that represents that input."""\n return {input_def.name: In.from_definition(input_def) for input_def in self.input_defs}\n\n @public\n @property\n def outs(self) -> Mapping[str, Out]:\n """Mapping[str, Out]: A mapping from output name to the Out object that represents that output."""\n return {output_def.name: Out.from_definition(output_def) for output_def in self.output_defs}\n\n @property\n def compute_fn(self) -> Union[Callable[..., Any], "DecoratedOpFunction"]:\n return self._compute_fn\n\n @public\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n """IDefinitionConfigSchema: The config schema for this op."""\n return self._config_schema\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """AbstractSet[str]: A set of keys for resources that must be provided to this OpDefinition."""\n return frozenset(self._required_resource_keys)\n\n @public\n @deprecated(breaking_version="2.0", additional_warn_text="Use `code_version` instead.")\n @property\n def version(self) -> Optional[str]:\n """str: Version of the code encapsulated by the op. 
If set, this is used as a\n default code version for all outputs.\n """\n return self._version\n\n @public\n @property\n def retry_policy(self) -> Optional[RetryPolicy]:\n """Optional[RetryPolicy]: The RetryPolicy for this op."""\n return self._retry_policy\n\n @public\n @property\n def tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for this op."""\n return super(OpDefinition, self).tags\n\n
[docs] @public\n def alias(self, name: str) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given name."""\n return super(OpDefinition, self).alias(name)
\n\n
[docs] @public\n def tag(self, tags: Optional[Mapping[str, str]]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given tags."""\n return super(OpDefinition, self).tag(tags)
\n\n
[docs] @public\n def with_hooks(self, hook_defs: AbstractSet[HookDefinition]) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given hook definitions."""\n return super(OpDefinition, self).with_hooks(hook_defs)
\n\n
[docs] @public\n def with_retry_policy(self, retry_policy: RetryPolicy) -> "PendingNodeInvocation":\n """Creates a copy of this op with the given retry policy."""\n return super(OpDefinition, self).with_retry_policy(retry_policy)
\n\n def is_from_decorator(self) -> bool:\n from .decorators.op_decorator import DecoratedOpFunction\n\n return isinstance(self._compute_fn, DecoratedOpFunction)\n\n def get_output_annotation(self) -> Any:\n if not self.is_from_decorator():\n raise DagsterInvalidInvocationError(\n f"Attempted to get output annotation for {self.node_type_str} '{self.name}', "\n "which was not constructed from a decorated function."\n )\n return cast("DecoratedOpFunction", self.compute_fn).get_output_annotation()\n\n def all_dagster_types(self) -> Iterator[DagsterType]:\n yield from self.all_input_output_types()\n\n def iterate_node_defs(self) -> Iterator[NodeDefinition]:\n yield self\n\n def iterate_op_defs(self) -> Iterator["OpDefinition"]:\n yield self\n\n T_Handle = TypeVar("T_Handle", bound=Optional[NodeHandle])\n\n def resolve_output_to_origin(\n self, output_name: str, handle: T_Handle\n ) -> Tuple[OutputDefinition, T_Handle]:\n return self.output_def_named(output_name), handle\n\n def resolve_output_to_origin_op_def(self, output_name: str) -> "OpDefinition":\n return self\n\n def get_inputs_must_be_resolved_top_level(\n self, asset_layer: "AssetLayer", handle: Optional[NodeHandle] = None\n ) -> Sequence[InputDefinition]:\n handle = cast(NodeHandle, check.inst_param(handle, "handle", NodeHandle))\n unresolveable_input_defs = []\n for input_def in self.input_defs:\n if (\n not input_def.dagster_type.loader\n and not input_def.dagster_type.kind == DagsterTypeKind.NOTHING\n and not input_def.has_default_value\n and not input_def.input_manager_key\n ):\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n # If input_asset_key is present, this input can be resolved\n # by a source asset, so input does not need to be resolved\n # at the top level.\n if input_asset_key:\n continue\n unresolveable_input_defs.append(input_def)\n return unresolveable_input_defs\n\n def input_has_default(self, input_name: str) -> bool:\n return 
self.input_def_named(input_name).has_default_value\n\n def default_value_for_input(self, input_name: str) -> InputDefinition:\n return self.input_def_named(input_name).default_value\n\n def input_supports_dynamic_output_dep(self, input_name: str) -> bool:\n return True\n\n def with_replaced_properties(\n self,\n name: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[IDefinitionConfigSchema] = None,\n description: Optional[str] = None,\n ) -> "OpDefinition":\n return OpDefinition.dagster_internal_init(\n name=name,\n ins=ins\n or {input_def.name: In.from_definition(input_def) for input_def in self.input_defs},\n outs=outs\n or {\n output_def.name: Out.from_definition(output_def) for output_def in self.output_defs\n },\n compute_fn=self.compute_fn,\n config_schema=config_schema or self.config_schema,\n description=description or self.description,\n tags=self.tags,\n required_resource_keys=self.required_resource_keys,\n code_version=self._version,\n retry_policy=self.retry_policy,\n version=None, # code_version replaces version\n )\n\n def copy_for_configured(\n self,\n name: str,\n description: Optional[str],\n config_schema: IDefinitionConfigSchema,\n ) -> "OpDefinition":\n return self.with_replaced_properties(\n name=name,\n description=description,\n config_schema=config_schema,\n )\n\n def get_resource_requirements(\n self,\n outer_context: Optional[object] = None,\n ) -> Iterator[ResourceRequirement]:\n # Outer requiree in this context is the outer-calling node handle. 
If not provided, then\n # just use the op name.\n outer_context = cast(Optional[Tuple[NodeHandle, Optional["AssetLayer"]]], outer_context)\n if not outer_context:\n handle = None\n asset_layer = None\n else:\n handle, asset_layer = outer_context\n node_description = f"{self.node_type_str} '{handle or self.name}'"\n for resource_key in sorted(list(self.required_resource_keys)):\n yield OpDefinitionResourceRequirement(\n key=resource_key, node_description=node_description\n )\n for input_def in self.input_defs:\n if input_def.input_manager_key:\n yield InputManagerRequirement(\n key=input_def.input_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n elif asset_layer and handle:\n input_asset_key = asset_layer.asset_key_for_input(handle, input_def.name)\n if input_asset_key:\n io_manager_key = asset_layer.io_manager_key_for_asset(input_asset_key)\n yield InputManagerRequirement(\n key=io_manager_key,\n node_description=node_description,\n input_name=input_def.name,\n root_input=False,\n )\n\n for output_def in self.output_defs:\n yield OutputManagerRequirement(\n key=output_def.io_manager_key,\n node_description=node_description,\n output_name=output_def.name,\n )\n\n def resolve_input_to_destinations(\n self, input_handle: NodeInputHandle\n ) -> Sequence[NodeInputHandle]:\n return [input_handle]\n\n def __call__(self, *args, **kwargs) -> Any:\n from .composition import is_in_composition\n\n if is_in_composition():\n return super(OpDefinition, self).__call__(*args, **kwargs)\n\n return direct_invocation_result(self, *args, **kwargs)
\n\n\ndef _resolve_output_defs_from_outs(\n compute_fn: Union[Callable[..., Any], "DecoratedOpFunction"],\n outs: Optional[Mapping[str, Out]],\n default_code_version: Optional[str],\n) -> Sequence[OutputDefinition]:\n from .decorators.op_decorator import DecoratedOpFunction\n\n if isinstance(compute_fn, DecoratedOpFunction):\n inferred_output_props = infer_output_props(compute_fn.decorated_fn)\n annotation = inferred_output_props.annotation\n description = inferred_output_props.description\n else:\n inferred_output_props = None\n annotation = inspect.Parameter.empty\n description = None\n\n if outs is None:\n return [OutputDefinition.create_from_inferred(inferred_output_props, default_code_version)]\n\n # If only a single entry has been provided to the out dict, then slurp the\n # annotation into the entry.\n if len(outs) == 1:\n name = next(iter(outs.keys()))\n only_out = outs[name]\n return [only_out.to_definition(annotation, name, description, default_code_version)]\n\n output_defs: List[OutputDefinition] = []\n\n # Introspection on type annotations is experimental, so checking\n # metaclass is the best we can do.\n if annotation != inspect.Parameter.empty and not get_origin(annotation) == tuple:\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation for multiple outputs, but received non-tuple annotation."\n )\n if annotation != inspect.Parameter.empty and not len(get_args(annotation)) == len(outs):\n raise DagsterInvariantViolationError(\n "Expected Tuple annotation to have number of entries matching the "\n f"number of outputs for more than one output. Expected {len(outs)} "\n f"outputs but annotation has {len(get_args(annotation))}."\n )\n for idx, (name, cur_out) in enumerate(outs.items()):\n annotation_type = (\n get_args(annotation)[idx]\n if annotation != inspect.Parameter.empty\n else inspect.Parameter.empty\n )\n # Don't provide description when using multiple outputs. 
Introspection\n # is challenging when faced with multiple inputs.\n output_defs.append(\n cur_out.to_definition(\n annotation_type, name=name, description=None, code_version=default_code_version\n )\n )\n\n return output_defs\n\n\ndef _validate_context_type_hint(fn):\n from inspect import _empty as EmptyAnnotation\n\n from dagster._core.decorator_utils import get_function_params\n from dagster._core.definitions.decorators.op_decorator import is_context_provided\n from dagster._core.execution.context.compute import AssetExecutionContext, OpExecutionContext\n\n params = get_function_params(fn)\n if is_context_provided(params):\n if (\n params[0].annotation is not AssetExecutionContext\n and params[0].annotation is not OpExecutionContext\n and params[0].annotation is not EmptyAnnotation\n ):\n raise DagsterInvalidDefinitionError(\n f"Cannot annotate `context` parameter with type {params[0].annotation}. `context`"\n " must be annotated with AssetExecutionContext, OpExecutionContext, or left blank."\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/op_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.op_definition"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.output

\nimport inspect\nfrom typing import (\n    Any,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated_param\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataUserInput,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterError, DagsterInvalidDefinitionError\nfrom dagster._core.types.dagster_type import (\n    DagsterType,\n    is_dynamic_output_annotation,\n    resolve_dagster_type,\n)\n\nfrom .inference import InferredOutputProps\nfrom .input import NoValueSentinel\nfrom .utils import DEFAULT_IO_MANAGER_KEY, DEFAULT_OUTPUT, check_valid_name\n\nTOutputDefinition = TypeVar("TOutputDefinition", bound="OutputDefinition")\nTOut = TypeVar("TOut", bound="Out")\n\n\nclass OutputDefinition:\n    """Defines an output from an op's compute function.\n\n    Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n    Many ops have only one output, in which case the user can provide a single output definition\n    that will be given the default name, "result".\n\n    Output definitions may be typed using the Dagster type system.\n\n    Args:\n        dagster_type (Optional[Union[Type, DagsterType]]]): The type of this output.\n            Users should provide the Python type of the objects that they expect the op to yield\n            for this output, or a :py:class:`DagsterType` that defines a runtime check that they\n            want to be run on this output. Defaults to :py:class:`Any`.\n        name (Optional[str]): Name of the output. (default: "result")\n        description (Optional[str]): Human-readable description of the output.\n        is_required (Optional[bool]): Whether the presence of this field is required. 
(default: True)\n        io_manager_key (Optional[str]): The resource key of the IOManager used for storing this\n            output and loading it in downstream steps (default: "io_manager").\n        metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n            For example, users can provide a file path if the data object will be stored in a\n            filesystem, or provide information of a database table when it is going to load the data\n            into the table.\n        code_version (Optional[str]): (Experimental) Version of the code that generates this output. In\n            general, versions should be set only for code that deterministically produces the same\n            output when given the same inputs.\n\n    """\n\n    def __init__(\n        self,\n        dagster_type=None,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        is_required: bool = True,\n        io_manager_key: Optional[str] = None,\n        metadata: Optional[ArbitraryMetadataMapping] = None,\n        code_version: Optional[str] = None,\n        # make sure new parameters are updated in combine_with_inferred below\n    ):\n        self._name = check_valid_name(check.opt_str_param(name, "name", DEFAULT_OUTPUT))\n        self._type_not_set = dagster_type is None\n        self._dagster_type = resolve_dagster_type(dagster_type)\n        self._description = check.opt_str_param(description, "description")\n        self._is_required = check.bool_param(is_required, "is_required")\n        self._io_manager_key = check.opt_str_param(\n            io_manager_key,\n            "io_manager_key",\n            default=DEFAULT_IO_MANAGER_KEY,\n        )\n        self._code_version = check.opt_str_param(code_version, "code_version")\n        self._raw_metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n        self._metadata = normalize_metadata(self._raw_metadata, allow_invalid=True)\n\n    @property\n    def 
name(self) -> str:\n        return self._name\n\n    @property\n    def dagster_type(self) -> DagsterType:\n        return self._dagster_type\n\n    @property\n    def description(self) -> Optional[str]:\n        return self._description\n\n    @property\n    def is_required(self) -> bool:\n        return self._is_required\n\n    @property\n    def io_manager_key(self) -> str:\n        return self._io_manager_key\n\n    @property\n    def code_version(self) -> Optional[str]:\n        return self._code_version\n\n    @property\n    def optional(self) -> bool:\n        return not self.is_required\n\n    @property\n    def metadata(self) -> ArbitraryMetadataMapping:\n        return self._raw_metadata\n\n    @property\n    def is_dynamic(self) -> bool:\n        return False\n\n    def mapping_from(\n        self, node_name: str, output_name: Optional[str] = None, from_dynamic_mapping: bool = False\n    ) -> "OutputMapping":\n        """Create an output mapping from an output of a child node.\n\n        In a GraphDefinition, you can use this helper function to construct\n        an :py:class:`OutputMapping` from the output of a child node.\n\n        Args:\n            node_name (str): The name of the child node from which to map this output.\n            output_name (str): The name of the child node's output from which to map this output.\n\n        Examples:\n            .. 
code-block:: python\n\n                output_mapping = OutputDefinition(Int).mapping_from('child_node')\n        """\n        return OutputMapping(\n            graph_output_name=self.name,\n            mapped_node_name=node_name,\n            mapped_node_output_name=output_name or DEFAULT_OUTPUT,\n            graph_output_description=self.description,\n            dagster_type=self.dagster_type,\n            from_dynamic_mapping=from_dynamic_mapping or self.is_dynamic,\n        )\n\n    @staticmethod\n    def create_from_inferred(\n        inferred: Optional[InferredOutputProps], code_version: Optional[str] = None\n    ) -> "OutputDefinition":\n        if not inferred:\n            return OutputDefinition(code_version=code_version)\n        if is_dynamic_output_annotation(inferred.annotation):\n            return DynamicOutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n        else:\n            return OutputDefinition(\n                dagster_type=_checked_inferred_type(inferred.annotation),\n                description=inferred.description,\n                code_version=code_version,\n            )\n\n    def combine_with_inferred(\n        self: TOutputDefinition, inferred: InferredOutputProps\n    ) -> TOutputDefinition:\n        dagster_type = self.dagster_type\n        if self._type_not_set:\n            dagster_type = _checked_inferred_type(inferred.annotation)\n        if self.description is None:\n            description = inferred.description\n        else:\n            description = self.description\n\n        return self.__class__(\n            name=self.name,\n            dagster_type=dagster_type,\n            description=description,\n            is_required=self.is_required,\n            io_manager_key=self.io_manager_key,\n            metadata=self._metadata,\n        )\n\n\ndef 
_checked_inferred_type(inferred: Any) -> DagsterType:\n    try:\n        if inferred == inspect.Parameter.empty:\n            return resolve_dagster_type(None)\n        elif inferred is None:\n            # When inferred.annotation is None, it means someone explicitly put "None" as the\n            # annotation, so want to map it to a DagsterType that checks for the None type\n            return resolve_dagster_type(type(None))\n        else:\n            return resolve_dagster_type(inferred)\n\n    except DagsterError as e:\n        raise DagsterInvalidDefinitionError(\n            f"Problem using type '{inferred}' from return type annotation, correct the issue "\n            "or explicitly set the dagster_type via Out()."\n        ) from e\n\n\nclass DynamicOutputDefinition(OutputDefinition):\n    """Variant of :py:class:`OutputDefinition <dagster.OutputDefinition>` for an\n    output that will dynamically alter the graph at runtime.\n\n    When using in a composition function such as :py:func:`@job <dagster.job>`,\n    dynamic outputs must be used with either:\n\n    * ``map`` - clone downstream nodes for each separate :py:class:`DynamicOutput`\n    * ``collect`` - gather across all :py:class:`DynamicOutput` in to a list\n\n    Uses the same constructor as :py:class:`OutputDefinition <dagster.OutputDefinition>`\n\n        .. 
code-block:: python\n\n            @op(\n                config_schema={\n                    "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n                },\n                output_defs=[DynamicOutputDefinition(str)],\n            )\n            def files_in_directory(context):\n                path = context.op_config["path"]\n                dirname, _, filenames = next(os.walk(path))\n                for file in filenames:\n                    yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n            @job\n            def process_directory():\n                files = files_in_directory()\n\n                # use map to invoke an op on each dynamic output\n                file_results = files.map(process_file)\n\n                # use collect to gather the results in to a list\n                summarize_directory(file_results.collect())\n    """\n\n    @property\n    def is_dynamic(self) -> bool:\n        return True\n\n\nclass OutputPointer(NamedTuple("_OutputPointer", [("node_name", str), ("output_name", str)])):\n    def __new__(cls, node_name: str, output_name: Optional[str] = None):\n        return super(OutputPointer, cls).__new__(\n            cls,\n            check.str_param(node_name, "node_name"),\n            check.opt_str_param(output_name, "output_name", DEFAULT_OUTPUT),\n        )\n\n\n
[docs]@deprecated_param(\n param="dagster_type",\n breaking_version="2.0",\n additional_warn_text="Any defined `dagster_type` should come from the underlying op `Output`.",\n # Disabling warning here since we're passing this internally and I'm not sure whether it is\n # actually used or discarded.\n emit_runtime_warning=False,\n)\nclass OutputMapping(NamedTuple):\n """Defines an output mapping for a graph.\n\n Args:\n graph_output_name (str): Name of the output in the graph being mapped to.\n mapped_node_name (str): Named of the node (op/graph) that the output is being mapped from.\n mapped_node_output_name (str): Name of the output in the node (op/graph) that is being mapped from.\n graph_output_description (Optional[str]): A description of the output in the graph being mapped from.\n from_dynamic_mapping (bool): Set to true if the node being mapped to is a mapped dynamic node.\n dagster_type (Optional[DagsterType]): The dagster type of the graph's output being mapped to.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import OutputMapping, GraphDefinition, op, graph, GraphOut\n\n @op\n def emit_five(x):\n return 5\n\n # The following two graph definitions are equivalent\n GraphDefinition(\n name="the_graph",\n node_defs=[emit_five],\n output_mappings=[\n OutputMapping(\n graph_output_name="result", # Default output name\n mapped_node_name="emit_five",\n mapped_node_output_name="result"\n )\n ]\n )\n\n @graph(out=GraphOut())\n def the_graph:\n return emit_five()\n """\n\n graph_output_name: str\n mapped_node_name: str\n mapped_node_output_name: str\n graph_output_description: Optional[str] = None\n dagster_type: Optional[DagsterType] = None\n from_dynamic_mapping: bool = False\n\n @property\n def maps_from(self) -> OutputPointer:\n return OutputPointer(self.mapped_node_name, self.mapped_node_output_name)\n\n def get_definition(self, is_dynamic: bool) -> "OutputDefinition":\n check.invariant(not is_dynamic or self.from_dynamic_mapping)\n is_dynamic = is_dynamic or self.from_dynamic_mapping\n klass = DynamicOutputDefinition if is_dynamic else OutputDefinition\n return klass(\n name=self.graph_output_name,\n description=self.graph_output_description,\n dagster_type=self.dagster_type,\n )
\n\n\n
[docs]class Out(\n NamedTuple(\n "_Out",\n [\n ("dagster_type", PublicAttr[Union[DagsterType, Type[NoValueSentinel]]]),\n ("description", PublicAttr[Optional[str]]),\n ("is_required", PublicAttr[bool]),\n ("io_manager_key", PublicAttr[str]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("code_version", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Defines an output from an op's compute function.\n\n Ops can have multiple outputs, in which case outputs cannot be anonymous.\n\n Many ops have only one output, in which case the user can provide a single output definition\n that will be given the default name, "result".\n\n Outs may be typed using the Dagster type system.\n\n Args:\n dagster_type (Optional[Union[Type, DagsterType]]]):\n The type of this output. Should only be set if the correct type can not\n be inferred directly from the type signature of the decorated function.\n description (Optional[str]): Human-readable description of the output.\n is_required (bool): Whether the presence of this field is required. (default: True)\n io_manager_key (Optional[str]): The resource key of the output manager used for this output.\n (default: "io_manager").\n metadata (Optional[Dict[str, Any]]): A dict of the metadata for the output.\n For example, users can provide a file path if the data object will be stored in a\n filesystem, or provide information of a database table when it is going to load the data\n into the table.\n code_version (Optional[str]): (Experimental) Version of the code that generates this output. 
In\n general, versions should be set only for code that deterministically produces the same\n output when given the same inputs.\n """\n\n def __new__(\n cls,\n dagster_type: Union[Type, DagsterType] = NoValueSentinel,\n description: Optional[str] = None,\n is_required: bool = True,\n io_manager_key: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n code_version: Optional[str] = None,\n # make sure new parameters are updated in combine_with_inferred below\n ):\n return super(Out, cls).__new__(\n cls,\n dagster_type=(\n NoValueSentinel\n if dagster_type is NoValueSentinel\n else resolve_dagster_type(dagster_type)\n ),\n description=description,\n is_required=check.bool_param(is_required, "is_required"),\n io_manager_key=check.opt_str_param(\n io_manager_key, "io_manager_key", default=DEFAULT_IO_MANAGER_KEY\n ),\n metadata=metadata,\n code_version=code_version,\n )\n\n @classmethod\n def from_definition(cls, output_def: "OutputDefinition"):\n klass = Out if not output_def.is_dynamic else DynamicOut\n return klass(\n dagster_type=output_def.dagster_type,\n description=output_def.description,\n is_required=output_def.is_required,\n io_manager_key=output_def.io_manager_key,\n metadata=output_def.metadata,\n code_version=output_def.code_version,\n )\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n klass = OutputDefinition if not self.is_dynamic else DynamicOutputDefinition\n\n return klass(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return False
\n\n\n
[docs]class DynamicOut(Out):\n """Variant of :py:class:`Out <dagster.Out>` for an output that will dynamically alter the graph at\n runtime.\n\n When using in a composition function such as :py:func:`@graph <dagster.graph>`,\n dynamic outputs must be used with either\n\n * ``map`` - clone downstream ops for each separate :py:class:`DynamicOut`\n * ``collect`` - gather across all :py:class:`DynamicOut` in to a list\n\n Uses the same constructor as :py:class:`Out <dagster.Out>`\n\n .. code-block:: python\n\n @op(\n config_schema={\n "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n },\n out=DynamicOut(str),\n )\n def files_in_directory(context):\n path = context.op_config["path"]\n dirname, _, filenames = next(os.walk(path))\n for file in filenames:\n yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n @job\n def process_directory():\n files = files_in_directory()\n\n # use map to invoke an op on each dynamic output\n file_results = files.map(process_file)\n\n # use collect to gather the results in to a list\n summarize_directory(file_results.collect())\n """\n\n def to_definition(\n self,\n annotation_type: type,\n name: Optional[str],\n description: Optional[str],\n code_version: Optional[str],\n ) -> "OutputDefinition":\n dagster_type = (\n self.dagster_type\n if self.dagster_type is not NoValueSentinel\n else _checked_inferred_type(annotation_type)\n )\n\n return DynamicOutputDefinition(\n dagster_type=dagster_type,\n name=name,\n description=self.description or description,\n is_required=self.is_required,\n io_manager_key=self.io_manager_key,\n metadata=self.metadata,\n code_version=self.code_version or code_version,\n )\n\n @property\n def is_dynamic(self) -> bool:\n return True
\n\n\n
[docs]class GraphOut(NamedTuple("_GraphOut", [("description", PublicAttr[Optional[str]])])):\n """Represents information about the outputs that a graph maps.\n\n Args:\n description (Optional[str]): Human-readable description of the output.\n """\n\n def __new__(cls, description: Optional[str] = None):\n return super(GraphOut, cls).__new__(cls, description=description)\n\n def to_definition(self, name: Optional[str]) -> "OutputDefinition":\n return OutputDefinition(name=name, description=self.description)
\n
", "current_page_name": "_modules/dagster/_core/definitions/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.output"}, "partition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition

\nimport copy\nimport hashlib\nimport json\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import (\n    datetime,\n    timedelta,\n)\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    Union,\n    cast,\n)\n\nfrom dateutil.relativedelta import relativedelta\nfrom typing_extensions import TypeVar\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, deprecated, deprecated_param, public\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.run_request import (\n    AddDynamicPartitionsRequest,\n    DeleteDynamicPartitionsRequest,\n)\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG, PARTITION_SET_TAG\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import xor\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import (\n    normalize_renamed_param,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n    DagsterInvalidInvocationError,\n    DagsterUnknownPartitionError,\n)\nfrom .config import ConfigMapping\nfrom .utils import validate_tags\n\nDEFAULT_DATE_FORMAT = "%Y-%m-%d"\n\nT_cov = TypeVar("T_cov", default=Any, covariant=True)\nT_str = TypeVar("T_str", bound=str, default=str, covariant=True)\nT_PartitionsDefinition = TypeVar(\n    "T_PartitionsDefinition",\n    bound="PartitionsDefinition",\n    default="PartitionsDefinition",\n    covariant=True,\n)\n\n# In the Dagster UI users can select partition ranges following the format '2022-01-13...2022-01-14'\n# "..." 
is an invalid substring in partition keys\n# The other escape characters are characters that may not display in the Dagster UI.\nINVALID_PARTITION_SUBSTRINGS = ["...", "\\a", "\\b", "\\f", "\\n", "\\r", "\\t", "\\v", "\\0"]\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use string partition keys instead.")\nclass Partition(Generic[T_cov]):\n    """A Partition represents a single slice of the entire set of a job's possible work. It consists\n    of a value, which is an object that represents that partition, and an optional name, which is\n    used to label the partition in a human-readable way.\n\n    Args:\n        value (Any): The object for this partition\n        name (str): Name for this partition\n    """\n\n    def __init__(self, value: Any, name: Optional[str] = None):\n        self._value = value\n        self._name = check.str_param(name or str(value), "name")\n\n    @property\n    def value(self) -> T_cov:\n        return self._value\n\n    @property\n    def name(self) -> str:\n        return self._name\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, Partition):\n            return False\n        else:\n            return self.value == other.value and self.name == other.name\n\n\n@whitelist_for_serdes\nclass ScheduleType(Enum):\n    HOURLY = "HOURLY"\n    DAILY = "DAILY"\n    WEEKLY = "WEEKLY"\n    MONTHLY = "MONTHLY"\n\n    @property\n    def ordinal(self):\n        return {"HOURLY": 1, "DAILY": 2, "WEEKLY": 3, "MONTHLY": 4}[self.value]\n\n    @property\n    def delta(self):\n        if self == ScheduleType.HOURLY:\n            return timedelta(hours=1)\n        elif self == ScheduleType.DAILY:\n            return timedelta(days=1)\n        elif self == ScheduleType.WEEKLY:\n            return timedelta(weeks=1)\n        elif self == ScheduleType.MONTHLY:\n            return relativedelta(months=1)\n        else:\n            check.failed(f"Unexpected ScheduleType {self}")\n\n    def __gt__(self, other: 
"ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal > other.ordinal\n\n    def __lt__(self, other: "ScheduleType") -> bool:\n        check.inst(other, ScheduleType, "Cannot compare ScheduleType with non-ScheduleType")\n        return self.ordinal < other.ordinal\n\n\n
class PartitionsDefinition(ABC, Generic[T_str]):
    """Defines a set of partitions, which can be attached to a software-defined asset or job.

    Abstract class with implementations for different kinds of partitions. Subclasses must
    implement ``get_partition_keys``; the remaining methods are built on top of it.
    """

    @property
    def partitions_subset_class(self) -> Type["PartitionsSubset[T_str]"]:
        """The ``PartitionsSubset`` class used to build subsets of this definition."""
        return DefaultPartitionsSubset[T_str]

    @abstractmethod
    @public
    def get_partition_keys(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Sequence[T_str]:
        """Returns a list of strings representing the partition keys of the PartitionsDefinition.

        Args:
            current_time (Optional[datetime]): A datetime object representing the current time, only
                applicable to time-based partitions definitions.
            dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore
                object that is responsible for fetching dynamic partitions. Required when the
                partitions definition is a DynamicPartitionsDefinition with a name defined. Users
                can pass the DagsterInstance fetched via `context.instance` to this argument.

        Returns:
            Sequence[str]
        """
        ...

    def __str__(self) -> str:
        """Return every partition key wrapped in single quotes, joined by ``", "``."""
        return ", ".join(f"'{key}'" for key in self.get_partition_keys())

    def get_last_partition_key(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Optional[T_str]:
        """Return the last key from ``get_partition_keys``, or None when there are no keys."""
        partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)
        return partition_keys[-1] if partition_keys else None

    def get_first_partition_key(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Optional[T_str]:
        """Return the first key from ``get_partition_keys``, or None when there are no keys."""
        partition_keys = self.get_partition_keys(current_time, dynamic_partitions_store)
        return partition_keys[0] if partition_keys else None

    def get_partition_keys_in_range(
        self,
        partition_key_range: PartitionKeyRange,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Sequence[T_str]:
        """Return the partition keys from the range's start through its end, inclusive.

        Args:
            partition_key_range (PartitionKeyRange): The range whose ``start`` and ``end`` keys
                bound the returned slice.
            dynamic_partitions_store (Optional[DynamicPartitionsStore]): Forwarded to
                ``has_partition_key`` and ``get_partition_keys``.

        Raises:
            DagsterInvalidInvocationError: If the start or the end key is not a partition key
                of this definition.
        """
        start, end = partition_key_range.start, partition_key_range.end

        # dict.fromkeys de-duplicates while keeping order, so a single-key range is only
        # checked once.
        nonexistent_keys = [
            key
            for key in dict.fromkeys((start, end))
            if not self.has_partition_key(key, dynamic_partitions_store=dynamic_partitions_store)
        ]
        if nonexistent_keys:
            # Single-line message: the previous triple-quoted f-string leaked raw newlines and
            # source indentation into the error text.
            raise DagsterInvalidInvocationError(
                f"Partition range {start} to {end} is not a valid range."
                f" Nonexistent partition keys: {nonexistent_keys}"
            )

        # in the simple case, simply return the single key in the range
        if start == end:
            return [cast(T_str, start)]

        # defer this call as it is potentially expensive
        partition_keys = self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)
        start_index = partition_keys.index(start)
        end_index = partition_keys.index(end)
        return partition_keys[start_index : end_index + 1]

    def empty_subset(self) -> "PartitionsSubset[T_str]":
        """Return an empty subset built by ``partitions_subset_class`` for this definition."""
        return self.partitions_subset_class.empty_subset(self)

    def subset_with_partition_keys(
        self, partition_keys: Iterable[str]
    ) -> "PartitionsSubset[T_str]":
        """Return a subset of this definition containing the given partition keys."""
        return self.empty_subset().with_partition_keys(partition_keys)

    def subset_with_all_partitions(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> "PartitionsSubset[T_str]":
        """Return a subset containing every key returned by ``get_partition_keys``."""
        all_keys = self.get_partition_keys(
            current_time=current_time, dynamic_partitions_store=dynamic_partitions_store
        )
        return self.subset_with_partition_keys(all_keys)

    def deserialize_subset(self, serialized: str) -> "PartitionsSubset[T_str]":
        """Rebuild a subset from ``serialized`` via ``partitions_subset_class.from_serialized``."""
        return self.partitions_subset_class.from_serialized(self, serialized)

    def can_deserialize_subset(
        self,
        serialized: str,
        serialized_partitions_def_unique_id: Optional[str],
        serialized_partitions_def_class_name: Optional[str],
    ) -> bool:
        """Delegate to ``partitions_subset_class.can_deserialize`` with this definition."""
        return self.partitions_subset_class.can_deserialize(
            self,
            serialized,
            serialized_partitions_def_unique_id,
            serialized_partitions_def_class_name,
        )

    def get_serializable_unique_identifier(
        self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None
    ) -> str:
        """Return the SHA-1 hex digest of the JSON-encoded list of partition keys."""
        partition_keys = self.get_partition_keys(dynamic_partitions_store=dynamic_partitions_store)
        return hashlib.sha1(json.dumps(partition_keys).encode("utf-8")).hexdigest()

    def get_tags_for_partition_key(self, partition_key: str) -> Mapping[str, str]:
        """Return a mapping with ``PARTITION_NAME_TAG`` set to the partition key."""
        return {PARTITION_NAME_TAG: partition_key}

    def get_num_partitions(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> int:
        """Return the number of keys returned by ``get_partition_keys``."""
        return len(self.get_partition_keys(current_time, dynamic_partitions_store))

    def has_partition_key(
        self,
        partition_key: str,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> bool:
        """Return whether ``partition_key`` is among the keys from ``get_partition_keys``."""
        return partition_key in self.get_partition_keys(
            current_time=current_time,
            dynamic_partitions_store=dynamic_partitions_store,
        )

    def validate_partition_key(
        self,
        partition_key: str,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> None:
        """Raise ``DagsterUnknownPartitionError`` unless ``has_partition_key`` accepts the key."""
        if not self.has_partition_key(partition_key, current_time, dynamic_partitions_store):
            raise DagsterUnknownPartitionError(
                f"Could not find a partition with key `{partition_key}`."
            )


def raise_error_on_invalid_partition_key_substring(partition_keys: Sequence[str]) -> None:
    """Raise if any partition key contains an entry of ``INVALID_PARTITION_SUBSTRINGS``.

    Keys are checked in order; the error for the first offending key lists every invalid
    substring found in that key.

    Raises:
        DagsterInvalidDefinitionError: On the first key containing an invalid substring.
    """
    for candidate in partition_keys:
        offending = [
            forbidden for forbidden in INVALID_PARTITION_SUBSTRINGS if forbidden in candidate
        ]
        if not offending:
            continue
        raise DagsterInvalidDefinitionError(
            f"{offending} are invalid substrings in a partition key"
        )


def raise_error_on_duplicate_partition_keys(partition_keys: Sequence[str]) -> None:
    """Raise if any partition key occurs more than once.

    Raises:
        DagsterInvalidDefinitionError: Listing each repeated key once, in order of first
            appearance.
    """
    occurrences: Dict[str, int] = {}
    for candidate in partition_keys:
        occurrences[candidate] = occurrences.get(candidate, 0) + 1

    repeated = [key for key, count in occurrences.items() if count > 1]
    if not repeated:
        return
    raise DagsterInvalidDefinitionError(
        "Partition keys must be unique. Duplicate instances of partition keys:"
        f" {repeated}."
    )


class StaticPartitionsDefinition(PartitionsDefinition[str]):
    """A statically-defined set of partitions.

    Example:
        .. code-block:: python

            from dagster import StaticPartitionsDefinition, asset

            oceans_partitions_def = StaticPartitionsDefinition(
                ["arctic", "atlantic", "indian", "pacific", "southern"]
            )

            @asset(partitions_def=oceans_partitions_def)
            def ml_model_for_each_ocean():
                ...
    """

    def __init__(self, partition_keys: Sequence[str]):
        """Validate and store the partition keys.

        Args:
            partition_keys (Sequence[str]): The partition keys. Keys containing an invalid
                substring, and duplicate keys, are rejected by the two ``raise_error_on_*``
                helpers called below.
        """
        check.sequence_param(partition_keys, "partition_keys", of_type=str)

        raise_error_on_invalid_partition_key_substring(partition_keys)
        raise_error_on_duplicate_partition_keys(partition_keys)

        self._partition_keys = partition_keys

    @public
    def get_partition_keys(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Sequence[str]:
        """Returns a list of strings representing the partition keys of the PartitionsDefinition.

        Args:
            current_time (Optional[datetime]): A datetime object representing the current time, only
                applicable to time-based partitions definitions.
            dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore
                object that is responsible for fetching dynamic partitions. Only applicable to
                DynamicPartitionsDefinitions.

        Returns:
            Sequence[str]

        """
        # Both arguments are ignored: the keys were fixed at construction time.
        return self._partition_keys

    def __hash__(self):
        # Hash the repr, which embeds the class name and the partition keys.
        return hash(repr(self))

    def __eq__(self, other) -> bool:
        return isinstance(other, StaticPartitionsDefinition) and (
            self is other or self._partition_keys == other.get_partition_keys()
        )

    def __repr__(self) -> str:
        return f"{type(self).__name__}(partition_keys={self._partition_keys})"

    def get_num_partitions(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> int:
        """Return the number of distinct partition keys."""
        # __init__ rejects duplicate keys, so the set() is only a defensive de-duplication
        # (e.g. for a subclass that overrides get_partition_keys).
        return len(set(self.get_partition_keys(current_time, dynamic_partitions_store)))


class CachingDynamicPartitionsLoader(DynamicPartitionsStore):
    """A batch loader that caches the partition keys for a given dynamic partitions definition,
    to avoid repeated calls to the database for the same partitions definition.

    Both lookups are forwarded to the wrapped instance and are decorated with
    ``cached_method``.
    """

    def __init__(self, instance: DagsterInstance):
        # The instance that actually answers the dynamic-partition queries.
        self._instance = instance

    @cached_method
    def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:
        """Return the wrapped instance's dynamic partitions for ``partitions_def_name``."""
        partition_keys = self._instance.get_dynamic_partitions(partitions_def_name)
        return partition_keys

    @cached_method
    def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:
        """Return the wrapped instance's answer for whether ``partition_key`` exists."""
        exists = self._instance.has_dynamic_partition(partitions_def_name, partition_key)
        return exists


@deprecated_param(
    param="partition_fn",
    breaking_version="2.0",
    additional_warn_text="Provide partition definition name instead.",
)
class DynamicPartitionsDefinition(
    PartitionsDefinition,
    NamedTuple(
        "_DynamicPartitionsDefinition",
        [
            (
                "partition_fn",
                PublicAttr[
                    Optional[
                        Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]
                    ]
                ],
            ),
            ("name", PublicAttr[Optional[str]]),
        ],
    ),
):
    """A partitions definition whose partition keys can be dynamically added and removed.

    This is useful for cases where the set of partitions is not known at definition time,
    but is instead determined at runtime.

    Partitions can be added and removed using `instance.add_dynamic_partitions` and
    `instance.delete_dynamic_partition` methods.

    Args:
        name (Optional[str]): The name of the partitions definition.
        partition_fn (Optional[Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]]):
            A function that returns the current set of partitions. This argument is deprecated and
            will be removed in 2.0.0.

    Examples:
        .. code-block:: python

            fruits = DynamicPartitionsDefinition(name="fruits")

            @sensor(job=my_job)
            def my_sensor(context):
                return SensorResult(
                    run_requests=[RunRequest(partition_key="apple")],
                    dynamic_partitions_requests=[fruits.build_add_request(["apple"])]
                )
    """

    def __new__(
        cls,
        partition_fn: Optional[
            Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]
        ] = None,
        name: Optional[str] = None,
    ):
        # Validate each argument exactly once; the validated values are passed on below.
        partition_fn = check.opt_callable_param(partition_fn, "partition_fn")
        name = check.opt_str_param(name, "name")

        if partition_fn is None and name is None:
            raise DagsterInvalidDefinitionError(
                "Must provide either partition_fn or name to DynamicPartitionsDefinition."
            )

        if partition_fn and name:
            raise DagsterInvalidDefinitionError(
                "Cannot provide both partition_fn and name to DynamicPartitionsDefinition."
            )

        return super(DynamicPartitionsDefinition, cls).__new__(
            cls,
            partition_fn=partition_fn,
            name=name,
        )

    def _validated_name(self) -> str:
        """Return ``self.name``, failing a check when no name was provided."""
        if self.name is None:
            check.failed(
                "Dynamic partitions definition must have a name to fetch dynamic partitions"
            )
        return self.name

    @staticmethod
    def _require_dynamic_partitions_store(
        dynamic_partitions_store: Optional[DynamicPartitionsStore],
    ) -> DynamicPartitionsStore:
        """Return the store, failing a check with a shared message when it is None."""
        if dynamic_partitions_store is None:
            check.failed(
                "The instance is not available to load partitions. You may be seeing this error"
                " when using dynamic partitions with a version of dagster-webserver or"
                " dagster-cloud that is older than 1.1.18."
            )
        return dynamic_partitions_store

    def __eq__(self, other):
        return (
            isinstance(other, DynamicPartitionsDefinition)
            and self.name == other.name
            and self.partition_fn == other.partition_fn
        )

    def __hash__(self):
        # Hashes the repr as a tuple of its characters; kept as-is so hash values do not change.
        return hash(tuple(self.__repr__()))

    def __str__(self) -> str:
        if self.name:
            return f'Dynamic partitions: "{self._validated_name()}"'
        else:
            return super().__str__()

    @public
    def get_partition_keys(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Sequence[str]:
        """Returns a list of strings representing the partition keys of the
        PartitionsDefinition.

        Args:
            current_time (Optional[datetime]): A datetime object representing the current time, only
                applicable to time-based partitions definitions.
            dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore
                object that is responsible for fetching dynamic partitions. Required when the
                partitions definition is a DynamicPartitionsDefinition with a name defined. Users
                can pass the DagsterInstance fetched via `context.instance` to this argument.

        Returns:
            Sequence[str]
        """
        if self.partition_fn:
            partitions = self.partition_fn(current_time)
            # partition_fn may return Partition objects or plain strings; Partition objects
            # are mapped to their names.
            if all(isinstance(partition, Partition) for partition in partitions):
                return [partition.name for partition in partitions]  # type: ignore  # (illegible conditional)
            else:
                return partitions  # type: ignore  # (illegible conditional)
        else:
            check.opt_inst_param(
                dynamic_partitions_store, "dynamic_partitions_store", DynamicPartitionsStore
            )
            store = self._require_dynamic_partitions_store(dynamic_partitions_store)
            return store.get_dynamic_partitions(partitions_def_name=self._validated_name())

    def has_partition_key(
        self,
        partition_key: str,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> bool:
        """Return whether ``partition_key`` currently exists in this definition.

        With a ``partition_fn`` the key is looked up in ``get_partition_keys(current_time)``;
        otherwise the question is forwarded to the store's ``has_dynamic_partition``.
        """
        if self.partition_fn:
            return partition_key in self.get_partition_keys(current_time)
        else:
            store = self._require_dynamic_partitions_store(dynamic_partitions_store)
            return store.has_dynamic_partition(
                partitions_def_name=self._validated_name(), partition_key=partition_key
            )

    def build_add_request(self, partition_keys: Sequence[str]) -> AddDynamicPartitionsRequest:
        """Build an ``AddDynamicPartitionsRequest`` for this definition's name and the keys."""
        check.sequence_param(partition_keys, "partition_keys", of_type=str)
        validated_name = self._validated_name()
        return AddDynamicPartitionsRequest(validated_name, partition_keys)

    def build_delete_request(self, partition_keys: Sequence[str]) -> DeleteDynamicPartitionsRequest:
        """Build a ``DeleteDynamicPartitionsRequest`` for this definition's name and the keys."""
        check.sequence_param(partition_keys, "partition_keys", of_type=str)
        validated_name = self._validated_name()
        return DeleteDynamicPartitionsRequest(validated_name, partition_keys)
\n\n\n
@deprecated_param(
    param="run_config_for_partition_fn",
    breaking_version="2.0",
    additional_warn_text="Use `run_config_for_partition_key_fn` instead.",
)
@deprecated_param(
    param="tags_for_partition_fn",
    breaking_version="2.0",
    additional_warn_text="Use `tags_for_partition_key_fn` instead.",
)
class PartitionedConfig(Generic[T_PartitionsDefinition]):
    """Defines a way of configuring a job where the job can be run on one of a discrete set of
    partitions, and each partition corresponds to run configuration for the job.

    Setting PartitionedConfig as the config for a job allows you to launch backfills for that job
    and view the run history across partitions.
    """

    def __init__(
        self,
        partitions_def: T_PartitionsDefinition,
        run_config_for_partition_fn: Optional[Callable[[Partition], Mapping[str, Any]]] = None,
        decorated_fn: Optional[Callable[..., Mapping[str, Any]]] = None,
        tags_for_partition_fn: Optional[Callable[[Partition[Any]], Mapping[str, str]]] = None,
        run_config_for_partition_key_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,
        tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,
    ):
        self._partitions = check.inst_param(partitions_def, "partitions_def", PartitionsDefinition)
        self._decorated_fn = decorated_fn

        # Exactly one run-config function is required; at most one tags function is allowed.
        check.invariant(
            xor(run_config_for_partition_fn, run_config_for_partition_key_fn),
            "Must provide exactly one of run_config_for_partition_fn or"
            " run_config_for_partition_key_fn",
        )
        check.invariant(
            not (tags_for_partition_fn and tags_for_partition_key_fn),
            "Cannot provide both of tags_for_partition_fn or tags_for_partition_key_fn",
        )

        self._run_config_for_partition_fn = check.opt_callable_param(
            run_config_for_partition_fn, "run_config_for_partition_fn"
        )
        self._run_config_for_partition_key_fn = check.opt_callable_param(
            run_config_for_partition_key_fn, "run_config_for_partition_key_fn"
        )
        self._tags_for_partition_fn = check.opt_callable_param(
            tags_for_partition_fn, "tags_for_partition_fn"
        )
        self._tags_for_partition_key_fn = check.opt_callable_param(
            tags_for_partition_key_fn, "tags_for_partition_key_fn"
        )

    @public
    @property
    def partitions_def(
        self,
    ) -> T_PartitionsDefinition:
        """T_PartitionsDefinition: The partitions definition associated with this PartitionedConfig."""
        return self._partitions

    @deprecated(
        breaking_version="2.0",
        additional_warn_text="Use `run_config_for_partition_key_fn` instead.",
    )
    @public
    @property
    def run_config_for_partition_fn(
        self,
    ) -> Optional[Callable[[Partition], Mapping[str, Any]]]:
        """Optional[Callable[[Partition], Mapping[str, Any]]]: A function that accepts a partition
        and returns a dictionary representing the config to attach to runs for that partition.
        Deprecated as of 1.3.3.
        """
        return self._run_config_for_partition_fn

    @public
    @property
    def run_config_for_partition_key_fn(
        self,
    ) -> Optional[Callable[[str], Mapping[str, Any]]]:
        """Optional[Callable[[str], Mapping[str, Any]]]: A function that accepts a partition key
        and returns a dictionary representing the config to attach to runs for that partition.
        """
        # The return statement was missing, so this property always evaluated to None.
        return self._run_config_for_partition_key_fn

    @deprecated(
        breaking_version="2.0", additional_warn_text="Use `tags_for_partition_key_fn` instead."
    )
    @public
    @property
    def tags_for_partition_fn(self) -> Optional[Callable[[Partition], Mapping[str, str]]]:
        """Optional[Callable[[Partition], Mapping[str, str]]]: A function that
        accepts a partition and returns a dictionary of tags to attach to runs for
        that partition. Deprecated as of 1.3.3.
        """
        return self._tags_for_partition_fn

    @public
    @property
    def tags_for_partition_key_fn(
        self,
    ) -> Optional[Callable[[str], Mapping[str, str]]]:
        """Optional[Callable[[str], Mapping[str, str]]]: A function that
        accepts a partition key and returns a dictionary of tags to attach to runs for
        that partition.
        """
        return self._tags_for_partition_key_fn

    @public
    def get_partition_keys(self, current_time: Optional[datetime] = None) -> Sequence[str]:
        """Returns a list of partition keys, representing the full set of partitions that
        config can be applied to.

        Args:
            current_time (Optional[datetime]): A datetime object representing the current time. Only
                applicable to time-based partitions definitions.

        Returns:
            Sequence[str]
        """
        return self.partitions_def.get_partition_keys(current_time)

    # Assumes partition key already validated
    def get_run_config_for_partition_key(
        self,
        partition_key: str,
    ) -> Mapping[str, Any]:
        """Generates the run config corresponding to a partition key.

        Args:
            partition_key (str): the key for a partition that should be used to generate a run config.

        Returns:
            Mapping[str, Any]: A deep copy of the mapping produced by the configured function.
        """
        # _run_config_for_partition_fn is deprecated, we can remove this branching logic in 2.0
        if self._run_config_for_partition_fn:
            run_config = self._run_config_for_partition_fn(Partition(partition_key))
        elif self._run_config_for_partition_key_fn:
            run_config = self._run_config_for_partition_key_fn(partition_key)
        else:
            check.failed("Unreachable.")  # one of the above funcs always defined
        return copy.deepcopy(run_config)

    # Assumes partition key already validated
    def get_tags_for_partition_key(
        self,
        partition_key: str,
        job_name: Optional[str] = None,
    ) -> Mapping[str, str]:
        """Return the tags to attach to a run of the given partition.

        User tags come from the configured tags function (none when no function was given)
        and are passed through ``validate_tags`` with reserved tags disallowed. System tags
        come from the partitions definition, plus ``PARTITION_SET_TAG`` when ``job_name`` is
        given. System tags win on key collisions.
        """
        from dagster._core.host_representation.external_data import (
            external_partition_set_name_for_job_name,
        )

        # _tags_for_partition_fn is deprecated, we can remove this branching logic in 2.0
        if self._tags_for_partition_fn:
            user_tags = self._tags_for_partition_fn(Partition(partition_key))
        elif self._tags_for_partition_key_fn:
            user_tags = self._tags_for_partition_key_fn(partition_key)
        else:
            user_tags = {}
        user_tags = validate_tags(user_tags, allow_reserved_tags=False)

        system_tags = {
            **self.partitions_def.get_tags_for_partition_key(partition_key),
            **(
                # `PartitionSetDefinition` has been deleted but we still need to attach this special tag in
                # order for reexecution against partitions to work properly.
                {PARTITION_SET_TAG: external_partition_set_name_for_job_name(job_name)}
                if job_name
                else {}
            ),
        }

        return {**user_tags, **system_tags}

    @classmethod
    def from_flexible_config(
        cls,
        config: Optional[Union[ConfigMapping, Mapping[str, object], "PartitionedConfig"]],
        partitions_def: PartitionsDefinition,
    ) -> "PartitionedConfig":
        """Coerce ``config`` into a PartitionedConfig for ``partitions_def``.

        A PartitionedConfig is returned unchanged after checking that its partitions
        definition equals ``partitions_def``. Any other value (or None) becomes a
        PartitionedConfig that returns that same mapping for every partition key. A
        ConfigMapping is rejected.
        """
        check.invariant(
            not isinstance(config, ConfigMapping),
            "Can't supply a ConfigMapping for 'config' when 'partitions_def' is supplied.",
        )

        if isinstance(config, PartitionedConfig):
            check.invariant(
                config.partitions_def == partitions_def,
                "Can't supply a PartitionedConfig for 'config' with a different "
                "PartitionsDefinition than supplied for 'partitions_def'.",
            )
            return config
        else:
            hardcoded_config = config if config else {}
            return cls(
                partitions_def,
                run_config_for_partition_key_fn=lambda _: cast(Mapping, hardcoded_config),
            )

    def __call__(self, *args, **kwargs):
        """Invoke the decorated function; only valid for decorator-created instances."""
        if self._decorated_fn is None:
            raise DagsterInvalidInvocationError(
                "Only PartitionedConfig objects created using one of the partitioned config "
                "decorators can be directly invoked."
            )
        else:
            return self._decorated_fn(*args, **kwargs)
\n\n\n
@deprecated_param(
    param="tags_for_partition_fn",
    breaking_version="2.0",
    additional_warn_text="Use tags_for_partition_key_fn instead.",
)
def static_partitioned_config(
    partition_keys: Sequence[str],
    tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,
    tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,
) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig[StaticPartitionsDefinition]]:
    """Creates a static partitioned config for a job.

    The provided partition_keys is a static list of strings identifying the set of partitions. The
    list of partitions is static, so while the run config returned by the decorated function may
    change over time, the list of valid partition keys does not.

    This has performance advantages over `dynamic_partitioned_config` in terms of loading different
    partition views in the Dagster UI.

    The decorated function takes in a partition key and returns a valid run config for a particular
    target job.

    Args:
        partition_keys (Sequence[str]): A list of valid partition keys, which serve as the range of
            values that can be provided to the decorated run config function.
        tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): Deprecated name
            for ``tags_for_partition_key_fn``.
        tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that
            accepts a partition key and returns a dictionary of tags to attach to runs for that
            partition.

    Returns:
        PartitionedConfig
    """
    check.sequence_param(partition_keys, "partition_keys", str)

    # Reconcile the renamed parameter with its deprecated spelling.
    tags_for_partition_key_fn = normalize_renamed_param(
        tags_for_partition_key_fn,
        "tags_for_partition_key_fn",
        tags_for_partition_fn,
        "tags_for_partition_fn",
    )

    def inner(
        fn: Callable[[str], Mapping[str, Any]]
    ) -> PartitionedConfig[StaticPartitionsDefinition]:
        # The partitions definition is built when the decorator is applied.
        static_partitions_def = StaticPartitionsDefinition(partition_keys)
        partitioned = PartitionedConfig(
            partitions_def=static_partitions_def,
            run_config_for_partition_key_fn=fn,
            decorated_fn=fn,
            tags_for_partition_key_fn=tags_for_partition_key_fn,
        )
        return partitioned

    return inner


def partitioned_config(
    partitions_def: PartitionsDefinition,
    tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,
) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:
    """Creates a partitioned config for a job given a PartitionsDefinition.

    The partitions_def provides the set of partitions, which may change over time
    (for example, when using a DynamicPartitionsDefinition).

    The decorated function takes in a partition key and returns a valid run config for a particular
    target job.

    Args:
        partitions_def (PartitionsDefinition): PartitionsDefinition for the job
        tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that
            accepts a partition key and returns a dictionary of tags to attach to runs for that
            partition.

    Returns:
        PartitionedConfig
    """
    check.opt_callable_param(tags_for_partition_key_fn, "tags_for_partition_key_fn")

    def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:
        # The decorated function is both the run-config source and the direct-call target.
        partitioned = PartitionedConfig(
            partitions_def=partitions_def,
            run_config_for_partition_key_fn=fn,
            decorated_fn=fn,
            tags_for_partition_key_fn=tags_for_partition_key_fn,
        )
        return partitioned

    return inner


@deprecated_param(
    param="tags_for_partition_fn",
    breaking_version="2.0",
    additional_warn_text="Use tags_for_partition_key_fn instead.",
)
def dynamic_partitioned_config(
    partition_fn: Callable[[Optional[datetime]], Sequence[str]],
    tags_for_partition_fn: Optional[Callable[[str], Mapping[str, str]]] = None,
    tags_for_partition_key_fn: Optional[Callable[[str], Mapping[str, str]]] = None,
) -> Callable[[Callable[[str], Mapping[str, Any]]], PartitionedConfig]:
    """Creates a dynamic partitioned config for a job.

    The provided partition_fn returns a list of strings identifying the set of partitions, given
    an optional datetime argument (representing the current time). The list of partitions returned
    may change over time.

    The decorated function takes in a partition key and returns a valid run config for a particular
    target job.

    Args:
        partition_fn (Callable[[datetime.datetime], Sequence[str]]): A function that generates a
            list of valid partition keys, which serve as the range of values that can be provided
            to the decorated run config function.
        tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): Deprecated name
            for ``tags_for_partition_key_fn``.
        tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that
            accepts a partition key and returns a dictionary of tags to attach to runs for that
            partition.

    Returns:
        PartitionedConfig
    """
    check.callable_param(partition_fn, "partition_fn")

    # Reconcile the renamed parameter with its deprecated spelling.
    tags_for_partition_key_fn = normalize_renamed_param(
        tags_for_partition_key_fn,
        "tags_for_partition_key_fn",
        tags_for_partition_fn,
        "tags_for_partition_fn",
    )

    def inner(fn: Callable[[str], Mapping[str, Any]]) -> PartitionedConfig:
        # The partitions definition is built when the decorator is applied.
        dynamic_partitions_def = DynamicPartitionsDefinition(partition_fn)
        partitioned = PartitionedConfig(
            partitions_def=dynamic_partitions_def,
            run_config_for_partition_key_fn=fn,
            decorated_fn=fn,
            tags_for_partition_key_fn=tags_for_partition_key_fn,
        )
        return partitioned

    return inner


def cron_schedule_from_schedule_type_and_offsets(
    schedule_type: ScheduleType,
    minute_offset: int,
    hour_offset: int,
    day_offset: Optional[int],
) -> str:
    """Build a five-field cron string for the given schedule type and offsets.

    ``day_offset`` is used as the day-of-week field for weekly schedules (0 when None) and
    as the day-of-month field for monthly schedules (1 when None).
    """
    if schedule_type is ScheduleType.HOURLY:
        return f"{minute_offset} * * * *"
    if schedule_type is ScheduleType.DAILY:
        return f"{minute_offset} {hour_offset} * * *"
    if schedule_type is ScheduleType.WEEKLY:
        return f"{minute_offset} {hour_offset} * * {day_offset if day_offset is not None else 0}"
    if schedule_type is ScheduleType.MONTHLY:
        return f"{minute_offset} {hour_offset} {day_offset if day_offset is not None else 1} * *"
    check.assert_never(schedule_type)


class PartitionsSubset(ABC, Generic[T_str]):
    """Represents a subset of the partitions within a PartitionsDefinition."""

    @abstractmethod
    def get_partition_keys_not_in_subset(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Iterable[T_str]:
        """Return the partition keys of the definition that are not part of this subset."""

    @abstractmethod
    @public
    def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[T_str]:
        """Return the partition keys contained in this subset."""

    @abstractmethod
    def get_partition_key_ranges(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Sequence[PartitionKeyRange]:
        """Return the subset expressed as a sequence of partition key ranges."""

    @abstractmethod
    def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset[T_str]":
        """Return a subset that also contains the given partition keys."""

    def with_partition_key_range(
        self,
        partition_key_range: PartitionKeyRange,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> "PartitionsSubset[T_str]":
        """Return a subset that also contains every key of the definition inside the range."""
        keys_in_range = self.partitions_def.get_partition_keys_in_range(
            partition_key_range, dynamic_partitions_store=dynamic_partitions_store
        )
        return self.with_partition_keys(keys_in_range)

    def __or__(self, other: "PartitionsSubset") -> "PartitionsSubset[T_str]":
        """Union: this subset plus the keys of ``other``."""
        if self is other:
            return self
        return self.with_partition_keys(other.get_partition_keys())

    def __sub__(self, other: "PartitionsSubset") -> "PartitionsSubset[T_str]":
        """Difference: the keys of this subset that are not in ``other``."""
        fresh_subset = self.partitions_def.empty_subset()
        if self is other:
            return fresh_subset
        remaining_keys = set(self.get_partition_keys()) - set(other.get_partition_keys())
        return fresh_subset.with_partition_keys(remaining_keys)

    @abstractmethod
    def serialize(self) -> str:
        """Return a string representation of this subset."""

    @classmethod
    @abstractmethod
    def from_serialized(
        cls, partitions_def: PartitionsDefinition[T_str], serialized: str
    ) -> "PartitionsSubset[T_str]":
        """Rebuild a subset of ``partitions_def`` from a serialized string."""

    @classmethod
    @abstractmethod
    def can_deserialize(
        cls,
        partitions_def: PartitionsDefinition,
        serialized: str,
        serialized_partitions_def_unique_id: Optional[str],
        serialized_partitions_def_class_name: Optional[str],
    ) -> bool:
        """Return whether ``serialized`` can be deserialized against ``partitions_def``."""

    @property
    @abstractmethod
    def partitions_def(self) -> PartitionsDefinition[T_str]:
        """The partitions definition this subset belongs to."""

    @abstractmethod
    def __len__(self) -> int:
        """Return the size of this subset."""

    @abstractmethod
    def __contains__(self, value) -> bool:
        """Return whether ``value`` is part of this subset."""

    @classmethod
    @abstractmethod
    def empty_subset(
        cls, partitions_def: PartitionsDefinition[T_str]
    ) -> "PartitionsSubset[T_str]":
        """Return a subset of ``partitions_def`` containing no partitions."""


@whitelist_for_serdes
class SerializedPartitionsSubset(NamedTuple):
    """A serialized subset plus the unique id and class name of its partitions definition."""

    serialized_subset: str
    serialized_partitions_def_unique_id: str
    serialized_partitions_def_class_name: str

    @classmethod
    def from_subset(
        cls,
        subset: PartitionsSubset,
        partitions_def: PartitionsDefinition,
        dynamic_partitions_store: DynamicPartitionsStore,
    ):
        """Serialize ``subset`` and record the identity of ``partitions_def`` next to it."""
        unique_id = partitions_def.get_serializable_unique_identifier(dynamic_partitions_store)
        return cls(
            serialized_subset=subset.serialize(),
            serialized_partitions_def_unique_id=unique_id,
            serialized_partitions_def_class_name=partitions_def.__class__.__name__,
        )

    def can_deserialize(self, partitions_def: Optional[PartitionsDefinition]) -> bool:
        """Return whether the stored subset can be deserialized against ``partitions_def``."""
        if partitions_def:
            return partitions_def.can_deserialize_subset(
                self.serialized_subset,
                serialized_partitions_def_unique_id=self.serialized_partitions_def_unique_id,
                serialized_partitions_def_class_name=self.serialized_partitions_def_class_name,
            )
        # Asset had a partitions definition at storage time, but no longer does
        return False

    def deserialize(self, partitions_def: PartitionsDefinition) -> PartitionsSubset:
        """Deserialize the stored subset using ``partitions_def``."""
        return partitions_def.deserialize_subset(self.serialized_subset)


class DefaultPartitionsSubset(PartitionsSubset[T_str]):
    """A PartitionsSubset backed by a plain set of partition keys."""

    # Every time we change the serialization format, we should increment the version number.
    # This will ensure that we can gracefully degrade when deserializing old data.
    SERIALIZATION_VERSION = 1

    def __init__(
        self, partitions_def: PartitionsDefinition[T_str], subset: Optional[Set[T_str]] = None
    ):
        check.opt_set_param(subset, "subset")
        self._partitions_def = partitions_def
        self._subset = subset or set()

    def get_partition_keys_not_in_subset(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Iterable[str]:
        """Return the definition's partition keys minus the keys in this subset, as a set."""
        all_keys = self._partitions_def.get_partition_keys(
            current_time=current_time, dynamic_partitions_store=dynamic_partitions_store
        )
        return set(all_keys) - self._subset

    def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:
        """Return the underlying set of keys; ``current_time`` is not used."""
        return self._subset

    def get_partition_key_ranges(
        self,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> Sequence[PartitionKeyRange]:
        """Group the subset into ranges of keys that are consecutive in the definition's order."""
        ordered_keys = self._partitions_def.get_partition_keys(
            current_time, dynamic_partitions_store=dynamic_partitions_store
        )
        ranges = []
        run_start = None
        run_end = None
        for key in ordered_keys:
            if key in self._subset:
                # Open a new run if needed, then extend it to the current key.
                if run_start is None:
                    run_start = key
                run_end = key
                continue
            # A key outside the subset closes any open run.
            if run_start is not None and run_end is not None:
                ranges.append(PartitionKeyRange(run_start, run_end))
            run_start = run_end = None

        # Close a run that extends to the final key.
        if run_start is not None and run_end is not None:
            ranges.append(PartitionKeyRange(run_start, run_end))

        return ranges

    def with_partition_keys(
        self, partition_keys: Iterable[T_str]
    ) -> "DefaultPartitionsSubset[T_str]":
        """Return a new subset holding the union of the current keys and ``partition_keys``."""
        merged_keys = self._subset | set(partition_keys)
        return DefaultPartitionsSubset(self._partitions_def, merged_keys)

    def serialize(self) -> str:
        """Return a JSON object with the serialization version and the list of keys."""
        # Serialize version number, so attempting to deserialize old versions can be handled gracefully.
        # Any time the serialization format changes, we should increment the version number.
        payload = {"version": self.SERIALIZATION_VERSION, "subset": list(self._subset)}
        return json.dumps(payload)

    @classmethod
    def from_serialized(
        cls, partitions_def: PartitionsDefinition[T_str], serialized: str
    ) -> "PartitionsSubset[T_str]":
        """Rebuild a subset from JSON produced by ``serialize`` or from a bare JSON list.

        Raises:
            DagsterInvalidDeserializationVersionError: If the payload is not a list and its
                version differs from ``SERIALIZATION_VERSION``.
        """
        # Check the version number, so only valid versions can be deserialized.
        data = json.loads(serialized)

        if isinstance(data, list):
            # backwards compatibility
            return cls(subset=set(data), partitions_def=partitions_def)

        if data.get("version") != cls.SERIALIZATION_VERSION:
            raise DagsterInvalidDeserializationVersionError(
                f"Attempted to deserialize partition subset with version {data.get('version')},"
                f" but only version {cls.SERIALIZATION_VERSION} is supported."
            )
        return cls(subset=set(data.get("subset")), partitions_def=partitions_def)

    @classmethod
    def can_deserialize(
        cls,
        partitions_def: PartitionsDefinition[T_str],
        serialized: str,
        serialized_partitions_def_unique_id: Optional[str],
        serialized_partitions_def_class_name: Optional[str],
    ) -> bool:
        """Return whether ``serialized`` is compatible with ``partitions_def``.

        When a class name was stored, only that name is compared with the class name of
        ``partitions_def``. Otherwise the payload must be a list, or carry a subset together
        with the current serialization version.
        """
        if serialized_partitions_def_class_name is not None:
            return serialized_partitions_def_class_name == partitions_def.__class__.__name__

        data = json.loads(serialized)
        if isinstance(data, list):
            return True
        return data.get("subset") is not None and data.get("version") == cls.SERIALIZATION_VERSION

    @property
    def partitions_def(self) -> PartitionsDefinition[T_str]:
        """The partitions definition this subset was created for."""
        return self._partitions_def

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DefaultPartitionsSubset):
            return False
        return self._partitions_def == other._partitions_def and self._subset == other._subset

    def __len__(self) -> int:
        return len(self._subset)

    def __contains__(self, value) -> bool:
        return value in self._subset

    def __repr__(self) -> str:
        return (
            f"DefaultPartitionsSubset(subset={self._subset}, partitions_def={self._partitions_def})"
        )

    @classmethod
    def empty_subset(cls, partitions_def: PartitionsDefinition[T_str]) -> "PartitionsSubset[T_str]":
        """Return a subset of ``partitions_def`` with no keys."""
        return cls(partitions_def=partitions_def)
", "current_page_name": "_modules/dagster/_core/definitions/partition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition"}, "partition_key_range": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_key_range

\nfrom typing import NamedTuple\n\nfrom dagster._annotations import PublicAttr\n\n\n
[docs]class PartitionKeyRange(NamedTuple):\n """Defines a range of partitions.\n\n Attributes:\n start (str): The starting partition key in the range (inclusive).\n end (str): The ending partition key in the range (inclusive).\n\n Examples:\n .. code-block:: python\n\n partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])\n partition_key_range = PartitionKeyRange(start="a", end="c") # Represents ["a", "b", "c"]\n """\n\n # Inclusive on both sides\n start: PublicAttr[str]\n end: PublicAttr[str]
\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_key_range", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_key_range"}, "partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partition_mapping

\nimport collections.abc\nimport itertools\nfrom abc import ABC, abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Collection,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental, public\nfrom dagster._core.definitions.multi_dimensional_partitions import (\n    MultiPartitionKey,\n    MultiPartitionsDefinition,\n)\nfrom dagster._core.definitions.partition import (\n    PartitionsDefinition,\n    PartitionsSubset,\n    StaticPartitionsDefinition,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindowPartitionsDefinition\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n\nclass UpstreamPartitionsResult(NamedTuple):\n    """Represents the result of mapping a PartitionsSubset to the corresponding\n    partitions in another PartitionsDefinition.\n\n    partitions_subset (PartitionsSubset): The resulting partitions subset that was\n        mapped to. Only contains partitions for existent partitions, filtering out nonexistent partitions.\n    required_but_nonexistent_partition_keys (Sequence[str]): A list containing invalid partition keys in to_partitions_def\n        that partitions in from_partitions_subset were mapped to.\n    """\n\n    partitions_subset: PartitionsSubset\n    required_but_nonexistent_partition_keys: Sequence[str]\n\n\n
[docs]class PartitionMapping(ABC):\n """Defines a correspondence between the partitions in an asset and the partitions in an asset\n that it depends on.\n\n Overriding PartitionMapping outside of Dagster is not supported. The abstract methods of this\n class may change at any time.\n """\n\n
[docs] @public\n @abstractmethod\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the subset of partition keys in the downstream asset that use the data in the given\n partition key subset of the upstream asset.\n\n Args:\n upstream_partitions_subset (Union[PartitionKeyRange, PartitionsSubset]): The\n subset of partition keys in the upstream asset.\n downstream_partitions_def (PartitionsDefinition): The partitions definition for the\n downstream asset.\n """
\n\n
[docs] @public\n @abstractmethod\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n """Returns a UpstreamPartitionsResult object containing the partition keys the downstream\n partitions subset was mapped to in the upstream partitions definition.\n\n Valid upstream partitions will be included in UpstreamPartitionsResult.partitions_subset.\n Invalid upstream partitions will be included in UpstreamPartitionsResult.required_but_nonexistent_partition_keys.\n\n For example, if an upstream asset is time-partitioned and starts in June 2023, and the\n downstream asset is time-partitioned and starts in May 2023, this function would return a\n UpstreamPartitionsResult(PartitionsSubset("2023-06-01"), required_but_nonexistent_partition_keys=["2023-05-01"])\n when downstream_partitions_subset contains 2023-05-01 and 2023-06-01.\n """
\n\n\n
[docs]@whitelist_for_serdes\nclass IdentityPartitionMapping(PartitionMapping, NamedTuple("_IdentityPartitionMapping", [])):\n """Expects that the upstream and downstream assets are partitioned in the same way, and maps\n partitions in the downstream asset to the same partition in the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n if downstream_partitions_subset.partitions_def == upstream_partitions_def:\n return UpstreamPartitionsResult(downstream_partitions_subset, [])\n\n upstream_partition_keys = set(\n upstream_partitions_def.get_partition_keys(\n dynamic_partitions_store=dynamic_partitions_store\n )\n )\n downstream_partition_keys = set(downstream_partitions_subset.get_partition_keys())\n\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(\n list(upstream_partition_keys & downstream_partition_keys)\n ),\n list(downstream_partition_keys - upstream_partition_keys),\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset is not partitioned")\n\n if upstream_partitions_subset.partitions_def == downstream_partitions_def:\n return upstream_partitions_subset\n\n upstream_partition_keys = set(upstream_partitions_subset.get_partition_keys())\n downstream_partition_keys = set(\n downstream_partitions_def.get_partition_keys(\n 
dynamic_partitions_store=dynamic_partitions_store\n )\n )\n\n return downstream_partitions_def.empty_subset().with_partition_keys(\n list(downstream_partition_keys & upstream_partition_keys)\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AllPartitionMapping(PartitionMapping, NamedTuple("_AllPartitionMapping", [])):\n """Maps every partition in the downstream asset to every partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on all partitions of the usptream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n upstream_subset = upstream_partitions_def.subset_with_all_partitions(\n current_time=current_time, dynamic_partitions_store=dynamic_partitions_store\n )\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass LastPartitionMapping(PartitionMapping, NamedTuple("_LastPartitionMapping", [])):\n """Maps all dependencies to the last partition in the upstream asset.\n\n Commonly used in the case when the downstream asset is not partitioned, in which the entire\n downstream asset depends on the last partition of the upstream asset.\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n last = upstream_partitions_def.get_last_partition_key(\n current_time=None, dynamic_partitions_store=dynamic_partitions_store\n )\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if last is not None:\n upstream_subset = upstream_subset.with_partition_keys([last])\n\n return UpstreamPartitionsResult(upstream_subset, [])\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n raise NotImplementedError()
\n\n\n
[docs]@whitelist_for_serdes\nclass SpecificPartitionsPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_SpecificPartitionsPartitionMapping", [("partition_keys", PublicAttr[Sequence[str]])]\n ),\n):\n """Maps to a specific subset of partitions in the upstream asset.\n\n Example:\n .. code-block:: python\n\n from dagster import SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset\n\n @asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c"]))\n def upstream():\n ...\n\n @asset(\n ins={\n "upstream": AssetIn(partition_mapping=SpecificPartitionsPartitionMapping(["a"]))\n }\n )\n def a_downstream(upstream):\n ...\n """\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n return UpstreamPartitionsResult(\n upstream_partitions_def.subset_with_partition_keys(self.partition_keys), []\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n # if any of the partition keys in this partition mapping are contained within the upstream\n # partitions subset, then all partitions of the downstream asset are dependencies\n if any(key in upstream_partitions_subset for key in self.partition_keys):\n return downstream_partitions_def.subset_with_all_partitions(\n dynamic_partitions_store=dynamic_partitions_store\n )\n return downstream_partitions_def.empty_subset()
\n\n\nclass DimensionDependency(NamedTuple):\n partition_mapping: PartitionMapping\n upstream_dimension_name: Optional[str] = None\n downstream_dimension_name: Optional[str] = None\n\n\nclass BaseMultiPartitionMapping(ABC):\n @abstractmethod\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]: ...\n\n def get_partitions_def(\n self, partitions_def: PartitionsDefinition, dimension_name: Optional[str]\n ) -> PartitionsDefinition:\n if isinstance(partitions_def, MultiPartitionsDefinition):\n if not isinstance(dimension_name, str):\n check.failed("Expected dimension_name to be a string")\n return partitions_def.get_partitions_def_for_dimension(dimension_name)\n return partitions_def\n\n def _get_dependency_partitions_subset(\n self,\n a_partitions_def: PartitionsDefinition,\n a_partitions_subset: PartitionsSubset,\n b_partitions_def: PartitionsDefinition,\n a_upstream_of_b: bool,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n current_time: Optional[datetime] = None,\n ) -> Union[UpstreamPartitionsResult, PartitionsSubset]:\n """Given two partitions definitions a_partitions_def and b_partitions_def that have a dependency\n relationship (a_upstream_of_b is True if a_partitions_def is upstream of b_partitions_def),\n and a_partition_keys, a list of partition keys in a_partitions_def, returns a list of\n partition keys in the partitions definition b_partitions_def that are\n dependencies of the partition keys in a_partition_keys.\n """\n a_partition_keys_by_dimension = defaultdict(set)\n if isinstance(a_partitions_def, MultiPartitionsDefinition):\n for partition_key in a_partitions_subset.get_partition_keys():\n for dimension_name, key in cast(\n MultiPartitionKey, partition_key\n ).keys_by_dimension.items():\n a_partition_keys_by_dimension[dimension_name].add(key)\n else:\n for partition_key in 
a_partitions_subset.get_partition_keys():\n a_partition_keys_by_dimension[None].add(partition_key)\n\n # Maps the dimension name and key of a partition in a_partitions_def to the list of\n # partition keys in b_partitions_def that are dependencies of that partition\n dep_b_keys_by_a_dim_and_key: Dict[Optional[str], Dict[Optional[str], List[str]]] = (\n defaultdict(lambda: defaultdict(list))\n )\n required_but_nonexistent_upstream_partitions = set()\n\n b_dimension_partitions_def_by_name: Dict[Optional[str], PartitionsDefinition] = (\n {\n dimension.name: dimension.partitions_def\n for dimension in b_partitions_def.partitions_defs\n }\n if isinstance(b_partitions_def, MultiPartitionsDefinition)\n else {None: b_partitions_def}\n )\n\n if a_upstream_of_b:\n # a_partitions_def is upstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependent dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n dimension_mapping.upstream_dimension_name: (\n dimension_mapping.downstream_dimension_name,\n dimension_mapping.partition_mapping,\n )\n for dimension_mapping in self.get_dimension_dependencies(\n a_partitions_def, b_partitions_def\n )\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n dimension_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = self.get_partitions_def(\n a_partitions_def, a_dim_name\n )\n b_dimension_partitions_def = self.get_partitions_def(\n b_partitions_def, b_dim_name\n )\n for key in keys:\n # if downstream dimension mapping exists, for a given key, get the list of\n # downstream partition keys that are dependencies of that key\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n dimension_mapping.get_downstream_partitions_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n 
current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n ).get_partition_keys()\n )\n\n else:\n # a_partitions_def is downstream of b_partitions_def, so we need to map the\n # dimension names of a_partitions_def to the corresponding dependency dimensions of\n # b_partitions_def\n a_dim_to_dependency_b_dim = {\n dimension_mapping.downstream_dimension_name: (\n dimension_mapping.upstream_dimension_name,\n dimension_mapping.partition_mapping,\n )\n for dimension_mapping in self.get_dimension_dependencies(\n b_partitions_def, a_partitions_def\n )\n }\n\n for a_dim_name, keys in a_partition_keys_by_dimension.items():\n if a_dim_name in a_dim_to_dependency_b_dim:\n (\n b_dim_name,\n partition_mapping,\n ) = a_dim_to_dependency_b_dim[a_dim_name]\n a_dimension_partitions_def = self.get_partitions_def(\n a_partitions_def, a_dim_name\n )\n b_dimension_partitions_def = self.get_partitions_def(\n b_partitions_def, b_dim_name\n )\n for key in keys:\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n a_dimension_partitions_def.empty_subset().with_partition_keys(\n [key]\n ),\n b_dimension_partitions_def,\n current_time=current_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n )\n dep_b_keys_by_a_dim_and_key[a_dim_name][key] = list(\n mapped_partitions_result.partitions_subset.get_partition_keys()\n )\n required_but_nonexistent_upstream_partitions.update(\n set(mapped_partitions_result.required_but_nonexistent_partition_keys)\n )\n\n b_partition_keys = set()\n\n mapped_a_dim_names = a_dim_to_dependency_b_dim.keys()\n mapped_b_dim_names = [mapping[0] for mapping in a_dim_to_dependency_b_dim.values()]\n unmapped_b_dim_names = list(\n set(b_dimension_partitions_def_by_name.keys()) - set(mapped_b_dim_names)\n )\n\n for key in a_partitions_subset.get_partition_keys():\n for b_key_values in itertools.product(\n *(\n [\n dep_b_keys_by_a_dim_and_key[dim_name][\n (\n cast(MultiPartitionKey, 
key).keys_by_dimension[dim_name]\n if dim_name\n else key\n )\n ]\n for dim_name in mapped_a_dim_names\n ]\n ),\n *[\n b_dimension_partitions_def_by_name[dim_name].get_partition_keys()\n for dim_name in unmapped_b_dim_names\n ],\n ):\n b_partition_keys.add(\n MultiPartitionKey(\n {\n cast(str, (mapped_b_dim_names + unmapped_b_dim_names)[i]): key\n for i, key in enumerate(b_key_values)\n }\n )\n if len(b_key_values) > 1\n else b_key_values[0]\n )\n\n mapped_subset = b_partitions_def.empty_subset().with_partition_keys(b_partition_keys)\n if a_upstream_of_b:\n return mapped_subset\n else:\n return UpstreamPartitionsResult(\n mapped_subset,\n required_but_nonexistent_partition_keys=list(\n required_but_nonexistent_upstream_partitions\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if downstream_partitions_subset is None:\n check.failed("downstream asset is not partitioned")\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, downstream_partitions_subset.partitions_def),\n downstream_partitions_subset,\n cast(MultiPartitionsDefinition, upstream_partitions_def),\n a_upstream_of_b=False,\n dynamic_partitions_store=dynamic_partitions_store,\n current_time=current_time,\n )\n\n if not isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected UpstreamPartitionsResult")\n\n return result\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n if upstream_partitions_subset is None:\n check.failed("upstream asset 
is not partitioned")\n\n result = self._get_dependency_partitions_subset(\n cast(MultiPartitionsDefinition, upstream_partitions_subset.partitions_def),\n upstream_partitions_subset,\n cast(MultiPartitionsDefinition, downstream_partitions_def),\n a_upstream_of_b=True,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n\n if isinstance(result, UpstreamPartitionsResult):\n check.failed("Expected PartitionsSubset")\n\n return result\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiToSingleDimensionPartitionMapping(\n BaseMultiPartitionMapping,\n PartitionMapping,\n NamedTuple(\n "_MultiToSingleDimensionPartitionMapping", [("partition_dimension_name", Optional[str])]\n ),\n):\n """Defines a correspondence between an single-dimensional partitions definition\n and a MultiPartitionsDefinition. The single-dimensional partitions definition must be\n a dimension of the MultiPartitionsDefinition.\n\n This class handles the case where the upstream asset is multipartitioned and the\n downstream asset is single dimensional, and vice versa.\n\n For a partition key X, this partition mapping assumes that any multi-partition key with\n X in the selected dimension is a dependency.\n\n Args:\n partition_dimension_name (Optional[str]): The name of the partition dimension in the\n MultiPartitionsDefinition that matches the single-dimension partitions definition.\n """\n\n def __new__(cls, partition_dimension_name: Optional[str] = None):\n return super(MultiToSingleDimensionPartitionMapping, cls).__new__(\n cls,\n partition_dimension_name=check.opt_str_param(\n partition_dimension_name, "partition_dimension_name"\n ),\n )\n\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]:\n infer_mapping_result = _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def, downstream_partitions_def\n )\n\n if not infer_mapping_result.can_infer:\n check.invariant(isinstance(infer_mapping_result.inference_failure_reason, str))\n check.failed(cast(str, infer_mapping_result.inference_failure_reason))\n\n return [cast(DimensionDependency, infer_mapping_result.dimension_dependency)]
\n\n\n@whitelist_for_serdes\nclass DimensionPartitionMapping(\n NamedTuple(\n "_DimensionPartitionMapping",\n [\n ("dimension_name", str),\n ("partition_mapping", PartitionMapping),\n ],\n )\n):\n """A helper class for MultiPartitionMapping that defines a partition mapping used to calculate\n the dependent partition keys in the selected downstream MultiPartitions definition dimension.\n\n Args:\n dimension_name (str): The name of the dimension in the downstream MultiPartitionsDefinition.\n partition_mapping (PartitionMapping): The partition mapping object used to calculate\n the downstream dimension partitions from the upstream dimension partitions and vice versa.\n """\n\n def __new__(\n cls,\n dimension_name: str,\n partition_mapping: PartitionMapping,\n ):\n return super(DimensionPartitionMapping, cls).__new__(\n cls,\n dimension_name=check.str_param(dimension_name, "dimension_name"),\n partition_mapping=check.inst_param(\n partition_mapping, "partition_mapping", PartitionMapping\n ),\n )\n\n\n
[docs]@experimental\n@whitelist_for_serdes\nclass MultiPartitionMapping(\n BaseMultiPartitionMapping,\n PartitionMapping,\n NamedTuple(\n "_MultiPartitionMapping",\n [("downstream_mappings_by_upstream_dimension", Mapping[str, DimensionPartitionMapping])],\n ),\n):\n """Defines a correspondence between two MultiPartitionsDefinitions.\n\n Accepts a mapping of upstream dimension name to downstream DimensionPartitionMapping, representing\n the explicit correspondence between the upstream and downstream MultiPartitions dimensions\n and the partition mapping used to calculate the downstream partitions.\n\n Examples:\n .. code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "weekly": WeeklyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "abc": DimensionPartitionMapping(\n dimension_name="123",\n partition_mapping=StaticPartitionMapping({"a": "1", "b": "2", "c": "3"}),\n ),\n "weekly": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=TimeWindowPartitionMapping(),\n )\n }\n )\n\n For upstream or downstream dimensions not explicitly defined in the mapping, Dagster will\n assume an `AllPartitionsMapping`, meaning that all upstream partitions in those dimensions\n will be mapped to all downstream partitions in those dimensions.\n\n Examples:\n .. 
code-block:: python\n\n weekly_abc = MultiPartitionsDefinition(\n {\n "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n daily_123 = MultiPartitionsDefinition(\n {\n "123": StaticPartitionsDefinition(["1", "2", "3"]),\n "daily": DailyPartitionsDefinition("2023-01-01"),\n }\n )\n\n MultiPartitionsMapping(\n {\n "daily": DimensionPartitionMapping(\n dimension_name="daily",\n partition_mapping=IdentityPartitionMapping(),\n )\n }\n )\n\n # Will map `daily_123` partition key {"123": "1", "daily": "2023-01-01"} to the upstream:\n # {"abc": "a", "daily": "2023-01-01"}\n # {"abc": "b", "daily": "2023-01-01"}\n # {"abc": "c", "daily": "2023-01-01"}\n\n Args:\n downstream_mappings_by_upstream_dimension (Mapping[str, DimensionPartitionMapping]): A\n mapping that defines an explicit correspondence between one dimension of the upstream\n MultiPartitionsDefinition and one dimension of the downstream MultiPartitionsDefinition.\n Maps a string representing upstream dimension name to downstream DimensionPartitionMapping,\n containing the downstream dimension name and partition mapping.\n """\n\n def __new__(\n cls, downstream_mappings_by_upstream_dimension: Mapping[str, DimensionPartitionMapping]\n ):\n return super(MultiPartitionMapping, cls).__new__(\n cls,\n downstream_mappings_by_upstream_dimension=check.mapping_param(\n downstream_mappings_by_upstream_dimension,\n "downstream_mappings_by_upstream_dimension",\n key_type=str,\n value_type=DimensionPartitionMapping,\n ),\n )\n\n def get_dimension_dependencies(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> Sequence[DimensionDependency]:\n self._check_all_dimensions_accounted_for(\n upstream_partitions_def,\n downstream_partitions_def,\n )\n\n return [\n DimensionDependency(\n mapping.partition_mapping,\n upstream_dimension_name=upstream_dimension,\n downstream_dimension_name=mapping.dimension_name,\n 
)\n for upstream_dimension, mapping in self.downstream_mappings_by_upstream_dimension.items()\n ]\n\n def _check_all_dimensions_accounted_for(\n self,\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n ) -> None:\n if any(\n not isinstance(partitions_def, MultiPartitionsDefinition)\n for partitions_def in (upstream_partitions_def, downstream_partitions_def)\n ):\n check.failed(\n "Both partitions defs provided to a MultiPartitionMapping must be multi-partitioned"\n )\n\n upstream_dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, upstream_partitions_def).partitions_defs\n }\n dimension_names = {\n dim.name\n for dim in cast(MultiPartitionsDefinition, downstream_partitions_def).partitions_defs\n }\n\n for (\n upstream_dimension_name,\n dimension_mapping,\n ) in self.downstream_mappings_by_upstream_dimension.items():\n if upstream_dimension_name not in upstream_dimension_names:\n check.failed(\n "Dimension mapping has an upstream dimension name that is not in the upstream "\n "partitions def"\n )\n if dimension_mapping.dimension_name not in dimension_names:\n check.failed(\n "Dimension mapping has a downstream dimension name that is not in the"\n " downstream partitions def"\n )\n\n upstream_dimension_names.remove(upstream_dimension_name)\n dimension_names.remove(dimension_mapping.dimension_name)
\n\n\n
[docs]@whitelist_for_serdes\nclass StaticPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_StaticPartitionMapping",\n [\n (\n "downstream_partition_keys_by_upstream_partition_key",\n PublicAttr[Mapping[str, Union[str, Collection[str]]]],\n )\n ],\n ),\n):\n """Define an explicit correspondence between two StaticPartitionsDefinitions.\n\n Args:\n downstream_partition_keys_by_upstream_partition_key (Dict[str, str | Collection[str]]):\n The single or multi-valued correspondence from upstream keys to downstream keys.\n """\n\n def __init__(\n self,\n downstream_partition_keys_by_upstream_partition_key: Mapping[\n str, Union[str, Collection[str]]\n ],\n ):\n check.mapping_param(\n downstream_partition_keys_by_upstream_partition_key,\n "downstream_partition_keys_by_upstream_partition_key",\n key_type=str,\n value_type=(str, collections.abc.Collection),\n )\n\n # cache forward and reverse mappings\n self._mapping = defaultdict(set)\n for (\n upstream_key,\n downstream_keys,\n ) in downstream_partition_keys_by_upstream_partition_key.items():\n self._mapping[upstream_key] = (\n {downstream_keys} if isinstance(downstream_keys, str) else set(downstream_keys)\n )\n\n self._inverse_mapping = defaultdict(set)\n for upstream_key, downstream_keys in self._mapping.items():\n for downstream_key in downstream_keys:\n self._inverse_mapping[downstream_key].add(upstream_key)\n\n @cached_method\n def _check_upstream(self, *, upstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream is only defined on upstream keys."""\n check.inst(\n upstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n upstream_keys = upstream_partitions_def.get_partition_keys()\n extra_keys = set(self._mapping.keys()).difference(upstream_keys)\n if extra_keys:\n raise ValueError(\n f"mapping source partitions not in the upstream partitions definition: {extra_keys}"\n 
)\n\n @cached_method\n def _check_downstream(self, *, downstream_partitions_def: PartitionsDefinition):\n """Validate that the mapping from upstream to downstream only maps to downstream keys."""\n check.inst(\n downstream_partitions_def,\n StaticPartitionsDefinition,\n "StaticPartitionMapping can only be defined between two StaticPartitionsDefinitions",\n )\n downstream_keys = downstream_partitions_def.get_partition_keys()\n extra_keys = set(self._inverse_mapping.keys()).difference(downstream_keys)\n if extra_keys:\n raise ValueError(\n "mapping target partitions not in the downstream partitions definition:"\n f" {extra_keys}"\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n self._check_downstream(downstream_partitions_def=downstream_partitions_def)\n\n downstream_subset = downstream_partitions_def.empty_subset()\n downstream_keys = set()\n for key in upstream_partitions_subset.get_partition_keys():\n downstream_keys.update(self._mapping[key])\n return downstream_subset.with_partition_keys(downstream_keys)\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n self._check_upstream(upstream_partitions_def=upstream_partitions_def)\n\n upstream_subset = upstream_partitions_def.empty_subset()\n if downstream_partitions_subset is None:\n return UpstreamPartitionsResult(upstream_subset, [])\n\n upstream_keys = set()\n for key in downstream_partitions_subset.get_partition_keys():\n upstream_keys.update(self._inverse_mapping[key])\n\n return 
UpstreamPartitionsResult(upstream_subset.with_partition_keys(upstream_keys), [])
\n\n\nclass InferSingleToMultiDimensionDepsResult(\n NamedTuple(\n "_InferSingleToMultiDimensionDepsResult",\n [\n ("can_infer", bool),\n ("inference_failure_reason", Optional[str]),\n ("dimension_dependency", Optional[DimensionDependency]),\n ],\n )\n):\n def __new__(\n cls,\n can_infer: bool,\n inference_failure_reason: Optional[str] = None,\n dimension_dependency: Optional[DimensionDependency] = None,\n ):\n if can_infer and dimension_dependency is None:\n check.failed("dimension_dependency must be provided if can_infer is True")\n if not can_infer and inference_failure_reason is None:\n check.failed("inference_failure_reason must be provided if can_infer is False")\n\n return super(InferSingleToMultiDimensionDepsResult, cls).__new__(\n cls,\n can_infer,\n inference_failure_reason,\n dimension_dependency,\n )\n\n\ndef _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def: PartitionsDefinition,\n downstream_partitions_def: PartitionsDefinition,\n partition_dimension_name: Optional[str] = None,\n) -> InferSingleToMultiDimensionDepsResult:\n from dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\n\n upstream_is_multipartitioned = isinstance(upstream_partitions_def, MultiPartitionsDefinition)\n\n multipartitions_defs = [\n partitions_def\n for partitions_def in [upstream_partitions_def, downstream_partitions_def]\n if isinstance(partitions_def, MultiPartitionsDefinition)\n ]\n if len(multipartitions_defs) != 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n "Can only use MultiToSingleDimensionPartitionMapping when upstream asset is"\n " multipartitioned and the downstream asset is single dimensional, or vice versa."\n f" Instead received {len(multipartitions_defs)} multi-partitioned assets.",\n )\n\n multipartitions_def = cast(MultiPartitionsDefinition, next(iter(multipartitions_defs)))\n\n single_dimension_partitions_def = next(\n iter(\n {\n upstream_partitions_def,\n 
downstream_partitions_def,\n }\n - set(multipartitions_defs)\n )\n )\n\n filtered_multipartition_dims = (\n multipartitions_def.partitions_defs\n if partition_dimension_name is None\n else [\n dim\n for dim in multipartitions_def.partitions_defs\n if dim.name == partition_dimension_name\n ]\n )\n\n if partition_dimension_name:\n if len(filtered_multipartition_dims) != 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n f"Provided partition dimension name {partition_dimension_name} not found in"\n f" multipartitions definition {multipartitions_def}.",\n )\n\n matching_dimension_defs = [\n dimension_def\n for dimension_def in filtered_multipartition_dims\n if dimension_def.partitions_def == single_dimension_partitions_def\n ]\n\n if len(matching_dimension_defs) == 1:\n return InferSingleToMultiDimensionDepsResult(\n True,\n dimension_dependency=DimensionDependency(\n IdentityPartitionMapping(),\n upstream_dimension_name=(\n matching_dimension_defs[0].name if upstream_is_multipartitioned else None\n ),\n downstream_dimension_name=(\n matching_dimension_defs[0].name if not upstream_is_multipartitioned else None\n ),\n ),\n )\n elif len(matching_dimension_defs) > 1:\n return InferSingleToMultiDimensionDepsResult(\n False,\n "partition dimension name must be specified when multiple dimensions of the"\n " MultiPartitionsDefinition match the single dimension partitions def",\n )\n\n time_dimensions = [\n dimension_def\n for dimension_def in filtered_multipartition_dims\n if isinstance(dimension_def.partitions_def, TimeWindowPartitionsDefinition)\n ]\n\n if len(time_dimensions) == 1 and isinstance(\n single_dimension_partitions_def, TimeWindowPartitionsDefinition\n ):\n return InferSingleToMultiDimensionDepsResult(\n True,\n dimension_dependency=DimensionDependency(\n TimeWindowPartitionMapping(),\n upstream_dimension_name=(\n time_dimensions[0].name if upstream_is_multipartitioned else None\n ),\n downstream_dimension_name=(\n time_dimensions[0].name if not 
upstream_is_multipartitioned else None\n ),\n ),\n )\n\n return InferSingleToMultiDimensionDepsResult(\n False,\n "MultiToSingleDimensionPartitionMapping can only be used when: \\n(a) The single dimensional"\n " partitions definition is a dimension of the MultiPartitionsDefinition.\\n(b) The single"\n " dimensional partitions definition is a TimeWindowPartitionsDefinition and the"\n " MultiPartitionsDefinition has a single time dimension.",\n )\n\n\ndef infer_partition_mapping(\n partition_mapping: Optional[PartitionMapping],\n downstream_partitions_def: Optional[PartitionsDefinition],\n upstream_partitions_def: Optional[PartitionsDefinition],\n) -> PartitionMapping:\n from .time_window_partition_mapping import TimeWindowPartitionMapping\n\n if partition_mapping is not None:\n return partition_mapping\n elif upstream_partitions_def and downstream_partitions_def:\n if _get_infer_single_to_multi_dimension_deps_result(\n upstream_partitions_def, downstream_partitions_def\n ).can_infer:\n with disable_dagster_warnings():\n return MultiToSingleDimensionPartitionMapping()\n elif isinstance(upstream_partitions_def, TimeWindowPartitionsDefinition) and isinstance(\n downstream_partitions_def, TimeWindowPartitionsDefinition\n ):\n return TimeWindowPartitionMapping()\n else:\n return IdentityPartitionMapping()\n else:\n return AllPartitionMapping()\n\n\ndef get_builtin_partition_mapping_types() -> Tuple[Type[PartitionMapping], ...]:\n from dagster._core.definitions.time_window_partition_mapping import TimeWindowPartitionMapping\n\n return (\n AllPartitionMapping,\n IdentityPartitionMapping,\n LastPartitionMapping,\n SpecificPartitionsPartitionMapping,\n StaticPartitionMapping,\n TimeWindowPartitionMapping,\n MultiToSingleDimensionPartitionMapping,\n MultiPartitionMapping,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partition_mapping"}, "partitioned_schedule": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.partitioned_schedule

\nfrom typing import Callable, Mapping, NamedTuple, Optional, Union, cast\n\nimport dagster._check as check\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom .decorators.schedule_decorator import schedule\nfrom .job_definition import JobDefinition\nfrom .multi_dimensional_partitions import MultiPartitionsDefinition\nfrom .partition import PartitionsDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .schedule_definition import (\n    DefaultScheduleStatus,\n    RunRequestIterator,\n    ScheduleDefinition,\n    ScheduleEvaluationContext,\n)\nfrom .time_window_partitions import (\n    TimeWindowPartitionsDefinition,\n    get_time_partitions_def,\n    has_one_dimension_time_window_partitioning,\n)\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\n\nclass UnresolvedPartitionedAssetScheduleDefinition(NamedTuple):\n    """Points to an unresolved asset job. The asset selection isn't resolved yet, so we can't resolve\n    the PartitionsDefinition, so we can't resolve the schedule cadence.\n    """\n\n    name: str\n    job: UnresolvedAssetJobDefinition\n    description: Optional[str]\n    default_status: DefaultScheduleStatus\n    minute_of_hour: Optional[int]\n    hour_of_day: Optional[int]\n    day_of_week: Optional[int]\n    day_of_month: Optional[int]\n    tags: Optional[Mapping[str, str]]\n\n    def resolve(self, resolved_job: JobDefinition) -> ScheduleDefinition:\n        partitions_def = resolved_job.partitions_def\n        if partitions_def is None:\n            check.failed(\n                f"Job '{resolved_job.name}' provided to build_schedule_from_partitioned_job must"\n                " contain partitioned assets or a partitions definition."\n            )\n\n        partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n        time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n        return ScheduleDefinition(\n            job=resolved_job,\n        
    name=self.name,\n            execution_fn=_get_schedule_evaluation_fn(partitions_def, resolved_job, self.tags),\n            execution_timezone=time_partitions_def.timezone,\n            cron_schedule=time_partitions_def.get_cron_schedule(\n                self.minute_of_hour, self.hour_of_day, self.day_of_week, self.day_of_month\n            ),\n        )\n\n\n
[docs]def build_schedule_from_partitioned_job(\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n description: Optional[str] = None,\n name: Optional[str] = None,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n tags: Optional[Mapping[str, str]] = None,\n) -> Union[UnresolvedPartitionedAssetScheduleDefinition, ScheduleDefinition]:\n """Creates a schedule from a time window-partitioned job or a job that targets\n time window-partitioned assets. The job can also be multipartitioned, as long as one\n of the partitions dimensions is time-partitioned.\n\n The schedule executes at the cadence specified by the time partitioning of the job or assets.\n\n Examples:\n .. code-block:: python\n\n ######################################\n # Job that targets partitioned assets\n ######################################\n\n from dagster import (\n DailyPartitionsDefinition,\n asset,\n build_schedule_from_partitioned_job,\n define_asset_job,\n )\n\n @asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def asset1():\n ...\n\n asset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n # The created schedule will fire daily\n asset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\n defs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n ################\n # Non-asset job\n ################\n\n from dagster import DailyPartitionsDefinition, build_schedule_from_partitioned_job, jog\n\n\n @job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\n def do_stuff_partitioned():\n ...\n\n # The created schedule will fire daily\n do_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n do_stuff_partitioned,\n )\n\n defs = Definitions(schedules=[do_stuff_partitioned_schedule])\n """\n check.invariant(\n not 
(day_of_week and day_of_month),\n "Cannot provide both day_of_month and day_of_week parameter to"\n " build_schedule_from_partitioned_job.",\n )\n\n if isinstance(job, UnresolvedAssetJobDefinition) and job.partitions_def is None:\n return UnresolvedPartitionedAssetScheduleDefinition(\n job=job,\n default_status=default_status,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n minute_of_hour=minute_of_hour,\n hour_of_day=hour_of_day,\n day_of_week=day_of_week,\n day_of_month=day_of_month,\n tags=tags,\n )\n else:\n partitions_def = job.partitions_def\n if partitions_def is None:\n check.failed("The provided job is not partitioned")\n\n partitions_def = _check_valid_schedule_partitions_def(partitions_def)\n time_partitions_def = check.not_none(get_time_partitions_def(partitions_def))\n\n return schedule(\n cron_schedule=time_partitions_def.get_cron_schedule(\n minute_of_hour, hour_of_day, day_of_week, day_of_month\n ),\n job=job,\n default_status=default_status,\n execution_timezone=time_partitions_def.timezone,\n name=check.opt_str_param(name, "name", f"{job.name}_schedule"),\n description=check.opt_str_param(description, "description"),\n )(_get_schedule_evaluation_fn(partitions_def, job, tags))
\n\n\ndef _get_schedule_evaluation_fn(\n partitions_def: PartitionsDefinition,\n job: Union[JobDefinition, UnresolvedAssetJobDefinition],\n tags: Optional[Mapping[str, str]] = None,\n) -> Callable[[ScheduleEvaluationContext], Union[SkipReason, RunRequest, RunRequestIterator]]:\n def schedule_fn(context):\n # Run for the latest partition. Prior partitions will have been handled by prior ticks.\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n partition_key = partitions_def.get_last_partition_key(context.scheduled_execution_time)\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return job.run_request_for_partition(\n partition_key=partition_key,\n run_key=partition_key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n )\n else:\n check.invariant(isinstance(partitions_def, MultiPartitionsDefinition))\n time_window_dimension = partitions_def.time_window_dimension\n partition_key = time_window_dimension.partitions_def.get_last_partition_key(\n context.scheduled_execution_time\n )\n if partition_key is None:\n return SkipReason("The job's PartitionsDefinition has no partitions")\n\n return [\n job.run_request_for_partition(\n partition_key=key,\n run_key=key,\n tags=tags,\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n for key in partitions_def.get_multipartition_keys_with_dimension_value(\n time_window_dimension.name,\n partition_key,\n dynamic_partitions_store=context.instance if context.instance_ref else None,\n )\n ]\n\n return schedule_fn\n\n\ndef _check_valid_schedule_partitions_def(\n partitions_def: PartitionsDefinition,\n) -> Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition]:\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise DagsterInvalidDefinitionError(\n "Tried to build a partitioned schedule from an asset job, but received an invalid"\n " 
partitions definition. The permitted partitions definitions are: \\n1."\n " TimeWindowPartitionsDefinition\\n2. MultiPartitionsDefinition with a single"\n " TimeWindowPartitionsDefinition dimension"\n )\n\n return cast(Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def)\n\n\nschedule_from_partitions = build_schedule_from_partitioned_job\n
", "current_page_name": "_modules/dagster/_core/definitions/partitioned_schedule", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.partitioned_schedule"}, "policy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.policy

\nfrom enum import Enum\nfrom random import random\nfrom typing import NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\n\n
[docs]class Backoff(Enum):\n """A modifier for delay as a function of attempt number.\n\n LINEAR: `attempt_num * delay`\n EXPONENTIAL: `((2 ^ attempt_num) - 1) * delay`\n """\n\n LINEAR = "LINEAR"\n EXPONENTIAL = "EXPONENTIAL"
\n\n\n
[docs]class Jitter(Enum):\n """A randomizing modifier for delay, applied after backoff calculation.\n\n FULL: between 0 and the calculated delay based on backoff: `random() * backoff_delay`\n PLUS_MINUS: +/- the delay: `backoff_delay + ((2 * (random() * delay)) - delay)`\n """\n\n FULL = "FULL"\n PLUS_MINUS = "PLUS_MINUS"
\n\n\n
[docs]class RetryPolicy(\n NamedTuple(\n "_RetryPolicy",\n [\n ("max_retries", PublicAttr[int]),\n ("delay", PublicAttr[Optional[check.Numeric]]),\n # declarative time modulation to allow calc witout running user function\n ("backoff", PublicAttr[Optional[Backoff]]),\n ("jitter", PublicAttr[Optional[Jitter]]),\n ],\n ),\n):\n """A declarative policy for when to request retries when an exception occurs during op execution.\n\n Args:\n max_retries (int):\n The maximum number of retries to attempt. Defaults to 1.\n delay (Optional[Union[int,float]]):\n The time in seconds to wait between the retry being requested and the next attempt\n being started. This unit of time can be modulated as a function of attempt number\n with backoff and randomly with jitter.\n backoff (Optional[Backoff]):\n A modifier for delay as a function of retry attempt number.\n jitter (Optional[Jitter]):\n A randomizing modifier for delay, applied after backoff calculation.\n """\n\n def __new__(\n cls,\n max_retries: int = 1,\n delay: Optional[check.Numeric] = None,\n backoff: Optional[Backoff] = None,\n jitter: Optional[Jitter] = None,\n ):\n if backoff is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set jitter on RetryPolicy without also setting delay"\n )\n\n if jitter is not None and delay is None:\n raise DagsterInvalidDefinitionError(\n "Can not set backoff on RetryPolicy without also setting delay"\n )\n\n return super().__new__(\n cls,\n max_retries=check.int_param(max_retries, "max_retries"),\n delay=check.opt_numeric_param(delay, "delay"),\n backoff=check.opt_inst_param(backoff, "backoff", Backoff),\n jitter=check.opt_inst_param(jitter, "jitter", Jitter),\n )\n\n def calculate_delay(self, attempt_num: int) -> check.Numeric:\n return calculate_delay(\n attempt_num=attempt_num,\n backoff=self.backoff,\n jitter=self.jitter,\n base_delay=self.delay or 0,\n )
\n\n\ndef calculate_delay(\n attempt_num: int, backoff: Optional[Backoff], jitter: Optional[Jitter], base_delay: float\n) -> float:\n if backoff is Backoff.EXPONENTIAL:\n calc_delay = ((2**attempt_num) - 1) * base_delay\n elif backoff is Backoff.LINEAR:\n calc_delay = base_delay * attempt_num\n elif backoff is None:\n calc_delay = base_delay\n else:\n check.assert_never(backoff)\n\n if jitter is Jitter.FULL:\n calc_delay = random() * calc_delay\n elif jitter is Jitter.PLUS_MINUS:\n calc_delay = calc_delay + ((2 * (random() * base_delay)) - base_delay)\n elif jitter is None:\n pass\n else:\n check.assert_never(jitter)\n\n return calc_delay\n
", "current_page_name": "_modules/dagster/_core/definitions/policy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.policy"}, "reconstruct": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.reconstruct

\nimport inspect\nimport json\nimport os\nimport sys\nfrom functools import lru_cache\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self, TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import experimental\nfrom dagster._core.code_pointer import (\n    CodePointer,\n    CustomPointer,\n    FileCodePointer,\n    ModuleCodePointer,\n    get_python_file_from_target,\n)\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.origin import (\n    DEFAULT_DAGSTER_ENTRY_POINT,\n    JobPythonOrigin,\n    RepositoryPythonOrigin,\n)\nfrom dagster._serdes import pack_value, unpack_value, whitelist_for_serdes\nfrom dagster._serdes.serdes import NamedTupleSerializer\nfrom dagster._utils import hash_collection\n\nfrom .events import AssetKey\nfrom .job_base import IJob\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.assets import AssetsDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.repository_definition import (\n        PendingRepositoryDefinition,\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.source_asset import SourceAsset\n\n    from .graph_definition import GraphDefinition\n    from .repository_definition import RepositoryDefinition\n\n\ndef get_ephemeral_repository_name(job_name: str) -> str:\n    check.str_param(job_name, "job_name")\n    return f"__repository__{job_name}"\n\n\n@whitelist_for_serdes\nclass ReconstructableRepository(\n    NamedTuple(\n        "_ReconstructableRepository",\n        [\n            ("pointer", CodePointer),\n            ("container_image", Optional[str]),\n         
   ("executable_path", Optional[str]),\n            ("entry_point", Sequence[str]),\n            ("container_context", Optional[Mapping[str, Any]]),\n            ("repository_load_data", Optional["RepositoryLoadData"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        pointer: CodePointer,\n        container_image: Optional[str] = None,\n        executable_path: Optional[str] = None,\n        entry_point: Optional[Sequence[str]] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n        repository_load_data: Optional["RepositoryLoadData"] = None,\n    ):\n        from dagster._core.definitions.repository_definition import RepositoryLoadData\n\n        return super(ReconstructableRepository, cls).__new__(\n            cls,\n            pointer=check.inst_param(pointer, "pointer", CodePointer),\n            container_image=check.opt_str_param(container_image, "container_image"),\n            executable_path=check.opt_str_param(executable_path, "executable_path"),\n            entry_point=(\n                check.sequence_param(entry_point, "entry_point", of_type=str)\n                if entry_point is not None\n                else DEFAULT_DAGSTER_ENTRY_POINT\n            ),\n            container_context=(\n                check.mapping_param(container_context, "container_context")\n                if container_context is not None\n                else None\n            ),\n            repository_load_data=check.opt_inst_param(\n                repository_load_data, "repository_load_data", RepositoryLoadData\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableRepository":\n        return self._replace(repository_load_data=metadata)\n\n    def get_definition(self) -> "RepositoryDefinition":\n        return repository_def_from_pointer(self.pointer, self.repository_load_data)\n\n    def get_reconstructable_job(self, name: str) -> 
"ReconstructableJob":\n        return ReconstructableJob(self, name)\n\n    @classmethod\n    def for_file(\n        cls,\n        file: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        if not working_directory:\n            working_directory = os.getcwd()\n        return cls(\n            FileCodePointer(file, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    @classmethod\n    def for_module(\n        cls,\n        module: str,\n        fn_name: str,\n        working_directory: Optional[str] = None,\n        container_image: Optional[str] = None,\n        container_context: Optional[Mapping[str, Any]] = None,\n    ) -> "ReconstructableRepository":\n        return cls(\n            ModuleCodePointer(module, fn_name, working_directory),\n            container_image=container_image,\n            container_context=container_context,\n        )\n\n    def get_python_origin(self) -> RepositoryPythonOrigin:\n        return RepositoryPythonOrigin(\n            executable_path=self.executable_path if self.executable_path else sys.executable,\n            code_pointer=self.pointer,\n            container_image=self.container_image,\n            entry_point=self.entry_point,\n            container_context=self.container_context,\n        )\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    # Allow this to be hashed for use in `lru_cache`. 
This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has `Sequence` attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\nclass ReconstructableJobSerializer(NamedTupleSerializer):\n    def before_unpack(self, _, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n        solid_selection_str = unpacked_dict.get("solid_selection_str")\n        solids_to_execute = unpacked_dict.get("solids_to_execute")\n        if solid_selection_str:\n            unpacked_dict["op_selection"] = json.loads(solid_selection_str)\n        elif solids_to_execute:\n            unpacked_dict["op_selection"] = solids_to_execute\n        return unpacked_dict\n\n    def after_pack(self, **packed_dict: Any) -> Dict[str, Any]:\n        if packed_dict["op_selection"]:\n            packed_dict["solid_selection_str"] = json.dumps(packed_dict["op_selection"]["__set__"])\n        else:\n            packed_dict["solid_selection_str"] = None\n        del packed_dict["op_selection"]\n        return packed_dict\n\n\n@whitelist_for_serdes(\n    serializer=ReconstructableJobSerializer,\n    storage_name="ReconstructablePipeline",\n    storage_field_names={\n        "job_name": "pipeline_name",\n    },\n)\nclass ReconstructableJob(\n    NamedTuple(\n        "_ReconstructableJob",\n        [\n            ("repository", ReconstructableRepository),\n            ("job_name", str),\n            ("op_selection", Optional[AbstractSet[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    ),\n    IJob,\n):\n    """Defines a reconstructable job. 
When your job must cross process boundaries, Dagster must know\n    how to reconstruct the job on the other side of the process boundary.\n\n    Args:\n        repository (ReconstructableRepository): The reconstructable representation of the repository\n            the job belongs to.\n        job_name (str): The name of the job.\n        op_selection (Optional[AbstractSet[str]]): A set of op query strings. Ops matching any of\n            these queries will be selected. None if no selection is specified.\n        asset_selection (Optional[AbstractSet[AssetKey]]) A set of assets to execute. None if no selection\n            is specified, i.e. the entire job will be run.\n    """\n\n    def __new__(\n        cls,\n        repository: ReconstructableRepository,\n        job_name: str,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ):\n        op_selection = set(op_selection) if op_selection else None\n        return super(ReconstructableJob, cls).__new__(\n            cls,\n            repository=check.inst_param(repository, "repository", ReconstructableRepository),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_set_param(op_selection, "op_selection", of_type=str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def with_repository_load_data(\n        self, metadata: Optional["RepositoryLoadData"]\n    ) -> "ReconstructableJob":\n        return self._replace(repository=self.repository.with_repository_load_data(metadata))\n\n    # Keep the most recent 1 definition (globally since this is a 
NamedTuple method)\n    # This allows repeated calls to get_definition in execution paths to not reload the job\n    @lru_cache(maxsize=1)\n    def get_definition(self) -> "JobDefinition":\n        return self.repository.get_definition().get_maybe_subset_job_def(\n            self.job_name,\n            self.op_selection,\n            self.asset_selection,\n        )\n\n    def get_reconstructable_repository(self) -> ReconstructableRepository:\n        return self.repository\n\n    def get_subset(\n        self,\n        *,\n        op_selection: Optional[Iterable[str]] = None,\n        asset_selection: Optional[AbstractSet[AssetKey]] = None,\n        asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n    ) -> Self:\n        if op_selection and (asset_selection or asset_check_selection):\n            check.failed(\n                "op_selection and asset_selection or asset_check_selection cannot both be provided"\n                " as arguments",\n            )\n        op_selection = set(op_selection) if op_selection else None\n        return ReconstructableJob(\n            repository=self.repository,\n            job_name=self.job_name,\n            op_selection=op_selection,\n            asset_selection=asset_selection,\n            asset_check_selection=asset_check_selection,\n        )\n\n    def describe(self) -> str:\n        return f'"{self.job_name}" in repository ({self.repository.pointer.describe})'\n\n    @staticmethod\n    def for_file(python_file: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(FileCodePointer(python_file, fn_name, os.getcwd()))\n\n    @staticmethod\n    def for_module(module: str, fn_name: str) -> "ReconstructableJob":\n        return bootstrap_standalone_recon_job(ModuleCodePointer(module, fn_name, os.getcwd()))\n\n    def to_dict(self) -> Mapping[str, object]:\n        return pack_value(self)\n\n    @staticmethod\n    def from_dict(val: Mapping[str, Any]) -> 
"ReconstructableJob":\n        check.mapping_param(val, "val")\n\n        inst = unpack_value(val)\n        check.invariant(\n            isinstance(inst, ReconstructableJob),\n            f"Deserialized object is not instance of ReconstructableJob, got {type(inst)}",\n        )\n        return inst  # type: ignore  # (illegible runtime check)\n\n    def get_python_origin(self) -> JobPythonOrigin:\n        return JobPythonOrigin(self.job_name, self.repository.get_python_origin())\n\n    def get_python_origin_id(self) -> str:\n        return self.get_python_origin().get_id()\n\n    def get_module(self) -> Optional[str]:\n        """Return the module the job is found in, the origin is a module code pointer."""\n        pointer = self.get_python_origin().get_repo_pointer()\n        if isinstance(pointer, ModuleCodePointer):\n            return pointer.module\n\n        return None\n\n    # Allow this to be hashed for `lru_cache` in `get_definition`\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]def reconstructable(target: Callable[..., "JobDefinition"]) -> ReconstructableJob:\n """Create a :py:class:`~dagster._core.definitions.reconstructable.ReconstructableJob` from a\n function that returns a :py:class:`~dagster.JobDefinition`/:py:class:`~dagster.JobDefinition`,\n or a function decorated with :py:func:`@job <dagster.job>`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or\n in different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n Passing a job created with ``~dagster.GraphDefinition.to_job`` to ``reconstructable()``,\n requires you to wrap that job's definition in a module-scoped function, and pass that function\n instead:\n\n .. code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def my_graph():\n ...\n\n def define_my_job():\n return my_graph.to_job()\n\n reconstructable(define_my_job)\n\n This function implements a very conservative strategy for reconstruction, so that its behavior\n is easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs\n or jobs, such as those defined by lambdas, in nested scopes (e.g., dynamically within a method\n call), or in interactive environments such as the Python REPL or Jupyter notebooks.\n\n If you need to reconstruct objects constructed in these ways, you should use\n :py:func:`~dagster.reconstructable.build_reconstructable_job` instead, which allows you to\n specify your own reconstruction strategy.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job, reconstructable\n\n @job\n def foo_job():\n ...\n\n reconstructable_foo_job = reconstructable(foo_job)\n\n\n @graph\n def foo():\n ...\n\n def make_bar_job():\n return foo.to_job()\n\n reconstructable_bar_job = reconstructable(make_bar_job)\n """\n from dagster._core.definitions import JobDefinition\n\n if not seven.is_function_or_decorator_instance_of(target, JobDefinition):\n if isinstance(target, JobDefinition):\n raise DagsterInvariantViolationError(\n "Reconstructable target was not a function returning a job definition, or a job "\n "definition produced by a decorated function. If your job was constructed using "\n "``GraphDefinition.to_job``, you must wrap the ``to_job`` call in a function at "\n "module scope, ie not within any other functions. "\n "To learn more, check out the docs on ``reconstructable``: "\n "https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n )\n raise DagsterInvariantViolationError(\n "Reconstructable target should be a function or definition produced "\n f"by a decorated function, got {type(target)}.",\n )\n\n if seven.is_lambda(target):\n raise DagsterInvariantViolationError(\n "Reconstructable target can not be a lambda. Use a function or "\n "decorated function defined at module scope instead, or use "\n "build_reconstructable_job."\n )\n\n if seven.qualname_differs(target):\n raise DagsterInvariantViolationError(\n f'Reconstructable target "{target.__name__}" has a different '\n f'__qualname__ "{target.__qualname__}" indicating it is not '\n "defined at module scope. 
Use a function or decorated function "\n "defined at module scope instead, or use build_reconstructable_job."\n )\n\n try:\n if (\n hasattr(target, "__module__")\n and hasattr(target, "__name__")\n and getattr(inspect.getmodule(target), "__name__", None) != "__main__"\n ):\n return ReconstructableJob.for_module(target.__module__, target.__name__)\n except:\n pass\n\n python_file = get_python_file_from_target(target)\n if not python_file:\n raise DagsterInvariantViolationError(\n "reconstructable() can not reconstruct jobs defined in interactive "\n "environments like <stdin>, IPython, or Jupyter notebooks. "\n "Use a job defined in a module or file instead, or use build_reconstructable_job."\n )\n\n pointer = FileCodePointer(\n python_file=python_file, fn_name=target.__name__, working_directory=os.getcwd()\n )\n\n return bootstrap_standalone_recon_job(pointer)
\n\n\n
[docs]@experimental\ndef build_reconstructable_job(\n reconstructor_module_name: str,\n reconstructor_function_name: str,\n reconstructable_args: Optional[Tuple[object]] = None,\n reconstructable_kwargs: Optional[Mapping[str, object]] = None,\n reconstructor_working_directory: Optional[str] = None,\n) -> ReconstructableJob:\n """Create a :py:class:`dagster._core.definitions.reconstructable.ReconstructableJob`.\n\n When your job must cross process boundaries, e.g., for execution on multiple nodes or in\n different systems (like ``dagstermill``), Dagster must know how to reconstruct the job\n on the other side of the process boundary.\n\n This function allows you to use the strategy of your choice for reconstructing jobs, so\n that you can reconstruct certain kinds of jobs that are not supported by\n :py:func:`~dagster.reconstructable`, such as those defined by lambdas, in nested scopes (e.g.,\n dynamically within a method call), or in interactive environments such as the Python REPL or\n Jupyter notebooks.\n\n If you need to reconstruct jobs constructed in these ways, use this function instead of\n :py:func:`~dagster.reconstructable`.\n\n Args:\n reconstructor_module_name (str): The name of the module containing the function to use to\n reconstruct the job.\n reconstructor_function_name (str): The name of the function to use to reconstruct the\n job.\n reconstructable_args (Tuple): Args to the function to use to reconstruct the job.\n Values of the tuple must be JSON serializable.\n reconstructable_kwargs (Dict[str, Any]): Kwargs to the function to use to reconstruct the\n job. Values of the dict must be JSON serializable.\n\n Examples:\n .. 
code-block:: python\n\n # module: mymodule\n\n from dagster import JobDefinition, job, build_reconstructable_job\n\n class JobFactory:\n def make_job(*args, **kwargs):\n\n @job\n def _job(...):\n ...\n\n return _job\n\n def reconstruct_job(*args):\n factory = JobFactory()\n return factory.make_job(*args)\n\n factory = JobFactory()\n\n foo_job_args = (...,...)\n\n foo_job_kwargs = {...:...}\n\n foo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\n reconstructable_foo_job = build_reconstructable_job(\n 'mymodule',\n 'reconstruct_job',\n foo_job_args,\n foo_job_kwargs,\n )\n """\n check.str_param(reconstructor_module_name, "reconstructor_module_name")\n check.str_param(reconstructor_function_name, "reconstructor_function_name")\n check.opt_str_param(\n reconstructor_working_directory, "reconstructor_working_directory", os.getcwd()\n )\n\n _reconstructable_args: List[object] = list(\n check.opt_tuple_param(reconstructable_args, "reconstructable_args")\n )\n _reconstructable_kwargs: List[List[Union[str, object]]] = list(\n (\n [key, value]\n for key, value in check.opt_mapping_param(\n reconstructable_kwargs, "reconstructable_kwargs", key_type=str\n ).items()\n )\n )\n\n reconstructor_pointer = ModuleCodePointer(\n reconstructor_module_name,\n reconstructor_function_name,\n working_directory=reconstructor_working_directory,\n )\n\n pointer = CustomPointer(reconstructor_pointer, _reconstructable_args, _reconstructable_kwargs)\n\n job_def = job_def_from_pointer(pointer)\n\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )
\n\n\ndef bootstrap_standalone_recon_job(pointer: CodePointer) -> ReconstructableJob:\n # So this actually straps the the job for the sole\n # purpose of getting the job name. If we changed ReconstructableJob\n # to get the job on demand in order to get name, we could avoid this.\n job_def = job_def_from_pointer(pointer)\n return ReconstructableJob(\n repository=ReconstructableRepository(pointer), # creates ephemeral repo\n job_name=job_def.name,\n )\n\n\nLoadableDefinition: TypeAlias = Union[\n "JobDefinition",\n "RepositoryDefinition",\n "PendingRepositoryDefinition",\n "GraphDefinition",\n "Sequence[Union[AssetsDefinition, SourceAsset]]",\n]\n\nT_LoadableDefinition = TypeVar("T_LoadableDefinition", bound=LoadableDefinition)\n\n\ndef _is_list_of_assets(\n definition: LoadableDefinition,\n) -> bool:\n from dagster._core.definitions.assets import AssetsDefinition\n from dagster._core.definitions.source_asset import SourceAsset\n\n return isinstance(definition, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in definition\n )\n\n\ndef _check_is_loadable(definition: T_LoadableDefinition) -> T_LoadableDefinition:\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if not (\n isinstance(\n definition,\n (\n JobDefinition,\n RepositoryDefinition,\n PendingRepositoryDefinition,\n GraphDefinition,\n Definitions,\n ),\n )\n or _is_list_of_assets(definition)\n ):\n raise DagsterInvariantViolationError(\n "Loadable attributes must be either a JobDefinition, GraphDefinition, "\n f"or RepositoryDefinition. 
Got {definition!r}."\n )\n return definition\n\n\ndef load_def_in_module(\n module_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_module(module_name, attribute, working_directory))\n\n\ndef load_def_in_package(\n package_name: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(\n CodePointer.from_python_package(package_name, attribute, working_directory)\n )\n\n\ndef load_def_in_python_file(\n python_file: str, attribute: str, working_directory: Optional[str]\n) -> LoadableDefinition:\n return def_from_pointer(CodePointer.from_python_file(python_file, attribute, working_directory))\n\n\ndef def_from_pointer(\n pointer: CodePointer,\n) -> LoadableDefinition:\n target = pointer.load_target()\n\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import PendingRepositoryDefinition, RepositoryDefinition\n\n if isinstance(\n target,\n (\n GraphDefinition,\n JobDefinition,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n ),\n ) or not callable(target):\n return _check_is_loadable(target) # type: ignore\n\n # if its a function invoke it - otherwise we are pointing to a\n # artifact in module scope, likely decorator output\n\n if seven.get_arg_names(target):\n raise DagsterInvariantViolationError(\n f"Error invoking function at {pointer.describe()} with no arguments. "\n "Reconstructable target must be callable with no arguments"\n )\n\n return _check_is_loadable(target())\n\n\ndef job_def_from_pointer(pointer: CodePointer) -> "JobDefinition":\n from .job_definition import JobDefinition\n\n target = def_from_pointer(pointer)\n\n if isinstance(target, JobDefinition):\n return target\n\n raise DagsterInvariantViolationError(\n "CodePointer ({str}) must resolve to a JobDefinition (or JobDefinition for legacy"\n " code). 
Received a {type}".format(str=pointer.describe(), type=type(target))\n )\n\n\n@overload\ndef repository_def_from_target_def(\n target: Union["RepositoryDefinition", "JobDefinition", "GraphDefinition"],\n repository_load_data: Optional["RepositoryLoadData"] = None,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> None: ...\n\n\ndef repository_def_from_target_def(\n target: object, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> Optional["RepositoryDefinition"]:\n from .assets import AssetsDefinition\n from .definitions_class import Definitions\n from .graph_definition import GraphDefinition\n from .job_definition import JobDefinition\n from .repository_definition import (\n SINGLETON_REPOSITORY_NAME,\n CachingRepositoryData,\n PendingRepositoryDefinition,\n RepositoryDefinition,\n )\n from .source_asset import SourceAsset\n\n if isinstance(target, Definitions):\n # reassign to handle both repository and pending repo case\n target = target.get_inner_repository_for_loading_process()\n\n # special case - we can wrap a single job in a repository\n if isinstance(target, (JobDefinition, GraphDefinition)):\n # consider including job name in generated repo name\n return RepositoryDefinition(\n name=get_ephemeral_repository_name(target.name),\n repository_data=CachingRepositoryData.from_list([target]),\n )\n elif isinstance(target, list) and all(\n isinstance(item, (AssetsDefinition, SourceAsset)) for item in target\n ):\n return RepositoryDefinition(\n name=SINGLETON_REPOSITORY_NAME,\n repository_data=CachingRepositoryData.from_list(target),\n )\n elif isinstance(target, RepositoryDefinition):\n return target\n elif isinstance(target, PendingRepositoryDefinition):\n # must load repository from scratch\n if repository_load_data is None:\n return target.compute_repository_definition()\n # can use the cached data to more efficiently load 
data\n return target.reconstruct_repository_definition(repository_load_data)\n else:\n return None\n\n\ndef repository_def_from_pointer(\n pointer: CodePointer, repository_load_data: Optional["RepositoryLoadData"] = None\n) -> "RepositoryDefinition":\n target = def_from_pointer(pointer)\n repo_def = repository_def_from_target_def(target, repository_load_data)\n if not repo_def:\n raise DagsterInvariantViolationError(\n f"CodePointer ({pointer.describe()}) must resolve to a "\n "RepositoryDefinition, JobDefinition, or JobDefinition. "\n f"Received a {type(target)}"\n )\n return repo_def\n
", "current_page_name": "_modules/dagster/_core/definitions/reconstruct", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.reconstruct"}, "repository_definition": {"repository_data": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_data

\nfrom abc import ABC, abstractmethod\nfrom types import FunctionType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Mapping,\n    Optional,\n    Sequence,\n    TypeVar,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.graph_definition import SubselectedGraphDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\n\nfrom .caching_index import CacheingDefinitionIndex\nfrom .valid_definitions import RepositoryListDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n\n\nT = TypeVar("T")\nResolvable = Callable[[], T]\n\n\n
[docs]class RepositoryData(ABC):\n """Users should usually rely on the :py:func:`@repository <repository>` decorator to create new\n repositories, which will in turn call the static constructors on this class. However, users may\n subclass :py:class:`RepositoryData` for fine-grained control over access to and lazy creation\n of repository members.\n """\n\n @abstractmethod\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n pass\n\n @abstractmethod\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n """Return all top-level resources in the repository as a list,\n such as those provided to the Definitions constructor.\n\n Returns:\n List[ResourceDefinition]: All top-level resources in the repository.\n """\n\n @abstractmethod\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n pass\n\n
[docs] @abstractmethod\n @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """
\n\n
[docs] @public\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return [job_def.name for job_def in self.get_all_jobs()]
\n\n
[docs] @public\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n return job_name in self.get_job_names()
\n\n
[docs] @public\n def get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n match = next(job for job in self.get_all_jobs() if job.name == job_name)\n if match is None:\n raise DagsterInvariantViolationError(f"Could not find job {job_name} in repository")\n return match
\n\n
[docs] @public\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return [schedule.name for schedule in self.get_all_schedules()]
\n\n
[docs] @public\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Returns:\n List[ScheduleDefinition]: All jobs in the repository.\n """\n return []
\n\n
[docs] @public\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n schedules_with_name = [\n schedule for schedule in self.get_all_schedules() if schedule.name == schedule_name\n ]\n if not schedules_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find schedule {schedule_name} in repository"\n )\n return schedules_with_name[0]
\n\n
[docs] @public\n def has_schedule(self, schedule_name: str) -> bool:\n """Check if a schedule with a given name is present in the repository."""\n return schedule_name in self.get_schedule_names()
\n\n
[docs] @public\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: Return all sensors in the repository as a list."""\n return []
\n\n
[docs] @public\n def get_sensor_names(self) -> Sequence[str]:\n """Sequence[str]: Get the names of all sensors in the repository."""\n return [sensor.name for sensor in self.get_all_sensors()]
\n\n
[docs] @public\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n """Get a sensor by name.\n\n Args:\n sensor_name (str): name of the sensor to retrieve.\n\n Returns:\n SensorDefinition: The sensor definition corresponding to the given name.\n """\n sensors_with_name = [\n sensor for sensor in self.get_all_sensors() if sensor.name == sensor_name\n ]\n if not sensors_with_name:\n raise DagsterInvariantViolationError(\n f"Could not find sensor {sensor_name} in repository"\n )\n return sensors_with_name[0]
\n\n
[docs] @public\n def has_sensor(self, sensor_name: str) -> bool:\n """Check if a sensor with a given name is present in the repository."""\n return sensor_name in self.get_sensor_names()
\n\n
[docs] @public\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n """Mapping[AssetKey, SourceAsset]: Get the source assets for the repository."""\n return {}
\n\n
[docs] @public\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n """Mapping[AssetKey, AssetsDefinition]: Get the asset definitions for the repository."""\n return {}
\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self.get_all_jobs()\n self.get_all_schedules()\n self.get_all_sensors()\n self.get_source_assets_by_key()
\n\n\nclass CachingRepositoryData(RepositoryData):\n """Default implementation of RepositoryData used by the :py:func:`@repository <repository>` decorator."""\n\n _all_jobs: Optional[Sequence[JobDefinition]]\n _all_pipelines: Optional[Sequence[JobDefinition]]\n\n def __init__(\n self,\n jobs: Mapping[str, Union[JobDefinition, Resolvable[JobDefinition]]],\n schedules: Mapping[str, Union[ScheduleDefinition, Resolvable[ScheduleDefinition]]],\n sensors: Mapping[str, Union[SensorDefinition, Resolvable[SensorDefinition]]],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n assets_defs_by_key: Mapping[AssetKey, "AssetsDefinition"],\n top_level_resources: Mapping[str, ResourceDefinition],\n utilized_env_vars: Mapping[str, AbstractSet[str]],\n resource_key_mapping: Mapping[int, str],\n ):\n """Constructs a new CachingRepositoryData object.\n\n You may pass pipeline, job, and schedule definitions directly, or you may pass callables\n with no arguments that will be invoked to lazily construct definitions when accessed by\n name. 
This can be helpful for performance when there are many definitions in a repository,\n or when constructing the definitions is costly.\n\n Note that when lazily constructing a definition, the name of the definition must match its\n key in its dictionary index, or a :py:class:`DagsterInvariantViolationError` will be thrown\n at retrieval time.\n\n Args:\n jobs (Mapping[str, Union[JobDefinition, Callable[[], JobDefinition]]]):\n The job definitions belonging to the repository.\n schedules (Mapping[str, Union[ScheduleDefinition, Callable[[], ScheduleDefinition]]]):\n The schedules belonging to the repository.\n sensors (Mapping[str, Union[SensorDefinition, Callable[[], SensorDefinition]]]):\n The sensors belonging to a repository.\n source_assets_by_key (Mapping[AssetKey, SourceAsset]): The source assets belonging to a repository.\n assets_defs_by_key (Mapping[AssetKey, AssetsDefinition]): The assets definitions\n belonging to a repository.\n top_level_resources (Mapping[str, ResourceDefinition]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from dagster._core.definitions import AssetsDefinition\n\n check.mapping_param(jobs, "jobs", key_type=str, value_type=(JobDefinition, FunctionType))\n check.mapping_param(\n schedules, "schedules", key_type=str, value_type=(ScheduleDefinition, FunctionType)\n )\n check.mapping_param(\n sensors, "sensors", key_type=str, value_type=(SensorDefinition, FunctionType)\n )\n check.mapping_param(\n source_assets_by_key, "source_assets_by_key", key_type=AssetKey, value_type=SourceAsset\n )\n check.mapping_param(\n assets_defs_by_key, "assets_defs_by_key", key_type=AssetKey, value_type=AssetsDefinition\n )\n check.mapping_param(\n top_level_resources, "top_level_resources", key_type=str, value_type=ResourceDefinition\n )\n check.mapping_param(\n utilized_env_vars,\n "utilized_resources",\n key_type=str,\n )\n check.mapping_param(\n resource_key_mapping, "resource_key_mapping", 
key_type=int, value_type=str\n )\n\n self._jobs = CacheingDefinitionIndex(\n JobDefinition,\n "JobDefinition",\n "job",\n jobs,\n self._validate_job,\n )\n\n self._schedules = CacheingDefinitionIndex(\n ScheduleDefinition,\n "ScheduleDefinition",\n "schedule",\n schedules,\n self._validate_schedule,\n )\n # load all schedules to force validation\n self._schedules.get_all_definitions()\n\n self._source_assets_by_key = source_assets_by_key\n self._assets_defs_by_key = assets_defs_by_key\n self._top_level_resources = top_level_resources\n self._utilized_env_vars = utilized_env_vars\n self._resource_key_mapping = resource_key_mapping\n\n self._sensors = CacheingDefinitionIndex(\n SensorDefinition,\n "SensorDefinition",\n "sensor",\n sensors,\n self._validate_sensor,\n )\n # load all sensors to force validation\n self._sensors.get_all_definitions()\n\n self._all_jobs = None\n\n @staticmethod\n def from_dict(repository_definitions: Dict[str, Dict[str, Any]]) -> "CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definition (Dict[str, Dict[str, ...]]): A dict of the form:\n\n {\n 'jobs': Dict[str, Callable[[], JobDefinition]],\n 'schedules': Dict[str, Callable[[], ScheduleDefinition]]\n }\n\n This form is intended to allow definitions to be created lazily when accessed by name,\n which can be helpful for performance when there are many definitions in a repository, or\n when constructing the definitions is costly.\n """\n from .repository_data_builder import build_caching_repository_data_from_dict\n\n return build_caching_repository_data_from_dict(repository_definitions)\n\n @classmethod\n def from_list(\n cls,\n repository_definitions: Sequence[RepositoryListDefinition],\n default_executor_def: Optional[ExecutorDefinition] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n resource_key_mapping: Optional[Mapping[int, str]] = None,\n ) -> 
"CachingRepositoryData":\n """Static constructor.\n\n Args:\n repository_definitions (List[Union[JobDefinition, ScheduleDefinition, SensorDefinition, GraphDefinition]]):\n Use this constructor when you have no need to lazy load jobs or other definitions.\n top_level_resources (Optional[Mapping[str, ResourceDefinition]]): A dict of top-level\n resource keys to defintions, for resources which should be displayed in the UI.\n """\n from .repository_data_builder import build_caching_repository_data_from_list\n\n return build_caching_repository_data_from_list(\n repository_definitions=repository_definitions,\n default_executor_def=default_executor_def,\n default_logger_defs=default_logger_defs,\n top_level_resources=top_level_resources,\n resource_key_mapping=resource_key_mapping,\n )\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._utilized_env_vars\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._resource_key_mapping\n\n def get_job_names(self) -> Sequence[str]:\n """Get the names of all jobs in the repository.\n\n Returns:\n List[str]\n """\n return self._jobs.get_definition_names()\n\n def has_job(self, job_name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n job_name (str): The name of the job.\n\n Returns:\n bool\n """\n check.str_param(job_name, "job_name")\n return self._jobs.has_definition(job_name)\n\n def get_top_level_resources(self) -> Mapping[str, ResourceDefinition]:\n return self._top_level_resources\n\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job that has not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n if self._all_jobs is not None:\n return self._all_jobs\n\n self._all_jobs = self._jobs.get_all_definitions()\n self._check_node_defs(self._all_jobs)\n return self._all_jobs\n\n def 
get_job(self, job_name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job has not yet been constructed, only this job is constructed, and will\n be cached for future calls.\n\n Args:\n job_name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to the given name.\n """\n check.str_param(job_name, "job_name")\n return self._jobs.get_definition(job_name)\n\n def get_schedule_names(self) -> Sequence[str]:\n """Get the names of all schedules in the repository.\n\n Returns:\n List[str]\n """\n return self._schedules.get_definition_names()\n\n def get_all_schedules(self) -> Sequence[ScheduleDefinition]:\n """Return all schedules in the repository as a list.\n\n Note that this will construct any schedule that has not yet been constructed.\n\n Returns:\n List[ScheduleDefinition]: All schedules in the repository.\n """\n return self._schedules.get_all_definitions()\n\n def get_schedule(self, schedule_name: str) -> ScheduleDefinition:\n """Get a schedule by name.\n\n if this schedule has not yet been constructed, only this schedule is constructed, and will\n be cached for future calls.\n\n Args:\n schedule_name (str): name of the schedule to retrieve.\n\n Returns:\n ScheduleDefinition: The schedule definition corresponding to the given name.\n """\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.get_definition(schedule_name)\n\n def has_schedule(self, schedule_name: str) -> bool:\n check.str_param(schedule_name, "schedule_name")\n\n return self._schedules.has_definition(schedule_name)\n\n def get_all_sensors(self) -> Sequence[SensorDefinition]:\n return self._sensors.get_all_definitions()\n\n def get_sensor_names(self) -> Sequence[str]:\n return self._sensors.get_definition_names()\n\n def get_sensor(self, sensor_name: str) -> SensorDefinition:\n return self._sensors.get_definition(sensor_name)\n\n def has_sensor(self, sensor_name: str) -> bool:\n return 
self._sensors.has_definition(sensor_name)\n\n def get_source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._source_assets_by_key\n\n def get_assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._assets_defs_by_key\n\n def _check_node_defs(self, job_defs: Sequence[JobDefinition]) -> None:\n node_defs = {}\n node_to_job = {}\n for job_def in job_defs:\n for node_def in [*job_def.all_node_defs, job_def.graph]:\n # skip checks for subselected graphs because they don't have their own names\n if isinstance(node_def, SubselectedGraphDefinition):\n break\n\n if node_def.name not in node_defs:\n node_defs[node_def.name] = node_def\n node_to_job[node_def.name] = job_def.name\n\n if node_defs[node_def.name] is not node_def:\n first_name, second_name = sorted([node_to_job[node_def.name], job_def.name])\n raise DagsterInvalidDefinitionError(\n f"Conflicting definitions found in repository with name '{node_def.name}'."\n " Op/Graph definition names must be unique within a repository."\n f" {node_def.__class__.__name__} is defined in"\n f" job '{first_name}' and in"\n f" job '{second_name}'."\n )\n\n def _validate_job(self, job: JobDefinition) -> JobDefinition:\n return job\n\n def _validate_schedule(self, schedule: ScheduleDefinition) -> ScheduleDefinition:\n job_names = self.get_job_names()\n\n if schedule.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'ScheduleDefinition "{schedule.name}" targets job "{schedule.job_name}" '\n "which was not found in this repository."\n )\n\n return schedule\n\n def _validate_sensor(self, sensor: SensorDefinition) -> SensorDefinition:\n job_names = self.get_job_names()\n if len(sensor.targets) == 0:\n # skip validation when the sensor does not target a job\n return sensor\n\n for target in sensor.targets:\n if target.job_name not in job_names:\n raise DagsterInvalidDefinitionError(\n f'SensorDefinition "{sensor.name}" targets job "{sensor.job_name}" '\n "which was not 
found in this repository."\n )\n\n return sensor\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_data", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_data"}, "repository_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.repository_definition.repository_definition

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Type,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.asset_graph import AssetGraph, InternalAssetGraph\nfrom dagster._core.definitions.assets_job import (\n    ASSET_BASE_JOB_PREFIX,\n)\nfrom dagster._core.definitions.cacheable_assets import AssetsDefinitionCacheableData\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.executor_definition import ExecutorDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.definitions.metadata import MetadataMapping\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.schedule_definition import ScheduleDefinition\nfrom dagster._core.definitions.sensor_definition import SensorDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import check_valid_name\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import hash_collection\n\nfrom .repository_data import CachingRepositoryData, RepositoryData\nfrom .valid_definitions import (\n    RepositoryListDefinition as RepositoryListDefinition,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import AssetsDefinition\n    from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n    from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n\n@whitelist_for_serdes\nclass RepositoryLoadData(\n    NamedTuple(\n  
      "_RepositoryLoadData",\n        [\n            ("cached_data_by_key", Mapping[str, Sequence[AssetsDefinitionCacheableData]]),\n        ],\n    )\n):\n    def __new__(cls, cached_data_by_key: Mapping[str, Sequence[AssetsDefinitionCacheableData]]):\n        return super(RepositoryLoadData, cls).__new__(\n            cls,\n            cached_data_by_key=(\n                check.mapping_param(\n                    cached_data_by_key,\n                    "cached_data_by_key",\n                    key_type=str,\n                    value_type=list,\n                )\n            ),\n        )\n\n    # Allow this to be hashed for use in `lru_cache`. This is needed because:\n    # - `ReconstructableJob` uses `lru_cache`\n    # - `ReconstructableJob` has a `ReconstructableRepository` attribute\n    # - `ReconstructableRepository` has a `RepositoryLoadData` attribute\n    # - `RepositoryLoadData` has collection attributes that are unhashable by default\n    def __hash__(self) -> int:\n        if not hasattr(self, "_hash"):\n            self._hash = hash_collection(self)\n        return self._hash\n\n\n
[docs]class RepositoryDefinition:\n """Define a repository that contains a group of definitions.\n\n Users should typically not create objects of this class directly. Instead, use the\n :py:func:`@repository` decorator.\n\n Args:\n name (str): The name of the repository.\n repository_data (RepositoryData): Contains the definitions making up the repository.\n description (Optional[str]): A string description of the repository.\n metadata (Optional[MetadataMapping]): A map of arbitrary metadata for the repository.\n """\n\n def __init__(\n self,\n name,\n *,\n repository_data,\n description=None,\n metadata=None,\n repository_load_data=None,\n ):\n self._name = check_valid_name(name)\n self._description = check.opt_str_param(description, "description")\n self._repository_data: RepositoryData = check.inst_param(\n repository_data, "repository_data", RepositoryData\n )\n self._metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self._repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n @property\n def repository_load_data(self) -> Optional[RepositoryLoadData]:\n return self._repository_load_data\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the repository."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A human-readable description of the repository."""\n return self._description\n\n @public\n @property\n def metadata(self) -> Optional[MetadataMapping]:\n """Optional[MetadataMapping]: Arbitrary metadata for the repository."""\n return self._metadata\n\n def load_all_definitions(self):\n # force load of all lazy constructed code artifacts\n self._repository_data.load_all_definitions()\n\n @public\n @property\n def job_names(self) -> Sequence[str]:\n """List[str]: Names of all jobs in the repository."""\n return self._repository_data.get_job_names()\n\n def get_top_level_resources(self) -> 
Mapping[str, ResourceDefinition]:\n return self._repository_data.get_top_level_resources()\n\n def get_env_vars_by_top_level_resource(self) -> Mapping[str, AbstractSet[str]]:\n return self._repository_data.get_env_vars_by_top_level_resource()\n\n def get_resource_key_mapping(self) -> Mapping[int, str]:\n return self._repository_data.get_resource_key_mapping()\n\n
[docs] @public\n def has_job(self, name: str) -> bool:\n """Check if a job with a given name is present in the repository.\n\n Args:\n name (str): The name of the job.\n\n Returns:\n bool\n """\n return self._repository_data.has_job(name)
\n\n
[docs] @public\n def get_job(self, name: str) -> JobDefinition:\n """Get a job by name.\n\n If this job is present in the lazily evaluated dictionary passed to the\n constructor, but has not yet been constructed, only this job is constructed, and\n will be cached for future calls.\n\n Args:\n name (str): Name of the job to retrieve.\n\n Returns:\n JobDefinition: The job definition corresponding to\n the given name.\n """\n return self._repository_data.get_job(name)
\n\n
[docs] @public\n def get_all_jobs(self) -> Sequence[JobDefinition]:\n """Return all jobs in the repository as a list.\n\n Note that this will construct any job in the lazily evaluated dictionary that has\n not yet been constructed.\n\n Returns:\n List[JobDefinition]: All jobs in the repository.\n """\n return self._repository_data.get_all_jobs()
\n\n @public\n @property\n def schedule_defs(self) -> Sequence[ScheduleDefinition]:\n """List[ScheduleDefinition]: All schedules in the repository."""\n return self._repository_data.get_all_schedules()\n\n
[docs] @public\n def get_schedule_def(self, name: str) -> ScheduleDefinition:\n """Get a schedule definition by name.\n\n Args:\n name (str): The name of the schedule.\n\n Returns:\n ScheduleDefinition: The schedule definition.\n """\n return self._repository_data.get_schedule(name)
\n\n
[docs] @public\n def has_schedule_def(self, name: str) -> bool:\n """bool: Check if a schedule with a given name is present in the repository."""\n return self._repository_data.has_schedule(name)
\n\n @public\n @property\n def sensor_defs(self) -> Sequence[SensorDefinition]:\n """Sequence[SensorDefinition]: All sensors in the repository."""\n return self._repository_data.get_all_sensors()\n\n
[docs] @public\n def get_sensor_def(self, name: str) -> SensorDefinition:\n """Get a sensor definition by name.\n\n Args:\n name (str): The name of the sensor.\n\n Returns:\n SensorDefinition: The sensor definition.\n """\n return self._repository_data.get_sensor(name)
\n\n
[docs] @public\n def has_sensor_def(self, name: str) -> bool:\n """bool: Check if a sensor with a given name is present in the repository."""\n return self._repository_data.has_sensor(name)
\n\n @property\n def source_assets_by_key(self) -> Mapping[AssetKey, SourceAsset]:\n return self._repository_data.get_source_assets_by_key()\n\n @property\n def assets_defs_by_key(self) -> Mapping[AssetKey, "AssetsDefinition"]:\n return self._repository_data.get_assets_defs_by_key()\n\n def has_implicit_global_asset_job_def(self) -> bool:\n """Returns true is there is a single implicit asset job for all asset keys in a repository."""\n return self.has_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_global_asset_job_def(self) -> JobDefinition:\n """A useful conveninence method for repositories where there are a set of assets with\n the same partitioning schema and one wants to access their corresponding implicit job\n easily.\n """\n if not self.has_job(ASSET_BASE_JOB_PREFIX):\n raise DagsterInvariantViolationError(\n "There is no single global asset job, likely due to assets using "\n "different partitioning schemes via their partitions_def parameter. You must "\n "use get_implicit_job_def_for_assets in order to access the correct implicit job."\n )\n\n return self.get_job(ASSET_BASE_JOB_PREFIX)\n\n def get_implicit_asset_job_names(self) -> Sequence[str]:\n return [\n job_name for job_name in self.job_names if job_name.startswith(ASSET_BASE_JOB_PREFIX)\n ]\n\n def get_implicit_job_def_for_assets(\n self, asset_keys: Iterable[AssetKey]\n ) -> Optional[JobDefinition]:\n """Returns the asset base job that contains all the given assets, or None if there is no such\n job.\n """\n if self.has_job(ASSET_BASE_JOB_PREFIX):\n base_job = self.get_job(ASSET_BASE_JOB_PREFIX)\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for key in asset_keys\n ):\n return base_job\n else:\n i = 0\n while self.has_job(f"{ASSET_BASE_JOB_PREFIX}_{i}"):\n base_job = self.get_job(f"{ASSET_BASE_JOB_PREFIX}_{i}")\n\n if all(\n key in base_job.asset_layer.assets_defs_by_key\n or base_job.asset_layer.is_observable_for_asset(key)\n for 
key in asset_keys\n ):\n return base_job\n\n i += 1\n\n return None\n\n def get_maybe_subset_job_def(\n self,\n job_name: str,\n op_selection: Optional[Iterable[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n ):\n defn = self.get_job(job_name)\n return defn.get_subset(\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n )\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Any] = None,\n ) -> object:\n """Load the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n If you want to load the values of multiple assets, it's more efficient to use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`, which avoids spinning up\n resources separately for each asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n with AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n ) as loader:\n return loader.load_asset_value(\n asset_key,\n python_type=python_type,\n partition_key=partition_key,\n metadata=metadata,\n resource_config=resource_config,\n )
\n\n
[docs] @public\n def get_asset_value_loader(\n self, instance: Optional[DagsterInstance] = None\n ) -> "AssetValueLoader":\n """Returns an object that can load the contents of assets as Python objects.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the assets. Avoids\n spinning up resources separately for each asset.\n\n Usage:\n\n .. code-block:: python\n\n with my_repo.get_asset_value_loader() as loader:\n asset1 = loader.load_asset_value("asset1")\n asset2 = loader.load_asset_value("asset2")\n\n """\n from dagster._core.storage.asset_value_loader import AssetValueLoader\n\n return AssetValueLoader(\n self.assets_defs_by_key, self.source_assets_by_key, instance=instance\n )
\n\n @property\n def asset_graph(self) -> InternalAssetGraph:\n return AssetGraph.from_assets(\n [*set(self.assets_defs_by_key.values()), *self.source_assets_by_key.values()]\n )\n\n # If definition comes from the @repository decorator, then the __call__ method will be\n # overwritten. Therefore, we want to maintain the call-ability of repository definitions.\n def __call__(self, *args, **kwargs):\n return self
\n\n\nclass PendingRepositoryDefinition:\n def __init__(\n self,\n name: str,\n repository_definitions: Sequence[\n Union[RepositoryListDefinition, "CacheableAssetsDefinition"]\n ],\n description: Optional[str] = None,\n metadata: Optional[MetadataMapping] = None,\n default_logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n default_executor_def: Optional[ExecutorDefinition] = None,\n _top_level_resources: Optional[Mapping[str, ResourceDefinition]] = None,\n _resource_key_mapping: Optional[Mapping[int, str]] = None,\n ):\n self._repository_definitions = check.list_param(\n repository_definitions,\n "repository_definition",\n additional_message=(\n "PendingRepositoryDefinition supports only list-based repository data at this time."\n ),\n )\n self._name = name\n self._description = description\n self._metadata = metadata\n self._default_logger_defs = default_logger_defs\n self._default_executor_def = default_executor_def\n self._top_level_resources = _top_level_resources\n self._resource_key_mapping = _resource_key_mapping\n\n @property\n def name(self) -> str:\n return self._name\n\n def _compute_repository_load_data(self) -> RepositoryLoadData:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n return RepositoryLoadData(\n cached_data_by_key={\n defn.unique_id: defn.compute_cacheable_data()\n for defn in self._repository_definitions\n if isinstance(defn, CacheableAssetsDefinition)\n }\n )\n\n def _get_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n from dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\n\n resolved_definitions: List[RepositoryListDefinition] = []\n for defn in self._repository_definitions:\n if isinstance(defn, CacheableAssetsDefinition):\n # should always have metadata for each cached defn at this point\n check.invariant(\n defn.unique_id in repository_load_data.cached_data_by_key,\n "No metadata found for 
CacheableAssetsDefinition with unique_id"\n f" {defn.unique_id}.",\n )\n # use the emtadata to generate definitions\n resolved_definitions.extend(\n defn.build_definitions(\n data=repository_load_data.cached_data_by_key[defn.unique_id]\n )\n )\n else:\n resolved_definitions.append(defn)\n\n repository_data = CachingRepositoryData.from_list(\n resolved_definitions,\n default_executor_def=self._default_executor_def,\n default_logger_defs=self._default_logger_defs,\n top_level_resources=self._top_level_resources,\n resource_key_mapping=self._resource_key_mapping,\n )\n\n return RepositoryDefinition(\n self._name,\n repository_data=repository_data,\n description=self._description,\n metadata=self._metadata,\n repository_load_data=repository_load_data,\n )\n\n def reconstruct_repository_definition(\n self, repository_load_data: RepositoryLoadData\n ) -> RepositoryDefinition:\n """Use the provided RepositoryLoadData to construct and return a RepositoryDefinition."""\n check.inst_param(repository_load_data, "repository_load_data", RepositoryLoadData)\n return self._get_repository_definition(repository_load_data)\n\n def compute_repository_definition(self) -> RepositoryDefinition:\n """Compute the required RepositoryLoadData and use it to construct and return a RepositoryDefinition."""\n repository_load_data = self._compute_repository_load_data()\n return self._get_repository_definition(repository_load_data)\n
", "current_page_name": "_modules/dagster/_core/definitions/repository_definition/repository_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.repository_definition.repository_definition"}}, "resource_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.resource_definition

\nfrom functools import update_wrapper\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Union,\n    cast,\n    overload,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param, public\nfrom dagster._core.decorator_utils import format_docstring_for_description\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.configurable import AnonymousConfigurableDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._utils import IHasInternalInit\n\nfrom ..decorator_utils import (\n    get_function_params,\n    has_at_least_one_parameter,\n    is_required_param,\n    positional_arg_name_list,\n    validate_expected_params,\n)\nfrom .definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom .resource_invocation import resource_invocation_result\nfrom .resource_requirement import (\n    RequiresResources,\n    ResourceDependencyRequirement,\n    ResourceRequirement,\n)\nfrom .scoped_resources_builder import (  # re-exported\n    IContainsGenerator as IContainsGenerator,\n    Resources as Resources,\n    ScopedResourcesBuilder as ScopedResourcesBuilder,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.resources_init import InitResourceContext\n\nResourceFunctionWithContext: TypeAlias = Callable[["InitResourceContext"], Any]\nResourceFunctionWithoutContext: TypeAlias = Callable[[], Any]\nResourceFunction: TypeAlias = Union[\n    ResourceFunctionWithContext,\n    ResourceFunctionWithoutContext,\n]\n\n\n
[docs]@experimental_param(param="version")\nclass ResourceDefinition(AnonymousConfigurableDefinition, RequiresResources, IHasInternalInit):\n """Core class for defining resources.\n\n Resources are scoped ways to make external resources (like database connections) available to\n ops and assets during job execution and to clean up after execution resolves.\n\n If resource_fn yields once rather than returning (in the manner of functions decorable with\n :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then the body of the\n function after the yield will be run after execution resolves, allowing users to write their\n own teardown/cleanup logic.\n\n Depending on your executor, resources may be instantiated and cleaned up more than once in a\n job execution.\n\n Args:\n resource_fn (Callable[[InitResourceContext], Any]): User-provided function to instantiate\n the resource, which will be made available to executions keyed on the\n ``context.resources`` object.\n config_schema (Optional[ConfigSchema): The schema for the config. If set, Dagster will check\n that config provided for the resource matches this schema and fail if it does not. If\n not set, Dagster will accept any config provided for the resource.\n description (Optional[str]): A human-readable description of the resource.\n required_resource_keys: (Optional[Set[str]]) Keys for the resources required by this\n resource. A DagsterInvariantViolationError will be raised during initialization if\n dependencies are cyclic.\n version (Optional[str]): (Experimental) The version of the resource's definition fn. 
Two\n wrapped resource functions should only have the same version if they produce the same\n resource definition when provided with the same inputs.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._resource_fn = check.callable_param(resource_fn, "resource_fn")\n self._config_schema = convert_user_facing_definition_config_schema(config_schema)\n self._description = check.opt_str_param(description, "description")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n self._version = check.opt_str_param(version, "version")\n\n # this attribute will be updated by the @dagster_maintained_resource and @dagster_maintained_io_manager decorators\n self._dagster_maintained = False\n\n @staticmethod\n def dagster_internal_init(\n *,\n resource_fn: ResourceFunction,\n config_schema: CoercableToConfigSchema,\n description: Optional[str],\n required_resource_keys: Optional[AbstractSet[str]],\n version: Optional[str],\n ) -> "ResourceDefinition":\n return ResourceDefinition(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def resource_fn(self) -> ResourceFunction:\n return self._resource_fn\n\n @property\n def config_schema(self) -> IDefinitionConfigSchema:\n return self._config_schema\n\n @public\n @property\n def description(self) -> Optional[str]:\n """A human-readable description of the resource."""\n return self._description\n\n @public\n @property\n def version(self) -> Optional[str]:\n """A string which can be used to identify a particular code version of a resource definition."""\n return self._version\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """A 
set of the resource keys that this resource depends on. These keys will be made available\n to the resource's init context during execution, and the resource will not be instantiated\n until all required resources are available.\n """\n return self._required_resource_keys\n\n def _is_dagster_maintained(self) -> bool:\n return self._dagster_maintained\n\n
[docs] @public\n @staticmethod\n def none_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that returns a none resource.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that does nothing.\n """\n return ResourceDefinition.hardcoded_resource(value=None, description=description)
\n\n
[docs] @public\n @staticmethod\n def hardcoded_resource(value: Any, description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` with a hardcoded object.\n\n Args:\n value (Any): The value that will be accessible via context.resources.resource_name.\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A hardcoded resource.\n """\n return ResourceDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n
[docs] @public\n @staticmethod\n def mock_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """A helper function that creates a ``ResourceDefinition`` which wraps a ``mock.MagicMock``.\n\n Args:\n description ([Optional[str]]): The description of the resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that creates the magic methods automatically and helps\n you mock existing resources.\n """\n from unittest import mock\n\n return ResourceDefinition(\n resource_fn=lambda _init_context: mock.MagicMock(), description=description\n )
\n\n
[docs] @public\n @staticmethod\n def string_resource(description: Optional[str] = None) -> "ResourceDefinition":\n """Creates a ``ResourceDefinition`` which takes in a single string as configuration\n and returns this configured string to any ops or assets which depend on it.\n\n Args:\n description ([Optional[str]]): The description of the string resource. Defaults to None.\n\n Returns:\n [ResourceDefinition]: A resource that takes in a single string as configuration and\n returns that string.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=str,\n description=description,\n )
\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "ResourceDefinition":\n resource_def = ResourceDefinition.dagster_internal_init(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n )\n\n resource_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return resource_def\n\n def __call__(self, *args, **kwargs):\n from dagster._core.execution.context.init import UnboundInitResourceContext\n\n if has_at_least_one_parameter(self.resource_fn):\n if len(args) + len(kwargs) == 0:\n raise DagsterInvalidInvocationError(\n "Resource initialization function has context argument, but no context was"\n " provided when invoking."\n )\n if len(args) + len(kwargs) > 1:\n raise DagsterInvalidInvocationError(\n "Initialization of resource received multiple arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n context_param_name = get_function_params(self.resource_fn)[0].name\n\n if args:\n check.opt_inst_param(args[0], context_param_name, UnboundInitResourceContext)\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], args[0])\n )\n else:\n if context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Resource initialization expected argument '{context_param_name}'."\n )\n check.opt_inst_param(\n kwargs[context_param_name], context_param_name, UnboundInitResourceContext\n )\n\n return resource_invocation_result(\n self, cast(Optional[UnboundInitResourceContext], kwargs[context_param_name])\n )\n elif len(args) + len(kwargs) > 0:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke resource with argument, but underlying function has no context"\n " argument. 
Either specify a context argument on the resource function, or remove"\n " the passed-in argument."\n )\n else:\n return resource_invocation_result(self, None)\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n source_key = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys)):\n yield ResourceDependencyRequirement(key=resource_key, source_key=source_key)
\n\n\ndef dagster_maintained_resource(\n resource_def: ResourceDefinition,\n) -> ResourceDefinition:\n resource_def._dagster_maintained = True # noqa: SLF001\n return resource_def\n\n\nclass _ResourceDecoratorCallable:\n def __init__(\n self,\n config_schema: Optional[Mapping[str, Any]] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self.config_schema = config_schema # checked by underlying definition\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys"\n )\n\n def __call__(self, resource_fn: ResourceFunction) -> ResourceDefinition:\n check.callable_param(resource_fn, "resource_fn")\n\n any_name = ["*"] if has_at_least_one_parameter(resource_fn) else []\n\n params = get_function_params(resource_fn)\n\n missing_positional = validate_expected_params(params, any_name)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects a single "\n "positional argument."\n )\n\n extras = params[len(any_name) :]\n\n required_extras = list(filter(is_required_param, extras))\n if required_extras:\n raise DagsterInvalidDefinitionError(\n f"@resource decorated function '{resource_fn.__name__}' expects only a single"\n " positional required argument. 
Got required extra params"\n f" {', '.join(positional_arg_name_list(required_extras))}"\n )\n\n resource_def = ResourceDefinition.dagster_internal_init(\n resource_fn=resource_fn,\n config_schema=self.config_schema,\n description=self.description or format_docstring_for_description(resource_fn),\n version=self.version,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(resource_def, wrapped=resource_fn) # type: ignore\n\n return resource_def\n\n\n@overload\ndef resource(config_schema: ResourceFunction) -> ResourceDefinition: ...\n\n\n@overload\ndef resource(\n config_schema: CoercableToConfigSchema = ...,\n description: Optional[str] = ...,\n required_resource_keys: Optional[AbstractSet[str]] = ...,\n version: Optional[str] = ...,\n) -> Callable[[ResourceFunction], "ResourceDefinition"]: ...\n\n\n
[docs]def resource(\n config_schema: Union[ResourceFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[Callable[[ResourceFunction], "ResourceDefinition"], "ResourceDefinition"]:\n """Define a resource.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an instance of\n the resource. This function will become the ``resource_fn`` of an underlying\n :py:class:`ResourceDefinition`.\n\n If the decorated function yields once rather than returning (in the manner of functions\n decorable with :py:func:`@contextlib.contextmanager <python:contextlib.contextmanager>`) then\n the body of the function after the yield will be run after execution resolves, allowing users\n to write their own teardown/cleanup logic.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the config. Configuration data available in\n `init_context.resource_config`. If not set, Dagster will accept any config provided.\n description(Optional[str]): A human-readable description of the resource.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by this resource.\n """\n # This case is for when decorator is used bare, without arguments.\n # E.g. 
@resource versus @resource()\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n return _ResourceDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: ResourceFunction) -> "ResourceDefinition":\n return _ResourceDecoratorCallable(\n config_schema=cast(Optional[Dict[str, Any]], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )(resource_fn)\n\n return _wrap
\n\n\n
[docs]def make_values_resource(**kwargs: Any) -> ResourceDefinition:\n """A helper function that creates a ``ResourceDefinition`` to take in user-defined values.\n\n This is useful for sharing values between ops.\n\n Args:\n **kwargs: Arbitrary keyword arguments that will be passed to the config schema of the\n returned resource definition. If not set, Dagster will accept any config provided for\n the resource.\n\n For example:\n\n .. code-block:: python\n\n @op(required_resource_keys={"globals"})\n def my_op(context):\n print(context.resources.globals["my_str_var"])\n\n @job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\n def my_job():\n my_op()\n\n Returns:\n ResourceDefinition: A resource that passes in user-defined values.\n """\n return ResourceDefinition(\n resource_fn=lambda init_context: init_context.resource_config,\n config_schema=kwargs or Any,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/resource_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.resource_definition"}, "result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.result

\nfrom typing import NamedTuple, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental\nfrom dagster._core.definitions.asset_check_result import AssetCheckResult\nfrom dagster._core.definitions.data_version import DataVersion\n\nfrom .events import (\n    AssetKey,\n    CoercibleToAssetKey,\n)\nfrom .metadata import MetadataUserInput\n\n\n
[docs]@experimental\nclass MaterializeResult(\n NamedTuple(\n "_MaterializeResult",\n [\n ("asset_key", PublicAttr[Optional[AssetKey]]),\n ("metadata", PublicAttr[Optional[MetadataUserInput]]),\n ("check_results", PublicAttr[Optional[Sequence[AssetCheckResult]]]),\n ("data_version", PublicAttr[Optional[DataVersion]]),\n ],\n )\n):\n """An object representing a successful materialization of an asset. These can be returned from\n @asset and @multi_asset decorated functions to pass metadata or specify specific assets were\n materialized.\n\n Attributes:\n asset_key (Optional[AssetKey]): Optional in @asset, required in @multi_asset to discern which asset this refers to.\n metadata (Optional[MetadataUserInput]): Metadata to record with the corresponding AssetMaterialization event.\n """\n\n def __new__(\n cls,\n *, # enforce kwargs\n asset_key: Optional[CoercibleToAssetKey] = None,\n metadata: Optional[MetadataUserInput] = None,\n check_results: Optional[Sequence[AssetCheckResult]] = None,\n data_version: Optional[DataVersion] = None,\n ):\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n\n return super().__new__(\n cls,\n asset_key=asset_key,\n metadata=check.opt_nullable_mapping_param(\n metadata,\n "metadata",\n key_type=str,\n ),\n check_results=check.opt_nullable_sequence_param(\n check_results, "check_results", of_type=AssetCheckResult\n ),\n data_version=check.opt_inst_param(data_version, "data_version", DataVersion),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.result"}, "run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_config

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nfrom dagster._config import (\n    ALL_CONFIG_BUILTINS,\n    ConfigType,\n    Field,\n    Permissive,\n    Selector,\n    Shape,\n)\nfrom dagster._config.pythonic_config import Config\nfrom dagster._core.definitions.asset_layer import AssetLayer\nfrom dagster._core.definitions.executor_definition import (\n    ExecutorDefinition,\n    execute_in_process_executor,\n    in_process_executor,\n)\nfrom dagster._core.definitions.input import InputDefinition\nfrom dagster._core.definitions.output import OutputDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.storage.input_manager import IInputManagerDefinition\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition\nfrom dagster._core.types.dagster_type import ALL_RUNTIME_BUILTINS, construct_dagster_type_dictionary\nfrom dagster._utils import check\n\nfrom .configurable import ConfigurableDefinition\nfrom .definition_config_schema import IDefinitionConfigSchema\nfrom .dependency import DependencyStructure, GraphNode, Node, NodeHandle, NodeInput, OpNode\nfrom .graph_definition import GraphDefinition\nfrom .logger_definition import LoggerDefinition\nfrom .op_definition import NodeDefinition, OpDefinition\nfrom .resource_definition import ResourceDefinition\n\nif TYPE_CHECKING:\n    from .source_asset import SourceAsset\n\n\ndef define_resource_dictionary_cls(\n    resource_defs: Mapping[str, ResourceDefinition],\n    required_resources: AbstractSet[str],\n) -> Shape:\n    fields = {}\n    for resource_name, resource_def in resource_defs.items():\n        if resource_def.config_schema:\n            is_required = None\n            if resource_name not in required_resources:\n                # explicitly make 
section not required if resource is not required\n                # for the current mode\n                is_required = False\n\n            fields[resource_name] = def_config_field(\n                resource_def,\n                is_required=is_required,\n                description=resource_def.description,\n            )\n\n    return Shape(fields=fields)\n\n\ndef remove_none_entries(ddict: Mapping[Any, Any]) -> dict:\n    return {k: v for k, v in ddict.items() if v is not None}\n\n\ndef def_config_field(\n    configurable_def: ConfigurableDefinition,\n    is_required: Optional[bool] = None,\n    description: Optional[str] = None,\n) -> Field:\n    return Field(\n        Shape(\n            {"config": configurable_def.config_field} if configurable_def.has_config_field else {}\n        ),\n        is_required=is_required,\n        description=description,\n    )\n\n\nclass RunConfigSchemaCreationData(NamedTuple):\n    job_name: str\n    nodes: Sequence[Node]\n    graph_def: GraphDefinition\n    dependency_structure: DependencyStructure\n    executor_def: ExecutorDefinition\n    resource_defs: Mapping[str, ResourceDefinition]\n    logger_defs: Mapping[str, LoggerDefinition]\n    ignored_nodes: Sequence[Node]\n    required_resources: AbstractSet[str]\n    direct_inputs: Mapping[str, Any]\n    asset_layer: AssetLayer\n\n\ndef define_logger_dictionary_cls(creation_data: RunConfigSchemaCreationData) -> Shape:\n    return Shape(\n        {\n            logger_name: def_config_field(logger_definition, is_required=False)\n            for logger_name, logger_definition in creation_data.logger_defs.items()\n        }\n    )\n\n\ndef define_execution_field(executor_defs: Sequence[ExecutorDefinition], description: str) -> Field:\n    default_in_process = False\n    for executor_def in executor_defs:\n        if executor_def == in_process_executor:\n            default_in_process = True\n\n    selector = selector_for_named_defs(executor_defs)\n\n    if default_in_process:\n   
     return Field(\n            selector, default_value={in_process_executor.name: {}}, description=description\n        )\n\n    # If we are using the execute_in_process executor, then ignore all executor config.\n    if len(executor_defs) == 1 and executor_defs[0] == execute_in_process_executor:\n        return Field(Permissive(), is_required=False, default_value={}, description=description)\n\n    return Field(selector, description=description)\n\n\ndef define_single_execution_field(executor_def: ExecutorDefinition, description: str) -> Field:\n    return def_config_field(executor_def, description=description)\n\n\ndef define_run_config_schema_type(creation_data: RunConfigSchemaCreationData) -> ConfigType:\n    execution_field = define_single_execution_field(\n        creation_data.executor_def,\n        "Configure how steps are executed within a run.",\n    )\n\n    top_level_node = GraphNode(\n        name=creation_data.graph_def.name,\n        definition=creation_data.graph_def,\n        graph_definition=creation_data.graph_def,\n    )\n\n    fields = {\n        "execution": execution_field,\n        "loggers": Field(\n            define_logger_dictionary_cls(creation_data),\n            description="Configure how loggers emit messages within a run.",\n        ),\n        "resources": Field(\n            define_resource_dictionary_cls(\n                creation_data.resource_defs,\n                creation_data.required_resources,\n            ),\n            description="Configure how shared resources are implemented within a run.",\n        ),\n        "inputs": get_inputs_field(\n            node=top_level_node,\n            handle=NodeHandle(top_level_node.name, parent=None),\n            dependency_structure=creation_data.dependency_structure,\n            resource_defs=creation_data.resource_defs,\n            node_ignored=False,\n            direct_inputs=creation_data.direct_inputs,\n            input_source_assets={},\n            
asset_layer=creation_data.asset_layer,\n        ),\n    }\n\n    if creation_data.graph_def.has_config_mapping:\n        config_schema = cast(IDefinitionConfigSchema, creation_data.graph_def.config_schema)\n        nodes_field = Field(\n            {"config": config_schema.as_field()},\n            description="Configure runtime parameters for ops or assets.",\n        )\n    else:\n        nodes_field = Field(\n            define_node_shape(\n                nodes=creation_data.nodes,\n                ignored_nodes=creation_data.ignored_nodes,\n                dependency_structure=creation_data.dependency_structure,\n                resource_defs=creation_data.resource_defs,\n                asset_layer=creation_data.asset_layer,\n                node_input_source_assets=creation_data.graph_def.node_input_source_assets,\n            ),\n            description="Configure runtime parameters for ops or assets.",\n        )\n\n    fields["ops"] = nodes_field\n\n    return Shape(\n        fields=remove_none_entries(fields),\n    )\n\n\n# Common pattern for a set of named definitions (e.g. 
executors)\n# to build a selector so that one of them is selected\ndef selector_for_named_defs(named_defs) -> Selector:\n    return Selector({named_def.name: def_config_field(named_def) for named_def in named_defs})\n\n\ndef get_inputs_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    node_ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n    direct_inputs: Optional[Mapping[str, Any]] = None,\n) -> Optional[Field]:\n    direct_inputs = check.opt_mapping_param(direct_inputs, "direct_inputs")\n    inputs_field_fields = {}\n    for name, inp in node.definition.input_dict.items():\n        inp_handle = NodeInput(node, inp)\n        has_upstream = input_has_upstream(dependency_structure, inp_handle, node, name)\n        if inp.input_manager_key:\n            input_field = get_input_manager_input_field(node, inp, resource_defs)\n        elif (\n            # if you have asset definitions, input will be loaded from the source asset\n            asset_layer.has_assets_defs\n            or asset_layer.has_asset_check_defs\n            and asset_layer.asset_key_for_input(handle, name)\n            and not has_upstream\n        ):\n            input_field = None\n        elif name in direct_inputs and not has_upstream:\n            input_field = None\n        elif name in input_source_assets and not has_upstream:\n            input_field = None\n        elif inp.dagster_type.loader and not has_upstream:\n            input_field = get_type_loader_input_field(node, name, inp)\n        else:\n            input_field = None\n\n        if input_field:\n            inputs_field_fields[name] = input_field\n\n    if not inputs_field_fields:\n        return None\n    if node_ignored:\n        return Field(\n            Shape(inputs_field_fields),\n            is_required=False,\n            description=(\n                "This op 
is not present in the current op selection, "\n                "the input config values are allowed but ignored."\n            ),\n        )\n    else:\n        return Field(Shape(inputs_field_fields))\n\n\ndef input_has_upstream(\n    dependency_structure: DependencyStructure,\n    input_handle: NodeInput,\n    node: Node,\n    input_name: str,\n) -> bool:\n    return dependency_structure.has_deps(input_handle) or node.container_maps_input(input_name)\n\n\ndef get_input_manager_input_field(\n    node: Node,\n    input_def: InputDefinition,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    if input_def.input_manager_key:\n        if input_def.input_manager_key not in resource_defs:\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key"\n                f" '{input_def.input_manager_key}', but no resource has been provided. Please"\n                " include a resource definition for that key in the provided resource_defs."\n            )\n\n        input_manager = resource_defs[input_def.input_manager_key]\n        if not isinstance(input_manager, IInputManagerDefinition):\n            raise DagsterInvalidDefinitionError(\n                f"Input '{input_def.name}' for {node.describe_node()} requires input_manager_key "\n                f"'{input_def.input_manager_key}', but the resource definition provided is not an "\n                "IInputManagerDefinition"\n            )\n\n        input_config_schema = input_manager.input_config_schema\n        if input_config_schema:\n            return input_config_schema.as_field()\n        return None\n\n    return None\n\n\ndef get_type_loader_input_field(node: Node, input_name: str, input_def: InputDefinition) -> Field:\n    loader = check.not_none(input_def.dagster_type.loader)\n    return Field(\n        loader.schema_type,\n        is_required=(not 
node.definition.input_has_default(input_name)),\n    )\n\n\ndef get_outputs_field(\n    node: Node,\n    resource_defs: Mapping[str, ResourceDefinition],\n) -> Optional[Field]:\n    output_manager_fields = {}\n    for name, output_def in node.definition.output_dict.items():\n        output_manager_output_field = get_output_manager_output_field(\n            node, output_def, resource_defs\n        )\n        if output_manager_output_field:\n            output_manager_fields[name] = output_manager_output_field\n\n    return Field(Shape(output_manager_fields)) if output_manager_fields else None\n\n\ndef get_output_manager_output_field(\n    node: Node, output_def: OutputDefinition, resource_defs: Mapping[str, ResourceDefinition]\n) -> Optional[ConfigType]:\n    if output_def.io_manager_key not in resource_defs:\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but no resource has been provided. 
Please include a '\n            "resource definition for that key in the provided resource_defs."\n        )\n    if not isinstance(resource_defs[output_def.io_manager_key], IOutputManagerDefinition):\n        raise DagsterInvalidDefinitionError(\n            f'Output "{output_def.name}" for {node.describe_node()} requires io_manager_key '\n            f'"{output_def.io_manager_key}", but the resource definition provided is not an '\n            "IOutputManagerDefinition"\n        )\n    output_manager_def = resource_defs[output_def.io_manager_key]\n    if (\n        output_manager_def\n        and isinstance(output_manager_def, IOutputManagerDefinition)\n        and output_manager_def.output_config_schema\n    ):\n        return output_manager_def.output_config_schema.as_field()\n\n    return None\n\n\ndef node_config_field(fields: Mapping[str, Optional[Field]], ignored: bool) -> Optional[Field]:\n    trimmed_fields = remove_none_entries(fields)\n    if trimmed_fields:\n        if ignored:\n            return Field(\n                Shape(trimmed_fields),\n                is_required=False,\n                description=(\n                    "This op is not present in the current op selection, "\n                    "the config values are allowed but ignored."\n                ),\n            )\n        else:\n            return Field(Shape(trimmed_fields))\n    else:\n        return None\n\n\ndef construct_leaf_node_config(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    config_schema: Optional[IDefinitionConfigSchema],\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    return node_config_field(\n        {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                
ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "config": config_schema.as_field() if config_schema else None,\n        },\n        ignored=ignored,\n    )\n\n\ndef define_node_field(\n    node: Node,\n    handle: NodeHandle,\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    ignored: bool,\n    asset_layer: AssetLayer,\n    input_source_assets: Mapping[str, "SourceAsset"],\n) -> Optional[Field]:\n    # All nodes regardless of compositing status get the same inputs and outputs\n    # config. The only thing the varies is on extra element of configuration\n    # 1) Vanilla op definition: a 'config' key with the config_schema as the value\n    # 2) Graph with field mapping: a 'config' key with the config_schema of\n    #    the config mapping (via GraphDefinition#config_schema)\n    # 3) Graph without field mapping: an 'ops' key with recursively defined\n    #    ops dictionary\n    # 4) `configured` graph with field mapping: a 'config' key with the config_schema that was\n    #    provided when `configured` was called (via GraphDefinition#config_schema)\n\n    assert isinstance(node, (OpNode, GraphNode)), f"Invalid node type: {type(node)}"\n\n    if isinstance(node, OpNode):\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            node.definition.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n\n    graph_def = node.definition\n\n    if graph_def.has_config_mapping:\n        # has_config_mapping covers cases 2 & 4 from above (only config mapped graphs can\n        # be `configured`)...\n        return construct_leaf_node_config(\n            node,\n            handle,\n            dependency_structure,\n            # ...and in both 
cases, the correct schema for 'config' key is exposed by this property:\n            graph_def.config_schema,\n            resource_defs,\n            ignored,\n            asset_layer,\n            input_source_assets,\n        )\n        # This case omits an 'ops' key, thus if a graph is `configured` or has a field\n        # mapping, the user cannot stub any config, inputs, or outputs for inner (child) nodes.\n    else:\n        fields = {\n            "inputs": get_inputs_field(\n                node,\n                handle,\n                dependency_structure,\n                resource_defs,\n                ignored,\n                asset_layer,\n                input_source_assets,\n            ),\n            "outputs": get_outputs_field(node, resource_defs),\n            "ops": Field(\n                define_node_shape(\n                    nodes=graph_def.nodes,\n                    ignored_nodes=None,\n                    dependency_structure=graph_def.dependency_structure,\n                    parent_handle=handle,\n                    resource_defs=resource_defs,\n                    asset_layer=asset_layer,\n                    node_input_source_assets=graph_def.node_input_source_assets,\n                )\n            ),\n        }\n\n        return node_config_field(fields, ignored=ignored)\n\n\ndef define_node_shape(\n    nodes: Sequence[Node],\n    ignored_nodes: Optional[Sequence[Node]],\n    dependency_structure: DependencyStructure,\n    resource_defs: Mapping[str, ResourceDefinition],\n    asset_layer: AssetLayer,\n    node_input_source_assets: Mapping[str, Mapping[str, "SourceAsset"]],\n    parent_handle: Optional[NodeHandle] = None,\n) -> Shape:\n    """Examples of what this method is used to generate the schema for:\n    1.\n        inputs: ...\n        ops:\n      >    op1: ...\n      >    op2: ...\n\n    2.\n        inputs:\n        ops:\n          graph1: ...\n            inputs: ...\n            ops:\n      >       op1: ...\n      >  
     inner_graph: ...\n\n\n    """\n    ignored_nodes = check.opt_sequence_param(ignored_nodes, "ignored_nodes", of_type=Node)\n\n    fields = {}\n    for node in nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=False,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n\n        if node_field:\n            fields[node.name] = node_field\n\n    for node in ignored_nodes:\n        node_field = define_node_field(\n            node,\n            NodeHandle(node.name, parent_handle),\n            dependency_structure,\n            resource_defs,\n            ignored=True,\n            asset_layer=asset_layer,\n            input_source_assets=node_input_source_assets.get(node.name, {}),\n        )\n        if node_field:\n            fields[node.name] = node_field\n\n    return Shape(fields)\n\n\ndef iterate_node_def_config_types(node_def: NodeDefinition) -> Iterator[ConfigType]:\n    if isinstance(node_def, OpDefinition):\n        if node_def.has_config_field:\n            yield from node_def.get_config_field().config_type.type_iterator()\n    elif isinstance(node_def, GraphDefinition):\n        for node in node_def.nodes:\n            yield from iterate_node_def_config_types(node.definition)\n\n    else:\n        check.invariant(f"Unexpected NodeDefinition type {type(node_def)}")\n\n\ndef _gather_all_schemas(node_defs: Sequence[NodeDefinition]) -> Iterator[ConfigType]:\n    dagster_types = construct_dagster_type_dictionary(node_defs)\n    for dagster_type in list(dagster_types.values()) + list(ALL_RUNTIME_BUILTINS):\n        if dagster_type.loader:\n            yield from dagster_type.loader.schema_type.type_iterator()\n\n\ndef _gather_all_config_types(\n    node_defs: Sequence[NodeDefinition], run_config_schema_type: ConfigType\n) -> 
Iterator[ConfigType]:\n    for node_def in node_defs:\n        yield from iterate_node_def_config_types(node_def)\n\n    yield from run_config_schema_type.type_iterator()\n\n\ndef construct_config_type_dictionary(\n    node_defs: Sequence[NodeDefinition],\n    run_config_schema_type: ConfigType,\n) -> Tuple[Mapping[str, ConfigType], Mapping[str, ConfigType]]:\n    type_dict_by_name = {t.given_name: t for t in ALL_CONFIG_BUILTINS if t.given_name}\n    type_dict_by_key = {t.key: t for t in ALL_CONFIG_BUILTINS}\n    all_types = list(_gather_all_config_types(node_defs, run_config_schema_type)) + list(\n        _gather_all_schemas(node_defs)\n    )\n\n    for config_type in all_types:\n        name = config_type.given_name\n        if name and name in type_dict_by_name:\n            if type(config_type) is not type(type_dict_by_name[name]):\n                raise DagsterInvalidDefinitionError(\n                    "Type names must be unique. You have constructed two different "\n                    f'instances of types with the same name "{name}".'\n                )\n        elif name:\n            type_dict_by_name[name] = config_type\n\n        type_dict_by_key[config_type.key] = config_type\n\n    return type_dict_by_name, type_dict_by_key\n\n\ndef _convert_config_classes_inner(configs: Any) -> Any:\n    if not isinstance(configs, dict):\n        return configs\n\n    return {\n        k: (\n            {"config": v._convert_to_config_dictionary()}  # noqa: SLF001\n            if isinstance(v, Config)\n            else _convert_config_classes_inner(v)\n        )\n        for k, v in configs.items()\n    }\n\n\ndef _convert_config_classes(configs: Dict[str, Any]) -> Dict[str, Any]:\n    return _convert_config_classes_inner(configs)\n\n\n
[docs]class RunConfig:\n """Container for all the configuration that can be passed to a run. Accepts Pythonic definitions\n for op and asset config and resources and converts them under the hood to the appropriate config dictionaries.\n\n Example usage:\n\n .. code-block:: python\n\n class MyAssetConfig(Config):\n a_str: str\n\n @asset\n def my_asset(config: MyAssetConfig):\n assert config.a_str == "foo"\n\n materialize(\n [my_asset],\n run_config=RunConfig(\n ops={"my_asset": MyAssetConfig(a_str="foo")}\n )\n )\n\n """\n\n def __init__(\n self,\n ops: Optional[Dict[str, Any]] = None,\n resources: Optional[Dict[str, Any]] = None,\n loggers: Optional[Dict[str, Any]] = None,\n execution: Optional[Dict[str, Any]] = None,\n ):\n self.ops = check.opt_dict_param(ops, "ops")\n self.resources = check.opt_dict_param(resources, "resources")\n self.loggers = check.opt_dict_param(loggers, "loggers")\n self.execution = check.opt_dict_param(execution, "execution")\n\n def to_config_dict(self):\n return {\n "loggers": self.loggers,\n "resources": _convert_config_classes(self.resources),\n "ops": _convert_config_classes(self.ops),\n "execution": self.execution,\n }
\n\n\nCoercibleToRunConfig: TypeAlias = Union[Dict[str, Any], RunConfig]\n\nT = TypeVar("T")\n\n\ndef convert_config_input(inp: Union[CoercibleToRunConfig, T]) -> Union[T, Mapping[str, Any]]:\n if isinstance(inp, RunConfig):\n return inp.to_config_dict()\n else:\n return inp\n
", "current_page_name": "_modules/dagster/_core/definitions/run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_config"}, "run_request": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_request

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import PARTITION_NAME_TAG\nfrom dagster._serdes.serdes import whitelist_for_serdes\nfrom dagster._utils.error import SerializableErrorInfo\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.run_config import RunConfig\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\n@whitelist_for_serdes(old_storage_names={"JobType"})\nclass InstigatorType(Enum):\n    SCHEDULE = "SCHEDULE"\n    SENSOR = "SENSOR"\n    AUTO_MATERIALIZE = "AUTO_MATERIALIZE"\n\n\n
[docs]@whitelist_for_serdes\nclass SkipReason(NamedTuple("_SkipReason", [("skip_message", PublicAttr[Optional[str]])])):\n """Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\n why no runs were requested.\n\n Attributes:\n skip_message (Optional[str]): A message displayed in the Dagster UI for why this evaluation resulted\n in no requested runs.\n """\n\n def __new__(cls, skip_message: Optional[str] = None):\n return super(SkipReason, cls).__new__(\n cls,\n skip_message=check.opt_str_param(skip_message, "skip_message"),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass AddDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to add partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(AddDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass DeleteDynamicPartitionsRequest(\n NamedTuple(\n "_AddDynamicPartitionsRequest",\n [\n ("partitions_def_name", str),\n ("partition_keys", Sequence[str]),\n ],\n )\n):\n """A request to delete partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule."""\n\n def __new__(\n cls,\n partitions_def_name: str,\n partition_keys: Sequence[str],\n ):\n return super(DeleteDynamicPartitionsRequest, cls).__new__(\n cls,\n partitions_def_name=check.str_param(partitions_def_name, "partitions_def_name"),\n partition_keys=check.list_param(partition_keys, "partition_keys", of_type=str),\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RunRequest(\n NamedTuple(\n "_RunRequest",\n [\n ("run_key", PublicAttr[Optional[str]]),\n ("run_config", PublicAttr[Mapping[str, Any]]),\n ("tags", PublicAttr[Mapping[str, str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("asset_selection", PublicAttr[Optional[Sequence[AssetKey]]]),\n ("stale_assets_only", PublicAttr[bool]),\n ("partition_key", PublicAttr[Optional[str]]),\n ],\n )\n):\n """Represents all the information required to launch a single run. Must be returned by a\n SensorDefinition or ScheduleDefinition's evaluation function for a run to be launched.\n\n Attributes:\n run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n only one run is created per run key across all sensor evaluations. For schedules,\n ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n value means that a run will always be launched per evaluation.\n run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n a :py:class:`PartitionedConfig`, this value will override replace the config\n provided by it.\n tags (Optional[Dict[str, Any]]): A dictionary of tags (string key-value pairs) to attach\n to the launched run.\n job_name (Optional[str]): (Experimental) The name of the job this run request will launch.\n Required for sensors that target multiple jobs.\n asset_selection (Optional[Sequence[AssetKey]]): A sequence of AssetKeys that should be\n launched with this run.\n stale_assets_only (bool): Set to true to further narrow the asset\n selection to stale assets. If passed without an asset selection, all stale assets in the\n job will be materialized. 
If the job does not materialize assets, this flag is ignored.\n partition_key (Optional[str]): The partition key for this run request.\n """\n\n def __new__(\n cls,\n run_key: Optional[str] = None,\n run_config: Optional[Union["RunConfig", Mapping[str, Any]]] = None,\n tags: Optional[Mapping[str, Any]] = None,\n job_name: Optional[str] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n stale_assets_only: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.run_config import convert_config_input\n\n return super(RunRequest, cls).__new__(\n cls,\n run_key=check.opt_str_param(run_key, "run_key"),\n run_config=check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n ),\n tags=validate_tags(check.opt_mapping_param(tags, "tags", key_type=str)),\n job_name=check.opt_str_param(job_name, "job_name"),\n asset_selection=check.opt_nullable_sequence_param(\n asset_selection, "asset_selection", of_type=AssetKey\n ),\n stale_assets_only=check.bool_param(stale_assets_only, "stale_assets_only"),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n )\n\n def with_replaced_attrs(self, **kwargs: Any) -> "RunRequest":\n fields = self._asdict()\n for k in fields.keys():\n if k in kwargs:\n fields[k] = kwargs[k]\n return RunRequest(**fields)\n\n def with_resolved_tags_and_config(\n self,\n target_definition: Union["JobDefinition", "UnresolvedAssetJobDefinition"],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "RunRequest":\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.definitions.partition import (\n PartitionedConfig,\n PartitionsDefinition,\n )\n\n if self.partition_key is None:\n check.failed(\n "Cannot resolve partition for run request without partition 
key",\n )\n\n partitions_def = target_definition.partitions_def\n if partitions_def is None:\n check.failed(\n "Cannot resolve partition for run request when target job"\n f" '{target_definition.name}' is unpartitioned.",\n )\n partitions_def = cast(PartitionsDefinition, partitions_def)\n\n partitioned_config = (\n target_definition.partitioned_config\n if isinstance(target_definition, JobDefinition)\n else PartitionedConfig.from_flexible_config(target_definition.config, partitions_def)\n )\n if partitioned_config is None:\n check.failed(\n "Cannot resolve partition for run request on unpartitioned job",\n )\n\n _check_valid_partition_key_after_dynamic_partitions_requests(\n self.partition_key,\n partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n tags = {\n **(self.tags or {}),\n **partitioned_config.get_tags_for_partition_key(\n self.partition_key,\n job_name=target_definition.name,\n ),\n }\n\n return self.with_replaced_attrs(\n run_config=(\n self.run_config\n if self.run_config\n else partitioned_config.get_run_config_for_partition_key(self.partition_key)\n ),\n tags=tags,\n )\n\n def has_resolved_partition(self) -> bool:\n # Backcompat run requests yielded via `run_request_for_partition` already have resolved\n # partitioning\n return self.tags.get(PARTITION_NAME_TAG) is not None if self.partition_key else True
\n\n\ndef _check_valid_partition_key_after_dynamic_partitions_requests(\n partition_key: str,\n partitions_def: "PartitionsDefinition",\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n):\n from dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\n from dagster._core.definitions.partition import (\n DynamicPartitionsDefinition,\n )\n\n if isinstance(partitions_def, MultiPartitionsDefinition):\n multipartition_key = partitions_def.get_partition_key_from_str(partition_key)\n\n for dimension in partitions_def.partitions_defs:\n _check_valid_partition_key_after_dynamic_partitions_requests(\n multipartition_key.keys_by_dimension[dimension.name],\n dimension.partitions_def,\n dynamic_partitions_requests,\n current_time,\n dynamic_partitions_store,\n )\n\n elif isinstance(partitions_def, DynamicPartitionsDefinition) and partitions_def.name:\n if not dynamic_partitions_store:\n check.failed(\n "Cannot resolve partition for run request on dynamic partitions without"\n " dynamic_partitions_store"\n )\n\n add_partition_keys: Set[str] = set()\n delete_partition_keys: Set[str] = set()\n for req in dynamic_partitions_requests:\n if isinstance(req, AddDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n add_partition_keys.update(set(req.partition_keys))\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if req.partitions_def_name == partitions_def.name:\n delete_partition_keys.update(set(req.partition_keys))\n\n partition_keys_after_requests_resolved = (\n set(\n dynamic_partitions_store.get_dynamic_partitions(\n partitions_def_name=partitions_def.name\n )\n )\n | add_partition_keys\n ) - delete_partition_keys\n\n if partition_key not in partition_keys_after_requests_resolved:\n check.failed(\n f"Dynamic partition key {partition_key} 
for partitions def"\n f" '{partitions_def.name}' is invalid. After dynamic partitions requests are"\n " applied, it does not exist in the set of valid partition keys."\n )\n\n else:\n partitions_def.validate_partition_key(\n partition_key,\n dynamic_partitions_store=dynamic_partitions_store,\n current_time=current_time,\n )\n\n\n@whitelist_for_serdes(\n storage_name="PipelineRunReaction",\n storage_field_names={\n "dagster_run": "pipeline_run",\n },\n)\nclass DagsterRunReaction(\n NamedTuple(\n "_DagsterRunReaction",\n [\n ("dagster_run", Optional[DagsterRun]),\n ("error", Optional[SerializableErrorInfo]),\n ("run_status", Optional[DagsterRunStatus]),\n ],\n )\n):\n """Represents a request that reacts to an existing dagster run. If success, it will report logs\n back to the run.\n\n Attributes:\n dagster_run (Optional[DagsterRun]): The dagster run that originates this reaction.\n error (Optional[SerializableErrorInfo]): user code execution error.\n run_status: (Optional[DagsterRunStatus]): The run status that triggered the reaction.\n """\n\n def __new__(\n cls,\n dagster_run: Optional[DagsterRun],\n error: Optional[SerializableErrorInfo] = None,\n run_status: Optional[DagsterRunStatus] = None,\n ):\n return super(DagsterRunReaction, cls).__new__(\n cls,\n dagster_run=check.opt_inst_param(dagster_run, "dagster_run", DagsterRun),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n run_status=check.opt_inst_param(run_status, "run_status", DagsterRunStatus),\n )\n\n\n
[docs]@experimental_param(\n param="asset_events", additional_warn_text="Runless asset events are experimental"\n)\nclass SensorResult(\n NamedTuple(\n "_SensorResult",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_reason", Optional[SkipReason]),\n ("cursor", Optional[str]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n List[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n """The result of a sensor evaluation.\n\n Attributes:\n run_requests (Optional[Sequence[RunRequest]]): A list\n of run requests to be executed.\n skip_reason (Optional[Union[str, SkipReason]]): A skip message indicating why sensor\n evaluation was skipped.\n cursor (Optional[str]): The cursor value for this sensor, which will be provided on the\n context for the next sensor evaluation.\n dynamic_partitions_requests (Optional[Sequence[Union[DeleteDynamicPartitionsRequest,\n AddDynamicPartitionsRequest]]]): A list of dynamic partition requests to request dynamic\n partition addition and deletion. Run requests will be evaluated using the state of the\n partitions with these changes applied.\n asset_events (Optional[Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]]): (Experimental) A\n list of materializations, observations, and asset check evaluations that the system\n will persist on your behalf at the end of sensor evaluation. 
These events will be not\n be associated with any particular run, but will be queryable and viewable in the asset catalog.\n\n\n """\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_reason: Optional[Union[str, SkipReason]] = None,\n cursor: Optional[str] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]\n ] = None,\n ):\n if skip_reason and len(run_requests if run_requests else []) > 0:\n check.failed(\n "Expected a single skip reason or one or more run requests: received values for "\n "both run_requests and skip_reason"\n )\n\n skip_reason = check.opt_inst_param(skip_reason, "skip_reason", (SkipReason, str))\n if isinstance(skip_reason, str):\n skip_reason = SkipReason(skip_reason)\n\n return super(SensorResult, cls).__new__(\n cls,\n run_requests=check.opt_sequence_param(run_requests, "run_requests", RunRequest),\n skip_reason=skip_reason,\n cursor=check.opt_str_param(cursor, "cursor"),\n dynamic_partitions_requests=check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n ),\n asset_events=list(\n check.opt_sequence_param(\n asset_events,\n "asset_check_evaluations",\n (AssetObservation, AssetMaterialization, AssetCheckEvaluation),\n )\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_request", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_request"}, "run_status_sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.run_status_sensor_definition

\nimport functools\nimport logging\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n    overload,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated_param, public\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvariantViolationError,\n    RunStatusSensorExecutionError,\n    user_code_error_boundary,\n)\nfrom dagster._core.events import PIPELINE_RUN_STATUS_TO_EVENT_TYPE, DagsterEvent, DagsterEventType\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus, RunsFilter\nfrom dagster._serdes import (\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .sensor_definition import (\n    DagsterRunReaction,\n    DefaultSensorStatus,\n    RawSensorEvaluationFunctionReturn,\n    RunRequest,\n    SensorDefinition,\n    SensorEvaluationContext,\n    SensorResult,\n    SensorType,\n    SkipReason,\n    get_context_param_name,\n    get_sensor_context_from_args_or_kwargs,\n    validate_and_get_resource_dict,\n)\nfrom .target import ExecutableDefinition\nfrom 
.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.resource_definition import ResourceDefinition\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nRunStatusSensorEvaluationFunction: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\nRunFailureSensorEvaluationFn: TypeAlias = Union[\n    Callable[..., RawSensorEvaluationFunctionReturn],\n    Callable[..., RawSensorEvaluationFunctionReturn],\n]\n\n\n@whitelist_for_serdes(old_storage_names={"PipelineSensorCursor"})\nclass RunStatusSensorCursor(\n    NamedTuple(\n        "_RunStatusSensorCursor",\n        [("record_id", int), ("update_timestamp", str)],\n    )\n):\n    def __new__(cls, record_id, update_timestamp):\n        return super(RunStatusSensorCursor, cls).__new__(\n            cls,\n            record_id=check.int_param(record_id, "record_id"),\n            update_timestamp=check.str_param(update_timestamp, "update_timestamp"),\n        )\n\n    @staticmethod\n    def is_valid(json_str: str) -> bool:\n        try:\n            obj = deserialize_value(json_str, RunStatusSensorCursor)\n            return isinstance(obj, RunStatusSensorCursor)\n        except (JSONDecodeError, DeserializationError):\n            return False\n\n    def to_json(self) -> str:\n        return serialize_value(cast(NamedTuple, self))\n\n    @staticmethod\n    def from_json(json_str: str) -> "RunStatusSensorCursor":\n        return deserialize_value(json_str, RunStatusSensorCursor)\n\n\n
[docs]class RunStatusSensorContext:\n """The ``context`` object available to a decorated function of ``run_status_sensor``."""\n\n def __init__(\n self,\n sensor_name,\n dagster_run,\n dagster_event,\n instance,\n context: Optional[\n SensorEvaluationContext\n ] = None, # deprecated arg, but we need to keep it for backcompat\n resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n logger: Optional[logging.Logger] = None,\n partition_key: Optional[str] = None,\n _resources: Optional[Resources] = None,\n _cm_scope_entered: bool = False,\n ) -> None:\n self._exit_stack = ExitStack()\n self._sensor_name = check.str_param(sensor_name, "sensor_name")\n self._dagster_run = check.inst_param(dagster_run, "dagster_run", DagsterRun)\n self._dagster_event = check.inst_param(dagster_event, "dagster_event", DagsterEvent)\n self._instance = check.inst_param(instance, "instance", DagsterInstance)\n self._logger: Optional[logging.Logger] = logger or (context.log if context else None)\n self._partition_key = check.opt_str_param(partition_key, "partition_key")\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resource_defs\n self._resources = _resources\n self._cm_scope_entered = _cm_scope_entered\n\n def for_run_failure(self) -> "RunFailureSensorContext":\n """Converts RunStatusSensorContext to RunFailureSensorContext."""\n return RunFailureSensorContext(\n sensor_name=self._sensor_name,\n dagster_run=self._dagster_run,\n dagster_event=self._dagster_event,\n instance=self._instance,\n logger=self._logger,\n partition_key=self._partition_key,\n resource_defs=self._resource_defs,\n _resources=self._resources,\n _cm_scope_entered=self._cm_scope_entered,\n )\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n @property\n def resources(self) -> Resources:\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from 
dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n instance = self.instance if self._instance else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def sensor_name(self) -> str:\n """The name of the sensor."""\n return self._sensor_name\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """The run of the job."""\n return self._dagster_run\n\n @public\n @property\n def dagster_event(self) -> DagsterEvent:\n """The event associated with the job run status."""\n return self._dagster_event\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """The current instance."""\n return self._instance\n\n @public\n @property\n def log(self) -> logging.Logger:\n """The logger for the current sensor evaluation."""\n if not self._logger:\n self._logger = InstigationLogger()\n\n return self._logger\n\n @public\n @property\n def partition_key(self) -> Optional[str]:\n """Optional[str]: The partition key of the relevant run."""\n return self._partition_key\n\n def __enter__(self) -> "RunStatusSensorContext":\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None
\n\n\n
[docs]class RunFailureSensorContext(RunStatusSensorContext):\n """The ``context`` object available to a decorated function of ``run_failure_sensor``.\n\n Attributes:\n sensor_name (str): the name of the sensor.\n dagster_run (DagsterRun): the failed run.\n """\n\n @public\n @property\n def failure_event(self) -> DagsterEvent:\n """The run failure event.\n\n If the run failed because of an error inside a step, get_step_failure_events will have more\n details on the step failure.\n """\n return self.dagster_event\n\n
[docs] @public\n def get_step_failure_events(self) -> Sequence[DagsterEvent]:\n """The step failure event for each step in the run that failed.\n\n Examples:\n .. code-block:: python\n\n error_strings_by_step_key = {\n # includes the stack trace\n event.step_key: event.event_specific_data.error.to_string()\n for event in context.get_step_failure_events()\n }\n """\n records = self.instance.get_records_for_run(\n run_id=self.dagster_run.run_id, of_type=DagsterEventType.STEP_FAILURE\n ).records\n return [cast(DagsterEvent, record.event_log_entry.dagster_event) for record in records]
\n\n\n
[docs]def build_run_status_sensor_context(\n sensor_name: str,\n dagster_event: DagsterEvent,\n dagster_instance: DagsterInstance,\n dagster_run: DagsterRun,\n context: Optional[SensorEvaluationContext] = None,\n resources: Optional[Mapping[str, object]] = None,\n partition_key: Optional[str] = None,\n) -> RunStatusSensorContext:\n """Builds run status sensor context from provided parameters.\n\n This function can be used to provide the context argument when directly invoking a function\n decorated with `@run_status_sensor` or `@run_failure_sensor`, such as when writing unit tests.\n\n Args:\n sensor_name (str): The name of the sensor the context is being constructed for.\n dagster_event (DagsterEvent): A DagsterEvent with the same event type as the one that\n triggers the run_status_sensor\n dagster_instance (DagsterInstance): The dagster instance configured for the context.\n dagster_run (DagsterRun): DagsterRun object from running a job\n resources (Optional[Mapping[str, object]]): A dictionary of resources to be made available\n to the sensor.\n\n Examples:\n .. code-block:: python\n\n instance = DagsterInstance.ephemeral()\n result = my_job.execute_in_process(instance=instance)\n\n dagster_run = result.dagster_run\n dagster_event = result.get_job_success_event() # or get_job_failure_event()\n\n context = build_run_status_sensor_context(\n sensor_name="run_status_sensor_to_invoke",\n dagster_instance=instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n )\n run_status_sensor_to_invoke(context)\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return RunStatusSensorContext(\n sensor_name=sensor_name,\n instance=dagster_instance,\n dagster_run=dagster_run,\n dagster_event=dagster_event,\n resource_defs=wrap_resources_for_execution(resources),\n logger=context.log if context else None,\n partition_key=partition_key,\n )
\n\n\n@overload\ndef run_failure_sensor(\n name: RunFailureSensorEvaluationFn,\n) -> SensorDefinition: ...\n\n\n@overload\ndef run_failure_sensor(\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]: ...\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef run_failure_sensor(\n name: Optional[Union[RunFailureSensorEvaluationFn, str]] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Union[SensorDefinition, Callable[[RunFailureSensorEvaluationFn], SensorDefinition,]]:\n """Creates a sensor that reacts to job failure events, where the decorated function will be\n run when a run fails.\n\n Takes a :py:class:`~dagster.RunFailureSensorContext`.\n\n Args:\n name (Optional[str]): The name of the job failure sensor. Defaults to the name of the\n decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this failure sensor.\n Defaults to None, which means the alert will be sent when any job in the current\n repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. 
If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs in the current repository that will be\n monitored by this failure sensor. Defaults to None, which means the alert will be sent\n when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJob]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def inner(\n fn: RunFailureSensorEvaluationFn,\n ) -> SensorDefinition:\n check.callable_param(fn, "fn")\n if name is None or callable(name):\n sensor_name = fn.__name__\n else:\n sensor_name = name\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_status_sensor(\n run_status=DagsterRunStatus.FAILURE,\n name=sensor_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n request_job=request_job,\n request_jobs=request_jobs,\n )\n @functools.wraps(fn)\n def _run_failure_sensor(*args, **kwargs) -> Any:\n args_modified = [\n arg.for_run_failure() if isinstance(arg, RunStatusSensorContext) else arg\n for arg in args\n ]\n kwargs_modified = {\n k: v.for_run_failure() if isinstance(v, RunStatusSensorContext) else v\n for k, v in kwargs.items()\n }\n return fn(*args_modified, **kwargs_modified)\n\n return _run_failure_sensor\n\n # This case is for when 
decorator is used bare, without arguments\n if callable(name):\n return inner(name)\n\n return inner
\n\n\n
[docs]class RunStatusSensorDefinition(SensorDefinition):\n """Define a sensor that reacts to a given status of job execution, where the decorated\n function will be evaluated when a run is at the given status.\n\n Args:\n name (str): The name of the sensor. Defaults to the name of the decorated function.\n run_status (DagsterRunStatus): The status of a run which will be\n monitored by the sensor.\n run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, DagsterRunReaction]]): The core\n evaluation function for the sensor. Takes a :py:class:`~dagster.RunStatusSensorContext`.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, JobSelector, RepositorySelector, CodeLocationSelector]]]):\n The jobs in the current repository that will be monitored by this sensor. Defaults to\n None, which means the alert will be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition]]): The job a RunRequest should\n execute if yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def __init__(\n self,\n name: str,\n run_status: DagsterRunStatus,\n run_status_sensor_fn: RunStatusSensorEvaluationFunction,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._core.definitions.selector import (\n CodeLocationSelector,\n JobSelector,\n RepositorySelector,\n )\n from dagster._core.event_api import RunShardedEventsCursor\n from dagster._core.storage.event_log.base import EventRecordsFilter\n\n check.str_param(name, "name")\n check.inst_param(run_status, "run_status", DagsterRunStatus)\n check.callable_param(run_status_sensor_fn, "run_status_sensor_fn")\n check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds")\n check.opt_str_param(description, "description")\n check.opt_list_param(\n monitored_jobs,\n "monitored_jobs",\n (\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n RepositorySelector,\n JobSelector,\n CodeLocationSelector,\n ),\n )\n check.inst_param(default_status, "default_status", DefaultSensorStatus)\n\n resource_arg_names: Set[str] = {arg.name for arg in 
get_resource_args(run_status_sensor_fn)}\n\n combined_required_resource_keys = (\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n | resource_arg_names\n )\n\n # coerce CodeLocationSelectors to RepositorySelectors with repo name "__repository__"\n monitored_jobs = [\n job.to_repository_selector() if isinstance(job, CodeLocationSelector) else job\n for job in (monitored_jobs or [])\n ]\n\n self._run_status_sensor_fn = check.callable_param(\n run_status_sensor_fn, "run_status_sensor_fn"\n )\n event_type = PIPELINE_RUN_STATUS_TO_EVENT_TYPE[run_status]\n\n # split monitored_jobs into external repos, external jobs, and jobs in the current repo\n other_repos = (\n [x for x in monitored_jobs if isinstance(x, RepositorySelector)]\n if monitored_jobs\n else []\n )\n\n other_repo_jobs = (\n [x for x in monitored_jobs if isinstance(x, JobSelector)] if monitored_jobs else []\n )\n\n current_repo_jobs = (\n [x for x in monitored_jobs if not isinstance(x, (JobSelector, RepositorySelector))]\n if monitored_jobs\n else []\n )\n\n def _wrapped_fn(\n context: SensorEvaluationContext,\n ) -> Iterator[Union[RunRequest, SkipReason, DagsterRunReaction, SensorResult]]:\n # initiate the cursor to (most recent event id, current timestamp) when:\n # * it's the first time starting the sensor\n # * or, the cursor isn't in valid format (backcompt)\n if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):\n most_recent_event_records = list(\n context.instance.get_event_records(\n EventRecordsFilter(event_type=event_type), ascending=False, limit=1\n )\n )\n most_recent_event_id = (\n most_recent_event_records[0].storage_id\n if len(most_recent_event_records) == 1\n else -1\n )\n\n new_cursor = RunStatusSensorCursor(\n update_timestamp=pendulum.now("UTC").isoformat(),\n record_id=most_recent_event_id,\n )\n context.update_cursor(new_cursor.to_json())\n yield SkipReason(f"Initiating {name}. 
Set cursor to {new_cursor}")\n return\n\n record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)\n\n # Fetch events after the cursor id\n # * we move the cursor forward to the latest visited event's id to avoid revisits\n # * when the daemon is down, bc we persist the cursor info, we can go back to where we\n # left and backfill alerts for the qualified events (up to 5 at a time) during the downtime\n # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.\n event_records = context.instance.get_event_records(\n EventRecordsFilter(\n after_cursor=RunShardedEventsCursor(\n id=record_id,\n run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),\n ),\n event_type=event_type,\n ),\n ascending=True,\n limit=5,\n )\n\n for event_record in event_records:\n event_log_entry = event_record.event_log_entry\n storage_id = event_record.storage_id\n\n # get run info\n run_records = context.instance.get_run_records(\n filters=RunsFilter(run_ids=[event_log_entry.run_id])\n )\n\n # skip if we couldn't find the right run\n if len(run_records) != 1:\n # bc we couldn't find the run, we use the event timestamp as the approximate\n # run update timestamp\n approximate_update_timestamp = utc_datetime_from_timestamp(\n event_log_entry.timestamp\n )\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=approximate_update_timestamp.isoformat(),\n ).to_json()\n )\n continue\n\n dagster_run = run_records[0].dagster_run\n update_timestamp = run_records[0].update_timestamp\n\n job_match = False\n\n # if monitor_all_repositories is provided, then we want to run the sensor for all jobs in all repositories\n if monitor_all_repositories:\n job_match = True\n\n # check if the run is in the current repository and (if provided) one of jobs specified in monitored_jobs\n if (\n not job_match\n and\n # the job has a repository (not manually executed)\n 
dagster_run.external_job_origin\n and\n # the job belongs to the current repository\n dagster_run.external_job_origin.external_repository_origin.repository_name\n == context.repository_name\n ):\n if monitored_jobs:\n if dagster_run.job_name in map(lambda x: x.name, current_repo_jobs):\n job_match = True\n else:\n job_match = True\n\n if not job_match:\n # check if the run is one of the jobs specified by JobSelector or RepositorySelector (ie in another repo)\n # make a JobSelector for the run in question\n external_repository_origin = check.not_none(\n dagster_run.external_job_origin\n ).external_repository_origin\n run_job_selector = JobSelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n job_name=dagster_run.job_name,\n )\n if run_job_selector in other_repo_jobs:\n job_match = True\n\n # make a RepositorySelector for the run in question\n run_repo_selector = RepositorySelector(\n location_name=external_repository_origin.code_location_origin.location_name,\n repository_name=external_repository_origin.repository_name,\n )\n if run_repo_selector in other_repos:\n job_match = True\n\n if not job_match:\n # the run in question doesn't match any of the criteria for we advance the cursor and move on\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n continue\n\n serializable_error = None\n\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, name, resource_arg_names\n )\n\n try:\n with RunStatusSensorContext(\n sensor_name=name,\n dagster_run=dagster_run,\n dagster_event=event_log_entry.dagster_event,\n instance=context.instance,\n resource_defs=context.resource_defs,\n logger=context.log,\n partition_key=dagster_run.tags.get("dagster/partition"),\n ) as sensor_context, user_code_error_boundary(\n RunStatusSensorExecutionError,\n lambda: f'Error occurred 
during the execution sensor "{name}".',\n ):\n context_param_name = get_context_param_name(run_status_sensor_fn)\n context_param = (\n {context_param_name: sensor_context} if context_param_name else {}\n )\n\n sensor_return = run_status_sensor_fn(\n **context_param,\n **resource_args_populated,\n )\n\n if sensor_return is not None:\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id,\n update_timestamp=update_timestamp.isoformat(),\n ).to_json()\n )\n\n if isinstance(sensor_return, SensorResult):\n if sensor_return.cursor:\n raise DagsterInvariantViolationError(\n f"Error in run status sensor {name}: Sensor returned a"\n " SensorResult with a cursor value. The cursor is managed"\n " by the sensor and should not be modified by a user."\n )\n yield sensor_return\n elif isinstance(\n sensor_return,\n (RunRequest, SkipReason, DagsterRunReaction),\n ):\n yield sensor_return\n else:\n yield from sensor_return\n return\n except RunStatusSensorExecutionError as run_status_sensor_execution_error:\n # When the user code errors, we report error to the sensor tick not the original run.\n serializable_error = serializable_error_info_from_exc_info(\n run_status_sensor_execution_error.original_exc_info\n )\n\n context.update_cursor(\n RunStatusSensorCursor(\n record_id=storage_id, update_timestamp=update_timestamp.isoformat()\n ).to_json()\n )\n\n # Yield DagsterRunReaction to indicate the execution success/failure.\n # The sensor machinery would\n # * report back to the original run if success\n # * update cursor and job state\n yield DagsterRunReaction(\n dagster_run=dagster_run,\n run_status=run_status,\n error=serializable_error,\n )\n\n super(RunStatusSensorDefinition, self).__init__(\n name=name,\n evaluation_fn=_wrapped_fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n default_status=default_status,\n job=request_job,\n jobs=request_jobs,\n required_resource_keys=combined_required_resource_keys,\n )\n\n def 
__call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name = get_context_param_name(self._run_status_sensor_fn)\n context = get_sensor_context_from_args_or_kwargs(\n self._run_status_sensor_fn,\n args,\n kwargs,\n context_type=RunStatusSensorContext,\n )\n context_param = {context_param_name: context} if context_param_name and context else {}\n\n resources = validate_and_get_resource_dict(\n context.resources if context else ScopedResourcesBuilder.build_empty(),\n self._name,\n self._required_resource_keys,\n )\n return self._run_status_sensor_fn(**context_param, **resources)\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.RUN_STATUS
\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef run_status_sensor(\n run_status: DagsterRunStatus,\n name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n request_job: Optional[ExecutableDefinition] = None,\n request_jobs: Optional[Sequence[ExecutableDefinition]] = None,\n) -> Callable[[RunStatusSensorEvaluationFunction], RunStatusSensorDefinition,]:\n """Creates a sensor that reacts to a given status of job execution, where the decorated\n function will be run when a job is at the given status.\n\n Takes a :py:class:`~dagster.RunStatusSensorContext`.\n\n Args:\n run_status (DagsterRunStatus): The status of run execution which will be\n monitored by the sensor.\n name (Optional[str]): The name of the sensor. Defaults to the name of the decorated function.\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n Jobs in the current repository that will be monitored by this sensor. Defaults to None, which means the alert will\n be sent when any job in the repository matches the requested run_status. 
Jobs in external repositories can be monitored by using\n RepositorySelector or JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the Dagster instance.\n If set to True, an error will be raised if you also specify monitored_jobs or job_selection.\n Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]):\n (deprecated in favor of monitored_jobs) Jobs in the current repository that will be\n monitored by this sensor. Defaults to None, which means the alert will be sent when\n any job in the repository matches the requested run_status.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]): The job that should be\n executed if a RunRequest is yielded from the sensor.\n request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]): (experimental)\n A list of jobs to be executed if RunRequests are yielded from the sensor.\n """\n\n def inner(\n fn: RunStatusSensorEvaluationFunction,\n ) -> RunStatusSensorDefinition:\n check.callable_param(fn, "fn")\n sensor_name = name or fn.__name__\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n if jobs and monitor_all_repositories:\n DagsterInvalidDefinitionError(\n "Cannot specify both monitor_all_repositories and"\n f" {'monitored_jobs' if monitored_jobs else 'job_selection'}."\n )\n\n return RunStatusSensorDefinition(\n name=sensor_name,\n run_status=run_status,\n run_status_sensor_fn=fn,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n request_job=request_job,\n request_jobs=request_jobs,\n 
)\n\n return inner
\n
", "current_page_name": "_modules/dagster/_core/definitions/run_status_sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.run_status_sensor_definition"}, "schedule_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.schedule_definition

\nimport copy\nimport logging\nfrom contextlib import ExitStack\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, deprecated_param, public\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.scoped_resources_builder import Resources, ScopedResourcesBuilder\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import IHasInternalInit, ensure_gen\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.schedules import is_valid_cron_schedule\n\nfrom ..decorator_utils import has_at_least_one_parameter\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    ScheduleExecutionError,\n    user_code_error_boundary,\n)\nfrom ..instance import DagsterInstance\nfrom ..instance.ref import InstanceRef\nfrom ..storage.dagster_run import DagsterRun\nfrom .graph_definition import GraphDefinition\nfrom .job_definition import JobDefinition\nfrom .run_request import RunRequest, SkipReason\nfrom .target import DirectTarget, ExecutableDefinition, RepoRelativeTarget\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom .utils import check_valid_name, validate_tags\n\nif TYPE_CHECKING:\n    from dagster import ResourceDefinition\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\nT = TypeVar("T")\n\nRunConfig: TypeAlias = Mapping[str, Any]\nRunRequestIterator: TypeAlias = Iterator[Union[RunRequest, 
SkipReason]]\n\nScheduleEvaluationFunctionReturn: TypeAlias = Union[\n    RunRequest, SkipReason, RunConfig, RunRequestIterator, Sequence[RunRequest]\n]\nRawScheduleEvaluationFunction: TypeAlias = Callable[..., ScheduleEvaluationFunctionReturn]\n\nScheduleRunConfigFunction: TypeAlias = Union[\n    Callable[["ScheduleEvaluationContext"], RunConfig],\n    Callable[[], RunConfig],\n]\n\nScheduleTagsFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], Mapping[str, str]]\nScheduleShouldExecuteFunction: TypeAlias = Callable[["ScheduleEvaluationContext"], bool]\nScheduleExecutionFunction: TypeAlias = Union[\n    Callable[["ScheduleEvaluationContext"], Any],\n    "DecoratedScheduleFunction",\n]\n\n\n@whitelist_for_serdes\nclass DefaultScheduleStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\ndef get_or_create_schedule_context(\n    fn: Callable, *args: Any, **kwargs: Any\n) -> "ScheduleEvaluationContext":\n    """Based on the passed resource function and the arguments passed to it, returns the\n    user-passed ScheduleEvaluationContext or creates one if it is not passed.\n\n    Raises an exception if the user passes more than one argument or if the user-provided\n    function requires a context parameter but none is passed.\n    """\n    from dagster._config.pythonic_config import is_coercible_to_resource\n    from dagster._core.definitions.sensor_definition import get_context_param_name\n\n    context_param_name = get_context_param_name(fn)\n\n    kwarg_keys_non_resource = set(kwargs.keys()) - {param.name for param in get_resource_args(fn)}\n    if len(args) + len(kwarg_keys_non_resource) > 1:\n        raise DagsterInvalidInvocationError(\n            "Schedule invocation received multiple non-resource arguments. 
Only a first "\n            "positional context parameter should be provided when invoking."\n        )\n\n    if any(is_coercible_to_resource(arg) for arg in args):\n        raise DagsterInvalidInvocationError(\n            "If directly invoking a schedule, you may not provide resources as"\n            " positional arguments, only as keyword arguments."\n        )\n\n    context: Optional[ScheduleEvaluationContext] = None\n\n    if len(args) > 0:\n        context = check.opt_inst(args[0], ScheduleEvaluationContext)\n    elif len(kwargs) > 0:\n        if context_param_name and context_param_name not in kwargs:\n            raise DagsterInvalidInvocationError(\n                f"Schedule invocation expected argument '{context_param_name}'."\n            )\n        context = check.opt_inst(\n            kwargs.get(context_param_name or "context"), ScheduleEvaluationContext\n        )\n    elif context_param_name:\n        # If the context parameter is present but no value was provided, we error\n        raise DagsterInvalidInvocationError(\n            "Schedule evaluation function expected context argument, but no context argument "\n            "was provided when invoking."\n        )\n\n    context = context or build_schedule_context()\n    resource_args_from_kwargs = {}\n\n    resource_args = {param.name for param in get_resource_args(fn)}\n    for resource_arg in resource_args:\n        if resource_arg in kwargs:\n            resource_args_from_kwargs[resource_arg] = kwargs[resource_arg]\n\n    if resource_args_from_kwargs:\n        return context.merge_resources(resource_args_from_kwargs)\n\n    return context\n\n\n
[docs]class ScheduleEvaluationContext:\n """The context object available as the first argument various functions defined on a :py:class:`dagster.ScheduleDefinition`.\n\n A `ScheduleEvaluationContext` object is passed as the first argument to ``run_config_fn``, ``tags_fn``,\n and ``should_execute``.\n\n Users should not instantiate this object directly. To construct a `ScheduleEvaluationContext` for testing purposes, use :py:func:`dagster.build_schedule_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import schedule, ScheduleEvaluationContext\n\n @schedule\n def the_schedule(context: ScheduleEvaluationContext):\n ...\n\n """\n\n __slots__ = [\n "_instance_ref",\n "_scheduled_execution_time",\n "_exit_stack",\n "_instance",\n "_log_key",\n "_logger",\n "_repository_name",\n "_resource_defs",\n "_schedule_name",\n "_resources_cm",\n "_resources",\n "_cm_scope_entered",\n "_repository_def",\n ]\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n scheduled_execution_time: Optional[datetime],\n repository_name: Optional[str] = None,\n schedule_name: Optional[str] = None,\n resources: Optional[Mapping[str, "ResourceDefinition"]] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n ):\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._exit_stack = ExitStack()\n self._instance = None\n\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._scheduled_execution_time = check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n )\n self._log_key = (\n [\n repository_name,\n schedule_name,\n scheduled_execution_time.strftime("%Y%m%d_%H%M%S"),\n ]\n if repository_name and schedule_name and scheduled_execution_time\n else None\n )\n self._logger = None\n self._repository_name = repository_name\n self._schedule_name = schedule_name\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resources\n 
self._resources = None\n self._cm_scope_entered = False\n self._repository_def = check.opt_inst_param(\n repository_def, "repository_def", RepositoryDefinition\n )\n\n def __enter__(self) -> "ScheduleEvaluationContext":\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n @public\n @property\n def resources(self) -> Resources:\n """Mapping of resource key to resource definition to be made available\n during schedule execution.\n """\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n # Early exit if no resources are defined. This skips unnecessary initialization\n # entirely. This allows users to run user code servers in cases where they\n # do not have access to the instance if they use a subset of features do\n # that do not require instance access. In this case, if they do not use\n # resources on schedules they do not require the instance, so we do not\n # instantiate it\n #\n # Tracking at https://github.com/dagster-io/dagster/issues/14345\n if not self._resource_defs:\n self._resources = ScopedResourcesBuilder.build_empty()\n return self._resources\n\n instance = self.instance if self._instance or self._instance_ref else None\n\n resources_cm = build_resources(resources=self._resource_defs, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. 
You can use the following syntax"\n " to open a context manager: `with build_sensor_context(...) as context:`"\n )\n\n return self._resources\n\n def merge_resources(self, resources_dict: Mapping[str, Any]) -> "ScheduleEvaluationContext":\n """Merge the specified resources into this context.\n This method is intended to be used by the Dagster framework, and should not be called by user code.\n\n Args:\n resources_dict (Mapping[str, Any]): The resources to replace in the context.\n """\n check.invariant(\n self._resources is None, "Cannot merge resources in context that has been initialized."\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return ScheduleEvaluationContext(\n instance_ref=self._instance_ref,\n scheduled_execution_time=self._scheduled_execution_time,\n repository_name=self._repository_name,\n schedule_name=self._schedule_name,\n resources={\n **(self._resource_defs or {}),\n **wrap_resources_for_execution(resources_dict),\n },\n repository_def=self._repository_def,\n )\n\n @public\n @property\n def instance(self) -> "DagsterInstance":\n """DagsterInstance: The current DagsterInstance."""\n # self._instance_ref should only ever be None when this ScheduleEvaluationContext was\n # constructed under test.\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was provided."\n )\n if not self._instance:\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def instance_ref(self) -> Optional[InstanceRef]:\n """The serialized instance configured to run the schedule."""\n return self._instance_ref\n\n @public\n @property\n def scheduled_execution_time(self) -> datetime:\n """The time in which the execution was scheduled to happen. 
May differ slightly\n from both the actual execution time and the time at which the run config is computed.\n """\n if self._scheduled_execution_time is None:\n check.failed(\n "Attempting to access scheduled_execution_time, but no scheduled_execution_time was"\n " set on this context"\n )\n\n return self._scheduled_execution_time\n\n @property\n def log(self) -> logging.Logger:\n if self._logger:\n return self._logger\n\n if not self._instance_ref:\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n repository_name=self._repository_name,\n name=self._schedule_name,\n )\n )\n\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n self.instance,\n repository_name=self._repository_name,\n name=self._schedule_name,\n )\n )\n return cast(InstigationLogger, self._logger)\n\n def has_captured_logs(self):\n return self._logger and self._logger.has_captured_logs()\n\n @property\n def log_key(self) -> Optional[List[str]]:\n return self._log_key\n\n @property\n def repository_def(self) -> "RepositoryDefinition":\n if not self._repository_def:\n raise DagsterInvariantViolationError(\n "Attempted to access repository_def, but no repository_def was provided."\n )\n return self._repository_def
\n\n\nclass DecoratedScheduleFunction(NamedTuple):\n """Wrapper around the decorated schedule function. Keeps track of both to better support the\n optimal return value for direct invocation of the evaluation function.\n """\n\n decorated_fn: RawScheduleEvaluationFunction\n wrapped_fn: Callable[[ScheduleEvaluationContext], RunRequestIterator]\n has_context_arg: bool\n\n\n
[docs]def build_schedule_context(\n instance: Optional[DagsterInstance] = None,\n scheduled_execution_time: Optional[datetime] = None,\n resources: Optional[Mapping[str, object]] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n instance_ref: Optional["InstanceRef"] = None,\n) -> ScheduleEvaluationContext:\n """Builds schedule execution context using the provided parameters.\n\n The instance provided to ``build_schedule_context`` must be persistent;\n DagsterInstance.ephemeral() will result in an error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the schedule.\n scheduled_execution_time (datetime): The time in which the execution was scheduled to\n happen. May differ slightly from both the actual execution time and the time at which\n the run config is computed.\n\n Examples:\n .. code-block:: python\n\n context = build_schedule_context(instance)\n\n """\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n\n return ScheduleEvaluationContext(\n instance_ref=(\n instance_ref\n if instance_ref\n else instance.get_ref() if instance and instance.is_persistent else None\n ),\n scheduled_execution_time=check.opt_inst_param(\n scheduled_execution_time, "scheduled_execution_time", datetime\n ),\n resources=wrap_resources_for_execution(resources),\n repository_def=repository_def,\n )
\n\n\n@whitelist_for_serdes\nclass ScheduleExecutionData(\n NamedTuple(\n "_ScheduleExecutionData",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_message", Optional[str]),\n ("captured_log_key", Optional[Sequence[str]]),\n ],\n )\n):\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_message: Optional[str] = None,\n captured_log_key: Optional[Sequence[str]] = None,\n ):\n check.opt_sequence_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_list_param(captured_log_key, "captured_log_key", str)\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return super(ScheduleExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n captured_log_key=captured_log_key,\n )\n\n\ndef validate_and_get_schedule_resource_dict(\n resources: Resources, schedule_name: str, required_resource_keys: Set[str]\n) -> Dict[str, Any]:\n """Validates that the context has all the required resources and returns a dictionary of\n resource key to resource object.\n """\n for k in required_resource_keys:\n if not hasattr(resources, k):\n raise DagsterInvalidDefinitionError(\n f"Resource with key '{k}' required by schedule '{schedule_name}' was not provided."\n )\n\n return {k: getattr(resources, k) for k in required_resource_keys}\n\n\n
[docs]@deprecated_param(\n param="environment_vars",\n breaking_version="2.0",\n additional_warn_text=(\n "It is no longer necessary. Schedules will have access to all environment variables set in"\n " the containing environment, and can safely be deleted."\n ),\n)\nclass ScheduleDefinition(IHasInternalInit):\n """Define a schedule that targets a job.\n\n Args:\n name (Optional[str]): The name of the schedule to create. Defaults to the job name plus\n "_schedule".\n cron_schedule (Union[str, Sequence[str]]): A valid cron string or sequence of cron strings\n specifying when the schedule will run, e.g., ``'45 23 * * 6'`` for a schedule that runs\n at 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\n the union of all execution times for the provided cron strings, e.g.,\n ``['45 23 * * 6', '30 9 * * 0]`` for a schedule that runs at 11:45 PM every Saturday and\n 9:30 AM every Sunday.\n execution_fn (Callable[ScheduleEvaluationContext]): The core evaluation function for the\n schedule, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.ScheduleEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n run_config (Optional[Mapping]): The config that parameterizes this execution,\n as a dict.\n run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Mapping]]]): A function that\n takes a ScheduleEvaluationContext object and returns the run configuration that\n parameterizes this execution, as a dict. You may set only one of ``run_config``,\n ``run_config_fn``, and ``execution_fn``.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]]): A\n function that generates tags to attach to the schedules runs. 
Takes a\n :py:class:`~dagster.ScheduleEvaluationContext` and returns a dictionary of tags (string\n key-value pairs). You may set only one of ``tags``, ``tags_fn``, and ``execution_fn``.\n should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]): A function that runs\n at schedule execution time to determine whether a schedule should execute or skip. Takes\n a :py:class:`~dagster.ScheduleEvaluationContext` and returns a boolean (``True`` if the\n schedule should execute). Defaults to a function that always returns ``True``.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n description (Optional[str]): A human-readable description of the schedule.\n job (Optional[Union[GraphDefinition, JobDefinition]]): The job that should execute when this\n schedule runs.\n default_status (DefaultScheduleStatus): Whether the schedule starts as running or not. 
The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n required_resource_keys (Optional[Set[str]]): The set of resource keys required by the schedule.\n """\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "ScheduleDefinition":\n """Returns a copy of this schedule with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return ScheduleDefinition.dagster_internal_init(\n name=self.name,\n cron_schedule=self._cron_schedule,\n job_name=self.job_name,\n execution_timezone=self.execution_timezone,\n execution_fn=self._execution_fn,\n description=self.description,\n job=new_job,\n default_status=self.default_status,\n environment_vars=self._environment_vars,\n required_resource_keys=self._raw_required_resource_keys,\n run_config=None, # run_config, tags, should_execute encapsulated in execution_fn\n run_config_fn=None,\n tags=None,\n tags_fn=None,\n should_execute=None,\n )\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n cron_schedule: Optional[Union[str, Sequence[str]]] = None,\n job_name: Optional[str] = None,\n run_config: Optional[Any] = None,\n run_config_fn: Optional[ScheduleRunConfigFunction] = None,\n tags: Optional[Mapping[str, str]] = None,\n tags_fn: Optional[ScheduleTagsFunction] = None,\n should_execute: Optional[ScheduleShouldExecuteFunction] = None,\n environment_vars: Optional[Mapping[str, str]] = None,\n execution_timezone: Optional[str] = None,\n execution_fn: Optional[ScheduleExecutionFunction] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n default_status: DefaultScheduleStatus = DefaultScheduleStatus.STOPPED,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n self._cron_schedule = check.inst_param(cron_schedule, "cron_schedule", (str, Sequence))\n if not isinstance(self._cron_schedule, str):\n check.sequence_param(self._cron_schedule, "cron_schedule", of_type=str) # 
type: ignore\n\n if not is_valid_cron_schedule(self._cron_schedule): # type: ignore\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{self._cron_schedule}' for schedule '{name}''. "\n "Dagster recognizes standard cron expressions consisting of 5 fields."\n )\n\n if job is not None:\n self._target: Union[DirectTarget, RepoRelativeTarget] = DirectTarget(job)\n else:\n self._target = RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n\n if name:\n self._name = check_valid_name(name)\n elif job_name:\n self._name = job_name + "_schedule"\n elif job:\n self._name = job.name + "_schedule"\n\n self._description = check.opt_str_param(description, "description")\n\n self._environment_vars = check.opt_mapping_param(\n environment_vars, "environment_vars", key_type=str, value_type=str\n )\n\n self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")\n\n if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both execution_fn and individual run_config/tags arguments "\n "to ScheduleDefinition. Must provide only one of the two."\n )\n elif execution_fn:\n self._execution_fn: Optional[Union[Callable[..., Any], DecoratedScheduleFunction]] = (\n None\n )\n if isinstance(execution_fn, DecoratedScheduleFunction):\n self._execution_fn = execution_fn\n else:\n self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")\n self._run_config_fn = None\n else:\n if run_config_fn and run_config:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both run_config_fn and run_config as arguments"\n " to ScheduleDefinition. 
Must provide only one of the two."\n )\n\n def _default_run_config_fn(context: ScheduleEvaluationContext) -> RunConfig:\n return check.opt_dict_param(run_config, "run_config")\n\n self._run_config_fn = check.opt_callable_param(\n run_config_fn, "run_config_fn", default=_default_run_config_fn\n )\n\n if tags_fn and tags:\n raise DagsterInvalidDefinitionError(\n "Attempted to provide both tags_fn and tags as arguments"\n " to ScheduleDefinition. Must provide only one of the two."\n )\n elif tags:\n tags = validate_tags(tags, allow_reserved_tags=False)\n tags_fn = lambda _context: tags\n else:\n tags_fn = check.opt_callable_param(\n tags_fn, "tags_fn", default=lambda _context: cast(Mapping[str, str], {})\n )\n self._tags_fn = tags_fn\n self._tags = tags\n\n self._should_execute: ScheduleShouldExecuteFunction = check.opt_callable_param(\n should_execute, "should_execute", default=lambda _context: True\n )\n\n # Several type-ignores are present in this function to work around bugs in mypy\n # inference.\n def _execution_fn(context: ScheduleEvaluationContext) -> RunRequestIterator:\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of should_execute for schedule {name}"\n ),\n ):\n if not self._should_execute(context):\n yield SkipReason(f"should_execute function for {name} returned false.")\n return\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: (\n f"Error occurred during the execution of run_config_fn for schedule {name}"\n ),\n ):\n _run_config_fn = check.not_none(self._run_config_fn)\n evaluated_run_config = copy.deepcopy(\n _run_config_fn(context)\n if has_at_least_one_parameter(_run_config_fn)\n else _run_config_fn() # type: ignore # (strict type guard)\n )\n\n with user_code_error_boundary(\n ScheduleExecutionError,\n lambda: f"Error occurred during the execution of tags_fn for schedule {name}",\n ):\n evaluated_tags = validate_tags(tags_fn(context), allow_reserved_tags=False)\n\n 
yield RunRequest(\n run_key=None,\n run_config=evaluated_run_config,\n tags=evaluated_tags,\n )\n\n self._execution_fn = _execution_fn\n\n if self._execution_timezone:\n try:\n # Verify that the timezone can be loaded\n pendulum.tz.timezone(self._execution_timezone) # type: ignore\n except Exception as e:\n raise DagsterInvalidDefinitionError(\n f"Invalid execution timezone {self._execution_timezone} for {name}"\n ) from e\n\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultScheduleStatus\n )\n\n resource_arg_names: Set[str] = (\n {arg.name for arg in get_resource_args(self._execution_fn.decorated_fn)}\n if isinstance(self._execution_fn, DecoratedScheduleFunction)\n else set()\n )\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @schedule decorator and as arguments to"\n " the decorated function",\n )\n\n self._raw_required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n cron_schedule: Optional[Union[str, Sequence[str]]],\n job_name: Optional[str],\n run_config: Optional[Any],\n run_config_fn: Optional[ScheduleRunConfigFunction],\n tags: Optional[Mapping[str, str]],\n tags_fn: Optional[ScheduleTagsFunction],\n should_execute: Optional[ScheduleShouldExecuteFunction],\n environment_vars: Optional[Mapping[str, str]],\n execution_timezone: Optional[str],\n execution_fn: Optional[ScheduleExecutionFunction],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n default_status: DefaultScheduleStatus,\n required_resource_keys: Optional[Set[str]],\n ) -> "ScheduleDefinition":\n return ScheduleDefinition(\n name=name,\n cron_schedule=cron_schedule,\n job_name=job_name,\n run_config=run_config,\n 
run_config_fn=run_config_fn,\n tags=tags,\n tags_fn=tags_fn,\n should_execute=should_execute,\n environment_vars=environment_vars,\n execution_timezone=execution_timezone,\n execution_fn=execution_fn,\n description=description,\n job=job,\n default_status=default_status,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> ScheduleEvaluationFunctionReturn:\n from dagster._core.definitions.sensor_definition import get_context_param_name\n\n from .decorators.schedule_decorator import DecoratedScheduleFunction\n\n if not isinstance(self._execution_fn, DecoratedScheduleFunction):\n raise DagsterInvalidInvocationError(\n "Schedule invocation is only supported for schedules created via the schedule "\n "decorators."\n )\n\n context_param_name = get_context_param_name(self._execution_fn.decorated_fn)\n context = get_or_create_schedule_context(self._execution_fn.decorated_fn, *args, **kwargs)\n context_param = {context_param_name: context} if context_param_name else {}\n\n resources = validate_and_get_schedule_resource_dict(\n context.resources, self._name, self._required_resource_keys\n )\n result = self._execution_fn.decorated_fn(**context_param, **resources)\n\n if isinstance(result, dict):\n return copy.deepcopy(result)\n else:\n return result\n\n @public\n @property\n def name(self) -> str:\n """str: The name of the schedule."""\n return self._name\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the job targeted by this schedule."""\n return self._target.job_name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this schedule."""\n return self._description\n\n @public\n @property\n def cron_schedule(self) -> Union[str, Sequence[str]]:\n """Union[str, Sequence[str]]: The cron schedule representing when this schedule will be evaluated."""\n return self._cron_schedule # type: ignore\n\n @public\n @deprecated(\n breaking_version="2.0",\n 
additional_warn_text="Setting this property no longer has any effect.",\n )\n @property\n def environment_vars(self) -> Mapping[str, str]:\n """Mapping[str, str]: Environment variables to export to the cron schedule."""\n return self._environment_vars\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this schedule."""\n return self._required_resource_keys\n\n @public\n @property\n def execution_timezone(self) -> Optional[str]:\n """Optional[str]: The timezone in which this schedule will be evaluated."""\n return self._execution_timezone\n\n @public\n @property\n def job(self) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if isinstance(self._target, DirectTarget):\n return self._target.target\n raise DagsterInvalidDefinitionError("No job was provided to ScheduleDefinition.")\n\n def evaluate_tick(self, context: "ScheduleEvaluationContext") -> ScheduleExecutionData:\n """Evaluate schedule using the provided context.\n\n Args:\n context (ScheduleEvaluationContext): The context with which to evaluate this schedule.\n\n Returns:\n ScheduleExecutionData: Contains list of run requests, or skip message if present.\n\n """\n from dagster._core.definitions.partition import CachingDynamicPartitionsLoader\n\n check.inst_param(context, "context", ScheduleEvaluationContext)\n execution_fn: Callable[..., "ScheduleEvaluationFunctionReturn"]\n if isinstance(self._execution_fn, DecoratedScheduleFunction):\n execution_fn = self._execution_fn.wrapped_fn\n else:\n execution_fn = cast(\n Callable[..., "ScheduleEvaluationFunctionReturn"],\n self._execution_fn,\n )\n\n result = list(ensure_gen(execution_fn(context)))\n\n skip_message: Optional[str] = None\n\n run_requests: List[RunRequest] = []\n if not result or result == [None]:\n run_requests = 
[]\n skip_message = "Schedule function returned an empty result"\n elif len(result) == 1:\n item = check.inst(result[0], (SkipReason, RunRequest))\n if isinstance(item, RunRequest):\n run_requests = [item]\n skip_message = None\n elif isinstance(item, SkipReason):\n run_requests = []\n skip_message = item.skip_message\n else:\n # NOTE: mypy is not correctly reading this cast-- not sure why\n # (pyright reads it fine). Hence the type-ignores below.\n result = cast(List[RunRequest], check.is_list(result, of_type=RunRequest))\n check.invariant(\n not any(not request.run_key for request in result),\n "Schedules that return multiple RunRequests must specify a run_key in each"\n " RunRequest",\n )\n run_requests = result\n skip_message = None\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # clone all the run requests with resolved tags and config\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.partition_key and not run_request.has_resolved_partition():\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_schedule_context when yielding"\n " partitioned run requests"\n )\n\n scheduled_target = context.repository_def.get_job(self._target.job_name)\n resolved_request = run_request.with_resolved_tags_and_config(\n target_definition=scheduled_target,\n dynamic_partitions_requests=[],\n current_time=context.scheduled_execution_time,\n dynamic_partitions_store=dynamic_partitions_store,\n )\n else:\n resolved_request = run_request\n\n resolved_run_requests.append(\n resolved_request.with_replaced_attrs(\n tags=merge_dicts(resolved_request.tags, DagsterRun.tags_for_schedule(self))\n )\n )\n\n return ScheduleExecutionData(\n run_requests=resolved_run_requests,\n skip_message=skip_message,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n )\n\n def has_loadable_target(self):\n 
return isinstance(self._target, DirectTarget)\n\n @property\n def targets_unresolved_asset_job(self) -> bool:\n return self.has_loadable_target() and isinstance(\n self.load_target(), UnresolvedAssetJobDefinition\n )\n\n def load_target(\n self,\n ) -> Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]:\n if isinstance(self._target, DirectTarget):\n return self._target.load()\n\n check.failed("Target is not loadable")\n\n @public\n @property\n def default_status(self) -> DefaultScheduleStatus:\n """DefaultScheduleStatus: The default status for this schedule when it is first loaded in\n a code location.\n """\n return self._default_status
\n
", "current_page_name": "_modules/dagster/_core/definitions/schedule_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.schedule_definition"}, "selector": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.selector

\nfrom typing import AbstractSet, Iterable, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.repository_definition import SINGLETON_REPOSITORY_NAME\nfrom dagster._serdes import create_snapshot_id, whitelist_for_serdes\n\n\nclass JobSubsetSelector(\n    NamedTuple(\n        "_JobSubsetSelector",\n        [\n            ("location_name", str),\n            ("repository_name", str),\n            ("job_name", str),\n            ("op_selection", Optional[Sequence[str]]),\n            ("asset_selection", Optional[AbstractSet[AssetKey]]),\n            ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n        ],\n    )\n):\n    """The information needed to resolve a job within a host process."""\n\n    def __new__(\n        cls,\n        location_name: str,\n        repository_name: str,\n        job_name: str,\n        op_selection: Optional[Sequence[str]],\n        asset_selection: Optional[Iterable[AssetKey]] = None,\n        asset_check_selection: Optional[Iterable[AssetCheckKey]] = None,\n    ):\n        asset_selection = set(asset_selection) if asset_selection else None\n        asset_check_selection = (\n            set(asset_check_selection) if asset_check_selection is not None else None\n        )\n        return super(JobSubsetSelector, cls).__new__(\n            cls,\n            location_name=check.str_param(location_name, "location_name"),\n            repository_name=check.str_param(repository_name, "repository_name"),\n            job_name=check.str_param(job_name, "job_name"),\n            op_selection=check.opt_nullable_sequence_param(op_selection, "op_selection", str),\n            asset_selection=check.opt_nullable_set_param(\n                asset_selection, "asset_selection", AssetKey\n            ),\n            
asset_check_selection=check.opt_nullable_set_param(\n                asset_check_selection, "asset_check_selection", AssetCheckKey\n            ),\n        )\n\n    def to_graphql_input(self):\n        return {\n            "repositoryLocationName": self.location_name,\n            "repositoryName": self.repository_name,\n            "pipelineName": self.job_name,\n            "solidSelection": self.op_selection,\n        }\n\n    def with_op_selection(self, op_selection: Optional[Sequence[str]]) -> Self:\n        check.invariant(\n            self.op_selection is None,\n            f"Can not invoke with_op_selection when op_selection={self.op_selection} is"\n            " already set",\n        )\n        return JobSubsetSelector(\n            self.location_name, self.repository_name, self.job_name, op_selection\n        )\n\n\n
[docs]@whitelist_for_serdes\nclass JobSelector(\n NamedTuple(\n "_JobSelector", [("location_name", str), ("repository_name", str), ("job_name", str)]\n )\n):\n def __new__(\n cls,\n location_name: str,\n repository_name: Optional[str] = None,\n job_name: Optional[str] = None,\n ):\n return super(JobSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.opt_str_param(\n repository_name,\n "repository_name",\n default=SINGLETON_REPOSITORY_NAME,\n ),\n job_name=check.str_param(\n job_name,\n "job_name",\n "Must provide job_name argument even though it is marked as optional in the "\n "function signature. repository_name, a truly optional parameter, is before "\n "that argument and actually optional. Use of keyword arguments is "\n "recommended to avoid confusion.",\n ),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "jobName": self.job_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return JobSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n job_name=graphql_data["jobName"],\n )
\n\n\n
[docs]@whitelist_for_serdes\nclass RepositorySelector(\n NamedTuple("_RepositorySelector", [("location_name", str), ("repository_name", str)])\n):\n def __new__(cls, location_name: str, repository_name: str):\n return super(RepositorySelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n }\n\n @property\n def selector_id(self):\n return create_snapshot_id(self)\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return RepositorySelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n )
\n\n\nclass CodeLocationSelector(NamedTuple("_CodeLocationSelector", [("location_name", str)])):\n def __new__(cls, location_name: str):\n return super(CodeLocationSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n )\n\n def to_repository_selector(self) -> RepositorySelector:\n return RepositorySelector(\n location_name=self.location_name, repository_name=SINGLETON_REPOSITORY_NAME\n )\n\n\nclass ScheduleSelector(\n NamedTuple(\n "_ScheduleSelector",\n [("location_name", str), ("repository_name", str), ("schedule_name", str)],\n )\n):\n def __new__(cls, location_name: str, repository_name: str, schedule_name: str):\n return super(ScheduleSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n schedule_name=check.str_param(schedule_name, "schedule_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "scheduleName": self.schedule_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ScheduleSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n schedule_name=graphql_data["scheduleName"],\n )\n\n\nclass ResourceSelector(NamedTuple):\n location_name: str\n repository_name: str\n resource_name: str\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "resourceName": self.resource_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return ResourceSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n resource_name=graphql_data["resourceName"],\n )\n\n\nclass SensorSelector(\n NamedTuple(\n "_SensorSelector", [("location_name", str), ("repository_name", str), ("sensor_name", 
str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, sensor_name: str):\n return super(SensorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n sensor_name=check.str_param(sensor_name, "sensor_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "sensorName": self.sensor_name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return SensorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n sensor_name=graphql_data["sensorName"],\n )\n\n\n@whitelist_for_serdes\nclass InstigatorSelector(\n NamedTuple(\n "_InstigatorSelector", [("location_name", str), ("repository_name", str), ("name", str)]\n )\n):\n def __new__(cls, location_name: str, repository_name: str, name: str):\n return super(InstigatorSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n name=check.str_param(name, "name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "name": self.name,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return InstigatorSelector(\n location_name=graphql_data["repositoryLocationName"],\n repository_name=graphql_data["repositoryName"],\n name=graphql_data["name"],\n )\n\n\nclass GraphSelector(\n NamedTuple(\n "_GraphSelector", [("location_name", str), ("repository_name", str), ("graph_name", str)]\n )\n):\n """The information needed to resolve a graph within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, graph_name: str):\n return super(GraphSelector, cls).__new__(\n cls,\n 
location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n graph_name=check.str_param(graph_name, "graph_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "graphName": self.graph_name,\n }\n\n\n@whitelist_for_serdes\nclass PartitionSetSelector(\n NamedTuple(\n "_PartitionSetSelector",\n [("location_name", str), ("repository_name", str), ("partition_set_name", str)],\n )\n):\n """The information needed to resolve a partition set within a host process."""\n\n def __new__(cls, location_name: str, repository_name: str, partition_set_name: str):\n return super(PartitionSetSelector, cls).__new__(\n cls,\n location_name=check.str_param(location_name, "location_name"),\n repository_name=check.str_param(repository_name, "repository_name"),\n partition_set_name=check.str_param(partition_set_name, "partition_set_name"),\n )\n\n def to_graphql_input(self):\n return {\n "repositoryLocationName": self.location_name,\n "repositoryName": self.repository_name,\n "partitionSetName": self.partition_set_name,\n }\n\n\nclass PartitionRangeSelector(\n NamedTuple(\n "_PartitionRangeSelector",\n [("start", str), ("end", str)],\n )\n):\n """The information needed to resolve a partition range."""\n\n def __new__(cls, start: str, end: str):\n return super(PartitionRangeSelector, cls).__new__(\n cls,\n start=check.inst_param(start, "start", str),\n end=check.inst_param(end, "end", str),\n )\n\n def to_graphql_input(self):\n return {\n "start": self.start,\n "end": self.end,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionRangeSelector(\n start=graphql_data["start"],\n end=graphql_data["end"],\n )\n\n\nclass PartitionsSelector(\n NamedTuple(\n "_PartitionsSelector",\n [("partition_range", PartitionRangeSelector)],\n )\n):\n """The information needed to define selection partitions.\n Using 
partition_range as property name to avoid shadowing Python 'range' builtin .\n """\n\n def __new__(cls, partition_range: PartitionRangeSelector):\n return super(PartitionsSelector, cls).__new__(\n cls,\n partition_range=check.inst_param(partition_range, "range", PartitionRangeSelector),\n )\n\n def to_graphql_input(self):\n return {\n "range": self.partition_range.to_graphql_input(),\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n return PartitionsSelector(\n partition_range=PartitionRangeSelector.from_graphql_input(graphql_data["range"])\n )\n\n\nclass PartitionsByAssetSelector(\n NamedTuple(\n "PartitionsByAssetSelector",\n [\n ("asset_key", AssetKey),\n ("partitions", Optional[PartitionsSelector]),\n ],\n )\n):\n """The information needed to define partitions selection for a given asset key."""\n\n def __new__(cls, asset_key: AssetKey, partitions: Optional[PartitionsSelector] = None):\n return super(PartitionsByAssetSelector, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partitions=check.opt_inst_param(partitions, "partitions", PartitionsSelector),\n )\n\n def to_graphql_input(self):\n return {\n "assetKey": self.asset_key.to_graphql_input(),\n "partitions": self.partitions.to_graphql_input() if self.partitions else None,\n }\n\n @staticmethod\n def from_graphql_input(graphql_data):\n asset_key = graphql_data["assetKey"]\n partitions = graphql_data.get("partitions")\n return PartitionsByAssetSelector(\n asset_key=AssetKey.from_graphql_input(asset_key),\n partitions=PartitionsSelector.from_graphql_input(partitions) if partitions else None,\n )\n
", "current_page_name": "_modules/dagster/_core/definitions/selector", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.selector"}, "sensor_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.sensor_definition

\nimport inspect\nimport logging\nfrom collections import defaultdict\nfrom contextlib import ExitStack\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n)\n\nimport pendulum\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_check_evaluation import AssetCheckEvaluation\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n)\nfrom dagster._core.definitions.instigation_logger import InstigationLogger\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.partition import (\n    CachingDynamicPartitionsLoader,\n)\nfrom dagster._core.definitions.resource_annotation import (\n    get_resource_args,\n)\nfrom dagster._core.definitions.resource_definition import (\n    Resources,\n)\nfrom dagster._core.definitions.scoped_resources_builder import ScopedResourcesBuilder\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidSubsetError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._utils import IHasInternalInit, normalize_to_repository\n\nfrom ..decorator_utils import (\n    get_function_params,\n)\nfrom .asset_selection import AssetSelection\nfrom .graph_definition import GraphDefinition\nfrom .run_request import (\n    AddDynamicPartitionsRequest,\n    DagsterRunReaction,\n    DeleteDynamicPartitionsRequest,\n    RunRequest,\n    SensorResult,\n    SkipReason,\n)\nfrom .target import DirectTarget, ExecutableDefinition, 
RepoRelativeTarget\nfrom .unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom .utils import check_valid_name\n\nif TYPE_CHECKING:\n    from dagster import ResourceDefinition\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n\n@whitelist_for_serdes\nclass DefaultSensorStatus(Enum):\n    RUNNING = "RUNNING"\n    STOPPED = "STOPPED"\n\n\n@whitelist_for_serdes\nclass SensorType(Enum):\n    STANDARD = "STANDARD"\n    RUN_STATUS = "RUN_STATUS"\n    ASSET = "ASSET"\n    MULTI_ASSET = "MULTI_ASSET"\n    FRESHNESS_POLICY = "FRESHNESS_POLICY"\n    UNKNOWN = "UNKNOWN"\n\n\nDEFAULT_SENSOR_DAEMON_INTERVAL = 30\n\n\n
[docs]class SensorEvaluationContext:\n """The context object available as the argument to the evaluation function of a :py:class:`dagster.SensorDefinition`.\n\n Users should not instantiate this object directly. To construct a\n `SensorEvaluationContext` for testing purposes, use :py:func:`dagster.\n build_sensor_context`.\n\n Attributes:\n instance_ref (Optional[InstanceRef]): The serialized instance configured to run the schedule\n cursor (Optional[str]): The cursor, passed back from the last sensor evaluation via\n the cursor attribute of SkipReason and RunRequest\n last_completion_time (float): DEPRECATED The last time that the sensor was evaluated (UTC).\n last_run_key (str): DEPRECATED The run key of the RunRequest most recently created by this\n sensor. Use the preferred `cursor` attribute instead.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository or that\n the sensor belongs to. If needed by the sensor top-level resource definitions will be\n pulled from this repository. You can provide either this or `definitions`.\n instance (Optional[DagsterInstance]): The deserialized instance can also be passed in\n directly (primarily useful in testing contexts).\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n resources (Optional[Dict[str, Any]]): A dict of resource keys to resource\n definitions to be made available during sensor execution.\n\n Example:\n .. 
code-block:: python\n\n from dagster import sensor, SensorEvaluationContext\n\n @sensor\n def the_sensor(context: SensorEvaluationContext):\n ...\n\n """\n\n def __init__(\n self,\n instance_ref: Optional[InstanceRef],\n last_completion_time: Optional[float],\n last_run_key: Optional[str],\n cursor: Optional[str],\n repository_name: Optional[str],\n repository_def: Optional["RepositoryDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, "ResourceDefinition"]] = None,\n definitions: Optional["Definitions"] = None,\n ):\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n\n self._exit_stack = ExitStack()\n self._instance_ref = check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n self._last_completion_time = check.opt_float_param(\n last_completion_time, "last_completion_time"\n )\n self._last_run_key = check.opt_str_param(last_run_key, "last_run_key")\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._repository_name = check.opt_str_param(repository_name, "repository_name")\n self._repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n self._instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n self._sensor_name = sensor_name\n\n # Wait to set resources unless they're accessed\n self._resource_defs = resources\n self._resources = None\n self._cm_scope_entered = False\n\n self._log_key = (\n [\n repository_name,\n sensor_name,\n pendulum.now("UTC").strftime("%Y%m%d_%H%M%S"),\n ]\n if repository_name and sensor_name\n else None\n )\n self._logger: Optional[InstigationLogger] = None\n self._cursor_updated = False\n\n def __enter__(self) -> "SensorEvaluationContext":\n 
self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc) -> None:\n self._exit_stack.close()\n self._logger = None\n\n @property\n def resource_defs(self) -> Optional[Mapping[str, "ResourceDefinition"]]:\n return self._resource_defs\n\n def merge_resources(self, resources_dict: Mapping[str, Any]) -> "SensorEvaluationContext":\n """Merge the specified resources into this context.\n\n This method is intended to be used by the Dagster framework, and should not be called by user code.\n\n Args:\n resources_dict (Mapping[str, Any]): The resources to replace in the context.\n """\n check.invariant(\n self._resources is None, "Cannot merge resources in context that has been initialized."\n )\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n return SensorEvaluationContext(\n instance_ref=self._instance_ref,\n last_completion_time=self._last_completion_time,\n last_run_key=self._last_run_key,\n cursor=self._cursor,\n repository_name=self._repository_name,\n repository_def=self._repository_def,\n instance=self._instance,\n sensor_name=self._sensor_name,\n resources={\n **(self._resource_defs or {}),\n **wrap_resources_for_execution(resources_dict),\n },\n )\n\n @public\n @property\n def resources(self) -> Resources:\n """Resources: A mapping from resource key to instantiated resources for this sensor."""\n from dagster._core.definitions.scoped_resources_builder import (\n IContainsGenerator,\n )\n from dagster._core.execution.build_resources import build_resources\n\n if not self._resources:\n """\n This is similar to what we do in e.g. the op context - we set up a resource\n building context manager, and immediately enter it. 
This is so that in cases\n where a user is not using any context-manager based resources, they don't\n need to enter this SensorEvaluationContext themselves.\n\n For example:\n\n my_sensor(build_sensor_context(resources={"my_resource": my_non_cm_resource})\n\n will work ok, but for a CM resource we must do\n\n with build_sensor_context(resources={"my_resource": my_cm_resource}) as context:\n my_sensor(context)\n """\n\n # Early exit if no resources are defined. This skips unnecessary initialization\n # entirely. This allows users to run user code servers in cases where they\n # do not have access to the instance if they use a subset of features do\n # that do not require instance access. In this case, if they do not use\n # resources on sensors they do not require the instance, so we do not\n # instantiate it\n #\n # Tracking at https://github.com/dagster-io/dagster/issues/14345\n if not self._resource_defs:\n self._resources = ScopedResourcesBuilder.build_empty()\n return self._resources\n\n instance = self.instance if self._instance or self._instance_ref else None\n\n resources_cm = build_resources(resources=self._resource_defs or {}, instance=instance)\n self._resources = self._exit_stack.enter_context(resources_cm)\n\n if isinstance(self._resources, IContainsGenerator) and not self._cm_scope_entered:\n self._exit_stack.close()\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access"\n " resources outside of context manager scope. You can use the following syntax"\n " to open a context manager: `with build_schedule_context(...) 
as context:`"\n )\n\n return self._resources\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current DagsterInstance."""\n # self._instance_ref should only ever be None when this SensorEvaluationContext was\n # constructed under test.\n if not self._instance:\n if not self._instance_ref:\n raise DagsterInvariantViolationError(\n "Attempted to initialize dagster instance, but no instance reference was"\n " provided."\n )\n self._instance = self._exit_stack.enter_context(\n DagsterInstance.from_ref(self._instance_ref)\n )\n return cast(DagsterInstance, self._instance)\n\n @property\n def instance_ref(self) -> Optional[InstanceRef]:\n return self._instance_ref\n\n @public\n @property\n def last_completion_time(self) -> Optional[float]:\n """Optional[float]: Timestamp representing the last time this sensor completed an evaluation."""\n return self._last_completion_time\n\n @public\n @property\n def last_run_key(self) -> Optional[str]:\n """Optional[str]: The run key supplied to the most recent RunRequest produced by this sensor."""\n return self._last_run_key\n\n @public\n @property\n def cursor(self) -> Optional[str]:\n """The cursor value for this sensor, which was set in an earlier sensor evaluation."""\n return self._cursor\n\n
[docs] @public\n def update_cursor(self, cursor: Optional[str]) -> None:\n """Updates the cursor value for this sensor, which will be provided on the context for the\n next sensor evaluation.\n\n This can be used to keep track of progress and avoid duplicate work across sensor\n evaluations.\n\n Args:\n cursor (Optional[str]):\n """\n self._cursor = check.opt_str_param(cursor, "cursor")\n self._cursor_updated = True
\n\n @property\n def cursor_updated(self) -> bool:\n return self._cursor_updated\n\n @public\n @property\n def repository_name(self) -> Optional[str]:\n """Optional[str]: The name of the repository that this sensor resides in."""\n return self._repository_name\n\n @public\n @property\n def repository_def(self) -> Optional["RepositoryDefinition"]:\n """Optional[RepositoryDefinition]: The RepositoryDefinition that this sensor resides in."""\n return self._repository_def\n\n @property\n def log(self) -> logging.Logger:\n if self._logger:\n return self._logger\n\n if not self._instance_ref:\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n self._logger = self._exit_stack.enter_context(\n InstigationLogger(\n self._log_key,\n self.instance,\n repository_name=self._repository_name,\n name=self._sensor_name,\n )\n )\n return cast(logging.Logger, self._logger)\n\n def has_captured_logs(self):\n return self._logger and self._logger.has_captured_logs()\n\n @property\n def log_key(self) -> Optional[List[str]]:\n return self._log_key
\n\n\nRawSensorEvaluationFunctionReturn = Union[\n Iterator[Union[SkipReason, RunRequest, DagsterRunReaction, SensorResult]],\n Sequence[RunRequest],\n SkipReason,\n RunRequest,\n DagsterRunReaction,\n SensorResult,\n]\nRawSensorEvaluationFunction: TypeAlias = Callable[..., RawSensorEvaluationFunctionReturn]\n\nSensorEvaluationFunction: TypeAlias = Callable[..., Sequence[Union[SkipReason, RunRequest]]]\n\n\ndef get_context_param_name(fn: Callable) -> Optional[str]:\n """Determines the sensor's context parameter name by excluding all resource parameters."""\n resource_params = {param.name for param in get_resource_args(fn)}\n\n return next(\n (param.name for param in get_function_params(fn) if param.name not in resource_params), None\n )\n\n\ndef validate_and_get_resource_dict(\n resources: Resources, sensor_name: str, required_resource_keys: Set[str]\n) -> Dict[str, Any]:\n """Validates that the context has all the required resources and returns a dictionary of\n resource key to resource object.\n """\n for k in required_resource_keys:\n if not hasattr(resources, k):\n raise DagsterInvalidDefinitionError(\n f"Resource with key '{k}' required by sensor '{sensor_name}' was not provided."\n )\n\n return {k: getattr(resources, k) for k in required_resource_keys}\n\n\ndef _check_dynamic_partitions_requests(\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n) -> None:\n req_keys_to_add_by_partitions_def_name = defaultdict(set)\n req_keys_to_delete_by_partitions_def_name = defaultdict(set)\n\n for req in dynamic_partitions_requests:\n duplicate_req_keys_to_delete = req_keys_to_delete_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n duplicate_req_keys_to_add = req_keys_to_add_by_partitions_def_name.get(\n req.partitions_def_name, set()\n ).intersection(req.partition_keys)\n if isinstance(req, AddDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n 
raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_delete}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_add_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n elif isinstance(req, DeleteDynamicPartitionsRequest):\n if duplicate_req_keys_to_delete:\n raise DagsterInvariantViolationError(\n "Cannot request to add duplicate dynamic partition keys: \\npartitions_def_name"\n f" '{req.partitions_def_name}', partition_keys:"\n f" {req_keys_to_add_by_partitions_def_name}"\n )\n elif duplicate_req_keys_to_add:\n raise DagsterInvariantViolationError(\n "Dynamic partition requests cannot contain both add and delete requests for"\n " the same partition keys.Invalid request: partitions_def_name"\n f" '{req.partitions_def_name}', partition_keys: {duplicate_req_keys_to_add}"\n )\n req_keys_to_delete_by_partitions_def_name[req.partitions_def_name].update(\n req.partition_keys\n )\n else:\n check.failed(f"Unexpected dynamic partition request type: {req}")\n\n\n
[docs]class SensorDefinition(IHasInternalInit):\n """Define a sensor that initiates a set of runs based on some external state.\n\n Args:\n evaluation_fn (Callable[[SensorEvaluationContext]]): The core evaluation function for the\n sensor, which is run at an interval to determine whether a run should be launched or\n not. Takes a :py:class:`~dagster.SensorEvaluationContext`.\n\n This function must return a generator, which must yield either a single SkipReason\n or one or more RunRequest objects.\n name (Optional[str]): The name of the sensor to create. Defaults to name of evaluation_fn\n minimum_interval_seconds (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n description (Optional[str]): A human-readable description of the sensor.\n job (Optional[GraphDefinition, JobDefinition, UnresolvedAssetJob]): The job to execute when this sensor fires.\n jobs (Optional[Sequence[GraphDefinition, JobDefinition, UnresolvedAssetJob]]): (experimental) A list of jobs to execute when this sensor fires.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n asset_selection (AssetSelection): (Experimental) an asset selection to launch a run for if\n the sensor condition is met. 
This can be provided instead of specifying a job.\n """\n\n def with_updated_jobs(self, new_jobs: Sequence[ExecutableDefinition]) -> "SensorDefinition":\n """Returns a copy of this sensor with the jobs replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return SensorDefinition.dagster_internal_init(\n name=self.name,\n evaluation_fn=self._raw_fn,\n minimum_interval_seconds=self.minimum_interval_seconds,\n description=self.description,\n job_name=None, # if original init was passed job name, was resolved to a job\n jobs=new_jobs if len(new_jobs) > 1 else None,\n job=new_jobs[0] if len(new_jobs) == 1 else None,\n default_status=self.default_status,\n asset_selection=self.asset_selection,\n required_resource_keys=self._raw_required_resource_keys,\n )\n\n def with_updated_job(self, new_job: ExecutableDefinition) -> "SensorDefinition":\n """Returns a copy of this sensor with the job replaced.\n\n Args:\n job (ExecutableDefinition): The job that should execute when this\n schedule runs.\n """\n return self.with_updated_jobs([new_job])\n\n def __init__(\n self,\n name: Optional[str] = None,\n *,\n evaluation_fn: Optional[RawSensorEvaluationFunction] = None,\n job_name: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n description: Optional[str] = None,\n job: Optional[ExecutableDefinition] = None,\n jobs: Optional[Sequence[ExecutableDefinition]] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n asset_selection: Optional[AssetSelection] = None,\n required_resource_keys: Optional[Set[str]] = None,\n ):\n from dagster._config.pythonic_config import validate_resource_annotated_function\n\n if evaluation_fn is None:\n raise DagsterInvalidDefinitionError("Must provide evaluation_fn to SensorDefinition.")\n\n if (\n sum(\n [\n int(job is not None),\n int(jobs is not None),\n int(job_name is not None),\n int(asset_selection is not None),\n ]\n )\n > 1\n ):\n raise 
DagsterInvalidDefinitionError(\n "Attempted to provide more than one of 'job', 'jobs', 'job_name', and "\n "'asset_selection' params to SensorDefinition. Must provide only one."\n )\n\n jobs = jobs if jobs else [job] if job else None\n\n targets: Optional[List[Union[RepoRelativeTarget, DirectTarget]]] = None\n if job_name:\n targets = [\n RepoRelativeTarget(\n job_name=check.str_param(job_name, "job_name"),\n op_selection=None,\n )\n ]\n elif job:\n targets = [DirectTarget(job)]\n elif jobs:\n targets = [DirectTarget(job) for job in jobs]\n elif asset_selection:\n targets = []\n\n if name:\n self._name = check_valid_name(name)\n else:\n self._name = evaluation_fn.__name__\n\n self._raw_fn: RawSensorEvaluationFunction = check.callable_param(\n evaluation_fn, "evaluation_fn"\n )\n self._evaluation_fn: Union[\n SensorEvaluationFunction,\n Callable[\n [SensorEvaluationContext],\n List[Union[SkipReason, RunRequest, DagsterRunReaction]],\n ],\n ] = wrap_sensor_evaluation(self._name, evaluation_fn)\n self._min_interval = check.opt_int_param(\n minimum_interval_seconds, "minimum_interval_seconds", DEFAULT_SENSOR_DAEMON_INTERVAL\n )\n self._description = check.opt_str_param(description, "description")\n self._targets: Sequence[Union[RepoRelativeTarget, DirectTarget]] = check.opt_list_param(\n targets, "targets", (DirectTarget, RepoRelativeTarget)\n )\n self._default_status = check.inst_param(\n default_status, "default_status", DefaultSensorStatus\n )\n self._asset_selection = check.opt_inst_param(\n asset_selection, "asset_selection", AssetSelection\n )\n validate_resource_annotated_function(self._raw_fn)\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(self._raw_fn)}\n\n check.param_invariant(\n len(required_resource_keys or []) == 0 or len(resource_arg_names) == 0,\n "Cannot specify resource requirements in both @sensor decorator and as arguments to"\n " the decorated function",\n )\n self._raw_required_resource_keys = check.opt_set_param(\n 
required_resource_keys, "required_resource_keys", of_type=str\n )\n self._required_resource_keys = self._raw_required_resource_keys or resource_arg_names\n\n @staticmethod\n def dagster_internal_init(\n *,\n name: Optional[str],\n evaluation_fn: Optional[RawSensorEvaluationFunction],\n job_name: Optional[str],\n minimum_interval_seconds: Optional[int],\n description: Optional[str],\n job: Optional[ExecutableDefinition],\n jobs: Optional[Sequence[ExecutableDefinition]],\n default_status: DefaultSensorStatus,\n asset_selection: Optional[AssetSelection],\n required_resource_keys: Optional[Set[str]],\n ) -> "SensorDefinition":\n return SensorDefinition(\n name=name,\n evaluation_fn=evaluation_fn,\n job_name=job_name,\n minimum_interval_seconds=minimum_interval_seconds,\n description=description,\n job=job,\n jobs=jobs,\n default_status=default_status,\n asset_selection=asset_selection,\n required_resource_keys=required_resource_keys,\n )\n\n def __call__(self, *args, **kwargs) -> RawSensorEvaluationFunctionReturn:\n context_param_name_if_present = get_context_param_name(self._raw_fn)\n context = get_or_create_sensor_context(self._raw_fn, *args, **kwargs)\n\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n\n resources = validate_and_get_resource_dict(\n context.resources, self.name, self._required_resource_keys\n )\n return self._raw_fn(**context_param, **resources)\n\n @public\n @property\n def required_resource_keys(self) -> Set[str]:\n """Set[str]: The set of keys for resources that must be provided to this sensor."""\n return self._required_resource_keys\n\n @public\n @property\n def name(self) -> str:\n """str: The name of this sensor."""\n return self._name\n\n @public\n @property\n def description(self) -> Optional[str]:\n """Optional[str]: A description for this sensor."""\n return self._description\n\n @public\n @property\n def minimum_interval_seconds(self) -> Optional[int]:\n """Optional[int]: The 
minimum number of seconds between sequential evaluations of this sensor."""\n return self._min_interval\n\n @property\n def targets(self) -> Sequence[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets\n\n @public\n @property\n def job(self) -> Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]:\n """Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]: The job that is\n targeted by this schedule.\n """\n if self._targets:\n if len(self._targets) == 1 and isinstance(self._targets[0], DirectTarget):\n return self._targets[0].target\n elif len(self._targets) > 1:\n raise DagsterInvalidDefinitionError(\n "Job property not available when SensorDefinition has multiple jobs."\n )\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @public\n @property\n def jobs(self) -> List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """List[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]: A list of jobs\n that are targeted by this schedule.\n """\n if self._targets and all(isinstance(target, DirectTarget) for target in self._targets):\n return [target.target for target in self._targets] # type: ignore # (illegible conditional)\n raise DagsterInvalidDefinitionError("No job was provided to SensorDefinition.")\n\n @property\n def sensor_type(self) -> SensorType:\n return SensorType.STANDARD\n\n def evaluate_tick(self, context: "SensorEvaluationContext") -> "SensorExecutionData":\n """Evaluate sensor using the provided context.\n\n Args:\n context (SensorEvaluationContext): The context with which to evaluate this sensor.\n\n Returns:\n SensorExecutionData: Contains list of run requests, or skip message if present.\n\n """\n context = check.inst_param(context, "context", SensorEvaluationContext)\n\n result = self._evaluation_fn(context)\n\n skip_message: Optional[str] = None\n run_requests: List[RunRequest] = []\n dagster_run_reactions: List[DagsterRunReaction] = 
[]\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = []\n updated_cursor = context.cursor\n asset_events = []\n\n if not result or result == [None]:\n skip_message = "Sensor function returned an empty result"\n elif len(result) == 1:\n item = result[0]\n check.inst(item, (SkipReason, RunRequest, DagsterRunReaction, SensorResult))\n\n if isinstance(item, SensorResult):\n run_requests = list(item.run_requests) if item.run_requests else []\n skip_message = (\n item.skip_reason.skip_message\n if item.skip_reason\n else (None if run_requests else "Sensor function returned an empty result")\n )\n\n _check_dynamic_partitions_requests(\n item.dynamic_partitions_requests or [],\n )\n dynamic_partitions_requests = item.dynamic_partitions_requests or []\n\n if item.cursor and context.cursor_updated:\n raise DagsterInvariantViolationError(\n "SensorResult.cursor cannot be set if context.update_cursor() was called."\n )\n updated_cursor = item.cursor\n asset_events = item.asset_events\n\n elif isinstance(item, RunRequest):\n run_requests = [item]\n elif isinstance(item, SkipReason):\n skip_message = item.skip_message if isinstance(item, SkipReason) else None\n elif isinstance(item, DagsterRunReaction):\n dagster_run_reactions = (\n [cast(DagsterRunReaction, item)] if isinstance(item, DagsterRunReaction) else []\n )\n else:\n check.failed(f"Unexpected type {type(item)} in sensor result")\n else:\n if any(isinstance(item, SensorResult) for item in result):\n check.failed(\n "When a SensorResult is returned from a sensor, it must be the only object"\n " returned."\n )\n\n check.is_list(result, (SkipReason, RunRequest, DagsterRunReaction))\n has_skip = any(map(lambda x: isinstance(x, SkipReason), result))\n run_requests = [item for item in result if isinstance(item, RunRequest)]\n dagster_run_reactions = [\n item for item in result if isinstance(item, DagsterRunReaction)\n ]\n\n if has_skip:\n if 
len(run_requests) > 0:\n check.failed(\n "Expected a single SkipReason or one or more RunRequests: received both "\n "RunRequest and SkipReason"\n )\n elif len(dagster_run_reactions) > 0:\n check.failed(\n "Expected a single SkipReason or one or more DagsterRunReaction: "\n "received both DagsterRunReaction and SkipReason"\n )\n else:\n check.failed("Expected a single SkipReason: received multiple SkipReasons")\n\n _check_dynamic_partitions_requests(dynamic_partitions_requests)\n resolved_run_requests = self.resolve_run_requests(\n run_requests, context, self._asset_selection, dynamic_partitions_requests\n )\n\n return SensorExecutionData(\n resolved_run_requests,\n skip_message,\n updated_cursor,\n dagster_run_reactions,\n captured_log_key=context.log_key if context.has_captured_logs() else None,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events,\n )\n\n def has_loadable_targets(self) -> bool:\n for target in self._targets:\n if isinstance(target, DirectTarget):\n return True\n return False\n\n def load_targets(\n self,\n ) -> Sequence[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition]]:\n """Returns job/graph definitions that have been directly passed into the sensor definition.\n Any jobs or graphs that are referenced by name will not be loaded.\n """\n targets = []\n for target in self._targets:\n if isinstance(target, DirectTarget):\n targets.append(target.load())\n return targets\n\n def resolve_run_requests(\n self,\n run_requests: Sequence[RunRequest],\n context: SensorEvaluationContext,\n asset_selection: Optional[AssetSelection],\n dynamic_partitions_requests: Sequence[\n Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]\n ],\n ) -> Sequence[RunRequest]:\n def _get_repo_job_by_name(context: SensorEvaluationContext, job_name: str) -> JobDefinition:\n if context.repository_def is None:\n raise DagsterInvariantViolationError(\n "Must provide repository def to build_sensor_context when 
yielding partitioned"\n " run requests"\n )\n return context.repository_def.get_job(job_name)\n\n has_multiple_targets = len(self._targets) > 1\n target_names = [target.job_name for target in self._targets]\n\n if run_requests and len(self._targets) == 0 and not self._asset_selection:\n raise Exception(\n f"Error in sensor {self._name}: Sensor evaluation function returned a RunRequest "\n "for a sensor lacking a specified target (job_name, job, or jobs). Targets "\n "can be specified by providing job, jobs, or job_name to the @sensor "\n "decorator."\n )\n\n if asset_selection:\n run_requests = [\n *_run_requests_with_base_asset_jobs(run_requests, context, asset_selection)\n ]\n\n dynamic_partitions_store = (\n CachingDynamicPartitionsLoader(context.instance) if context.instance_ref else None\n )\n\n # Run requests may contain an invalid target, or a partition key that does not exist.\n # We will resolve these run requests, applying the target and partition config/tags.\n resolved_run_requests = []\n for run_request in run_requests:\n if run_request.job_name is None and has_multiple_targets:\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest that did not"\n " specify job_name for the requested run. Expected one of:"\n f" {target_names}"\n )\n elif (\n run_request.job_name\n and run_request.job_name not in target_names\n and not asset_selection\n ):\n raise Exception(\n f"Error in sensor {self._name}: Sensor returned a RunRequest with job_name "\n f"{run_request.job_name}. 
Expected one of: {target_names}"\n )\n\n if run_request.partition_key and not run_request.has_resolved_partition():\n selected_job = _get_repo_job_by_name(\n context, run_request.job_name if run_request.job_name else target_names[0]\n )\n resolved_run_requests.append(\n run_request.with_resolved_tags_and_config(\n target_definition=selected_job,\n current_time=None,\n dynamic_partitions_store=dynamic_partitions_store,\n dynamic_partitions_requests=dynamic_partitions_requests,\n )\n )\n else:\n resolved_run_requests.append(run_request)\n\n return resolved_run_requests\n\n @property\n def _target(self) -> Optional[Union[DirectTarget, RepoRelativeTarget]]:\n return self._targets[0] if self._targets else None\n\n @public\n @property\n def job_name(self) -> Optional[str]:\n """Optional[str]: The name of the job that is targeted by this sensor."""\n if len(self._targets) > 1:\n raise DagsterInvalidInvocationError(\n f"Cannot use `job_name` property for sensor {self.name}, which targets multiple"\n " jobs."\n )\n return self._targets[0].job_name\n\n @public\n @property\n def default_status(self) -> DefaultSensorStatus:\n """DefaultSensorStatus: The default status for this sensor when it is first loaded in\n a code location.\n """\n return self._default_status\n\n @property\n def asset_selection(self) -> Optional[AssetSelection]:\n return self._asset_selection
\n\n\n@whitelist_for_serdes(\n storage_field_names={"dagster_run_reactions": "pipeline_run_reactions"},\n)\nclass SensorExecutionData(\n NamedTuple(\n "_SensorExecutionData",\n [\n ("run_requests", Optional[Sequence[RunRequest]]),\n ("skip_message", Optional[str]),\n ("cursor", Optional[str]),\n ("dagster_run_reactions", Optional[Sequence[DagsterRunReaction]]),\n ("captured_log_key", Optional[Sequence[str]]),\n (\n "dynamic_partitions_requests",\n Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ],\n ),\n (\n "asset_events",\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]],\n ),\n ],\n )\n):\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]]\n\n def __new__(\n cls,\n run_requests: Optional[Sequence[RunRequest]] = None,\n skip_message: Optional[str] = None,\n cursor: Optional[str] = None,\n dagster_run_reactions: Optional[Sequence[DagsterRunReaction]] = None,\n captured_log_key: Optional[Sequence[str]] = None,\n dynamic_partitions_requests: Optional[\n Sequence[Union[AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest]]\n ] = None,\n asset_events: Optional[\n Sequence[Union[AssetMaterialization, AssetObservation, AssetCheckEvaluation]]\n ] = None,\n ):\n check.opt_sequence_param(run_requests, "run_requests", RunRequest)\n check.opt_str_param(skip_message, "skip_message")\n check.opt_str_param(cursor, "cursor")\n check.opt_sequence_param(dagster_run_reactions, "dagster_run_reactions", DagsterRunReaction)\n check.opt_list_param(captured_log_key, "captured_log_key", str)\n check.opt_sequence_param(\n dynamic_partitions_requests,\n "dynamic_partitions_requests",\n (AddDynamicPartitionsRequest, DeleteDynamicPartitionsRequest),\n )\n check.opt_sequence_param(\n asset_events,\n "asset_events",\n (AssetMaterialization, AssetObservation, AssetCheckEvaluation),\n )\n check.invariant(\n not (run_requests and skip_message), "Found both skip data and run request data"\n )\n return 
super(SensorExecutionData, cls).__new__(\n cls,\n run_requests=run_requests,\n skip_message=skip_message,\n cursor=cursor,\n dagster_run_reactions=dagster_run_reactions,\n captured_log_key=captured_log_key,\n dynamic_partitions_requests=dynamic_partitions_requests,\n asset_events=asset_events or [],\n )\n\n\ndef wrap_sensor_evaluation(\n sensor_name: str,\n fn: RawSensorEvaluationFunction,\n) -> SensorEvaluationFunction:\n resource_arg_names: Set[str] = {arg.name for arg in get_resource_args(fn)}\n\n def _wrapped_fn(context: SensorEvaluationContext):\n resource_args_populated = validate_and_get_resource_dict(\n context.resources, sensor_name, resource_arg_names\n )\n\n context_param_name_if_present = get_context_param_name(fn)\n context_param = (\n {context_param_name_if_present: context} if context_param_name_if_present else {}\n )\n raw_evaluation_result = fn(**context_param, **resource_args_populated)\n\n def check_returned_scalar(scalar):\n if isinstance(scalar, (SkipReason, RunRequest, SensorResult)):\n return scalar\n elif scalar is not None:\n raise Exception(\n f"Error in sensor {sensor_name}: Sensor unexpectedly returned output "\n f"{scalar} of type {type(scalar)}. Should only return SkipReason or "\n "RunRequest objects."\n )\n\n if inspect.isgenerator(raw_evaluation_result):\n result = []\n try:\n while True:\n result.append(next(raw_evaluation_result))\n except StopIteration as e:\n # captures the case where the evaluation function has a yield and also returns a\n # value\n if e.value is not None:\n result.append(check_returned_scalar(e.value))\n\n return result\n elif isinstance(raw_evaluation_result, list):\n return raw_evaluation_result\n else:\n return [check_returned_scalar(raw_evaluation_result)]\n\n return _wrapped_fn\n\n\n
[docs]def build_sensor_context(\n instance: Optional[DagsterInstance] = None,\n cursor: Optional[str] = None,\n repository_name: Optional[str] = None,\n repository_def: Optional["RepositoryDefinition"] = None,\n sensor_name: Optional[str] = None,\n resources: Optional[Mapping[str, object]] = None,\n definitions: Optional["Definitions"] = None,\n instance_ref: Optional["InstanceRef"] = None,\n) -> SensorEvaluationContext:\n """Builds sensor execution context using the provided parameters.\n\n This function can be used to provide a context to the invocation of a sensor definition.If\n provided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\n error.\n\n Args:\n instance (Optional[DagsterInstance]): The dagster instance configured to run the sensor.\n cursor (Optional[str]): A cursor value to provide to the evaluation of the sensor.\n repository_name (Optional[str]): The name of the repository that the sensor belongs to.\n repository_def (Optional[RepositoryDefinition]): The repository that the sensor belongs to.\n If needed by the sensor top-level resource definitions will be pulled from this repository.\n You can provide either this or `definitions`.\n resources (Optional[Mapping[str, ResourceDefinition]]): A set of resource definitions\n to provide to the sensor. If passed, these will override any resource definitions\n provided by the repository.\n definitions (Optional[Definitions]): `Definitions` object that the sensor is defined in.\n If needed by the sensor, top-level resource definitions will be pulled from these\n definitions. You can provide either this or `repository_def`.\n\n Examples:\n .. 
code-block:: python\n\n context = build_sensor_context()\n my_sensor(context)\n\n """\n from dagster._core.definitions.definitions_class import Definitions\n from dagster._core.definitions.repository_definition import RepositoryDefinition\n from dagster._core.execution.build_resources import wrap_resources_for_execution\n\n check.opt_inst_param(instance, "instance", DagsterInstance)\n check.opt_str_param(cursor, "cursor")\n check.opt_str_param(repository_name, "repository_name")\n repository_def = normalize_to_repository(\n check.opt_inst_param(definitions, "definitions", Definitions),\n check.opt_inst_param(repository_def, "repository_def", RepositoryDefinition),\n error_on_none=False,\n )\n\n return SensorEvaluationContext(\n instance_ref=instance_ref,\n last_completion_time=None,\n last_run_key=None,\n cursor=cursor,\n repository_name=repository_name,\n instance=instance,\n repository_def=repository_def,\n sensor_name=sensor_name,\n resources=wrap_resources_for_execution(resources),\n )
\n\n\nT = TypeVar("T")\n\n\ndef get_sensor_context_from_args_or_kwargs(\n fn: Callable,\n args: Tuple[Any, ...],\n kwargs: Dict[str, Any],\n context_type: Type[T],\n) -> Optional[T]:\n from dagster._config.pythonic_config import is_coercible_to_resource\n\n context_param_name = get_context_param_name(fn)\n\n kwarg_keys_non_resource = set(kwargs.keys()) - {param.name for param in get_resource_args(fn)}\n if len(args) + len(kwarg_keys_non_resource) > 1:\n raise DagsterInvalidInvocationError(\n "Sensor invocation received multiple non-resource arguments. Only a first "\n "positional context parameter should be provided when invoking."\n )\n\n if any(is_coercible_to_resource(arg) for arg in args):\n raise DagsterInvalidInvocationError(\n "If directly invoking a sensor, you may not provide resources as"\n " positional"\n " arguments, only as keyword arguments."\n )\n\n context: Optional[T] = None\n\n if len(args) > 0:\n context = check.opt_inst(args[0], context_type)\n elif len(kwargs) > 0:\n if context_param_name and context_param_name not in kwargs:\n raise DagsterInvalidInvocationError(\n f"Sensor invocation expected argument '{context_param_name}'."\n )\n context = check.opt_inst(kwargs.get(context_param_name or "context"), context_type)\n elif context_param_name:\n # If the context parameter is present but no value was provided, we error\n raise DagsterInvalidInvocationError(\n "Sensor evaluation function expected context argument, but no context argument "\n "was provided when invoking."\n )\n\n return context\n\n\ndef get_or_create_sensor_context(\n fn: Callable,\n *args: Any,\n **kwargs: Any,\n) -> SensorEvaluationContext:\n """Based on the passed resource function and the arguments passed to it, returns the\n user-passed SensorEvaluationContext or creates one if it is not passed.\n\n Raises an exception if the user passes more than one argument or if the user-provided\n function requires a context parameter but none is passed.\n """\n context = (\n 
get_sensor_context_from_args_or_kwargs(\n fn,\n args,\n kwargs,\n context_type=SensorEvaluationContext,\n )\n or build_sensor_context()\n )\n resource_args_from_kwargs = {}\n\n resource_args = {param.name for param in get_resource_args(fn)}\n for resource_arg in resource_args:\n if resource_arg in kwargs:\n resource_args_from_kwargs[resource_arg] = kwargs[resource_arg]\n\n if resource_args_from_kwargs:\n return context.merge_resources(resource_args_from_kwargs)\n\n return context\n\n\ndef _run_requests_with_base_asset_jobs(\n run_requests: Iterable[RunRequest],\n context: SensorEvaluationContext,\n outer_asset_selection: AssetSelection,\n) -> Sequence[RunRequest]:\n """For sensors that target asset selections instead of jobs, finds the corresponding base asset\n for a selected set of assets.\n """\n asset_graph = context.repository_def.asset_graph # type: ignore # (possible none)\n result = []\n for run_request in run_requests:\n if run_request.asset_selection:\n asset_keys = run_request.asset_selection\n\n unexpected_asset_keys = (\n AssetSelection.keys(*asset_keys) - outer_asset_selection\n ).resolve(asset_graph)\n if unexpected_asset_keys:\n raise DagsterInvalidSubsetError(\n "RunRequest includes asset keys that are not part of sensor's asset_selection:"\n f" {unexpected_asset_keys}"\n )\n else:\n asset_keys = outer_asset_selection.resolve(asset_graph)\n\n base_job = context.repository_def.get_implicit_job_def_for_assets(asset_keys) # type: ignore # (possible none)\n result.append(\n run_request.with_replaced_attrs(\n job_name=base_job.name, asset_selection=list(asset_keys) # type: ignore # (possible none)\n )\n )\n\n return result\n
", "current_page_name": "_modules/dagster/_core/definitions/sensor_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.sensor_definition"}, "source_asset": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.source_asset

\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    cast,\n)\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param, public\nfrom dagster._core.decorator_utils import get_function_params\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    DataVersion,\n    DataVersionsByPartition,\n)\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataMapping,\n    normalize_metadata,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.resource_annotation import get_resource_args\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.resource_requirement import (\n    ResourceAddable,\n    ResourceRequirement,\n    SourceAssetIOManagerRequirement,\n    ensure_requirements_satisfied,\n    get_resource_key_conflicts,\n)\nfrom dagster._core.definitions.utils import (\n    DEFAULT_GROUP_NAME,\n    DEFAULT_IO_MANAGER_KEY,\n    validate_group_name,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidInvocationError,\n    DagsterInvalidObservationError,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n    )\nfrom dagster._core.storage.io_manager import IOManagerDefinition\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import disable_dagster_warnings\n\n# Going with this catch-all for the time-being to permit pythonic resources\nSourceAssetObserveFunction: TypeAlias = Callable[..., Any]\n\n\ndef 
wrap_source_asset_observe_fn_in_op_compute_fn(\n    source_asset: "SourceAsset",\n) -> "DecoratedOpFunction":\n    from dagster._core.definitions.decorators.op_decorator import (\n        DecoratedOpFunction,\n        is_context_provided,\n    )\n    from dagster._core.execution.context.compute import (\n        OpExecutionContext,\n    )\n\n    check.not_none(source_asset.observe_fn, "Must be an observable source asset")\n    assert source_asset.observe_fn  # for type checker\n\n    observe_fn = source_asset.observe_fn\n\n    observe_fn_has_context = is_context_provided(get_function_params(observe_fn))\n\n    def fn(context: OpExecutionContext):\n        resource_kwarg_keys = [param.name for param in get_resource_args(observe_fn)]\n        resource_kwargs = {key: getattr(context.resources, key) for key in resource_kwarg_keys}\n        observe_fn_return_value = (\n            observe_fn(context, **resource_kwargs)\n            if observe_fn_has_context\n            else observe_fn(**resource_kwargs)\n        )\n\n        if isinstance(observe_fn_return_value, DataVersion):\n            if source_asset.partitions_def is not None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is partitioned, so its observe function should return a"\n                    " DataVersionsByPartition, not a DataVersion"\n                )\n\n            context.log_event(\n                AssetObservation(\n                    asset_key=source_asset.key,\n                    tags={DATA_VERSION_TAG: observe_fn_return_value.value},\n                )\n            )\n        elif isinstance(observe_fn_return_value, DataVersionsByPartition):\n            if source_asset.partitions_def is None:\n                raise DagsterInvalidObservationError(\n                    f"{source_asset.key} is not partitioned, so its observe function should return"\n                    " a DataVersion, not a DataVersionsByPartition"\n                )\n\n        
    for (\n                partition_key,\n                data_version,\n            ) in observe_fn_return_value.data_versions_by_partition.items():\n                context.log_event(\n                    AssetObservation(\n                        asset_key=source_asset.key,\n                        tags={DATA_VERSION_TAG: data_version.value},\n                        partition=partition_key,\n                    )\n                )\n        else:\n            raise DagsterInvalidObservationError(\n                f"Observe function for {source_asset.key} must return a DataVersion or"\n                " DataVersionsByPartition, but returned a value of type"\n                f" {type(observe_fn_return_value)}"\n            )\n\n    return DecoratedOpFunction(fn)\n\n\n
[docs]@experimental_param(param="resource_defs")\n@experimental_param(param="io_manager_def")\nclass SourceAsset(ResourceAddable):\n """A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.\n\n Attributes:\n key (Union[AssetKey, Sequence[str], str]): The key of the asset.\n metadata (Mapping[str, MetadataValue]): Metadata associated with the asset.\n io_manager_key (Optional[str]): The key for the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n io_manager_def (Optional[IOManagerDefinition]): (Experimental) The definition of the IOManager that will be used to load the contents of\n the asset when it's used as an input to other assets inside a job.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]): (Experimental) resource definitions that may be required by the :py:class:`dagster.IOManagerDefinition` provided in the `io_manager_def` argument.\n description (Optional[str]): The description of the asset.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n observe_fn (Optional[SourceAssetObserveFunction]) Observation function for the source asset.\n """\n\n key: PublicAttr[AssetKey]\n metadata: PublicAttr[MetadataMapping]\n raw_metadata: PublicAttr[ArbitraryMetadataMapping]\n io_manager_key: PublicAttr[Optional[str]]\n _io_manager_def: PublicAttr[Optional[IOManagerDefinition]]\n description: PublicAttr[Optional[str]]\n partitions_def: PublicAttr[Optional[PartitionsDefinition]]\n group_name: PublicAttr[str]\n resource_defs: PublicAttr[Dict[str, ResourceDefinition]]\n observe_fn: PublicAttr[Optional[SourceAssetObserveFunction]]\n _node_def: Optional[OpDefinition] # computed lazily\n auto_observe_interval_minutes: Optional[float]\n\n def __init__(\n self,\n key: CoercibleToAssetKey,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n io_manager_key: Optional[str] = None,\n io_manager_def: 
Optional[object] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n group_name: Optional[str] = None,\n resource_defs: Optional[Mapping[str, object]] = None,\n observe_fn: Optional[SourceAssetObserveFunction] = None,\n *,\n auto_observe_interval_minutes: Optional[float] = None,\n # This is currently private because it is necessary for source asset observation functions,\n # but we have not yet decided on a final API for associated one or more ops with a source\n # asset. If we were to make this public, then we would have a canonical public\n # `required_resource_keys` used for observation that might end up conflicting with a set of\n # required resource keys for a different operation.\n _required_resource_keys: Optional[AbstractSet[str]] = None,\n # Add additional fields to with_resources and with_group below\n ):\n from dagster._core.execution.build_resources import (\n wrap_resources_for_execution,\n )\n\n self.key = AssetKey.from_coercible(key)\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n self.raw_metadata = metadata\n self.metadata = normalize_metadata(metadata, allow_invalid=True)\n\n resource_defs_dict = dict(check.opt_mapping_param(resource_defs, "resource_defs"))\n if io_manager_def:\n if not io_manager_key:\n io_manager_key = self.key.to_python_identifier("io_manager")\n\n if (\n io_manager_key in resource_defs_dict\n and resource_defs_dict[io_manager_key] != io_manager_def\n ):\n raise DagsterInvalidDefinitionError(\n f"Provided conflicting definitions for io manager key '{io_manager_key}'."\n " Please provide only one definition per key."\n )\n\n resource_defs_dict[io_manager_key] = io_manager_def\n\n self.resource_defs = wrap_resources_for_execution(resource_defs_dict)\n\n self.io_manager_key = check.opt_str_param(io_manager_key, "io_manager_key")\n self.partitions_def = check.opt_inst_param(\n partitions_def, "partitions_def", PartitionsDefinition\n )\n self.group_name = 
validate_group_name(group_name)\n self.description = check.opt_str_param(description, "description")\n self.observe_fn = check.opt_callable_param(observe_fn, "observe_fn")\n self._required_resource_keys = check.opt_set_param(\n _required_resource_keys, "_required_resource_keys", of_type=str\n )\n self._node_def = None\n self.auto_observe_interval_minutes = check.opt_numeric_param(\n auto_observe_interval_minutes, "auto_observe_interval_minutes"\n )\n\n def get_io_manager_key(self) -> str:\n return self.io_manager_key or DEFAULT_IO_MANAGER_KEY\n\n @property\n def io_manager_def(self) -> Optional[IOManagerDefinition]:\n io_manager_key = self.get_io_manager_key()\n return cast(\n Optional[IOManagerDefinition],\n self.resource_defs.get(io_manager_key) if io_manager_key else None,\n )\n\n @public\n @property\n def op(self) -> OpDefinition:\n """OpDefinition: The OpDefinition associated with the observation function of an observable\n source asset.\n\n Throws an error if the asset is not observable.\n """\n check.invariant(\n isinstance(self.node_def, OpDefinition),\n "The NodeDefinition for this AssetsDefinition is not of type OpDefinition.",\n )\n return cast(OpDefinition, self.node_def)\n\n @public\n @property\n def is_observable(self) -> bool:\n """bool: Whether the asset is observable."""\n return self.node_def is not None\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return {requirement.key for requirement in self.get_resource_requirements()}\n\n @property\n def node_def(self) -> Optional[OpDefinition]:\n """Op that generates observation metadata for a source asset."""\n if self.observe_fn is None:\n return None\n\n if self._node_def is None:\n self._node_def = OpDefinition(\n compute_fn=wrap_source_asset_observe_fn_in_op_compute_fn(self),\n name=self.key.to_python_identifier(),\n description=self.description,\n required_resource_keys=self._required_resource_keys,\n )\n return self._node_def\n\n def with_resources(self, resource_defs) -> 
"SourceAsset":\n from dagster._core.execution.resources_init import get_transitive_required_resource_keys\n\n overlapping_keys = get_resource_key_conflicts(self.resource_defs, resource_defs)\n if overlapping_keys:\n raise DagsterInvalidInvocationError(\n f"SourceAsset with key {self.key} has conflicting resource "\n "definitions with provided resources for the following keys: "\n f"{sorted(list(overlapping_keys))}. Either remove the existing "\n "resources from the asset or change the resource keys so that "\n "they don't overlap."\n )\n\n merged_resource_defs = merge_dicts(resource_defs, self.resource_defs)\n\n # Ensure top-level resource requirements are met - except for\n # io_manager, since that is a default it can be resolved later.\n ensure_requirements_satisfied(merged_resource_defs, list(self.get_resource_requirements()))\n\n io_manager_def = merged_resource_defs.get(self.get_io_manager_key())\n if not io_manager_def and self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY:\n raise DagsterInvalidDefinitionError(\n f"SourceAsset with asset key {self.key} requires IO manager with key"\n f" '{self.get_io_manager_key()}', but none was provided."\n )\n relevant_keys = get_transitive_required_resource_keys(\n {*self._required_resource_keys, self.get_io_manager_key()}, merged_resource_defs\n )\n\n relevant_resource_defs = {\n key: resource_def\n for key, resource_def in merged_resource_defs.items()\n if key in relevant_keys\n }\n\n io_manager_key = (\n self.get_io_manager_key()\n if self.get_io_manager_key() != DEFAULT_IO_MANAGER_KEY\n else None\n )\n with disable_dagster_warnings():\n return SourceAsset(\n key=self.key,\n io_manager_key=io_manager_key,\n description=self.description,\n partitions_def=self.partitions_def,\n metadata=self.raw_metadata,\n resource_defs=relevant_resource_defs,\n group_name=self.group_name,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n 
_required_resource_keys=self._required_resource_keys,\n )\n\n def with_attributes(\n self, group_name: Optional[str] = None, key: Optional[AssetKey] = None\n ) -> "SourceAsset":\n if group_name is not None and self.group_name != DEFAULT_GROUP_NAME:\n raise DagsterInvalidDefinitionError(\n "A group name has already been provided to source asset"\n f" {self.key.to_user_string()}"\n )\n\n with disable_dagster_warnings():\n return SourceAsset(\n key=key or self.key,\n metadata=self.raw_metadata,\n io_manager_key=self.io_manager_key,\n io_manager_def=self.io_manager_def,\n description=self.description,\n partitions_def=self.partitions_def,\n group_name=group_name,\n resource_defs=self.resource_defs,\n observe_fn=self.observe_fn,\n auto_observe_interval_minutes=self.auto_observe_interval_minutes,\n _required_resource_keys=self._required_resource_keys,\n )\n\n def get_resource_requirements(self) -> Iterator[ResourceRequirement]:\n if self.node_def is not None:\n yield from self.node_def.get_resource_requirements()\n yield SourceAssetIOManagerRequirement(\n key=self.get_io_manager_key(), asset_key=self.key.to_string()\n )\n for source_key, resource_def in self.resource_defs.items():\n yield from resource_def.get_resource_requirements(outer_context=source_key)\n\n def __eq__(self, other: object) -> bool:\n if not isinstance(other, SourceAsset):\n return False\n else:\n return (\n self.key == other.key\n and self.raw_metadata == other.raw_metadata\n and self.io_manager_key == other.io_manager_key\n and self.description == other.description\n and self.group_name == other.group_name\n and self.resource_defs == other.resource_defs\n and self.observe_fn == other.observe_fn\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/source_asset", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.source_asset"}, "step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.step_launcher

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator, Mapping, NamedTuple, Optional\n\nimport dagster._check as check\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.state import KnownExecutionState\n\n\n
[docs]class StepRunRef(\n NamedTuple(\n "_StepRunRef",\n [\n ("run_config", Mapping[str, object]),\n ("dagster_run", DagsterRun),\n ("run_id", str),\n ("retry_mode", RetryMode),\n ("step_key", str),\n ("recon_job", ReconstructableJob),\n ("known_state", Optional["KnownExecutionState"]),\n ],\n )\n):\n """A serializable object that specifies what's needed to hydrate a step so\n that it can be executed in a process outside the plan process.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n run_config: Mapping[str, object],\n dagster_run: DagsterRun,\n run_id: str,\n retry_mode: RetryMode,\n step_key: str,\n recon_job: ReconstructableJob,\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.plan.state import KnownExecutionState\n\n return super(StepRunRef, cls).__new__(\n cls,\n check.mapping_param(run_config, "run_config", key_type=str),\n check.inst_param(dagster_run, "dagster_run", DagsterRun),\n check.str_param(run_id, "run_id"),\n check.inst_param(retry_mode, "retry_mode", RetryMode),\n check.str_param(step_key, "step_key"),\n check.inst_param(recon_job, "recon_job", ReconstructableJob),\n check.opt_inst_param(known_state, "known_state", KnownExecutionState),\n )
\n\n\n
[docs]class StepLauncher(ABC):\n """A StepLauncher is responsible for executing steps, either in-process or in an external process."""\n\n @abstractmethod\n def launch_step(self, step_context: "StepExecutionContext") -> Iterator["DagsterEvent"]:\n """Args:\n step_context (StepExecutionContext): The context that we're executing the step in.\n\n Returns:\n Iterator[DagsterEvent]: The events for the step.\n """
\n
", "current_page_name": "_modules/dagster/_core/definitions/step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.step_launcher"}, "time_window_partition_mapping": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partition_mapping

\nfrom datetime import datetime\nfrom typing import NamedTuple, Optional, cast\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, experimental_param\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_mapping import PartitionMapping, UpstreamPartitionsResult\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    TimeWindowPartitionsSubset,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._serdes import whitelist_for_serdes\n\n\n
[docs]@whitelist_for_serdes\n@experimental_param(param="allow_nonexistent_upstream_partitions")\nclass TimeWindowPartitionMapping(\n PartitionMapping,\n NamedTuple(\n "_TimeWindowPartitionMapping",\n [\n ("start_offset", PublicAttr[int]),\n ("end_offset", PublicAttr[int]),\n ("allow_nonexistent_upstream_partitions", PublicAttr[bool]),\n ],\n ),\n):\n """The default mapping between two TimeWindowPartitionsDefinitions.\n\n A partition in the downstream partitions definition is mapped to all partitions in the upstream\n asset whose time windows overlap it.\n\n This means that, if the upstream and downstream partitions definitions share the same time\n period, then this mapping is essentially the identity partition mapping - plus conversion of\n datetime formats.\n\n If the upstream time period is coarser than the downstream time period, then each partition in\n the downstream asset will map to a single (larger) upstream partition. E.g. if the downstream is\n hourly and the upstream is daily, then each hourly partition in the downstream will map to the\n daily partition in the upstream that contains that hour.\n\n If the upstream time period is finer than the downstream time period, then each partition in the\n downstream asset will map to multiple upstream partitions. E.g. if the downstream is daily and\n the upstream is hourly, then each daily partition in the downstream asset will map to the 24\n hourly partitions in the upstream that occur on that day.\n\n Attributes:\n start_offset (int): If not 0, then the starts of the upstream windows are shifted by this\n offset relative to the starts of the downstream windows. For example, if start_offset=-1\n and end_offset=0, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-03" and "2022-07-04". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. 
Defaults to 0.\n end_offset (int): If not 0, then the ends of the upstream windows are shifted by this\n offset relative to the ends of the downstream windows. For example, if start_offset=0\n and end_offset=1, then the downstream partition "2022-07-04" would map to the upstream\n partitions "2022-07-04" and "2022-07-05". Only permitted to be non-zero when the\n upstream and downstream PartitionsDefinitions are the same. Defaults to 0.\n allow_nonexistent_upstream_partitions (bool): Defaults to false. If true, does not\n raise an error when mapped upstream partitions fall outside the start-end time window of the\n partitions def. For example, if the upstream partitions def starts on "2023-01-01" but\n the downstream starts on "2022-01-01", setting this bool to true would return no\n partition keys when get_upstream_partitions_for_partitions is called with "2022-06-01".\n When set to false, would raise an error.\n\n Examples:\n .. code-block:: python\n\n from dagster import DailyPartitionsDefinition, TimeWindowPartitionMapping, AssetIn, asset\n\n partitions_def = DailyPartitionsDefinition(start_date="2020-01-01")\n\n @asset(partitions_def=partitions_def)\n def asset1():\n ...\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "asset1": AssetIn(\n partition_mapping=TimeWindowPartitionMapping(start_offset=-1)\n )\n }\n )\n def asset2(asset1):\n ...\n """\n\n def __new__(\n cls,\n start_offset: int = 0,\n end_offset: int = 0,\n allow_nonexistent_upstream_partitions: bool = False,\n ):\n return super(TimeWindowPartitionMapping, cls).__new__(\n cls,\n start_offset=check.int_param(start_offset, "start_offset"),\n end_offset=check.int_param(end_offset, "end_offset"),\n allow_nonexistent_upstream_partitions=check.bool_param(\n allow_nonexistent_upstream_partitions,\n "allow_nonexistent_upstream_partitions",\n ),\n )\n\n def get_upstream_mapped_partitions_result_for_partitions(\n self,\n downstream_partitions_subset: Optional[PartitionsSubset],\n upstream_partitions_def: 
PartitionsDefinition,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> UpstreamPartitionsResult:\n if not isinstance(downstream_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("downstream_partitions_subset must be a TimeWindowPartitionsSubset")\n\n return self._map_partitions(\n downstream_partitions_subset.partitions_def,\n upstream_partitions_def,\n downstream_partitions_subset,\n start_offset=self.start_offset,\n end_offset=self.end_offset,\n current_time=current_time,\n )\n\n def get_downstream_partitions_for_partitions(\n self,\n upstream_partitions_subset: PartitionsSubset,\n downstream_partitions_def: Optional[PartitionsDefinition],\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> PartitionsSubset:\n """Returns the partitions in the downstream asset that map to the given upstream partitions.\n\n Filters for partitions that exist at the given current_time, fetching the current time\n if not provided.\n """\n return self._map_partitions(\n upstream_partitions_subset.partitions_def,\n downstream_partitions_def,\n upstream_partitions_subset,\n end_offset=-self.start_offset,\n start_offset=-self.end_offset,\n current_time=current_time,\n ).partitions_subset\n\n def _map_partitions(\n self,\n from_partitions_def: PartitionsDefinition,\n to_partitions_def: Optional[PartitionsDefinition],\n from_partitions_subset: PartitionsSubset,\n start_offset: int,\n end_offset: int,\n current_time: Optional[datetime] = None,\n ) -> UpstreamPartitionsResult:\n """Maps the partitions in from_partitions_subset to partitions in to_partitions_def.\n\n If partitions in from_partitions_subset represent time windows that do not exist in\n to_partitions_def, raises an error if raise_error_on_invalid_mapped_partition is True.\n Otherwise, filters out the partitions that do not exist in to_partitions_def and returns\n the filtered 
subset, also returning a bool indicating whether there were mapped time windows\n that did not exist in to_partitions_def.\n """\n if not isinstance(from_partitions_subset, TimeWindowPartitionsSubset):\n check.failed("from_partitions_subset must be a TimeWindowPartitionsSubset")\n\n if not isinstance(from_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("from_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if not isinstance(to_partitions_def, TimeWindowPartitionsDefinition):\n check.failed("to_partitions_def must be a TimeWindowPartitionsDefinition")\n\n if (start_offset != 0 or end_offset != 0) and (\n from_partitions_def.cron_schedule != to_partitions_def.cron_schedule\n ):\n raise DagsterInvalidDefinitionError(\n "Can't use the start_offset or end_offset parameters of"\n " TimeWindowPartitionMapping when the cron schedule of the upstream"\n " PartitionsDefinition is different than the cron schedule of the downstream"\n f" one. Attempted to map from cron schedule '{from_partitions_def.cron_schedule}' "\n f"to cron schedule '{to_partitions_def.cron_schedule}'."\n )\n\n if to_partitions_def.timezone != from_partitions_def.timezone:\n raise DagsterInvalidDefinitionError("Timezones don't match")\n\n # skip fancy mapping logic in the simple case\n if from_partitions_def == to_partitions_def and start_offset == 0 and end_offset == 0:\n return UpstreamPartitionsResult(from_partitions_subset, [])\n\n time_windows = []\n for from_partition_time_window in from_partitions_subset.included_time_windows:\n from_start_dt, from_end_dt = from_partition_time_window\n offsetted_start_dt = _offsetted_datetime(\n from_partitions_def, from_start_dt, start_offset\n )\n offsetted_end_dt = _offsetted_datetime(from_partitions_def, from_end_dt, end_offset)\n\n to_start_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_start_dt.timestamp(), end_closed=False\n )\n if offsetted_start_dt is not None\n else None\n )\n 
to_end_partition_key = (\n to_partitions_def.get_partition_key_for_timestamp(\n offsetted_end_dt.timestamp(), end_closed=True\n )\n if offsetted_end_dt is not None\n else None\n )\n\n if to_start_partition_key is not None or to_end_partition_key is not None:\n window_start = (\n to_partitions_def.start_time_for_partition_key(to_start_partition_key)\n if to_start_partition_key\n else cast(TimeWindow, to_partitions_def.get_first_partition_window()).start\n )\n window_end = (\n to_partitions_def.end_time_for_partition_key(to_end_partition_key)\n if to_end_partition_key\n else cast(TimeWindow, to_partitions_def.get_last_partition_window()).end\n )\n\n if window_start < window_end:\n time_windows.append(TimeWindow(window_start, window_end))\n\n first_window = to_partitions_def.get_first_partition_window(current_time=current_time)\n last_window = to_partitions_def.get_last_partition_window(current_time=current_time)\n\n filtered_time_windows = []\n required_but_nonexistent_partition_keys = set()\n\n for time_window in time_windows:\n if (\n first_window\n and last_window\n and time_window.start <= last_window.start\n and time_window.end >= first_window.end\n ):\n window_start = max(time_window.start, first_window.start)\n window_end = min(time_window.end, last_window.end)\n filtered_time_windows.append(TimeWindow(window_start, window_end))\n\n if self.allow_nonexistent_upstream_partitions:\n # If allowed to have nonexistent upstream partitions, do not consider\n # out of range partitions to be invalid\n continue\n else:\n invalid_time_window = None\n if not (first_window and last_window) or (\n time_window.start < first_window.start and time_window.end > last_window.end\n ):\n invalid_time_window = time_window\n elif time_window.start < first_window.start:\n invalid_time_window = TimeWindow(\n time_window.start, min(time_window.end, first_window.start)\n )\n elif time_window.end > last_window.end:\n invalid_time_window = TimeWindow(\n max(time_window.start, 
last_window.end), time_window.end\n )\n\n if invalid_time_window:\n required_but_nonexistent_partition_keys.update(\n set(\n to_partitions_def.get_partition_keys_in_time_window(\n time_window=invalid_time_window\n )\n )\n )\n\n return UpstreamPartitionsResult(\n TimeWindowPartitionsSubset(\n to_partitions_def,\n num_partitions=sum(\n len(to_partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in filtered_time_windows\n ),\n included_time_windows=filtered_time_windows,\n ),\n sorted(list(required_but_nonexistent_partition_keys)),\n )
\n\n\ndef _offsetted_datetime(\n partitions_def: TimeWindowPartitionsDefinition, dt: datetime, offset: int\n) -> Optional[datetime]:\n for _ in range(abs(offset)):\n if offset < 0:\n prev_window = partitions_def.get_prev_partition_window(dt)\n if prev_window is None:\n return None\n\n dt = prev_window.start\n else:\n # TODO: what if we're at the end of the line?\n next_window = partitions_def.get_next_partition_window(dt)\n if next_window is None:\n return None\n\n dt = next_window.end\n\n return dt\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partition_mapping", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partition_mapping"}, "time_window_partitions": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.time_window_partitions

\nimport functools\nimport hashlib\nimport json\nimport re\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    FrozenSet,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport pendulum\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.instance import DynamicPartitionsStore\nfrom dagster._utils.partitions import DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\nfrom dagster._utils.schedules import (\n    cron_string_iterator,\n    is_valid_cron_schedule,\n    reverse_cron_string_iterator,\n)\n\nfrom ..errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidDeserializationVersionError,\n)\nfrom .partition import (\n    DEFAULT_DATE_FORMAT,\n    PartitionedConfig,\n    PartitionsDefinition,\n    PartitionsSubset,\n    ScheduleType,\n    cron_schedule_from_schedule_type_and_offsets,\n)\nfrom .partition_key_range import PartitionKeyRange\n\n\n
[docs]class TimeWindow(NamedTuple):\n """An interval that is closed at the start and open at the end.\n\n Attributes:\n start (datetime): A pendulum datetime that marks the start of the window.\n end (datetime): A pendulum datetime that marks the end of the window.\n """\n\n start: PublicAttr[datetime]\n end: PublicAttr[datetime]
\n\n\n
[docs]class TimeWindowPartitionsDefinition(\n PartitionsDefinition,\n NamedTuple(\n "_TimeWindowPartitionsDefinition",\n [\n ("start", PublicAttr[datetime]),\n ("timezone", PublicAttr[str]),\n ("end", PublicAttr[Optional[datetime]]),\n ("fmt", PublicAttr[str]),\n ("end_offset", PublicAttr[int]),\n ("cron_schedule", PublicAttr[str]),\n ],\n ),\n):\n r"""A set of partitions where each partitions corresponds to a time window.\n\n The provided cron_schedule determines the bounds of the time windows. E.g. a cron_schedule of\n "0 0 \\\\* \\\\* \\\\*" will result in daily partitions that start at midnight and end at midnight of the\n following day.\n\n The string partition_key associated with each partition corresponds to the start of the\n partition's time window.\n\n The first partition in the set will start on at the first cron_schedule tick that is equal to\n or after the given start datetime. The last partition in the set will end before the current\n time, unless the end_offset argument is set to a positive number.\n\n Args:\n cron_schedule (str): Determines the bounds of the time windows.\n start (datetime): The first partition in the set will start on at the first cron_schedule\n tick that is equal to or after this value.\n timezone (Optional[str]): The timezone in which each time should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end (datetime): The last partition (excluding) in the set.\n fmt (str): The date format to use for partition_keys.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n """\n\n def __new__(\n cls,\n start: Union[datetime, str],\n fmt: str,\n end: Union[datetime, str, None] = None,\n schedule_type: Optional[ScheduleType] = None,\n timezone: Optional[str] = None,\n end_offset: int = 0,\n minute_offset: Optional[int] = None,\n hour_offset: Optional[int] = None,\n day_offset: Optional[int] = None,\n cron_schedule: Optional[str] = None,\n ):\n check.opt_str_param(timezone, "timezone")\n timezone = timezone or "UTC"\n\n if isinstance(start, datetime):\n start_dt = pendulum.instance(start, tz=timezone)\n else:\n start_dt = pendulum.instance(datetime.strptime(start, fmt), tz=timezone)\n\n if not end:\n end_dt = None\n elif isinstance(end, datetime):\n end_dt = pendulum.instance(end, tz=timezone)\n else:\n end_dt = pendulum.instance(datetime.strptime(end, fmt), tz=timezone)\n\n if cron_schedule is not None:\n check.invariant(\n schedule_type is None and not minute_offset and not hour_offset and not day_offset,\n "If cron_schedule argument is provided, then schedule_type, minute_offset, "\n "hour_offset, and day_offset can't also be provided",\n )\n else:\n if schedule_type is None:\n check.failed("One of schedule_type and cron_schedule must be provided")\n\n cron_schedule = cron_schedule_from_schedule_type_and_offsets(\n schedule_type=schedule_type,\n minute_offset=minute_offset or 0,\n hour_offset=hour_offset or 0,\n day_offset=day_offset or 0,\n )\n\n if not is_valid_cron_schedule(cron_schedule):\n raise DagsterInvalidDefinitionError(\n f"Found invalid cron schedule '{cron_schedule}' for a"\n " TimeWindowPartitionsDefinition."\n )\n\n return super(TimeWindowPartitionsDefinition, cls).__new__(\n cls, start_dt, timezone, end_dt, fmt, end_offset, cron_schedule\n )\n\n def get_current_timestamp(self, current_time: Optional[datetime] = None) -> float:\n return (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else 
pendulum.now(self.timezone)\n ).timestamp()\n\n def get_num_partitions(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> int:\n # Method added for performance reasons.\n # Fetching partition keys requires significantly more compute time to\n # string format datetimes.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n\n num_partitions = 0\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n num_partitions += 1\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n num_partitions += self.end_offset\n\n return num_partitions\n\n def get_partition_keys_between_indexes(\n self, start_idx: int, end_idx: int, current_time: Optional[datetime] = None\n ) -> List[str]:\n # Fetches the partition keys between the given start and end indices.\n # Start index is inclusive, end index is exclusive.\n # Method added for performance reasons, to only string format\n # partition keys included within the indices.\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys = []\n reached_end = False\n\n for idx, time_window in enumerate(self._iterate_time_windows(self.start)):\n if time_window.end.timestamp() >= current_timestamp:\n reached_end = True\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n reached_end = True\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n if idx >= start_idx and idx < end_idx:\n partition_keys.append(time_window.start.strftime(self.fmt))\n if time_window.end.timestamp() > 
current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n if len(partition_keys) >= end_idx - start_idx:\n break\n\n if reached_end and self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def get_partition_keys(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n current_timestamp = self.get_current_timestamp(current_time=current_time)\n\n partitions_past_current_time = 0\n partition_keys: List[str] = []\n for time_window in self._iterate_time_windows(self.start):\n if self.end and time_window.end.timestamp() > self.end.timestamp():\n break\n if (\n time_window.end.timestamp() <= current_timestamp\n or partitions_past_current_time < self.end_offset\n ):\n partition_keys.append(time_window.start.strftime(self.fmt))\n\n if time_window.end.timestamp() > current_timestamp:\n partitions_past_current_time += 1\n else:\n break\n\n if self.end_offset < 0:\n partition_keys = partition_keys[: self.end_offset]\n\n return partition_keys\n\n def _get_validated_time_window_for_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n """Returns a TimeWindow for the given partition key if it is valid, otherwise returns None."""\n try:\n time_window = self.time_window_for_partition_key(partition_key)\n except ValueError:\n return None\n\n first_partition_window = self.get_first_partition_window(current_time=current_time)\n last_partition_window = self.get_last_partition_window(current_time=current_time)\n if (\n first_partition_window is None\n or last_partition_window is None\n or time_window.start < first_partition_window.start\n or time_window.start > last_partition_window.start\n or time_window.start.strftime(self.fmt) != partition_key\n ):\n return None\n\n return time_window\n\n def __str__(self) -> str:\n schedule_str = (\n self.schedule_type.value.capitalize() 
if self.schedule_type else self.cron_schedule\n )\n partition_def_str = (\n f"{schedule_str}, starting {self.start.strftime(self.fmt)} {self.timezone}."\n )\n if self.end_offset != 0:\n partition_def_str += (\n " End offsetted by"\n f" {self.end_offset} partition{'' if self.end_offset == 1 else 's'}."\n )\n return partition_def_str\n\n def __repr__(self):\n # Between python 3.8 and 3.9 the repr of a datetime object changed.\n # Replaces start time with timestamp as a workaround to make sure the repr is consistent across versions.\n return (\n f"TimeWindowPartitionsDefinition(start={self.start.timestamp()},"\n f" timezone='{self.timezone}', fmt='{self.fmt}', end_offset={self.end_offset},"\n f" cron_schedule='{self.cron_schedule}')"\n )\n\n def __hash__(self):\n return hash(tuple(self.__repr__()))\n\n @functools.lru_cache(maxsize=100)\n def _time_window_for_partition_key(self, *, partition_key: str) -> TimeWindow:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return next(iter(self._iterate_time_windows(partition_key_dt)))\n\n def time_window_for_partition_key(self, partition_key: str) -> TimeWindow:\n return self._time_window_for_partition_key(partition_key=partition_key)\n\n @functools.lru_cache(maxsize=5)\n def time_windows_for_partition_keys(\n self,\n partition_keys: FrozenSet[str],\n validate: bool = True,\n ) -> Sequence[TimeWindow]:\n if len(partition_keys) == 0:\n return []\n\n sorted_pks = sorted(partition_keys, key=lambda pk: datetime.strptime(pk, self.fmt))\n cur_windows_iterator = iter(\n self._iterate_time_windows(\n pendulum.instance(datetime.strptime(sorted_pks[0], self.fmt), tz=self.timezone)\n )\n )\n partition_key_time_windows: List[TimeWindow] = []\n for partition_key in sorted_pks:\n next_window = next(cur_windows_iterator)\n if next_window.start.strftime(self.fmt) == partition_key:\n partition_key_time_windows.append(next_window)\n else:\n cur_windows_iterator = iter(\n 
self._iterate_time_windows(\n pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n )\n )\n partition_key_time_windows.append(next(cur_windows_iterator))\n\n if validate:\n start_time_window = self.get_first_partition_window()\n end_time_window = self.get_last_partition_window()\n\n if start_time_window is None or end_time_window is None:\n check.failed("No partitions in the PartitionsDefinition")\n\n start_timestamp = start_time_window.start.timestamp()\n end_timestamp = end_time_window.end.timestamp()\n\n partition_key_time_windows = [\n tw\n for tw in partition_key_time_windows\n if tw.start.timestamp() >= start_timestamp and tw.end.timestamp() <= end_timestamp\n ]\n return partition_key_time_windows\n\n def start_time_for_partition_key(self, partition_key: str) -> datetime:\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n # the datetime format might not include granular components, so we need to recover them\n # we make the assumption that the parsed partition key is <= the start datetime\n return next(iter(self._iterate_time_windows(partition_key_dt))).start\n\n def get_next_partition_key(\n self, partition_key: str, current_time: Optional[datetime] = None\n ) -> Optional[str]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is None:\n return None\n\n partition_key_dt = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n windows_iter = iter(self._iterate_time_windows(partition_key_dt))\n next(windows_iter)\n start_time = next(windows_iter).start\n if start_time >= last_partition_window.end:\n return None\n else:\n return start_time.strftime(self.fmt)\n\n def get_next_partition_window(\n self, end_dt: datetime, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n last_partition_window = self.get_last_partition_window(current_time)\n if last_partition_window is 
None:\n return None\n\n windows_iter = iter(self._iterate_time_windows(end_dt))\n next_window = next(windows_iter)\n if next_window.start >= last_partition_window.end:\n return None\n else:\n return next_window\n\n def get_prev_partition_window(self, start_dt: datetime) -> Optional[TimeWindow]:\n windows_iter = iter(self._reverse_iterate_time_windows(start_dt))\n prev_window = next(windows_iter)\n first_partition_window = self.get_first_partition_window()\n if first_partition_window is None or prev_window.start < first_partition_window.start:\n return None\n else:\n return prev_window\n\n @functools.lru_cache(maxsize=5)\n def _get_first_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n current_timestamp = current_time.timestamp()\n\n time_window = next(iter(self._iterate_time_windows(self.start)))\n\n if self.end_offset == 0:\n return time_window if time_window.end.timestamp() <= current_timestamp else None\n elif self.end_offset > 0:\n iterator = iter(self._iterate_time_windows(current_time))\n # first returned time window is time window of current time\n curr_window_plus_offset = next(iterator)\n for _ in range(self.end_offset):\n curr_window_plus_offset = next(iterator)\n return (\n time_window\n if time_window.end.timestamp() <= curr_window_plus_offset.start.timestamp()\n else None\n )\n else:\n # end offset < 0\n end_window = None\n iterator = iter(self._reverse_iterate_time_windows(current_time))\n for _ in range(abs(self.end_offset)):\n end_window = next(iterator)\n\n if end_window is None:\n check.failed("end_window should not be None")\n\n return (\n time_window if time_window.end.timestamp() <= end_window.start.timestamp() else None\n )\n\n def get_first_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return 
self._get_first_partition_window(current_time=current_time)\n\n @functools.lru_cache(maxsize=5)\n def _get_last_partition_window(self, *, current_time: datetime) -> Optional[TimeWindow]:\n if self.get_first_partition_window(current_time) is None:\n return None\n\n current_time = (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n )\n\n if self.end and self.end < current_time:\n current_time = self.end\n\n if self.end_offset == 0:\n return next(iter(self._reverse_iterate_time_windows(current_time)))\n else:\n # TODO: make this efficient\n last_partition_key = super().get_last_partition_key(current_time)\n return (\n self.time_window_for_partition_key(last_partition_key)\n if last_partition_key\n else None\n )\n\n def get_last_partition_window(\n self, current_time: Optional[datetime] = None\n ) -> Optional[TimeWindow]:\n current_time = cast(\n datetime,\n (\n pendulum.instance(current_time, tz=self.timezone)\n if current_time\n else pendulum.now(self.timezone)\n ),\n )\n return self._get_last_partition_window(current_time=current_time)\n\n def get_first_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n first_window = self.get_first_partition_window(current_time)\n if first_window is None:\n return None\n\n return first_window.start.strftime(self.fmt)\n\n def get_last_partition_key(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Optional[str]:\n last_window = self.get_last_partition_window(current_time)\n if last_window is None:\n return None\n\n return last_window.start.strftime(self.fmt)\n\n def end_time_for_partition_key(self, partition_key: str) -> datetime:\n return self.time_window_for_partition_key(partition_key).end\n\n @functools.lru_cache(maxsize=5)\n def get_partition_keys_in_time_window(self, time_window: 
TimeWindow) -> Sequence[str]:\n result: List[str] = []\n for partition_time_window in self._iterate_time_windows(time_window.start):\n if partition_time_window.start < time_window.end:\n result.append(partition_time_window.start.strftime(self.fmt))\n else:\n break\n return result\n\n def get_partition_key_range_for_time_window(self, time_window: TimeWindow) -> PartitionKeyRange:\n start_partition_key = self.get_partition_key_for_timestamp(time_window.start.timestamp())\n end_partition_key = self.get_partition_key_for_timestamp(\n cast(TimeWindow, self.get_prev_partition_window(time_window.end)).start.timestamp()\n )\n\n return PartitionKeyRange(start_partition_key, end_partition_key)\n\n def get_partition_keys_in_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[str]:\n start_time = self.start_time_for_partition_key(partition_key_range.start)\n end_time = self.end_time_for_partition_key(partition_key_range.end)\n\n return self.get_partition_keys_in_time_window(TimeWindow(start_time, end_time))\n\n @public\n @property\n def schedule_type(self) -> Optional[ScheduleType]:\n """Optional[ScheduleType]: An enum representing the partition cadence (hourly, daily,\n weekly, or monthly).\n """\n if re.fullmatch(r"\\d+ \\* \\* \\* \\*", self.cron_schedule):\n return ScheduleType.HOURLY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\*", self.cron_schedule):\n return ScheduleType.DAILY\n elif re.fullmatch(r"\\d+ \\d+ \\* \\* \\d+", self.cron_schedule):\n return ScheduleType.WEEKLY\n elif re.fullmatch(r"\\d+ \\d+ \\d+ \\* \\*", self.cron_schedule):\n return ScheduleType.MONTHLY\n else:\n return None\n\n @public\n @property\n def minute_offset(self) -> int:\n """int: Number of minutes past the hour to "split" partitions. 
Defaults to 0.\n\n For example, returns 15 if each partition starts at 15 minutes past the hour.\n """\n match = re.fullmatch(r"(\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no minute offset")\n return int(match.groups()[0])\n\n @public\n @property\n def hour_offset(self) -> int:\n """int: Number of hours past 00:00 to "split" partitions. Defaults to 0.\n\n For example, returns 1 if each partition starts at 01:00.\n """\n match = re.fullmatch(r"(\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no hour offset")\n return int(match.groups()[1])\n\n @public\n @property\n def day_offset(self) -> int:\n """int: For a weekly or monthly partitions definition, returns the day to "split" partitions\n by. Each partition will start on this day, and end before this day in the following\n week/month. Returns 0 if the day_offset parameter is unset in the\n WeeklyPartitionsDefinition, MonthlyPartitionsDefinition, or the provided cron schedule.\n\n For weekly partitions, returns a value between 0 (representing Sunday) and 6 (representing\n Saturday). 
Providing a value of 1 means that a partition will exist weekly from Monday to\n the following Sunday.\n\n For monthly partitions, returns a value between 0 (the first day of the month) and 31 (the\n last possible day of the month).\n """\n schedule_type = self.schedule_type\n if schedule_type == ScheduleType.WEEKLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+|\\*) (\\d+)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[4])\n elif schedule_type == ScheduleType.MONTHLY:\n match = re.fullmatch(r"(\\d+|\\*) (\\d+|\\*) (\\d+) (\\d+|\\*) (\\d+|\\*)", self.cron_schedule)\n if match is None:\n check.failed(f"{self.cron_schedule} has no day offset")\n return int(match.groups()[2])\n else:\n check.failed(f"Unsupported schedule type for day_offset: {schedule_type}")\n\n
[docs] @public\n def get_cron_schedule(\n self,\n minute_of_hour: Optional[int] = None,\n hour_of_day: Optional[int] = None,\n day_of_week: Optional[int] = None,\n day_of_month: Optional[int] = None,\n ) -> str:\n """The schedule executes at the cadence specified by the partitioning, but may overwrite\n the minute/hour/day offset of the partitioning.\n\n This is useful e.g. if you have partitions that span midnight to midnight but you want to\n schedule a job that runs at 2 am.\n """\n if (\n minute_of_hour is None\n and hour_of_day is None\n and day_of_week is None\n and day_of_month is None\n ):\n return self.cron_schedule\n\n schedule_type = self.schedule_type\n if schedule_type is None:\n check.failed(\n f"{self.cron_schedule} does not support"\n " minute_of_hour/hour_of_day/day_of_week/day_of_month arguments"\n )\n\n minute_of_hour = cast(\n int,\n check.opt_int_param(minute_of_hour, "minute_of_hour", default=self.minute_offset),\n )\n\n if schedule_type == ScheduleType.HOURLY:\n check.invariant(\n hour_of_day is None, "Cannot set hour parameter with hourly partitions."\n )\n else:\n hour_of_day = cast(\n int, check.opt_int_param(hour_of_day, "hour_of_day", default=self.hour_offset)\n )\n\n if schedule_type == ScheduleType.DAILY:\n check.invariant(\n day_of_week is None, "Cannot set day of week parameter with daily partitions."\n )\n check.invariant(\n day_of_month is None, "Cannot set day of month parameter with daily partitions."\n )\n\n if schedule_type == ScheduleType.MONTHLY:\n default = self.day_offset or 1\n day_offset = check.opt_int_param(day_of_month, "day_of_month", default=default)\n elif schedule_type == ScheduleType.WEEKLY:\n default = self.day_offset or 0\n day_offset = check.opt_int_param(day_of_week, "day_of_week", default=default)\n else:\n day_offset = 0\n\n return cron_schedule_from_schedule_type_and_offsets(\n schedule_type,\n minute_offset=minute_of_hour,\n hour_offset=hour_of_day or 0,\n day_offset=day_offset,\n )
\n\n def _iterate_time_windows(self, start: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that start after the given start time."""\n start_timestamp = pendulum.instance(start, tz=self.timezone).timestamp()\n iterator = cron_string_iterator(\n start_timestamp=start_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n prev_time = next(iterator)\n while prev_time.timestamp() < start_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(prev_time, next_time)\n prev_time = next_time\n\n def _reverse_iterate_time_windows(self, end: datetime) -> Iterable[TimeWindow]:\n """Returns an infinite generator of time windows that end before the given end time."""\n end_timestamp = pendulum.instance(end, tz=self.timezone).timestamp()\n iterator = reverse_cron_string_iterator(\n end_timestamp=end_timestamp,\n cron_string=self.cron_schedule,\n execution_timezone=self.timezone,\n )\n\n prev_time = next(iterator)\n while prev_time.timestamp() > end_timestamp:\n prev_time = next(iterator)\n\n while True:\n next_time = next(iterator)\n yield TimeWindow(next_time, prev_time)\n prev_time = next_time\n\n def get_partition_key_for_timestamp(self, timestamp: float, end_closed: bool = False) -> str:\n """Args:\n timestamp (float): Timestamp from the unix epoch, UTC.\n end_closed (bool): Whether the interval is closed at the end or at the beginning.\n """\n iterator = cron_string_iterator(\n timestamp, self.cron_schedule, self.timezone, start_offset=-1\n )\n # prev will be < timestamp\n prev = next(iterator)\n # prev_next will be >= timestamp\n prev_next = next(iterator)\n\n if end_closed or prev_next.timestamp() > timestamp:\n return prev.strftime(self.fmt)\n else:\n return prev_next.strftime(self.fmt)\n\n def less_than(self, partition_key1: str, partition_key2: str) -> bool:\n """Returns true if the partition_key1 is earlier than partition_key2."""\n return 
self.start_time_for_partition_key(\n partition_key1\n ) < self.start_time_for_partition_key(partition_key2)\n\n @property\n def partitions_subset_class(self) -> Type["PartitionsSubset"]:\n return TimeWindowPartitionsSubset\n\n def empty_subset(self) -> "PartitionsSubset":\n return self.partitions_subset_class.empty_subset(self)\n\n def is_valid_partition_key(self, partition_key: str) -> bool:\n try:\n partition_time = pendulum.instance(\n datetime.strptime(partition_key, self.fmt), tz=self.timezone\n )\n return partition_time >= self.start\n except ValueError:\n return False\n\n def get_serializable_unique_identifier(\n self, dynamic_partitions_store: Optional[DynamicPartitionsStore] = None\n ) -> str:\n return hashlib.sha1(self.__repr__().encode("utf-8")).hexdigest()\n\n def has_partition_key(\n self,\n partition_key: str,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> bool:\n return bool(self._get_validated_time_window_for_partition_key(partition_key, current_time))
\n\n\n
[docs]class DailyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of daily partitions.\n\n The first partition in the set will start at the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset and/or hour_offset are used, the start and end times of\n each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n DailyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n DailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(DailyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.DAILY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\ndef wrap_time_window_run_config_fn(\n run_config_fn: Optional[Callable[[datetime, datetime], Mapping[str, Any]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, Any]]:\n def _run_config_wrapper(key: str) -> Mapping[str, Any]:\n if not run_config_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return run_config_fn(time_window.start, time_window.end)\n\n return _run_config_wrapper\n\n\ndef wrap_time_window_tags_fn(\n tags_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]],\n partitions_def: TimeWindowPartitionsDefinition,\n) -> Callable[[str], Mapping[str, str]]:\n def _tag_wrapper(key: str) -> Mapping[str, str]:\n if not tags_fn:\n return {}\n time_window = partitions_def.time_window_for_partition_key(key)\n return tags_fn(time_window.start, time_window.end)\n\n return _tag_wrapper\n\n\n
[docs]def daily_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[DailyPartitionsDefinition],\n]:\n """Defines run config over a set of daily partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the bounds\n of the date partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset and/or hour_offset are used, the start and end times of each partition\n will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @daily_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n @daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n # creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[DailyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = DailyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class HourlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of hourly partitions.\n\n The first partition in the set will start on the start_date at midnight. The last partition\n in the set will end before the current time, unless the end_offset argument is set to a\n positive number. If minute_offset is provided, the start and end times of each partition\n will be minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n HourlyPartitionsDefinition(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE\n\n return super(HourlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.HOURLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def hourly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[HourlyPartitionsDefinition],\n]:\n """Defines run config over a set of hourly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date at midnight. The last partition in\n the set will end before the current time, unless the end_offset argument is set to a positive\n number. If minute_offset is provided, the start and end times of each partition will be\n minute_offset past the hour.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions. Can\n provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. 
If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12))\n # creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n @hourly_partitioned_config(start_date=datetime(2022, 03, 12), minute_offset=15)\n # creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[HourlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = HourlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class MonthlyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """A set of monthly partitions.\n\n The first partition in the set will start at the soonest first of the month after start_date\n at midnight. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and\n end date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\n the start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n MonthlyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n MonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(MonthlyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.MONTHLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def monthly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 1,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[MonthlyPartitionsDefinition],\n]:\n """Defines run config over a set of monthly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at midnight on the soonest first of the month after\n start_date. The last partition in the set will end before the current time, unless the\n end_offset argument is set to a positive number. If day_offset is provided, the start and end\n date of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\n start and end times of each partition will be hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will be\n midnight the sonnest first of the month following start_date. Can provide in either a\n datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the month to "split" the partition. Defaults to 1.\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @monthly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n @monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n # creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[MonthlyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = MonthlyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\n
[docs]class WeeklyPartitionsDefinition(TimeWindowPartitionsDefinition):\n """Defines a set of weekly partitions.\n\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n end_date (Union[datetime.datetime, str, None]): The last date(excluding) in the set of partitions.\n Default is None. Can provide in either a datetime or string format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n\n .. 
code-block:: python\n\n WeeklyPartitionsDefinition(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n WeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def __new__(\n cls,\n start_date: Union[datetime, str],\n end_date: Union[datetime, str, None] = None,\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n ):\n _fmt = fmt or DEFAULT_DATE_FORMAT\n\n return super(WeeklyPartitionsDefinition, cls).__new__(\n cls,\n schedule_type=ScheduleType.WEEKLY,\n start=start_date,\n end=end_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=_fmt,\n end_offset=end_offset,\n )
\n\n\n
[docs]def weekly_partitioned_config(\n start_date: Union[datetime, str],\n minute_offset: int = 0,\n hour_offset: int = 0,\n day_offset: int = 0,\n timezone: Optional[str] = None,\n fmt: Optional[str] = None,\n end_offset: int = 0,\n tags_for_partition_fn: Optional[Callable[[datetime, datetime], Mapping[str, str]]] = None,\n) -> Callable[\n [Callable[[datetime, datetime], Mapping[str, Any]]],\n PartitionedConfig[WeeklyPartitionsDefinition],\n]:\n """Defines run config over a set of weekly partitions.\n\n The decorated function should accept a start datetime and end datetime, which represent the date\n partition the config should delineate.\n\n The decorated function should return a run config dictionary.\n\n The resulting object created by this decorator can be provided to the config argument of a Job.\n The first partition in the set will start at the start_date. The last partition in the set will\n end before the current time, unless the end_offset argument is set to a positive number. If\n day_offset is provided, the start and end date of each partition will be day of the week\n corresponding to day_offset (0 indexed with Sunday as the start of the week). If\n minute_offset and/or hour_offset are used, the start and end times of each partition will be\n hour_offset:minute_offset of each day.\n\n Args:\n start_date (Union[datetime.datetime, str]): The first date in the set of partitions will\n Sunday at midnight following start_date. Can provide in either a datetime or string\n format.\n minute_offset (int): Number of minutes past the hour to "split" the partition. Defaults\n to 0.\n hour_offset (int): Number of hours past 00:00 to "split" the partition. Defaults to 0.\n day_offset (int): Day of the week to "split" the partition. Defaults to 0 (Sunday).\n timezone (Optional[str]): The timezone in which each date should exist.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. 
"America/Los_Angeles".\n fmt (Optional[str]): The date format to use. Defaults to `%Y-%m-%d`.\n end_offset (int): Extends the partition set by a number of partitions equal to the value\n passed. If end_offset is 0 (the default), the last partition ends before the current\n time. If end_offset is 1, the second-to-last partition ends before the current time,\n and so on.\n tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]): A function that\n accepts a partition time window and returns a dictionary of tags to attach to runs for\n that partition.\n\n .. code-block:: python\n\n @weekly_partitioned_config(start_date="2022-03-12")\n # creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n @weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n # creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n """\n\n def inner(\n fn: Callable[[datetime, datetime], Mapping[str, Any]]\n ) -> PartitionedConfig[WeeklyPartitionsDefinition]:\n check.callable_param(fn, "fn")\n\n partitions_def = WeeklyPartitionsDefinition(\n start_date=start_date,\n minute_offset=minute_offset,\n hour_offset=hour_offset,\n day_offset=day_offset,\n timezone=timezone,\n fmt=fmt,\n end_offset=end_offset,\n )\n return PartitionedConfig(\n run_config_for_partition_key_fn=wrap_time_window_run_config_fn(fn, partitions_def),\n partitions_def=partitions_def,\n decorated_fn=fn,\n tags_for_partition_key_fn=wrap_time_window_tags_fn(\n tags_for_partition_fn, partitions_def\n ),\n )\n\n return inner
\n\n\nclass TimeWindowPartitionsSubset(PartitionsSubset):\n # Every time we change the serialization format, we should increment the version number.\n # This will ensure that we can gracefully degrade when deserializing old data.\n SERIALIZATION_VERSION = 1\n\n def __init__(\n self,\n partitions_def: TimeWindowPartitionsDefinition,\n num_partitions: int,\n included_time_windows: Optional[Sequence[TimeWindow]] = None,\n included_partition_keys: Optional[AbstractSet[str]] = None,\n ):\n self._partitions_def = check.inst_param(\n partitions_def, "partitions_def", TimeWindowPartitionsDefinition\n )\n self._included_time_windows = included_time_windows\n self._num_partitions = num_partitions\n\n check.param_invariant(\n not (included_partition_keys and included_time_windows),\n "Cannot specify both included_partition_keys and included_time_windows",\n )\n self._included_time_windows = check.opt_nullable_sequence_param(\n included_time_windows, "included_time_windows", of_type=TimeWindow\n )\n\n self._included_partition_keys = check.opt_nullable_set_param(\n included_partition_keys, "included_partition_keys", of_type=str\n )\n\n @property\n def included_time_windows(self) -> Sequence[TimeWindow]:\n if self._included_time_windows is None:\n result_time_windows, _ = self._add_partitions_to_time_windows(\n initial_windows=[],\n partition_keys=list(check.not_none(self._included_partition_keys)),\n validate=False,\n )\n self._included_time_windows = result_time_windows\n return self._included_time_windows\n\n def _get_partition_time_windows_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n ) -> Sequence[TimeWindow]:\n """Returns a list of partition time windows that are not in the subset.\n Each time window is a single partition.\n """\n first_tw = self._partitions_def.get_first_partition_window(current_time=current_time)\n last_tw = self._partitions_def.get_last_partition_window(current_time=current_time)\n\n if not first_tw or not last_tw:\n 
check.failed("No partitions found")\n\n if len(self.included_time_windows) == 0:\n return [TimeWindow(first_tw.start, last_tw.end)]\n\n time_windows = []\n if first_tw.start < self.included_time_windows[0].start:\n time_windows.append(TimeWindow(first_tw.start, self.included_time_windows[0].start))\n\n for i in range(len(self.included_time_windows) - 1):\n if self.included_time_windows[i].start >= last_tw.end:\n break\n if self.included_time_windows[i].end < last_tw.end:\n if self.included_time_windows[i + 1].start <= last_tw.end:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n self.included_time_windows[i + 1].start,\n )\n )\n else:\n time_windows.append(\n TimeWindow(\n self.included_time_windows[i].end,\n last_tw.end,\n )\n )\n\n if last_tw.end > self.included_time_windows[-1].end:\n time_windows.append(TimeWindow(self.included_time_windows[-1].end, last_tw.end))\n\n return time_windows\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n partition_keys: List[str] = []\n for tw in self._get_partition_time_windows_not_in_subset(current_time):\n partition_keys.extend(self._partitions_def.get_partition_keys_in_time_window(tw))\n return partition_keys\n\n @public\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._included_partition_keys is None:\n return [\n pk\n for time_window in self.included_time_windows\n for pk in self._partitions_def.get_partition_keys_in_time_window(time_window)\n ]\n return list(self._included_partition_keys) if self._included_partition_keys else []\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [\n self._partitions_def.get_partition_key_range_for_time_window(window)\n for window in 
self.included_time_windows\n ]\n\n def _add_partitions_to_time_windows(\n self,\n initial_windows: Sequence[TimeWindow],\n partition_keys: Sequence[str],\n validate: bool = True,\n ) -> Tuple[Sequence[TimeWindow], int]:\n """Merges a set of partition keys into an existing set of time windows, returning the\n minimized set of time windows and the number of partitions added.\n """\n result_windows = [*initial_windows]\n time_windows = self._partitions_def.time_windows_for_partition_keys(\n frozenset(partition_keys), validate=validate\n )\n num_added_partitions = 0\n for window in sorted(time_windows):\n # go in reverse order because it's more common to add partitions at the end than the\n # beginning\n for i in reversed(range(len(result_windows))):\n included_window = result_windows[i]\n lt_end_of_range = window.start < included_window.end\n gte_start_of_range = window.start >= included_window.start\n\n if lt_end_of_range and gte_start_of_range:\n break\n\n if not lt_end_of_range:\n merge_with_range = included_window.end == window.start\n merge_with_later_range = i + 1 < len(result_windows) and (\n window.end == result_windows[i + 1].start\n )\n\n if merge_with_range and merge_with_later_range:\n result_windows[i] = TimeWindow(\n included_window.start, result_windows[i + 1].end\n )\n del result_windows[i + 1]\n elif merge_with_range:\n result_windows[i] = TimeWindow(included_window.start, window.end)\n elif merge_with_later_range:\n result_windows[i + 1] = TimeWindow(window.start, result_windows[i + 1].end)\n else:\n result_windows.insert(i + 1, window)\n\n num_added_partitions += 1\n break\n else:\n if result_windows and window.start == result_windows[0].start:\n result_windows[0] = TimeWindow(window.start, included_window.end) # type: ignore\n else:\n result_windows.insert(0, window)\n\n num_added_partitions += 1\n\n return result_windows, num_added_partitions\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "TimeWindowPartitionsSubset":\n # if 
we are representing things as a static set of keys, continue doing so\n if self._included_partition_keys is not None:\n new_partitions = {*self._included_partition_keys, *partition_keys}\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=len(new_partitions),\n included_partition_keys=new_partitions,\n )\n\n result_windows, added_partitions = self._add_partitions_to_time_windows(\n self.included_time_windows, list(partition_keys)\n )\n\n return TimeWindowPartitionsSubset(\n self._partitions_def,\n num_partitions=self._num_partitions + added_partitions,\n included_time_windows=result_windows,\n )\n\n @classmethod\n def from_serialized(\n cls, partitions_def: PartitionsDefinition, serialized: str\n ) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n\n loaded = json.loads(serialized)\n\n def tuples_to_time_windows(tuples):\n return [\n TimeWindow(\n pendulum.from_timestamp(tup[0], tz=partitions_def.timezone),\n pendulum.from_timestamp(tup[1], tz=partitions_def.timezone),\n )\n for tup in tuples\n ]\n\n if isinstance(loaded, list):\n # backwards compatibility\n time_windows = tuples_to_time_windows(loaded)\n num_partitions = sum(\n len(partitions_def.get_partition_keys_in_time_window(time_window))\n for time_window in time_windows\n )\n elif isinstance(loaded, dict) and (\n "version" not in loaded or loaded["version"] == cls.SERIALIZATION_VERSION\n ): # version 1\n time_windows = tuples_to_time_windows(loaded["time_windows"])\n num_partitions = loaded["num_partitions"]\n else:\n raise DagsterInvalidDeserializationVersionError(\n f"Attempted to deserialize partition subset with version {loaded.get('version')},"\n f" but only version {cls.SERIALIZATION_VERSION} is supported."\n )\n\n return TimeWindowPartitionsSubset(\n partitions_def, 
num_partitions=num_partitions, included_time_windows=time_windows\n )\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: PartitionsDefinition,\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n if serialized_partitions_def_unique_id:\n return (\n partitions_def.get_serializable_unique_identifier()\n == serialized_partitions_def_unique_id\n )\n\n if (\n serialized_partitions_def_class_name\n # note: all TimeWindowPartitionsDefinition subclasses will get serialized as raw\n # TimeWindowPartitionsDefinitions, so this class name check will not always pass,\n # hence the unique id check above\n and serialized_partitions_def_class_name != partitions_def.__class__.__name__\n ):\n return False\n\n data = json.loads(serialized)\n return isinstance(data, list) or (\n isinstance(data, dict)\n and data.get("time_windows") is not None\n and data.get("num_partitions") is not None\n )\n\n @classmethod\n def empty_subset(cls, partitions_def: PartitionsDefinition) -> "PartitionsSubset":\n if not isinstance(partitions_def, TimeWindowPartitionsDefinition):\n check.failed("Partitions definition must be a TimeWindowPartitionsDefinition")\n partitions_def = cast(TimeWindowPartitionsDefinition, partitions_def)\n return cls(partitions_def, 0, [], set())\n\n def serialize(self) -> str:\n return json.dumps(\n {\n "version": self.SERIALIZATION_VERSION,\n "time_windows": [\n (window.start.timestamp(), window.end.timestamp())\n for window in self.included_time_windows\n ],\n "num_partitions": self._num_partitions,\n }\n )\n\n @property\n def partitions_def(self) -> PartitionsDefinition:\n return self._partitions_def\n\n def __eq__(self, other):\n return (\n isinstance(other, TimeWindowPartitionsSubset)\n and self._partitions_def == other._partitions_def\n and (\n # faster comparison, but will not catch all cases\n (\n self._included_time_windows == other._included_time_windows\n and 
self._included_partition_keys == other._included_partition_keys\n )\n # slower comparison, catches all cases\n or self.included_time_windows == other.included_time_windows\n )\n )\n\n def __len__(self) -> int:\n return self._num_partitions\n\n def __contains__(self, partition_key: str) -> bool:\n if self._included_partition_keys is not None:\n return partition_key in self._included_partition_keys\n\n time_window = self._partitions_def.time_window_for_partition_key(partition_key)\n\n return any(\n time_window.start >= included_time_window.start\n and time_window.start < included_time_window.end\n for included_time_window in self.included_time_windows\n )\n\n def __repr__(self) -> str:\n return f"TimeWindowPartitionsSubset({self.get_partition_key_ranges()})"\n\n\nclass PartitionRangeStatus(Enum):\n MATERIALIZING = "MATERIALIZING"\n MATERIALIZED = "MATERIALIZED"\n FAILED = "FAILED"\n\n\nPARTITION_RANGE_STATUS_PRIORITY = [\n PartitionRangeStatus.MATERIALIZING,\n PartitionRangeStatus.FAILED,\n PartitionRangeStatus.MATERIALIZED,\n]\n\n\nclass PartitionTimeWindowStatus:\n def __init__(self, time_window: TimeWindow, status: PartitionRangeStatus):\n self.time_window = time_window\n self.status = status\n\n def __repr__(self):\n return f"({self.time_window.start} - {self.time_window.end}): {self.status.value}"\n\n def __eq__(self, other):\n return (\n isinstance(other, PartitionTimeWindowStatus)\n and self.time_window == other.time_window\n and self.status == other.status\n )\n\n\ndef _flatten(\n high_pri_time_windows: List[PartitionTimeWindowStatus],\n low_pri_time_windows: List[PartitionTimeWindowStatus],\n) -> List[PartitionTimeWindowStatus]:\n high_pri_time_windows = sorted(high_pri_time_windows, key=lambda t: t.time_window.start)\n low_pri_time_windows = sorted(low_pri_time_windows, key=lambda t: t.time_window.start)\n\n high_pri_idx = 0\n low_pri_idx = 0\n\n filtered_low_pri: List[PartitionTimeWindowStatus] = []\n\n # slice and dice the low pri time windows so there's 
no overlap with high pri\n while True:\n if low_pri_idx >= len(low_pri_time_windows):\n # reached end of materialized\n break\n if high_pri_idx >= len(high_pri_time_windows):\n # reached end of failed, add all remaining materialized bc there's no overlap\n filtered_low_pri.extend(low_pri_time_windows[low_pri_idx:])\n break\n\n low_pri_tw = low_pri_time_windows[low_pri_idx]\n high_pri_tw = high_pri_time_windows[high_pri_idx]\n\n if low_pri_tw.time_window.start < high_pri_tw.time_window.start:\n if low_pri_tw.time_window.end <= high_pri_tw.time_window.start:\n # low_pri_tw is entirely before high pri\n filtered_low_pri.append(low_pri_tw)\n low_pri_idx += 1\n else:\n # high pri cuts the low pri short\n filtered_low_pri.append(\n PartitionTimeWindowStatus(\n TimeWindow(\n low_pri_tw.time_window.start,\n high_pri_tw.time_window.start,\n ),\n low_pri_tw.status,\n )\n )\n\n if low_pri_tw.time_window.end > high_pri_tw.time_window.end:\n # the low pri time window will continue on the other end of the high pri\n # and get split in two. Modify low_pri[low_pri_idx] to be\n # the second half of the low pri time window. It will be added in the next iteration.\n # (don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n else:\n # the rest of the low pri time window is inside the high pri time window\n low_pri_idx += 1\n else:\n if low_pri_tw.time_window.start >= high_pri_tw.time_window.end:\n # high pri is entirely before low pri. The next high pri may overlap\n high_pri_idx += 1\n elif low_pri_tw.time_window.end <= high_pri_tw.time_window.end:\n # low pri is entirely within high pri, skip it\n low_pri_idx += 1\n else:\n # high pri cuts out the start of the low pri. It will continue on the other end.\n # Modify low_pri[low_pri_idx] to shorten the start. 
It will be added\n # in the next iteration. (don't add it now, because we need to check if it overlaps with the next high pri)\n low_pri_time_windows[low_pri_idx] = PartitionTimeWindowStatus(\n TimeWindow(high_pri_tw.time_window.end, low_pri_tw.time_window.end),\n low_pri_tw.status,\n )\n high_pri_idx += 1\n\n # combine the high pri windwos with the filtered low pri windows\n flattened_time_windows = high_pri_time_windows\n flattened_time_windows.extend(filtered_low_pri)\n flattened_time_windows.sort(key=lambda t: t.time_window.start)\n return flattened_time_windows\n\n\ndef fetch_flattened_time_window_ranges(\n subsets: Mapping[PartitionRangeStatus, TimeWindowPartitionsSubset]\n) -> Sequence[PartitionTimeWindowStatus]:\n """Given potentially overlapping subsets, return a flattened list of timewindows where the highest priority status wins\n on overlaps.\n """\n prioritized_subsets = sorted(\n [(status, subset) for status, subset in subsets.items()],\n key=lambda t: PARTITION_RANGE_STATUS_PRIORITY.index(t[0]),\n )\n\n # progressively add lower priority time windows to the list of higher priority time windows\n flattened_time_window_statuses = []\n for status, subset in prioritized_subsets:\n subset_time_window_statuses = [\n PartitionTimeWindowStatus(tw, status) for tw in subset.included_time_windows\n ]\n flattened_time_window_statuses = _flatten(\n flattened_time_window_statuses, subset_time_window_statuses\n )\n\n return flattened_time_window_statuses\n\n\ndef has_one_dimension_time_window_partitioning(\n partitions_def: Optional[PartitionsDefinition],\n) -> bool:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return True\n elif isinstance(partitions_def, MultiPartitionsDefinition):\n time_window_dims = [\n dim\n for dim in partitions_def.partitions_defs\n if isinstance(dim.partitions_def, TimeWindowPartitionsDefinition)\n ]\n if len(time_window_dims) == 1:\n return 
True\n\n return False\n\n\ndef get_time_partitions_def(\n partitions_def: Optional[PartitionsDefinition],\n) -> Optional[TimeWindowPartitionsDefinition]:\n """For a given PartitionsDefinition, return the associated TimeWindowPartitionsDefinition if it\n exists.\n """\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None:\n return None\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partitions_def\n elif isinstance(\n partitions_def, MultiPartitionsDefinition\n ) and has_one_dimension_time_window_partitioning(partitions_def):\n return cast(\n TimeWindowPartitionsDefinition, partitions_def.time_window_dimension.partitions_def\n )\n else:\n return None\n\n\ndef get_time_partition_key(\n partitions_def: Optional[PartitionsDefinition], partition_key: Optional[str]\n) -> str:\n from .multi_dimensional_partitions import MultiPartitionsDefinition\n\n if partitions_def is None or partition_key is None:\n check.failed(\n "Cannot get time partitions key from when partitions def is None or partition key is"\n " None"\n )\n elif isinstance(partitions_def, TimeWindowPartitionsDefinition):\n return partition_key\n elif isinstance(partitions_def, MultiPartitionsDefinition):\n return partitions_def.get_partition_key_from_str(partition_key).keys_by_dimension[\n partitions_def.time_window_dimension.name\n ]\n else:\n check.failed(f"Cannot get time partition from non-time partitions def {partitions_def}")\n
", "current_page_name": "_modules/dagster/_core/definitions/time_window_partitions", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.time_window_partitions"}, "unresolved_asset_job_definition": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.unresolved_asset_job_definition

\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, NamedTuple, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions import AssetKey\nfrom dagster._core.definitions.run_request import RunRequest\nfrom dagster._core.errors import DagsterInvalidDefinitionError\nfrom dagster._core.instance import DynamicPartitionsStore\n\nfrom .asset_layer import build_asset_selection_job\nfrom .config import ConfigMapping\nfrom .metadata import RawMetadataValue\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import (\n        AssetSelection,\n        ExecutorDefinition,\n        HookDefinition,\n        JobDefinition,\n        PartitionedConfig,\n        PartitionsDefinition,\n        ResourceDefinition,\n    )\n    from dagster._core.definitions.asset_graph import InternalAssetGraph\n    from dagster._core.definitions.asset_selection import CoercibleToAssetSelection\n    from dagster._core.definitions.run_config import RunConfig\n\n\nclass UnresolvedAssetJobDefinition(\n    NamedTuple(\n        "_UnresolvedAssetJobDefinition",\n        [\n            ("name", str),\n            ("selection", "AssetSelection"),\n            (\n                "config",\n                Optional[Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig"]],\n            ),\n            ("description", Optional[str]),\n            ("tags", Optional[Mapping[str, Any]]),\n            ("metadata", Optional[Mapping[str, RawMetadataValue]]),\n            ("partitions_def", Optional["PartitionsDefinition"]),\n            ("executor_def", Optional["ExecutorDefinition"]),\n            ("hooks", Optional[AbstractSet["HookDefinition"]]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        selection: "AssetSelection",\n        config: Optional[\n            Union[ConfigMapping, Mapping[str, Any], 
"PartitionedConfig", "RunConfig"]\n        ] = None,\n        description: Optional[str] = None,\n        tags: Optional[Mapping[str, Any]] = None,\n        metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n        partitions_def: Optional["PartitionsDefinition"] = None,\n        executor_def: Optional["ExecutorDefinition"] = None,\n        hooks: Optional[AbstractSet["HookDefinition"]] = None,\n    ):\n        from dagster._core.definitions import (\n            AssetSelection,\n            ExecutorDefinition,\n            HookDefinition,\n            PartitionsDefinition,\n        )\n        from dagster._core.definitions.run_config import convert_config_input\n\n        return super(UnresolvedAssetJobDefinition, cls).__new__(\n            cls,\n            name=check.str_param(name, "name"),\n            selection=check.inst_param(selection, "selection", AssetSelection),\n            config=convert_config_input(config),\n            description=check.opt_str_param(description, "description"),\n            tags=check.opt_mapping_param(tags, "tags"),\n            metadata=check.opt_mapping_param(metadata, "metadata"),\n            partitions_def=check.opt_inst_param(\n                partitions_def, "partitions_def", PartitionsDefinition\n            ),\n            executor_def=check.opt_inst_param(executor_def, "partitions_def", ExecutorDefinition),\n            hooks=check.opt_nullable_set_param(hooks, "hooks", of_type=HookDefinition),\n        )\n\n    @deprecated(\n        breaking_version="2.0.0",\n        additional_warn_text="Directly instantiate `RunRequest(partition_key=...)` instead.",\n    )\n    def run_request_for_partition(\n        self,\n        partition_key: str,\n        run_key: Optional[str] = None,\n        tags: Optional[Mapping[str, str]] = None,\n        asset_selection: Optional[Sequence[AssetKey]] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n        current_time: Optional[datetime] = None,\n        
dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n    ) -> RunRequest:\n        """Creates a RunRequest object for a run that processes the given partition.\n\n        Args:\n            partition_key: The key of the partition to request a run for.\n            run_key (Optional[str]): A string key to identify this launched run. For sensors, ensures that\n                only one run is created per run key across all sensor evaluations.  For schedules,\n                ensures that one run is created per tick, across failure recoveries. Passing in a `None`\n                value means that a run will always be launched per evaluation.\n            tags (Optional[Dict[str, str]]): A dictionary of tags (string key-value pairs) to attach\n                to the launched run.\n            run_config (Optional[Mapping[str, Any]]: Configuration for the run. If the job has\n                a :py:class:`PartitionedConfig`, this value will override replace the config\n                provided by it.\n            current_time (Optional[datetime]): Used to determine which time-partitions exist.\n                Defaults to now.\n            dynamic_partitions_store (Optional[DynamicPartitionsStore]): The DynamicPartitionsStore\n                object that is responsible for fetching dynamic partitions. Required when the\n                partitions definition is a DynamicPartitionsDefinition with a name defined. 
Users\n                can pass the DagsterInstance fetched via `context.instance` to this argument.\n\n        Returns:\n            RunRequest: an object that requests a run to process the given partition.\n        """\n        from dagster._core.definitions.partition import (\n            DynamicPartitionsDefinition,\n            PartitionedConfig,\n        )\n\n        if not self.partitions_def:\n            check.failed("Called run_request_for_partition on a non-partitioned job")\n\n        partitioned_config = PartitionedConfig.from_flexible_config(\n            self.config, self.partitions_def\n        )\n\n        if (\n            isinstance(self.partitions_def, DynamicPartitionsDefinition)\n            and self.partitions_def.name\n        ):\n            # Do not support using run_request_for_partition with dynamic partitions,\n            # since this requires querying the instance once per run request for the\n            # existent dynamic partitions\n            check.failed(\n                "run_request_for_partition is not supported for dynamic partitions. 
Instead, use"\n                " RunRequest(partition_key=...)"\n            )\n\n        self.partitions_def.validate_partition_key(\n            partition_key,\n            current_time=current_time,\n            dynamic_partitions_store=dynamic_partitions_store,\n        )\n\n        run_config = (\n            run_config\n            if run_config is not None\n            else partitioned_config.get_run_config_for_partition_key(partition_key)\n        )\n        run_request_tags = {\n            **(tags or {}),\n            **partitioned_config.get_tags_for_partition_key(partition_key),\n        }\n\n        return RunRequest(\n            job_name=self.name,\n            run_key=run_key,\n            run_config=run_config,\n            tags=run_request_tags,\n            asset_selection=asset_selection,\n            partition_key=partition_key,\n        )\n\n    def resolve(\n        self,\n        asset_graph: "InternalAssetGraph",\n        default_executor_def: Optional["ExecutorDefinition"] = None,\n        resource_defs: Optional[Mapping[str, "ResourceDefinition"]] = None,\n    ) -> "JobDefinition":\n        """Resolve this UnresolvedAssetJobDefinition into a JobDefinition."""\n        assets = asset_graph.assets\n        source_assets = asset_graph.source_assets\n        selected_asset_keys = self.selection.resolve(asset_graph)\n        selected_asset_checks = self.selection.resolve_checks(asset_graph)\n\n        asset_keys_by_partitions_def = defaultdict(set)\n        for asset_key in selected_asset_keys:\n            partitions_def = asset_graph.get_partitions_def(asset_key)\n            if partitions_def is not None:\n                asset_keys_by_partitions_def[partitions_def].add(asset_key)\n\n        if len(asset_keys_by_partitions_def) > 1:\n            keys_by_partitions_def_str = "\\n".join(\n                f"{partitions_def}: {asset_keys}"\n                for partitions_def, asset_keys in asset_keys_by_partitions_def.items()\n            )\n   
         raise DagsterInvalidDefinitionError(\n                f"Multiple partitioned assets exist in assets job '{self.name}'. Selected assets"\n                " must have the same partitions definitions, but the selected assets have"\n                f" different partitions definitions: \\n{keys_by_partitions_def_str}"\n            )\n\n        inferred_partitions_def = (\n            next(iter(asset_keys_by_partitions_def.keys()))\n            if asset_keys_by_partitions_def\n            else None\n        )\n        if (\n            inferred_partitions_def\n            and self.partitions_def != inferred_partitions_def\n            and self.partitions_def is not None\n        ):\n            raise DagsterInvalidDefinitionError(\n                f"Job '{self.name}' received a partitions_def of {self.partitions_def}, but the"\n                f" selected assets {next(iter(asset_keys_by_partitions_def.values()))} have a"\n                f" non-matching partitions_def of {inferred_partitions_def}"\n            )\n\n        return build_asset_selection_job(\n            name=self.name,\n            assets=assets,\n            asset_checks=asset_graph.asset_checks,\n            config=self.config,\n            source_assets=source_assets,\n            description=self.description,\n            tags=self.tags,\n            metadata=self.metadata,\n            asset_selection=selected_asset_keys,\n            asset_check_selection=selected_asset_checks,\n            partitions_def=self.partitions_def if self.partitions_def else inferred_partitions_def,\n            executor_def=self.executor_def or default_executor_def,\n            hooks=self.hooks,\n            resource_defs=resource_defs,\n        )\n\n\n
[docs]def define_asset_job(\n name: str,\n selection: Optional["CoercibleToAssetSelection"] = None,\n config: Optional[\n Union[ConfigMapping, Mapping[str, Any], "PartitionedConfig", "RunConfig"]\n ] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n partitions_def: Optional["PartitionsDefinition"] = None,\n executor_def: Optional["ExecutorDefinition"] = None,\n hooks: Optional[AbstractSet["HookDefinition"]] = None,\n) -> UnresolvedAssetJobDefinition:\n """Creates a definition of a job which will either materialize a selection of assets or observe\n a selection of source assets. This will only be resolved to a JobDefinition once placed in a\n code location.\n\n Args:\n name (str):\n The name for the job.\n selection (Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]):\n The assets that will be materialized or observed when the job is run.\n\n The selected assets must all be included in the assets that are passed to the assets\n argument of the Definitions object that this job is included on.\n\n The string "my_asset*" selects my_asset and all downstream assets within the code\n location. A list of strings represents the union of all assets selected by strings\n within the list.\n\n The selection will be resolved to a set of assets when the location is loaded. If the\n selection resolves to all source assets, the created job will perform source asset\n observations. If the selection resolves to all regular assets, the created job will\n materialize assets. 
If the selection resolves to a mixed set of source assets and\n regular assets, an error will be thrown.\n\n config:\n Describes how the Job is parameterized at runtime.\n\n If no value is provided, then the schema for the job's run config is a standard\n format based on its ops and resources.\n\n If a dictionary is provided, then it must conform to the standard config schema, and\n it will be used as the job's run config for the job whenever the job is executed.\n The values provided will be viewable and editable in the Dagster UI, so be\n careful with secrets.\n\n If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is\n determined by the config mapping, and the ConfigMapping, which should return\n configuration in the standard format to configure the job.\n tags (Optional[Mapping[str, Any]]):\n Arbitrary information that will be attached to the execution of the Job.\n Values that are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`. These tag values may be overwritten by tag\n values provided at invocation time.\n metadata (Optional[Mapping[str, RawMetadataValue]]): Arbitrary metadata about the job.\n Keys are displayed string labels, and values are one of the following: string, float,\n int, JSON-serializable dict, JSON-serializable list, and one of the data classes\n returned by a MetadataValue static method.\n description (Optional[str]):\n A description for the Job.\n partitions_def (Optional[PartitionsDefinition]):\n Defines the set of partitions for this job. All AssetDefinitions selected for this job\n must have a matching PartitionsDefinition. If no PartitionsDefinition is provided, the\n PartitionsDefinition will be inferred from the selected AssetDefinitions.\n executor_def (Optional[ExecutorDefinition]):\n How this Job will be executed. 
Defaults to :py:class:`multi_or_in_process_executor`,\n which can be switched between multi-process and in-process modes of execution. The\n default mode of execution is multi-process.\n\n\n Returns:\n UnresolvedAssetJobDefinition: The job, which can be placed inside a code location.\n\n Examples:\n .. code-block:: python\n\n # A job that targets all assets in the code location:\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n )\n\n # A job that targets a single asset\n @asset\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets", selection=[asset1])],\n )\n\n # A job that targets all the assets in a group:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("marketing_job", selection=AssetSelection.groups("marketing"))],\n )\n\n @observable_source_asset\n def source_asset():\n ...\n\n # A job that observes a source asset:\n defs = Definitions(\n assets=assets,\n jobs=[define_asset_job("observation_job", selection=[source_asset])],\n )\n\n # Resources are supplied to the assets, not the job:\n @asset(required_resource_keys={"slack_client"})\n def asset1():\n ...\n\n defs = Definitions(\n assets=[asset1],\n jobs=[define_asset_job("all_assets")],\n resources={"slack_client": prod_slack_client},\n )\n\n """\n from dagster._core.definitions import AssetSelection\n\n # convert string-based selections to AssetSelection objects\n if selection is None:\n resolved_selection = AssetSelection.all()\n else:\n resolved_selection = AssetSelection.from_coercible(selection)\n\n return UnresolvedAssetJobDefinition(\n name=name,\n selection=resolved_selection,\n config=config,\n description=description,\n tags=tags,\n metadata=metadata,\n partitions_def=partitions_def,\n executor_def=executor_def,\n hooks=hooks,\n )
\n
", "current_page_name": "_modules/dagster/_core/definitions/unresolved_asset_job_definition", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.unresolved_asset_job_definition"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.utils

\nimport keyword\nimport os\nimport re\nfrom glob import glob\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, cast\n\nimport yaml\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._core.storage.tags import check_reserved_tags\nfrom dagster._utils.yaml_utils import merge_yaml_strings, merge_yamls\n\nDEFAULT_OUTPUT = "result"\nDEFAULT_GROUP_NAME = "default"  # asset group_name used when none is provided\nDEFAULT_IO_MANAGER_KEY = "io_manager"\n\nDISALLOWED_NAMES = set(\n    [\n        "context",\n        "conf",\n        "config",\n        "meta",\n        "arg_dict",\n        "dict",\n        "input_arg_dict",\n        "output_arg_dict",\n        "int",\n        "str",\n        "float",\n        "bool",\n        "input",\n        "output",\n        "type",\n    ]\n    + list(keyword.kwlist)  # just disallow all python keywords\n)\n\nVALID_NAME_REGEX_STR = r"^[A-Za-z0-9_]+$"\nVALID_NAME_REGEX = re.compile(VALID_NAME_REGEX_STR)\n\n\nclass NoValueSentinel:\n    """Sentinel value to distinguish unset from None."""\n\n\ndef has_valid_name_chars(name: str) -> bool:\n    return bool(VALID_NAME_REGEX.match(name))\n\n\ndef check_valid_name(name: str, allow_list: Optional[List[str]] = None) -> str:\n    check.str_param(name, "name")\n\n    if allow_list and name in allow_list:\n        return name\n\n    if name in DISALLOWED_NAMES:\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. It conflicts with a Dagster or python'\n            " reserved keyword."\n        )\n\n    check_valid_chars(name)\n\n    check.invariant(is_valid_name(name))\n    return name\n\n\ndef check_valid_chars(name: str):\n    if not has_valid_name_chars(name):\n        raise DagsterInvalidDefinitionError(\n            f'"{name}" is not a valid name in Dagster. 
Names must be in regex'\n            f" {VALID_NAME_REGEX_STR}."\n        )\n\n\ndef is_valid_name(name: str) -> bool:\n    check.str_param(name, "name")\n\n    return name not in DISALLOWED_NAMES and has_valid_name_chars(name)\n\n\ndef _kv_str(key: object, value: object) -> str:\n    return f'{key}="{value!r}"'\n\n\ndef struct_to_string(name: str, **kwargs: object) -> str:\n    # Sort the kwargs to ensure consistent representations across Python versions\n    props_str = ", ".join([_kv_str(key, value) for key, value in sorted(kwargs.items())])\n    return f"{name}({props_str})"\n\n\ndef validate_tags(\n    tags: Optional[Mapping[str, Any]], allow_reserved_tags: bool = True\n) -> Mapping[str, str]:\n    valid_tags: Dict[str, str] = {}\n    for key, value in check.opt_mapping_param(tags, "tags", key_type=str).items():\n        if not isinstance(value, str):\n            valid = False\n            err_reason = f'Could not JSON encode value "{value}"'\n            str_val = None\n            try:\n                str_val = seven.json.dumps(value)\n                err_reason = (\n                    'JSON encoding "{json}" of value "{val}" is not equivalent to original value'\n                    .format(json=str_val, val=value)\n                )\n\n                valid = seven.json.loads(str_val) == value\n            except Exception:\n                pass\n\n            if not valid:\n                raise DagsterInvalidDefinitionError(\n                    f'Invalid value for tag "{key}", {err_reason}. 
Tag values must be strings '\n                    "or meet the constraint that json.loads(json.dumps(value)) == value."\n                )\n\n            valid_tags[key] = str_val  # type: ignore  # (possible none)\n        else:\n            valid_tags[key] = value\n\n    if not allow_reserved_tags:\n        check_reserved_tags(valid_tags)\n\n    return valid_tags\n\n\ndef validate_group_name(group_name: Optional[str]) -> str:\n    """Ensures a string name is valid and returns a default if no name provided."""\n    if group_name:\n        check_valid_chars(group_name)\n        return group_name\n    return DEFAULT_GROUP_NAME\n\n\n
[docs]def config_from_files(config_files: Sequence[str]) -> Mapping[str, Any]:\n """Constructs run config from YAML files.\n\n Args:\n config_files (List[str]): List of paths or glob patterns for yaml files\n to load and parse as the run config.\n\n Returns:\n Dict[str, Any]: A run config dictionary constructed from provided YAML files.\n\n Raises:\n FileNotFoundError: When a config file produces no results\n DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse\n error.\n """\n config_files = check.opt_sequence_param(config_files, "config_files")\n\n filenames = []\n for file_glob in config_files or []:\n globbed_files = glob(file_glob)\n if not globbed_files:\n raise DagsterInvariantViolationError(\n f'File or glob pattern "{file_glob}" for "config_files" produced no results.'\n )\n\n filenames += [os.path.realpath(globbed_file) for globbed_file in globbed_files]\n\n try:\n run_config = merge_yamls(filenames)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing files {filenames} "\n f"loaded by file/patterns {config_files}."\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_yaml_strings(yaml_strings: Sequence[str]) -> Mapping[str, Any]:\n """Static constructor for run configs from YAML strings.\n\n Args:\n yaml_strings (List[str]): List of yaml strings to parse as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n yaml_strings = check.sequence_param(yaml_strings, "yaml_strings", of_type=str)\n\n try:\n run_config = merge_yaml_strings(yaml_strings)\n except yaml.YAMLError as err:\n raise DagsterInvariantViolationError(\n f"Encountered error attempting to parse yaml. Parsing YAMLs {yaml_strings} "\n ) from err\n\n return check.is_dict(cast(Dict[str, object], run_config), key_type=str)
\n\n\n
[docs]def config_from_pkg_resources(pkg_resource_defs: Sequence[Tuple[str, str]]) -> Mapping[str, Any]:\n """Load a run config from a package resource, using :py:func:`pkg_resources.resource_string`.\n\n Example:\n .. code-block:: python\n\n config_from_pkg_resources(\n pkg_resource_defs=[\n ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n ],\n )\n\n\n Args:\n pkg_resource_defs (List[(str, str)]): List of pkg_resource modules/files to\n load as the run config.\n\n Returns:\n Dict[Str, Any]: A run config dictionary constructed from the provided yaml strings\n\n Raises:\n DagsterInvariantViolationError: When one of the YAML documents is invalid and has a\n parse error.\n """\n import pkg_resources # expensive, import only on use\n\n pkg_resource_defs = check.sequence_param(pkg_resource_defs, "pkg_resource_defs", of_type=tuple)\n\n try:\n yaml_strings = [\n pkg_resources.resource_string(*pkg_resource_def).decode("utf-8")\n for pkg_resource_def in pkg_resource_defs\n ]\n except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:\n raise DagsterInvariantViolationError(\n "Encountered error attempting to parse yaml. Loading YAMLs from "\n f"package resources {pkg_resource_defs}."\n ) from err\n\n return config_from_yaml_strings(yaml_strings=yaml_strings)
\n
", "current_page_name": "_modules/dagster/_core/definitions/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.utils"}, "version_strategy": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.definitions.version_strategy

\nimport hashlib\nimport inspect\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Any, NamedTuple, Optional\n\nfrom dagster._annotations import public\n\nif TYPE_CHECKING:\n    from .op_definition import OpDefinition\n    from .resource_definition import ResourceDefinition\n\n\n
[docs]class OpVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for an op.\n\n Attributes:\n op_def (OpDefinition): The definition of the op to compute a version for.\n op_config (Any): The parsed config to be passed to the op during execution.\n """\n\n op_def: "OpDefinition"\n op_config: Any
\n\n\n
[docs]class ResourceVersionContext(NamedTuple):\n """Provides execution-time information for computing the version for a resource.\n\n Attributes:\n resource_def (ResourceDefinition): The definition of the resource whose version will be computed.\n resource_config (Any): The parsed config to be passed to the resource during execution.\n """\n\n resource_def: "ResourceDefinition"\n resource_config: Any
\n\n\n
[docs]class VersionStrategy(ABC):\n """Abstract class for defining a strategy to version ops and resources.\n\n When subclassing, `get_op_version` must be implemented, and\n `get_resource_version` can be optionally implemented.\n\n `get_op_version` should ingest an OpVersionContext, and `get_resource_version` should ingest a\n ResourceVersionContext. From that, each synthesize a unique string called\n a `version`, which will\n be tagged to outputs of that op in the job. Providing a\n `VersionStrategy` instance to a\n job will enable memoization on that job, such that only steps whose\n outputs do not have an up-to-date version will run.\n """\n\n
[docs] @public\n @abstractmethod\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return None
\n\n\n
[docs]class SourceHashVersionStrategy(VersionStrategy):\n """VersionStrategy that checks for changes to the source code of ops and resources.\n\n Only checks for changes within the immediate body of the op/resource's\n decorated function (or compute function, if the op/resource was\n constructed directly from a definition).\n """\n\n def _get_source_hash(self, fn):\n code_as_str = inspect.getsource(fn)\n return hashlib.sha1(code_as_str.encode("utf-8")).hexdigest()\n\n
[docs] @public\n def get_op_version(self, context: OpVersionContext) -> str:\n """Computes a version for an op by hashing its source code.\n\n Args:\n context (OpVersionContext): The context for computing the version.\n\n Returns:\n str: The version for the op.\n """\n compute_fn = context.op_def.compute_fn\n if callable(compute_fn):\n return self._get_source_hash(compute_fn)\n else:\n return self._get_source_hash(compute_fn.decorated_fn)
\n\n
[docs] @public\n def get_resource_version(self, context: ResourceVersionContext) -> Optional[str]:\n """Computes a version for a resource by hashing its source code.\n\n Args:\n context (ResourceVersionContext): The context for computing the version.\n\n Returns:\n Optional[str]: The version for the resource. If None, the resource will not be\n memoized.\n """\n return self._get_source_hash(context.resource_def.resource_fn)
\n
", "current_page_name": "_modules/dagster/_core/definitions/version_strategy", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.definitions.version_strategy"}}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.errors

\n"""Core Dagster error classes.\n\nAll errors thrown by the Dagster framework inherit from :py:class:`~dagster.DagsterError`. Users\nshould not subclass this base class for their own exceptions.\n\nThere is another exception base class, :py:class:`~dagster.DagsterUserCodeExecutionError`, which is\nused by the framework in concert with the :py:func:`~dagster._core.errors.user_code_error_boundary`.\n\nDagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\n:py:class:`~dagster.DagsterUserCodeExecutionError`.\n\nThe wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.\n"""\n\nimport sys\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, Type\n\nimport dagster._check as check\nfrom dagster._utils.interrupts import raise_interrupts_as\n\nif TYPE_CHECKING:\n    from dagster._core.log_manager import DagsterLogManager\n\n\nclass DagsterExecutionInterruptedError(BaseException):\n    """Pipeline execution was interrupted during the execution process.\n\n    Just like KeyboardInterrupt this inherits from BaseException\n    as to not be accidentally caught by code that catches Exception\n    and thus prevent the interpreter from exiting.\n    """\n\n\n
[docs]class DagsterError(Exception):\n """Base class for all errors thrown by the Dagster framework.\n\n Users should not subclass this base class for their own exceptions.\n """\n\n @property\n def is_user_code_error(self):\n """Returns true if this error is attributable to user code."""\n return False
\n\n\n
[docs]class DagsterInvalidDefinitionError(DagsterError):\n """Indicates that the rules for a definition have been violated by the user."""
\n\n\nclass DagsterInvalidObservationError(DagsterError):\n """Indicates that an invalid value was returned from a source asset observation function."""\n\n\n
[docs]class DagsterInvalidSubsetError(DagsterError):\n """Indicates that a subset of a pipeline is invalid because either:\n - One or more ops in the specified subset do not exist on the job.'\n - The subset produces an invalid job.\n """
\n\n\nclass DagsterInvalidDeserializationVersionError(DagsterError):\n """Indicates that a serialized value has an unsupported version and cannot be deserialized."""\n\n\nPYTHONIC_CONFIG_ERROR_VERBIAGE = """\nThis config type can be a:\n - Python primitive type\n - int, float, bool, str, list\n - A Python Dict or List type containing other valid types\n - Custom data classes extending dagster.Config\n - A Pydantic discriminated union type (https://docs.pydantic.dev/usage/types/#discriminated-unions-aka-tagged-unions)\n"""\n\nPYTHONIC_RESOURCE_ADDITIONAL_TYPES = """\n\nIf this config type represents a resource dependency, its annotation must either:\n - Extend dagster.ConfigurableResource, dagster.ConfigurableIOManager, or\n - Be wrapped in a ResourceDependency annotation, e.g. ResourceDependency[{invalid_type_str}]\n"""\n\n\ndef _generate_pythonic_config_error_message(\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: bool = False,\n) -> str:\n invalid_type_name = getattr(invalid_type, "__name__", "<my type>")\n pythonic_config_error_verbiage = (\n PYTHONIC_CONFIG_ERROR_VERBIAGE + (PYTHONIC_RESOURCE_ADDITIONAL_TYPES if is_resource else "")\n ).format(invalid_type_str=invalid_type_name)\n\n return ("""\nError defining Dagster config class{config_class}{field_name}.\nUnable to resolve config type {invalid_type} to a supported Dagster config type.\n\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""").format(\n config_class=f" {config_class!r}" if config_class else "",\n field_name=f" on field '{field_name}'" if field_name else "",\n invalid_type=repr(invalid_type),\n PYTHONIC_CONFIG_ERROR_VERBIAGE=pythonic_config_error_verbiage,\n )\n\n\nclass DagsterInvalidPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with an invalid value."""\n\n def __init__(\n self,\n config_class: Optional[Type],\n field_name: Optional[str],\n invalid_type: Any,\n is_resource: 
bool = False,\n **kwargs,\n ):\n self.invalid_type = invalid_type\n self.field_name = field_name\n self.config_class = config_class\n super(DagsterInvalidPythonicConfigDefinitionError, self).__init__(\n _generate_pythonic_config_error_message(\n config_class=config_class,\n field_name=field_name,\n invalid_type=invalid_type,\n is_resource=is_resource,\n ),\n **kwargs,\n )\n\n\nclass DagsterInvalidDagsterTypeInPythonicConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a Pythonic config or resource class with a DagsterType\n annotated field.\n """\n\n def __init__(\n self,\n config_class_name: str,\n field_name: Optional[str],\n **kwargs,\n ):\n self.field_name = field_name\n super(DagsterInvalidDagsterTypeInPythonicConfigDefinitionError, self).__init__(\n f"""Error defining Dagster config class '{config_class_name}' on field '{field_name}'. DagsterTypes cannot be used to annotate a config type. DagsterType is meant only for type checking and coercion in op and asset inputs and outputs.\n{PYTHONIC_CONFIG_ERROR_VERBIAGE}""",\n **kwargs,\n )\n\n\nCONFIG_ERROR_VERBIAGE = """\nThis value can be a:\n - Field\n - Python primitive types that resolve to dagster config types\n - int, float, bool, str, list.\n - A dagster config type: Int, Float, Bool, Array, Optional, Selector, Shape, Permissive, Map\n - A bare python dictionary, which is wrapped in Field(Shape(...)). Any values\n in the dictionary get resolved by the same rules, recursively.\n - A python list with a single entry that can resolve to a type, e.g. [int]\n"""\n\n\n
[docs]class DagsterInvalidConfigDefinitionError(DagsterError):\n """Indicates that you have attempted to construct a config with an invalid value.\n\n Acceptable values for config types are any of:\n 1. A Python primitive type that resolves to a Dagster config type\n (:py:class:`~python:int`, :py:class:`~python:float`, :py:class:`~python:bool`,\n :py:class:`~python:str`, or :py:class:`~python:list`).\n\n 2. A Dagster config type: :py:data:`~dagster.Int`, :py:data:`~dagster.Float`,\n :py:data:`~dagster.Bool`, :py:data:`~dagster.String`,\n :py:data:`~dagster.StringSource`, :py:data:`~dagster.Any`,\n :py:class:`~dagster.Array`, :py:data:`~dagster.Noneable`, :py:data:`~dagster.Enum`,\n :py:class:`~dagster.Selector`, :py:class:`~dagster.Shape`, or\n :py:class:`~dagster.Permissive`.\n\n 3. A bare python dictionary, which will be automatically wrapped in\n :py:class:`~dagster.Shape`. Values of the dictionary are resolved recursively\n according to the same rules.\n\n 4. A bare python list of length one which itself is config type.\n Becomes :py:class:`Array` with list element as an argument.\n\n 5. An instance of :py:class:`~dagster.Field`.\n """\n\n def __init__(self, original_root, current_value, stack, reason=None, **kwargs):\n self.original_root = original_root\n self.current_value = current_value\n self.stack = stack\n super(DagsterInvalidConfigDefinitionError, self).__init__(\n (\n "Error defining config. Original value passed: {original_root}. "\n "{stack_str}{current_value} "\n "cannot be resolved.{reason_str}"\n + CONFIG_ERROR_VERBIAGE\n ).format(\n original_root=repr(original_root),\n stack_str="Error at stack path :" + ":".join(stack) + ". " if stack else "",\n current_value=repr(current_value),\n reason_str=f" Reason: {reason}." if reason else "",\n ),\n **kwargs,\n )
\n\n\n
[docs]class DagsterInvariantViolationError(DagsterError):\n """Indicates the user has violated a well-defined invariant that can only be enforced\n at runtime.\n """
\n\n\n
[docs]class DagsterExecutionStepNotFoundError(DagsterError):\n """Thrown when the user specifies execution step keys that do not exist."""\n\n def __init__(self, *args, **kwargs):\n self.step_keys = check.list_param(kwargs.pop("step_keys"), "step_keys", str)\n super(DagsterExecutionStepNotFoundError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterExecutionPlanSnapshotNotFoundError(DagsterError):\n """Thrown when an expected execution plan snapshot could not be found on a PipelineRun."""\n\n\n
[docs]class DagsterRunNotFoundError(DagsterError):\n """Thrown when a run cannot be found in run storage."""\n\n def __init__(self, *args, **kwargs):\n self.invalid_run_id = check.str_param(kwargs.pop("invalid_run_id"), "invalid_run_id")\n super(DagsterRunNotFoundError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterStepOutputNotFoundError(DagsterError):\n """Indicates that previous step outputs required for an execution step to proceed are not\n available.\n """\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterStepOutputNotFoundError, self).__init__(*args, **kwargs)
\n\n\n@contextmanager\ndef raise_execution_interrupts() -> Iterator[None]:\n with raise_interrupts_as(DagsterExecutionInterruptedError):\n yield\n\n\n
[docs]@contextmanager\ndef user_code_error_boundary(\n error_cls: Type["DagsterUserCodeExecutionError"],\n msg_fn: Callable[[], str],\n log_manager: Optional["DagsterLogManager"] = None,\n **kwargs: object,\n) -> Iterator[None]:\n """Wraps the execution of user-space code in an error boundary. This places a uniform\n policy around any user code invoked by the framework. This ensures that all user\n errors are wrapped in an exception derived from DagsterUserCodeExecutionError,\n and that the original stack trace of the user error is preserved, so that it\n can be reported without confusing framework code in the stack trace, if a\n tool author wishes to do so.\n\n Examples:\n .. code-block:: python\n\n with user_code_error_boundary(\n # Pass a class that inherits from DagsterUserCodeExecutionError\n DagsterExecutionStepExecutionError,\n # Pass a function that produces a message\n "Error occurred during step execution"\n ):\n call_user_provided_function()\n\n """\n check.callable_param(msg_fn, "msg_fn")\n check.class_param(error_cls, "error_cls", superclass=DagsterUserCodeExecutionError)\n\n with raise_execution_interrupts():\n if log_manager:\n log_manager.begin_python_log_capture()\n try:\n yield\n except DagsterError as de:\n # The system has thrown an error that is part of the user-framework contract\n raise de\n except Exception as e:\n # An exception has been thrown by user code and computation should cease\n # with the error reported further up the stack\n raise error_cls(\n msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs\n ) from e\n finally:\n if log_manager:\n log_manager.end_python_log_capture()
\n\n\n
[docs]class DagsterUserCodeExecutionError(DagsterError):\n """This is the base class for any exception that is meant to wrap an\n :py:class:`~python:Exception` thrown by user code. It wraps that existing user code.\n The ``original_exc_info`` argument to the constructor is meant to be a tuple of the type\n returned by :py:func:`sys.exc_info <python:sys.exc_info>` at the call site of the constructor.\n\n Users should not subclass this base class for their own exceptions and should instead throw\n freely from user code. User exceptions will be automatically wrapped and rethrown.\n """\n\n def __init__(self, *args, **kwargs):\n # original_exc_info should be gotten from a sys.exc_info() call at the\n # callsite inside of the exception handler. this will allow consuming\n # code to *re-raise* the user error in it's original format\n # for cleaner error reporting that does not have framework code in it\n user_exception = check.inst_param(kwargs.pop("user_exception"), "user_exception", Exception)\n original_exc_info = check.tuple_param(kwargs.pop("original_exc_info"), "original_exc_info")\n\n check.invariant(original_exc_info[0] is not None)\n\n super(DagsterUserCodeExecutionError, self).__init__(args[0], *args[1:], **kwargs)\n\n self.user_exception = check.opt_inst_param(user_exception, "user_exception", Exception)\n self.original_exc_info = original_exc_info\n\n @property\n def is_user_code_error(self) -> bool:\n return True
\n\n\n
[docs]class DagsterTypeCheckError(DagsterUserCodeExecutionError):\n """Indicates an error in the op type system at runtime. E.g. a op receives an\n unexpected input, or produces an output that does not match the type of the output definition.\n """
\n\n\nclass DagsterExecutionLoadInputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while loading an input for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.input_name = check.str_param(kwargs.pop("input_name"), "input_name")\n super(DagsterExecutionLoadInputError, self).__init__(*args, **kwargs)\n\n\nclass DagsterExecutionHandleOutputError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while handling an output for a step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.output_name = check.str_param(kwargs.pop("output_name"), "output_name")\n super(DagsterExecutionHandleOutputError, self).__init__(*args, **kwargs)\n\n\n
[docs]class DagsterExecutionStepExecutionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of an execution step."""\n\n def __init__(self, *args, **kwargs):\n self.step_key = check.str_param(kwargs.pop("step_key"), "step_key")\n self.op_name = check.str_param(kwargs.pop("op_name"), "op_name")\n self.op_def_name = check.str_param(kwargs.pop("op_def_name"), "op_def_name")\n super(DagsterExecutionStepExecutionError, self).__init__(*args, **kwargs)
\n\n\n
[docs]class DagsterResourceFunctionError(DagsterUserCodeExecutionError):\n """Indicates an error occurred while executing the body of the ``resource_fn`` in a\n :py:class:`~dagster.ResourceDefinition` during resource initialization.\n """
\n\n\n
[docs]class DagsterConfigMappingFunctionError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of a config mapping\n function defined in a :py:class:`~dagster.JobDefinition` or `~dagster.GraphDefinition` during\n config parsing.\n """
\n\n\nclass DagsterTypeLoadingError(DagsterUserCodeExecutionError):\n """Indicates that an unexpected error occurred while executing the body of an type load\n function defined in a :py:class:`~dagster.DagsterTypeLoader` during loading of a custom type.\n """\n\n\n
[docs]class DagsterUnknownResourceError(DagsterError, AttributeError):\n # inherits from AttributeError as it is raised within a __getattr__ call... used to support\n # object hasattr method\n """Indicates that an unknown resource was accessed in the body of an execution step. May often\n happen by accessing a resource in the compute function of an op without first supplying the\n op with the correct `required_resource_keys` argument.\n """\n\n def __init__(self, resource_name, *args, **kwargs):\n self.resource_name = check.str_param(resource_name, "resource_name")\n msg = (\n f"Unknown resource `{resource_name}`. Specify `{resource_name}` as a required resource "\n "on the compute / config function that accessed it."\n )\n super(DagsterUnknownResourceError, self).__init__(msg, *args, **kwargs)
\n\n\nclass DagsterInvalidInvocationError(DagsterError):\n """Indicates that an error has occurred when an op has been invoked, but before the actual\n core compute has been reached.\n """\n\n\n
[docs]class DagsterInvalidConfigError(DagsterError):\n """Thrown when provided config is invalid (does not type check against the relevant config\n schema).\n """\n\n def __init__(self, preamble, errors, config_value, *args, **kwargs):\n from dagster._config import EvaluationError\n\n check.str_param(preamble, "preamble")\n self.errors = check.list_param(errors, "errors", of_type=EvaluationError)\n self.config_value = config_value\n\n error_msg = preamble\n error_messages = []\n\n for i_error, error in enumerate(self.errors):\n error_messages.append(error.message)\n error_msg += f"\\n Error {i_error + 1}: {error.message}"\n\n self.message = error_msg\n self.error_messages = error_messages\n\n super(DagsterInvalidConfigError, self).__init__(error_msg, *args, **kwargs)
\n\n\n
[docs]class DagsterUnmetExecutorRequirementsError(DagsterError):\n """Indicates the resolved executor is incompatible with the state of other systems\n such as the :py:class:`~dagster._core.instance.DagsterInstance` or system storage configuration.\n """
\n\n\n
[docs]class DagsterSubprocessError(DagsterError):\n """An exception has occurred in one or more of the child processes dagster manages.\n This error forwards the message and stack trace for all of the collected errors.\n """\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.subprocess_error_infos = check.list_param(\n kwargs.pop("subprocess_error_infos"), "subprocess_error_infos", SerializableErrorInfo\n )\n super(DagsterSubprocessError, self).__init__(*args, **kwargs)
\n\n\nclass DagsterUserCodeUnreachableError(DagsterError):\n """Dagster was unable to reach a user code server to fetch information about user code."""\n\n\nclass DagsterUserCodeProcessError(DagsterError):\n """An exception has occurred in a user code process that the host process raising this error\n was communicating with.\n """\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterUserCodeProcessError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterUserCodeProcessError, self).__init__(*args, **kwargs)\n\n\nclass DagsterMaxRetriesExceededError(DagsterError):\n """Raised when raise_on_error is true, and retries were exceeded, this error should be raised."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.user_code_process_error_infos = check.list_param(\n kwargs.pop("user_code_process_error_infos"),\n "user_code_process_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterMaxRetriesExceededError, self).__init__(*args, **kwargs)\n\n @staticmethod\n def from_error_info(error_info):\n from dagster._utils.error import SerializableErrorInfo\n\n check.inst_param(error_info, "error_info", SerializableErrorInfo)\n return DagsterMaxRetriesExceededError(\n error_info.to_string(), user_code_process_error_infos=[error_info]\n )\n\n\nclass DagsterCodeLocationNotFoundError(DagsterError):\n pass\n\n\nclass DagsterCodeLocationLoadError(DagsterError):\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.load_error_infos = 
check.list_param(\n kwargs.pop("load_error_infos"),\n "load_error_infos",\n SerializableErrorInfo,\n )\n super(DagsterCodeLocationLoadError, self).__init__(*args, **kwargs)\n\n\nclass DagsterLaunchFailedError(DagsterError):\n """Indicates an error while attempting to launch a pipeline run."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterLaunchFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterBackfillFailedError(DagsterError):\n """Indicates an error while attempting to launch a backfill."""\n\n def __init__(self, *args, **kwargs):\n from dagster._utils.error import SerializableErrorInfo\n\n self.serializable_error_info = check.opt_inst_param(\n kwargs.pop("serializable_error_info", None),\n "serializable_error_info",\n SerializableErrorInfo,\n )\n super(DagsterBackfillFailedError, self).__init__(*args, **kwargs)\n\n\nclass DagsterRunAlreadyExists(DagsterError):\n """Indicates that a pipeline run already exists in a run storage."""\n\n\nclass DagsterSnapshotDoesNotExist(DagsterError):\n """Indicates you attempted to create a pipeline run with a nonexistent snapshot id."""\n\n\nclass DagsterRunConflict(DagsterError):\n """Indicates that a conflicting pipeline run exists in a run storage."""\n\n\n
[docs]class DagsterTypeCheckDidNotPass(DagsterError):\n """Indicates that a type check failed.\n\n This is raised when ``raise_on_error`` is ``True`` in calls to the synchronous job and\n graph execution APIs (e.g. `graph.execute_in_process()`, `job.execute_in_process()` -- typically\n within a test), and a :py:class:`~dagster.DagsterType`'s type check fails by returning either\n ``False`` or an instance of :py:class:`~dagster.TypeCheck` whose ``success`` member is ``False``.\n """\n\n def __init__(self, description=None, metadata=None, dagster_type=None):\n from dagster import DagsterType\n from dagster._core.definitions.metadata import normalize_metadata\n\n super(DagsterTypeCheckDidNotPass, self).__init__(description)\n self.description = check.opt_str_param(description, "description")\n self.metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n )\n self.dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)
\n\n\nclass DagsterAssetCheckFailedError(DagsterError):\n """Indicates than an asset check failed."""\n\n\n
[docs]class DagsterEventLogInvalidForRun(DagsterError):\n """Raised when the event logs for a historical run are malformed or invalid."""\n\n def __init__(self, run_id):\n self.run_id = check.str_param(run_id, "run_id")\n super(DagsterEventLogInvalidForRun, self).__init__(\n f"Event logs invalid for run id {run_id}"\n )
\n\n\nclass ScheduleExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of schedule."""\n\n\nclass SensorExecutionError(DagsterUserCodeExecutionError):\n """Errors raised in a user process during the execution of a sensor (or its job)."""\n\n\nclass PartitionExecutionError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions of a partition set schedule."""\n\n\nclass DagsterInvalidAssetKey(DagsterError):\n """Error raised by invalid asset key."""\n\n\nclass DagsterInvalidMetadata(DagsterError):\n """Error raised by invalid metadata parameters."""\n\n\nclass HookExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined hook."""\n\n\nclass RunStatusSensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined run status sensor."""\n\n\nclass FreshnessPolicySensorExecutionError(DagsterUserCodeExecutionError):\n """Error raised during the execution of a user-defined freshness policy sensor."""\n\n\nclass DagsterImportError(DagsterError):\n """Import error raised while importing user-code."""\n\n\nclass JobError(DagsterUserCodeExecutionError):\n """Errors raised during the execution of user-provided functions for a defined Job."""\n\n\nclass DagsterUnknownStepStateError(DagsterError):\n """When job execution completes with steps in an unknown state."""\n\n\nclass DagsterObjectStoreError(DagsterError):\n """Errors during an object store operation."""\n\n\nclass DagsterInvalidPropertyError(DagsterError):\n """Indicates that an invalid property was accessed. 
May often happen by accessing a property\n that no longer exists after breaking changes.\n """\n\n\nclass DagsterHomeNotSetError(DagsterError):\n """The user has tried to use a command that requires an instance or invoke DagsterInstance.get()\n without setting DAGSTER_HOME env var.\n """\n\n\nclass DagsterUnknownPartitionError(DagsterError):\n """The user has tried to access run config for a partition name that does not exist."""\n\n\nclass DagsterUndefinedDataVersionError(DagsterError):\n """The user attempted to retrieve the most recent logical version for an asset, but no logical version is defined."""\n\n\nclass DagsterAssetBackfillDataLoadError(DagsterError):\n """Indicates that an asset backfill is now unloadable. May happen when (1) a code location containing\n targeted assets is unloadable or (2) and asset or an asset's partitions definition has been removed.\n """\n\n\nclass DagsterDefinitionChangedDeserializationError(DagsterError):\n """Indicates that a stored value can't be deserialized because the definition needed to interpret\n it has changed.\n """\n\n\nclass DagsterPipesExecutionError(DagsterError):\n """Indicates that an error occurred during the execution of an external process."""\n
", "current_page_name": "_modules/dagster/_core/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.errors"}, "event_api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.event_api

\nfrom datetime import datetime\nfrom typing import Callable, Mapping, NamedTuple, Optional, Sequence, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization, AssetObservation\nfrom dagster._core.errors import DagsterInvalidInvocationError\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._serdes import whitelist_for_serdes\n\nEventHandlerFn: TypeAlias = Callable[[EventLogEntry, str], None]\n\n\n
[docs]class RunShardedEventsCursor(NamedTuple):\n """Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\n performance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\n run-sharded storages, the id field is ignored, since they may not be unique across shards.\n """\n\n id: int\n run_updated_after: datetime
\n\n\n
[docs]@whitelist_for_serdes\nclass EventLogRecord(NamedTuple):\n """Internal representation of an event record, as stored in a\n :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not instantiate this class directly.\n """\n\n storage_id: PublicAttr[int]\n event_log_entry: PublicAttr[EventLogEntry]\n\n @property\n def run_id(self) -> str:\n return self.event_log_entry.run_id\n\n @property\n def timestamp(self) -> float:\n return self.event_log_entry.timestamp\n\n @property\n def asset_key(self) -> Optional[AssetKey]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.asset_key\n\n return None\n\n @property\n def partition_key(self) -> Optional[str]:\n dagster_event = self.event_log_entry.dagster_event\n if dagster_event:\n return dagster_event.partition\n\n return None\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n return self.event_log_entry.asset_materialization\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n return self.event_log_entry.asset_observation
\n\n\n
[docs]@whitelist_for_serdes\nclass EventRecordsFilter(\n NamedTuple(\n "_EventRecordsFilter",\n [\n ("event_type", DagsterEventType),\n ("asset_key", Optional[AssetKey]),\n ("asset_partitions", Optional[Sequence[str]]),\n ("after_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("before_cursor", Optional[Union[int, RunShardedEventsCursor]]),\n ("after_timestamp", Optional[float]),\n ("before_timestamp", Optional[float]),\n ("storage_ids", Optional[Sequence[int]]),\n ("tags", Optional[Mapping[str, Union[str, Sequence[str]]]]),\n ],\n )\n):\n """Defines a set of filter fields for fetching a set of event log entries or event log records.\n\n Args:\n event_type (DagsterEventType): Filter argument for dagster event type\n asset_key (Optional[AssetKey]): Asset key for which to get asset materialization event\n entries / records.\n asset_partitions (Optional[List[str]]): Filter parameter such that only asset\n events with a partition value matching one of the provided values. Only\n valid when the `asset_key` parameter is provided.\n after_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that only\n records with storage_id greater than the provided value are returned. Using a\n run-sharded events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n before_cursor (Optional[Union[int, RunShardedEventsCursor]]): Filter parameter such that\n records with storage_id less than the provided value are returned. 
Using a run-sharded\n events cursor will result in a significant performance gain when run against\n a SqliteEventLogStorage implementation (which is run-sharded)\n after_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp greater than the provided value are returned.\n before_timestamp (Optional[float]): Filter parameter such that only event records for\n events with timestamp less than the provided value are returned.\n """\n\n def __new__(\n cls,\n event_type: DagsterEventType,\n asset_key: Optional[AssetKey] = None,\n asset_partitions: Optional[Sequence[str]] = None,\n after_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n before_cursor: Optional[Union[int, RunShardedEventsCursor]] = None,\n after_timestamp: Optional[float] = None,\n before_timestamp: Optional[float] = None,\n storage_ids: Optional[Sequence[int]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n ):\n check.opt_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.inst_param(event_type, "event_type", DagsterEventType)\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n if tags and event_type is not DagsterEventType.ASSET_MATERIALIZATION:\n raise DagsterInvalidInvocationError(\n "Can only filter by tags for asset materialization events"\n )\n\n # type-ignores work around mypy type inference bug\n return super(EventRecordsFilter, cls).__new__(\n cls,\n event_type=event_type,\n asset_key=check.opt_inst_param(asset_key, "asset_key", AssetKey),\n asset_partitions=asset_partitions,\n after_cursor=check.opt_inst_param(\n after_cursor, "after_cursor", (int, RunShardedEventsCursor)\n ),\n before_cursor=check.opt_inst_param(\n before_cursor, "before_cursor", (int, RunShardedEventsCursor)\n ),\n after_timestamp=check.opt_float_param(after_timestamp, "after_timestamp"),\n before_timestamp=check.opt_float_param(before_timestamp, "before_timestamp"),\n 
storage_ids=check.opt_nullable_sequence_param(storage_ids, "storage_ids", of_type=int),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/event_api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.event_api"}, "events": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events

\n"""Structured representations of system events."""\nimport logging\nimport os\nimport sys\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    HookDefinition,\n    NodeHandle,\n)\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.events import AssetLineageInfo, ObjectStoreOperationType\nfrom dagster._core.definitions.metadata import (\n    MetadataFieldSerializer,\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import HookExecutionError\nfrom dagster._core.execution.context.system import IPlanContext, IStepContext, StepExecutionContext\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.inputs import StepInputData\nfrom dagster._core.execution.plan.objects import StepFailureData, StepRetryData, StepSuccessData\nfrom dagster._core.execution.plan.outputs import StepOutputData\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\nfrom dagster._serdes.serdes import UnpackContext\nfrom dagster._utils.error import SerializableErrorInfo, serializable_error_info_from_exc_info\nfrom dagster._utils.timing import format_duration\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.events import ObjectStoreOperation\n    from dagster._core.execution.plan.plan 
import ExecutionPlan\n    from dagster._core.execution.plan.step import StepKind\n\n\nEventSpecificData = Union[\n    StepOutputData,\n    StepFailureData,\n    StepSuccessData,\n    "StepMaterializationData",\n    "StepExpectationResultData",\n    StepInputData,\n    "EngineEventData",\n    "HookErroredData",\n    StepRetryData,\n    "JobFailureData",\n    "JobCanceledData",\n    "ObjectStoreOperationResultData",\n    "HandledOutputData",\n    "LoadedInputData",\n    "ComputeLogsCaptureData",\n    "AssetObservationData",\n    "AssetMaterializationPlannedData",\n    "AssetCheckEvaluation",\n    "AssetCheckEvaluationPlanned",\n]\n\n\n
[docs]class DagsterEventType(str, Enum):\n """The types of events that may be yielded by op and job execution."""\n\n STEP_OUTPUT = "STEP_OUTPUT"\n STEP_INPUT = "STEP_INPUT"\n STEP_FAILURE = "STEP_FAILURE"\n STEP_START = "STEP_START"\n STEP_SUCCESS = "STEP_SUCCESS"\n STEP_SKIPPED = "STEP_SKIPPED"\n\n # The process carrying out step execution is starting/started. Shown as a\n # marker start/end in the Dagster UI.\n STEP_WORKER_STARTING = "STEP_WORKER_STARTING"\n STEP_WORKER_STARTED = "STEP_WORKER_STARTED"\n\n # Resource initialization for execution has started/succeede/failed. Shown\n # as a marker start/end in the Dagster UI.\n RESOURCE_INIT_STARTED = "RESOURCE_INIT_STARTED"\n RESOURCE_INIT_SUCCESS = "RESOURCE_INIT_SUCCESS"\n RESOURCE_INIT_FAILURE = "RESOURCE_INIT_FAILURE"\n\n STEP_UP_FOR_RETRY = "STEP_UP_FOR_RETRY" # "failed" but want to retry\n STEP_RESTARTED = "STEP_RESTARTED"\n\n ASSET_MATERIALIZATION = "ASSET_MATERIALIZATION"\n ASSET_MATERIALIZATION_PLANNED = "ASSET_MATERIALIZATION_PLANNED"\n ASSET_OBSERVATION = "ASSET_OBSERVATION"\n STEP_EXPECTATION_RESULT = "STEP_EXPECTATION_RESULT"\n ASSET_CHECK_EVALUATION_PLANNED = "ASSET_CHECK_EVALUATION_PLANNED"\n ASSET_CHECK_EVALUATION = "ASSET_CHECK_EVALUATION"\n\n # We want to display RUN_* events in the Dagster UI and in our LogManager output, but in order to\n # support backcompat for our storage layer, we need to keep the persisted value to be strings\n # of the form "PIPELINE_*". 
We may have user code that pass in the DagsterEventType\n # enum values into storage APIs (like get_event_records, which takes in an EventRecordsFilter).\n RUN_ENQUEUED = "PIPELINE_ENQUEUED"\n RUN_DEQUEUED = "PIPELINE_DEQUEUED"\n RUN_STARTING = "PIPELINE_STARTING" # Launch is happening, execution hasn't started yet\n RUN_START = "PIPELINE_START" # Execution has started\n RUN_SUCCESS = "PIPELINE_SUCCESS"\n RUN_FAILURE = "PIPELINE_FAILURE"\n RUN_CANCELING = "PIPELINE_CANCELING"\n RUN_CANCELED = "PIPELINE_CANCELED"\n\n # Keep these legacy enum values around, to keep back-compatability for user code that might be\n # using these constants to filter event records\n PIPELINE_ENQUEUED = RUN_ENQUEUED\n PIPELINE_DEQUEUED = RUN_DEQUEUED\n PIPELINE_STARTING = RUN_STARTING\n PIPELINE_START = RUN_START\n PIPELINE_SUCCESS = RUN_SUCCESS\n PIPELINE_FAILURE = RUN_FAILURE\n PIPELINE_CANCELING = RUN_CANCELING\n PIPELINE_CANCELED = RUN_CANCELED\n\n OBJECT_STORE_OPERATION = "OBJECT_STORE_OPERATION"\n ASSET_STORE_OPERATION = "ASSET_STORE_OPERATION"\n LOADED_INPUT = "LOADED_INPUT"\n HANDLED_OUTPUT = "HANDLED_OUTPUT"\n\n ENGINE_EVENT = "ENGINE_EVENT"\n\n HOOK_COMPLETED = "HOOK_COMPLETED"\n HOOK_ERRORED = "HOOK_ERRORED"\n HOOK_SKIPPED = "HOOK_SKIPPED"\n\n ALERT_START = "ALERT_START"\n ALERT_SUCCESS = "ALERT_SUCCESS"\n ALERT_FAILURE = "ALERT_FAILURE"\n\n LOGS_CAPTURED = "LOGS_CAPTURED"
\n\n\nEVENT_TYPE_VALUE_TO_DISPLAY_STRING = {\n "PIPELINE_ENQUEUED": "RUN_ENQUEUED",\n "PIPELINE_DEQUEUED": "RUN_DEQUEUED",\n "PIPELINE_STARTING": "RUN_STARTING",\n "PIPELINE_START": "RUN_START",\n "PIPELINE_SUCCESS": "RUN_SUCCESS",\n "PIPELINE_FAILURE": "RUN_FAILURE",\n "PIPELINE_CANCELING": "RUN_CANCELING",\n "PIPELINE_CANCELED": "RUN_CANCELED",\n}\n\nSTEP_EVENTS = {\n DagsterEventType.STEP_INPUT,\n DagsterEventType.STEP_START,\n DagsterEventType.STEP_OUTPUT,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.STEP_SUCCESS,\n DagsterEventType.STEP_SKIPPED,\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.STEP_EXPECTATION_RESULT,\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.OBJECT_STORE_OPERATION,\n DagsterEventType.HANDLED_OUTPUT,\n DagsterEventType.LOADED_INPUT,\n DagsterEventType.STEP_RESTARTED,\n DagsterEventType.STEP_UP_FOR_RETRY,\n}\n\nFAILURE_EVENTS = {\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.STEP_FAILURE,\n DagsterEventType.RUN_CANCELED,\n}\n\nPIPELINE_EVENTS = {\n DagsterEventType.RUN_ENQUEUED,\n DagsterEventType.RUN_DEQUEUED,\n DagsterEventType.RUN_STARTING,\n DagsterEventType.RUN_START,\n DagsterEventType.RUN_SUCCESS,\n DagsterEventType.RUN_FAILURE,\n DagsterEventType.RUN_CANCELING,\n DagsterEventType.RUN_CANCELED,\n}\n\nHOOK_EVENTS = {\n DagsterEventType.HOOK_COMPLETED,\n DagsterEventType.HOOK_ERRORED,\n DagsterEventType.HOOK_SKIPPED,\n}\n\nALERT_EVENTS = {\n DagsterEventType.ALERT_START,\n DagsterEventType.ALERT_SUCCESS,\n DagsterEventType.ALERT_FAILURE,\n}\n\nMARKER_EVENTS = {\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n}\n\n\nEVENT_TYPE_TO_PIPELINE_RUN_STATUS = {\n DagsterEventType.RUN_START: DagsterRunStatus.STARTED,\n DagsterEventType.RUN_SUCCESS: DagsterRunStatus.SUCCESS,\n 
DagsterEventType.RUN_FAILURE: DagsterRunStatus.FAILURE,\n DagsterEventType.RUN_ENQUEUED: DagsterRunStatus.QUEUED,\n DagsterEventType.RUN_STARTING: DagsterRunStatus.STARTING,\n DagsterEventType.RUN_CANCELING: DagsterRunStatus.CANCELING,\n DagsterEventType.RUN_CANCELED: DagsterRunStatus.CANCELED,\n}\n\nPIPELINE_RUN_STATUS_TO_EVENT_TYPE = {v: k for k, v in EVENT_TYPE_TO_PIPELINE_RUN_STATUS.items()}\n\nASSET_EVENTS = {\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_OBSERVATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n}\n\nASSET_CHECK_EVENTS = {\n DagsterEventType.ASSET_CHECK_EVALUATION,\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n}\n\n\ndef _assert_type(\n method: str,\n expected_type: Union[DagsterEventType, Sequence[DagsterEventType]],\n actual_type: DagsterEventType,\n) -> None:\n _expected_type = (\n [expected_type] if isinstance(expected_type, DagsterEventType) else expected_type\n )\n check.invariant(\n actual_type in _expected_type,\n f"{method} only callable when event_type is"\n f" {','.join([t.value for t in _expected_type])}, called on {actual_type}",\n )\n\n\ndef _validate_event_specific_data(\n event_type: DagsterEventType, event_specific_data: Optional["EventSpecificData"]\n) -> Optional["EventSpecificData"]:\n if event_type == DagsterEventType.STEP_OUTPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepOutputData)\n elif event_type == DagsterEventType.STEP_FAILURE:\n check.inst_param(event_specific_data, "event_specific_data", StepFailureData)\n elif event_type == DagsterEventType.STEP_SUCCESS:\n check.inst_param(event_specific_data, "event_specific_data", StepSuccessData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION:\n check.inst_param(event_specific_data, "event_specific_data", StepMaterializationData)\n elif event_type == DagsterEventType.STEP_EXPECTATION_RESULT:\n check.inst_param(event_specific_data, "event_specific_data", StepExpectationResultData)\n elif event_type == 
DagsterEventType.STEP_INPUT:\n check.inst_param(event_specific_data, "event_specific_data", StepInputData)\n elif event_type in (\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.STEP_WORKER_STARTING,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n ):\n check.inst_param(event_specific_data, "event_specific_data", EngineEventData)\n elif event_type == DagsterEventType.HOOK_ERRORED:\n check.inst_param(event_specific_data, "event_specific_data", HookErroredData)\n elif event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n check.inst_param(\n event_specific_data, "event_specific_data", AssetMaterializationPlannedData\n )\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluationPlanned)\n elif event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n check.inst_param(event_specific_data, "event_specific_data", AssetCheckEvaluation)\n\n return event_specific_data\n\n\ndef log_step_event(step_context: IStepContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n step_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for step {step_context.step.key}",\n dagster_event=event,\n )\n\n\ndef log_job_event(job_context: IPlanContext, event: "DagsterEvent") -> None:\n event_type = DagsterEventType(event.event_type_value)\n log_level = logging.ERROR if event_type in FAILURE_EVENTS else logging.DEBUG\n\n job_context.log.log_dagster_event(\n level=log_level,\n msg=event.message or f"{event_type} for pipeline {job_context.job_name}",\n dagster_event=event,\n )\n\n\ndef log_resource_event(log_manager: DagsterLogManager, event: "DagsterEvent") -> None:\n event_specific_data = cast(EngineEventData, 
event.event_specific_data)\n\n log_level = logging.ERROR if event_specific_data.error else logging.DEBUG\n log_manager.log_dagster_event(level=log_level, msg=event.message or "", dagster_event=event)\n\n\nclass DagsterEventSerializer(NamedTupleSerializer["DagsterEvent"]):\n def before_unpack(self, context, unpacked_dict: Any) -> Dict[str, Any]:\n event_type_value, event_specific_data = _handle_back_compat(\n unpacked_dict["event_type_value"], unpacked_dict.get("event_specific_data")\n )\n unpacked_dict["event_type_value"] = event_type_value\n unpacked_dict["event_specific_data"] = event_specific_data\n\n return unpacked_dict\n\n def handle_unpack_error(\n self,\n exc: Exception,\n context: UnpackContext,\n storage_dict: Dict[str, Any],\n ) -> "DagsterEvent":\n event_type_value, _ = _handle_back_compat(\n storage_dict["event_type_value"], storage_dict.get("event_specific_data")\n )\n step_key = storage_dict.get("step_key")\n orig_message = storage_dict.get("message")\n new_message = (\n f"Could not deserialize event of type {event_type_value}. This event may have been"\n " written by a newer version of Dagster."\n + (f' Original message: "{orig_message}"' if orig_message else "")\n )\n return DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=storage_dict["pipeline_name"],\n message=new_message,\n step_key=step_key,\n event_specific_data=EngineEventData(\n error=serializable_error_info_from_exc_info(sys.exc_info())\n ),\n )\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterEventSerializer,\n storage_field_names={\n "node_handle": "solid_handle",\n "job_name": "pipeline_name",\n },\n)\nclass DagsterEvent(\n NamedTuple(\n "_DagsterEvent",\n [\n ("event_type_value", str),\n ("job_name", str),\n ("step_handle", Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]]),\n ("node_handle", Optional[NodeHandle]),\n ("step_kind_value", Optional[str]),\n ("logging_tags", Optional[Mapping[str, str]]),\n ("event_specific_data", Optional["EventSpecificData"]),\n ("message", Optional[str]),\n ("pid", Optional[int]),\n ("step_key", Optional[str]),\n ],\n )\n):\n """Events yielded by op and job execution.\n\n Users should not instantiate this class.\n\n Attributes:\n event_type_value (str): Value for a DagsterEventType.\n job_name (str)\n node_handle (NodeHandle)\n step_kind_value (str): Value for a StepKind.\n logging_tags (Dict[str, str])\n event_specific_data (Any): Type must correspond to event_type_value.\n message (str)\n pid (int)\n step_key (Optional[str]): DEPRECATED\n """\n\n @staticmethod\n def from_step(\n event_type: "DagsterEventType",\n step_context: IStepContext,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n message=check.opt_str_param(message, "message"),\n pid=os.getpid(),\n )\n\n log_step_event(step_context, event)\n\n return event\n\n @staticmethod\n def from_job(\n event_type: DagsterEventType,\n job_context: IPlanContext,\n message: Optional[str] = None,\n event_specific_data: Optional["EventSpecificData"] = 
None,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n ) -> "DagsterEvent":\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n )\n\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_context.job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(event_type, event_specific_data),\n step_handle=step_handle,\n pid=os.getpid(),\n )\n\n log_job_event(job_context, event)\n\n return event\n\n @staticmethod\n def from_resource(\n event_type: DagsterEventType,\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n message: Optional[str] = None,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n event = DagsterEvent(\n event_type_value=check.inst_param(event_type, "event_type", DagsterEventType).value,\n job_name=job_name,\n message=check.opt_str_param(message, "message"),\n event_specific_data=_validate_event_specific_data(\n DagsterEventType.ENGINE_EVENT, event_specific_data\n ),\n step_handle=execution_plan.step_handle_for_single_step_plans(),\n pid=os.getpid(),\n )\n log_resource_event(log_manager, event)\n return event\n\n def __new__(\n cls,\n event_type_value: str,\n job_name: str,\n step_handle: Optional[Union[StepHandle, ResolvedFromDynamicStepHandle]] = None,\n node_handle: Optional[NodeHandle] = None,\n step_kind_value: Optional[str] = None,\n logging_tags: Optional[Mapping[str, str]] = None,\n event_specific_data: Optional["EventSpecificData"] = None,\n message: Optional[str] = None,\n pid: Optional[int] = None,\n # legacy\n step_key: Optional[str] = None,\n ):\n # old events may contain node_handle but not step_handle\n if node_handle is not None and step_handle is None:\n step_handle = StepHandle(node_handle)\n\n # Legacy events may have step_key set directly, preserve those to stay 
in sync\n # with legacy execution plan snapshots.\n if step_handle is not None and step_key is None:\n step_key = step_handle.to_key()\n\n return super(DagsterEvent, cls).__new__(\n cls,\n check.str_param(event_type_value, "event_type_value"),\n check.str_param(job_name, "job_name"),\n check.opt_inst_param(\n step_handle, "step_handle", (StepHandle, ResolvedFromDynamicStepHandle)\n ),\n check.opt_inst_param(node_handle, "node_handle", NodeHandle),\n check.opt_str_param(step_kind_value, "step_kind_value"),\n check.opt_mapping_param(logging_tags, "logging_tags"),\n _validate_event_specific_data(DagsterEventType(event_type_value), event_specific_data),\n check.opt_str_param(message, "message"),\n check.opt_int_param(pid, "pid"),\n check.opt_str_param(step_key, "step_key"),\n )\n\n @property\n def node_name(self) -> str:\n check.invariant(self.node_handle is not None)\n node_handle = cast(NodeHandle, self.node_handle)\n return node_handle.name\n\n @public\n @property\n def event_type(self) -> DagsterEventType:\n """DagsterEventType: The type of this event."""\n return DagsterEventType(self.event_type_value)\n\n @public\n @property\n def is_step_event(self) -> bool:\n """bool: If this event relates to a specific step."""\n return self.event_type in STEP_EVENTS\n\n @public\n @property\n def is_hook_event(self) -> bool:\n """bool: If this event relates to the execution of a hook."""\n return self.event_type in HOOK_EVENTS\n\n @property\n def is_alert_event(self) -> bool:\n return self.event_type in ALERT_EVENTS\n\n @property\n def step_kind(self) -> "StepKind":\n from dagster._core.execution.plan.step import StepKind\n\n return StepKind(self.step_kind_value)\n\n @public\n @property\n def is_step_success(self) -> bool:\n """bool: If this event is of type STEP_SUCCESS."""\n return self.event_type == DagsterEventType.STEP_SUCCESS\n\n @public\n @property\n def is_successful_output(self) -> bool:\n """bool: If this event is of type STEP_OUTPUT."""\n return self.event_type == 
DagsterEventType.STEP_OUTPUT\n\n @public\n @property\n def is_step_start(self) -> bool:\n """bool: If this event is of type STEP_START."""\n return self.event_type == DagsterEventType.STEP_START\n\n @public\n @property\n def is_step_failure(self) -> bool:\n """bool: If this event is of type STEP_FAILURE."""\n return self.event_type == DagsterEventType.STEP_FAILURE\n\n @public\n @property\n def is_resource_init_failure(self) -> bool:\n """bool: If this event is of type RESOURCE_INIT_FAILURE."""\n return self.event_type == DagsterEventType.RESOURCE_INIT_FAILURE\n\n @public\n @property\n def is_step_skipped(self) -> bool:\n """bool: If this event is of type STEP_SKIPPED."""\n return self.event_type == DagsterEventType.STEP_SKIPPED\n\n @public\n @property\n def is_step_up_for_retry(self) -> bool:\n """bool: If this event is of type STEP_UP_FOR_RETRY."""\n return self.event_type == DagsterEventType.STEP_UP_FOR_RETRY\n\n @public\n @property\n def is_step_restarted(self) -> bool:\n """bool: If this event is of type STEP_RESTARTED."""\n return self.event_type == DagsterEventType.STEP_RESTARTED\n\n @property\n def is_job_success(self) -> bool:\n return self.event_type == DagsterEventType.RUN_SUCCESS\n\n @property\n def is_job_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @property\n def is_run_failure(self) -> bool:\n return self.event_type == DagsterEventType.RUN_FAILURE\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this event represents the failure of a run or step."""\n return self.event_type in FAILURE_EVENTS\n\n @property\n def is_job_event(self) -> bool:\n return self.event_type in PIPELINE_EVENTS\n\n @public\n @property\n def is_engine_event(self) -> bool:\n """bool: If this event is of type ENGINE_EVENT."""\n return self.event_type == DagsterEventType.ENGINE_EVENT\n\n @public\n @property\n def is_handled_output(self) -> bool:\n """bool: If this event is of type HANDLED_OUTPUT."""\n return self.event_type 
== DagsterEventType.HANDLED_OUTPUT\n\n @public\n @property\n def is_loaded_input(self) -> bool:\n """bool: If this event is of type LOADED_INPUT."""\n return self.event_type == DagsterEventType.LOADED_INPUT\n\n @public\n @property\n def is_step_materialization(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION\n\n @public\n @property\n def is_expectation_result(self) -> bool:\n """bool: If this event is of type STEP_EXPECTATION_RESULT."""\n return self.event_type == DagsterEventType.STEP_EXPECTATION_RESULT\n\n @public\n @property\n def is_asset_observation(self) -> bool:\n """bool: If this event is of type ASSET_OBSERVATION."""\n return self.event_type == DagsterEventType.ASSET_OBSERVATION\n\n @public\n @property\n def is_asset_materialization_planned(self) -> bool:\n """bool: If this event is of type ASSET_MATERIALIZATION_PLANNED."""\n return self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED\n\n @public\n @property\n def asset_key(self) -> Optional[AssetKey]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n asset key. Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.asset_key\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.asset_key\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.asset_key\n else:\n return None\n\n @public\n @property\n def partition(self) -> Optional[str]:\n """Optional[AssetKey]: For events that correspond to a specific asset_key / partition\n (ASSET_MATERIALIZTION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\n partition. 
Otherwise, returns None.\n """\n if self.event_type == DagsterEventType.ASSET_MATERIALIZATION:\n return self.step_materialization_data.materialization.partition\n elif self.event_type == DagsterEventType.ASSET_OBSERVATION:\n return self.asset_observation_data.asset_observation.partition\n elif self.event_type == DagsterEventType.ASSET_MATERIALIZATION_PLANNED:\n return self.asset_materialization_planned_data.partition\n else:\n return None\n\n @property\n def step_input_data(self) -> "StepInputData":\n _assert_type("step_input_data", DagsterEventType.STEP_INPUT, self.event_type)\n return cast(StepInputData, self.event_specific_data)\n\n @property\n def step_output_data(self) -> StepOutputData:\n _assert_type("step_output_data", DagsterEventType.STEP_OUTPUT, self.event_type)\n return cast(StepOutputData, self.event_specific_data)\n\n @property\n def step_success_data(self) -> "StepSuccessData":\n _assert_type("step_success_data", DagsterEventType.STEP_SUCCESS, self.event_type)\n return cast(StepSuccessData, self.event_specific_data)\n\n @property\n def step_failure_data(self) -> "StepFailureData":\n _assert_type("step_failure_data", DagsterEventType.STEP_FAILURE, self.event_type)\n return cast(StepFailureData, self.event_specific_data)\n\n @property\n def step_retry_data(self) -> "StepRetryData":\n _assert_type("step_retry_data", DagsterEventType.STEP_UP_FOR_RETRY, self.event_type)\n return cast(StepRetryData, self.event_specific_data)\n\n @property\n def step_materialization_data(self) -> "StepMaterializationData":\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data)\n\n @property\n def asset_observation_data(self) -> "AssetObservationData":\n _assert_type("asset_observation_data", DagsterEventType.ASSET_OBSERVATION, self.event_type)\n return cast(AssetObservationData, self.event_specific_data)\n\n @property\n def 
asset_materialization_planned_data(self) -> "AssetMaterializationPlannedData":\n _assert_type(\n "asset_materialization_planned",\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n self.event_type,\n )\n return cast(AssetMaterializationPlannedData, self.event_specific_data)\n\n @property\n def asset_check_planned_data(self) -> "AssetCheckEvaluationPlanned":\n _assert_type(\n "asset_check_planned",\n DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED,\n self.event_type,\n )\n return cast(AssetCheckEvaluationPlanned, self.event_specific_data)\n\n @property\n def step_expectation_result_data(self) -> "StepExpectationResultData":\n _assert_type(\n "step_expectation_result_data",\n DagsterEventType.STEP_EXPECTATION_RESULT,\n self.event_type,\n )\n return cast(StepExpectationResultData, self.event_specific_data)\n\n @property\n def materialization(self) -> AssetMaterialization:\n _assert_type(\n "step_materialization_data", DagsterEventType.ASSET_MATERIALIZATION, self.event_type\n )\n return cast(StepMaterializationData, self.event_specific_data).materialization\n\n @property\n def asset_check_evaluation_data(self) -> AssetCheckEvaluation:\n _assert_type(\n "asset_check_evaluation", DagsterEventType.ASSET_CHECK_EVALUATION, self.event_type\n )\n return cast(AssetCheckEvaluation, self.event_specific_data)\n\n @property\n def job_failure_data(self) -> "JobFailureData":\n _assert_type("job_failure_data", DagsterEventType.RUN_FAILURE, self.event_type)\n return cast(JobFailureData, self.event_specific_data)\n\n @property\n def engine_event_data(self) -> "EngineEventData":\n _assert_type(\n "engine_event_data",\n [\n DagsterEventType.ENGINE_EVENT,\n DagsterEventType.RESOURCE_INIT_STARTED,\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n DagsterEventType.RESOURCE_INIT_FAILURE,\n DagsterEventType.STEP_WORKER_STARTED,\n DagsterEventType.STEP_WORKER_STARTING,\n ],\n self.event_type,\n )\n return cast(EngineEventData, self.event_specific_data)\n\n @property\n def hook_completed_data(self) 
-> Optional["EventSpecificData"]:\n _assert_type("hook_completed_data", DagsterEventType.HOOK_COMPLETED, self.event_type)\n return self.event_specific_data\n\n @property\n def hook_errored_data(self) -> "HookErroredData":\n _assert_type("hook_errored_data", DagsterEventType.HOOK_ERRORED, self.event_type)\n return cast(HookErroredData, self.event_specific_data)\n\n @property\n def hook_skipped_data(self) -> Optional["EventSpecificData"]:\n _assert_type("hook_skipped_data", DagsterEventType.HOOK_SKIPPED, self.event_type)\n return self.event_specific_data\n\n @property\n def logs_captured_data(self) -> "ComputeLogsCaptureData":\n _assert_type("logs_captured_data", DagsterEventType.LOGS_CAPTURED, self.event_type)\n return cast(ComputeLogsCaptureData, self.event_specific_data)\n\n @staticmethod\n def step_output_event(\n step_context: StepExecutionContext, step_output_data: StepOutputData\n ) -> "DagsterEvent":\n output_def = step_context.op.output_def_named(\n step_output_data.step_output_handle.output_name\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_OUTPUT,\n step_context=step_context,\n event_specific_data=step_output_data,\n message=(\n 'Yielded output "{output_name}"{mapping_clause} of type'\n ' "{output_type}".{type_check_clause}'.format(\n output_name=step_output_data.step_output_handle.output_name,\n output_type=output_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_output_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_output_data.type_check_data\n else " (No type check)."\n ),\n mapping_clause=(\n f' mapping key "{step_output_data.step_output_handle.mapping_key}"'\n if step_output_data.step_output_handle.mapping_key\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_failure_event(\n step_context: IStepContext,\n step_failure_data: "StepFailureData",\n message=None,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_FAILURE,\n step_context=step_context,\n event_specific_data=step_failure_data,\n message=(message or f'Execution of step "{step_context.step.key}" failed.'),\n )\n\n @staticmethod\n def step_retry_event(\n step_context: IStepContext, step_retry_data: "StepRetryData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_UP_FOR_RETRY,\n step_context=step_context,\n event_specific_data=step_retry_data,\n message=(\n 'Execution of step "{step_key}" failed and has requested a retry{wait_str}.'.format(\n step_key=step_context.step.key,\n wait_str=(\n f" in {step_retry_data.seconds_to_wait} seconds"\n if step_retry_data.seconds_to_wait\n else ""\n ),\n )\n ),\n )\n\n @staticmethod\n def step_input_event(\n step_context: StepExecutionContext, step_input_data: "StepInputData"\n ) -> "DagsterEvent":\n input_def = step_context.op_def.input_def_named(step_input_data.input_name)\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_INPUT,\n step_context=step_context,\n event_specific_data=step_input_data,\n message='Got input "{input_name}" of type "{input_type}".{type_check_clause}'.format(\n input_name=step_input_data.input_name,\n input_type=input_def.dagster_type.display_name,\n type_check_clause=(\n (\n " Warning! 
Type check failed."\n if not step_input_data.type_check_data.success\n else " (Type check passed)."\n )\n if step_input_data.type_check_data\n else " (No type check)."\n ),\n ),\n )\n\n @staticmethod\n def step_start_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_START,\n step_context=step_context,\n message=f'Started execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def step_restarted_event(step_context: IStepContext, previous_attempts: int) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_RESTARTED,\n step_context=step_context,\n message='Started re-execution (attempt # {n}) of step "{step_key}".'.format(\n step_key=step_context.step.key, n=previous_attempts + 1\n ),\n )\n\n @staticmethod\n def step_success_event(\n step_context: IStepContext, success: "StepSuccessData"\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SUCCESS,\n step_context=step_context,\n event_specific_data=success,\n message='Finished execution of step "{step_key}" in {duration}.'.format(\n step_key=step_context.step.key,\n duration=format_duration(success.duration_ms),\n ),\n )\n\n @staticmethod\n def step_skipped_event(step_context: IStepContext) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_SKIPPED,\n step_context=step_context,\n message=f'Skipped execution of step "{step_context.step.key}".',\n )\n\n @staticmethod\n def asset_materialization(\n step_context: IStepContext,\n materialization: AssetMaterialization,\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n step_context=step_context,\n event_specific_data=StepMaterializationData(materialization),\n message=(\n materialization.description\n if materialization.description\n else "Materialized value{label_clause}.".format(\n label_clause=f" {materialization.label}" 
if materialization.label else ""\n )\n ),\n )\n\n @staticmethod\n def asset_observation(\n step_context: IStepContext, observation: AssetObservation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n step_context=step_context,\n event_specific_data=AssetObservationData(observation),\n )\n\n @staticmethod\n def asset_check_evaluation(\n step_context: IStepContext, asset_check_evaluation: AssetCheckEvaluation\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n event_type=DagsterEventType.ASSET_CHECK_EVALUATION,\n step_context=step_context,\n event_specific_data=asset_check_evaluation,\n )\n\n @staticmethod\n def step_expectation_result(\n step_context: IStepContext, expectation_result: ExpectationResult\n ) -> "DagsterEvent":\n def _msg():\n if expectation_result.description:\n return expectation_result.description\n\n return "Expectation{label_clause} {result_verb}".format(\n label_clause=" " + expectation_result.label if expectation_result.label else "",\n result_verb="passed" if expectation_result.success else "failed",\n )\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.STEP_EXPECTATION_RESULT,\n step_context=step_context,\n event_specific_data=StepExpectationResultData(expectation_result),\n message=_msg(),\n )\n\n @staticmethod\n def job_start(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_START,\n job_context,\n message=f'Started execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_success(job_context: IPlanContext) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_SUCCESS,\n job_context,\n message=f'Finished execution of run for "{job_context.job_name}".',\n )\n\n @staticmethod\n def job_failure(\n job_context_or_name: Union[IPlanContext, str],\n context_msg: str,\n error_info: Optional[SerializableErrorInfo] = None,\n ) -> "DagsterEvent":\n check.str_param(context_msg, 
"context_msg")\n if isinstance(job_context_or_name, IPlanContext):\n return DagsterEvent.from_job(\n DagsterEventType.RUN_FAILURE,\n job_context_or_name,\n message=(\n f'Execution of run for "{job_context_or_name.job_name}" failed. {context_msg}'\n ),\n event_specific_data=JobFailureData(error_info),\n )\n else:\n # when the failure happens trying to bring up context, the job_context hasn't been\n # built and so can't use from_pipeline\n check.str_param(job_context_or_name, "pipeline_name")\n event = DagsterEvent(\n event_type_value=DagsterEventType.RUN_FAILURE.value,\n job_name=job_context_or_name,\n event_specific_data=JobFailureData(error_info),\n message=f'Execution of run for "{job_context_or_name}" failed. {context_msg}',\n pid=os.getpid(),\n )\n return event\n\n @staticmethod\n def job_canceled(\n job_context: IPlanContext, error_info: Optional[SerializableErrorInfo] = None\n ) -> "DagsterEvent":\n return DagsterEvent.from_job(\n DagsterEventType.RUN_CANCELED,\n job_context,\n message=f'Execution of run for "{job_context.job_name}" canceled.',\n event_specific_data=JobCanceledData(\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo)\n ),\n )\n\n @staticmethod\n def step_worker_starting(\n step_context: IStepContext,\n message: str,\n metadata: Mapping[str, MetadataValue],\n ) -> "DagsterEvent":\n return DagsterEvent.from_step(\n DagsterEventType.STEP_WORKER_STARTING,\n step_context,\n message=message,\n event_specific_data=EngineEventData(\n metadata=metadata, marker_start="step_process_start"\n ),\n )\n\n @staticmethod\n def step_worker_started(\n log_manager: DagsterLogManager,\n job_name: str,\n message: str,\n metadata: Mapping[str, MetadataValue],\n step_key: Optional[str],\n ) -> "DagsterEvent":\n event = DagsterEvent(\n DagsterEventType.STEP_WORKER_STARTED.value,\n job_name=job_name,\n message=message,\n event_specific_data=EngineEventData(metadata=metadata, marker_end="step_process_start"),\n pid=os.getpid(),\n 
step_key=step_key,\n )\n log_manager.log_dagster_event(\n level=logging.DEBUG,\n msg=message,\n dagster_event=event,\n )\n return event\n\n @staticmethod\n def resource_init_start(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_STARTED,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Starting initialization of resources [{}].".format(\n ", ".join(sorted(resource_keys))\n ),\n event_specific_data=EngineEventData(metadata={}, marker_start="resources"),\n )\n\n @staticmethod\n def resource_init_success(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_instances: Mapping[str, Any],\n resource_init_times: Mapping[str, str],\n ) -> "DagsterEvent":\n metadata = {}\n for key in resource_instances.keys():\n metadata[key] = MetadataValue.python_artifact(resource_instances[key].__class__)\n metadata[f"{key}:init_time"] = resource_init_times[key]\n\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_SUCCESS,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Finished initialization of resources [{}].".format(\n ", ".join(sorted(resource_init_times.keys()))\n ),\n event_specific_data=EngineEventData(\n metadata=metadata,\n marker_end="resources",\n ),\n )\n\n @staticmethod\n def resource_init_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.RESOURCE_INIT_FAILURE,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Initialization of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n 
metadata={},\n marker_end="resources",\n error=error,\n ),\n )\n\n @staticmethod\n def resource_teardown_failure(\n job_name: str,\n execution_plan: "ExecutionPlan",\n log_manager: DagsterLogManager,\n resource_keys: AbstractSet[str],\n error: SerializableErrorInfo,\n ) -> "DagsterEvent":\n return DagsterEvent.from_resource(\n DagsterEventType.ENGINE_EVENT,\n job_name=job_name,\n execution_plan=execution_plan,\n log_manager=log_manager,\n message="Teardown of resources [{}] failed.".format(", ".join(resource_keys)),\n event_specific_data=EngineEventData(\n metadata={},\n marker_start=None,\n marker_end=None,\n error=error,\n ),\n )\n\n @staticmethod\n def engine_event(\n plan_context: IPlanContext,\n message: str,\n event_specific_data: Optional["EngineEventData"] = None,\n ) -> "DagsterEvent":\n if isinstance(plan_context, IStepContext):\n return DagsterEvent.from_step(\n DagsterEventType.ENGINE_EVENT,\n step_context=plan_context,\n event_specific_data=event_specific_data,\n message=message,\n )\n else:\n return DagsterEvent.from_job(\n DagsterEventType.ENGINE_EVENT,\n plan_context,\n message,\n event_specific_data=event_specific_data,\n )\n\n @staticmethod\n def object_store_operation(\n step_context: IStepContext, object_store_operation_result: "ObjectStoreOperation"\n ) -> "DagsterEvent":\n object_store_name = (\n f"{object_store_operation_result.object_store_name} "\n if object_store_operation_result.object_store_name\n else ""\n )\n\n serialization_strategy_modifier = (\n f" using {object_store_operation_result.serialization_strategy_name}"\n if object_store_operation_result.serialization_strategy_name\n else ""\n )\n\n value_name = object_store_operation_result.value_name\n\n if (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.SET_OBJECT\n ):\n message = (\n f"Stored intermediate object for output {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n 
ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.GET_OBJECT\n ):\n message = (\n f"Retrieved intermediate object for input {value_name} in "\n f"{object_store_name}object store{serialization_strategy_modifier}."\n )\n elif (\n ObjectStoreOperationType(object_store_operation_result.op)\n == ObjectStoreOperationType.CP_OBJECT\n ):\n message = (\n "Copied intermediate object for input {value_name} from {key} to {dest_key}"\n ).format(\n value_name=value_name,\n key=object_store_operation_result.key,\n dest_key=object_store_operation_result.dest_key,\n )\n else:\n message = ""\n\n return DagsterEvent.from_step(\n DagsterEventType.OBJECT_STORE_OPERATION,\n step_context,\n event_specific_data=ObjectStoreOperationResultData(\n op=object_store_operation_result.op,\n value_name=value_name,\n address=object_store_operation_result.key,\n metadata={"key": MetadataValue.path(object_store_operation_result.key)},\n version=object_store_operation_result.version,\n mapping_key=object_store_operation_result.mapping_key,\n ),\n message=message,\n )\n\n @staticmethod\n def handled_output(\n step_context: IStepContext,\n output_name: str,\n manager_key: str,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> "DagsterEvent":\n message = f'Handled output "{output_name}" using IO manager "{manager_key}"'\n return DagsterEvent.from_step(\n event_type=DagsterEventType.HANDLED_OUTPUT,\n step_context=step_context,\n event_specific_data=HandledOutputData(\n output_name=output_name,\n manager_key=manager_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def loaded_input(\n step_context: IStepContext,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n message_override: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ) -> 
"DagsterEvent":\n message = f'Loaded input "{input_name}" using input manager "{manager_key}"'\n if upstream_output_name:\n message += f', from output "{upstream_output_name}" of step "{upstream_step_key}"'\n\n return DagsterEvent.from_step(\n event_type=DagsterEventType.LOADED_INPUT,\n step_context=step_context,\n event_specific_data=LoadedInputData(\n input_name=input_name,\n manager_key=manager_key,\n upstream_output_name=upstream_output_name,\n upstream_step_key=upstream_step_key,\n metadata=metadata if metadata else {},\n ),\n message=message_override or message,\n )\n\n @staticmethod\n def hook_completed(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_COMPLETED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Finished the execution of hook "{hook_def.name}" triggered for'\n f' "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def hook_errored(\n step_context: StepExecutionContext, error: HookExecutionError\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_ERRORED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n event_specific_data=_validate_event_specific_data(\n event_type,\n HookErroredData(\n error=serializable_error_info_from_exc_info(error.original_exc_info)\n ),\n ),\n )\n\n step_context.log.log_dagster_event(level=logging.ERROR, msg=str(error), dagster_event=event)\n\n return 
event\n\n @staticmethod\n def hook_skipped(\n step_context: StepExecutionContext, hook_def: HookDefinition\n ) -> "DagsterEvent":\n event_type = DagsterEventType.HOOK_SKIPPED\n\n event = DagsterEvent(\n event_type_value=event_type.value,\n job_name=step_context.job_name,\n step_handle=step_context.step.handle,\n node_handle=step_context.step.node_handle,\n step_kind_value=step_context.step.kind.value,\n logging_tags=step_context.event_tags,\n message=(\n f'Skipped the execution of hook "{hook_def.name}". It did not meet its triggering '\n f'condition during the execution of "{step_context.op.name}".'\n ),\n )\n\n step_context.log.log_dagster_event(\n level=logging.DEBUG, msg=event.message or "", dagster_event=event\n )\n\n return event\n\n @staticmethod\n def legacy_compute_log_step_event(step_context: StepExecutionContext):\n step_key = step_context.step.key\n return DagsterEvent.from_step(\n DagsterEventType.LOGS_CAPTURED,\n step_context,\n message=f"Started capturing logs for step: {step_key}.",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=[step_key],\n file_key=step_key,\n ),\n )\n\n @staticmethod\n def capture_logs(\n job_context: IPlanContext,\n step_keys: Sequence[str],\n log_key: Sequence[str],\n log_context: CapturedLogContext,\n ):\n file_key = log_key[-1]\n return DagsterEvent.from_job(\n DagsterEventType.LOGS_CAPTURED,\n job_context,\n message=f"Started capturing logs in process (pid: {os.getpid()}).",\n event_specific_data=ComputeLogsCaptureData(\n step_keys=step_keys,\n file_key=file_key,\n external_stdout_url=log_context.external_stdout_url,\n external_stderr_url=log_context.external_stderr_url,\n external_url=log_context.external_url,\n ),\n )
\n\n\ndef get_step_output_event(\n events: Sequence[DagsterEvent], step_key: str, output_name: Optional[str] = "result"\n) -> Optional["DagsterEvent"]:\n check.sequence_param(events, "events", of_type=DagsterEvent)\n check.str_param(step_key, "step_key")\n check.str_param(output_name, "output_name")\n for event in events:\n if (\n event.event_type == DagsterEventType.STEP_OUTPUT\n and event.step_key == step_key\n and event.step_output_data.output_name == output_name\n ):\n return event\n return None\n\n\n@whitelist_for_serdes\nclass AssetObservationData(\n NamedTuple("_AssetObservation", [("asset_observation", AssetObservation)])\n):\n def __new__(cls, asset_observation: AssetObservation):\n return super(AssetObservationData, cls).__new__(\n cls,\n asset_observation=check.inst_param(\n asset_observation, "asset_observation", AssetObservation\n ),\n )\n\n\n@whitelist_for_serdes\nclass StepMaterializationData(\n NamedTuple(\n "_StepMaterializationData",\n [\n ("materialization", AssetMaterialization),\n ("asset_lineage", Sequence[AssetLineageInfo]),\n ],\n )\n):\n def __new__(\n cls,\n materialization: AssetMaterialization,\n asset_lineage: Optional[Sequence[AssetLineageInfo]] = None,\n ):\n return super(StepMaterializationData, cls).__new__(\n cls,\n materialization=check.inst_param(\n materialization, "materialization", AssetMaterialization\n ),\n asset_lineage=check.opt_sequence_param(\n asset_lineage, "asset_lineage", of_type=AssetLineageInfo\n ),\n )\n\n\n@whitelist_for_serdes\nclass AssetMaterializationPlannedData(\n NamedTuple(\n "_AssetMaterializationPlannedData",\n [("asset_key", AssetKey), ("partition", Optional[str])],\n )\n):\n def __new__(cls, asset_key: AssetKey, partition: Optional[str] = None):\n return super(AssetMaterializationPlannedData, cls).__new__(\n cls,\n asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n partition=check.opt_str_param(partition, "partition"),\n )\n\n\n@whitelist_for_serdes\nclass StepExpectationResultData(\n 
NamedTuple(\n "_StepExpectationResultData",\n [\n ("expectation_result", ExpectationResult),\n ],\n )\n):\n def __new__(cls, expectation_result: ExpectationResult):\n return super(StepExpectationResultData, cls).__new__(\n cls,\n expectation_result=check.inst_param(\n expectation_result, "expectation_result", ExpectationResult\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass ObjectStoreOperationResultData(\n NamedTuple(\n "_ObjectStoreOperationResultData",\n [\n ("op", ObjectStoreOperationType),\n ("value_name", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ("address", Optional[str]),\n ("version", Optional[str]),\n ("mapping_key", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n op: ObjectStoreOperationType,\n value_name: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n address: Optional[str] = None,\n version: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ):\n return super(ObjectStoreOperationResultData, cls).__new__(\n cls,\n op=cast(ObjectStoreOperationType, check.str_param(op, "op")),\n value_name=check.opt_str_param(value_name, "value_name"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n address=check.opt_str_param(address, "address"),\n version=check.opt_str_param(version, "version"),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass EngineEventData(\n NamedTuple(\n "_EngineEventData",\n [\n ("metadata", Mapping[str, MetadataValue]),\n ("error", Optional[SerializableErrorInfo]),\n ("marker_start", Optional[str]),\n ("marker_end", Optional[str]),\n ],\n )\n):\n # serdes log\n # * added optional error\n # * added marker_start / marker_end\n #\n def __new__(\n 
cls,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n error: Optional[SerializableErrorInfo] = None,\n marker_start: Optional[str] = None,\n marker_end: Optional[str] = None,\n ):\n return super(EngineEventData, cls).__new__(\n cls,\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n error=check.opt_inst_param(error, "error", SerializableErrorInfo),\n marker_start=check.opt_str_param(marker_start, "marker_start"),\n marker_end=check.opt_str_param(marker_end, "marker_end"),\n )\n\n @staticmethod\n def in_process(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def multiprocess(\n pid: int, step_keys_to_execute: Optional[Sequence[str]] = None\n ) -> "EngineEventData":\n return EngineEventData(\n metadata={\n "pid": MetadataValue.text(str(pid)),\n **(\n {"step_keys": MetadataValue.text(str(step_keys_to_execute))}\n if step_keys_to_execute\n else {}\n ),\n }\n )\n\n @staticmethod\n def interrupted(steps_interrupted: Sequence[str]) -> "EngineEventData":\n return EngineEventData(\n metadata={"steps_interrupted": MetadataValue.text(str(steps_interrupted))}\n )\n\n @staticmethod\n def engine_error(error: SerializableErrorInfo) -> "EngineEventData":\n return EngineEventData(metadata={}, error=error)\n\n\n@whitelist_for_serdes(storage_name="PipelineFailureData")\nclass JobFailureData(\n NamedTuple(\n "_JobFailureData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobFailureData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(storage_name="PipelineCanceledData")\nclass JobCanceledData(\n 
NamedTuple(\n "_JobCanceledData",\n [\n ("error", Optional[SerializableErrorInfo]),\n ],\n )\n):\n def __new__(cls, error: Optional[SerializableErrorInfo]):\n return super(JobCanceledData, cls).__new__(\n cls, error=check.opt_inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes\nclass HookErroredData(\n NamedTuple(\n "_HookErroredData",\n [\n ("error", SerializableErrorInfo),\n ],\n )\n):\n def __new__(cls, error: SerializableErrorInfo):\n return super(HookErroredData, cls).__new__(\n cls, error=check.inst_param(error, "error", SerializableErrorInfo)\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass HandledOutputData(\n NamedTuple(\n "_HandledOutputData",\n [\n ("output_name", str),\n ("manager_key", str),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n output_name: str,\n manager_key: str,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(HandledOutputData, cls).__new__(\n cls,\n output_name=check.str_param(output_name, "output_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(\n storage_field_names={"metadata": "metadata_entries"},\n field_serializers={"metadata": MetadataFieldSerializer},\n)\nclass LoadedInputData(\n NamedTuple(\n "_LoadedInputData",\n [\n ("input_name", str),\n ("manager_key", str),\n ("upstream_output_name", Optional[str]),\n ("upstream_step_key", Optional[str]),\n ("metadata", Mapping[str, MetadataValue]),\n ],\n )\n):\n def __new__(\n cls,\n input_name: str,\n manager_key: str,\n upstream_output_name: Optional[str] = None,\n upstream_step_key: Optional[str] = None,\n metadata: Optional[Mapping[str, MetadataValue]] = None,\n ):\n return super(LoadedInputData, cls).__new__(\n cls,\n 
input_name=check.str_param(input_name, "input_name"),\n manager_key=check.str_param(manager_key, "manager_key"),\n upstream_output_name=check.opt_str_param(upstream_output_name, "upstream_output_name"),\n upstream_step_key=check.opt_str_param(upstream_step_key, "upstream_step_key"),\n metadata=normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str)\n ),\n )\n\n\n@whitelist_for_serdes(storage_field_names={"file_key": "log_key"})\nclass ComputeLogsCaptureData(\n NamedTuple(\n "_ComputeLogsCaptureData",\n [\n ("file_key", str), # renamed log_key => file_key to avoid confusion\n ("step_keys", Sequence[str]),\n ("external_url", Optional[str]),\n ("external_stdout_url", Optional[str]),\n ("external_stderr_url", Optional[str]),\n ],\n )\n):\n def __new__(\n cls,\n file_key: str,\n step_keys: Sequence[str],\n external_url: Optional[str] = None,\n external_stdout_url: Optional[str] = None,\n external_stderr_url: Optional[str] = None,\n ):\n return super(ComputeLogsCaptureData, cls).__new__(\n cls,\n file_key=check.str_param(file_key, "file_key"),\n step_keys=check.opt_list_param(step_keys, "step_keys", of_type=str),\n external_url=check.opt_str_param(external_url, "external_url"),\n external_stdout_url=check.opt_str_param(external_stdout_url, "external_stdout_url"),\n external_stderr_url=check.opt_str_param(external_stderr_url, "external_stderr_url"),\n )\n\n\n###################################################################################################\n# THE GRAVEYARD\n#\n# -|- -|- -|-\n# | | |\n# _-'~~~~~`-_ . _-'~~~~~`-_ _-'~~~~~`-_\n# .' '. .' '. .' 
'.\n# | R I P | | R I P | | R I P |\n# | | | | | |\n# | Synthetic | | Asset | | Pipeline |\n# | Process | | Store | | Init |\n# | Events | | Operations | | Failures |\n# | | | | | |\n###################################################################################################\n\n\n# Old data structures referenced below\n# class AssetStoreOperationData(NamedTuple):\n# op: str\n# step_key: str\n# output_name: str\n# asset_store_key: str\n#\n#\n# class AssetStoreOperationType(Enum):\n# SET_ASSET = "SET_ASSET"\n# GET_ASSET = "GET_ASSET"\n#\n#\n# class PipelineInitFailureData(NamedTuple):\n# error: SerializableErrorInfo\n\n\ndef _handle_back_compat(\n event_type_value: str,\n event_specific_data: Optional[Dict[str, Any]],\n) -> Tuple[str, Optional[Dict[str, Any]]]:\n # transform old specific process events in to engine events\n if event_type_value in [\n "PIPELINE_PROCESS_START",\n "PIPELINE_PROCESS_STARTED",\n "PIPELINE_PROCESS_EXITED",\n ]:\n return "ENGINE_EVENT", {"__class__": "EngineEventData"}\n\n # changes asset store ops in to get/set asset\n elif event_type_value == "ASSET_STORE_OPERATION":\n assert (\n event_specific_data is not None\n ), "ASSET_STORE_OPERATION event must have specific data"\n if event_specific_data["op"] in (\n "GET_ASSET",\n '{"__enum__": "AssetStoreOperationType.GET_ASSET"}',\n ):\n return (\n "LOADED_INPUT",\n {\n "__class__": "LoadedInputData",\n "input_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n if event_specific_data["op"] in (\n "SET_ASSET",\n '{"__enum__": "AssetStoreOperationType.SET_ASSET"}',\n ):\n return (\n "HANDLED_OUTPUT",\n {\n "__class__": "HandledOutputData",\n "output_name": event_specific_data["output_name"],\n "manager_key": event_specific_data["asset_store_key"],\n },\n )\n\n # previous name for ASSET_MATERIALIZATION was STEP_MATERIALIZATION\n if event_type_value == "STEP_MATERIALIZATION":\n assert event_specific_data is not None, 
"STEP_MATERIALIZATION event must have specific data"\n return "ASSET_MATERIALIZATION", event_specific_data\n\n # transform PIPELINE_INIT_FAILURE to PIPELINE_FAILURE\n if event_type_value == "PIPELINE_INIT_FAILURE":\n assert (\n event_specific_data is not None\n ), "PIPELINE_INIT_FAILURE event must have specific data"\n return "PIPELINE_FAILURE", {\n "__class__": "PipelineFailureData",\n "error": event_specific_data.get("error"),\n }\n\n return event_type_value, event_specific_data\n
", "current_page_name": "_modules/dagster/_core/events", "customsidebar": null, "favicon_url": null, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.events.log

\nfrom typing import Mapping, NamedTuple, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.events import AssetMaterialization, AssetObservation\nfrom dagster._core.events import DagsterEvent, DagsterEventType\nfrom dagster._core.utils import coerce_valid_log_level\nfrom dagster._serdes.serdes import (\n    deserialize_value,\n    serialize_value,\n    whitelist_for_serdes,\n)\nfrom dagster._utils.error import SerializableErrorInfo\nfrom dagster._utils.log import (\n    JsonEventLoggerHandler,\n    StructuredLoggerHandler,\n    StructuredLoggerMessage,\n    construct_single_handler_logger,\n)\n\n\n
[docs]@whitelist_for_serdes(\n # These were originally distinguished from each other but ended up being empty subclasses\n # of EventLogEntry -- instead of using the subclasses we were relying on\n # EventLogEntry.is_dagster_event to distinguish events that originate in the logging\n # machinery from events that are yielded by user code\n old_storage_names={"DagsterEventRecord", "LogMessageRecord", "EventRecord"},\n old_fields={"message": ""},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass EventLogEntry(\n NamedTuple(\n "_EventLogEntry",\n [\n ("error_info", PublicAttr[Optional[SerializableErrorInfo]]),\n ("level", PublicAttr[Union[str, int]]),\n ("user_message", PublicAttr[str]),\n ("run_id", PublicAttr[str]),\n ("timestamp", PublicAttr[float]),\n ("step_key", PublicAttr[Optional[str]]),\n ("job_name", PublicAttr[Optional[str]]),\n ("dagster_event", PublicAttr[Optional[DagsterEvent]]),\n ],\n )\n):\n """Entries in the event log.\n\n Users should not instantiate this object directly. These entries may originate from the logging machinery (DagsterLogManager/context.log), from\n framework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n (e.g. Output).\n\n Args:\n error_info (Optional[SerializableErrorInfo]): Error info for an associated exception, if\n any, as generated by serializable_error_info_from_exc_info and friends.\n level (Union[str, int]): The Python log level at which to log this event. Note that\n framework and user code events are also logged to Python logging. This value may be an\n integer or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.\n user_message (str): For log messages, this is the user-generated message.\n run_id (str): The id of the run which generated this event.\n timestamp (float): The Unix timestamp of this event.\n step_key (Optional[str]): The step key for the step which generated this event. 
Some events\n are generated outside of a step context.\n job_name (Optional[str]): The job which generated this event. Some events are\n generated outside of a job context.\n dagster_event (Optional[DagsterEvent]): For framework and user events, the associated\n structured event.\n """\n\n def __new__(\n cls,\n error_info,\n level,\n user_message,\n run_id,\n timestamp,\n step_key=None,\n job_name=None,\n dagster_event=None,\n ):\n return super(EventLogEntry, cls).__new__(\n cls,\n check.opt_inst_param(error_info, "error_info", SerializableErrorInfo),\n coerce_valid_log_level(level),\n check.str_param(user_message, "user_message"),\n check.str_param(run_id, "run_id"),\n check.float_param(timestamp, "timestamp"),\n check.opt_str_param(step_key, "step_key"),\n check.opt_str_param(job_name, "job_name"),\n check.opt_inst_param(dagster_event, "dagster_event", DagsterEvent),\n )\n\n @public\n @property\n def is_dagster_event(self) -> bool:\n """bool: If this entry contains a DagsterEvent."""\n return bool(self.dagster_event)\n\n
[docs] @public\n def get_dagster_event(self) -> DagsterEvent:\n """DagsterEvent: Returns the DagsterEvent contained within this entry. If this entry does not\n contain a DagsterEvent, an error will be raised.\n """\n if not isinstance(self.dagster_event, DagsterEvent):\n check.failed(\n "Not a dagster event, check is_dagster_event before calling get_dagster_event",\n )\n\n return self.dagster_event
\n\n def to_json(self):\n return serialize_value(self)\n\n @staticmethod\n def from_json(json_str: str):\n return deserialize_value(json_str, EventLogEntry)\n\n @public\n @property\n def dagster_event_type(self) -> Optional[DagsterEventType]:\n """Optional[DagsterEventType]: The type of the DagsterEvent contained by this entry, if any."""\n return self.dagster_event.event_type if self.dagster_event else None\n\n @public\n @property\n def message(self) -> str:\n """Return the message from the structured DagsterEvent if present, fallback to user_message."""\n if self.is_dagster_event:\n msg = self.get_dagster_event().message\n if msg is not None:\n return msg\n\n return self.user_message\n\n @property\n def asset_materialization(self) -> Optional[AssetMaterialization]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION\n ):\n materialization = self.dagster_event.step_materialization_data.materialization\n if isinstance(materialization, AssetMaterialization):\n return materialization\n\n return None\n\n @property\n def asset_observation(self) -> Optional[AssetObservation]:\n if (\n self.dagster_event\n and self.dagster_event.event_type_value == DagsterEventType.ASSET_OBSERVATION\n ):\n observation = self.dagster_event.asset_observation_data.asset_observation\n if isinstance(observation, AssetObservation):\n return observation\n\n return None\n\n @property\n def tags(self) -> Optional[Mapping[str, str]]:\n materialization = self.asset_materialization\n if materialization:\n return materialization.tags\n\n observation = self.asset_observation\n if observation:\n return observation.tags\n\n return None
\n\n\ndef construct_event_record(logger_message: StructuredLoggerMessage) -> EventLogEntry:\n check.inst_param(logger_message, "logger_message", StructuredLoggerMessage)\n\n return EventLogEntry(\n level=logger_message.level,\n user_message=logger_message.meta["orig_message"],\n run_id=logger_message.meta["run_id"],\n timestamp=logger_message.record.created,\n step_key=logger_message.meta.get("step_key"),\n job_name=logger_message.meta.get("job_name"),\n dagster_event=logger_message.meta.get("dagster_event"),\n error_info=None,\n )\n\n\ndef construct_event_logger(event_record_callback):\n """Callback receives a stream of event_records. Piggybacks on the logging machinery."""\n check.callable_param(event_record_callback, "event_record_callback")\n\n return construct_single_handler_logger(\n "event-logger",\n "debug",\n StructuredLoggerHandler(\n lambda logger_message: event_record_callback(construct_event_record(logger_message))\n ),\n )\n\n\ndef construct_json_event_logger(json_path):\n """Record a stream of event records to json."""\n check.str_param(json_path, "json_path")\n return construct_single_handler_logger(\n "json-event-record-logger",\n "debug",\n JsonEventLoggerHandler(\n json_path,\n lambda record: construct_event_record(\n StructuredLoggerMessage(\n name=record.name,\n message=record.msg,\n level=record.levelno,\n meta=record.dagster_meta,\n record=record,\n )\n ),\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/events/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.events"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events.log"}, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.events"}, "execution": {"api": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.api

\nimport sys\nfrom contextlib import contextmanager\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions import IJob, JobDefinition\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.repository_definition import RepositoryLoadData\nfrom dagster._core.errors import DagsterExecutionInterruptedError, DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.execute_plan import inner_plan_execution_iterator\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance, InstanceRef\nfrom dagster._core.selector import parse_step_selection\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.telemetry import log_dagster_event, log_repo_stats, telemetry_wrapper\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.interrupts import capture_interrupts\nfrom dagster._utils.merger import merge_dicts\n\nfrom .context_creation_job import (\n    ExecutionContextManager,\n    PlanExecutionContextManager,\n    PlanOrchestrationContextManager,\n    orchestration_context_event_generator,\n    scoped_job_context,\n)\nfrom .job_execution_result import JobExecutionResult\n\nif TYPE_CHECKING:\n    from dagster._core.execution.plan.outputs 
import StepOutputHandle\n\n## Brief guide to the execution APIs\n# | function name               | operates over      | sync  | supports    | creates new DagsterRun  |\n# |                             |                    |       | reexecution | in instance             |\n# | --------------------------- | ------------------ | ----- | ----------- | ----------------------- |\n# | execute_job                 | ReconstructableJob | sync  | yes         | yes                     |\n# | execute_run_iterator        | DagsterRun         | async | (1)         | no                      |\n# | execute_run                 | DagsterRun         | sync  | (1)         | no                      |\n# | execute_plan_iterator       | ExecutionPlan      | async | (2)         | no                      |\n# | execute_plan                | ExecutionPlan      | sync  | (2)         | no                      |\n#\n# Notes on reexecution support:\n# (1) The appropriate bits must be set on the DagsterRun passed to this function. 
Specifically,\n#     parent_run_id and root_run_id must be set and consistent, and if a resolved_op_selection or\n#     step_keys_to_execute are set they must be consistent with the parent and root runs.\n# (2) As for (1), but the ExecutionPlan passed must also agree in all relevant bits.\n\n\ndef execute_run_iterator(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    resume_from_failure: bool = False,\n) -> Iterator[DagsterEvent]:\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        # This can happen if the run was force-terminated while it was starting\n        def gen_execute_on_cancel():\n            yield instance.report_engine_event(\n                "Not starting execution since the run was canceled before execution could start",\n                dagster_run,\n            )\n\n        return gen_execute_on_cancel()\n\n    if not resume_from_failure:\n        if dagster_run.status not in (DagsterRunStatus.NOT_STARTED, DagsterRunStatus.STARTING):\n            if dagster_run.is_finished:\n\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a run worker that started after the run had already finished.",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            elif instance.run_monitoring_enabled:\n                # This can happen if the pod was unexpectedly restarted by the cluster - ignore it since\n                # the run monitoring daemon will also spin up a new pod\n                def gen_ignore_duplicate_run_worker():\n                    yield instance.report_engine_event(\n                        "Ignoring a duplicate run that was started from somewhere other than"\n        
                " the run monitor daemon",\n                        dagster_run,\n                    )\n\n                return gen_ignore_duplicate_run_worker()\n            else:\n\n                def gen_fail_restarted_run_worker():\n                    yield instance.report_engine_event(\n                        f"{dagster_run.job_name} ({dagster_run.run_id}) started a new"\n                        f" run worker while the run was already in state {dagster_run.status}."\n                        " This most frequently happens when the run worker unexpectedly stops"\n                        " and is restarted by the cluster. Marking the run as failed.",\n                        dagster_run,\n                    )\n                    yield instance.report_run_failed(dagster_run)\n\n                return gen_fail_restarted_run_worker()\n\n    else:\n        check.invariant(\n            dagster_run.status == DagsterRunStatus.STARTED\n            or dagster_run.status == DagsterRunStatus.STARTING,\n            desc=(\n                "Run of {} ({}) in state {}, expected STARTED or STARTING because it's "\n                "resuming from a run worker failure".format(\n                    dagster_run.job_name, dagster_run.run_id, dagster_run.status\n                )\n            ),\n        )\n\n    if dagster_run.resolved_op_selection or dagster_run.asset_selection:\n        # when `execute_run_iterator` is directly called, the sub pipeline hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if 
isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    return iter(\n        ExecuteRunWithPlanIterable(\n            execution_plan=execution_plan,\n            iterator=job_execution_iterator,\n            execution_context_manager=PlanOrchestrationContextManager(\n                context_event_generator=orchestration_context_event_generator,\n                job=job,\n                execution_plan=execution_plan,\n                dagster_run=dagster_run,\n                instance=instance,\n                run_config=dagster_run.run_config,\n                raise_on_error=False,\n                executor_defs=None,\n                output_capture=None,\n                resume_from_failure=resume_from_failure,\n            ),\n        )\n    )\n\n\ndef execute_run(\n    job: IJob,\n    dagster_run: DagsterRun,\n    instance: DagsterInstance,\n    raise_on_error: bool = False,\n) -> JobExecutionResult:\n    """Executes an existing job run synchronously.\n\n    Synchronous version of execute_run_iterator.\n\n    Args:\n        job (IJob): The pipeline to execute.\n        dagster_run (DagsterRun): The run to execute\n        instance (DagsterInstance): The instance in which the run has been created.\n        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n            Defaults to ``False``.\n\n    Returns:\n        JobExecutionResult: The result of the execution.\n    """\n    if isinstance(job, JobDefinition):\n        raise DagsterInvariantViolationError(\n            "execute_run requires a reconstructable job but received job definition directly"\n            " instead. To support hand-off to other processes please wrap your definition in a call"\n            " to reconstructable(). 
Learn more about reconstructable here:"\n            " https://docs.dagster.io/_apidocs/execution#dagster.reconstructable"\n        )\n\n    check.inst_param(job, "job", IJob)\n    check.inst_param(dagster_run, "dagster_run", DagsterRun)\n    check.inst_param(instance, "instance", DagsterInstance)\n\n    if dagster_run.status == DagsterRunStatus.CANCELED:\n        message = "Not starting execution since the run was canceled before execution could start"\n        instance.report_engine_event(\n            message,\n            dagster_run,\n        )\n        raise DagsterInvariantViolationError(message)\n\n    check.invariant(\n        dagster_run.status == DagsterRunStatus.NOT_STARTED\n        or dagster_run.status == DagsterRunStatus.STARTING,\n        desc="Run {} ({}) in state {}, expected NOT_STARTED or STARTING".format(\n            dagster_run.job_name, dagster_run.run_id, dagster_run.status\n        ),\n    )\n    if dagster_run.resolved_op_selection or dagster_run.asset_selection:\n        # when `execute_run` is directly called, the sub job hasn't been created\n        # note that when we receive the solids to execute via DagsterRun, it won't support\n        # solid selection query syntax\n        job = job.get_subset(\n            op_selection=(\n                list(dagster_run.resolved_op_selection)\n                if dagster_run.resolved_op_selection\n                else None\n            ),\n            asset_selection=dagster_run.asset_selection,\n        )\n\n    execution_plan = _get_execution_plan_from_run(job, dagster_run, instance)\n    if isinstance(job, ReconstructableJob):\n        job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}\n\n    _execute_run_iterable = ExecuteRunWithPlanIterable(\n        execution_plan=execution_plan,\n        iterator=job_execution_iterator,\n        execution_context_manager=PlanOrchestrationContextManager(\n            
context_event_generator=orchestration_context_event_generator,\n            job=job,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            instance=instance,\n            run_config=dagster_run.run_config,\n            raise_on_error=raise_on_error,\n            executor_defs=None,\n            output_capture=output_capture,\n        ),\n    )\n    event_list = list(_execute_run_iterable)\n\n    # We need to reload the run object after execution for it to be accurate\n    reloaded_dagster_run = check.not_none(instance.get_run_by_id(dagster_run.run_id))\n\n    return JobExecutionResult(\n        job.get_definition(),\n        scoped_job_context(\n            execution_plan,\n            job,\n            reloaded_dagster_run.run_config,\n            reloaded_dagster_run,\n            instance,\n        ),\n        event_list,\n        reloaded_dagster_run,\n    )\n\n\n@contextmanager\ndef ephemeral_instance_if_missing(\n    instance: Optional[DagsterInstance],\n) -> Iterator[DagsterInstance]:\n    if instance:\n        yield instance\n    else:\n        with DagsterInstance.ephemeral() as ephemeral_instance:\n            yield ephemeral_instance\n\n\n
[docs]class ReexecutionOptions(NamedTuple):\n """Reexecution options for python-based execution in Dagster.\n\n Args:\n parent_run_id (str): The run_id of the run to reexecute.\n step_selection (Sequence[str]):\n The list of step selections to reexecute. Must be a subset or match of the\n set of steps executed in the original run. For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n """\n\n parent_run_id: str\n step_selection: Sequence[str] = []\n\n @staticmethod\n def from_failure(run_id: str, instance: DagsterInstance) -> "ReexecutionOptions":\n """Creates reexecution options from a failed run.\n\n Args:\n run_id (str): The run_id of the failed run. Run must fail in order to be reexecuted.\n instance (DagsterInstance): The DagsterInstance that the original run occurred in.\n\n Returns:\n ReexecutionOptions: Reexecution options to pass to a python execution.\n """\n from dagster._core.execution.plan.state import KnownExecutionState\n\n parent_run = check.not_none(instance.get_run_by_id(run_id))\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n # Tried to thread through KnownExecutionState to execution plan creation, but little benefit.\n # It is recalculated later by the re-execution machinery.\n step_keys_to_execute, _ = KnownExecutionState.build_resume_retry_reexecution(\n instance, parent_run=cast(DagsterRun, instance.get_run_by_id(run_id))\n )\n return ReexecutionOptions(parent_run_id=run_id, step_selection=step_keys_to_execute)
\n\n\n
[docs]def execute_job(\n job: ReconstructableJob,\n instance: "DagsterInstance",\n run_config: Any = None,\n tags: Optional[Mapping[str, Any]] = None,\n raise_on_error: bool = False,\n op_selection: Optional[Sequence[str]] = None,\n reexecution_options: Optional[ReexecutionOptions] = None,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n """Execute a job synchronously.\n\n This API represents dagster's python entrypoint for out-of-process\n execution. For most testing purposes, :py:meth:`~dagster.JobDefinition.\n execute_in_process` will be more suitable, but when wanting to run\n execution using an out-of-process executor (such as :py:class:`dagster.\n multiprocess_executor`), then `execute_job` is suitable.\n\n `execute_job` expects a persistent :py:class:`DagsterInstance` for\n execution, meaning the `$DAGSTER_HOME` environment variable must be set.\n It also expects a reconstructable pointer to a :py:class:`JobDefinition` so\n that it can be reconstructed in separate processes. This can be done by\n wrapping the ``JobDefinition`` in a call to :py:func:`dagster.\n reconstructable`.\n\n .. code-block:: python\n\n from dagster import DagsterInstance, execute_job, job, reconstructable\n\n @job\n def the_job():\n ...\n\n instance = DagsterInstance.get()\n result = execute_job(reconstructable(the_job), instance=instance)\n assert result.success\n\n\n If using the :py:meth:`~dagster.GraphDefinition.to_job` method to\n construct the ``JobDefinition``, then the invocation must be wrapped in a\n module-scope function, which can be passed to ``reconstructable``.\n\n .. 
code-block:: python\n\n from dagster import graph, reconstructable\n\n @graph\n def the_graph():\n ...\n\n def define_job():\n return the_graph.to_job(...)\n\n result = execute_job(reconstructable(define_job), ...)\n\n Since `execute_job` is potentially executing outside of the current\n process, output objects need to be retrieved by use of the provided job's\n io managers. Output objects can be retrieved by opening the result of\n `execute_job` as a context manager.\n\n .. code-block:: python\n\n from dagster import execute_job\n\n with execute_job(...) as result:\n output_obj = result.output_for_node("some_op")\n\n ``execute_job`` can also be used to reexecute a run, by providing a :py:class:`ReexecutionOptions` object.\n\n .. code-block:: python\n\n from dagster import ReexecutionOptions, execute_job\n\n instance = DagsterInstance.get()\n\n options = ReexecutionOptions.from_failure(run_id=failed_run_id, instance)\n execute_job(reconstructable(job), instance, reexecution_options=options)\n\n Parameters:\n job (ReconstructableJob): A reconstructable pointer to a :py:class:`JobDefinition`.\n instance (DagsterInstance): The instance to execute against.\n run_config (Optional[dict]): The configuration that parametrizes this run, as a dict.\n tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to run logs.\n raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.\n Defaults to ``False``.\n op_selection (Optional[List[str]]): A list of op selection queries (including single\n op names) to execute. 
For example:\n\n - ``['some_op']``: selects ``some_op`` itself.\n - ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).\n - ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants\n (downstream dependencies) within 3 levels down.\n - ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its\n ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.\n reexecution_options (Optional[ReexecutionOptions]):\n Reexecution options to provide to the run, if this run is\n intended to be a reexecution of a previous run. Cannot be used in\n tandem with the ``op_selection`` argument.\n\n Returns:\n :py:class:`JobExecutionResult`: The result of job execution.\n """\n check.inst_param(job, "job", ReconstructableJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.opt_sequence_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # get the repository load data here because we call job.get_definition() later in this fn\n job_def, _ = _job_with_repository_load_data(job)\n\n if reexecution_options is not None and op_selection is not None:\n raise DagsterInvariantViolationError(\n "re-execution and op selection cannot be used together at this time."\n )\n\n if reexecution_options:\n if run_config is None:\n run = check.not_none(instance.get_run_by_id(reexecution_options.parent_run_id))\n run_config = run.run_config\n return _reexecute_job(\n job_arg=job_def,\n parent_run_id=reexecution_options.parent_run_id,\n run_config=run_config,\n step_selection=list(reexecution_options.step_selection),\n tags=tags,\n instance=instance,\n raise_on_error=raise_on_error,\n )\n else:\n return _logged_execute_job(\n job_arg=job_def,\n instance=instance,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n raise_on_error=raise_on_error,\n asset_selection=asset_selection,\n )
\n\n\n@telemetry_wrapper\ndef _logged_execute_job(\n job_arg: Union[IJob, JobDefinition],\n instance: DagsterInstance,\n run_config: Optional[Mapping[str, object]] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n raise_on_error: bool = True,\n asset_selection: Optional[Sequence[AssetKey]] = None,\n) -> JobExecutionResult:\n check.inst_param(instance, "instance", DagsterInstance)\n\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (\n job_arg,\n run_config,\n tags,\n resolved_op_selection,\n op_selection,\n ) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n op_selection=op_selection,\n )\n\n log_repo_stats(instance=instance, job=job_arg, source="execute_pipeline")\n\n dagster_run = instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n tags=tags,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n asset_selection=frozenset(asset_selection) if asset_selection else None,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n instance,\n raise_on_error=raise_on_error,\n )\n\n\ndef _reexecute_job(\n job_arg: Union[IJob, JobDefinition],\n parent_run_id: str,\n run_config: Optional[Mapping[str, object]] = None,\n step_selection: Optional[Sequence[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n instance: Optional[DagsterInstance] = None,\n raise_on_error: bool = True,\n) -> JobExecutionResult:\n """Reexecute an existing job run."""\n check.opt_sequence_param(step_selection, "step_selection", of_type=str)\n\n check.str_param(parent_run_id, "parent_run_id")\n\n with ephemeral_instance_if_missing(instance) as execute_instance:\n job_arg, repository_load_data = _job_with_repository_load_data(job_arg)\n\n (job_arg, run_config, tags, _, 
_) = _check_execute_job_args(\n job_arg=job_arg,\n run_config=run_config,\n tags=tags,\n )\n\n parent_dagster_run = execute_instance.get_run_by_id(parent_run_id)\n if parent_dagster_run is None:\n check.failed(\n f"No parent run with id {parent_run_id} found in instance.",\n )\n\n execution_plan: Optional[ExecutionPlan] = None\n # resolve step selection DSL queries using parent execution information\n if step_selection:\n execution_plan = _resolve_reexecute_step_selection(\n execute_instance,\n job_arg,\n run_config,\n cast(DagsterRun, parent_dagster_run),\n step_selection,\n )\n\n if parent_dagster_run.asset_selection:\n job_arg = job_arg.get_subset(\n op_selection=None, asset_selection=parent_dagster_run.asset_selection\n )\n\n dagster_run = execute_instance.create_run_for_job(\n job_def=job_arg.get_definition(),\n execution_plan=execution_plan,\n run_config=run_config,\n tags=tags,\n op_selection=parent_dagster_run.op_selection,\n asset_selection=parent_dagster_run.asset_selection,\n resolved_op_selection=parent_dagster_run.resolved_op_selection,\n root_run_id=parent_dagster_run.root_run_id or parent_dagster_run.run_id,\n parent_run_id=parent_dagster_run.run_id,\n job_code_origin=(\n job_arg.get_python_origin() if isinstance(job_arg, ReconstructableJob) else None\n ),\n repository_load_data=repository_load_data,\n )\n\n return execute_run(\n job_arg,\n dagster_run,\n execute_instance,\n raise_on_error=raise_on_error,\n )\n check.failed("Should not reach here.")\n\n\ndef execute_plan_iterator(\n execution_plan: ExecutionPlan,\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n retry_mode: Optional[RetryMode] = None,\n run_config: Optional[Mapping[str, object]] = None,\n) -> Iterator[DagsterEvent]:\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.inst_param(instance, "instance", DagsterInstance)\n retry_mode = 
check.opt_inst_param(retry_mode, "retry_mode", RetryMode, RetryMode.DISABLED)\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n if isinstance(job, ReconstructableJob):\n job = job.with_repository_load_data(execution_plan.repository_load_data)\n\n return iter(\n ExecuteRunWithPlanIterable(\n execution_plan=execution_plan,\n iterator=inner_plan_execution_iterator,\n execution_context_manager=PlanExecutionContextManager(\n job=job,\n retry_mode=retry_mode,\n execution_plan=execution_plan,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n ),\n )\n )\n\n\ndef execute_plan(\n execution_plan: ExecutionPlan,\n job: IJob,\n instance: DagsterInstance,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]] = None,\n retry_mode: Optional[RetryMode] = None,\n) -> Sequence[DagsterEvent]:\n """This is the entry point of dagster-graphql executions. For the dagster CLI entry point, see\n execute_job() above.\n """\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.inst_param(job, "job", IJob)\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n run_config = check.opt_mapping_param(run_config, "run_config")\n check.opt_inst_param(retry_mode, "retry_mode", RetryMode)\n\n return list(\n execute_plan_iterator(\n execution_plan=execution_plan,\n job=job,\n run_config=run_config,\n dagster_run=dagster_run,\n instance=instance,\n retry_mode=retry_mode,\n )\n )\n\n\ndef _get_execution_plan_from_run(\n job: IJob,\n dagster_run: DagsterRun,\n instance: DagsterInstance,\n) -> ExecutionPlan:\n execution_plan_snapshot = (\n instance.get_execution_plan_snapshot(dagster_run.execution_plan_snapshot_id)\n if dagster_run.execution_plan_snapshot_id\n else None\n )\n\n # Rebuild from snapshot if able and selection has not changed\n if (\n execution_plan_snapshot is not None\n and execution_plan_snapshot.can_reconstruct_plan\n and 
job.resolved_op_selection == dagster_run.resolved_op_selection\n and job.asset_selection == dagster_run.asset_selection\n ):\n return ExecutionPlan.rebuild_from_snapshot(\n dagster_run.job_name,\n execution_plan_snapshot,\n )\n\n return create_execution_plan(\n job,\n run_config=dagster_run.run_config,\n step_keys_to_execute=dagster_run.step_keys_to_execute,\n instance_ref=instance.get_ref() if instance.is_persistent else None,\n repository_load_data=(\n execution_plan_snapshot.repository_load_data if execution_plan_snapshot else None\n ),\n known_state=(\n execution_plan_snapshot.initial_known_state if execution_plan_snapshot else None\n ),\n )\n\n\ndef create_execution_plan(\n job: Union[IJob, JobDefinition],\n run_config: Optional[Mapping[str, object]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n known_state: Optional[KnownExecutionState] = None,\n instance_ref: Optional[InstanceRef] = None,\n tags: Optional[Mapping[str, str]] = None,\n repository_load_data: Optional[RepositoryLoadData] = None,\n) -> ExecutionPlan:\n if isinstance(job, IJob):\n # If you have repository_load_data, make sure to use it when building plan\n if isinstance(job, ReconstructableJob) and repository_load_data is not None:\n job = job.with_repository_load_data(repository_load_data)\n job_def = job.get_definition()\n else:\n job_def = job\n\n run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n check.opt_inst_param(instance_ref, "instance_ref", InstanceRef)\n tags = check.opt_mapping_param(tags, "tags", key_type=str, value_type=str)\n known_state = check.opt_inst_param(\n known_state,\n "known_state",\n KnownExecutionState,\n default=KnownExecutionState(),\n )\n repository_load_data = check.opt_inst_param(\n repository_load_data, "repository_load_data", RepositoryLoadData\n )\n\n resolved_run_config = ResolvedRunConfig.build(job_def, run_config)\n\n 
return ExecutionPlan.build(\n job_def,\n resolved_run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance_ref=instance_ref,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n\ndef job_execution_iterator(\n job_context: PlanOrchestrationContext, execution_plan: ExecutionPlan\n) -> Iterator[DagsterEvent]:\n """A complete execution of a pipeline. Yields pipeline start, success,\n and failure events.\n\n Args:\n pipeline_context (PlanOrchestrationContext):\n execution_plan (ExecutionPlan):\n """\n # TODO: restart event?\n if not job_context.resume_from_failure:\n yield DagsterEvent.job_start(job_context)\n\n job_exception_info = None\n job_canceled_info = None\n failed_steps = []\n generator_closed = False\n try:\n for event in job_context.executor.execute(job_context, execution_plan):\n if event.is_step_failure:\n failed_steps.append(event.step_key)\n elif event.is_resource_init_failure and event.step_key:\n failed_steps.append(event.step_key)\n\n # Telemetry\n log_dagster_event(event, job_context)\n\n yield event\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except (KeyboardInterrupt, DagsterExecutionInterruptedError):\n job_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise\n except BaseException:\n job_exception_info = serializable_error_info_from_exc_info(sys.exc_info())\n if job_context.raise_on_error:\n raise # finally block will run before this is re-raised\n finally:\n if job_canceled_info:\n reloaded_run = job_context.instance.get_run_by_id(job_context.run_id)\n if reloaded_run and reloaded_run.status == DagsterRunStatus.CANCELING:\n event = 
DagsterEvent.job_canceled(job_context, job_canceled_info)\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.CANCELED:\n # This happens if the run was force-terminated but was still able to send\n # a cancellation request\n event = DagsterEvent.engine_event(\n job_context,\n "Computational resources were cleaned up after the run was forcibly marked"\n " as canceled.",\n EngineEventData(),\n )\n elif job_context.instance.run_will_resume(job_context.run_id):\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted unexpectedly. No user initiated termination"\n " request was found, not treating as failure because run will be resumed.",\n EngineEventData(),\n )\n elif reloaded_run and reloaded_run.status == DagsterRunStatus.FAILURE:\n event = DagsterEvent.engine_event(\n job_context,\n "Execution was interrupted for a run that was already in a failure state.",\n EngineEventData(),\n )\n else:\n event = DagsterEvent.job_failure(\n job_context,\n "Execution was interrupted unexpectedly. "\n "No user initiated termination request was found, treating as failure.",\n job_canceled_info,\n )\n elif job_exception_info:\n event = DagsterEvent.job_failure(\n job_context,\n "An exception was thrown during execution.",\n job_exception_info,\n )\n elif failed_steps:\n event = DagsterEvent.job_failure(\n job_context,\n f"Steps failed: {failed_steps}.",\n )\n else:\n event = DagsterEvent.job_success(job_context)\n if not generator_closed:\n yield event\n\n\nclass ExecuteRunWithPlanIterable:\n """Utility class to consolidate execution logic.\n\n This is a class and not a function because, e.g., in constructing a `scoped_pipeline_context`\n for `JobExecutionResult`, we need to pull out the `pipeline_context` after we're done\n yielding events. This broadly follows a pattern we make use of in other places,\n cf. 
`dagster._utils.EventGenerationManager`.\n """\n\n def __init__(\n self,\n execution_plan: ExecutionPlan,\n iterator: Callable[..., Iterator[DagsterEvent]],\n execution_context_manager: ExecutionContextManager[Any],\n ):\n self.execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n self.iterator = check.callable_param(iterator, "iterator")\n self.execution_context_manager = check.inst_param(\n execution_context_manager, "execution_context_manager", ExecutionContextManager\n )\n\n self.job_context = None\n\n def __iter__(self) -> Iterator[DagsterEvent]:\n # Since interrupts can't be raised at arbitrary points safely, delay them until designated\n # checkpoints during the execution.\n # To be maximally certain that interrupts are always caught during an execution process,\n # you can safely add an additional `with capture_interrupts()` at the very beginning of the\n # process that performs the execution.\n with capture_interrupts():\n yield from self.execution_context_manager.prepare_context()\n self.job_context = self.execution_context_manager.get_context()\n generator_closed = False\n try:\n if self.job_context: # False if we had a pipeline init failure\n yield from self.iterator(\n execution_plan=self.execution_plan,\n job_context=self.job_context,\n )\n except GeneratorExit:\n # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed\n # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).\n generator_closed = True\n raise\n finally:\n for event in self.execution_context_manager.shutdown_context():\n if not generator_closed:\n yield event\n\n\ndef _check_execute_job_args(\n job_arg: Union[JobDefinition, IJob],\n run_config: Optional[Mapping[str, object]],\n tags: Optional[Mapping[str, str]],\n op_selection: Optional[Sequence[str]] = None,\n) -> Tuple[\n IJob,\n Optional[Mapping],\n Mapping[str, str],\n Optional[AbstractSet[str]],\n Optional[Sequence[str]],\n]:\n ijob = InMemoryJob(job_arg) if 
isinstance(job_arg, JobDefinition) else job_arg\n job_def = job_arg if isinstance(job_arg, JobDefinition) else job_arg.get_definition()\n\n run_config = check.opt_mapping_param(run_config, "run_config")\n\n tags = check.opt_mapping_param(tags, "tags", key_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n\n tags = merge_dicts(job_def.tags, tags)\n\n # generate job subset from the given op_selection\n if op_selection:\n ijob = ijob.get_subset(op_selection=op_selection)\n\n return (\n ijob,\n run_config,\n tags,\n ijob.resolved_op_selection,\n op_selection,\n )\n\n\ndef _resolve_reexecute_step_selection(\n instance: DagsterInstance,\n job: IJob,\n run_config: Optional[Mapping],\n parent_dagster_run: DagsterRun,\n step_selection: Sequence[str],\n) -> ExecutionPlan:\n if parent_dagster_run.op_selection:\n job = job.get_subset(op_selection=parent_dagster_run.op_selection)\n\n state = KnownExecutionState.build_for_reexecution(instance, parent_dagster_run)\n\n parent_plan = create_execution_plan(\n job,\n parent_dagster_run.run_config,\n known_state=state,\n )\n step_keys_to_execute = parse_step_selection(parent_plan.get_all_step_deps(), step_selection)\n execution_plan = create_execution_plan(\n job,\n run_config,\n step_keys_to_execute=list(step_keys_to_execute),\n known_state=state.update_for_step_selection(step_keys_to_execute),\n tags=parent_dagster_run.tags,\n )\n return execution_plan\n\n\ndef _job_with_repository_load_data(\n job_arg: Union[JobDefinition, IJob],\n) -> Tuple[Union[JobDefinition, IJob], Optional[RepositoryLoadData]]:\n """For ReconstructableJob, generate and return any required RepositoryLoadData, alongside\n a ReconstructableJob with this repository load data baked in.\n """\n if isinstance(job_arg, ReconstructableJob):\n # Unless this ReconstructableJob alread has repository_load_data attached, this will\n # force the repository_load_data to be computed from scratch.\n repository_load_data = 
job_arg.repository.get_definition().repository_load_data\n return job_arg.with_repository_load_data(repository_load_data), repository_load_data\n return job_arg, None\n
", "current_page_name": "_modules/dagster/_core/execution/api", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.api"}, "build_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.build_resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Generator, Mapping, Optional, cast\n\nimport dagster._check as check\nfrom dagster._config import process_config\nfrom dagster._core.definitions.resource_definition import (\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.run_config import define_resource_dictionary_cls\nfrom dagster._core.errors import DagsterInvalidConfigError\nfrom dagster._core.execution.resources_init import resource_initialization_manager\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager, IOManagerDefinition\nfrom dagster._core.system_config.objects import ResourceConfig, config_map_resources\n\nfrom .api import ephemeral_instance_if_missing\nfrom .context_creation_job import initialize_console_manager\n\n\ndef get_mapped_resource_config(\n    resource_defs: Mapping[str, ResourceDefinition], resource_config: Mapping[str, Any]\n) -> Mapping[str, ResourceConfig]:\n    resource_config_schema = define_resource_dictionary_cls(\n        resource_defs, set(resource_defs.keys())\n    )\n    config_evr = process_config(resource_config_schema, resource_config)\n    if not config_evr.success:\n        raise DagsterInvalidConfigError(\n            "Error in config for resources ",\n            config_evr.errors,\n            resource_config,\n        )\n    config_value = cast(Dict[str, Any], config_evr.value)\n    return config_map_resources(resource_defs, config_value)\n\n\n
[docs]@contextmanager\ndef build_resources(\n resources: Mapping[str, Any],\n instance: Optional[DagsterInstance] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n) -> Generator[Resources, None, None]:\n """Context manager that yields resources using provided resource definitions and run config.\n\n This API allows for using resources in an independent context. Resources will be initialized\n with the provided run config, and optionally, dagster_run. The resulting resources will be\n yielded on a dictionary keyed identically to that provided for `resource_defs`. Upon exiting the\n context, resources will also be torn down safely.\n\n Args:\n resources (Mapping[str, Any]): Resource instances or definitions to build. All\n required resource dependencies to a given resource must be contained within this\n dictionary, or the resource build will fail.\n instance (Optional[DagsterInstance]): The dagster instance configured to instantiate\n resources on.\n resource_config (Optional[Mapping[str, Any]]): A dict representing the config to be\n provided to each resource during initialization and teardown.\n dagster_run (Optional[PipelineRun]): The pipeline run to provide during resource\n initialization and teardown. If the provided resources require either the `dagster_run`\n or `run_id` attributes of the provided context during resource initialization and/or\n teardown, this must be provided, or initialization will fail.\n log_manager (Optional[DagsterLogManager]): Log Manager to use during resource\n initialization. Defaults to system log manager.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import resource, build_resources\n\n @resource\n def the_resource():\n return "foo"\n\n with build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n assert resources.from_def == "foo"\n assert resources.from_val == "bar"\n\n """\n resources = check.mapping_param(resources, "resource_defs", key_type=str)\n instance = check.opt_inst_param(instance, "instance", DagsterInstance)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n log_manager = check.opt_inst_param(log_manager, "log_manager", DagsterLogManager)\n resource_defs = wrap_resources_for_execution(resources)\n mapped_resource_config = get_mapped_resource_config(resource_defs, resource_config)\n\n with ephemeral_instance_if_missing(instance) as dagster_instance:\n resources_manager = resource_initialization_manager(\n resource_defs=resource_defs,\n resource_configs=mapped_resource_config,\n log_manager=log_manager if log_manager else initialize_console_manager(dagster_run),\n execution_plan=None,\n dagster_run=dagster_run,\n resource_keys_to_init=set(resource_defs.keys()),\n instance=dagster_instance,\n emit_persistent_events=False,\n )\n try:\n list(resources_manager.generate_setup_events())\n instantiated_resources = check.inst(\n resources_manager.get_object(), ScopedResourcesBuilder\n )\n yield instantiated_resources.build(\n set(instantiated_resources.resource_instance_dict.keys())\n )\n finally:\n list(resources_manager.generate_teardown_events())
\n\n\ndef wrap_resources_for_execution(\n resources: Optional[Mapping[str, Any]] = None\n) -> Dict[str, ResourceDefinition]:\n return (\n {\n resource_key: wrap_resource_for_execution(resource)\n for resource_key, resource in resources.items()\n }\n if resources\n else {}\n )\n\n\ndef wrap_resource_for_execution(resource: Any) -> ResourceDefinition:\n from dagster._config.pythonic_config import ConfigurableResourceFactory, PartialResource\n\n # Wrap instantiated resource values in a resource definition.\n # If an instantiated IO manager is provided, wrap it in an IO manager definition.\n if isinstance(resource, (ConfigurableResourceFactory, PartialResource)):\n return resource.get_resource_definition()\n elif isinstance(resource, ResourceDefinition):\n return resource\n elif isinstance(resource, IOManager):\n return IOManagerDefinition.hardcoded_io_manager(resource)\n else:\n return ResourceDefinition.hardcoded_resource(resource)\n
", "current_page_name": "_modules/dagster/_core/execution/build_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.build_resources"}, "context": {"compute": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.compute

\nfrom abc import ABC, ABCMeta, abstractmethod\nfrom inspect import _empty as EmptyAnnotation\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import deprecated, experimental, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey, AssetCheckSpec\nfrom dagster._core.definitions.asset_checks import AssetChecksDefinition\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.data_version import (\n    DataProvenance,\n    DataVersion,\n    extract_data_provenance_from_entry,\n)\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import (\n    DagsterInvalidDefinitionError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n)\n\nfrom .system import StepExecutionContext\n\n\n# This metaclass has to 
exist for OpExecutionContext to have a metaclass\nclass AbstractComputeMetaclass(ABCMeta):\n    pass\n\n\nclass AbstractComputeExecutionContext(ABC, metaclass=AbstractComputeMetaclass):\n    """Base class for op context implemented by OpExecutionContext and DagstermillExecutionContext."""\n\n    @abstractmethod\n    def has_tag(self, key: str) -> bool:\n        """Implement this method to check if a logging tag is set."""\n\n    @abstractmethod\n    def get_tag(self, key: str) -> Optional[str]:\n        """Implement this method to get a logging tag."""\n\n    @property\n    @abstractmethod\n    def run_id(self) -> str:\n        """The run id for the context."""\n\n    @property\n    @abstractmethod\n    def op_def(self) -> OpDefinition:\n        """The op definition corresponding to the execution step being executed."""\n\n    @property\n    @abstractmethod\n    def job_def(self) -> JobDefinition:\n        """The job being executed."""\n\n    @property\n    @abstractmethod\n    def run(self) -> DagsterRun:\n        """The DagsterRun object corresponding to the execution."""\n\n    @property\n    @abstractmethod\n    def resources(self) -> Any:\n        """Resources available in the execution context."""\n\n    @property\n    @abstractmethod\n    def log(self) -> DagsterLogManager:\n        """The log manager available in the execution context."""\n\n    @property\n    @abstractmethod\n    def op_config(self) -> Any:\n        """The parsed config specific to this op."""\n\n\nclass OpExecutionContextMetaClass(AbstractComputeMetaclass):\n    def __instancecheck__(cls, instance) -> bool:\n        # This makes isinstance(context, OpExecutionContext) throw a deprecation warning when\n        # context is an AssetExecutionContext. 
This metaclass can be deleted once AssetExecutionContext\n        # has been split into it's own class in 1.7.0\n        if type(instance) is AssetExecutionContext and cls is not AssetExecutionContext:\n            deprecation_warning(\n                subject="AssetExecutionContext",\n                additional_warn_text=(\n                    "Starting in version 1.7.0 AssetExecutionContext will no longer be a subclass"\n                    " of OpExecutionContext."\n                ),\n                breaking_version="1.7.0",\n                stacklevel=1,\n            )\n        return super().__instancecheck__(instance)\n\n\n
[docs]class OpExecutionContext(AbstractComputeExecutionContext, metaclass=OpExecutionContextMetaClass):\n """The ``context`` object that can be made available as the first argument to the function\n used for computing an op or asset.\n\n This context object provides system information such as resources, config, and logging.\n\n To construct an execution context for testing purposes, use :py:func:`dagster.build_op_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import op, OpExecutionContext\n\n @op\n def hello_world(context: OpExecutionContext):\n context.log.info("Hello, world!")\n """\n\n __slots__ = ["_step_execution_context"]\n\n def __init__(self, step_execution_context: StepExecutionContext):\n self._step_execution_context = check.inst_param(\n step_execution_context,\n "step_execution_context",\n StepExecutionContext,\n )\n self._pdb: Optional[ForkedPdb] = None\n self._events: List[DagsterEvent] = []\n self._output_metadata: Dict[str, Any] = {}\n\n @public\n @property\n def op_config(self) -> Any:\n """Any: The parsed config specific to this op."""\n return self._step_execution_context.op_config\n\n @property\n def dagster_run(self) -> DagsterRun:\n """PipelineRun: The current pipeline run."""\n return self._step_execution_context.dagster_run\n\n @property\n def run(self) -> DagsterRun:\n """DagsterRun: The current run."""\n return self.dagster_run\n\n @public\n @property\n def instance(self) -> DagsterInstance:\n """DagsterInstance: The current Dagster instance."""\n return self._step_execution_context.instance\n\n @public\n @property\n def pdb(self) -> ForkedPdb:\n """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the op.\n\n Example:\n .. 
code-block:: python\n\n @op\n def debug(context):\n context.pdb.set_trace()\n """\n if self._pdb is None:\n self._pdb = ForkedPdb()\n\n return self._pdb\n\n @property\n def file_manager(self):\n """Deprecated access to the file manager.\n\n :meta private:\n """\n raise DagsterInvalidPropertyError(\n "You have attempted to access the file manager which has been moved to resources in"\n " 0.10.0. Please access it via `context.resources.file_manager` instead."\n )\n\n @public\n @property\n def resources(self) -> Any:\n """Resources: The currently available resources."""\n return self._step_execution_context.resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n """Optional[StepLauncher]: The current step launcher, if any."""\n return self._step_execution_context.step_launcher\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the current execution's run."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, object]:\n """dict: The run config for the current execution."""\n return self._step_execution_context.run_config\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The currently executing pipeline."""\n return self._step_execution_context.job_def\n\n @public\n @property\n def job_name(self) -> str:\n """str: The name of the currently executing pipeline."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """DagsterLogManager: The log manager available in the execution context."""\n return self._step_execution_context.log\n\n @property\n def node_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self._step_execution_context.node_handle\n\n @property\n def op_handle(self) -> NodeHandle:\n """NodeHandle: The current op's handle.\n\n :meta private:\n """\n return self.node_handle\n\n @property\n def op(self) -> Node:\n 
"""Node: The object representing the invoked op within the graph.\n\n :meta private:\n\n """\n return self._step_execution_context.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """OpDefinition: The current op definition."""\n return cast(OpDefinition, self.op.definition)\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._step_execution_context.has_partition_key\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run. Or if the current run is operating\n over a range of partitions (ie. a backfill of several partitions executed in a single run).\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n """\n return self._step_execution_context.partition_key\n\n @deprecated(breaking_version="2.0", additional_warn_text="Use `partition_key_range` instead.")\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n """\n return self.partition_key_range\n\n @public\n @property\n def partition_key_range(self) -> PartitionKeyRange:\n """The range of partition keys for the current run.\n\n If run is for a single partition key, returns a `PartitionKeyRange` with the same start and\n end. Raises an error if the current run is not a partitioned run.\n\n Examples:\n .. 
code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_key_range)\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n """\n return self._step_execution_context.asset_partition_key_range\n\n @public\n @property\n def partition_time_window(self) -> TimeWindow:\n """The partition time window for the current run.\n\n Raises an error if the current run is not a partitioned run, or if the job's partition\n definition is not a TimeWindowPartitionsDefinition.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def my_asset(context: AssetExecutionContext):\n context.log.info(context.partition_time_window)\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n """\n return self._step_execution_context.partition_time_window\n\n
[docs] @public\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is set.\n\n Args:\n key (str): The tag to check.\n\n Returns:\n bool: Whether the tag is set.\n """\n return self._step_execution_context.has_tag(key)
\n\n
[docs] @public\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag.\n\n Args:\n key (tag): The tag to get.\n\n Returns:\n Optional[str]: The value of the tag, if present.\n """\n return self._step_execution_context.get_tag(key)
\n\n @property\n def run_tags(self) -> Mapping[str, str]:\n """Mapping[str, str]: The tags for the current run."""\n return self._step_execution_context.run_tags\n\n def has_events(self) -> bool:\n return bool(self._events)\n\n def consume_events(self) -> Iterator[DagsterEvent]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the beginning of the op's computation. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n
[docs] @public\n def log_event(self, event: UserEvent) -> None:\n """Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.\n\n Events logged with this method will appear in the list of DagsterEvents, as well as the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation, ExpectationResult]): The event to log.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import op, AssetMaterialization\n\n @op\n def log_materialization(context):\n context.log_event(AssetMaterialization("foo"))\n """\n if isinstance(event, AssetMaterialization):\n self._events.append(\n DagsterEvent.asset_materialization(self._step_execution_context, event)\n )\n elif isinstance(event, AssetObservation):\n self._events.append(DagsterEvent.asset_observation(self._step_execution_context, event))\n elif isinstance(event, ExpectationResult):\n self._events.append(\n DagsterEvent.step_expectation_result(self._step_execution_context, event)\n )\n else:\n check.failed(f"Unexpected event {event}")
\n\n
[docs] @public\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n """Add metadata to one of the outputs of an op.\n\n This can be invoked multiple times per output in the body of an op. If the same key is\n passed multiple times, the value associated with the last call will be used.\n\n Args:\n metadata (Mapping[str, Any]): The metadata to attach to the output\n output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n mapping_key (Optional[str]): The mapping key of the output to attach metadata to. If the\n output is not dynamic, this argument does not need to be provided.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import Out, op\n from typing import Tuple\n\n @op\n def add_metadata(context):\n context.add_output_metadata({"foo", "bar"})\n return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n @op(out={"a": Out(), "b": Out()})\n def add_metadata_two_outputs(context) -> Tuple[str, int]:\n context.add_output_metadata({"foo": "bar"}, output_name="b")\n context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n return ("dog", 5)\n\n """\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n output_name = check.opt_str_param(output_name, "output_name")\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n self._step_execution_context.add_output_metadata(\n metadata=metadata, output_name=output_name, mapping_key=mapping_key\n )
\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n return self._step_execution_context.get_output_metadata(\n output_name=output_name, mapping_key=mapping_key\n )\n\n def get_step_execution_context(self) -> StepExecutionContext:\n """Allows advanced users (e.g. framework authors) to punch through to the underlying\n step execution context.\n\n :meta private:\n\n Returns:\n StepExecutionContext: The underlying system context.\n """\n return self._step_execution_context\n\n @public\n @property\n def retry_number(self) -> int:\n """Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc."""\n return self._step_execution_context.previous_attempt_count\n\n def describe_op(self):\n return self._step_execution_context.describe_op()\n\n
[docs] @public\n def get_mapping_key(self) -> Optional[str]:\n """Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None."""\n return self._step_execution_context.step.get_mapping_key()
\n\n #############################################################################################\n # asset related methods\n #############################################################################################\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The AssetKey for the current asset. In a multi_asset, use asset_key_for_output instead."""\n if self.has_assets_def and len(self.assets_def.keys_by_output_name.keys()) > 1:\n raise DagsterInvariantViolationError(\n "Cannot call `context.asset_key` in a multi_asset with more than one asset. Use"\n " `context.asset_key_for_output` instead."\n )\n # pass in the output name to handle the case when a multi_asset has a single AssetOut\n return self.asset_key_for_output(\n output_name=next(iter(self.assets_def.keys_by_output_name.keys()))\n )\n\n @public\n @property\n def has_assets_def(self) -> bool:\n """If there is a backing AssetsDefinition for what is currently executing."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n return assets_def is not None\n\n @public\n @property\n def assets_def(self) -> AssetsDefinition:\n """The backing AssetsDefinition for what is currently executing, errors if not available."""\n assets_def = self.job_def.asset_layer.assets_def_for_node(self.node_handle)\n if assets_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an assets definition."\n )\n return assets_def\n\n @public\n @property\n def selected_asset_keys(self) -> AbstractSet[AssetKey]:\n """Get the set of AssetKeys this execution is expected to materialize."""\n if not self.has_assets_def:\n return set()\n return self.assets_def.keys\n\n @public\n @property\n def has_asset_checks_def(self) -> bool:\n """Return a boolean indicating the presence of a backing AssetChecksDefinition\n for the current execution.\n\n Returns:\n bool: True if there is a backing AssetChecksDefinition for the current execution, otherwise False.\n """\n 
return self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle) is not None\n\n @public\n @property\n def asset_checks_def(self) -> AssetChecksDefinition:\n """The backing AssetChecksDefinition for what is currently executing, errors if not\n available.\n\n Returns:\n AssetChecksDefinition.\n """\n asset_checks_def = self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle)\n if asset_checks_def is None:\n raise DagsterInvalidPropertyError(\n f"Op '{self.op.name}' does not have an asset checks definition."\n )\n\n return asset_checks_def\n\n @public\n @property\n def selected_asset_check_keys(self) -> AbstractSet[AssetCheckKey]:\n if self.has_assets_def:\n return self.assets_def.check_keys\n\n if self.has_asset_checks_def:\n check.failed("Subset selection is not yet supported within an AssetChecksDefinition")\n\n return set()\n\n @public\n @property\n def selected_output_names(self) -> AbstractSet[str]:\n """Get the output names that correspond to the current selection of assets this execution is expected to materialize."""\n # map selected asset keys to the output names they correspond to\n selected_asset_keys = self.selected_asset_keys\n selected_outputs: Set[str] = set()\n for output_name in self.op.output_dict.keys():\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output_name\n )\n if any( # For graph-backed assets, check if a downstream asset is selected\n [\n asset_key in selected_asset_keys\n for asset_key in self.job_def.asset_layer.downstream_dep_assets(\n self.node_handle, output_name\n )\n ]\n ) or (asset_info and asset_info.key in selected_asset_keys):\n selected_outputs.add(output_name)\n\n return selected_outputs\n\n
[docs] @public\n def asset_key_for_output(self, output_name: str = "result") -> AssetKey:\n """Return the AssetKey for the corresponding output."""\n asset_output_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.op_handle, output_name=output_name\n )\n if asset_output_info is None:\n check.failed(f"Output '{output_name}' has no asset")\n else:\n return asset_output_info.key
\n\n
[docs] @public\n def output_for_asset_key(self, asset_key: AssetKey) -> str:\n """Return the output name for the corresponding asset key."""\n node_output_handle = self.job_def.asset_layer.node_output_handle_for_asset(asset_key)\n if node_output_handle is None:\n check.failed(f"Asset key '{asset_key}' has no output")\n else:\n return node_output_handle.output_name
\n\n
[docs] @public\n def asset_key_for_input(self, input_name: str) -> AssetKey:\n """Return the AssetKey for the corresponding input."""\n key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.op_handle, input_name=input_name\n )\n if key is None:\n check.failed(f"Input '{input_name}' has no asset")\n else:\n return key
\n\n
[docs] @public\n def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n """Returns the asset partition key for the given output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_for_output("first_asset"))\n context.log.info(context.asset_partition_key_for_output("second_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n """\n return self._step_execution_context.asset_partition_key_for_output(output_name)
\n\n
[docs] @public\n def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_output`` to get the TimeWindow of all of the partitions\n being materialized by the backfill.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_time_window_for_output("first_asset"))\n context.log.info(context.asset_partitions_time_window_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # 
TimeWindow("2023-08-21", "2023-08-22")\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_output(\n self, output_name: str = "result"\n ) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding output. Errors if the run is not partitioned.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition key range for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_key_range_for_output("first_asset"))\n context.log.info(context.asset_partition_key_range_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def 
self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n """\n return self._step_execution_context.asset_partition_key_range_for_output(output_name)
\n\n
[docs] @public\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n """Return the PartitionKeyRange for the corresponding input. Errors if the asset depends on a\n non-contiguous chunk of the input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_key_range_for_input`` to get the range of partitions keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_range_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this 
asset will log:\n # PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n """\n return self._step_execution_context.asset_partition_key_range_for_input(input_name)
\n\n
[docs] @public\n def asset_partition_key_for_input(self, input_name: str) -> str:\n """Returns the partition key of the upstream asset corresponding to the given input.\n\n Args:\n input_name (str): The name of the input to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_key_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-21"\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_key_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # "2023-08-20"\n\n """\n return self._step_execution_context.asset_partition_key_for_input(input_name)
\n\n
[docs] @public\n def asset_partitions_def_for_output(self, output_name: str = "result") -> PartitionsDefinition:\n """The PartitionsDefinition on the asset corresponding to this output.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output())\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partitions_def_for_output("first_asset"))\n context.log.info(context.asset_partitions_def_for_output("second_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_output(output_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partitions_def_for_input(self, input_name: str) -> PartitionsDefinition:\n """The PartitionsDefinition on the upstream asset corresponding to this input.\n\n Args:\n input_name (str): The name of the input to get the PartitionsDefinition for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_def_for_input("upstream_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # DailyPartitionsDefinition("2023-08-20")\n\n """\n asset_key = self.asset_key_for_input(input_name)\n result = self._step_execution_context.job_def.asset_layer.partitions_def_for_asset(\n asset_key\n )\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result
\n\n
[docs] @public\n def asset_partition_keys_for_output(self, output_name: str = "result") -> Sequence[str]:\n """Returns a list of the partition keys for the given output.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_output`` to get all of the partitions being materialized\n by the backfill.\n\n Args:\n output_name (str): For assets defined with the ``@asset`` decorator, the name of the output\n will be automatically provided. For assets defined with ``@multi_asset``, ``output_name``\n should be the op output associated with the asset key (as determined by AssetOut)\n to get the partition keys for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output())\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @multi_asset(\n outs={\n "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n "second_asset": AssetOut(key=["my_assets", "second_asset"])\n }\n partitions_def=partitions_def,\n )\n def a_multi_asset(context: AssetExecutionContext):\n context.log.info(context.asset_partition_keys_for_output("first_asset"))\n context.log.info(context.asset_partition_keys_for_output("second_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: 
AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_output())\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n """\n return self.asset_partitions_def_for_output(output_name).get_partition_keys_in_range(\n self._step_execution_context.asset_partition_key_range_for_output(output_name),\n dynamic_partitions_store=self.instance,\n )
\n\n
[docs] @public\n def asset_partition_keys_for_input(self, input_name: str) -> Sequence[str]:\n """Returns a list of the partition keys of the upstream asset corresponding to the\n given input.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partition_keys_for_input`` to get all of the partition keys of the input that\n are relevant to that backfill.\n\n Args:\n input_name (str): The name of the input to get the time window for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partition_keys_for_input("self_dependent_asset"))\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # ["2023-08-20", 
"2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n """\n return list(\n self._step_execution_context.asset_partitions_subset_for_input(\n input_name\n ).get_partition_keys()\n )
\n\n
[docs] @public\n def asset_partitions_time_window_for_input(self, input_name: str = "result") -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n If you want to write your asset to support running a backfill of several partitions in a single run,\n you can use ``asset_partitions_time_window_for_input`` to get the time window of the input that\n are relevant to that backfill.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n\n Args:\n input_name (str): The name of the input to get the partition key for.\n\n Examples:\n .. code-block:: python\n\n partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n @asset(\n partitions_def=partitions_def\n )\n def upstream_asset():\n ...\n\n @asset(\n partitions_def=partitions_def\n )\n def an_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-22")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n ins={\n "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n partitions_def=partitions_def,\n )\n def another_asset(context: AssetExecutionContext, upstream_asset):\n context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-21", "2023-08-26")\n\n\n @asset(\n partitions_def=partitions_def,\n ins={\n 
"self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n }\n )\n def self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n context.log.info(context.asset_partitions_time_window_for_input("self_dependent_asset"))\n\n # materializing the 2023-08-21 partition of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-21")\n\n # running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n # TimeWindow("2023-08-20", "2023-08-25")\n\n """\n return self._step_execution_context.asset_partitions_time_window_for_input(input_name)
\n\n
[docs] @public\n @experimental\n def get_asset_provenance(self, asset_key: AssetKey) -> Optional[DataProvenance]:\n """Return the provenance information for the most recent materialization of an asset.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to retrieve provenance.\n\n Returns:\n Optional[DataProvenance]: Provenance information for the most recent\n materialization of the asset. Returns `None` if the asset was never materialized or\n the materialization record is too old to contain provenance information.\n """\n record = self.instance.get_latest_data_version_record(asset_key)\n\n return (\n None if record is None else extract_data_provenance_from_entry(record.event_log_entry)\n )
\n\n def set_data_version(self, asset_key: AssetKey, data_version: DataVersion) -> None:\n """Set the data version for an asset being materialized by the currently executing step.\n This is useful for external execution situations where it is not possible to return\n an `Output`.\n\n Args:\n asset_key (AssetKey): Key of the asset for which to set the data version.\n data_version (DataVersion): The data version to set.\n """\n self._step_execution_context.set_data_version(asset_key, data_version)\n\n @property\n def asset_check_spec(self) -> AssetCheckSpec:\n asset_checks_def = check.not_none(\n self.job_def.asset_layer.asset_checks_def_for_node(self.node_handle),\n "This context does not correspond to an AssetChecksDefinition",\n )\n return asset_checks_def.spec\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._step_execution_context.requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._step_execution_context.typed_event_stream_error_message\n\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None) -> None:\n self._step_execution_context.set_requires_typed_event_stream(error_message=error_message)
\n\n\n
[docs]class AssetExecutionContext(OpExecutionContext):\n def __init__(self, step_execution_context: StepExecutionContext):\n super().__init__(step_execution_context=step_execution_context)
\n\n\ndef build_execution_context(\n step_context: StepExecutionContext,\n) -> Union[OpExecutionContext, AssetExecutionContext]:\n """Get the correct context based on the type of step (op or asset) and the user provided context\n type annotation. Follows these rules.\n\n step type annotation result\n asset AssetExecutionContext AssetExecutionContext\n asset OpExecutionContext OpExecutionContext\n asset None AssetExecutionContext\n op AssetExecutionContext Error - we cannot init an AssetExecutionContext w/o an AssetsDefinition\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n For ops in graph-backed assets\n step type annotation result\n op AssetExecutionContext AssetExecutionContext\n op OpExecutionContext OpExecutionContext\n op None OpExecutionContext\n """\n is_sda_step = step_context.is_sda_step\n is_op_in_graph_asset = is_sda_step and step_context.is_op_in_graph\n context_annotation = EmptyAnnotation\n compute_fn = step_context.op_def._compute_fn # noqa: SLF001\n compute_fn = (\n compute_fn\n if isinstance(compute_fn, DecoratedOpFunction)\n else DecoratedOpFunction(compute_fn)\n )\n if compute_fn.has_context_arg():\n context_param = compute_fn.get_context_arg()\n context_annotation = context_param.annotation\n\n # It would be nice to do this check at definition time, rather than at run time, but we don't\n # know if the op is part of an op job or a graph-backed asset until we have the step execution context\n if context_annotation is AssetExecutionContext and not is_sda_step:\n # AssetExecutionContext requires an AssetsDefinition during init, so an op in an op job\n # cannot be annotated with AssetExecutionContext\n raise DagsterInvalidDefinitionError(\n "Cannot annotate @op `context` parameter with type AssetExecutionContext unless the"\n " op is part of a graph-backed asset. 
`context` must be annotated with"\n " OpExecutionContext, or left blank."\n )\n\n if context_annotation is EmptyAnnotation:\n # if no type hint has been given, default to:\n # * AssetExecutionContext for sda steps, not in graph-backed assets\n # * OpExecutionContext for non sda steps\n # * OpExecutionContext for ops in graph-backed assets\n if is_op_in_graph_asset or not is_sda_step:\n return OpExecutionContext(step_context)\n return AssetExecutionContext(step_context)\n if context_annotation is AssetExecutionContext:\n return AssetExecutionContext(step_context)\n return OpExecutionContext(step_context)\n
", "current_page_name": "_modules/dagster/_core/execution/context/compute", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.compute"}, "hook": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.hook

\nimport warnings\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Dict, Mapping, Optional, Set, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom ...definitions.composition import PendingNodeInvocation\nfrom ...definitions.decorators.graph_decorator import graph\nfrom ...definitions.dependency import Node\nfrom ...definitions.hook_definition import HookDefinition\nfrom ...definitions.op_definition import OpDefinition\nfrom ...definitions.resource_definition import IContainsGenerator, Resources\nfrom ...errors import DagsterInvalidPropertyError, DagsterInvariantViolationError\nfrom ...log_manager import DagsterLogManager\nfrom ..plan.step import ExecutionStep\nfrom ..plan.utils import RetryRequestedFromPolicy\nfrom .system import StepExecutionContext\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set when a `HookContext` is constructed from "\n        "`build_hook_context`."\n    )\n\n\ndef _check_property_on_test_context(\n    context: "HookContext", attr_str: str, user_facing_name: str, param_on_builder: str\n):\n    """Check if attribute is not None on context. If none, error, and point user in direction of\n    how to specify the parameter on the context object.\n    """\n    value = getattr(context, attr_str)\n    if value is None:\n        raise DagsterInvalidPropertyError(\n            f"Attribute '{user_facing_name}' was not provided when "\n            f"constructing context. Provide a value for the '{param_on_builder}' parameter on "\n            "'build_hook_context'. To learn more, check out the testing hooks section of Dagster's "\n            "concepts docs: https://docs.dagster.io/concepts/ops-jobs-graphs/op-hooks#testing-hooks"\n        )\n    else:\n        return value\n\n\n
[docs]class HookContext:\n """The ``context`` object available to a hook function on an DagsterEvent."""\n\n def __init__(\n self,\n step_execution_context: StepExecutionContext,\n hook_def: HookDefinition,\n ):\n self._step_execution_context = step_execution_context\n self._hook_def = check.inst_param(hook_def, "hook_def", HookDefinition)\n self._required_resource_keys = hook_def.required_resource_keys\n self._resources = step_execution_context.scoped_resources_builder.build(\n self._required_resource_keys\n )\n\n @public\n @property\n def job_name(self) -> str:\n """The name of the job where this hook is being triggered."""\n return self._step_execution_context.job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run where this hook is being triggered."""\n return self._step_execution_context.run_id\n\n @public\n @property\n def hook_def(self) -> HookDefinition:\n """The hook that the context object belongs to."""\n return self._hook_def\n\n @public\n @property\n def instance(self) -> "DagsterInstance":\n """The instance configured to run the current job."""\n return self._step_execution_context.instance\n\n @property\n def op(self) -> Node:\n """The op instance associated with the hook."""\n return self._step_execution_context.op\n\n @property\n def step(self) -> ExecutionStep:\n warnings.warn(\n "The step property of HookContext has been deprecated, and will be removed "\n "in a future release."\n )\n return self._step_execution_context.step\n\n @public\n @property\n def step_key(self) -> str:\n """The key for the step where this hook is being triggered."""\n return self._step_execution_context.step.key\n\n @public\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n """Resources required by this hook."""\n return self._required_resource_keys\n\n @public\n @property\n def resources(self) -> "Resources":\n """Resources available in the hook context."""\n return self._resources\n\n @property\n def solid_config(self) -> 
Any:\n solid_config = self._step_execution_context.resolved_run_config.ops.get(\n str(self._step_execution_context.step.node_handle)\n )\n return solid_config.config if solid_config else None\n\n @public\n @property\n def op_config(self) -> Any:\n """The parsed config specific to this op."""\n return self.solid_config\n\n # Because of the fact that we directly use the log manager of the step, if a user calls\n # hook_context.log.with_tags, then they will end up mutating the step's logging tags as well.\n # This is not problematic because the hook only runs after the step has been completed.\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._step_execution_context.log\n\n @property\n def solid_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed solid.\n\n Returns:\n Optional[BaseException]: the exception object, None if the solid execution succeeds.\n """\n return self.op_exception\n\n @public\n @property\n def op_exception(self) -> Optional[BaseException]:\n """The thrown exception in a failed op."""\n exc = self._step_execution_context.step_exception\n\n if isinstance(exc, RetryRequestedFromPolicy):\n return exc.__cause__\n\n return exc\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n results: Dict[str, Union[Any, Dict[str, Any]]] = {}\n captured = self._step_execution_context.step_output_capture\n\n if captured is None:\n check.failed("Outputs were unexpectedly not captured for hook")\n\n # make the returned values more user-friendly\n for step_output_handle, value in captured.items():\n if step_output_handle.mapping_key:\n if results.get(step_output_handle.output_name) is None:\n 
results[step_output_handle.output_name] = {\n step_output_handle.mapping_key: value\n }\n else:\n results[step_output_handle.output_name][step_output_handle.mapping_key] = value\n else:\n results[step_output_handle.output_name] = value\n\n return results\n\n @public\n @property\n def op_output_values(self):\n """Computed output values in an op."""\n return self.solid_output_values
\n\n\nclass UnboundHookContext(HookContext):\n def __init__(\n self,\n resources: Mapping[str, Any],\n op: Optional[Union[OpDefinition, PendingNodeInvocation]],\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n instance: Optional["DagsterInstance"],\n ):\n from ..build_resources import build_resources, wrap_resources_for_execution\n from ..context_creation_job import initialize_console_manager\n\n self._op = None\n if op is not None:\n\n @graph(name="hook_context_container")\n def temp_graph():\n op()\n\n self._op = temp_graph.nodes[0]\n\n # Open resource context manager\n self._resource_defs = wrap_resources_for_execution(resources)\n self._resources_cm = build_resources(self._resource_defs)\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n self._log = initialize_console_manager(None)\n\n self._cm_scope_entered = False\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc: Any):\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def 
step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> Set[str]:\n raise DagsterInvalidPropertyError(_property_msg("hook_def", "property"))\n\n @property\n def resources(self) -> "Resources":\n if self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_hook_context(...) as context:`"\n )\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log\n\n @property\n def op_exception(self) -> Optional[BaseException]:\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> "DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\nclass BoundHookContext(HookContext):\n def __init__(\n self,\n hook_def: HookDefinition,\n resources: Resources,\n op: Optional[Node],\n log_manager: DagsterLogManager,\n run_id: Optional[str],\n job_name: Optional[str],\n op_exception: Optional[Exception],\n 
instance: Optional["DagsterInstance"],\n ):\n self._hook_def = hook_def\n self._resources = resources\n self._op = op\n self._log_manager = log_manager\n self._run_id = run_id\n self._job_name = job_name\n self._op_exception = op_exception\n self._instance = instance\n\n @property\n def job_name(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_job_name", user_facing_name="job_name", param_on_builder="job_name"\n )\n\n @property\n def run_id(self) -> str:\n return _check_property_on_test_context(\n self, attr_str="_run_id", user_facing_name="run_id", param_on_builder="run_id"\n )\n\n @property\n def hook_def(self) -> HookDefinition:\n return self._hook_def\n\n @property\n def op(self) -> Node:\n return _check_property_on_test_context(\n self, attr_str="_op", user_facing_name="op", param_on_builder="op"\n )\n\n @property\n def step(self) -> ExecutionStep:\n raise DagsterInvalidPropertyError(_property_msg("step", "property"))\n\n @property\n def step_key(self) -> str:\n raise DagsterInvalidPropertyError(_property_msg("step_key", "property"))\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._hook_def.required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def solid_config(self) -> Any:\n raise DagsterInvalidPropertyError(_property_msg("solid_config", "property"))\n\n @property\n def log(self) -> DagsterLogManager:\n return self._log_manager\n\n @property\n def op_exception(self):\n return self._op_exception\n\n @property\n def solid_output_values(self) -> Mapping[str, Union[Any, Mapping[str, Any]]]:\n """The computed output values.\n\n Returns a dictionary where keys are output names and the values are:\n * the output values in the normal case\n * a dictionary from mapping key to corresponding value in the mapped case\n """\n raise DagsterInvalidPropertyError(_property_msg("solid_output_values", "method"))\n\n @property\n def instance(self) -> 
"DagsterInstance":\n if not self._instance:\n raise DagsterInvariantViolationError(\n "Tried to access the HookContext instance, but no instance was provided to"\n " `build_hook_context`."\n )\n\n return self._instance\n\n\n
[docs]def build_hook_context(\n resources: Optional[Mapping[str, Any]] = None,\n op: Optional[Union[OpDefinition, PendingNodeInvocation]] = None,\n run_id: Optional[str] = None,\n job_name: Optional[str] = None,\n op_exception: Optional[Exception] = None,\n instance: Optional["DagsterInstance"] = None,\n) -> UnboundHookContext:\n """Builds hook context from provided parameters.\n\n ``build_hook_context`` can be used as either a function or a context manager. If there is a\n provided resource to ``build_hook_context`` that is a context manager, then it must be used as a\n context manager. This function can be used to provide the context argument to the invocation of\n a hook definition.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can\n either be values or resource definitions.\n op (Optional[OpDefinition, PendingNodeInvocation]): The op definition which the\n hook may be associated with.\n run_id (Optional[str]): The id of the run in which the hook is invoked (provided for mocking purposes).\n job_name (Optional[str]): The name of the job in which the hook is used (provided for mocking purposes).\n op_exception (Optional[Exception]): The exception that caused the hook to be triggered.\n instance (Optional[DagsterInstance]): The Dagster instance configured to run the hook.\n\n Examples:\n .. 
code-block:: python\n\n context = build_hook_context()\n hook_to_invoke(context)\n\n with build_hook_context(resources={"foo": context_manager_resource}) as context:\n hook_to_invoke(context)\n """\n op = check.opt_inst_param(op, "op", (OpDefinition, PendingNodeInvocation))\n\n from dagster._core.instance import DagsterInstance\n\n return UnboundHookContext(\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n op=op,\n run_id=check.opt_str_param(run_id, "run_id"),\n job_name=check.opt_str_param(job_name, "job_name"),\n op_exception=check.opt_inst_param(op_exception, "op_exception", Exception),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/hook", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.hook"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.init

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\n\n
[docs]class InitResourceContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.ResourceDefinition`.\n\n Users should not instantiate this object directly. To construct an `InitResourceContext` for testing purposes, use :py:func:`dagster.build_init_resource_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import resource, InitResourceContext\n\n @resource\n def the_resource(init_context: InitResourceContext):\n init_context.log.info("Hello, world!")\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Resources,\n resource_def: Optional[ResourceDefinition] = None,\n instance: Optional[DagsterInstance] = None,\n dagster_run: Optional[DagsterRun] = None,\n log_manager: Optional[DagsterLogManager] = None,\n ):\n self._resource_config = resource_config\n self._resource_def = resource_def\n self._log_manager = log_manager\n self._instance = instance\n self._resources = resources\n self._dagster_run = dagster_run\n\n @public\n @property\n def resource_config(self) -> Any:\n """The configuration data provided by the run config. The schema\n for this data is defined by the ``config_field`` argument to\n :py:class:`ResourceDefinition`.\n """\n return self._resource_config\n\n @public\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n """The definition of the resource currently being constructed."""\n return self._resource_def\n\n @public\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n return self._resources\n\n @public\n @property\n def instance(self) -> Optional[DagsterInstance]:\n """The Dagster instance configured for the current execution context."""\n return self._instance\n\n @property\n def dagster_run(self) -> Optional[DagsterRun]:\n """The dagster run to use. 
When initializing resources outside of execution context, this will be None."""\n return self._dagster_run\n\n @public\n @property\n def log(self) -> Optional[DagsterLogManager]:\n """The Dagster log manager configured for the current execution context."""\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @public\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n """The log manager for this run of the job."""\n return self._log_manager\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The id for this run of the job or pipeline. When initializing resources outside of\n execution context, this will be None.\n """\n return self.dagster_run.run_id if self.dagster_run else None\n\n def replace_config(self, config: Any) -> "InitResourceContext":\n return InitResourceContext(\n resource_config=config,\n resources=self.resources,\n instance=self.instance,\n resource_def=self.resource_def,\n dagster_run=self.dagster_run,\n log_manager=self.log,\n )
\n\n\nclass UnboundInitResourceContext(InitResourceContext):\n """Resource initialization context outputted by ``build_init_resource_context``.\n\n Represents a context whose config has not yet been validated against a resource definition,\n hence the inability to access the `resource_def` attribute. When an instance of\n ``UnboundInitResourceContext`` is passed to a resource invocation, config is validated,\n and it is subsumed into an `InitResourceContext`, which contains the resource_def validated\n against.\n """\n\n def __init__(\n self,\n resource_config: Any,\n resources: Optional[Union[Resources, Mapping[str, Any]]],\n instance: Optional[DagsterInstance],\n ):\n from dagster._core.execution.api import ephemeral_instance_if_missing\n from dagster._core.execution.build_resources import (\n build_resources,\n wrap_resources_for_execution,\n )\n from dagster._core.execution.context_creation_job import initialize_console_manager\n\n self._instance_provided = (\n check.opt_inst_param(instance, "instance", DagsterInstance) is not None\n )\n # Construct ephemeral instance if missing\n self._instance_cm = ephemeral_instance_if_missing(instance)\n # Pylint can't infer that the ephemeral_instance context manager has an __enter__ method,\n # so ignore lint error\n instance = self._instance_cm.__enter__()\n\n if isinstance(resources, Resources):\n check.failed("Should not have a Resources object directly from this initialization")\n\n self._resource_defs = wrap_resources_for_execution(\n check.opt_mapping_param(resources, "resources")\n )\n\n self._resources_cm = build_resources(self._resource_defs, instance=instance)\n resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(resources, IContainsGenerator)\n\n self._cm_scope_entered = False\n super(UnboundInitResourceContext, self).__init__(\n resource_config=resource_config,\n resources=resources,\n resource_def=None,\n instance=instance,\n dagster_run=None,\n 
log_manager=initialize_console_manager(None),\n )\n\n def __enter__(self):\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n self._resources_cm.__exit__(*exc)\n if self._instance_provided:\n self._instance_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n if self._instance_provided and not self._cm_scope_entered:\n self._instance_cm.__exit__(None, None, None)\n\n @property\n def resource_config(self) -> Any:\n return self._resource_config\n\n @property\n def resource_def(self) -> Optional[ResourceDefinition]:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def resources(self) -> Resources:\n """The resources that are available to the resource that we are initalizing."""\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_init_resource_context(...) as context:`"\n )\n return self._resources\n\n @property\n def instance(self) -> Optional[DagsterInstance]:\n return self._instance\n\n @property\n def log(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n # backcompat: keep around this property from when InitResourceContext used to be a NamedTuple\n @property\n def log_manager(self) -> Optional[DagsterLogManager]:\n return self._log_manager\n\n @property\n def run_id(self) -> Optional[str]:\n return None\n\n\n
[docs]def build_init_resource_context(\n config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n) -> InitResourceContext:\n """Builds resource initialization context from provided parameters.\n\n ``build_init_resource_context`` can be used as either a function or context manager. If there is a\n provided resource to ``build_init_resource_context`` that is a context manager, then it must be\n used as a context manager. This function can be used to provide the context argument to the\n invocation of a resource.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n config (Optional[Any]): The resource config to provide to the context.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n\n Examples:\n .. code-block:: python\n\n context = build_init_resource_context()\n resource_to_init(context)\n\n with build_init_resource_context(\n resources={"foo": context_manager_resource}\n ) as context:\n resource_to_init(context)\n\n """\n return UnboundInitResourceContext(\n resource_config=check.opt_mapping_param(config, "config", key_type=str),\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n resources=check.opt_mapping_param(resources, "resources", key_type=str),\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.init"}, "input": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.input

\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Iterable,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import AssetKey, AssetObservation, CoercibleToAssetKey\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n)\nfrom dagster._core.definitions.partition import PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow, TimeWindowPartitionsSubset\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.instance import DagsterInstance, DynamicPartitionsStore\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.types.dagster_type import DagsterType\n\n    from .output import OutputContext\n\n\n
[docs]class InputContext:\n """The ``context`` object available to the load_input method of :py:class:`InputManager`.\n\n Users should not instantiate this object directly. In order to construct\n an `InputContext` for testing an IO Manager's `load_input` method, use\n :py:func:`dagster.build_input_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, InputContext\n\n class MyIOManager(IOManager):\n def load_input(self, context: InputContext):\n ...\n """\n\n def __init__(\n self,\n *,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n op_def: Optional["OpDefinition"] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Union["Resources", Mapping[str, Any]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[AssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partitions_subset: Optional[PartitionsSubset] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._name = name\n self._job_name = job_name\n self._op_def = op_def\n self._config = config\n self._metadata = metadata or {}\n self._upstream_output = upstream_output\n self._dagster_type = dagster_type\n self._log = log_manager\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_key = asset_key\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n 
self._asset_partitions_subset = asset_partitions_subset\n self._asset_partitions_def = asset_partitions_def\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events: List["DagsterEvent"] = []\n self._observations: List[AssetObservation] = []\n self._instance = instance\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n self._resources_cm.__exit__(None, None, None)\n\n @property\n def instance(self) -> DagsterInstance:\n if self._instance is None:\n raise DagsterInvariantViolationError(\n "Attempting to access instance, "\n "but it was not provided when constructing the InputContext"\n )\n return self._instance\n\n @public\n @property\n def has_input_name(self) -> bool:\n """If we're the InputContext is being used to load the result of a run from outside the run,\n then it won't have an input name.\n """\n return self._name is not None\n\n @public\n @property\n def name(self) -> str:\n """The name of the input that we're loading."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access job_name, "\n "but it was not provided when constructing the InputContext"\n )\n return self._job_name\n\n @public\n @property\n def op_def(self) 
-> "OpDefinition":\n """The definition of the op that's loading the input."""\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._op_def\n\n @public\n @property\n def config(self) -> Any:\n """The config attached to the input that we're loading."""\n return self._config\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of metadata that is assigned to the InputDefinition that we're loading for.\n This property only contains metadata passed in explicitly with :py:class:`AssetIn`\n or :py:class:`In`. To access metadata of an upstream asset or operation definition,\n use the metadata in :py:attr:`.InputContext.upstream_output`.\n """\n return self._metadata\n\n @public\n @property\n def upstream_output(self) -> Optional["OutputContext"]:\n """Info about the output that produced the object we're loading."""\n return self._upstream_output\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this input.\n Dagster types do not propagate from an upstream output to downstream inputs,\n and this property only captures type information for the input that is either\n passed in explicitly with :py:class:`AssetIn` or :py:class:`In`, or can be\n infered from type hints. 
For an asset input, the Dagster type from the upstream\n asset definition is ignored.\n """\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this input."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._log\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, Any]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the resource that initializes the\n input manager. If using the :py:func:`@input_manager` decorator, these resources\n correspond to those requested with the `required_resource_keys` parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the InputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_input_context(...) as context:`"\n )\n return self._resources\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being loaded as input, otherwise returns False. 
A return value of False\n indicates that an output from an op is being loaded as the input.\n """\n return self._asset_key is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being loaded as an input."""\n if self._asset_key is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, but no asset is associated with this input"\n )\n\n return self._asset_key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the upstream asset corresponding to this input."""\n if self._asset_partitions_def is None:\n if self.asset_key:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {self.asset_key}, but it is not"\n " partitioned"\n )\n else:\n raise DagsterInvariantViolationError(\n "Attempting to access partitions def for asset, but input does not correspond"\n " to an asset"\n )\n\n return self._asset_partitions_def\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the InputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being loaded as input is partitioned."""\n return self._asset_partitions_subset is not None\n\n @public\n @property\n def 
asset_partition_key(self) -> str:\n """The partition key for input asset.\n\n Raises an error if the input asset has no partitioning, or if the run covers a partition\n range for the input asset.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed("The input does not correspond to a partitioned asset.")\n\n partition_keys = list(subset.get_partition_keys())\n if len(partition_keys) == 1:\n return partition_keys[0]\n else:\n check.failed(\n f"Tried to access partition key for asset '{self.asset_key}', "\n f"but the number of input partitions != 1: '{subset}'."\n )\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partition_key_range, but the asset is not partitioned.",\n )\n\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset_partition_key_range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for input asset.\n\n Raises an error if the input asset has no partitioning.\n """\n if self._asset_partitions_subset is None:\n check.failed(\n "Tried to access asset_partition_keys, but the asset is not partitioned.",\n )\n\n return list(self._asset_partitions_subset.get_partition_keys())\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the input asset.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a 
TimeWindowPartitionsDefinition.\n """\n subset = self._asset_partitions_subset\n\n if subset is None:\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned.",\n )\n\n if not isinstance(subset, TimeWindowPartitionsSubset):\n check.failed(\n "Tried to access asset_partitions_time_window, but the asset is not partitioned"\n " with time windows.",\n )\n\n time_windows = subset.included_time_windows\n if len(time_windows) != 1:\n check.failed(\n "Tried to access asset_partitions_time_window, but there are "\n f"({len(time_windows)}) time windows associated with this input.",\n )\n\n return time_windows[0]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step input.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the input.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the input is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n List[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n if self.upstream_output is None:\n raise DagsterInvariantViolationError(\n "InputContext.upstream_output not defined. Cannot compute an identifier"\n )\n\n return self.upstream_output.get_identifier()
\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being loaded as an input.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])``, materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset identifier for an input with no asset key")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_input`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def add_input_metadata(\n self,\n metadata: Mapping[str, Any],\n description: Optional[str] = None,\n ) -> None:\n """Accepts a dictionary of metadata. Metadata entries will appear on the LOADED_INPUT event.\n If the input is an asset, metadata will be attached to an asset observation.\n\n The asset observation will be yielded from the run and appear in the event log.\n Only valid if the context has an asset key.\n """\n from dagster._core.definitions.metadata import normalize_metadata\n from dagster._core.events import DagsterEvent\n\n metadata = check.mapping_param(metadata, "metadata", key_type=str)\n self._metadata = {**self._metadata, **normalize_metadata(metadata)}\n if self.has_asset_key:\n check.opt_str_param(description, "description")\n\n observation = AssetObservation(\n asset_key=self.asset_key,\n description=description,\n partition=self.asset_partition_key if self.has_asset_partitions else None,\n metadata=metadata,\n )\n self._observations.append(observation)\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, observation))\n\n def get_observations(\n self,\n ) -> Sequence[AssetObservation]:\n """Retrieve the list of user-generated asset observations that were observed via the context.\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import IOManager, build_input_context, AssetObservation\n\n class MyIOManager(IOManager):\n def load_input(self, context, obj):\n ...\n\n def test_load_input():\n mgr = MyIOManager()\n context = build_input_context()\n mgr.load_input(context)\n observations = context.get_observations()\n ...\n """\n return self._observations\n\n def consume_metadata(self) -> Mapping[str, MetadataValue]:\n result = self._metadata\n self._metadata = {}\n return result
\n\n\n
[docs]def build_input_context(\n name: Optional[str] = None,\n config: Optional[Any] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n upstream_output: Optional["OutputContext"] = None,\n dagster_type: Optional["DagsterType"] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n resources: Optional[Mapping[str, Any]] = None,\n op_def: Optional["OpDefinition"] = None,\n step_context: Optional["StepExecutionContext"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n asset_partition_key_range: Optional[PartitionKeyRange] = None,\n asset_partitions_def: Optional["PartitionsDefinition"] = None,\n instance: Optional[DagsterInstance] = None,\n) -> "InputContext":\n """Builds input context from provided parameters.\n\n ``build_input_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_input_context`` must be used as a\n context manager.\n\n Args:\n name (Optional[str]): The name of the input that we're loading.\n config (Optional[Any]): The config attached to the input that we're loading.\n metadata (Optional[Dict[str, Any]]): A dict of metadata that is assigned to the\n InputDefinition that we're loading for.\n upstream_output (Optional[OutputContext]): Info about the output that produced the object\n we're loading.\n dagster_type (Optional[DagsterType]): The type of this input.\n resource_config (Optional[Dict[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the input manager.\n resources (Optional[Dict[str, Any]]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n asset_key (Optional[Union[AssetKey, Sequence[str], str]]): The asset key attached to the InputDefinition.\n op_def (Optional[OpDefinition]): The definition of the op that's loading the input.\n step_context (Optional[StepExecutionContext]): For internal use.\n partition_key (Optional[str]): String value representing partition key to execute with.\n asset_partition_key_range (Optional[str]): The range of asset partition keys to load.\n asset_partitions_def: Optional[PartitionsDefinition]: The PartitionsDefinition of the asset\n being loaded.\n\n Examples:\n .. code-block:: python\n\n build_input_context()\n\n with build_input_context(resources={"foo": context_manager_resource}) as context:\n do_something\n """\n from dagster._core.definitions import OpDefinition, PartitionsDefinition\n from dagster._core.execution.context.output import OutputContext\n from dagster._core.execution.context.system import StepExecutionContext\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n upstream_output = check.opt_inst_param(upstream_output, "upstream_output", OutputContext)\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n step_context = check.opt_inst_param(step_context, "step_context", StepExecutionContext)\n asset_key = 
AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n asset_partition_key_range = check.opt_inst_param(\n asset_partition_key_range, "asset_partition_key_range", PartitionKeyRange\n )\n asset_partitions_def = check.opt_inst_param(\n asset_partitions_def, "asset_partitions_def", PartitionsDefinition\n )\n if asset_partitions_def and asset_partition_key_range:\n asset_partitions_subset = asset_partitions_def.empty_subset().with_partition_key_range(\n asset_partition_key_range, dynamic_partitions_store=instance\n )\n elif asset_partition_key_range:\n asset_partitions_subset = KeyRangeNoPartitionsDefPartitionsSubset(asset_partition_key_range)\n else:\n asset_partitions_subset = None\n\n return InputContext(\n name=name,\n job_name=None,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n resource_config=resource_config,\n resources=resources,\n step_context=step_context,\n op_def=op_def,\n asset_key=asset_key,\n partition_key=partition_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=instance,\n )
\n\n\nclass KeyRangeNoPartitionsDefPartitionsSubset(PartitionsSubset):\n """For build_input_context when no PartitionsDefinition has been provided."""\n\n def __init__(self, key_range: PartitionKeyRange):\n self._key_range = key_range\n\n def get_partition_keys_not_in_subset(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Iterable[str]:\n raise NotImplementedError()\n\n def get_partition_keys(self, current_time: Optional[datetime] = None) -> Iterable[str]:\n if self._key_range.start == self._key_range.end:\n return self._key_range.start\n else:\n raise NotImplementedError()\n\n def get_partition_key_ranges(\n self,\n current_time: Optional[datetime] = None,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> Sequence[PartitionKeyRange]:\n return [self._key_range]\n\n def with_partition_keys(self, partition_keys: Iterable[str]) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def with_partition_key_range(\n self,\n partition_key_range: PartitionKeyRange,\n dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n def serialize(self) -> str:\n raise NotImplementedError()\n\n @property\n def partitions_def(self) -> "PartitionsDefinition":\n raise NotImplementedError()\n\n def __len__(self) -> int:\n raise NotImplementedError()\n\n def __contains__(self, value) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def from_serialized(\n cls, partitions_def: "PartitionsDefinition", serialized: str\n ) -> "PartitionsSubset":\n raise NotImplementedError()\n\n @classmethod\n def can_deserialize(\n cls,\n partitions_def: "PartitionsDefinition",\n serialized: str,\n serialized_partitions_def_unique_id: Optional[str],\n serialized_partitions_def_class_name: Optional[str],\n ) -> bool:\n raise NotImplementedError()\n\n @classmethod\n def empty_subset(cls, partitions_def: "PartitionsDefinition") 
-> "PartitionsSubset":\n raise NotImplementedError()\n
", "current_page_name": "_modules/dagster/_core/execution/context/input", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.input"}, "invocation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.invocation

\nfrom contextlib import ExitStack\nfrom typing import (\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.composition import PendingNodeInvocation\nfrom dagster._core.definitions.decorators.op_decorator import DecoratedOpFunction\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    UserEvent,\n)\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import (\n    IContainsGenerator,\n    ResourceDefinition,\n    Resources,\n    ScopedResourcesBuilder,\n)\nfrom dagster._core.definitions.resource_requirement import ensure_requirements_satisfied\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import (\n    DagsterInvalidInvocationError,\n    DagsterInvalidPropertyError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.execution.build_resources import build_resources, wrap_resources_for_execution\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.types.dagster_type import 
DagsterType\nfrom dagster._utils.forked_pdb import ForkedPdb\nfrom dagster._utils.merger import merge_dicts\n\nfrom .compute import OpExecutionContext\nfrom .system import StepExecutionContext, TypeCheckContext\n\n\ndef _property_msg(prop_name: str, method_name: str) -> str:\n    return (\n        f"The {prop_name} {method_name} is not set on the context when a solid is directly invoked."\n    )\n\n\nclass UnboundOpExecutionContext(OpExecutionContext):\n    """The ``context`` object available as the first argument to a solid's compute function when\n    being invoked directly. Can also be used as a context manager.\n    """\n\n    def __init__(\n        self,\n        op_config: Any,\n        resources_dict: Mapping[str, Any],\n        resources_config: Mapping[str, Any],\n        instance: Optional[DagsterInstance],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        mapping_key: Optional[str],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        from dagster._core.execution.api import ephemeral_instance_if_missing\n        from dagster._core.execution.context_creation_job import initialize_console_manager\n\n        self._op_config = op_config\n        self._mapping_key = mapping_key\n\n        self._exit_stack = ExitStack()\n\n        # Construct ephemeral instance if missing\n        self._instance = self._exit_stack.enter_context(ephemeral_instance_if_missing(instance))\n\n        self._resources_config = resources_config\n        # Open resource context manager\n        self._resources_contain_cm = False\n        self._resource_defs = wrap_resources_for_execution(resources_dict)\n        self._resources = self._exit_stack.enter_context(\n            build_resources(\n                resources=self._resource_defs,\n                instance=self._instance,\n                resource_config=resources_config,\n            )\n        )\n        self._resources_contain_cm = 
isinstance(self._resources, IContainsGenerator)\n\n        self._log = initialize_console_manager(None)\n        self._pdb: Optional[ForkedPdb] = None\n        self._cm_scope_entered = False\n        check.invariant(\n            not (partition_key and partition_key_range),\n            "Must supply at most one of partition_key or partition_key_range",\n        )\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._user_events: List[UserEvent] = []\n        self._output_metadata: Dict[str, Any] = {}\n\n        self._assets_def = check.opt_inst_param(assets_def, "assets_def", AssetsDefinition)\n\n    def __enter__(self):\n        self._cm_scope_entered = True\n        return self\n\n    def __exit__(self, *exc):\n        self._exit_stack.close()\n\n    def __del__(self):\n        self._exit_stack.close()\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resource_keys(self) -> AbstractSet[str]:\n        return self._resource_defs.keys()\n\n    @property\n    def resources(self) -> Resources:\n        if self._resources_contain_cm and not self._cm_scope_entered:\n            raise DagsterInvariantViolationError(\n                "At least one provided resource is a generator, but attempting to access "\n                "resources outside of context manager scope. You can use the following syntax to "\n                "open a context manager: `with build_op_context(...) as context:`"\n            )\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        Example:\n        .. 
code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> dict:\n        raise DagsterInvalidPropertyError(_property_msg("run_config", "property"))\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("solid_handle", "property"))\n\n    @property\n    def op(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def solid(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("solid", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("op_def", "property"))\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("assets_def", "property"))\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    @property\n    def partition_key(self) -> str:\n        if 
self._partition_key:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned run")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def has_tag(self, key: str) -> bool:\n        raise DagsterInvalidPropertyError(_property_msg("has_tag", "method"))\n\n    def get_tag(self, key: str) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("get_tag", "method"))\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def bind(\n        self,\n        op_def: OpDefinition,\n        pending_invocation: Optional[PendingNodeInvocation[OpDefinition]],\n        assets_def: Optional[AssetsDefinition],\n        config_from_args: Optional[Mapping[str, Any]],\n        resources_from_args: Optional[Mapping[str, Any]],\n    ) -> "BoundOpExecutionContext":\n        from dagster._core.definitions.resource_invocation import resolve_bound_config\n\n        if resources_from_args:\n            if self._resource_defs:\n                raise DagsterInvalidInvocationError(\n                    "Cannot provide resources in both context and kwargs"\n                )\n            resource_defs = 
wrap_resources_for_execution(resources_from_args)\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance)\n            )\n        elif assets_def and assets_def.resource_defs:\n            for key in sorted(list(assets_def.resource_defs.keys())):\n                if key in self._resource_defs:\n                    raise DagsterInvalidInvocationError(\n                        f"Error when invoking {assets_def!s} resource '{key}' "\n                        "provided on both the definition and invocation context. Please "\n                        "provide on only one or the other."\n                    )\n            resource_defs = wrap_resources_for_execution(\n                {**self._resource_defs, **assets_def.resource_defs}\n            )\n            # add new resources context to the stack to be cleared on exit\n            resources = self._exit_stack.enter_context(\n                build_resources(resource_defs, self.instance, self._resources_config)\n            )\n        else:\n            resources = self.resources\n            resource_defs = self._resource_defs\n\n        _validate_resource_requirements(resource_defs, op_def)\n\n        if self.op_config and config_from_args:\n            raise DagsterInvalidInvocationError("Cannot provide config in both context and kwargs")\n        op_config = resolve_bound_config(config_from_args or self.op_config, op_def)\n\n        return BoundOpExecutionContext(\n            op_def=op_def,\n            op_config=op_config,\n            resources=resources,\n            resources_config=self._resources_config,\n            instance=self.instance,\n            log_manager=self.log,\n            pdb=self.pdb,\n            tags=(\n                pending_invocation.tags\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n          
  ),\n            hook_defs=(\n                pending_invocation.hook_defs\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            alias=(\n                pending_invocation.given_alias\n                if isinstance(pending_invocation, PendingNodeInvocation)\n                else None\n            ),\n            user_events=self._user_events,\n            output_metadata=self._output_metadata,\n            mapping_key=self._mapping_key,\n            partition_key=self._partition_key,\n            partition_key_range=self._partition_key_range,\n            assets_def=assets_def,\n        )\n\n    def get_events(self) -> Sequence[UserEvent]:\n        """Retrieve the list of user-generated events that were logged via the context.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import op, build_op_context, AssetMaterialization, ExpectationResult\n\n            @op\n            def my_op(context):\n                ...\n\n            def test_my_op():\n                context = build_op_context()\n                my_op(context)\n                all_user_events = context.get_events()\n                materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n                expectation_results = [event for event in all_user_events if isinstance(event, ExpectationResult)]\n                ...\n        """\n        return self._user_events\n\n    def get_output_metadata(\n        self, output_name: str, mapping_key: Optional[str] = None\n    ) -> Optional[Mapping[str, Any]]:\n        """Retrieve metadata that was logged for an output and mapping_key, if it exists.\n\n        If metadata cannot be found for the particular output_name/mapping_key combination, None will be returned.\n\n        Args:\n            output_name (str): The name of the output to retrieve logged metadata for.\n            mapping_key 
(Optional[str]): The mapping key to retrieve metadata for (only applies when using dynamic outputs).\n\n        Returns:\n            Optional[Mapping[str, Any]]: The metadata values present for the output_name/mapping_key combination, if present.\n        """\n        metadata = self._output_metadata.get(output_name)\n        if mapping_key and metadata:\n            return metadata.get(mapping_key)\n        return metadata\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n\ndef _validate_resource_requirements(\n    resource_defs: Mapping[str, ResourceDefinition], op_def: OpDefinition\n) -> None:\n    """Validate correctness of resources against required resource keys."""\n    if cast(DecoratedOpFunction, op_def.compute_fn).has_context_arg():\n        for requirement in op_def.get_resource_requirements():\n            if not requirement.is_io_manager_requirement:\n                ensure_requirements_satisfied(resource_defs, [requirement])\n\n\nclass BoundOpExecutionContext(OpExecutionContext):\n    """The op execution context that is passed to the compute function during invocation.\n\n    This context is bound to a specific op definition, for which the resources and config have\n    been validated.\n    """\n\n    _op_def: OpDefinition\n    _op_config: Any\n    _resources: "Resources"\n    _resources_config: Mapping[str, Any]\n    _instance: DagsterInstance\n    _log_manager: DagsterLogManager\n    _pdb: Optional[ForkedPdb]\n    _tags: Mapping[str, str]\n    _hook_defs: Optional[AbstractSet[HookDefinition]]\n    _alias: str\n    _user_events: List[UserEvent]\n    _seen_outputs: Dict[str, Union[str, Set[str]]]\n    _output_metadata: Dict[str, Any]\n    _mapping_key: Optional[str]\n    _partition_key: Optional[str]\n    _partition_key_range: Optional[PartitionKeyRange]\n    _assets_def: Optional[AssetsDefinition]\n\n    def __init__(\n        self,\n        op_def: OpDefinition,\n        op_config: Any,\n        resources: 
"Resources",\n        resources_config: Mapping[str, Any],\n        instance: DagsterInstance,\n        log_manager: DagsterLogManager,\n        pdb: Optional[ForkedPdb],\n        tags: Optional[Mapping[str, str]],\n        hook_defs: Optional[AbstractSet[HookDefinition]],\n        alias: Optional[str],\n        user_events: List[UserEvent],\n        output_metadata: Dict[str, Any],\n        mapping_key: Optional[str],\n        partition_key: Optional[str],\n        partition_key_range: Optional[PartitionKeyRange],\n        assets_def: Optional[AssetsDefinition],\n    ):\n        self._op_def = op_def\n        self._op_config = op_config\n        self._resources = resources\n        self._instance = instance\n        self._log = log_manager\n        self._pdb = pdb\n        self._tags = merge_dicts(self._op_def.tags, tags) if tags else self._op_def.tags\n        self._hook_defs = hook_defs\n        self._alias = alias if alias else self._op_def.name\n        self._resources_config = resources_config\n        self._user_events = user_events\n        self._seen_outputs = {}\n        self._output_metadata = output_metadata\n        self._mapping_key = mapping_key\n        self._partition_key = partition_key\n        self._partition_key_range = partition_key_range\n        self._assets_def = assets_def\n        self._requires_typed_event_stream = False\n        self._typed_event_stream_error_message = None\n\n    @property\n    def op_config(self) -> Any:\n        return self._op_config\n\n    @property\n    def resources(self) -> Resources:\n        return self._resources\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        raise DagsterInvalidPropertyError(_property_msg("pipeline_run", "property"))\n\n    @property\n    def instance(self) -> DagsterInstance:\n        return self._instance\n\n    @property\n    def pdb(self) -> ForkedPdb:\n        """dagster.utils.forked_pdb.ForkedPdb: Gives access to pdb debugging from within the solid.\n\n        
Example:\n        .. code-block:: python\n\n            @solid\n            def debug_solid(context):\n                context.pdb.set_trace()\n\n        """\n        if self._pdb is None:\n            self._pdb = ForkedPdb()\n\n        return self._pdb\n\n    @property\n    def step_launcher(self) -> Optional[StepLauncher]:\n        raise DagsterInvalidPropertyError(_property_msg("step_launcher", "property"))\n\n    @property\n    def run_id(self) -> str:\n        """str: Hard-coded value to indicate that we are directly invoking solid."""\n        return "EPHEMERAL"\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        run_config: Dict[str, object] = {}\n        if self._op_config:\n            run_config["ops"] = {self._op_def.name: {"config": self._op_config}}\n        run_config["resources"] = self._resources_config\n        return run_config\n\n    @property\n    def job_def(self) -> JobDefinition:\n        raise DagsterInvalidPropertyError(_property_msg("job_def", "property"))\n\n    @property\n    def job_name(self) -> str:\n        raise DagsterInvalidPropertyError(_property_msg("job_name", "property"))\n\n    @property\n    def log(self) -> DagsterLogManager:\n        """DagsterLogManager: A console manager constructed for this context."""\n        return self._log\n\n    @property\n    def node_handle(self) -> NodeHandle:\n        raise DagsterInvalidPropertyError(_property_msg("node_handle", "property"))\n\n    @property\n    def op(self) -> Node:\n        raise DagsterInvalidPropertyError(_property_msg("op", "property"))\n\n    @property\n    def op_def(self) -> OpDefinition:\n        return self._op_def\n\n    @property\n    def has_assets_def(self) -> bool:\n        return self._assets_def is not None\n\n    @property\n    def assets_def(self) -> AssetsDefinition:\n        if self._assets_def is None:\n            raise DagsterInvalidPropertyError(\n                f"Op {self.op_def.name} does not have an assets definition."\n  
          )\n        return self._assets_def\n\n    @property\n    def has_partition_key(self) -> bool:\n        return self._partition_key is not None\n\n    def has_tag(self, key: str) -> bool:\n        return key in self._tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        return self._tags.get(key)\n\n    @property\n    def alias(self) -> str:\n        return self._alias\n\n    def get_step_execution_context(self) -> StepExecutionContext:\n        raise DagsterInvalidPropertyError(_property_msg("get_step_execution_context", "methods"))\n\n    def for_type(self, dagster_type: DagsterType) -> TypeCheckContext:\n        resources = cast(NamedTuple, self.resources)\n        return TypeCheckContext(\n            self.run_id,\n            self.log,\n            ScopedResourcesBuilder(resources._asdict()),\n            dagster_type,\n        )\n\n    def get_mapping_key(self) -> Optional[str]:\n        return self._mapping_key\n\n    def describe_op(self) -> str:\n        if isinstance(self.op_def, OpDefinition):\n            return f'op "{self.op_def.name}"'\n\n        return f'solid "{self.op_def.name}"'\n\n    def log_event(self, event: UserEvent) -> None:\n        check.inst_param(\n            event,\n            "event",\n            (AssetMaterialization, AssetObservation, ExpectationResult),\n        )\n        self._user_events.append(event)\n\n    def observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n        if mapping_key:\n            if output_name not in self._seen_outputs:\n                self._seen_outputs[output_name] = set()\n            cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n        else:\n            self._seen_outputs[output_name] = "seen"\n\n    def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n        if mapping_key:\n            return (\n                output_name in self._seen_outputs and mapping_key in 
self._seen_outputs[output_name]\n            )\n        return output_name in self._seen_outputs\n\n    @property\n    def partition_key(self) -> str:\n        if self._partition_key is not None:\n            return self._partition_key\n        check.failed("Tried to access partition_key for a non-partitioned asset")\n\n    @property\n    def partition_key_range(self) -> PartitionKeyRange:\n        """The range of partition keys for the current run.\n\n        If run is for a single partition key, return a `PartitionKeyRange` with the same start and\n        end. Raises an error if the current run is not a partitioned run.\n        """\n        if self._partition_key_range:\n            return self._partition_key_range\n        elif self._partition_key:\n            return PartitionKeyRange(self._partition_key, self._partition_key)\n        else:\n            check.failed("Tried to access partition_key range for a non-partitioned run")\n\n    def asset_partition_key_for_output(self, output_name: str = "result") -> str:\n        return self.partition_key\n\n    def asset_partitions_time_window_for_output(self, output_name: str = "result") -> TimeWindow:\n        partitions_def = self.assets_def.partitions_def\n        if partitions_def is None:\n            check.failed("Tried to access partition_key for a non-partitioned asset")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        return cast(\n            Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n        ).time_window_for_partition_key(self.partition_key)\n\n    def add_output_metadata(\n        self,\n        metadata: Mapping[str, Any],\n        output_name: Optional[str] = 
None,\n        mapping_key: Optional[str] = None,\n    ) -> None:\n        """Add metadata to one of the outputs of an op.\n\n        This can only be used once per output in the body of an op. Using this method with the same output_name more than once within an op will result in an error.\n\n        Args:\n            metadata (Mapping[str, Any]): The metadata to attach to the output\n            output_name (Optional[str]): The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.\n\n        **Examples:**\n\n        .. code-block:: python\n\n            from dagster import Out, op\n            from typing import Tuple\n\n            @op\n            def add_metadata(context):\n                context.add_output_metadata({"foo", "bar"})\n                return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n            @op(out={"a": Out(), "b": Out()})\n            def add_metadata_two_outputs(context) -> Tuple[str, int]:\n                context.add_output_metadata({"foo": "bar"}, output_name="b")\n                context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n                return ("dog", 5)\n\n        """\n        metadata = check.mapping_param(metadata, "metadata", key_type=str)\n        output_name = check.opt_str_param(output_name, "output_name")\n        mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n\n        if output_name is None and len(self.op_def.output_defs) == 1:\n            output_def = self.op_def.output_defs[0]\n            output_name = output_def.name\n        elif output_name is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to log metadata without providing output_name, but multiple outputs"\n                " exist. 
Please provide an output_name to the invocation of"\n                " `context.add_output_metadata`."\n            )\n        else:\n            output_def = self.op_def.output_def_named(output_name)\n\n        if self.has_seen_output(output_name, mapping_key):\n            output_desc = (\n                f"output '{output_def.name}'"\n                if not mapping_key\n                else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n            )\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log output"\n                f" metadata for {output_desc} which has already been yielded. Metadata must be"\n                " logged before the output is yielded."\n            )\n        if output_def.is_dynamic and not mapping_key:\n            raise DagsterInvariantViolationError(\n                f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log metadata"\n                f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n                " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n            )\n\n        output_name = output_def.name\n        if output_name in self._output_metadata:\n            if not mapping_key or mapping_key in self._output_metadata[output_name]:\n                raise DagsterInvariantViolationError(\n                    f"In {self.op_def.node_type_str} '{self.op_def.name}', attempted to log"\n                    f" metadata for output '{output_name}' more than once."\n                )\n        if mapping_key:\n            if output_name not in self._output_metadata:\n                self._output_metadata[output_name] = {}\n            self._output_metadata[output_name][mapping_key] = metadata\n\n        else:\n            self._output_metadata[output_name] = metadata\n\n    # In this mode no conversion is done on returned values and missing but expected outputs are not\n    # allowed.\n    @property\n    def requires_typed_event_stream(self) -> bool:\n        return self._requires_typed_event_stream\n\n    @property\n    def typed_event_stream_error_message(self) -> Optional[str]:\n        return self._typed_event_stream_error_message\n\n    def set_requires_typed_event_stream(self, *, error_message: Optional[str]) -> None:\n        self._requires_typed_event_stream = True\n        self._typed_event_stream_error_message = error_message\n\n\n
[docs]def build_op_context(\n resources: Optional[Mapping[str, Any]] = None,\n op_config: Any = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n config: Any = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n mapping_key: Optional[str] = None,\n _assets_def: Optional[AssetsDefinition] = None,\n) -> UnboundOpExecutionContext:\n """Builds op execution context from provided parameters.\n\n ``build_op_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_op_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking a op.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n op_config (Optional[Mapping[str, Any]]): The config to provide to the op.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n mapping_key (Optional[str]): A key representing the mapping key from an upstream dynamic\n output. Can be accessed using ``context.get_mapping_key()``.\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n _assets_def (Optional[AssetsDefinition]): Internal argument that populates the op's assets\n definition, not meant to be populated by users.\n\n Examples:\n .. 
code-block:: python\n\n context = build_op_context()\n op_to_invoke(context)\n\n with build_op_context(resources={"foo": context_manager_resource}) as context:\n op_to_invoke(context)\n """\n if op_config and config:\n raise DagsterInvalidInvocationError(\n "Attempted to invoke ``build_op_context`` with both ``op_config``, and its "\n "legacy version, ``config``. Please provide one or the other."\n )\n\n op_config = op_config if op_config else config\n return UnboundOpExecutionContext(\n resources_dict=check.opt_mapping_param(resources, "resources", key_type=str),\n resources_config=check.opt_mapping_param(\n resources_config, "resources_config", key_type=str\n ),\n op_config=op_config,\n instance=check.opt_inst_param(instance, "instance", DagsterInstance),\n partition_key=check.opt_str_param(partition_key, "partition_key"),\n partition_key_range=check.opt_inst_param(\n partition_key_range, "partition_key_range", PartitionKeyRange\n ),\n mapping_key=check.opt_str_param(mapping_key, "mapping_key"),\n assets_def=check.opt_inst_param(_assets_def, "_assets_def", AssetsDefinition),\n )
\n\n\n
[docs]def build_asset_context(\n resources: Optional[Mapping[str, Any]] = None,\n resources_config: Optional[Mapping[str, Any]] = None,\n asset_config: Optional[Mapping[str, Any]] = None,\n instance: Optional[DagsterInstance] = None,\n partition_key: Optional[str] = None,\n partition_key_range: Optional[PartitionKeyRange] = None,\n):\n """Builds asset execution context from provided parameters.\n\n ``build_asset_context`` can be used as either a function or context manager. If there is a\n provided resource that is a context manager, then ``build_asset_context`` must be used as a\n context manager. This function can be used to provide the context argument when directly\n invoking an asset.\n\n Args:\n resources (Optional[Dict[str, Any]]): The resources to provide to the context. These can be\n either values or resource definitions.\n resources_config (Optional[Mapping[str, Any]]): The config to provide to the resources.\n asset_config (Optional[Mapping[str, Any]]): The config to provide to the asset.\n instance (Optional[DagsterInstance]): The dagster instance configured for the context.\n Defaults to DagsterInstance.ephemeral().\n partition_key (Optional[str]): String value representing partition key to execute with.\n partition_key_range (Optional[PartitionKeyRange]): Partition key range to execute with.\n\n Examples:\n .. code-block:: python\n\n context = build_asset_context()\n asset_to_invoke(context)\n\n with build_asset_context(resources={"foo": context_manager_resource}) as context:\n asset_to_invoke(context)\n """\n return build_op_context(\n op_config=asset_config,\n resources=resources,\n resources_config=resources_config,\n partition_key=partition_key,\n partition_key_range=partition_key_range,\n instance=instance,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/invocation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.invocation"}, "logger": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.logger

\nfrom typing import Any, Optional\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.logger_definition import LoggerDefinition\nfrom dagster._core.errors import DagsterInvariantViolationError\n\nfrom .output import RUN_ID_PLACEHOLDER\n\n\n
[docs]class InitLoggerContext:\n """The context object available as the argument to the initialization function of a :py:class:`dagster.LoggerDefinition`.\n\n Users should not instantiate this object directly. To construct an\n `InitLoggerContext` for testing purposes, use :py:func:`dagster.\n build_init_logger_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import logger, InitLoggerContext\n\n @logger\n def hello_world(init_context: InitLoggerContext):\n ...\n\n """\n\n def __init__(\n self,\n logger_config: Any,\n logger_def: Optional[LoggerDefinition] = None,\n job_def: Optional[JobDefinition] = None,\n run_id: Optional[str] = None,\n ):\n self._logger_config = logger_config\n self._job_def = check.opt_inst_param(job_def, "job_def", JobDefinition)\n self._logger_def = check.opt_inst_param(logger_def, "logger_def", LoggerDefinition)\n self._run_id = check.opt_str_param(run_id, "run_id")\n\n @public\n @property\n def logger_config(self) -> Any:\n """The configuration data provided by the run config. The\n schema for this data is defined by ``config_schema`` on the :py:class:`LoggerDefinition`.\n """\n return self._logger_config\n\n @property\n def job_def(self) -> Optional[JobDefinition]:\n """The job definition currently being executed."""\n return self._job_def\n\n @public\n @property\n def logger_def(self) -> Optional[LoggerDefinition]:\n """The logger definition for the logger being constructed."""\n return self._logger_def\n\n @public\n @property\n def run_id(self) -> Optional[str]:\n """The ID for this run of the job."""\n return self._run_id
\n\n\nclass UnboundInitLoggerContext(InitLoggerContext):\n """Logger initialization context outputted by ``build_init_logger_context``.\n\n Represents a context whose config has not yet been validated against a logger definition, hence\n the inability to access the `logger_def` attribute. When an instance of\n ``UnboundInitLoggerContext`` is passed to ``LoggerDefinition.initialize``, config is validated,\n and it is subsumed into an `InitLoggerContext`, which contains the logger_def validated against.\n """\n\n def __init__(self, logger_config: Any, job_def: Optional[JobDefinition]):\n super(UnboundInitLoggerContext, self).__init__(\n logger_config, logger_def=None, job_def=job_def, run_id=None\n )\n\n @property\n def logger_def(self) -> LoggerDefinition:\n raise DagsterInvariantViolationError(\n "UnboundInitLoggerContext has not been validated against a logger definition."\n )\n\n @property\n def run_id(self) -> Optional[str]:\n return RUN_ID_PLACEHOLDER\n
", "current_page_name": "_modules/dagster/_core/execution/context/logger", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.logger"}, "output": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.output

\nimport warnings\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.asset_layer import AssetOutputInfo\nfrom dagster._core.definitions.events import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKey,\n)\nfrom dagster._core.definitions.metadata import (\n    ArbitraryMetadataMapping,\n    MetadataValue,\n    RawMetadataValue,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.errors import DagsterInvalidMetadata, DagsterInvariantViolationError\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\n\nif TYPE_CHECKING:\n    from dagster._core.definitions import JobDefinition, PartitionsDefinition\n    from dagster._core.definitions.op_definition import OpDefinition\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import StepExecutionContext\n    from dagster._core.execution.plan.outputs import StepOutputHandle\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.log_manager import DagsterLogManager\n    from dagster._core.system_config.objects import ResolvedRunConfig\n    from dagster._core.types.dagster_type import DagsterType\n\nRUN_ID_PLACEHOLDER = "__EPHEMERAL_RUN_ID"\n\n\n
[docs]class OutputContext:\n """The context object that is available to the `handle_output` method of an :py:class:`IOManager`.\n\n Users should not instantiate this object directly. To construct an\n `OutputContext` for testing an IO Manager's `handle_output` method, use\n :py:func:`dagster.build_output_context`.\n\n Example:\n .. code-block:: python\n\n from dagster import IOManager, OutputContext\n\n class MyIOManager(IOManager):\n def handle_output(self, context: OutputContext, obj):\n ...\n """\n\n _step_key: Optional[str]\n _name: Optional[str]\n _job_name: Optional[str]\n _run_id: Optional[str]\n _metadata: ArbitraryMetadataMapping\n _user_generated_metadata: Mapping[str, MetadataValue]\n _mapping_key: Optional[str]\n _config: object\n _op_def: Optional["OpDefinition"]\n _dagster_type: Optional["DagsterType"]\n _log: Optional["DagsterLogManager"]\n _version: Optional[str]\n _resource_config: Optional[Mapping[str, object]]\n _step_context: Optional["StepExecutionContext"]\n _asset_info: Optional[AssetOutputInfo]\n _warn_on_step_context_use: bool\n _resources: Optional["Resources"]\n _resources_cm: Optional[ContextManager["Resources"]]\n _resources_contain_cm: Optional[bool]\n _cm_scope_entered: Optional[bool]\n _events: List["DagsterEvent"]\n _user_events: List[Union[AssetMaterialization, AssetObservation]]\n\n def __init__(\n self,\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n metadata: Optional[ArbitraryMetadataMapping] = None,\n mapping_key: Optional[str] = None,\n config: object = None,\n dagster_type: Optional["DagsterType"] = None,\n log_manager: Optional["DagsterLogManager"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Union["Resources", Mapping[str, object]]] = None,\n step_context: Optional["StepExecutionContext"] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_info: 
Optional[AssetOutputInfo] = None,\n warn_on_step_context_use: bool = False,\n partition_key: Optional[str] = None,\n ):\n from dagster._core.definitions.resource_definition import IContainsGenerator, Resources\n from dagster._core.execution.build_resources import build_resources\n\n self._step_key = step_key\n self._name = name\n self._job_name = job_name\n self._run_id = run_id\n self._metadata = metadata or {}\n self._mapping_key = mapping_key\n self._config = config\n self._op_def = op_def\n self._dagster_type = dagster_type\n self._log = log_manager\n self._version = version\n self._resource_config = resource_config\n self._step_context = step_context\n self._asset_info = asset_info\n self._warn_on_step_context_use = warn_on_step_context_use\n if self._step_context and self._step_context.has_partition_key:\n self._partition_key: Optional[str] = self._step_context.partition_key\n else:\n self._partition_key = partition_key\n\n if isinstance(resources, Resources):\n self._resources_cm = None\n self._resources = resources\n else:\n self._resources_cm = build_resources(\n check.opt_mapping_param(resources, "resources", key_type=str)\n )\n self._resources = self._resources_cm.__enter__()\n self._resources_contain_cm = isinstance(self._resources, IContainsGenerator)\n self._cm_scope_entered = False\n\n self._events = []\n self._user_events = []\n self._user_generated_metadata = {}\n\n def __enter__(self):\n if self._resources_cm:\n self._cm_scope_entered = True\n return self\n\n def __exit__(self, *exc):\n if self._resources_cm:\n self._resources_cm.__exit__(*exc)\n\n def __del__(self):\n if (\n hasattr(self, "_resources_cm")\n and self._resources_cm\n and self._resources_contain_cm\n and not self._cm_scope_entered\n ):\n self._resources_cm.__exit__(None, None, None)\n\n @public\n @property\n def step_key(self) -> str:\n """The step_key for the compute step that produced the output."""\n if self._step_key is None:\n raise DagsterInvariantViolationError(\n "Attempting 
to access step_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._step_key\n\n @public\n @property\n def name(self) -> str:\n """The name of the output that produced the output."""\n if self._name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._name\n\n @property\n def job_name(self) -> str:\n if self._job_name is None:\n raise DagsterInvariantViolationError(\n "Attempting to access pipeline_name, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._job_name\n\n @public\n @property\n def run_id(self) -> str:\n """The id of the run that produced the output."""\n if self._run_id is None:\n raise DagsterInvariantViolationError(\n "Attempting to access run_id, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._run_id\n\n @public\n @property\n def metadata(self) -> Optional[ArbitraryMetadataMapping]:\n """A dict of the metadata that is assigned to the OutputDefinition that produced\n the output.\n """\n return self._metadata\n\n @public\n @property\n def mapping_key(self) -> Optional[str]:\n """The key that identifies a unique mapped output. 
None for regular outputs."""\n return self._mapping_key\n\n @public\n @property\n def config(self) -> Any:\n """The configuration for the output."""\n return self._config\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The definition of the op that produced the output."""\n from dagster._core.definitions import OpDefinition\n\n if self._op_def is None:\n raise DagsterInvariantViolationError(\n "Attempting to access op_def, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return cast(OpDefinition, self._op_def)\n\n @public\n @property\n def dagster_type(self) -> "DagsterType":\n """The type of this output."""\n if self._dagster_type is None:\n raise DagsterInvariantViolationError(\n "Attempting to access dagster_type, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._dagster_type\n\n @public\n @property\n def log(self) -> "DagsterLogManager":\n """The log manager to use for this output."""\n if self._log is None:\n raise DagsterInvariantViolationError(\n "Attempting to access log, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._log\n\n @public\n @property\n def version(self) -> Optional[str]:\n """(Experimental) The version of the output."""\n return self._version\n\n @public\n @property\n def resource_config(self) -> Optional[Mapping[str, object]]:\n """The config associated with the resource that initializes the InputManager."""\n return self._resource_config\n\n @public\n @property\n def resources(self) -> Any:\n """The resources required by the output manager, specified by the `required_resource_keys`\n parameter.\n """\n if self._resources is None:\n raise DagsterInvariantViolationError(\n "Attempting to access resources, "\n "but it was not provided when constructing the OutputContext"\n )\n\n if self._resources_cm and self._resources_contain_cm and not self._cm_scope_entered:\n raise DagsterInvariantViolationError(\n "At least one provided 
resource is a generator, but attempting to access "\n "resources outside of context manager scope. You can use the following syntax to "\n "open a context manager: `with build_output_context(...) as context:`"\n )\n return self._resources\n\n @property\n def asset_info(self) -> Optional[AssetOutputInfo]:\n """(Experimental) Asset info corresponding to the output."""\n return self._asset_info\n\n @public\n @property\n def has_asset_key(self) -> bool:\n """Returns True if an asset is being stored, otherwise returns False. A return value of False\n indicates that an output from an op is being stored.\n """\n return self._asset_info is not None\n\n @public\n @property\n def asset_key(self) -> AssetKey:\n """The ``AssetKey`` of the asset that is being stored as an output."""\n if self._asset_info is None:\n raise DagsterInvariantViolationError(\n "Attempting to access asset_key, "\n "but it was not provided when constructing the OutputContext"\n )\n\n return self._asset_info.key\n\n @public\n @property\n def asset_partitions_def(self) -> "PartitionsDefinition":\n """The PartitionsDefinition on the asset corresponding to this output."""\n asset_key = self.asset_key\n result = self.step_context.job_def.asset_layer.partitions_def_for_asset(asset_key)\n if result is None:\n raise DagsterInvariantViolationError(\n f"Attempting to access partitions def for asset {asset_key}, but it is not"\n " partitioned"\n )\n\n return result\n\n @property\n def step_context(self) -> "StepExecutionContext":\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.step_context"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is None:\n raise DagsterInvariantViolationError(\n "Attempting to access step_context, "\n "but it was not provided when constructing the 
OutputContext"\n )\n\n return self._step_context\n\n @public\n @property\n def has_partition_key(self) -> bool:\n """Whether the current run is a partitioned run."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self._partition_key is not None\n\n @public\n @property\n def partition_key(self) -> str:\n """The partition key for the current run.\n\n Raises an error if the current run is not a partitioned run.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._partition_key is None:\n check.failed(\n "Tried to access partition_key on a non-partitioned run.",\n )\n\n return self._partition_key\n\n @public\n @property\n def has_asset_partitions(self) -> bool:\n """Returns True if the asset being stored is partitioned."""\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.has_asset_partitions"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n if self._step_context is not None:\n return self._step_context.has_asset_partitions_for_output(self.name)\n else:\n return False\n\n @public\n @property\n def asset_partition_key(self) -> str:\n """The partition key for output asset.\n\n Raises an error if the output asset has no partitioning, or if the run covers a partition\n range for the output 
asset.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_for_output(self.name)\n\n @public\n @property\n def asset_partition_key_range(self) -> PartitionKeyRange:\n """The partition key range for output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_key_range"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partition_key_range_for_output(self.name)\n\n @public\n @property\n def asset_partition_keys(self) -> Sequence[str]:\n """The partition keys for the output asset.\n\n Raises an error if the output asset has no partitioning.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partition_keys"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.asset_partitions_def.get_partition_keys_in_range(\n self.step_context.asset_partition_key_range_for_output(self.name),\n dynamic_partitions_store=self.step_context.instance,\n )\n\n @public\n @property\n def asset_partitions_time_window(self) -> TimeWindow:\n """The time window for the partitions of the output asset.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset 
is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n if self._warn_on_step_context_use:\n warnings.warn(\n "You are using InputContext.upstream_output.asset_partitions_time_window"\n "This use on upstream_output is deprecated and will fail in the future"\n "Try to obtain what you need directly from InputContext"\n "For more details: https://github.com/dagster-io/dagster/issues/7900"\n )\n\n return self.step_context.asset_partitions_time_window_for_output(self.name)\n\n def get_run_scoped_output_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n The unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. run id, step key, and output name\n """\n warnings.warn(\n "`OutputContext.get_run_scoped_output_identifier` is deprecated. 
Use "\n "`OutputContext.get_identifier` instead."\n )\n # if run_id is None and this is a re-execution, it means we failed to find its source run id\n check.invariant(\n self.run_id is not None,\n "Unable to find the run scoped output identifier: run_id is None on OutputContext.",\n )\n check.invariant(\n self.step_key is not None,\n "Unable to find the run scoped output identifier: step_key is None on OutputContext.",\n )\n check.invariant(\n self.name is not None,\n "Unable to find the run scoped output identifier: name is None on OutputContext.",\n )\n run_id = cast(str, self.run_id)\n step_key = cast(str, self.step_key)\n name = cast(str, self.name)\n\n if self.mapping_key:\n return [run_id, step_key, name, self.mapping_key]\n\n return [run_id, step_key, name]\n\n
[docs] @public\n def get_identifier(self) -> Sequence[str]:\n """Utility method to get a collection of identifiers that as a whole represent a unique\n step output.\n\n If not using memoization, the unique identifier collection consists of\n\n - ``run_id``: the id of the run which generates the output.\n Note: This method also handles the re-execution memoization logic. If the step that\n generates the output is skipped in the re-execution, the ``run_id`` will be the id\n of its parent run.\n - ``step_key``: the key for a compute step.\n - ``name``: the name of the output. (default: 'result').\n\n If using memoization, the ``version`` corresponding to the step output is used in place of\n the ``run_id``.\n\n Returns:\n Sequence[str, ...]: A list of identifiers, i.e. (run_id or version), step_key, and output_name\n """\n version = self.version\n step_key = self.step_key\n name = self.name\n if version is not None:\n check.invariant(\n self.mapping_key is None,\n f"Mapping key and version both provided for output '{name}' of step"\n f" '{step_key}'. Dynamic mapping is not supported when using versioning.",\n )\n identifier = ["versioned_outputs", version, step_key, name]\n else:\n run_id = self.run_id\n identifier = [run_id, step_key, name]\n if self.mapping_key:\n identifier.append(self.mapping_key)\n\n return identifier
\n\n def get_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_output_identifier` is deprecated. Use "\n "`OutputContext.get_identifier` instead."\n )\n\n return self.get_identifier()\n\n
[docs] @public\n def get_asset_identifier(self) -> Sequence[str]:\n """The sequence of strings making up the AssetKey for the asset being stored as an output.\n If the asset is partitioned, the identifier contains the partition key as the final element in the\n sequence. For example, for the asset key ``AssetKey(["foo", "bar", "baz"])`` materialized with\n partition key "2023-06-01", ``get_asset_identifier`` will return ``["foo", "bar", "baz", "2023-06-01"]``.\n """\n if self.asset_key is not None:\n if self.has_asset_partitions:\n return [*self.asset_key.path, self.asset_partition_key]\n else:\n return self.asset_key.path\n else:\n check.failed("Can't get asset output identifier for an output with no asset key")
\n\n def get_asset_output_identifier(self) -> Sequence[str]:\n warnings.warn(\n "`OutputContext.get_asset_output_identifier` is deprecated. Use "\n "`OutputContext.get_asset_identifier` instead."\n )\n\n return self.get_asset_identifier()\n\n
[docs] @public\n def log_event(self, event: Union[AssetObservation, AssetMaterialization]) -> None:\n """Log an AssetMaterialization or AssetObservation from within the body of an io manager's `handle_output` method.\n\n Events logged with this method will appear in the event log.\n\n Args:\n event (Union[AssetMaterialization, AssetObservation]): The event to log.\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.log_event(AssetMaterialization("foo"))\n """\n from dagster._core.events import DagsterEvent\n\n if isinstance(event, (AssetMaterialization)):\n if self._step_context:\n self._events.append(DagsterEvent.asset_materialization(self._step_context, event))\n self._user_events.append(event)\n elif isinstance(event, AssetObservation):\n if self._step_context:\n self._events.append(DagsterEvent.asset_observation(self._step_context, event))\n self._user_events.append(event)\n else:\n check.failed(f"Unexpected event {event}")
\n\n def consume_events(self) -> Iterator["DagsterEvent"]:\n """Pops and yields all user-generated events that have been recorded from this context.\n\n If consume_events has not yet been called, this will yield all logged events since the call to `handle_output`. If consume_events has been called, it will yield all events since the last time consume_events was called. Designed for internal use. Users should never need to invoke this method.\n """\n events = self._events\n self._events = []\n yield from events\n\n def get_logged_events(\n self,\n ) -> Sequence[Union[AssetMaterialization, AssetObservation]]:\n """Retrieve the list of user-generated events that were logged via the context.\n\n\n User-generated events that were yielded will not appear in this list.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import IOManager, build_output_context, AssetMaterialization\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n ...\n\n def test_handle_output():\n mgr = MyIOManager()\n context = build_output_context()\n mgr.handle_output(context)\n all_user_events = context.get_logged_events()\n materializations = [event for event in all_user_events if isinstance(event, AssetMaterialization)]\n ...\n """\n return self._user_events\n\n
[docs] @public\n def add_output_metadata(self, metadata: Mapping[str, RawMetadataValue]) -> None:\n """Add a dictionary of metadata to the handled output.\n\n Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.\n\n Args:\n metadata (Mapping[str, RawMetadataValue]): A metadata dictionary to log\n\n Examples:\n .. code-block:: python\n\n from dagster import IOManager\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n context.add_output_metadata({"foo": "bar"})\n """\n from dagster._core.definitions.metadata import normalize_metadata\n\n overlapping_labels = set(self._user_generated_metadata.keys()) & metadata.keys()\n if overlapping_labels:\n raise DagsterInvalidMetadata(\n f"Tried to add metadata for key(s) that already have metadata: {overlapping_labels}"\n )\n\n self._user_generated_metadata = {\n **self._user_generated_metadata,\n **normalize_metadata(metadata),\n }
\n\n def get_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Get the mapping of metadata entries that have been logged for use with this output."""\n return self._user_generated_metadata\n\n def consume_logged_metadata(\n self,\n ) -> Mapping[str, MetadataValue]:\n """Pops and yields all user-generated metadata entries that have been recorded from this context.\n\n If consume_logged_metadata has not yet been called, this will yield all logged events since\n the call to `handle_output`. If consume_logged_metadata has been called, it will yield all\n events since the last time consume_logged_metadata_entries was called. Designed for internal\n use. Users should never need to invoke this method.\n """\n result = self._user_generated_metadata\n self._user_generated_metadata = {}\n return result or {}
\n\n\ndef get_output_context(\n execution_plan: "ExecutionPlan",\n job_def: "JobDefinition",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n run_id: Optional[str],\n log_manager: Optional["DagsterLogManager"],\n step_context: Optional["StepExecutionContext"],\n resources: Optional["Resources"],\n version: Optional[str],\n warn_on_step_context_use: bool = False,\n) -> "OutputContext":\n """Args:\n run_id (str): The run ID of the run that produced the output, not necessarily the run that\n the context will be used in.\n """\n step = execution_plan.get_step_by_key(step_output_handle.step_key)\n # get config\n op_config = resolved_run_config.ops[step.node_handle.to_string()]\n outputs_config = op_config.outputs\n\n if outputs_config:\n output_config = outputs_config.get_output_manager_config(step_output_handle.output_name)\n else:\n output_config = None\n\n step_output = execution_plan.get_step_output(step_output_handle)\n output_def = job_def.get_node(step_output.node_handle).output_def_named(step_output.name)\n\n io_manager_key = output_def.io_manager_key\n resource_config = resolved_run_config.resources[io_manager_key].config\n\n node_handle = execution_plan.get_step_by_key(step.key).node_handle\n asset_info = job_def.asset_layer.asset_info_for_output(\n node_handle=node_handle, output_name=step_output.name\n )\n if asset_info is not None:\n metadata = job_def.asset_layer.metadata_for_asset(asset_info.key) or output_def.metadata\n else:\n metadata = output_def.metadata\n\n if step_context:\n check.invariant(\n not resources,\n "Expected either resources or step context to be set, but "\n "received both. 
If step context is provided, resources for IO manager will be "\n "retrieved off of that.",\n )\n resources = build_resources_for_manager(io_manager_key, step_context)\n\n return OutputContext(\n step_key=step_output_handle.step_key,\n name=step_output_handle.output_name,\n job_name=job_def.name,\n run_id=run_id,\n metadata=metadata,\n mapping_key=step_output_handle.mapping_key,\n config=output_config,\n op_def=job_def.get_node(step.node_handle).definition, # type: ignore # (should be OpDefinition not NodeDefinition)\n dagster_type=output_def.dagster_type,\n log_manager=log_manager,\n version=version,\n step_context=step_context,\n resource_config=resource_config,\n resources=resources,\n asset_info=asset_info,\n warn_on_step_context_use=warn_on_step_context_use,\n )\n\n\ndef step_output_version(\n job_def: "JobDefinition",\n execution_plan: "ExecutionPlan",\n resolved_run_config: "ResolvedRunConfig",\n step_output_handle: "StepOutputHandle",\n) -> Optional[str]:\n from dagster._core.execution.resolve_versions import resolve_step_output_versions\n\n step_output_versions = resolve_step_output_versions(\n job_def, execution_plan, resolved_run_config\n )\n return (\n step_output_versions[step_output_handle]\n if step_output_handle in step_output_versions\n else None\n )\n\n\n
[docs]def build_output_context(\n step_key: Optional[str] = None,\n name: Optional[str] = None,\n metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n run_id: Optional[str] = None,\n mapping_key: Optional[str] = None,\n config: Optional[Any] = None,\n dagster_type: Optional["DagsterType"] = None,\n version: Optional[str] = None,\n resource_config: Optional[Mapping[str, object]] = None,\n resources: Optional[Mapping[str, object]] = None,\n op_def: Optional["OpDefinition"] = None,\n asset_key: Optional[CoercibleToAssetKey] = None,\n partition_key: Optional[str] = None,\n) -> "OutputContext":\n """Builds output context from provided parameters.\n\n ``build_output_context`` can be used as either a function, or a context manager. If resources\n that are also context managers are provided, then ``build_output_context`` must be used as a\n context manager.\n\n Args:\n step_key (Optional[str]): The step_key for the compute step that produced the output.\n name (Optional[str]): The name of the output that produced the output.\n metadata (Optional[Mapping[str, Any]]): A dict of the metadata that is assigned to the\n OutputDefinition that produced the output.\n mapping_key (Optional[str]): The key that identifies a unique mapped output. None for regular outputs.\n config (Optional[Any]): The configuration for the output.\n dagster_type (Optional[DagsterType]): The type of this output.\n version (Optional[str]): (Experimental) The version of the output.\n resource_config (Optional[Mapping[str, Any]]): The resource config to make available from the\n input context. 
This usually corresponds to the config provided to the resource that\n loads the output manager.\n resources (Optional[Resources]): The resources to make available from the context.\n For a given key, you can provide either an actual instance of an object, or a resource\n definition.\n op_def (Optional[OpDefinition]): The definition of the op that produced the output.\n asset_key: Optional[Union[AssetKey, Sequence[str], str]]: The asset key corresponding to the\n output.\n partition_key: Optional[str]: String value representing partition key to execute with.\n\n Examples:\n .. code-block:: python\n\n build_output_context()\n\n with build_output_context(resources={"foo": context_manager_resource}) as context:\n do_something\n\n """\n from dagster._core.definitions import OpDefinition\n from dagster._core.execution.context_creation_job import initialize_console_manager\n from dagster._core.types.dagster_type import DagsterType\n\n step_key = check.opt_str_param(step_key, "step_key")\n name = check.opt_str_param(name, "name")\n metadata = check.opt_mapping_param(metadata, "metadata", key_type=str)\n run_id = check.opt_str_param(run_id, "run_id", default=RUN_ID_PLACEHOLDER)\n mapping_key = check.opt_str_param(mapping_key, "mapping_key")\n dagster_type = check.opt_inst_param(dagster_type, "dagster_type", DagsterType)\n version = check.opt_str_param(version, "version")\n resource_config = check.opt_mapping_param(resource_config, "resource_config", key_type=str)\n resources = check.opt_mapping_param(resources, "resources", key_type=str)\n op_def = check.opt_inst_param(op_def, "op_def", OpDefinition)\n asset_key = AssetKey.from_coercible(asset_key) if asset_key else None\n partition_key = check.opt_str_param(partition_key, "partition_key")\n\n return OutputContext(\n step_key=step_key,\n name=name,\n job_name=None,\n run_id=run_id,\n metadata=metadata,\n mapping_key=mapping_key,\n config=config,\n dagster_type=dagster_type,\n log_manager=initialize_console_manager(None),\n 
version=version,\n resource_config=resource_config,\n resources=resources,\n step_context=None,\n op_def=op_def,\n asset_info=AssetOutputInfo(key=asset_key) if asset_key else None,\n partition_key=partition_key,\n )
\n
", "current_page_name": "_modules/dagster/_core/execution/context/output", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.output"}, "system": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.context.system

\n"""This module contains the execution context objects that are internal to the system.\nNot every property on these should be exposed to random Jane or Joe dagster user\nso we have a different layer of objects that encode the explicit public API\nin the user_context module.\n"""\nfrom abc import ABC, abstractmethod\nfrom dataclasses import dataclass\nfrom hashlib import sha256\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.data_version import (\n    DATA_VERSION_TAG,\n    SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD,\n    extract_data_version_from_entry,\n)\nfrom dagster._core.definitions.dependency import OpNode\nfrom dagster._core.definitions.events import AssetKey, AssetLineageInfo\nfrom dagster._core.definitions.hook_definition import HookDefinition\nfrom dagster._core.definitions.job_base import IJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.multi_dimensional_partitions import MultiPartitionsDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.partition import PartitionsDefinition, PartitionsSubset\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.partition_mapping import (\n    PartitionMapping,\n    infer_partition_mapping,\n)\nfrom dagster._core.definitions.policy import RetryPolicy\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.definitions.time_window_partitions import (\n    TimeWindow,\n    TimeWindowPartitionsDefinition,\n    
has_one_dimension_time_window_partitioning,\n)\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.plan.handle import ResolvedFromDynamicStepHandle, StepHandle\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.io_manager import IOManager\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    MULTIDIMENSIONAL_PARTITION_PREFIX,\n    PARTITION_NAME_TAG,\n)\nfrom dagster._core.system_config.objects import ResolvedRunConfig\nfrom dagster._core.types.dagster_type import DagsterType\n\nfrom .input import InputContext\nfrom .output import OutputContext, get_output_context\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.data_version import (\n        DataVersion,\n    )\n    from dagster._core.definitions.dependency import NodeHandle\n    from dagster._core.definitions.resource_definition import Resources\n    from dagster._core.event_api import EventLogRecord\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.state import KnownExecutionState\n    from dagster._core.instance import DagsterInstance\n\n    from .hook import HookContext\n\n\ndef is_iterable(obj: Any) -> bool:\n    try:\n        iter(obj)\n    except:\n        return False\n    return True\n\n\nclass IPlanContext(ABC):\n    """Context interface to represent run information that does not require access to user code.\n\n    The information available via this interface is accessible to the system throughout a run.\n    """\n\n    @property\n    @abstractmethod\n    def plan_data(self) -> "PlanData":\n        raise 
NotImplementedError()\n\n    @property\n    def job(self) -> IJob:\n        return self.plan_data.job\n\n    @property\n    def dagster_run(self) -> DagsterRun:\n        return self.plan_data.dagster_run\n\n    @property\n    def run_id(self) -> str:\n        return self.dagster_run.run_id\n\n    @property\n    def run_config(self) -> Mapping[str, object]:\n        return self.dagster_run.run_config\n\n    @property\n    def job_name(self) -> str:\n        return self.dagster_run.job_name\n\n    @property\n    def instance(self) -> "DagsterInstance":\n        return self.plan_data.instance\n\n    @property\n    def raise_on_error(self) -> bool:\n        return self.plan_data.raise_on_error\n\n    @property\n    def retry_mode(self) -> RetryMode:\n        return self.plan_data.retry_mode\n\n    @property\n    def execution_plan(self) -> "ExecutionPlan":\n        return self.plan_data.execution_plan\n\n    @property\n    @abstractmethod\n    def output_capture(self) -> Optional[Mapping[StepOutputHandle, Any]]:\n        raise NotImplementedError()\n\n    @property\n    def log(self) -> DagsterLogManager:\n        raise NotImplementedError()\n\n    @property\n    def logging_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.all_tags()\n\n    @property\n    def event_tags(self) -> Mapping[str, str]:\n        return self.log.logging_metadata.event_tags()\n\n    def has_tag(self, key: str) -> bool:\n        check.str_param(key, "key")\n        return key in self.dagster_run.tags\n\n    def get_tag(self, key: str) -> Optional[str]:\n        check.str_param(key, "key")\n        return self.dagster_run.tags.get(key)\n\n    @property\n    def run_tags(self) -> Mapping[str, str]:\n        return self.dagster_run.tags\n\n\nclass PlanData(NamedTuple):\n    """The data about a run that is available during both orchestration and execution.\n\n    This object does not contain any information that requires access to user code, such as the\n    pipeline 
definition and resources.\n    """\n\n    job: IJob\n    dagster_run: DagsterRun\n    instance: "DagsterInstance"\n    execution_plan: "ExecutionPlan"\n    raise_on_error: bool = False\n    retry_mode: RetryMode = RetryMode.DISABLED\n\n\nclass ExecutionData(NamedTuple):\n    """The data that is available to the system during execution.\n\n    This object contains information that requires access to user code, such as the pipeline\n    definition and resources.\n    """\n\n    scoped_resources_builder: ScopedResourcesBuilder\n    resolved_run_config: ResolvedRunConfig\n    job_def: JobDefinition\n\n\nclass IStepContext(IPlanContext):\n    """Interface to represent data to be available during either step orchestration or execution."""\n\n    @property\n    @abstractmethod\n    def step(self) -> ExecutionStep:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def node_handle(self) -> "NodeHandle":\n        raise NotImplementedError()\n\n\nclass PlanOrchestrationContext(IPlanContext):\n    """Context for the orchestration of a run.\n\n    This context assumes inability to run user code directly.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n        resume_from_failure: bool = False,\n    ):\n        self._plan_data = plan_data\n        self._log_manager = log_manager\n        self._executor = executor\n        self._output_capture = output_capture\n        self._resume_from_failure = resume_from_failure\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def reconstructable_job(self) -> ReconstructableJob:\n        if not isinstance(self.job, ReconstructableJob):\n            raise DagsterInvariantViolationError(\n                "reconstructable_pipeline property must be a ReconstructableJob"\n            )\n        
return self.job\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def executor(self) -> Executor:\n        return self._executor\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(self, step: ExecutionStep) -> "IStepContext":\n        return StepOrchestrationContext(\n            plan_data=self.plan_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            executor=self.executor,\n            step=step,\n            output_capture=self.output_capture,\n        )\n\n    @property\n    def resume_from_failure(self) -> bool:\n        return self._resume_from_failure\n\n\nclass StepOrchestrationContext(PlanOrchestrationContext, IStepContext):\n    """Context for the orchestration of a step.\n\n    This context assumes inability to run user code directly. Thus, it does not include any resource\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        log_manager: DagsterLogManager,\n        executor: Executor,\n        step: ExecutionStep,\n        output_capture: Optional[Dict[StepOutputHandle, Any]],\n    ):\n        super(StepOrchestrationContext, self).__init__(\n            plan_data, log_manager, executor, output_capture\n        )\n        self._step = step\n\n    @property\n    def step(self) -> ExecutionStep:\n        return self._step\n\n    @property\n    def node_handle(self) -> "NodeHandle":\n        return self.step.node_handle\n\n\nclass PlanExecutionContext(IPlanContext):\n    """Context for the execution of a plan.\n\n    This context assumes that user code can be run directly, and thus includes resource and\n    information.\n    """\n\n    def __init__(\n        self,\n        plan_data: PlanData,\n        execution_data: ExecutionData,\n        log_manager: DagsterLogManager,\n        output_capture: 
Optional[Dict[StepOutputHandle, Any]] = None,\n    ):\n        self._plan_data = plan_data\n        self._execution_data = execution_data\n        self._log_manager = log_manager\n        self._output_capture = output_capture\n\n    @property\n    def plan_data(self) -> PlanData:\n        return self._plan_data\n\n    @property\n    def output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n        return self._output_capture\n\n    def for_step(\n        self,\n        step: ExecutionStep,\n        known_state: Optional["KnownExecutionState"] = None,\n    ) -> IStepContext:\n        return StepExecutionContext(\n            plan_data=self.plan_data,\n            execution_data=self._execution_data,\n            log_manager=self._log_manager.with_tags(**step.logging_tags),\n            step=step,\n            output_capture=self.output_capture,\n            known_state=known_state,\n        )\n\n    @property\n    def job_def(self) -> JobDefinition:\n        return self._execution_data.job_def\n\n    @property\n    def resolved_run_config(self) -> ResolvedRunConfig:\n        return self._execution_data.resolved_run_config\n\n    @property\n    def scoped_resources_builder(self) -> ScopedResourcesBuilder:\n        return self._execution_data.scoped_resources_builder\n\n    @property\n    def log(self) -> DagsterLogManager:\n        return self._log_manager\n\n    @property\n    def partitions_def(self) -> Optional[PartitionsDefinition]:\n        from dagster._core.definitions.job_definition import JobDefinition\n\n        job_def = self._execution_data.job_def\n        if not isinstance(job_def, JobDefinition):\n            check.failed(\n                "Can only call 'partitions_def', when using jobs, not legacy pipelines",\n            )\n        partitions_def = job_def.partitions_def\n        return partitions_def\n\n    @property\n    def has_partitions(self) -> bool:\n        tags = self._plan_data.dagster_run.tags\n        return bool(\n            
PARTITION_NAME_TAG in tags\n            or any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()])\n            or (\n                tags.get(ASSET_PARTITION_RANGE_START_TAG)\n                and tags.get(ASSET_PARTITION_RANGE_END_TAG)\n            )\n        )\n\n    @property\n    def partition_key(self) -> str:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n                "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            return get_multipartition_key_from_tags(tags)\n        elif PARTITION_NAME_TAG in tags:\n            return tags[PARTITION_NAME_TAG]\n        else:\n            range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            range_end = tags[ASSET_PARTITION_RANGE_END_TAG]\n\n            if range_start != range_end:\n                raise DagsterInvariantViolationError(\n                    "Cannot access partition_key for a partitioned run with a range of partitions."\n                    " Call partition_key_range instead."\n                )\n            else:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return self.partitions_def.get_partition_key_from_str(cast(str, range_start))\n                return cast(str, range_start)\n\n    @property\n    def asset_partition_key_range(self) -> PartitionKeyRange:\n        from dagster._core.definitions.multi_dimensional_partitions import (\n            MultiPartitionsDefinition,\n            get_multipartition_key_from_tags,\n        )\n\n        if not self.has_partitions:\n            raise DagsterInvariantViolationError(\n     
           "Cannot access partition_key for a non-partitioned run"\n            )\n\n        tags = self._plan_data.dagster_run.tags\n        if any([tag.startswith(MULTIDIMENSIONAL_PARTITION_PREFIX) for tag in tags.keys()]):\n            multipartition_key = get_multipartition_key_from_tags(tags)\n            return PartitionKeyRange(multipartition_key, multipartition_key)\n        elif PARTITION_NAME_TAG in tags:\n            partition_key = tags[PARTITION_NAME_TAG]\n            return PartitionKeyRange(partition_key, partition_key)\n        else:\n            partition_key_range_start = tags[ASSET_PARTITION_RANGE_START_TAG]\n            if partition_key_range_start is not None:\n                if isinstance(self.partitions_def, MultiPartitionsDefinition):\n                    return PartitionKeyRange(\n                        self.partitions_def.get_partition_key_from_str(partition_key_range_start),\n                        self.partitions_def.get_partition_key_from_str(\n                            tags[ASSET_PARTITION_RANGE_END_TAG]\n                        ),\n                    )\n            return PartitionKeyRange(partition_key_range_start, tags[ASSET_PARTITION_RANGE_END_TAG])\n\n    @property\n    def partition_time_window(self) -> TimeWindow:\n        partitions_def = self.partitions_def\n\n        if partitions_def is None:\n            raise DagsterInvariantViolationError("Partitions definition is not defined")\n\n        if not has_one_dimension_time_window_partitioning(partitions_def=partitions_def):\n            raise DagsterInvariantViolationError(\n                "Expected a TimeWindowPartitionsDefinition or MultiPartitionsDefinition with a"\n                f" single time dimension, but instead found {type(partitions_def)}"\n            )\n\n        if self.has_partition_key:\n            return cast(\n                Union[MultiPartitionsDefinition, TimeWindowPartitionsDefinition], partitions_def\n            
).time_window_for_partition_key(self.partition_key)\n        elif self.has_partition_key_range:\n            partition_key_range = self.asset_partition_key_range\n            partitions_def = cast(\n                Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n            )\n            return TimeWindow(\n                partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n                partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n            )\n\n        else:\n            check.failed(\n                "Has a PartitionsDefinition, so should either have a partition key or a partition"\n                " key range"\n            )\n\n    @property\n    def has_partition_key(self) -> bool:\n        return PARTITION_NAME_TAG in self._plan_data.dagster_run.tags\n\n    @property\n    def has_partition_key_range(self) -> bool:\n        return ASSET_PARTITION_RANGE_START_TAG in self._plan_data.dagster_run.tags\n\n    def for_type(self, dagster_type: DagsterType) -> "TypeCheckContext":\n        return TypeCheckContext(\n            self.run_id, self.log, self._execution_data.scoped_resources_builder, dagster_type\n        )\n\n\n@dataclass\nclass InputAssetVersionInfo:\n    # This is the storage id of the last materialization of any partition of an asset. Thus it is\n    # computed the same way for both partitioned and non-partitioned assets.\n    storage_id: int\n\n    # If the input asset is partitioned, this is a hash of the sorted data versions of each dependency\n    # partition. If the input asset is not partitioned, this is the data version of the asset. It\n    # can be none if we are sourcing a materialization from before data versions.\n    data_version: Optional["DataVersion"]\n\n    # This is the run_id on the event that the storage_id references\n    run_id: str\n\n    # This is the timestamp on the event that the storage_id references\n    timestamp: float\n\n\n
[docs]class StepExecutionContext(PlanExecutionContext, IStepContext):\n """Context for the execution of a step. Users should not instantiate this class directly.\n\n This context assumes that user code can be run directly, and thus includes resource and information.\n """\n\n def __init__(\n self,\n plan_data: PlanData,\n execution_data: ExecutionData,\n log_manager: DagsterLogManager,\n step: ExecutionStep,\n output_capture: Optional[Dict[StepOutputHandle, Any]],\n known_state: Optional["KnownExecutionState"],\n ):\n from dagster._core.execution.resources_init import get_required_resource_keys_for_step\n\n super(StepExecutionContext, self).__init__(\n plan_data=plan_data,\n execution_data=execution_data,\n log_manager=log_manager,\n output_capture=output_capture,\n )\n self._step = step\n self._required_resource_keys = get_required_resource_keys_for_step(\n plan_data.job.get_definition(),\n step,\n plan_data.execution_plan,\n )\n self._resources = execution_data.scoped_resources_builder.build(\n self._required_resource_keys\n )\n self._known_state = known_state\n self._input_lineage: List[AssetLineageInfo] = []\n\n resources_iter = cast(Iterable, self._resources)\n\n step_launcher_resources = [\n resource for resource in resources_iter if isinstance(resource, StepLauncher)\n ]\n\n self._step_launcher: Optional[StepLauncher] = None\n if len(step_launcher_resources) > 1:\n raise DagsterInvariantViolationError(\n "Multiple required resources for {described_op} have inherited StepLauncher"\n "There should be at most one step launcher resource per {node_type}.".format(\n described_op=self.describe_op(), node_type=self.op_def.node_type_str\n )\n )\n elif len(step_launcher_resources) == 1:\n self._step_launcher = step_launcher_resources[0]\n\n self._step_exception: Optional[BaseException] = None\n\n self._step_output_capture: Optional[Dict[StepOutputHandle, Any]] = None\n # Enable step output capture if there are any hooks which will receive them.\n # Expect in the 
future that hooks may control whether or not they get outputs,\n # but for now presence of any will cause output capture.\n if self.job_def.get_all_hooks_for_handle(self.node_handle):\n self._step_output_capture = {}\n\n self._output_metadata: Dict[str, Any] = {}\n self._seen_outputs: Dict[str, Union[str, Set[str]]] = {}\n\n self._input_asset_version_info: Dict[AssetKey, Optional["InputAssetVersionInfo"]] = {}\n self._is_external_input_asset_version_info_loaded = False\n self._data_version_cache: Dict[AssetKey, "DataVersion"] = {}\n\n self._requires_typed_event_stream = False\n self._typed_event_stream_error_message = None\n\n # In this mode no conversion is done on returned values and missing but expected outputs are not\n # allowed.\n @property\n def requires_typed_event_stream(self) -> bool:\n return self._requires_typed_event_stream\n\n @property\n def typed_event_stream_error_message(self) -> Optional[str]:\n return self._typed_event_stream_error_message\n\n # Error message will be appended to the default error message.\n def set_requires_typed_event_stream(self, *, error_message: Optional[str] = None):\n self._requires_typed_event_stream = True\n self._typed_event_stream_error_message = error_message\n\n @property\n def step(self) -> ExecutionStep:\n return self._step\n\n @property\n def node_handle(self) -> "NodeHandle":\n return self.step.node_handle\n\n @property\n def required_resource_keys(self) -> AbstractSet[str]:\n return self._required_resource_keys\n\n @property\n def resources(self) -> "Resources":\n return self._resources\n\n @property\n def step_launcher(self) -> Optional[StepLauncher]:\n return self._step_launcher\n\n @property\n def op_def(self) -> OpDefinition:\n return self.op.definition\n\n @property\n def job_def(self) -> "JobDefinition":\n return self._execution_data.job_def\n\n @property\n def op(self) -> OpNode:\n return self.job_def.get_op(self._step.node_handle)\n\n @property\n def op_retry_policy(self) -> Optional[RetryPolicy]:\n 
return self.job_def.get_retry_policy_for_handle(self.node_handle)\n\n def describe_op(self) -> str:\n return f'op "{self.node_handle}"'\n\n def get_io_manager(self, step_output_handle: StepOutputHandle) -> IOManager:\n step_output = self.execution_plan.get_step_output(step_output_handle)\n io_manager_key = (\n self.job_def.get_node(step_output.node_handle)\n .output_def_named(step_output.name)\n .io_manager_key\n )\n\n output_manager = getattr(self.resources, io_manager_key)\n return check.inst(output_manager, IOManager)\n\n def get_output_context(self, step_output_handle: StepOutputHandle) -> OutputContext:\n return get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n step_output_handle,\n self._get_source_run_id(step_output_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=self.execution_plan.get_version_for_step_output_handle(step_output_handle),\n )\n\n def for_input_manager(\n self,\n name: str,\n config: Any,\n metadata: Any,\n dagster_type: DagsterType,\n source_handle: Optional[StepOutputHandle] = None,\n resource_config: Any = None,\n resources: Optional["Resources"] = None,\n artificial_output_context: Optional["OutputContext"] = None,\n ) -> InputContext:\n if source_handle and artificial_output_context:\n check.failed("Cannot specify both source_handle and artificial_output_context.")\n\n upstream_output: Optional[OutputContext] = None\n\n if source_handle is not None:\n version = self.execution_plan.get_version_for_step_output_handle(source_handle)\n\n # NOTE: this is using downstream step_context for upstream OutputContext. 
step_context\n # will be set to None for 0.15 release.\n upstream_output = get_output_context(\n self.execution_plan,\n self.job_def,\n self.resolved_run_config,\n source_handle,\n self._get_source_run_id(source_handle),\n log_manager=self.log,\n step_context=self,\n resources=None,\n version=version,\n warn_on_step_context_use=True,\n )\n else:\n upstream_output = artificial_output_context\n\n asset_key = self.job_def.asset_layer.asset_key_for_input(\n node_handle=self.node_handle, input_name=name\n )\n asset_partitions_subset = (\n self.asset_partitions_subset_for_input(name)\n if self.has_asset_partitions_for_input(name)\n else None\n )\n\n asset_partitions_def = (\n self.job_def.asset_layer.partitions_def_for_asset(asset_key) if asset_key else None\n )\n return InputContext(\n job_name=self.job_def.name,\n name=name,\n op_def=self.op_def,\n config=config,\n metadata=metadata,\n upstream_output=upstream_output,\n dagster_type=dagster_type,\n log_manager=self.log,\n step_context=self,\n resource_config=resource_config,\n resources=resources,\n asset_key=asset_key,\n asset_partitions_subset=asset_partitions_subset,\n asset_partitions_def=asset_partitions_def,\n instance=self.instance,\n )\n\n def for_hook(self, hook_def: HookDefinition) -> "HookContext":\n from .hook import HookContext\n\n return HookContext(self, hook_def)\n\n def get_known_state(self) -> "KnownExecutionState":\n if not self._known_state:\n check.failed(\n "Attempted to access KnownExecutionState but it was not provided at context"\n " creation"\n )\n return self._known_state\n\n def can_load(\n self,\n step_output_handle: StepOutputHandle,\n ) -> bool:\n # can load from upstream in the same run\n if step_output_handle in self.get_known_state().ready_outputs:\n return True\n\n if (\n self._should_load_from_previous_runs(step_output_handle)\n # should and can load from a previous run\n and self._get_source_run_id_from_logs(step_output_handle)\n ):\n return True\n\n return False\n\n def 
observe_output(self, output_name: str, mapping_key: Optional[str] = None) -> None:\n if mapping_key:\n if output_name not in self._seen_outputs:\n self._seen_outputs[output_name] = set()\n cast(Set[str], self._seen_outputs[output_name]).add(mapping_key)\n else:\n self._seen_outputs[output_name] = "seen"\n\n def has_seen_output(self, output_name: str, mapping_key: Optional[str] = None) -> bool:\n if mapping_key:\n return (\n output_name in self._seen_outputs and mapping_key in self._seen_outputs[output_name]\n )\n return output_name in self._seen_outputs\n\n def add_output_metadata(\n self,\n metadata: Mapping[str, Any],\n output_name: Optional[str] = None,\n mapping_key: Optional[str] = None,\n ) -> None:\n if output_name is None and len(self.op_def.output_defs) == 1:\n output_def = self.op_def.output_defs[0]\n output_name = output_def.name\n elif output_name is None:\n raise DagsterInvariantViolationError(\n "Attempted to log metadata without providing output_name, but multiple outputs"\n " exist. Please provide an output_name to the invocation of"\n " `context.add_output_metadata`."\n )\n else:\n output_def = self.op_def.output_def_named(output_name)\n\n if self.has_seen_output(output_name, mapping_key):\n output_desc = (\n f"output '{output_def.name}'"\n if not mapping_key\n else f"output '{output_def.name}' with mapping_key '{mapping_key}'"\n )\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log output"\n f" metadata for {output_desc} which has already been yielded. Metadata must be"\n " logged before the output is yielded."\n )\n if output_def.is_dynamic and not mapping_key:\n raise DagsterInvariantViolationError(\n f"In {self.op_def.node_type_str} '{self.op.name}', attempted to log metadata"\n f" for dynamic output '{output_def.name}' without providing a mapping key. 
When"\n " logging metadata for a dynamic output, it is necessary to provide a mapping key."\n )\n\n if mapping_key:\n if output_name not in self._output_metadata:\n self._output_metadata[output_name] = {}\n if mapping_key in self._output_metadata[output_name]:\n self._output_metadata[output_name][mapping_key].update(metadata)\n else:\n self._output_metadata[output_name][mapping_key] = metadata\n else:\n if output_name in self._output_metadata:\n self._output_metadata[output_name].update(metadata)\n else:\n self._output_metadata[output_name] = metadata\n\n def get_output_metadata(\n self, output_name: str, mapping_key: Optional[str] = None\n ) -> Optional[Mapping[str, Any]]:\n metadata = self._output_metadata.get(output_name)\n if mapping_key and metadata:\n return metadata.get(mapping_key)\n return metadata\n\n def _get_source_run_id_from_logs(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n # walk through event logs to find the right run_id based on the run lineage\n\n parent_state = self.get_known_state().parent_state\n while parent_state:\n # if the parent run has yielded an StepOutput event for the given step output,\n # we find the source run id\n if step_output_handle in parent_state.produced_outputs:\n return parent_state.run_id\n\n # else, keep looking backwards\n parent_state = parent_state.get_parent_state()\n\n # When a fixed path is provided via io manager, it's able to run step subset using an execution\n # plan when the ascendant outputs were not previously created by dagster-controlled\n # computations. for example, in backfills, with fixed path io manager, we allow users to\n # "re-execute" runs with steps where the outputs weren't previously stored by dagster.\n\n # Warn about this special case because it will also reach here when all previous runs have\n # skipped yielding this output. 
From the logs, we have no easy way to differentiate the fixed\n # path case and the skipping case, until we record the skipping info in KnownExecutionState,\n # i.e. resolve https://github.com/dagster-io/dagster/issues/3511\n self.log.warning(\n f"No previously stored outputs found for source {step_output_handle}. "\n "This is either because you are using an IO Manager that does not depend on run ID, "\n "or because all the previous runs have skipped the output in conditional execution."\n )\n return None\n\n def _should_load_from_previous_runs(self, step_output_handle: StepOutputHandle) -> bool:\n # should not load if not a re-execution\n if self.dagster_run.parent_run_id is None:\n return False\n # should not load if re-executing the entire pipeline\n if self.dagster_run.step_keys_to_execute is None:\n return False\n\n # should not load if the entire dynamic step is being executed in the current run\n handle = StepHandle.parse_from_key(step_output_handle.step_key)\n if (\n isinstance(handle, ResolvedFromDynamicStepHandle)\n and handle.unresolved_form.to_key() in self.dagster_run.step_keys_to_execute\n ):\n return False\n\n # should not load if this step is being executed in the current run\n return step_output_handle.step_key not in self.dagster_run.step_keys_to_execute\n\n def _get_source_run_id(self, step_output_handle: StepOutputHandle) -> Optional[str]:\n if self._should_load_from_previous_runs(step_output_handle):\n return self._get_source_run_id_from_logs(step_output_handle)\n else:\n return self.dagster_run.run_id\n\n def capture_step_exception(self, exception: BaseException):\n self._step_exception = check.inst_param(exception, "exception", BaseException)\n\n @property\n def step_exception(self) -> Optional[BaseException]:\n return self._step_exception\n\n @property\n def step_output_capture(self) -> Optional[Dict[StepOutputHandle, Any]]:\n return self._step_output_capture\n\n @property\n def previous_attempt_count(self) -> int:\n return 
self.get_known_state().get_retry_state().get_attempt_count(self._step.key)\n\n @property\n def op_config(self) -> Any:\n op_config = self.resolved_run_config.ops.get(str(self.node_handle))\n return op_config.config if op_config else None\n\n @property\n def is_op_in_graph(self) -> bool:\n """Whether this step corresponds to an op within a graph (either @graph, or @graph_asset)."""\n return self.step.node_handle.parent is not None\n\n @property\n def is_sda_step(self) -> bool:\n """Whether this step corresponds to a software define asset, inferred by presence of asset info on outputs.\n\n note: ops can materialize assets as well.\n """\n for output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, output.name\n )\n if asset_info is not None:\n return True\n return False\n\n def set_data_version(self, asset_key: AssetKey, data_version: "DataVersion") -> None:\n self._data_version_cache[asset_key] = data_version\n\n def has_data_version(self, asset_key: AssetKey) -> bool:\n return asset_key in self._data_version_cache\n\n def get_data_version(self, asset_key: AssetKey) -> "DataVersion":\n return self._data_version_cache[asset_key]\n\n @property\n def input_asset_records(self) -> Optional[Mapping[AssetKey, Optional["InputAssetVersionInfo"]]]:\n return self._input_asset_version_info\n\n @property\n def is_external_input_asset_version_info_loaded(self) -> bool:\n return self._is_external_input_asset_version_info_loaded\n\n def get_input_asset_version_info(self, key: AssetKey) -> Optional["InputAssetVersionInfo"]:\n if key not in self._input_asset_version_info:\n self._fetch_input_asset_version_info(key)\n return self._input_asset_version_info[key]\n\n # "external" refers to records for inputs generated outside of this step\n def fetch_external_input_asset_version_info(self) -> None:\n output_keys = self.get_output_asset_keys()\n\n all_dep_keys: List[AssetKey] = []\n for output_key in output_keys:\n if output_key 
not in self.job_def.asset_layer.asset_deps:\n continue\n dep_keys = self.job_def.asset_layer.upstream_assets_for_asset(output_key)\n for key in dep_keys:\n if key not in all_dep_keys and key not in output_keys:\n all_dep_keys.append(key)\n\n self._input_asset_version_info = {}\n for key in all_dep_keys:\n self._fetch_input_asset_version_info(key)\n self._is_external_input_asset_version_info_loaded = True\n\n def _fetch_input_asset_version_info(self, key: AssetKey) -> None:\n from dagster._core.definitions.data_version import (\n extract_data_version_from_entry,\n )\n\n event = self._get_input_asset_event(key)\n if event is None:\n self._input_asset_version_info[key] = None\n else:\n storage_id = event.storage_id\n # Input name will be none if this is an internal dep\n input_name = self.job_def.asset_layer.input_for_asset_key(self.node_handle, key)\n # Exclude AllPartitionMapping for now to avoid huge queries\n if input_name and self.has_asset_partitions_for_input(input_name):\n subset = self.asset_partitions_subset_for_input(\n input_name, require_valid_partitions=False\n )\n input_keys = list(subset.get_partition_keys())\n\n # This check represents a temporary constraint that prevents huge query results for upstream\n # partition data versions from timing out runs. If a partitioned dependency (a) uses an\n # AllPartitionMapping; and (b) has greater than or equal to\n # SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD dependency partitions, then we\n # process it as a non-partitioned dependency (note that this was the behavior for\n # all partition dependencies prior to 2023-08). This means that stale status\n # results cannot be accurately computed for the dependency, and there is thus\n # corresponding logic in the CachingStaleStatusResolver to account for this. 
This\n # constraint should be removed when we have thoroughly examined the performance of\n # the data version retrieval query and can guarantee decent performance.\n if len(input_keys) < SKIP_PARTITION_DATA_VERSION_DEPENDENCY_THRESHOLD:\n data_version = self._get_partitions_data_version_from_keys(key, input_keys)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n else:\n data_version = extract_data_version_from_entry(event.event_log_entry)\n self._input_asset_version_info[key] = InputAssetVersionInfo(\n storage_id, data_version, event.run_id, event.timestamp\n )\n\n def partition_mapping_for_input(self, input_name: str) -> Optional[PartitionMapping]:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n if upstream_asset_key:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n partitions_def = assets_def.partitions_def if assets_def else None\n explicit_partition_mapping = self.job_def.asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n )\n return infer_partition_mapping(\n explicit_partition_mapping,\n partitions_def,\n upstream_asset_partitions_def,\n )\n else:\n return None\n\n def _get_input_asset_event(self, key: AssetKey) -> Optional["EventLogRecord"]:\n event = self.instance.get_latest_data_version_record(key)\n if event:\n self._check_input_asset_event(key, event)\n return event\n\n def _check_input_asset_event(self, key: AssetKey, event: "EventLogRecord") -> None:\n assert event.event_log_entry\n event_data_version = extract_data_version_from_entry(event.event_log_entry)\n if key in self._data_version_cache and self._data_version_cache[key] != event_data_version:\n self.log.warning(\n f"Data version mismatch for asset {key}. 
Data version from materialization within"\n f" current step is `{self._data_version_cache[key]}`. Data version from most recent"\n f" materialization is `{event_data_version}`. Most recent materialization will be"\n " used for provenance tracking."\n )\n\n def _get_partitions_data_version_from_keys(\n self, key: AssetKey, partition_keys: Sequence[str]\n ) -> "DataVersion":\n from dagster._core.definitions.data_version import (\n DataVersion,\n )\n from dagster._core.events import DagsterEventType\n\n # TODO: this needs to account for observations also\n event_type = DagsterEventType.ASSET_MATERIALIZATION\n tags_by_partition = (\n self.instance._event_storage.get_latest_tags_by_partition( # noqa: SLF001\n key, event_type, [DATA_VERSION_TAG], asset_partitions=list(partition_keys)\n )\n )\n partition_data_versions = [\n pair[1][DATA_VERSION_TAG]\n for pair in sorted(tags_by_partition.items(), key=lambda x: x[0])\n ]\n hash_sig = sha256()\n hash_sig.update(bytearray("".join(partition_data_versions), "utf8"))\n return DataVersion(hash_sig.hexdigest())\n\n # Call this to clear the cache for an input asset record. This is necessary when an old\n # materialization for an asset was loaded during `fetch_external_input_asset_records` because an\n # intrastep asset is not required, but then that asset is materialized during the step. 
If we\n # don't clear the cache for this asset, then we won't use the most up-to-date asset record.\n def wipe_input_asset_version_info(self, key: AssetKey) -> None:\n if key in self._input_asset_version_info:\n del self._input_asset_version_info[key]\n\n def get_output_asset_keys(self) -> AbstractSet[AssetKey]:\n output_keys: Set[AssetKey] = set()\n for step_output in self.step.step_outputs:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n self.node_handle, step_output.name\n )\n if asset_info is None or not asset_info.is_required:\n continue\n output_keys.add(asset_info.key)\n return output_keys\n\n def has_asset_partitions_for_input(self, input_name: str) -> bool:\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n return (\n upstream_asset_key is not None\n and asset_layer.partitions_def_for_asset(upstream_asset_key) is not None\n )\n\n def asset_partition_key_range_for_input(self, input_name: str) -> PartitionKeyRange:\n subset = self.asset_partitions_subset_for_input(input_name)\n partition_key_ranges = subset.get_partition_key_ranges(\n dynamic_partitions_store=self.instance\n )\n\n if len(partition_key_ranges) != 1:\n check.failed(\n "Tried to access asset partition key range, but there are "\n f"({len(partition_key_ranges)}) key ranges associated with this input.",\n )\n\n return partition_key_ranges[0]\n\n def asset_partitions_subset_for_input(\n self, input_name: str, *, require_valid_partitions: bool = True\n ) -> PartitionsSubset:\n asset_layer = self.job_def.asset_layer\n assets_def = asset_layer.assets_def_for_node(self.node_handle)\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is not None:\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if upstream_asset_partitions_def is not None:\n partitions_def = assets_def.partitions_def if assets_def else 
None\n partitions_subset = (\n partitions_def.empty_subset().with_partition_key_range(\n self.asset_partition_key_range, dynamic_partitions_store=self.instance\n )\n if partitions_def\n else None\n )\n partition_mapping = infer_partition_mapping(\n asset_layer.partition_mapping_for_node_input(\n self.node_handle, upstream_asset_key\n ),\n partitions_def,\n upstream_asset_partitions_def,\n )\n mapped_partitions_result = (\n partition_mapping.get_upstream_mapped_partitions_result_for_partitions(\n partitions_subset,\n upstream_asset_partitions_def,\n dynamic_partitions_store=self.instance,\n )\n )\n\n if (\n require_valid_partitions\n and mapped_partitions_result.required_but_nonexistent_partition_keys\n ):\n raise DagsterInvariantViolationError(\n f"Partition key range {self.asset_partition_key_range} in"\n f" {self.node_handle.name} depends on invalid partition keys"\n f" {mapped_partitions_result.required_but_nonexistent_partition_keys} in"\n f" upstream asset {upstream_asset_key}"\n )\n\n return mapped_partitions_result.partitions_subset\n\n check.failed("The input has no asset partitions")\n\n def asset_partition_key_for_input(self, input_name: str) -> str:\n start, end = self.asset_partition_key_range_for_input(input_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for input '{input_name}' of step '{self.step.key}',"\n f" but the step input has a partition range: '{start}' to '{end}'."\n )\n\n def _partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n asset_info = self.job_def.asset_layer.asset_info_for_output(\n node_handle=self.node_handle, output_name=output_name\n )\n if asset_info:\n return asset_info.partitions_def\n else:\n return None\n\n def partitions_def_for_output(self, output_name: str) -> Optional[PartitionsDefinition]:\n return self._partitions_def_for_output(output_name)\n\n def has_asset_partitions_for_output(self, output_name: str) -> bool:\n return 
self._partitions_def_for_output(output_name) is not None\n\n def asset_partition_key_range_for_output(self, output_name: str) -> PartitionKeyRange:\n if self._partitions_def_for_output(output_name) is not None:\n return self.asset_partition_key_range\n\n check.failed("The output has no asset partitions")\n\n def asset_partition_key_for_output(self, output_name: str) -> str:\n start, end = self.asset_partition_key_range_for_output(output_name)\n if start == end:\n return start\n else:\n check.failed(\n f"Tried to access partition key for output '{output_name}' of step"\n f" '{self.step.key}', but the step output has a partition range: '{start}' to"\n f" '{end}'."\n )\n\n def asset_partitions_time_window_for_output(self, output_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given output.\n\n Raises an error if either of the following are true:\n - The output asset has no partitioning.\n - The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n partitions_def = self._partitions_def_for_output(output_name)\n\n if not partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an output that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an output that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition], partitions_def\n )\n partition_key_range = self.asset_partition_key_range_for_output(output_name)\n return TimeWindow(\n # mypy thinks partitions_def is <nothing> here because ????\n partitions_def.time_window_for_partition_key(partition_key_range.start).start,\n partitions_def.time_window_for_partition_key(partition_key_range.end).end,\n )\n\n def 
asset_partitions_time_window_for_input(self, input_name: str) -> TimeWindow:\n """The time window for the partitions of the asset correponding to the given input.\n\n Raises an error if either of the following are true:\n - The input asset has no partitioning.\n - The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\n MultiPartitionsDefinition with one time-partitioned dimension.\n """\n asset_layer = self.job_def.asset_layer\n upstream_asset_key = asset_layer.asset_key_for_input(self.node_handle, input_name)\n\n if upstream_asset_key is None:\n raise ValueError("The input has no corresponding asset")\n\n upstream_asset_partitions_def = asset_layer.partitions_def_for_asset(upstream_asset_key)\n\n if not upstream_asset_partitions_def:\n raise ValueError(\n "Tried to get asset partitions for an input that does not correspond to a "\n "partitioned asset."\n )\n\n if not has_one_dimension_time_window_partitioning(upstream_asset_partitions_def):\n raise ValueError(\n "Tried to get asset partitions for an input that correponds to a partitioned "\n "asset that is not time-partitioned."\n )\n\n upstream_asset_partitions_def = cast(\n Union[TimeWindowPartitionsDefinition, MultiPartitionsDefinition],\n upstream_asset_partitions_def,\n )\n partition_key_range = self.asset_partition_key_range_for_input(input_name)\n\n return TimeWindow(\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.start\n ).start,\n upstream_asset_partitions_def.time_window_for_partition_key(\n partition_key_range.end\n ).end,\n )\n\n def get_type_loader_context(self) -> "DagsterTypeLoaderContext":\n return DagsterTypeLoaderContext(\n plan_data=self.plan_data,\n execution_data=self._execution_data,\n log_manager=self._log_manager,\n step=self.step,\n output_capture=self._output_capture,\n known_state=self._known_state,\n )
\n\n\n
[docs]class TypeCheckContext:\n """The ``context`` object available to a type check function on a DagsterType."""\n\n def __init__(\n self,\n run_id: str,\n log_manager: DagsterLogManager,\n scoped_resources_builder: ScopedResourcesBuilder,\n dagster_type: DagsterType,\n ):\n self._run_id = run_id\n self._log = log_manager\n self._resources = scoped_resources_builder.build(dagster_type.required_resource_keys)\n\n @public\n @property\n def resources(self) -> "Resources":\n """An object whose attributes contain the resources available to this op."""\n return self._resources\n\n @public\n @property\n def run_id(self) -> str:\n """The id of this job run."""\n return self._run_id\n\n @public\n @property\n def log(self) -> DagsterLogManager:\n """Centralized log dispatch from user code."""\n return self._log
\n\n\n
[docs]class DagsterTypeLoaderContext(StepExecutionContext):\n """The context object provided to a :py:class:`@dagster_type_loader <dagster_type_loader>`-decorated function during execution.\n\n Users should not construct this object directly.\n """\n\n @public\n @property\n def resources(self) -> "Resources":\n """The resources available to the type loader, specified by the `required_resource_keys` argument of the decorator."""\n return super(DagsterTypeLoaderContext, self).resources\n\n @public\n @property\n def job_def(self) -> "JobDefinition":\n """The underlying job definition being executed."""\n return super(DagsterTypeLoaderContext, self).job_def\n\n @public\n @property\n def op_def(self) -> "OpDefinition":\n """The op for which type loading is occurring."""\n return super(DagsterTypeLoaderContext, self).op_def
\n
", "current_page_name": "_modules/dagster/_core/execution/context/system", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.context.system"}}, "execute_in_process_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.execute_in_process_result

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class ExecuteInProcessResult(ExecutionResult):\n """Result object returned by in-process testing APIs.\n\n Users should not instantiate this object directly. Used for retrieving run success, events, and outputs from execution methods that return this object.\n\n This object is returned by:\n - :py:meth:`dagster.GraphDefinition.execute_in_process`\n - :py:meth:`dagster.JobDefinition.execute_in_process`\n - :py:meth:`dagster.materialize_to_memory`\n - :py:meth:`dagster.materialize`\n """\n\n _handle: NodeHandle\n _event_list: Sequence[DagsterEvent]\n _dagster_run: DagsterRun\n _output_capture: Mapping[StepOutputHandle, Any]\n _job_def: JobDefinition\n\n def __init__(\n self,\n event_list: Sequence[DagsterEvent],\n dagster_run: DagsterRun,\n output_capture: Optional[Mapping[StepOutputHandle, Any]],\n job_def: JobDefinition,\n ):\n self._job_def = job_def\n\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n self._output_capture = check.opt_mapping_param(\n output_capture, "output_capture", key_type=StepOutputHandle\n )\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """List[DagsterEvent]: All dagster events emitted during execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run ID of the executed :py:class:`DagsterRun`."""\n return self.dagster_run.run_id\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n mapped_outputs = {}\n step_key = str(handle)\n output_found = False\n for step_output_handle, value in self._output_capture.items():\n # For the mapped output case, where step keys are in the format\n # "step_key[upstream_mapped_output_name]" 
within the step output handle.\n if (\n step_output_handle.step_key.startswith(f"{step_key}[")\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n key_start = step_output_handle.step_key.find("[")\n key_end = step_output_handle.step_key.find("]")\n upstream_mapped_output_name = step_output_handle.step_key[key_start + 1 : key_end]\n mapped_outputs[upstream_mapped_output_name] = value\n\n # For all other cases, search for exact match.\n elif (\n step_key == step_output_handle.step_key\n and step_output_handle.output_name == output_name\n ):\n output_found = True\n if not step_output_handle.mapping_key:\n return self._output_capture[step_output_handle]\n mapped_outputs[step_output_handle.mapping_key] = value\n\n if not output_found:\n raise DagsterInvariantViolationError(\n f"No outputs found for output '{output_name}' from node '{handle}'."\n )\n return mapped_outputs\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the in-process run of the job.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_for_node(\n node_str, output_name=output_name\n )
\n\n
[docs] @public\n def asset_value(self, asset_key: CoercibleToAssetKey) -> Any:\n """Retrieves the value of an asset that was materialized during the execution of the job.\n\n Args:\n asset_key (CoercibleToAssetKey): The key of the asset to retrieve.\n\n Returns:\n Any: The value of the retrieved asset.\n """\n node_output_handle = self._job_def.asset_layer.node_output_handle_for_asset(\n AssetKey.from_coercible(asset_key)\n )\n return self.output_for_node(\n node_str=str(node_output_handle.node_handle), output_name=node_output_handle.output_name\n )
\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(ExecuteInProcessResult, self).output_value(output_name=output_name)
\n
", "current_page_name": "_modules/dagster/_core/execution/execute_in_process_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.execute_in_process_result"}, "job_execution_result": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.job_execution_result

\nfrom typing import Any, Sequence\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions import JobDefinition, NodeHandle\nfrom dagster._core.definitions.utils import DEFAULT_OUTPUT\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.plan.utils import build_resources_for_manager\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nfrom .execution_result import ExecutionResult\n\n\n
[docs]class JobExecutionResult(ExecutionResult):\n """Result object returned by :py:func:`dagster.execute_job`.\n\n Used for retrieving run success, events, and outputs from `execute_job`.\n Users should not directly instantiate this class.\n\n Events and run information can be retrieved off of the object directly. In\n order to access outputs, the `ExecuteJobResult` object needs to be opened\n as a context manager, which will re-initialize the resources from\n execution.\n """\n\n def __init__(self, job_def, reconstruct_context, event_list, dagster_run):\n self._job_def = job_def\n self._reconstruct_context = reconstruct_context\n self._context = None\n self._event_list = event_list\n self._dagster_run = dagster_run\n\n def __enter__(self) -> "JobExecutionResult":\n context = self._reconstruct_context.__enter__()\n self._context = context\n return self\n\n def __exit__(self, *exc):\n exit_result = self._reconstruct_context.__exit__(*exc)\n self._context = None\n return exit_result\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """JobDefinition: The job definition that was executed."""\n return self._job_def\n\n @public\n @property\n def dagster_run(self) -> DagsterRun:\n """DagsterRun: The Dagster run that was executed."""\n return self._dagster_run\n\n @public\n @property\n def all_events(self) -> Sequence[DagsterEvent]:\n """Sequence[DagsterEvent]: List of all events yielded by the job execution."""\n return self._event_list\n\n @public\n @property\n def run_id(self) -> str:\n """str: The id of the Dagster run that was executed."""\n return self.dagster_run.run_id\n\n
[docs] @public\n def output_value(self, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output of top-level job, if an output is returned.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`. If the top-level job has no output, calling this method will also result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n output_name (Optional[str]): The name of the output to retrieve. Defaults to `result`,\n the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_value(output_name=output_name)
\n\n
[docs] @public\n def output_for_node(self, node_str: str, output_name: str = DEFAULT_OUTPUT) -> Any:\n """Retrieves output value with a particular name from the run of the job.\n\n In order to use this method, the `ExecuteJobResult` object must be opened as a context manager. If this method is used without opening the context manager, it will result in a :py:class:`DagsterInvariantViolationError`.\n\n Args:\n node_str (str): Name of the op/graph whose output should be retrieved. If the intended\n graph/op is nested within another graph, the syntax is `outer_graph.inner_node`.\n output_name (Optional[str]): Name of the output on the op/graph to retrieve. Defaults to\n `result`, the default output name in dagster.\n\n Returns:\n Any: The value of the retrieved output.\n """\n return super(JobExecutionResult, self).output_for_node(node_str, output_name=output_name)
\n\n def _get_output_for_handle(self, handle: NodeHandle, output_name: str) -> Any:\n if not self._context:\n raise DagsterInvariantViolationError(\n "In order to access output objects, the result of `execute_job` must be opened as a"\n " context manager: 'with execute_job(...) as result:"\n )\n found = False\n result = None\n for compute_step_event in self.compute_events_for_handle(handle):\n if (\n compute_step_event.is_successful_output\n and compute_step_event.step_output_data.output_name == output_name\n ):\n found = True\n output = compute_step_event.step_output_data\n step = self._context.execution_plan.get_step_by_key(compute_step_event.step_key)\n dagster_type = (\n self.job_def.get_node(handle).output_def_named(output_name).dagster_type\n )\n value = self._get_value(self._context.for_step(step), output, dagster_type)\n check.invariant(\n not (output.mapping_key and step.get_mapping_key()),\n "Not set up to handle mapped outputs downstream of mapped steps",\n )\n mapping_key = output.mapping_key or step.get_mapping_key()\n if mapping_key:\n if result is None:\n result = {mapping_key: value}\n else:\n result[mapping_key] = (\n value # pylint:disable=unsupported-assignment-operation\n )\n else:\n result = value\n\n if found:\n return result\n\n node = self.job_def.get_node(handle)\n raise DagsterInvariantViolationError(\n f"Did not find result {output_name} in {node.describe_node()}"\n )\n\n def _get_value(self, context, step_output_data, dagster_type):\n step_output_handle = step_output_data.step_output_handle\n manager = context.get_io_manager(step_output_handle)\n manager_key = context.execution_plan.get_manager_key(step_output_handle, self.job_def)\n res = manager.load_input(\n context.for_input_manager(\n name=None,\n config=None,\n metadata=None,\n dagster_type=dagster_type,\n source_handle=step_output_handle,\n resource_config=context.resolved_run_config.resources[manager_key].config,\n resources=build_resources_for_manager(manager_key, context),\n 
)\n )\n return res
\n
", "current_page_name": "_modules/dagster/_core/execution/job_execution_result", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.job_execution_result"}, "validate_run_config": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.validate_run_config

\nfrom typing import Any, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._core.definitions import JobDefinition\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]def validate_run_config(\n job_def: JobDefinition,\n run_config: Optional[Union[Mapping[str, Any], RunConfig]] = None,\n) -> Mapping[str, Any]:\n """Function to validate a provided run config blob against a given job.\n\n If validation is successful, this function will return a dictionary representation of the\n validated config actually used during execution.\n\n Args:\n job_def (JobDefinition): The job definition to validate run\n config against\n run_config (Optional[Dict[str, Any]]): The run config to validate\n\n Returns:\n Dict[str, Any]: A dictionary representation of the validated config.\n """\n check.inst_param(job_def, "job_def", JobDefinition)\n run_config = check.opt_mapping_param(\n convert_config_input(run_config), "run_config", key_type=str\n )\n\n return ResolvedRunConfig.build(job_def, run_config).to_dict()
\n
", "current_page_name": "_modules/dagster/_core/execution/validate_run_config", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.validate_run_config"}, "with_resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.execution.with_resources

\nfrom typing import Any, Iterable, List, Mapping, Optional, Sequence, TypeVar, cast\n\nfrom dagster import _check as check\nfrom dagster._core.execution.build_resources import wrap_resources_for_execution\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..._config import Shape\nfrom ..definitions.resource_requirement import ResourceAddable\nfrom ..definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom ..errors import DagsterInvalidConfigError, DagsterInvalidInvocationError\n\nT = TypeVar("T", bound=ResourceAddable)\n\n\n
[docs]def with_resources(\n definitions: Iterable[T],\n resource_defs: Mapping[str, object],\n resource_config_by_key: Optional[Mapping[str, Any]] = None,\n) -> Sequence[T]:\n """Adds dagster resources to copies of resource-requiring dagster definitions.\n\n An error will be thrown if any provided definitions have a conflicting\n resource definition provided for a key provided to resource_defs. Resource\n config can be provided, with keys in the config dictionary corresponding to\n the keys for each resource definition. If any definition has unsatisfied\n resource keys after applying with_resources, an error will be thrown.\n\n Args:\n definitions (Iterable[ResourceAddable]): Dagster definitions to provide resources to.\n resource_defs (Mapping[str, object]):\n Mapping of resource keys to objects to satisfy\n resource requirements of provided dagster definitions.\n resource_config_by_key (Optional[Mapping[str, Any]]):\n Specifies config for provided resources. The key in this dictionary\n corresponds to configuring the same key in the resource_defs\n dictionary.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset, resource, with_resources\n\n @resource(config_schema={"bar": str})\n def foo_resource():\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset1(context):\n foo = context.resources.foo\n ...\n\n @asset(required_resource_keys={"foo"})\n def asset2(context):\n foo = context.resources.foo\n ...\n\n asset1_with_foo, asset2_with_foo = with_resources(\n [the_asset, other_asset],\n resource_config_by_key={\n "foo": {\n "config": {"bar": ...}\n }\n }\n )\n """\n from dagster._config import validate_config\n from dagster._core.definitions.job_definition import (\n default_job_io_manager_with_fs_io_manager_schema,\n )\n\n check.mapping_param(resource_defs, "resource_defs")\n resource_config_by_key = check.opt_mapping_param(\n resource_config_by_key, "resource_config_by_key"\n )\n\n resource_defs = wrap_resources_for_execution(\n merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n resource_defs,\n )\n )\n\n for key, resource_def in resource_defs.items():\n if key in resource_config_by_key:\n resource_config = resource_config_by_key[key]\n if not isinstance(resource_config, dict) or "config" not in resource_config:\n raise DagsterInvalidInvocationError(\n f"Error with config for resource key '{key}': Expected a "\n "dictionary of the form {'config': ...}, but received "\n f"{resource_config}"\n )\n\n outer_config_shape = Shape({"config": resource_def.get_config_field()})\n config_evr = validate_config(outer_config_shape, resource_config)\n if not config_evr.success:\n raise DagsterInvalidConfigError(\n f"Error when applying config for resource with key '{key}' ",\n config_evr.errors,\n resource_config,\n )\n resource_defs[key] = resource_defs[key].configured(resource_config["config"])\n\n transformed_defs: List[T] = []\n for definition in definitions:\n transformed_defs.append(cast(T, definition.with_resources(resource_defs)))\n\n return transformed_defs
\n
", "current_page_name": "_modules/dagster/_core/execution/with_resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.execution.with_resources"}}, "executor": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Iterator\n\nfrom dagster._annotations import public\nfrom dagster._core.execution.retries import RetryMode\n\nif TYPE_CHECKING:\n    from dagster._core.events import DagsterEvent\n    from dagster._core.execution.context.system import PlanOrchestrationContext\n    from dagster._core.execution.plan.plan import ExecutionPlan\n\n\n
[docs]class Executor(ABC):\n
[docs] @public\n @abstractmethod\n def execute(\n self, plan_context: "PlanOrchestrationContext", execution_plan: "ExecutionPlan"\n ) -> Iterator["DagsterEvent"]:\n """For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.\n\n Args:\n plan_context (PlanOrchestrationContext): The plan's orchestration context.\n execution_plan (ExecutionPlan): The plan to execute.\n\n Returns:\n A stream of dagster events.\n """
\n\n @public\n @property\n @abstractmethod\n def retries(self) -> RetryMode:\n """Whether retries are enabled or disabled for this instance of the executor.\n\n Executors should allow this to be controlled via configuration if possible.\n\n Returns: RetryMode\n """
\n
", "current_page_name": "_modules/dagster/_core/executor/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.base"}, "init": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.executor.init

\nfrom typing import Mapping, NamedTuple\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr\nfrom dagster._core.definitions import ExecutorDefinition, IJob\nfrom dagster._core.instance import DagsterInstance\n\n\n
[docs]class InitExecutorContext(\n NamedTuple(\n "InitExecutorContext",\n [\n ("job", PublicAttr[IJob]),\n ("executor_def", PublicAttr[ExecutorDefinition]),\n ("executor_config", PublicAttr[Mapping[str, object]]),\n ("instance", PublicAttr[DagsterInstance]),\n ],\n )\n):\n """Executor-specific initialization context.\n\n Attributes:\n job (IJob): The job to be executed.\n executor_def (ExecutorDefinition): The definition of the executor currently being\n constructed.\n executor_config (dict): The parsed config passed to the executor.\n instance (DagsterInstance): The current instance.\n """\n\n def __new__(\n cls,\n job: IJob,\n executor_def: ExecutorDefinition,\n executor_config: Mapping[str, object],\n instance: DagsterInstance,\n ):\n return super(InitExecutorContext, cls).__new__(\n cls,\n job=check.inst_param(job, "job", IJob),\n executor_def=check.inst_param(executor_def, "executor_def", ExecutorDefinition),\n executor_config=check.mapping_param(executor_config, "executor_config", key_type=str),\n instance=check.inst_param(instance, "instance", DagsterInstance),\n )
\n
", "current_page_name": "_modules/dagster/_core/executor/init", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.executor.init"}}, "instance": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance

\nimport logging\nimport logging.config\nimport os\nimport sys\nimport time\nimport weakref\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom enum import Enum\nfrom tempfile import TemporaryDirectory\nfrom types import TracebackType\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom typing_extensions import Protocol, Self, TypeAlias, TypeVar, runtime_checkable\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.data_version import extract_data_provenance_from_entry\nfrom dagster._core.definitions.events import AssetKey, AssetObservation\nfrom dagster._core.errors import (\n    DagsterHomeNotSetError,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunConflict,\n)\nfrom dagster._core.log_manager import DagsterLogRecord\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import (\n    IN_PROGRESS_RUN_STATUSES,\n    DagsterRun,\n    DagsterRunStatsSnapshot,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.tags import (\n    ASSET_PARTITION_RANGE_END_TAG,\n    ASSET_PARTITION_RANGE_START_TAG,\n    PARENT_RUN_ID_TAG,\n    PARTITION_NAME_TAG,\n    RESUME_RETRY_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import PrintFn, traced\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import 
merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    experimental_warning,\n)\n\nfrom .config import (\n    DAGSTER_CONFIG_YAML_FILENAME,\n    DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT,\n    get_default_tick_retention_settings,\n    get_tick_retention_settings,\n)\nfrom .ref import InstanceRef\n\n# 'airflow_execution_date' and 'is_airflow_ingest_pipeline' are hardcoded tags used in the\n# airflow ingestion logic (see: dagster_pipeline_factory.py). 'airflow_execution_date' stores the\n# 'execution_date' used in Airflow operator execution and 'is_airflow_ingest_pipeline' determines\n# whether 'airflow_execution_date' is needed.\n# https://github.com/dagster-io/dagster/issues/2403\nAIRFLOW_EXECUTION_DATE_STR = "airflow_execution_date"\nIS_AIRFLOW_INGEST_PIPELINE_STR = "is_airflow_ingest_pipeline"\n\n# Our internal guts can handle empty strings for job name and run id\n# However making these named constants for documentation, to encode where we are making the assumption,\n# and to allow us to change this more easily in the future, provided we are disciplined about\n# actually using this constants.\nRUNLESS_RUN_ID = ""\nRUNLESS_JOB_NAME = ""\n\nif TYPE_CHECKING:\n    from dagster._core.debug import DebugRunPayload\n    from dagster._core.definitions.asset_check_spec import AssetCheckKey\n    from dagster._core.definitions.job_definition import (\n        JobDefinition,\n    )\n    from dagster._core.definitions.partition import PartitionsDefinition\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryLoadData,\n    )\n    from dagster._core.definitions.run_request import InstigatorType\n    from dagster._core.event_api import EventHandlerFn\n    from dagster._core.events import (\n        AssetMaterialization,\n        DagsterEvent,\n        DagsterEventType,\n        EngineEventData,\n    )\n    from dagster._core.events.log import EventLogEntry\n    from 
dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\n    from dagster._core.execution.plan.plan import ExecutionPlan\n    from dagster._core.execution.plan.resume_retry import ReexecutionStrategy\n    from dagster._core.execution.stats import RunStepKeyStatsSnapshot\n    from dagster._core.host_representation import (\n        CodeLocation,\n        ExternalJob,\n        ExternalJobOrigin,\n        ExternalSensor,\n        HistoricalJob,\n    )\n    from dagster._core.host_representation.external import ExternalSchedule\n    from dagster._core.launcher import RunLauncher\n    from dagster._core.run_coordinator import RunCoordinator\n    from dagster._core.scheduler import Scheduler, SchedulerDebugInfo\n    from dagster._core.scheduler.instigation import (\n        InstigatorState,\n        InstigatorStatus,\n        InstigatorTick,\n        TickData,\n        TickStatus,\n    )\n    from dagster._core.secrets import SecretsLoader\n    from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.daemon_cursor import DaemonCursorStorage\n    from dagster._core.storage.event_log import EventLogStorage\n    from dagster._core.storage.event_log.base import (\n        AssetRecord,\n        EventLogConnection,\n        EventLogRecord,\n        EventRecordsFilter,\n    )\n    from dagster._core.storage.partition_status_cache import (\n        AssetPartitionStatus,\n        AssetStatusCacheValue,\n    )\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs import RunStorage\n    from dagster._core.storage.schedules import ScheduleStorage\n    from dagster._core.storage.sql import AlembicVersion\n    from dagster._core.workspace.workspace import IWorkspace\n    from dagster._daemon.types import DaemonHeartbeat, DaemonStatus\n\nDagsterInstanceOverrides: TypeAlias = Mapping[str, Any]\n\n\ndef 
_check_run_equality(\n    pipeline_run: DagsterRun, candidate_run: DagsterRun\n) -> Mapping[str, Tuple[Any, Any]]:\n    field_diff: Dict[str, Tuple[Any, Any]] = {}\n    for field in pipeline_run._fields:\n        expected_value = getattr(pipeline_run, field)\n        candidate_value = getattr(candidate_run, field)\n        if expected_value != candidate_value:\n            field_diff[field] = (expected_value, candidate_value)\n\n    return field_diff\n\n\ndef _format_field_diff(field_diff: Mapping[str, Tuple[Any, Any]]) -> str:\n    return "\\n".join(\n        [\n            (\n                "    {field_name}:\\n"\n                + "        Expected: {expected_value}\\n"\n                + "        Received: {candidate_value}"\n            ).format(\n                field_name=field_name,\n                expected_value=expected_value,\n                candidate_value=candidate_value,\n            )\n            for field_name, (\n                expected_value,\n                candidate_value,\n            ) in field_diff.items()\n        ]\n    )\n\n\nclass _EventListenerLogHandler(logging.Handler):\n    def __init__(self, instance: "DagsterInstance"):\n        self._instance = instance\n        super(_EventListenerLogHandler, self).__init__()\n\n    def emit(self, record: DagsterLogRecord) -> None:\n        from dagster._core.events import EngineEventData\n        from dagster._core.events.log import StructuredLoggerMessage, construct_event_record\n\n        event = construct_event_record(\n            StructuredLoggerMessage(\n                name=record.name,\n                message=record.msg,\n                level=record.levelno,\n                meta=record.dagster_meta,  # type: ignore\n                record=record,\n            )\n        )\n\n        try:\n            self._instance.handle_new_event(event)\n        except Exception as e:\n            sys.stderr.write(f"Exception while writing logger call to event log: {e}\\n")\n            if 
event.dagster_event:\n                # Swallow user-generated log failures so that the entire step/run doesn't fail, but\n                # raise failures writing system-generated log events since they are the source of\n                # truth for the state of the run\n                raise\n            elif event.run_id:\n                self._instance.report_engine_event(\n                    "Exception while writing logger call to event log",\n                    job_name=event.job_name,\n                    run_id=event.run_id,\n                    step_key=event.step_key,\n                    engine_event_data=EngineEventData(\n                        error=serializable_error_info_from_exc_info(sys.exc_info()),\n                    ),\n                )\n\n\nclass InstanceType(Enum):\n    PERSISTENT = "PERSISTENT"\n    EPHEMERAL = "EPHEMERAL"\n\n\nT_DagsterInstance = TypeVar("T_DagsterInstance", bound="DagsterInstance", default="DagsterInstance")\n\n\nclass MayHaveInstanceWeakref(Generic[T_DagsterInstance]):\n    """Mixin for classes that can have a weakref back to a Dagster instance."""\n\n    _instance_weakref: "Optional[weakref.ReferenceType[T_DagsterInstance]]"\n\n    def __init__(self):\n        self._instance_weakref = None\n\n    @property\n    def has_instance(self) -> bool:\n        return hasattr(self, "_instance_weakref") and (self._instance_weakref is not None)\n\n    @property\n    def _instance(self) -> T_DagsterInstance:\n        instance = (\n            self._instance_weakref()\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            if (hasattr(self, "_instance_weakref") and self._instance_weakref is not None)\n            else None\n        )\n        if instance is None:\n            raise DagsterInvariantViolationError(\n                "Attempted to resolve undefined DagsterInstance weakref."\n            )\n        else:\n            return 
instance\n\n    def register_instance(self, instance: T_DagsterInstance) -> None:\n        check.invariant(\n            # Backcompat with custom subclasses that don't call super().__init__()\n            # in their own __init__ implementations\n            (not hasattr(self, "_instance_weakref") or self._instance_weakref is None),\n            "Must only call initialize once",\n        )\n\n        # Store a weakref to avoid a circular reference / enable GC\n        self._instance_weakref = weakref.ref(instance)\n\n\n@runtime_checkable\nclass DynamicPartitionsStore(Protocol):\n    @abstractmethod\n    def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]: ...\n\n    @abstractmethod\n    def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool: ...\n\n\n
[docs]class DagsterInstance(DynamicPartitionsStore):\n """Core abstraction for managing Dagster's access to storage and other resources.\n\n Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\n the values in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Alternatively, DagsterInstance.ephemeral() can use used which provides a set of\n transient in-memory components.\n\n Configuration of this class should be done by setting values in ``$DAGSTER_HOME/dagster.yaml``.\n For example, to use Postgres for dagster storage, you can write a ``dagster.yaml`` such as the\n following:\n\n .. literalinclude:: ../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :language: YAML\n\n Args:\n instance_type (InstanceType): Indicates whether the instance is ephemeral or persistent.\n Users should not attempt to set this value directly or in their ``dagster.yaml`` files.\n local_artifact_storage (LocalArtifactStorage): The local artifact storage is used to\n configure storage for any artifacts that require a local disk, such as schedules, or\n when using the filesystem system storage to manage files and intermediates. By default,\n this will be a :py:class:`dagster._core.storage.root.LocalArtifactStorage`. Configurable\n in ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass`\n machinery.\n run_storage (RunStorage): The run storage is used to store metadata about ongoing and past\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.runs.SqliteRunStorage`. Configurable in ``dagster.yaml``\n using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n event_storage (EventLogStorage): Used to store the structured event logs generated by\n pipeline runs. By default, this will be a\n :py:class:`dagster._core.storage.event_log.SqliteEventLogStorage`. 
Configurable in\n ``dagster.yaml`` using the :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n compute_log_manager (Optional[ComputeLogManager]): The compute log manager handles stdout\n and stderr logging for op compute functions. By default, this will be a\n :py:class:`dagster._core.storage.local_compute_log_manager.LocalComputeLogManager`.\n Configurable in ``dagster.yaml`` using the\n :py:class:`~dagster.serdes.ConfigurableClass` machinery.\n run_coordinator (Optional[RunCoordinator]): A runs coordinator may be used to manage the execution\n of pipeline runs.\n run_launcher (Optional[RunLauncher]): Optionally, a run launcher may be used to enable\n a Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\n addition to running them locally.\n settings (Optional[Dict]): Specifies certain per-instance settings,\n such as feature flags. These are set in the ``dagster.yaml`` under a set of whitelisted\n keys.\n ref (Optional[InstanceRef]): Used by internal machinery to pass instances across process\n boundaries.\n """\n\n # Stores TemporaryDirectory instances that were created for DagsterInstance.local_temp() calls\n # to be removed once the instance is garbage collected.\n _TEMP_DIRS: "weakref.WeakKeyDictionary[DagsterInstance, TemporaryDirectory]" = (\n weakref.WeakKeyDictionary()\n )\n\n def __init__(\n self,\n instance_type: InstanceType,\n local_artifact_storage: "LocalArtifactStorage",\n run_storage: "RunStorage",\n event_storage: "EventLogStorage",\n run_coordinator: Optional["RunCoordinator"],\n compute_log_manager: Optional["ComputeLogManager"],\n run_launcher: Optional["RunLauncher"],\n scheduler: Optional["Scheduler"] = None,\n schedule_storage: Optional["ScheduleStorage"] = None,\n settings: Optional[Mapping[str, Any]] = None,\n secrets_loader: Optional["SecretsLoader"] = None,\n ref: Optional[InstanceRef] = None,\n **_kwargs: Any, # we accept kwargs for forward-compat of custom instances\n ):\n from 
dagster._core.launcher import RunLauncher\n from dagster._core.run_coordinator import RunCoordinator\n from dagster._core.scheduler import Scheduler\n from dagster._core.secrets import SecretsLoader\n from dagster._core.storage.captured_log_manager import CapturedLogManager\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n from dagster._core.storage.event_log import EventLogStorage\n from dagster._core.storage.root import LocalArtifactStorage\n from dagster._core.storage.runs import RunStorage\n from dagster._core.storage.schedules import ScheduleStorage\n\n self._instance_type = check.inst_param(instance_type, "instance_type", InstanceType)\n self._local_artifact_storage = check.inst_param(\n local_artifact_storage, "local_artifact_storage", LocalArtifactStorage\n )\n self._event_storage = check.inst_param(event_storage, "event_storage", EventLogStorage)\n self._event_storage.register_instance(self)\n\n self._run_storage = check.inst_param(run_storage, "run_storage", RunStorage)\n self._run_storage.register_instance(self)\n\n if compute_log_manager:\n self._compute_log_manager = check.inst_param(\n compute_log_manager, "compute_log_manager", ComputeLogManager\n )\n if not isinstance(self._compute_log_manager, CapturedLogManager):\n deprecation_warning(\n "ComputeLogManager",\n "1.2.0",\n "Implement the CapturedLogManager interface instead.",\n )\n self._compute_log_manager.register_instance(self)\n else:\n check.invariant(\n ref, "Compute log manager must be provided if instance is not from a ref"\n )\n self._compute_log_manager = None\n\n self._scheduler = check.opt_inst_param(scheduler, "scheduler", Scheduler)\n\n self._schedule_storage = check.opt_inst_param(\n schedule_storage, "schedule_storage", ScheduleStorage\n )\n if self._schedule_storage:\n self._schedule_storage.register_instance(self)\n\n if run_coordinator:\n self._run_coordinator = check.inst_param(\n run_coordinator, "run_coordinator", RunCoordinator\n )\n 
self._run_coordinator.register_instance(self)\n else:\n check.invariant(ref, "Run coordinator must be provided if instance is not from a ref")\n self._run_coordinator = None\n\n if run_launcher:\n self._run_launcher: Optional[RunLauncher] = check.inst_param(\n run_launcher, "run_launcher", RunLauncher\n )\n run_launcher.register_instance(self)\n else:\n check.invariant(ref, "Run launcher must be provided if instance is not from a ref")\n self._run_launcher = None\n\n self._settings = check.opt_mapping_param(settings, "settings")\n\n self._secrets_loader = check.opt_inst_param(secrets_loader, "secrets_loader", SecretsLoader)\n\n if self._secrets_loader:\n self._secrets_loader.register_instance(self)\n\n self._ref = check.opt_inst_param(ref, "ref", InstanceRef)\n\n self._subscribers: Dict[str, List[Callable]] = defaultdict(list)\n\n run_monitoring_enabled = self.run_monitoring_settings.get("enabled", False)\n self._run_monitoring_enabled = run_monitoring_enabled\n if self.run_monitoring_enabled and self.run_monitoring_max_resume_run_attempts:\n check.invariant(\n self.run_launcher.supports_resume_run,\n "The configured run launcher does not support resuming runs. Set"\n " max_resume_run_attempts to 0 to use run monitoring. Any runs with a failed"\n " run worker will be marked as failed, but will not be resumed.",\n )\n\n if self.run_retries_enabled:\n check.invariant(\n self.event_log_storage.supports_event_consumer_queries(),\n "Run retries are enabled, but the configured event log storage does not support"\n " them. Consider switching to Postgres or Mysql.",\n )\n\n # ctors\n\n
[docs] @public\n @staticmethod\n def ephemeral(\n tempdir: Optional[str] = None,\n preload: Optional[Sequence["DebugRunPayload"]] = None,\n settings: Optional[Dict] = None,\n ) -> "DagsterInstance":\n """Create a `DagsterInstance` suitable for ephemeral execution, useful in test contexts. An\n ephemeral instance uses mostly in-memory components. Use `local_temp` to create a test\n instance that is fully persistent.\n\n Args:\n tempdir (Optional[str]): The path of a directory to be used for local artifact storage.\n preload (Optional[Sequence[DebugRunPayload]]): A sequence of payloads to load into the\n instance's run storage. Useful for debugging.\n settings (Optional[Dict]): Settings for the instance.\n\n Returns:\n DagsterInstance: An ephemeral DagsterInstance.\n """\n from dagster._core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher\n from dagster._core.run_coordinator import DefaultRunCoordinator\n from dagster._core.storage.event_log import InMemoryEventLogStorage\n from dagster._core.storage.noop_compute_log_manager import NoOpComputeLogManager\n from dagster._core.storage.root import LocalArtifactStorage, TemporaryLocalArtifactStorage\n from dagster._core.storage.runs import InMemoryRunStorage\n\n if tempdir is not None:\n local_storage = LocalArtifactStorage(tempdir)\n else:\n local_storage = TemporaryLocalArtifactStorage()\n\n return DagsterInstance(\n instance_type=InstanceType.EPHEMERAL,\n local_artifact_storage=local_storage,\n run_storage=InMemoryRunStorage(preload=preload),\n event_storage=InMemoryEventLogStorage(preload=preload),\n compute_log_manager=NoOpComputeLogManager(),\n run_coordinator=DefaultRunCoordinator(),\n run_launcher=SyncInMemoryRunLauncher(),\n settings=settings,\n )
\n\n
[docs] @public\n @staticmethod\n def get() -> "DagsterInstance":\n """Get the current `DagsterInstance` as specified by the ``DAGSTER_HOME`` environment variable.\n\n Returns:\n DagsterInstance: The current DagsterInstance.\n """\n dagster_home_path = os.getenv("DAGSTER_HOME")\n\n if not dagster_home_path:\n raise DagsterHomeNotSetError(\n "The environment variable $DAGSTER_HOME is not set. \\nDagster requires this"\n " environment variable to be set to an existing directory in your filesystem. This"\n " directory is used to store metadata across sessions, or load the dagster.yaml"\n " file which can configure storing metadata in an external database.\\nYou can"\n " resolve this error by exporting the environment variable. For example, you can"\n " run the following command in your shell or include it in your shell configuration"\n ' file:\\n\\texport DAGSTER_HOME=~"/dagster_home"\\nor PowerShell\\n$env:DAGSTER_HOME'\n " = ($home + '\\\\dagster_home')or batchset"\n " DAGSTER_HOME=%UserProfile%/dagster_homeAlternatively, DagsterInstance.ephemeral()"\n " can be used for a transient instance.\\n"\n )\n\n dagster_home_path = os.path.expanduser(dagster_home_path)\n\n if not os.path.isabs(dagster_home_path):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" must be an absolute path. Dagster requires this '\n "environment variable to be set to an existing directory in your filesystem."\n ).format(dagster_home_path)\n )\n\n if not (os.path.exists(dagster_home_path) and os.path.isdir(dagster_home_path)):\n raise DagsterInvariantViolationError(\n (\n '$DAGSTER_HOME "{}" is not a directory or does not exist. Dagster requires this'\n " environment variable to be set to an existing directory in your filesystem"\n ).format(dagster_home_path)\n )\n\n return DagsterInstance.from_config(dagster_home_path)
\n\n
@public
@staticmethod
def local_temp(
    tempdir: Optional[str] = None,
    overrides: Optional[DagsterInstanceOverrides] = None,
) -> "DagsterInstance":
    """Create a regular, fully persistent DagsterInstance backed by a temporary directory.

    Use `ephemeral` instead to get an instance with in-memory components.

    Args:
        tempdir (Optional[str]): The path of a directory to be used for local artifact
            storage. When omitted, a new ``TemporaryDirectory`` is created and kept in
            ``DagsterInstance._TEMP_DIRS`` keyed by the returned instance.
        overrides (Optional[DagsterInstanceOverrides]): Override settings for the instance.

    Returns:
        DagsterInstance
    """
    if tempdir is not None:
        return DagsterInstance.from_ref(InstanceRef.from_dir(tempdir, overrides=overrides))

    temp_dir_handle = TemporaryDirectory()
    instance = DagsterInstance.from_ref(
        InstanceRef.from_dir(temp_dir_handle.name, overrides=overrides)
    )
    # Hold on to the handle so the directory outlives this call.
    DagsterInstance._TEMP_DIRS[instance] = temp_dir_handle
    return instance
@staticmethod
def from_config(
    config_dir: str,
    config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,
) -> "DagsterInstance":
    """Load an instance from the config file `config_filename` inside `config_dir`."""
    instance_ref = InstanceRef.from_dir(config_dir, config_filename=config_filename)
    return DagsterInstance.from_ref(instance_ref)

@staticmethod
def from_ref(instance_ref: InstanceRef) -> "DagsterInstance":
    """Construct a persistent instance (or configured custom subclass) from an `InstanceRef`."""
    check.inst_param(instance_ref, "instance_ref", InstanceRef)

    # DagsterInstance doesn't implement ConfigurableClass, but we may still sometimes want to
    # have custom subclasses of DagsterInstance. This machinery allows for those custom
    # subclasses to receive additional keyword arguments passed through the config YAML.
    klass = instance_ref.custom_instance_class or DagsterInstance
    kwargs = instance_ref.custom_instance_class_config

    # A unified storage, when configured, takes precedence over the individual storages.
    unified_storage = instance_ref.storage
    run_storage = unified_storage.run_storage if unified_storage else instance_ref.run_storage
    event_storage = (
        unified_storage.event_log_storage if unified_storage else instance_ref.event_storage
    )
    schedule_storage = (
        unified_storage.schedule_storage if unified_storage else instance_ref.schedule_storage
    )

    return klass(
        instance_type=InstanceType.PERSISTENT,
        local_artifact_storage=instance_ref.local_artifact_storage,
        run_storage=run_storage,  # type: ignore  # (possible none)
        event_storage=event_storage,  # type: ignore  # (possible none)
        schedule_storage=schedule_storage,
        compute_log_manager=None,  # lazy load
        scheduler=instance_ref.scheduler,
        run_coordinator=None,  # lazy load
        run_launcher=None,  # lazy load
        settings=instance_ref.settings,
        secrets_loader=instance_ref.secrets_loader,
        ref=instance_ref,
        **kwargs,
    )

# flags

@property
def is_persistent(self) -> bool:
    """Whether this instance's type is ``InstanceType.PERSISTENT``."""
    return self._instance_type == InstanceType.PERSISTENT

@property
def is_ephemeral(self) -> bool:
    """Whether this instance's type is ``InstanceType.EPHEMERAL``."""
    return self._instance_type == InstanceType.EPHEMERAL

def get_ref(self) -> InstanceRef:
    """Return the `InstanceRef` this instance was built from; fail the check if there is none."""
    if self._ref:
        return self._ref

    check.failed(
        "Attempted to prepare an ineligible DagsterInstance ({inst_type}) for cross "
        "process communication.{dagster_home_msg}".format(
            inst_type=self._instance_type,
            dagster_home_msg=(
                "\nDAGSTER_HOME environment variable is not set, set it to "
                "a directory on the filesystem for dagster to use for storage and cross "
                "process coordination."
                if os.getenv("DAGSTER_HOME") is None
                else ""
            ),
        )
    )

@property
def root_directory(self) -> str:
    """Base directory of the local artifact storage."""
    return self._local_artifact_storage.base_dir

def _info(self, component: object) -> Union[str, Mapping[Any, Any]]:
    """Describe a component: its config info dict, the dict itself, or its class name."""
    # ConfigurableClass may not have inst_data if it's a direct instantiation
    # which happens for ephemeral instances
    if isinstance(component, ConfigurableClass) and component.inst_data:
        return component.inst_data.info_dict()
    if type(component) is dict:
        return component
    return component.__class__.__name__

def _info_str_for_component(self, component_name: str, component: object) -> str:
    """Render the description of a single component as YAML."""
    return yaml.dump(
        {component_name: self._info(component)}, default_flow_style=False, sort_keys=False
    )

def info_dict(self) -> Mapping[str, object]:
    """Describe every component of the instance, followed by one entry per settings key."""
    settings: Mapping[str, object] = self._settings if self._settings else {}

    ret = {
        "local_artifact_storage": self._info(self._local_artifact_storage),
        "run_storage": self._info(self._run_storage),
        "event_log_storage": self._info(self._event_storage),
        "compute_logs": self._info(self._compute_log_manager),
        "schedule_storage": self._info(self._schedule_storage),
        "scheduler": self._info(self._scheduler),
        "run_coordinator": self._info(self._run_coordinator),
        "run_launcher": self._info(self.run_launcher),
    }
    ret.update(
        {
            settings_key: self._info(settings_value)
            for settings_key, settings_value in settings.items()
        }
    )

    return ret

def info_str(self) -> str:
    """Render `info_dict` as YAML."""
    return yaml.dump(self.info_dict(), default_flow_style=False, sort_keys=False)

def schema_str(self) -> str:
    """Render the current/latest alembic revision of each storage as YAML."""

    def _schema_dict(alembic_version: "AlembicVersion") -> Optional[Mapping[str, object]]:
        if not alembic_version:
            return None
        db_revision, head_revision = alembic_version
        return {
            "current": db_revision,
            "latest": head_revision,
        }

    # Each entry must come from its own storage; previously all three reported the
    # event log storage's revision. Schedule storage is optional, so tolerate None.
    schedule_storage_version = (
        self._schedule_storage.alembic_version() if self._schedule_storage else None
    )

    return yaml.dump(
        {
            "schema": {
                "event_log_storage": _schema_dict(self._event_storage.alembic_version()),  # type: ignore  # (possible none)
                "run_storage": _schema_dict(self._run_storage.alembic_version()),  # type: ignore  # (possible none)
                "schedule_storage": _schema_dict(schedule_storage_version),  # type: ignore  # (possible none)
            }
        },
        default_flow_style=False,
        sort_keys=False,
    )

@property
def run_storage(self) -> "RunStorage":
    return self._run_storage

@property
def event_log_storage(self) -> "EventLogStorage":
    return self._event_storage

@property
def daemon_cursor_storage(self) -> "DaemonCursorStorage":
    # The run storage doubles as the daemon cursor storage.
    return self._run_storage

# schedule storage

@property
def schedule_storage(self) -> Optional["ScheduleStorage"]:
    return self._schedule_storage

@property
def scheduler(self) -> Optional["Scheduler"]:
    return self._scheduler

@property
def scheduler_class(self) -> Optional[str]:
    """Class name of the configured scheduler, or None when there is no scheduler."""
    return self.scheduler.__class__.__name__ if self.scheduler else None

# run coordinator

@property
def run_coordinator(self) -> "RunCoordinator":
    """The run coordinator, loaded from the instance ref on first access if needed."""
    # Lazily load in case the run coordinator requires dependencies that are not available
    # everywhere that loads the instance
    if not self._run_coordinator:
        check.invariant(
            self._ref, "Run coordinator not provided, and no instance ref available"
        )
        run_coordinator = cast(InstanceRef, self._ref).run_coordinator
        check.invariant(run_coordinator, "Run coordinator not configured in instance ref")
        self._run_coordinator = cast("RunCoordinator", run_coordinator)
        self._run_coordinator.register_instance(self)
    return self._run_coordinator

# run launcher

@property
def run_launcher(self) -> "RunLauncher":
    """The run launcher, loaded from the instance ref on first access if needed."""
    # Lazily load in case the launcher requires dependencies that are not available everywhere
    # that loads the instance (e.g. The EcsRunLauncher requires boto3)
    if not self._run_launcher:
        check.invariant(self._ref, "Run launcher not provided, and no instance ref available")
        launcher = cast(InstanceRef, self._ref).run_launcher
        check.invariant(launcher, "Run launcher not configured in instance ref")
        self._run_launcher = cast("RunLauncher", launcher)
        self._run_launcher.register_instance(self)
    return self._run_launcher

# compute logs

@property
def compute_log_manager(self) -> "ComputeLogManager":
    """The compute log manager, loaded from the instance ref on first access if needed."""
    if not self._compute_log_manager:
        check.invariant(
            self._ref, "Compute log manager not provided, and no instance ref available"
        )
        compute_log_manager = cast(InstanceRef, self._ref).compute_log_manager
        check.invariant(
            compute_log_manager, "Compute log manager not configured in instance ref"
        )
        self._compute_log_manager = cast("ComputeLogManager", compute_log_manager)
        self._compute_log_manager.register_instance(self)
    return self._compute_log_manager

def get_settings(self, settings_key: str) -> Any:
    """Return the settings stored under `settings_key`, or an empty dict if absent."""
    check.str_param(settings_key, "settings_key")
    if self._settings and settings_key in self._settings:
        return self._settings.get(settings_key)
    return {}

@property
def telemetry_enabled(self) -> bool:
    """False for ephemeral instances; otherwise the ``telemetry.enabled`` setting (default True)."""
    if self.is_ephemeral:
        return False

    dagster_telemetry_enabled_default = True

    telemetry_settings = self.get_settings("telemetry")

    if not telemetry_settings:
        return dagster_telemetry_enabled_default

    if "enabled" in telemetry_settings:
        return telemetry_settings["enabled"]
    else:
        return dagster_telemetry_enabled_default

@property
def nux_enabled(self) -> bool:
    """False for ephemeral instances; otherwise the ``nux.enabled`` setting (default True)."""
    if self.is_ephemeral:
        return False

    nux_enabled_by_default = True

    nux_settings = self.get_settings("nux")
    if not nux_settings:
        return nux_enabled_by_default

    if "enabled" in nux_settings:
        return nux_settings["enabled"]
    else:
        return nux_enabled_by_default

# run monitoring

@property
def run_monitoring_enabled(self) -> bool:
    return self._run_monitoring_enabled

@property
def run_monitoring_settings(self) -> Any:
    return self.get_settings("run_monitoring")

@property
def run_monitoring_start_timeout_seconds(self) -> int:
    return self.run_monitoring_settings.get("start_timeout_seconds", 180)

@property
def run_monitoring_cancel_timeout_seconds(self) -> int:
    return self.run_monitoring_settings.get("cancel_timeout_seconds", 180)

@property
def code_server_settings(self) -> Any:
    return self.get_settings("code_servers")

@property
def code_server_process_startup_timeout(self) -> int:
    return self.code_server_settings.get(
        "local_startup_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT
    )

@property
def code_server_reload_timeout(self) -> int:
    return self.code_server_settings.get(
        "reload_timeout", DEFAULT_LOCAL_CODE_SERVER_STARTUP_TIMEOUT
    )

@property
def wait_for_local_code_server_processes_on_shutdown(self) -> bool:
    return self.code_server_settings.get("wait_for_local_processes_on_shutdown", False)

@property
def run_monitoring_max_resume_run_attempts(self) -> int:
    return self.run_monitoring_settings.get("max_resume_run_attempts", 0)

@property
def run_monitoring_poll_interval_seconds(self) -> int:
    return self.run_monitoring_settings.get("poll_interval_seconds", 120)

@property
def cancellation_thread_poll_interval_seconds(self) -> int:
    return self.get_settings("run_monitoring").get(
        "cancellation_thread_poll_interval_seconds", 10
    )

@property
def run_retries_enabled(self) -> bool:
    return self.get_settings("run_retries").get("enabled", False)

@property
def run_retries_max_retries(self) -> int:
    return self.get_settings("run_retries").get("max_retries")

@property
def auto_materialize_enabled(self) -> bool:
    return self.get_settings("auto_materialize").get("enabled", True)

@property
def auto_materialize_minimum_interval_seconds(self) -> int:
    return self.get_settings("auto_materialize").get("minimum_interval_seconds")

@property
def auto_materialize_run_tags(self) -> Dict[str, str]:
    return self.get_settings("auto_materialize").get("run_tags", {})

@property
def auto_materialize_respect_materialization_data_versions(self) -> bool:
    return self.get_settings("auto_materialize").get(
        "respect_materialization_data_versions", False
    )

# python logs

@property
def managed_python_loggers(self) -> Sequence[str]:
    python_log_settings = self.get_settings("python_logs") or {}
    loggers: Sequence[str] = python_log_settings.get("managed_python_loggers", [])
    return loggers

@property
def python_log_level(self) -> Optional[str]:
    python_log_settings = self.get_settings("python_logs") or {}
    return python_log_settings.get("python_log_level")

def upgrade(self, print_fn: Optional[PrintFn] = None) -> None:
    """Upgrade and migrate the run, event and (when configured) schedule storages.

    Args:
        print_fn (Optional[PrintFn]): Optional callback used to report progress.
    """
    from dagster._core.storage.migration.utils import upgrading_instance

    with upgrading_instance(self):
        if print_fn:
            print_fn("Updating run storage...")
        self._run_storage.upgrade()  # type: ignore  # (unknown method on run storage)
        self._run_storage.migrate(print_fn)

        if print_fn:
            print_fn("Updating event storage...")
        self._event_storage.upgrade()
        self._event_storage.reindex_assets(print_fn=print_fn)

        # Schedule storage is optional; skip it rather than fail on None.
        if self._schedule_storage:
            if print_fn:
                print_fn("Updating schedule storage...")
            self._schedule_storage.upgrade()
            self._schedule_storage.migrate(print_fn)

def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:
    """Forward the webserver tuning parameters to each configured storage."""
    if self._schedule_storage:
        self._schedule_storage.optimize_for_webserver(
            statement_timeout=statement_timeout, pool_recycle=pool_recycle
        )
    self._run_storage.optimize_for_webserver(
        statement_timeout=statement_timeout, pool_recycle=pool_recycle
    )
    self._event_storage.optimize_for_webserver(
        statement_timeout=statement_timeout, pool_recycle=pool_recycle
    )

def reindex(self, print_fn: PrintFn = lambda _: None) -> None:
    """Reindex the event storage and optimize the run and (when configured) schedule storages."""
    print_fn("Checking for reindexing...")
    self._event_storage.reindex_events(print_fn)
    self._event_storage.reindex_assets(print_fn)
    self._run_storage.optimize(print_fn)
    # Schedule storage is optional; skip it rather than fail on None.
    if self._schedule_storage:
        self._schedule_storage.optimize(print_fn)
    print_fn("Done.")

def dispose(self) -> None:
    """Dispose of the instance's components and clean up its temporary directory, if any."""
    self._local_artifact_storage.dispose()
    self._run_storage.dispose()
    if self._run_coordinator:
        self._run_coordinator.dispose()
    if self._run_launcher:
        self._run_launcher.dispose()
    self._event_storage.dispose()
    if self._compute_log_manager:
        self._compute_log_manager.dispose()
    if self._secrets_loader:
        self._secrets_loader.dispose()

    if self in DagsterInstance._TEMP_DIRS:
        DagsterInstance._TEMP_DIRS[self].cleanup()
        del DagsterInstance._TEMP_DIRS[self]

# run storage
@public
def get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:
    """Look up the :py:class:`DagsterRun` with the given `run_id`.

    Args:
        run_id (str): The id of the run to retrieve.

    Returns:
        Optional[DagsterRun]: The run held by the matching run record, or `None` when
            no run record matches the id.
    """
    run_record = self.get_run_record_by_id(run_id)
    return None if run_record is None else run_record.dagster_run
\n\n
@public
@traced
def get_run_record_by_id(self, run_id: str) -> Optional[RunRecord]:
    """Look up the :py:class:`RunRecord` with the given `run_id`.

    Args:
        run_id (str): The id of the run record to retrieve.

    Returns:
        Optional[RunRecord]: The first record the run storage returns for a filter on
            this id, or `None` when it returns no records.
    """
    matching_records = self._run_storage.get_run_records(RunsFilter(run_ids=[run_id]))
    return matching_records[0] if matching_records else None
\n\n @traced\n def get_job_snapshot(self, snapshot_id: str) -> "JobSnapshot":\n return self._run_storage.get_job_snapshot(snapshot_id)\n\n @traced\n def has_job_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def has_snapshot(self, snapshot_id: str) -> bool:\n return self._run_storage.has_snapshot(snapshot_id)\n\n @traced\n def get_historical_job(self, snapshot_id: str) -> "HistoricalJob":\n from dagster._core.host_representation import HistoricalJob\n\n snapshot = self._run_storage.get_job_snapshot(snapshot_id)\n parent_snapshot = (\n self._run_storage.get_job_snapshot(snapshot.lineage_snapshot.parent_snapshot_id)\n if snapshot.lineage_snapshot\n else None\n )\n return HistoricalJob(snapshot, snapshot_id, parent_snapshot)\n\n @traced\n def has_historical_job(self, snapshot_id: str) -> bool:\n return self._run_storage.has_job_snapshot(snapshot_id)\n\n @traced\n def get_execution_plan_snapshot(self, snapshot_id: str) -> "ExecutionPlanSnapshot":\n return self._run_storage.get_execution_plan_snapshot(snapshot_id)\n\n @traced\n def get_run_stats(self, run_id: str) -> DagsterRunStatsSnapshot:\n return self._event_storage.get_stats_for_run(run_id)\n\n @traced\n def get_run_step_stats(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence["RunStepKeyStatsSnapshot"]:\n return self._event_storage.get_step_stats_for_run(run_id, step_keys)\n\n @traced\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n return self._run_storage.get_run_tags(\n tag_keys=tag_keys, value_prefix=value_prefix, limit=limit\n )\n\n @traced\n def get_run_tag_keys(self) -> Sequence[str]:\n return self._run_storage.get_run_tag_keys()\n\n @traced\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n return self._run_storage.get_run_group(run_id)\n\n def 
create_run_for_job(\n self,\n job_def: "JobDefinition",\n execution_plan: Optional["ExecutionPlan"] = None,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n status: Optional[Union[DagsterRunStatus, str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n op_selection: Optional[Sequence[str]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n repository_load_data: Optional["RepositoryLoadData"] = None,\n ) -> DagsterRun:\n from dagster._core.definitions.job_definition import JobDefinition\n from dagster._core.execution.api import create_execution_plan\n from dagster._core.execution.plan.plan import ExecutionPlan\n from dagster._core.snap import snapshot_from_execution_plan\n\n check.inst_param(job_def, "pipeline_def", JobDefinition)\n check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)\n\n # note that op_selection is required to execute the solid subset, which is the\n # frozenset version of the previous solid_subset.\n # op_selection is not required and will not be converted to op_selection here.\n # i.e. 
this function doesn't handle solid queries.\n # op_selection is only used to pass the user queries further down.\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_list_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n\n # op_selection never provided\n if asset_selection or op_selection:\n # for cases when `create_run_for_pipeline` is directly called\n job_def = job_def.get_subset(\n asset_selection=asset_selection,\n op_selection=op_selection,\n )\n step_keys_to_execute = None\n\n if execution_plan:\n step_keys_to_execute = execution_plan.step_keys_to_execute\n\n else:\n execution_plan = create_execution_plan(\n job=job_def,\n run_config=run_config,\n instance_ref=self.get_ref() if self.is_persistent else None,\n tags=tags,\n repository_load_data=repository_load_data,\n )\n\n return self.create_run(\n job_name=job_def.name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=None,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus(status) if status else None,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_def.get_job_snapshot(),\n execution_plan_snapshot=snapshot_from_execution_plan(\n execution_plan,\n job_def.get_job_snapshot_id(),\n ),\n parent_job_snapshot=job_def.get_parent_job_snapshot(),\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n def _construct_run_with_snapshots(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n status: Optional[DagsterRunStatus],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: 
Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]] = None,\n op_selection: Optional[Sequence[str]] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # https://github.com/dagster-io/dagster/issues/2403\n if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:\n if AIRFLOW_EXECUTION_DATE_STR not in tags:\n tags = {\n **tags,\n AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat(),\n }\n\n check.invariant(\n not (not job_snapshot and execution_plan_snapshot),\n "It is illegal to have an execution plan snapshot and not have a pipeline snapshot."\n " It is possible to have no execution plan snapshot since we persist runs that do"\n " not successfully compile execution plans in the scheduled case.",\n )\n\n job_snapshot_id = (\n self._ensure_persisted_job_snapshot(job_snapshot, parent_job_snapshot)\n if job_snapshot\n else None\n )\n\n execution_plan_snapshot_id = (\n self._ensure_persisted_execution_plan_snapshot(\n execution_plan_snapshot, job_snapshot_id, step_keys_to_execute\n )\n if execution_plan_snapshot and job_snapshot_id\n else None\n )\n\n return DagsterRun(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot_id=job_snapshot_id,\n execution_plan_snapshot_id=execution_plan_snapshot_id,\n external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n has_repository_load_data=execution_plan_snapshot is not None\n and 
execution_plan_snapshot.repository_load_data is not None,\n )\n\n def _ensure_persisted_job_snapshot(\n self,\n job_snapshot: "JobSnapshot",\n parent_job_snapshot: "Optional[JobSnapshot]",\n ) -> str:\n from dagster._core.snap import JobSnapshot, create_job_snapshot_id\n\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if job_snapshot.lineage_snapshot:\n if not self._run_storage.has_job_snapshot(\n job_snapshot.lineage_snapshot.parent_snapshot_id\n ):\n check.invariant(\n create_job_snapshot_id(parent_job_snapshot) # type: ignore # (possible none)\n == job_snapshot.lineage_snapshot.parent_snapshot_id,\n "Parent pipeline snapshot id out of sync with passed parent pipeline snapshot",\n )\n\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(\n parent_job_snapshot # type: ignore # (possible none)\n )\n check.invariant(\n job_snapshot.lineage_snapshot.parent_snapshot_id == returned_job_snapshot_id\n )\n\n job_snapshot_id = create_job_snapshot_id(job_snapshot)\n if not self._run_storage.has_job_snapshot(job_snapshot_id):\n returned_job_snapshot_id = self._run_storage.add_job_snapshot(job_snapshot)\n check.invariant(job_snapshot_id == returned_job_snapshot_id)\n\n return job_snapshot_id\n\n def _ensure_persisted_execution_plan_snapshot(\n self,\n execution_plan_snapshot: "ExecutionPlanSnapshot",\n job_snapshot_id: str,\n step_keys_to_execute: Optional[Sequence[str]],\n ) -> str:\n from dagster._core.snap.execution_plan_snapshot import (\n ExecutionPlanSnapshot,\n create_execution_plan_snapshot_id,\n )\n\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.str_param(job_snapshot_id, "job_snapshot_id")\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n check.invariant(\n execution_plan_snapshot.job_snapshot_id == job_snapshot_id,\n "Snapshot mismatch: Snapshot ID in 
execution plan snapshot is "\n f'"{execution_plan_snapshot.job_snapshot_id}" and snapshot_id created in memory is '\n f'"{job_snapshot_id}"',\n )\n\n execution_plan_snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):\n returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(\n execution_plan_snapshot\n )\n\n check.invariant(execution_plan_snapshot_id == returned_execution_plan_snapshot_id)\n\n return execution_plan_snapshot_id\n\n def _log_asset_planned_events(\n self, dagster_run: DagsterRun, execution_plan_snapshot: "ExecutionPlanSnapshot"\n ) -> None:\n from dagster._core.events import (\n AssetMaterializationPlannedData,\n DagsterEvent,\n DagsterEventType,\n )\n\n job_name = dagster_run.job_name\n\n for step in execution_plan_snapshot.steps:\n if step.key in execution_plan_snapshot.step_keys_to_execute:\n for output in step.outputs:\n asset_key = check.not_none(output.properties).asset_key\n if asset_key:\n # Logs and stores asset_materialization_planned event\n partition_tag = dagster_run.tags.get(PARTITION_NAME_TAG)\n partition_range_start, partition_range_end = dagster_run.tags.get(\n ASSET_PARTITION_RANGE_START_TAG\n ), dagster_run.tags.get(ASSET_PARTITION_RANGE_END_TAG)\n\n if partition_tag and (partition_range_start or partition_range_end):\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set along with"\n f" {PARTITION_NAME_TAG}"\n )\n\n if partition_range_start or partition_range_end:\n if not partition_range_start or not partition_range_end:\n raise DagsterInvariantViolationError(\n f"Cannot have {ASSET_PARTITION_RANGE_START_TAG} or"\n f" {ASSET_PARTITION_RANGE_END_TAG} set without the other"\n )\n\n # TODO: resolve which partitions are in the range, and emit an event for each\n\n partition = (\n partition_tag\n if 
check.not_none(output.properties).is_asset_partitioned\n else None\n )\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to materialize asset {asset_key.to_string()}"\n ),\n event_specific_data=AssetMaterializationPlannedData(\n asset_key, partition=partition\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n if check.not_none(output.properties).asset_check_key:\n asset_check_key = check.not_none(\n check.not_none(output.properties).asset_check_key\n )\n target_asset_key = asset_check_key.asset_key\n check_name = asset_check_key.name\n\n event = DagsterEvent(\n event_type_value=DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED.value,\n job_name=job_name,\n message=(\n f"{job_name} intends to execute asset check {check_name} on"\n f" asset {target_asset_key.to_string()}"\n ),\n event_specific_data=AssetCheckEvaluationPlanned(\n target_asset_key,\n check_name=check_name,\n ),\n step_key=step.key,\n )\n self.report_dagster_event(event, dagster_run.run_id, logging.DEBUG)\n\n def create_run(\n self,\n *,\n job_name: str,\n run_id: Optional[str],\n run_config: Optional[Mapping[str, object]],\n status: Optional[DagsterRunStatus],\n tags: Optional[Mapping[str, Any]],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n step_keys_to_execute: Optional[Sequence[str]],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n job_snapshot: Optional["JobSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n asset_selection: Optional[AbstractSet[AssetKey]],\n asset_check_selection: Optional[AbstractSet["AssetCheckKey"]],\n resolved_op_selection: Optional[AbstractSet[str]],\n op_selection: Optional[Sequence[str]],\n external_job_origin: Optional["ExternalJobOrigin"],\n job_code_origin: Optional[JobPythonOrigin],\n ) -> DagsterRun:\n from dagster._core.definitions.asset_check_spec import AssetCheckKey\n 
from dagster._core.definitions.utils import validate_tags\n from dagster._core.host_representation.origin import ExternalJobOrigin\n from dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\n\n check.str_param(job_name, "job_name")\n check.opt_str_param(\n run_id, "run_id"\n ) # will be assigned to make_new_run_id() lower in callstack\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.opt_inst_param(status, "status", DagsterRunStatus)\n check.opt_mapping_param(tags, "tags", key_type=str)\n\n validated_tags = validate_tags(tags)\n\n check.opt_str_param(root_run_id, "root_run_id")\n check.opt_str_param(parent_run_id, "parent_run_id")\n\n # If step_keys_to_execute is None, then everything is executed. In some cases callers\n # are still exploding and sending the full list of step keys even though that is\n # unnecessary.\n\n check.opt_sequence_param(step_keys_to_execute, "step_keys_to_execute")\n check.opt_inst_param(\n execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot\n )\n\n if root_run_id or parent_run_id:\n check.invariant(\n root_run_id and parent_run_id,\n "If root_run_id or parent_run_id is passed, this is a re-execution scenario and"\n " root_run_id and parent_run_id must both be passed.",\n )\n\n # The job_snapshot should always be set in production scenarios. In tests\n # we have sometimes omitted it out of convenience.\n\n check.opt_inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_inst_param(parent_job_snapshot, "parent_job_snapshot", JobSnapshot)\n\n if parent_job_snapshot:\n check.invariant(\n job_snapshot,\n "If parent_job_snapshot is set, job_snapshot should also be.",\n )\n\n # op_selection is a sequence of selection queries assigned by the user.\n # *Most* callers expand the op_selection into an explicit set of\n # resolved_op_selection via accessing external_job.resolved_op_selection\n # but not all do. 
Some (launch execution mutation in graphql and backfill run\n # creation, for example) actually pass the solid *selection* into the\n # resolved_op_selection parameter, but just as a frozen set, rather than\n # fully resolving the selection, as the daemon launchers do. Given the\n # state of callers we just check to ensure that the arguments are well-formed.\n #\n # asset_selection adds another dimension to this lovely dance. op_selection\n # and asset_selection are mutually exclusive and should never both be set.\n # This is invariant is checked in a sporadic fashion around\n # the codebase, but is never enforced in a typed fashion.\n #\n # Additionally, the way that callsites currently behave *if* asset selection\n # is set (i.e., not None) then *neither* op_selection *nor*\n # resolved_op_selection is passed. In the asset selection case resolving\n # the set of assets into the canonical resolved_op_selection is done in\n # the user process, and the exact resolution is never persisted in the run.\n # We are asserting that invariant here to maintain that behavior.\n #\n # Finally, asset_check_selection can be passed along with asset_selection. It\n # is mutually exclusive with op_selection and resolved_op_selection. A `None`\n # value will include any asset checks that target selected assets. 
An empty set\n # will include no asset checks.\n\n check.opt_set_param(resolved_op_selection, "resolved_op_selection", of_type=str)\n check.opt_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_set_param(asset_selection, "asset_selection", of_type=AssetKey)\n check.opt_set_param(asset_check_selection, "asset_check_selection", of_type=AssetCheckKey)\n\n if asset_selection is not None or asset_check_selection is not None:\n check.invariant(\n op_selection is None,\n "Cannot pass op_selection with either of asset_selection or asset_check_selection",\n )\n\n check.invariant(\n resolved_op_selection is None,\n "Cannot pass resolved_op_selection with either of asset_selection or"\n " asset_check_selection",\n )\n\n # The "python origin" arguments exist so a job can be reconstructed in memory\n # after a DagsterRun has been fetched from the database.\n #\n # There are cases (notably in _logged_execute_job with Reconstructable jobs)\n # where job_code_origin and is not. In some cloud test cases only\n # external_job_origin is passed But they are almost always passed together.\n # If these are not set the created run will never be able to be relaunched from\n # the information just in the run or in another process.\n\n check.opt_inst_param(external_job_origin, "external_job_origin", ExternalJobOrigin)\n check.opt_inst_param(job_code_origin, "job_code_origin", JobPythonOrigin)\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id, # type: ignore # (possible none)\n run_config=run_config,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=status,\n tags=validated_tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n 
external_job_origin=external_job_origin,\n job_code_origin=job_code_origin,\n )\n\n dagster_run = self._run_storage.add_run(dagster_run)\n\n if execution_plan_snapshot:\n self._log_asset_planned_events(dagster_run, execution_plan_snapshot)\n\n return dagster_run\n\n def create_reexecuted_run(\n self,\n *,\n parent_run: DagsterRun,\n code_location: "CodeLocation",\n external_job: "ExternalJob",\n strategy: "ReexecutionStrategy",\n extra_tags: Optional[Mapping[str, Any]] = None,\n run_config: Optional[Mapping[str, Any]] = None,\n use_parent_run_tags: bool = False,\n ) -> DagsterRun:\n from dagster._core.execution.plan.resume_retry import (\n ReexecutionStrategy,\n )\n from dagster._core.execution.plan.state import KnownExecutionState\n from dagster._core.host_representation import CodeLocation, ExternalJob\n\n check.inst_param(parent_run, "parent_run", DagsterRun)\n check.inst_param(code_location, "code_location", CodeLocation)\n check.inst_param(external_job, "external_job", ExternalJob)\n check.inst_param(strategy, "strategy", ReexecutionStrategy)\n check.opt_mapping_param(extra_tags, "extra_tags", key_type=str)\n check.opt_mapping_param(run_config, "run_config", key_type=str)\n\n check.bool_param(use_parent_run_tags, "use_parent_run_tags")\n\n root_run_id = parent_run.root_run_id or parent_run.run_id\n parent_run_id = parent_run.run_id\n\n tags = merge_dicts(\n external_job.tags,\n (\n # these can differ from external_job.tags if tags were added at launch time\n parent_run.tags\n if use_parent_run_tags\n else {}\n ),\n extra_tags or {},\n {\n PARENT_RUN_ID_TAG: parent_run_id,\n ROOT_RUN_ID_TAG: root_run_id,\n },\n )\n\n run_config = run_config if run_config is not None else parent_run.run_config\n\n if strategy == ReexecutionStrategy.FROM_FAILURE:\n check.invariant(\n parent_run.status == DagsterRunStatus.FAILURE,\n "Cannot reexecute from failure a run that is not failed",\n )\n\n (\n step_keys_to_execute,\n known_state,\n ) = 
KnownExecutionState.build_resume_retry_reexecution(\n self,\n parent_run=parent_run,\n )\n tags[RESUME_RETRY_TAG] = "true"\n elif strategy == ReexecutionStrategy.ALL_STEPS:\n step_keys_to_execute = None\n known_state = None\n else:\n raise DagsterInvariantViolationError(f"Unknown reexecution strategy: {strategy}")\n\n external_execution_plan = code_location.get_external_execution_plan(\n external_job,\n run_config,\n step_keys_to_execute=step_keys_to_execute,\n known_state=known_state,\n instance=self,\n )\n\n return self.create_run(\n job_name=parent_run.job_name,\n run_id=None,\n run_config=run_config,\n resolved_op_selection=parent_run.resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.NOT_STARTED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=external_job.job_snapshot,\n execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,\n parent_job_snapshot=external_job.parent_job_snapshot,\n op_selection=parent_run.op_selection,\n asset_selection=parent_run.asset_selection,\n asset_check_selection=parent_run.asset_check_selection,\n external_job_origin=external_job.get_external_origin(),\n job_code_origin=external_job.get_python_origin(),\n )\n\n def register_managed_run(\n self,\n job_name: str,\n run_id: str,\n run_config: Optional[Mapping[str, object]],\n resolved_op_selection: Optional[AbstractSet[str]],\n step_keys_to_execute: Optional[Sequence[str]],\n tags: Mapping[str, str],\n root_run_id: Optional[str],\n parent_run_id: Optional[str],\n job_snapshot: Optional["JobSnapshot"],\n execution_plan_snapshot: Optional["ExecutionPlanSnapshot"],\n parent_job_snapshot: Optional["JobSnapshot"],\n op_selection: Optional[Sequence[str]] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n ) -> DagsterRun:\n # The usage of this method is limited to dagster-airflow, specifically in Dagster\n # Operators that are executed in Airflow. 
Because a common workflow in Airflow is to\n # retry dags from arbitrary tasks, we need any node to be capable of creating a\n # DagsterRun.\n #\n # The try-except DagsterRunAlreadyExists block handles the race when multiple "root" tasks\n # simultaneously execute self._run_storage.add_run(dagster_run). When this happens, only\n # one task succeeds in creating the run, while the others get DagsterRunAlreadyExists\n # error; at this point, the failed tasks try again to fetch the existing run.\n # https://github.com/dagster-io/dagster/issues/2412\n\n dagster_run = self._construct_run_with_snapshots(\n job_name=job_name,\n run_id=run_id,\n run_config=run_config,\n op_selection=op_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=DagsterRunStatus.MANAGED,\n tags=tags,\n root_run_id=root_run_id,\n parent_run_id=parent_run_id,\n job_snapshot=job_snapshot,\n execution_plan_snapshot=execution_plan_snapshot,\n parent_job_snapshot=parent_job_snapshot,\n job_code_origin=job_code_origin,\n )\n\n def get_run() -> DagsterRun:\n candidate_run = self.get_run_by_id(dagster_run.run_id)\n\n field_diff = _check_run_equality(dagster_run, candidate_run) # type: ignore # (possible none)\n\n if field_diff:\n raise DagsterRunConflict(\n "Found conflicting existing run with same id {run_id}. 
Runs differ in:"\n "\\n{field_diff}".format(\n run_id=dagster_run.run_id,\n field_diff=_format_field_diff(field_diff),\n ),\n )\n return candidate_run # type: ignore # (possible none)\n\n if self.has_run(dagster_run.run_id):\n return get_run()\n\n try:\n return self._run_storage.add_run(dagster_run)\n except DagsterRunAlreadyExists:\n return get_run()\n\n @traced\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n return self._run_storage.add_run(dagster_run)\n\n @traced\n def add_snapshot(\n self,\n snapshot: Union["JobSnapshot", "ExecutionPlanSnapshot"],\n snapshot_id: Optional[str] = None,\n ) -> None:\n return self._run_storage.add_snapshot(snapshot, snapshot_id)\n\n @traced\n def handle_run_event(self, run_id: str, event: "DagsterEvent") -> None:\n return self._run_storage.handle_run_event(run_id, event)\n\n @traced\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n return self._run_storage.add_run_tags(run_id, new_tags)\n\n @traced\n def has_run(self, run_id: str) -> bool:\n return self._run_storage.has_run(run_id)\n\n @traced\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n return self._run_storage.get_runs(filters, cursor, limit, bucket_by)\n\n @traced\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n return self._run_storage.get_run_ids(filters, cursor=cursor, limit=limit)\n\n @traced\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n return self._run_storage.get_runs_count(filters)\n\n
[docs] @public\n @traced\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n return self._run_storage.get_run_records(\n filters, limit, order_by, ascending, cursor, bucket_by\n )
\n\n @traced\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n return self._run_storage.get_run_partition_data(runs_filter)\n\n def wipe(self) -> None:\n self._run_storage.wipe()\n self._event_storage.wipe()\n\n
[docs] @public\n @traced\n def delete_run(self, run_id: str) -> None:\n """Delete a run and all events generated by that from storage.\n\n Args:\n run_id (str): The id of the run to delete.\n """\n self._run_storage.delete_run(run_id)\n self._event_storage.delete_events(run_id)
\n\n # event storage\n @traced\n def logs_after(\n self,\n run_id: str,\n cursor: Optional[int] = None,\n of_type: Optional["DagsterEventType"] = None,\n limit: Optional[int] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(\n run_id,\n cursor=cursor,\n of_type=of_type,\n limit=limit,\n )\n\n @traced\n def all_logs(\n self,\n run_id: str,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n ) -> Sequence["EventLogEntry"]:\n return self._event_storage.get_logs_for_run(run_id, of_type=of_type)\n\n @traced\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union["DagsterEventType", Set["DagsterEventType"]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> "EventLogConnection":\n return self._event_storage.get_records_for_run(run_id, cursor, of_type, limit, ascending)\n\n def watch_event_logs(self, run_id: str, cursor: Optional[str], cb: "EventHandlerFn") -> None:\n return self._event_storage.watch(run_id, cursor, cb)\n\n def end_watch_event_logs(self, run_id: str, cb: "EventHandlerFn") -> None:\n return self._event_storage.end_watch(run_id, cb)\n\n # asset storage\n\n @traced\n def can_cache_asset_status_data(self) -> bool:\n return self._event_storage.can_cache_asset_status_data()\n\n @traced\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n self._event_storage.update_asset_cached_status_data(asset_key, cache_values)\n\n @traced\n def wipe_asset_cached_status(self, asset_keys: Sequence[AssetKey]) -> None:\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset_cached_status(asset_key)\n\n @traced\n def all_asset_keys(self) -> Sequence[AssetKey]:\n return self._event_storage.all_asset_keys()\n\n
[docs] @public\n @traced\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n """Return a filtered subset of asset keys managed by this instance.\n\n Args:\n prefix (Optional[Sequence[str]]): Return only assets having this key prefix.\n limit (Optional[int]): Maximum number of keys to return.\n cursor (Optional[str]): Cursor to use for pagination.\n\n Returns:\n Sequence[AssetKey]: List of asset keys.\n """\n return self._event_storage.get_asset_keys(prefix=prefix, limit=limit, cursor=cursor)
\n\n
[docs] @public\n @traced\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n """Return true if this instance manages the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to check.\n """\n return self._event_storage.has_asset_key(asset_key)
\n\n @traced\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n return self._event_storage.get_latest_materialization_events(asset_keys)\n\n
[docs] @public\n @traced\n def get_latest_materialization_event(self, asset_key: AssetKey) -> Optional["EventLogEntry"]:\n """Fetch the latest materialization event for the given asset key.\n\n Args:\n asset_key (AssetKey): Asset key to return materialization for.\n\n Returns:\n Optional[AssetMaterialization]: The latest materialization event for the given asset\n key, or `None` if the asset has not been materialized.\n """\n return self._event_storage.get_latest_materialization_events([asset_key]).get(asset_key)
\n\n
[docs] @public\n @traced\n def get_event_records(\n self,\n event_records_filter: "EventRecordsFilter",\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence["EventLogRecord"]:\n """Return a list of event records stored in the event log storage.\n\n Args:\n event_records_filter (Optional[EventRecordsFilter]): the filter by which to filter event\n records.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[EventLogRecord]: List of event log records stored in the event log storage.\n """\n return self._event_storage.get_event_records(event_records_filter, limit, ascending)
\n\n
[docs] @public\n @traced\n def get_status_by_partition(\n self,\n asset_key: AssetKey,\n partition_keys: Sequence[str],\n partitions_def: "PartitionsDefinition",\n ) -> Optional[Mapping[str, "AssetPartitionStatus"]]:\n """Get the current status of provided partition_keys for the provided asset.\n\n Args:\n asset_key (AssetKey): The asset to get per-partition status for.\n partition_keys (Sequence[str]): The partitions to get status for.\n partitions_def (PartitionsDefinition): The PartitionsDefinition of the asset to get\n per-partition status for.\n\n Returns:\n Optional[Mapping[str, AssetPartitionStatus]]: status for each partition key\n\n """\n from dagster._core.storage.partition_status_cache import (\n AssetPartitionStatus,\n AssetStatusCacheValue,\n get_and_update_asset_status_cache_value,\n )\n\n cached_value = get_and_update_asset_status_cache_value(self, asset_key, partitions_def)\n\n if isinstance(cached_value, AssetStatusCacheValue):\n materialized_partitions = cached_value.deserialize_materialized_partition_subsets(\n partitions_def\n )\n failed_partitions = cached_value.deserialize_failed_partition_subsets(partitions_def)\n in_progress_partitions = cached_value.deserialize_in_progress_partition_subsets(\n partitions_def\n )\n\n status_by_partition = {}\n\n for partition_key in partition_keys:\n if partition_key in in_progress_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.IN_PROGRESS\n elif partition_key in failed_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.FAILED\n elif partition_key in materialized_partitions:\n status_by_partition[partition_key] = AssetPartitionStatus.MATERIALIZED\n else:\n status_by_partition[partition_key] = None\n\n return status_by_partition
\n\n
[docs] @public\n @traced\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence["AssetRecord"]:\n """Return an `AssetRecord` for each of the given asset keys.\n\n Args:\n asset_keys (Optional[Sequence[AssetKey]]): List of asset keys to retrieve records for.\n\n Returns:\n Sequence[AssetRecord]: List of asset records.\n """\n return self._event_storage.get_asset_records(asset_keys)
\n\n @traced\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, searches for the event with the provided event_id.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n return self._event_storage.get_event_tags_for_asset(asset_key, filter_tags, filter_event_id)\n\n
[docs] @public\n @traced\n def wipe_assets(self, asset_keys: Sequence[AssetKey]) -> None:\n """Wipes asset event history from the event log for the given asset keys.\n\n Args:\n asset_keys (Sequence[AssetKey]): Asset keys to wipe.\n """\n check.list_param(asset_keys, "asset_keys", of_type=AssetKey)\n for asset_key in asset_keys:\n self._event_storage.wipe_asset(asset_key)
\n\n @traced\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n return self._event_storage.get_materialization_count_by_partition(asset_keys, after_cursor)\n\n @traced\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n return self._event_storage.get_materialized_partitions(\n asset_key, before_cursor=before_cursor, after_cursor=after_cursor\n )\n\n @traced\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: "DagsterEventType"\n ) -> Mapping[str, int]:\n """Fetch the latest materialzation storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n return self._event_storage.get_latest_storage_id_by_partition(asset_key, event_type)\n\n
[docs] @public\n @traced\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the set of partition keys for the specified :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n return self._event_storage.get_dynamic_partitions(partitions_def_name)
\n\n
[docs] @public\n @traced\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add partitions to the specified :py:class:`DynamicPartitionsDefinition` idempotently.\n Does not add any partitions that already exist.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_keys (Sequence[str]): Partition keys to add.\n """\n from dagster._core.definitions.partition import (\n raise_error_on_invalid_partition_key_substring,\n )\n\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_keys, "partition_keys", of_type=str)\n if isinstance(partition_keys, str):\n # Guard against a single string being passed in `partition_keys`\n raise DagsterInvalidInvocationError("partition_keys must be a sequence of strings")\n raise_error_on_invalid_partition_key_substring(partition_keys)\n return self._event_storage.add_dynamic_partitions(partitions_def_name, partition_keys)
\n\n
[docs] @public\n @traced\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified :py:class:`DynamicPartitionsDefinition`.\n If the partition does not exist, exits silently.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (Sequence[str]): Partition key to delete.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.sequence_param(partition_key, "partition_key", of_type=str)\n self._event_storage.delete_dynamic_partition(partitions_def_name, partition_key)
\n\n
[docs] @public\n @traced\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a partition key exists for the :py:class:`DynamicPartitionsDefinition`.\n\n Args:\n partitions_def_name (str): The name of the `DynamicPartitionsDefinition`.\n partition_key (Sequence[str]): Partition key to check.\n """\n check.str_param(partitions_def_name, "partitions_def_name")\n check.str_param(partition_key, "partition_key")\n return self._event_storage.has_dynamic_partition(partitions_def_name, partition_key)
\n\n # event subscriptions\n\n def _get_yaml_python_handlers(self) -> Sequence[logging.Handler]:\n if self._settings:\n logging_config = self.get_settings("python_logs").get("dagster_handler_config", {})\n\n if logging_config:\n experimental_warning("Handling yaml-defined logging configuration")\n\n # Handlers can only be retrieved from dictConfig configuration if they are attached\n # to a logger. We add a dummy logger to the configuration that allows us to access user\n # defined handlers.\n handler_names = logging_config.get("handlers", {}).keys()\n\n dagster_dummy_logger_name = "dagster_dummy_logger"\n\n processed_dict_conf = {\n "version": 1,\n "disable_existing_loggers": False,\n "loggers": {dagster_dummy_logger_name: {"handlers": handler_names}},\n }\n processed_dict_conf.update(logging_config)\n\n logging.config.dictConfig(processed_dict_conf)\n\n dummy_logger = logging.getLogger(dagster_dummy_logger_name)\n return dummy_logger.handlers\n return []\n\n def _get_event_log_handler(self) -> _EventListenerLogHandler:\n event_log_handler = _EventListenerLogHandler(self)\n event_log_handler.setLevel(10)\n return event_log_handler\n\n def get_handlers(self) -> Sequence[logging.Handler]:\n handlers: List[logging.Handler] = [self._get_event_log_handler()]\n handlers.extend(self._get_yaml_python_handlers())\n return handlers\n\n def store_event(self, event: "EventLogEntry") -> None:\n self._event_storage.store_event(event)\n\n def handle_new_event(self, event: "EventLogEntry") -> None:\n run_id = event.run_id\n\n self._event_storage.store_event(event)\n\n if event.is_dagster_event and event.get_dagster_event().is_job_event:\n self._run_storage.handle_run_event(run_id, event.get_dagster_event())\n\n for sub in self._subscribers[run_id]:\n sub(event)\n\n def add_event_listener(self, run_id: str, cb) -> None:\n self._subscribers[run_id].append(cb)\n\n def report_engine_event(\n self,\n message: str,\n dagster_run: Optional[DagsterRun] = None,\n engine_event_data: 
Optional["EngineEventData"] = None,\n cls: Optional[Type[object]] = None,\n step_key: Optional[str] = None,\n job_name: Optional[str] = None,\n run_id: Optional[str] = None,\n ) -> "DagsterEvent":\n """Report a EngineEvent that occurred outside of a job execution context."""\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n\n check.opt_class_param(cls, "cls")\n check.str_param(message, "message")\n check.opt_inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(run_id, "run_id")\n check.opt_str_param(job_name, "job_name")\n\n check.invariant(\n dagster_run or (job_name and run_id),\n "Must include either dagster_run or job_name and run_id",\n )\n\n run_id = run_id if run_id else dagster_run.run_id # type: ignore\n job_name = job_name if job_name else dagster_run.job_name # type: ignore\n\n engine_event_data = check.opt_inst_param(\n engine_event_data,\n "engine_event_data",\n EngineEventData,\n EngineEventData({}),\n )\n\n if cls:\n message = f"[{cls.__name__}] {message}"\n\n log_level = logging.INFO\n if engine_event_data and engine_event_data.error:\n log_level = logging.ERROR\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.ENGINE_EVENT.value,\n job_name=job_name,\n message=message,\n event_specific_data=engine_event_data,\n step_key=step_key,\n )\n self.report_dagster_event(dagster_event, run_id=run_id, log_level=log_level)\n return dagster_event\n\n def report_dagster_event(\n self,\n dagster_event: "DagsterEvent",\n run_id: str,\n log_level: Union[str, int] = logging.INFO,\n ) -> None:\n """Takes a DagsterEvent and stores it in persistent storage for the corresponding DagsterRun."""\n from dagster._core.events.log import EventLogEntry\n\n event_record = EventLogEntry(\n user_message="",\n level=log_level,\n job_name=dagster_event.job_name,\n run_id=run_id,\n error_info=None,\n timestamp=time.time(),\n step_key=dagster_event.step_key,\n dagster_event=dagster_event,\n )\n 
self.handle_new_event(event_record)\n\n def report_run_canceling(self, run: DagsterRun, message: Optional[str] = None):\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(run, "run", DagsterRun)\n message = check.opt_str_param(\n message,\n "message",\n "Sending run termination request.",\n )\n canceling_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELING.value,\n job_name=run.job_name,\n message=message,\n )\n self.report_dagster_event(canceling_event, run_id=run.run_id)\n\n def report_run_canceled(\n self,\n dagster_run: DagsterRun,\n message: Optional[str] = None,\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "mesage",\n "This run has been marked as canceled from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_CANCELED.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n def report_run_failed(\n self, dagster_run: DagsterRun, message: Optional[str] = None\n ) -> "DagsterEvent":\n from dagster._core.events import DagsterEvent, DagsterEventType\n\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n message = check.opt_str_param(\n message,\n "message",\n "This run has been marked as failed from outside the execution context.",\n )\n\n dagster_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_FAILURE.value,\n job_name=dagster_run.job_name,\n message=message,\n )\n self.report_dagster_event(dagster_event, run_id=dagster_run.run_id, log_level=logging.ERROR)\n return dagster_event\n\n # directories\n\n def file_manager_directory(self, run_id: str) -> str:\n return self._local_artifact_storage.file_manager_dir(run_id)\n\n def 
storage_directory(self) -> str:\n return self._local_artifact_storage.storage_dir\n\n def schedules_directory(self) -> str:\n return self._local_artifact_storage.schedules_dir\n\n # Runs coordinator\n\n def submit_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Submit a pipeline run to the coordinator.\n\n This method delegates to the ``RunCoordinator``, configured on the instance, and will\n call its implementation of ``RunCoordinator.submit_run()`` to send the run to the\n coordinator for execution. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.NOT_STARTED`` state. They also must have a non-null\n ExternalPipelineOrigin.\n\n Args:\n run_id (str): The id of the run.\n """\n from dagster._core.host_representation import ExternalJobOrigin\n from dagster._core.run_coordinator import SubmitRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to submit_run"\n )\n\n check.inst(\n run.external_job_origin,\n ExternalJobOrigin,\n "External pipeline origin must be set for submitted runs",\n )\n check.inst(\n run.job_code_origin,\n JobPythonOrigin,\n "Python origin must be set for submitted runs",\n )\n\n try:\n submitted_run = self.run_coordinator.submit_run(\n SubmitRunContext(run, workspace=workspace)\n )\n except:\n from dagster._core.events import EngineEventData\n\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return submitted_run\n\n # Run launcher\n\n def launch_run(self, run_id: str, workspace: "IWorkspace") -> DagsterRun:\n """Launch a pipeline run.\n\n This method is typically called using `instance.submit_run` rather than being invoked\n directly. 
This method delegates to the ``RunLauncher``, if any, configured on the instance,\n and will call its implementation of ``RunLauncher.launch_run()`` to begin the execution of\n the specified run. Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and should be in the\n ``PipelineRunStatus.NOT_STARTED`` state.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import DagsterEvent, DagsterEventType, EngineEventData\n from dagster._core.launcher import LaunchRunContext\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to launch_run"\n )\n\n launch_started_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_STARTING.value,\n job_name=run.job_name,\n )\n self.report_dagster_event(launch_started_event, run_id=run.run_id)\n\n run = self.get_run_by_id(run_id)\n if run is None:\n check.failed(f"Failed to reload run {run_id}")\n\n try:\n self.run_launcher.launch_run(LaunchRunContext(dagster_run=run, workspace=workspace))\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def resume_run(self, run_id: str, workspace: "IWorkspace", attempt_number: int) -> DagsterRun:\n """Resume a pipeline run.\n\n This method should be called on runs which have already been launched, but whose run workers\n have died.\n\n Args:\n run_id (str): The id of the run the launch.\n """\n from dagster._core.events import EngineEventData\n from dagster._core.launcher import ResumeRunContext\n from dagster._daemon.monitoring import RESUME_RUN_LOG_MESSAGE\n\n run = self.get_run_by_id(run_id)\n if run is None:\n raise DagsterInvariantViolationError(\n f"Could not load run {run_id} that was passed to resume_run"\n )\n 
if run.status not in IN_PROGRESS_RUN_STATUSES:\n raise DagsterInvariantViolationError(\n f"Run {run_id} is not in a state that can be resumed"\n )\n\n self.report_engine_event(\n RESUME_RUN_LOG_MESSAGE,\n run,\n )\n\n try:\n self.run_launcher.resume_run(\n ResumeRunContext(\n dagster_run=run,\n workspace=workspace,\n resume_attempt_number=attempt_number,\n )\n )\n except:\n error = serializable_error_info_from_exc_info(sys.exc_info())\n self.report_engine_event(\n error.message,\n run,\n EngineEventData.engine_error(error),\n )\n self.report_run_failed(run)\n raise\n\n return run\n\n def count_resume_run_attempts(self, run_id: str) -> int:\n from dagster._daemon.monitoring import count_resume_run_attempts\n\n return count_resume_run_attempts(self, run_id)\n\n def run_will_resume(self, run_id: str) -> bool:\n if not self.run_monitoring_enabled:\n return False\n return self.count_resume_run_attempts(run_id) < self.run_monitoring_max_resume_run_attempts\n\n # Scheduler\n\n def start_schedule(self, external_schedule: "ExternalSchedule") -> "InstigatorState":\n return self._scheduler.start_schedule(self, external_schedule) # type: ignore\n\n def stop_schedule(\n self,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional["ExternalSchedule"],\n ) -> "InstigatorState":\n return self._scheduler.stop_schedule( # type: ignore\n self, schedule_origin_id, schedule_selector_id, external_schedule\n )\n\n def scheduler_debug_info(self) -> "SchedulerDebugInfo":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler import SchedulerDebugInfo\n\n errors = []\n\n schedules: List[str] = []\n for schedule_state in self.all_instigator_state(instigator_type=InstigatorType.SCHEDULE):\n schedule_info: Mapping[str, Mapping[str, object]] = {\n schedule_state.instigator_name: {\n "status": schedule_state.status.value,\n "cron_schedule": schedule_state.instigator_data.cron_schedule,\n "schedule_origin_id": 
schedule_state.instigator_origin_id,\n "repository_origin_id": schedule_state.repository_origin_id,\n }\n }\n\n schedules.append(yaml.safe_dump(schedule_info, default_flow_style=False))\n\n return SchedulerDebugInfo(\n scheduler_config_info=self._info_str_for_component("Scheduler", self.scheduler),\n scheduler_info=self.scheduler.debug_info(), # type: ignore\n schedule_storage=schedules,\n errors=errors,\n )\n\n # Schedule / Sensor Storage\n\n def start_sensor(self, external_sensor: "ExternalSensor") -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(\n external_sensor.get_external_origin_id(), external_sensor.selector_id\n )\n\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n if not stored_state:\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.RUNNING,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.RUNNING))\n\n def stop_sensor(\n self,\n instigator_origin_id: str,\n selector_id: str,\n external_sensor: Optional["ExternalSensor"],\n ) -> "InstigatorState":\n from dagster._core.definitions.run_request import InstigatorType\n from dagster._core.scheduler.instigation import (\n InstigatorState,\n InstigatorStatus,\n SensorInstigatorData,\n )\n\n stored_state = self.get_instigator_state(instigator_origin_id, selector_id)\n computed_state: InstigatorState\n if external_sensor:\n computed_state = external_sensor.get_current_instigator_state(stored_state)\n else:\n computed_state = check.not_none(stored_state)\n\n if not computed_state.is_running:\n return computed_state\n\n 
if not stored_state:\n assert external_sensor\n return self.add_instigator_state(\n InstigatorState(\n external_sensor.get_external_origin(),\n InstigatorType.SENSOR,\n InstigatorStatus.STOPPED,\n SensorInstigatorData(min_interval=external_sensor.min_interval_seconds),\n )\n )\n else:\n return self.update_instigator_state(stored_state.with_status(InstigatorStatus.STOPPED))\n\n @traced\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional["InstigatorType"] = None,\n instigator_statuses: Optional[Set["InstigatorStatus"]] = None,\n ):\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.all_instigator_state(\n repository_origin_id, repository_selector_id, instigator_type, instigator_statuses\n )\n\n @traced\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional["InstigatorState"]:\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.get_instigator_state(origin_id, selector_id)\n\n def add_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.add_instigator_state(state)\n\n def update_instigator_state(self, state: "InstigatorState") -> "InstigatorState":\n if not self._schedule_storage:\n check.failed("Schedule storage not available")\n return self._schedule_storage.update_instigator_state(state)\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n return self._schedule_storage.delete_instigator_state(origin_id, selector_id) # type: ignore # (possible none)\n\n @property\n def supports_batch_tick_queries(self) -> bool:\n return self._schedule_storage and self._schedule_storage.supports_batch_queries # type: ignore # (possible none)\n\n @traced\n def 
get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Mapping[str, Sequence["InstigatorTick"]]:\n if not self._schedule_storage:\n return {}\n return self._schedule_storage.get_batch_ticks(selector_ids, limit, statuses)\n\n @traced\n def get_tick(\n self, origin_id: str, selector_id: str, timestamp: float\n ) -> Optional["InstigatorTick"]:\n matches = self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=timestamp + 1, after=timestamp - 1, limit=1\n )\n return matches[0] if len(matches) else None\n\n @traced\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> Sequence["InstigatorTick"]:\n return self._schedule_storage.get_ticks( # type: ignore # (possible none)\n origin_id, selector_id, before=before, after=after, limit=limit, statuses=statuses\n )\n\n def create_tick(self, tick_data: "TickData") -> "InstigatorTick":\n return check.not_none(self._schedule_storage).create_tick(tick_data)\n\n def update_tick(self, tick: "InstigatorTick"):\n return check.not_none(self._schedule_storage).update_tick(tick)\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence["TickStatus"]] = None,\n ) -> None:\n self._schedule_storage.purge_ticks(origin_id, selector_id, before, tick_statuses) # type: ignore # (possible none)\n\n def wipe_all_schedules(self) -> None:\n if self._scheduler:\n self._scheduler.wipe(self) # type: ignore # (possible none)\n\n self._schedule_storage.wipe() # type: ignore # (possible none)\n\n def logs_path_for_schedule(self, schedule_origin_id: str) -> str:\n return self._scheduler.get_logs_path(self, schedule_origin_id) # type: ignore # (possible none)\n\n def __enter__(self) -> Self:\n return 
self\n\n def __exit__(\n self,\n exception_type: Optional[Type[BaseException]],\n exception_value: Optional[BaseException],\n traceback: Optional[TracebackType],\n ) -> None:\n self.dispose()\n\n # dagster daemon\n def add_daemon_heartbeat(self, daemon_heartbeat: "DaemonHeartbeat") -> None:\n """Called on a regular interval by the daemon."""\n self._run_storage.add_daemon_heartbeat(daemon_heartbeat)\n\n def get_daemon_heartbeats(self) -> Mapping[str, "DaemonHeartbeat"]:\n """Latest heartbeats of all daemon types."""\n return self._run_storage.get_daemon_heartbeats()\n\n def wipe_daemon_heartbeats(self) -> None:\n self._run_storage.wipe_daemon_heartbeats()\n\n def get_required_daemon_types(self) -> Sequence[str]:\n from dagster._core.run_coordinator import QueuedRunCoordinator\n from dagster._core.scheduler import DagsterDaemonScheduler\n from dagster._daemon.asset_daemon import AssetDaemon\n from dagster._daemon.auto_run_reexecution.event_log_consumer import EventLogConsumerDaemon\n from dagster._daemon.daemon import (\n BackfillDaemon,\n MonitoringDaemon,\n SchedulerDaemon,\n SensorDaemon,\n )\n from dagster._daemon.run_coordinator.queued_run_coordinator_daemon import (\n QueuedRunCoordinatorDaemon,\n )\n\n if self.is_ephemeral:\n return []\n\n daemons = [SensorDaemon.daemon_type(), BackfillDaemon.daemon_type()]\n if isinstance(self.scheduler, DagsterDaemonScheduler):\n daemons.append(SchedulerDaemon.daemon_type())\n if isinstance(self.run_coordinator, QueuedRunCoordinator):\n daemons.append(QueuedRunCoordinatorDaemon.daemon_type())\n if self.run_monitoring_enabled:\n daemons.append(MonitoringDaemon.daemon_type())\n if self.run_retries_enabled:\n daemons.append(EventLogConsumerDaemon.daemon_type())\n if self.auto_materialize_enabled:\n daemons.append(AssetDaemon.daemon_type())\n return daemons\n\n def get_daemon_statuses(\n self, daemon_types: Optional[Sequence[str]] = None\n ) -> Mapping[str, "DaemonStatus"]:\n """Get the current status of the daemons. 
If daemon_types aren't provided, defaults to all\n required types. Returns a dict of daemon type to status.\n """\n from dagster._daemon.controller import get_daemon_statuses\n\n check.opt_sequence_param(daemon_types, "daemon_types", of_type=str)\n return get_daemon_statuses(\n self, daemon_types=daemon_types or self.get_required_daemon_types(), ignore_errors=True\n )\n\n @property\n def daemon_skip_heartbeats_without_errors(self) -> bool:\n # If enabled, daemon threads won't write heartbeats unless they encounter an error. This is\n # enabled in cloud, where we don't need to use heartbeats to check if daemons are running, but\n # do need to surface errors to users. This is an optimization to reduce DB writes.\n return False\n\n # backfill\n def get_backfills(\n self,\n status: Optional["BulkActionStatus"] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence["PartitionBackfill"]:\n return self._run_storage.get_backfills(status=status, cursor=cursor, limit=limit)\n\n def get_backfill(self, backfill_id: str) -> Optional["PartitionBackfill"]:\n return self._run_storage.get_backfill(backfill_id)\n\n def add_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.add_backfill(partition_backfill)\n\n def update_backfill(self, partition_backfill: "PartitionBackfill") -> None:\n self._run_storage.update_backfill(partition_backfill)\n\n @property\n def should_start_background_run_thread(self) -> bool:\n """Gate on an experimental feature to start a thread that monitors for if the run should be canceled."""\n return False\n\n def get_tick_retention_settings(\n self, instigator_type: "InstigatorType"\n ) -> Mapping["TickStatus", int]:\n from dagster._core.definitions.run_request import InstigatorType\n\n retention_settings = self.get_settings("retention")\n\n if instigator_type == InstigatorType.SCHEDULE:\n tick_settings = retention_settings.get("schedule")\n elif instigator_type == InstigatorType.SENSOR:\n 
tick_settings = retention_settings.get("sensor")\n elif instigator_type == InstigatorType.AUTO_MATERIALIZE:\n tick_settings = retention_settings.get("auto_materialize")\n else:\n raise Exception(f"Unexpected instigator type {instigator_type}")\n\n default_tick_settings = get_default_tick_retention_settings(instigator_type)\n return get_tick_retention_settings(tick_settings, default_tick_settings)\n\n def inject_env_vars(self, location_name: Optional[str]) -> None:\n if not self._secrets_loader:\n return\n\n new_env = self._secrets_loader.get_secrets_for_environment(location_name)\n for k, v in new_env.items():\n os.environ[k] = v\n\n def get_latest_data_version_record(\n self,\n key: AssetKey,\n is_source: Optional[bool] = None,\n partition_key: Optional[str] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Optional["EventLogRecord"]:\n from dagster._core.event_api import EventRecordsFilter\n from dagster._core.events import DagsterEventType\n\n # When we cant don't know whether the requested key corresponds to a source or regular\n # asset, we need to retrieve both the latest observation and materialization for all assets.\n # If there is a materialization, it's a regular asset and we can ignore the observation.\n\n observation: Optional[EventLogRecord] = None\n if is_source or is_source is None:\n observations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_OBSERVATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n ),\n limit=1,\n )\n observation = next(iter(observations), None)\n\n materialization: Optional[EventLogRecord] = None\n if not is_source:\n materializations = self.get_event_records(\n EventRecordsFilter(\n event_type=DagsterEventType.ASSET_MATERIALIZATION,\n asset_key=key,\n asset_partitions=[partition_key] if partition_key else None,\n before_cursor=before_cursor,\n 
after_cursor=after_cursor,\n ),\n limit=1,\n )\n materialization = next(iter(materializations), None)\n\n return materialization or observation\n\n
[docs] @public\n def get_latest_materialization_code_versions(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[str]]:\n """Returns the code version used for the latest materialization of each of the provided\n assets.\n\n Args:\n asset_keys (Iterable[AssetKey]): The asset keys to find latest materialization code\n versions for.\n\n Returns:\n Mapping[AssetKey, Optional[str]]: A dictionary with a key for each of the provided asset\n keys. The values will be None if the asset has no materializations. If an asset does\n not have a code version explicitly assigned to its definitions, but was\n materialized, Dagster assigns the run ID as its code version.\n """\n result: Dict[AssetKey, Optional[str]] = {}\n latest_materialization_events = self.get_latest_materialization_events(asset_keys)\n for asset_key in asset_keys:\n event_log_entry = latest_materialization_events.get(asset_key)\n if event_log_entry is None:\n result[asset_key] = None\n else:\n data_provenance = extract_data_provenance_from_entry(event_log_entry)\n result[asset_key] = data_provenance.code_version if data_provenance else None\n\n return result
\n\n @experimental\n def report_runless_asset_event(\n self,\n asset_event: Union["AssetMaterialization", "AssetObservation", "AssetCheckEvaluation"],\n ):\n """Record an event log entry related to assets that does not belong to a Dagster run."""\n from dagster._core.events import (\n AssetMaterialization,\n AssetObservationData,\n DagsterEvent,\n DagsterEventType,\n StepMaterializationData,\n )\n\n if isinstance(asset_event, AssetMaterialization):\n event_type_value = DagsterEventType.ASSET_MATERIALIZATION.value\n data_payload = StepMaterializationData(asset_event)\n elif isinstance(asset_event, AssetCheckEvaluation):\n event_type_value = DagsterEventType.ASSET_CHECK_EVALUATION.value\n data_payload = asset_event\n elif isinstance(asset_event, AssetObservation):\n event_type_value = DagsterEventType.ASSET_OBSERVATION.value\n data_payload = AssetObservationData(asset_event)\n else:\n raise DagsterInvariantViolationError(\n f"Received unexpected asset event type {asset_event}, expected"\n " AssetMaterialization, AssetObservation or AssetCheckEvaluation"\n )\n\n return self.report_dagster_event(\n run_id=RUNLESS_RUN_ID,\n dagster_event=DagsterEvent(\n event_type_value=event_type_value,\n event_specific_data=data_payload,\n job_name=RUNLESS_JOB_NAME,\n ),\n )
\n
", "current_page_name": "_modules/dagster/_core/instance", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "ref": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance.ref

\nimport os\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Type\n\nimport yaml\n\nimport dagster._check as check\nfrom dagster._serdes import ConfigurableClassData, class_from_code_pointer, whitelist_for_serdes\n\nfrom .config import DAGSTER_CONFIG_YAML_FILENAME, dagster_instance_config\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance, DagsterInstanceOverrides\n    from dagster._core.launcher.base import RunLauncher\n    from dagster._core.run_coordinator.base import RunCoordinator\n    from dagster._core.scheduler.scheduler import Scheduler\n    from dagster._core.secrets.loader import SecretsLoader\n    from dagster._core.storage.base_storage import DagsterStorage\n    from dagster._core.storage.compute_log_manager import ComputeLogManager\n    from dagster._core.storage.event_log.base import EventLogStorage\n    from dagster._core.storage.root import LocalArtifactStorage\n    from dagster._core.storage.runs.base import RunStorage\n    from dagster._core.storage.schedules.base import ScheduleStorage\n\n\ndef compute_logs_directory(base: str) -> str:\n    return os.path.join(base, "storage")\n\n\ndef _runs_directory(base: str) -> str:\n    return os.path.join(base, "history", "")\n\n\ndef _event_logs_directory(base: str) -> str:\n    return os.path.join(base, "history", "runs", "")\n\n\ndef _schedule_directory(base: str) -> str:\n    return os.path.join(base, "schedules")\n\n\ndef configurable_class_data(config_field: Mapping[str, Any]) -> ConfigurableClassData:\n    return ConfigurableClassData(\n        check.str_elem(config_field, "module"),\n        check.str_elem(config_field, "class"),\n        yaml.dump(check.opt_dict_elem(config_field, "config"), default_flow_style=False),\n    )\n\n\ndef configurable_class_data_or_default(\n    config_value: Mapping[str, Any], field_name: str, default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    return (\n        
configurable_class_data(config_value[field_name])\n        if config_value.get(field_name)\n        else default\n    )\n\n\ndef configurable_secrets_loader_data(\n    config_field: Mapping[str, Any], default: Optional[ConfigurableClassData]\n) -> Optional[ConfigurableClassData]:\n    if not config_field:\n        return default\n    elif "custom" in config_field:\n        return configurable_class_data(config_field["custom"])\n    else:\n        return None\n\n\ndef configurable_storage_data(\n    config_field: Mapping[str, Any], defaults: Mapping[str, Optional[ConfigurableClassData]]\n) -> Sequence[Optional[ConfigurableClassData]]:\n    storage_data: ConfigurableClassData\n    run_storage_data: Optional[ConfigurableClassData]\n    event_storage_data: Optional[ConfigurableClassData]\n    schedule_storage_data: Optional[ConfigurableClassData]\n\n    if not config_field:\n        storage_data = check.not_none(defaults.get("storage"))\n        run_storage_data = check.not_none(defaults.get("run_storage"))\n        event_storage_data = check.not_none(defaults.get("event_log_storage"))\n        schedule_storage_data = check.not_none(defaults.get("schedule_storage"))\n    elif "postgres" in config_field:\n        config_yaml = yaml.dump(config_field["postgres"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="DagsterPostgresStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_postgres",\n            class_name="PostgresEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            
module_name="dagster_postgres",\n            class_name="PostgresScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "mysql" in config_field:\n        config_yaml = yaml.dump(config_field["mysql"], default_flow_style=False)\n        storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="DagsterMySQLStorage",\n            config_yaml=config_yaml,\n        )\n        # for backwards compatibility\n        run_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLRunStorage",\n            config_yaml=config_yaml,\n        )\n        event_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLEventLogStorage",\n            config_yaml=config_yaml,\n        )\n        schedule_storage_data = ConfigurableClassData(\n            module_name="dagster_mysql",\n            class_name="MySQLScheduleStorage",\n            config_yaml=config_yaml,\n        )\n\n    elif "sqlite" in config_field:\n        base_dir = config_field["sqlite"]["base_dir"]\n        storage_data = ConfigurableClassData(\n            "dagster._core.storage.sqlite_storage",\n            "DagsterSqliteStorage",\n            yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n        )\n\n        # Back-compat fo the legacy storage field only works if the base_dir is a string\n        # (env var doesn't work since each storage has a different value for the base_dir field)\n        if isinstance(base_dir, str):\n            run_storage_data = ConfigurableClassData(\n                "dagster._core.storage.runs",\n                "SqliteRunStorage",\n                yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            event_storage_data = ConfigurableClassData(\n                "dagster._core.storage.event_log",\n                "SqliteEventLogStorage",\n      
          yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n            )\n\n            schedule_storage_data = ConfigurableClassData(\n                "dagster._core.storage.schedules",\n                "SqliteScheduleStorage",\n                yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n            )\n        else:\n            run_storage_data = None\n            event_storage_data = None\n            schedule_storage_data = None\n    else:\n        storage_data = configurable_class_data(config_field["custom"])\n        storage_config_yaml = yaml.dump(\n            {\n                "module_name": storage_data.module_name,\n                "class_name": storage_data.class_name,\n                "config_yaml": storage_data.config_yaml,\n            },\n            default_flow_style=False,\n        )\n        run_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyRunStorage", storage_config_yaml\n        )\n        event_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyEventLogStorage", storage_config_yaml\n        )\n        schedule_storage_data = ConfigurableClassData(\n            "dagster._core.storage.legacy_storage", "LegacyScheduleStorage", storage_config_yaml\n        )\n\n    return [storage_data, run_storage_data, event_storage_data, schedule_storage_data]\n\n\n
[docs]@whitelist_for_serdes\nclass InstanceRef(\n NamedTuple(\n "_InstanceRef",\n [\n ("local_artifact_storage_data", ConfigurableClassData),\n ("compute_logs_data", ConfigurableClassData),\n ("scheduler_data", Optional[ConfigurableClassData]),\n ("run_coordinator_data", Optional[ConfigurableClassData]),\n ("run_launcher_data", Optional[ConfigurableClassData]),\n ("settings", Mapping[str, object]),\n # Required for backwards compatibility, but going forward will be unused by new versions\n # of DagsterInstance, which instead will instead grab the constituent storages from the\n # unified `storage_data`, if it is populated.\n ("run_storage_data", Optional[ConfigurableClassData]),\n ("event_storage_data", Optional[ConfigurableClassData]),\n ("schedule_storage_data", Optional[ConfigurableClassData]),\n ("custom_instance_class_data", Optional[ConfigurableClassData]),\n # unified storage field\n ("storage_data", Optional[ConfigurableClassData]),\n ("secrets_loader_data", Optional[ConfigurableClassData]),\n ],\n )\n):\n """Serializable representation of a :py:class:`DagsterInstance`.\n\n Users should not instantiate this class directly.\n """\n\n def __new__(\n cls,\n local_artifact_storage_data: ConfigurableClassData,\n compute_logs_data: ConfigurableClassData,\n scheduler_data: Optional[ConfigurableClassData],\n run_coordinator_data: Optional[ConfigurableClassData],\n run_launcher_data: Optional[ConfigurableClassData],\n settings: Mapping[str, object],\n run_storage_data: Optional[ConfigurableClassData],\n event_storage_data: Optional[ConfigurableClassData],\n schedule_storage_data: Optional[ConfigurableClassData],\n custom_instance_class_data: Optional[ConfigurableClassData] = None,\n storage_data: Optional[ConfigurableClassData] = None,\n secrets_loader_data: Optional[ConfigurableClassData] = None,\n ):\n return super(cls, InstanceRef).__new__(\n cls,\n local_artifact_storage_data=check.inst_param(\n local_artifact_storage_data, "local_artifact_storage_data", 
ConfigurableClassData\n ),\n compute_logs_data=check.inst_param(\n compute_logs_data, "compute_logs_data", ConfigurableClassData\n ),\n scheduler_data=check.opt_inst_param(\n scheduler_data, "scheduler_data", ConfigurableClassData\n ),\n run_coordinator_data=check.opt_inst_param(\n run_coordinator_data, "run_coordinator_data", ConfigurableClassData\n ),\n run_launcher_data=check.opt_inst_param(\n run_launcher_data, "run_launcher_data", ConfigurableClassData\n ),\n settings=check.opt_mapping_param(settings, "settings", key_type=str),\n run_storage_data=check.opt_inst_param(\n run_storage_data, "run_storage_data", ConfigurableClassData\n ),\n event_storage_data=check.opt_inst_param(\n event_storage_data, "event_storage_data", ConfigurableClassData\n ),\n schedule_storage_data=check.opt_inst_param(\n schedule_storage_data, "schedule_storage_data", ConfigurableClassData\n ),\n custom_instance_class_data=check.opt_inst_param(\n custom_instance_class_data,\n "instance_class",\n ConfigurableClassData,\n ),\n storage_data=check.opt_inst_param(storage_data, "storage_data", ConfigurableClassData),\n secrets_loader_data=check.opt_inst_param(\n secrets_loader_data, "secrets_loader_data", ConfigurableClassData\n ),\n )\n\n @staticmethod\n def config_defaults(base_dir: str) -> Mapping[str, Optional[ConfigurableClassData]]:\n default_run_storage_data = ConfigurableClassData(\n "dagster._core.storage.runs",\n "SqliteRunStorage",\n yaml.dump({"base_dir": _runs_directory(base_dir)}, default_flow_style=False),\n )\n default_event_log_storage_data = ConfigurableClassData(\n "dagster._core.storage.event_log",\n "SqliteEventLogStorage",\n yaml.dump({"base_dir": _event_logs_directory(base_dir)}, default_flow_style=False),\n )\n default_schedule_storage_data = ConfigurableClassData(\n "dagster._core.storage.schedules",\n "SqliteScheduleStorage",\n yaml.dump({"base_dir": _schedule_directory(base_dir)}, default_flow_style=False),\n )\n\n return {\n "local_artifact_storage": 
ConfigurableClassData(\n "dagster._core.storage.root",\n "LocalArtifactStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "storage": ConfigurableClassData(\n "dagster._core.storage.sqlite_storage",\n "DagsterSqliteStorage",\n yaml.dump({"base_dir": base_dir}, default_flow_style=False),\n ),\n "compute_logs": ConfigurableClassData(\n "dagster._core.storage.local_compute_log_manager",\n "LocalComputeLogManager",\n yaml.dump({"base_dir": compute_logs_directory(base_dir)}, default_flow_style=False),\n ),\n "scheduler": ConfigurableClassData(\n "dagster._core.scheduler",\n "DagsterDaemonScheduler",\n yaml.dump({}),\n ),\n "run_coordinator": ConfigurableClassData(\n "dagster._core.run_coordinator", "DefaultRunCoordinator", yaml.dump({})\n ),\n "run_launcher": ConfigurableClassData(\n "dagster",\n "DefaultRunLauncher",\n yaml.dump({}),\n ),\n # For back-compat, the default is actually set in the secrets_loader property above,\n # so that old clients loading new config don't try to load a class that they\n # don't recognize\n "secrets": None,\n # LEGACY DEFAULTS\n "run_storage": default_run_storage_data,\n "event_log_storage": default_event_log_storage_data,\n "schedule_storage": default_schedule_storage_data,\n }\n\n @staticmethod\n def from_dir(\n base_dir: str,\n *,\n config_dir: Optional[str] = None,\n config_filename: str = DAGSTER_CONFIG_YAML_FILENAME,\n overrides: Optional["DagsterInstanceOverrides"] = None,\n ) -> "InstanceRef":\n if config_dir is None:\n config_dir = base_dir\n\n overrides = check.opt_mapping_param(overrides, "overrides")\n config_value, custom_instance_class = dagster_instance_config(\n config_dir, config_filename=config_filename, overrides=overrides\n )\n\n if custom_instance_class:\n config_keys = set(custom_instance_class.config_schema().keys()) # type: ignore # (undefined method)\n custom_instance_class_config = {\n key: val for key, val in config_value.items() if key in config_keys\n }\n 
custom_instance_class_data = ConfigurableClassData(\n config_value["instance_class"]["module"],\n config_value["instance_class"]["class"],\n yaml.dump(custom_instance_class_config, default_flow_style=False),\n )\n defaults = custom_instance_class.config_defaults(base_dir) # type: ignore # (undefined method)\n else:\n custom_instance_class_data = None\n defaults = InstanceRef.config_defaults(base_dir)\n\n local_artifact_storage_data = configurable_class_data_or_default(\n config_value, "local_artifact_storage", defaults["local_artifact_storage"]\n )\n\n compute_logs_data = configurable_class_data_or_default(\n config_value,\n "compute_logs",\n defaults["compute_logs"],\n )\n\n if (\n config_value.get("run_storage")\n or config_value.get("event_log_storage")\n or config_value.get("schedule_storage")\n ):\n # using legacy config, specifying config for each of the constituent storages, make sure\n # to create a composite storage\n run_storage_data = configurable_class_data_or_default(\n config_value, "run_storage", defaults["run_storage"]\n )\n event_storage_data = configurable_class_data_or_default(\n config_value, "event_log_storage", defaults["event_log_storage"]\n )\n schedule_storage_data = configurable_class_data_or_default(\n config_value, "schedule_storage", defaults["schedule_storage"]\n )\n storage_data = ConfigurableClassData(\n module_name="dagster._core.storage.legacy_storage",\n class_name="CompositeStorage",\n config_yaml=yaml.dump(\n {\n "run_storage": {\n "module_name": run_storage_data.module_name, # type: ignore # (possible none)\n "class_name": run_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": run_storage_data.config_yaml, # type: ignore # (possible none)\n },\n "event_log_storage": {\n "module_name": event_storage_data.module_name, # type: ignore # (possible none)\n "class_name": event_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": event_storage_data.config_yaml, # type: ignore # (possible 
none)\n },\n "schedule_storage": {\n "module_name": schedule_storage_data.module_name, # type: ignore # (possible none)\n "class_name": schedule_storage_data.class_name, # type: ignore # (possible none)\n "config_yaml": schedule_storage_data.config_yaml, # type: ignore # (possible none)\n },\n },\n default_flow_style=False,\n ),\n )\n\n else:\n [\n storage_data,\n run_storage_data,\n event_storage_data,\n schedule_storage_data,\n ] = configurable_storage_data(\n config_value.get("storage"), defaults # type: ignore # (possible none)\n )\n\n scheduler_data = configurable_class_data_or_default(\n config_value, "scheduler", defaults["scheduler"]\n )\n\n if config_value.get("run_queue"):\n run_coordinator_data = configurable_class_data(\n {\n "module": "dagster.core.run_coordinator",\n "class": "QueuedRunCoordinator",\n "config": config_value["run_queue"],\n }\n )\n else:\n run_coordinator_data = configurable_class_data_or_default(\n config_value,\n "run_coordinator",\n defaults["run_coordinator"],\n )\n\n run_launcher_data = configurable_class_data_or_default(\n config_value,\n "run_launcher",\n defaults["run_launcher"],\n )\n\n secrets_loader_data = configurable_secrets_loader_data(\n config_value.get("secrets"), defaults["secrets"] # type: ignore # (possible none)\n )\n\n settings_keys = {\n "telemetry",\n "python_logs",\n "run_monitoring",\n "run_retries",\n "code_servers",\n "retention",\n "sensors",\n "schedules",\n "nux",\n "auto_materialize",\n }\n settings = {key: config_value.get(key) for key in settings_keys if config_value.get(key)}\n\n return InstanceRef(\n local_artifact_storage_data=local_artifact_storage_data, # type: ignore # (possible none)\n run_storage_data=run_storage_data,\n event_storage_data=event_storage_data,\n compute_logs_data=compute_logs_data, # type: ignore # (possible none)\n schedule_storage_data=schedule_storage_data,\n scheduler_data=scheduler_data,\n run_coordinator_data=run_coordinator_data,\n run_launcher_data=run_launcher_data,\n 
settings=settings,\n custom_instance_class_data=custom_instance_class_data,\n storage_data=storage_data,\n secrets_loader_data=secrets_loader_data,\n )\n\n @staticmethod\n def from_dict(instance_ref_dict):\n def value_for_ref_item(k, v):\n if v is None:\n return None\n if k == "settings":\n return v\n return ConfigurableClassData(*v)\n\n return InstanceRef(**{k: value_for_ref_item(k, v) for k, v in instance_ref_dict.items()})\n\n @property\n def local_artifact_storage(self) -> "LocalArtifactStorage":\n from dagster._core.storage.root import LocalArtifactStorage\n\n return self.local_artifact_storage_data.rehydrate(as_type=LocalArtifactStorage)\n\n @property\n def storage(self) -> Optional["DagsterStorage"]:\n from dagster._core.storage.base_storage import DagsterStorage\n\n return self.storage_data.rehydrate(as_type=DagsterStorage) if self.storage_data else None\n\n @property\n def run_storage(self) -> Optional["RunStorage"]:\n from dagster._core.storage.runs.base import RunStorage\n\n return (\n self.run_storage_data.rehydrate(as_type=RunStorage) if self.run_storage_data else None\n )\n\n @property\n def event_storage(self) -> Optional["EventLogStorage"]:\n from dagster._core.storage.event_log.base import EventLogStorage\n\n return (\n self.event_storage_data.rehydrate(as_type=EventLogStorage)\n if self.event_storage_data\n else None\n )\n\n @property\n def schedule_storage(self) -> Optional["ScheduleStorage"]:\n from dagster._core.storage.schedules.base import ScheduleStorage\n\n return (\n self.schedule_storage_data.rehydrate(as_type=ScheduleStorage)\n if self.schedule_storage_data\n else None\n )\n\n @property\n def compute_log_manager(self) -> "ComputeLogManager":\n from dagster._core.storage.compute_log_manager import ComputeLogManager\n\n return self.compute_logs_data.rehydrate(as_type=ComputeLogManager)\n\n @property\n def scheduler(self) -> Optional["Scheduler"]:\n from dagster._core.scheduler.scheduler import Scheduler\n\n return 
self.scheduler_data.rehydrate(as_type=Scheduler) if self.scheduler_data else None\n\n @property\n def run_coordinator(self) -> Optional["RunCoordinator"]:\n from dagster._core.run_coordinator.base import RunCoordinator\n\n return (\n self.run_coordinator_data.rehydrate(as_type=RunCoordinator)\n if self.run_coordinator_data\n else None\n )\n\n @property\n def run_launcher(self) -> Optional["RunLauncher"]:\n from dagster._core.launcher.base import RunLauncher\n\n return (\n self.run_launcher_data.rehydrate(as_type=RunLauncher)\n if self.run_launcher_data\n else None\n )\n\n @property\n def secrets_loader(self) -> Optional["SecretsLoader"]:\n from dagster._core.secrets.loader import SecretsLoader\n\n # Defining a default here rather than in stored config to avoid\n # back-compat issues when loading the config on older versions where\n # EnvFileLoader was not defined\n return (\n self.secrets_loader_data.rehydrate(as_type=SecretsLoader)\n if self.secrets_loader_data\n else None\n )\n\n @property\n def custom_instance_class(self) -> Type["DagsterInstance"]:\n return ( # type: ignore # (ambiguous return type)\n class_from_code_pointer(\n self.custom_instance_class_data.module_name,\n self.custom_instance_class_data.class_name,\n )\n if self.custom_instance_class_data\n else None\n )\n\n @property\n def custom_instance_class_config(self) -> Mapping[str, Any]:\n return (\n self.custom_instance_class_data.config_dict if self.custom_instance_class_data else {}\n )\n\n def to_dict(self) -> Mapping[str, Any]:\n return self._asdict()
\n
", "current_page_name": "_modules/dagster/_core/instance/ref", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}, {"link": "../", "title": "dagster._core.instance"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance.ref"}, "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance"}, "instance_for_test": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.instance_for_test

\nimport os\nimport sys\nimport tempfile\nfrom contextlib import ExitStack, contextmanager\nfrom typing import Any, Iterator, Mapping, Optional\n\nimport yaml\n\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .._utils.env import environ\nfrom .._utils.merger import merge_dicts\nfrom .instance import DagsterInstance\n\n\n
[docs]@contextmanager\ndef instance_for_test(\n overrides: Optional[Mapping[str, Any]] = None,\n set_dagster_home: bool = True,\n temp_dir: Optional[str] = None,\n) -> Iterator[DagsterInstance]:\n """Creates a persistent :py:class:`~dagster.DagsterInstance` available within a context manager.\n\n When a context manager is opened, if no `temp_dir` parameter is set, a new\n temporary directory will be created for the duration of the context\n manager's opening. If the `set_dagster_home` parameter is set to True\n (True by default), the `$DAGSTER_HOME` environment variable will be\n overridden to be this directory (or the directory passed in by `temp_dir`)\n for the duration of the context manager being open.\n\n Args:\n overrides (Optional[Mapping[str, Any]]):\n Config to provide to instance (config format follows that typically found in an `instance.yaml` file).\n set_dagster_home (Optional[bool]):\n If set to True, the `$DAGSTER_HOME` environment variable will be\n overridden to be the directory used by this instance for the\n duration that the context manager is open. Upon the context\n manager closing, the `$DAGSTER_HOME` variable will be re-set to the original value. (Defaults to True).\n temp_dir (Optional[str]):\n The directory to use for storing local artifacts produced by the\n instance. 
If not set, a temporary directory will be created for\n the duration of the context manager being open, and all artifacts\n will be torn down afterward.\n """\n with ExitStack() as stack:\n if not temp_dir:\n temp_dir = stack.enter_context(tempfile.TemporaryDirectory())\n\n # wait for any grpc processes that created runs during test disposal to finish,\n # since they might also be using this instance's tempdir (and to keep each test\n # isolated / avoid race conditions in newer versions of grpcio when servers are\n # shutting down and spinning up at the same time)\n instance_overrides = merge_dicts(\n {\n "telemetry": {"enabled": False},\n "code_servers": {"wait_for_local_processes_on_shutdown": True},\n },\n (overrides if overrides else {}),\n )\n\n if set_dagster_home:\n stack.enter_context(\n environ({"DAGSTER_HOME": temp_dir, "DAGSTER_DISABLE_TELEMETRY": "yes"})\n )\n\n with open(os.path.join(temp_dir, "dagster.yaml"), "w", encoding="utf8") as fd:\n yaml.dump(instance_overrides, fd, default_flow_style=False)\n\n with DagsterInstance.from_config(temp_dir) as instance:\n try:\n yield instance\n except:\n sys.stderr.write(\n "Test raised an exception, attempting to clean up instance:"\n + serializable_error_info_from_exc_info(sys.exc_info()).to_string()\n + "\\n"\n )\n raise\n finally:\n cleanup_test_instance(instance)
\n\n\ndef cleanup_test_instance(instance: DagsterInstance) -> None:\n # To avoid filesystem contention when we close the temporary directory, wait for\n # all runs to reach a terminal state, and close any subprocesses or threads\n # that might be accessing the run history DB.\n\n # Since launcher is lazy loaded, we don't need to do anyting if it's None\n if instance._run_launcher: # noqa: SLF001\n instance._run_launcher.join() # noqa: SLF001\n
", "current_page_name": "_modules/dagster/_core/instance_for_test", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.instance_for_test"}, "launcher": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.base

\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import NamedTuple, Optional\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.workspace.workspace import IWorkspace\nfrom dagster._serdes import whitelist_for_serdes\n\n\nclass LaunchRunContext(NamedTuple):\n    """Context available within a run launcher's launch_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\nclass ResumeRunContext(NamedTuple):\n    """Context available within a run launcher's resume_run call."""\n\n    dagster_run: DagsterRun\n    workspace: Optional[IWorkspace]\n    resume_attempt_number: Optional[int] = None\n\n    @property\n    def job_code_origin(self) -> Optional[JobPythonOrigin]:\n        return self.dagster_run.job_code_origin\n\n\n@whitelist_for_serdes\nclass WorkerStatus(Enum):\n    RUNNING = "RUNNING"\n    NOT_FOUND = "NOT_FOUND"\n    FAILED = "FAILED"\n    SUCCESS = "SUCCESS"\n    UNKNOWN = "UNKNOWN"\n\n\nclass CheckRunHealthResult(NamedTuple):\n    """Result of a check_run_worker_health call."""\n\n    status: WorkerStatus\n    msg: Optional[str] = None\n    transient: Optional[bool] = None\n    run_worker_id: Optional[str] = None  # Identifier for a particular run worker\n\n    def __str__(self) -> str:\n        return f"{self.status.value}: '{self.msg}'"\n\n\n
[docs]class RunLauncher(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n @abstractmethod\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run.\n\n This method should begin the execution of the specified run, and may emit engine events.\n Runs should be created in the instance (e.g., by calling\n ``DagsterInstance.create_run()``) *before* this method is called, and\n should be in the ``PipelineRunStatus.STARTING`` state. Typically, this method will\n not be invoked directly, but should be invoked through ``DagsterInstance.launch_run()``.\n\n Args:\n context (LaunchRunContext): information about the launch - every run launcher\n will need the PipelineRun, and some run launchers may need information from the\n IWorkspace from which the run was launched.\n """\n\n @abstractmethod\n def terminate(self, run_id: str) -> bool:\n """Terminates a process.\n\n Returns False is the process was already terminated. Returns true if\n the process was alive and was successfully terminated\n """\n\n def dispose(self) -> None:\n """Do any resource cleanup that should happen when the DagsterInstance is\n cleaning itself up.\n """\n\n def join(self, timeout: int = 30) -> None:\n pass\n\n @property\n def supports_check_run_worker_health(self) -> bool:\n """Whether the run launcher supports check_run_worker_health."""\n return False\n\n def check_run_worker_health(self, run: DagsterRun) -> CheckRunHealthResult:\n raise NotImplementedError(\n "This run launcher does not support run monitoring. Please disable it on your instance."\n )\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n return None\n\n @property\n def supports_resume_run(self) -> bool:\n """Whether the run launcher supports resume_run."""\n return False\n\n def resume_run(self, context: ResumeRunContext) -> None:\n raise NotImplementedError(\n "This run launcher does not support resuming runs. If using "\n "run monitoring, set max_resume_run_attempts to 0."\n )
\n
", "current_page_name": "_modules/dagster/_core/launcher/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.base"}, "default_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.launcher.default_run_launcher

\nimport time\nfrom typing import TYPE_CHECKING, Any, Mapping, Optional, cast\n\nfrom typing_extensions import Self\n\nimport dagster._seven as seven\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterLaunchFailedError,\n    DagsterUserCodeProcessError,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import GRPC_INFO_TAG\nfrom dagster._serdes import (\n    ConfigurableClass,\n    deserialize_value,\n)\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.merger import merge_dicts\n\nfrom .base import LaunchRunContext, RunLauncher\n\nif TYPE_CHECKING:\n    from dagster._core.instance import DagsterInstance\n    from dagster._grpc.client import DagsterGrpcClient\n\n\n# note: this class is a top level export, so we defer many imports til use for performance\n
[docs]class DefaultRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs against running GRPC servers."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = inst_data\n\n self._run_ids = set()\n\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DefaultRunLauncher(inst_data=inst_data)\n\n @staticmethod\n def launch_run_from_grpc_client(\n instance: "DagsterInstance", run: DagsterRun, grpc_client: "DagsterGrpcClient"\n ):\n # defer for perf\n from dagster._grpc.types import ExecuteExternalJobArgs, StartRunResult\n\n instance.add_run_tags(\n run.run_id,\n {\n GRPC_INFO_TAG: seven.json.dumps(\n merge_dicts(\n {"host": grpc_client.host},\n (\n {"port": grpc_client.port}\n if grpc_client.port\n else {"socket": grpc_client.socket}\n ),\n ({"use_ssl": True} if grpc_client.use_ssl else {}),\n )\n )\n },\n )\n\n res = deserialize_value(\n grpc_client.start_run(\n ExecuteExternalJobArgs(\n job_origin=run.external_job_origin, # type: ignore # (possible none)\n run_id=run.run_id,\n instance_ref=instance.get_ref(),\n )\n ),\n StartRunResult,\n )\n if not res.success:\n raise (\n DagsterLaunchFailedError(\n res.message, serializable_error_info=res.serializable_error_info\n )\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n # defer for perf\n from dagster._core.host_representation.code_location import (\n GrpcServerCodeLocation,\n )\n\n run = context.dagster_run\n\n check.inst_param(run, "run", DagsterRun)\n\n if not context.workspace:\n raise DagsterInvariantViolationError(\n "DefaultRunLauncher requires a workspace to be included in its LaunchRunContext"\n )\n\n external_job_origin = check.not_none(run.external_job_origin)\n 
code_location = context.workspace.get_code_location(\n external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n check.inst(\n code_location,\n GrpcServerCodeLocation,\n "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",\n )\n\n DefaultRunLauncher.launch_run_from_grpc_client(\n self._instance, run, cast(GrpcServerCodeLocation, code_location).client\n )\n\n self._run_ids.add(run.run_id)\n\n def _get_grpc_client_for_termination(self, run_id):\n # defer for perf\n from dagster._grpc.client import DagsterGrpcClient\n\n if not self.has_instance:\n return None\n\n run = self._instance.get_run_by_id(run_id)\n if not run or run.is_finished:\n return None\n\n tags = run.tags\n\n if GRPC_INFO_TAG not in tags:\n return None\n\n grpc_info = seven.json.loads(tags.get(GRPC_INFO_TAG))\n\n return DagsterGrpcClient(\n port=grpc_info.get("port"),\n socket=grpc_info.get("socket"),\n host=grpc_info.get("host"),\n use_ssl=bool(grpc_info.get("use_ssl", False)),\n )\n\n def terminate(self, run_id):\n # defer for perf\n from dagster._grpc.types import CancelExecutionRequest, CancelExecutionResult\n\n check.str_param(run_id, "run_id")\n if not self.has_instance:\n return False\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n client = self._get_grpc_client_for_termination(run_id)\n\n if not client:\n self._instance.report_engine_event(\n message="Unable to get grpc client to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n res = deserialize_value(\n client.cancel_execution(CancelExecutionRequest(run_id=run_id)), CancelExecutionResult\n )\n\n if res.serializable_error_info:\n raise DagsterUserCodeProcessError.from_error_info(res.serializable_error_info)\n\n return res.success\n\n def join(self, timeout=30):\n # If this hasn't been initialized at all, we can just do a noop\n if not self.has_instance:\n 
return\n\n total_time = 0\n interval = 0.01\n\n while True:\n active_run_ids = [\n run_id\n for run_id in self._run_ids\n if (\n self._instance.get_run_by_id(run_id)\n and not self._instance.get_run_by_id(run_id).is_finished\n )\n ]\n\n if len(active_run_ids) == 0:\n return\n\n if total_time >= timeout:\n raise Exception(f"Timed out waiting for these runs to finish: {active_run_ids!r}")\n\n total_time += interval\n time.sleep(interval)\n interval = interval * 2
\n
", "current_page_name": "_modules/dagster/_core/launcher/default_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.launcher.default_run_launcher"}}, "log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.log_manager

\nimport datetime\nimport logging\nfrom typing import TYPE_CHECKING, Any, Mapping, NamedTuple, Optional, Sequence, Union, cast\n\nfrom typing_extensions import Protocol\n\nimport dagster._check as check\nfrom dagster._core.utils import coerce_valid_log_level, make_new_run_id\nfrom dagster._utils.log import get_dagster_logger\n\nif TYPE_CHECKING:\n    from dagster import DagsterInstance\n    from dagster._core.events import DagsterEvent\n    from dagster._core.storage.dagster_run import DagsterRun\n\nDAGSTER_META_KEY = "dagster_meta"\n\n\nclass IDagsterMeta(Protocol):\n    @property\n    def dagster_meta(self) -> "DagsterLoggingMetadata": ...\n\n\n# The type-checker complains here that DagsterLogRecord does not implement the `dagster_meta`\n# property of `IDagsterMeta`. We ignore this error because we don't need to implement this method--\n# `DagsterLogRecord` is a stub class that is never instantiated. We only ever cast\n# `logging.LogRecord` objects to `DagsterLogRecord`, because it gives us typed access to the\n# `dagster_meta` property. 
`dagster_meta` itself is set on these `logging.LogRecord` objects via the\n# `extra` argument to `logging.Logger.log` (see `DagsterLogManager.log_dagster_event`), but\n# `logging.LogRecord` has no way of exposing to the type-checker the attributes that are dynamically\n# defined via `extra`.\nclass DagsterLogRecord(logging.LogRecord, IDagsterMeta):  # type: ignore\n    pass\n\n\nclass DagsterMessageProps(\n    NamedTuple(\n        "_DagsterMessageProps",\n        [\n            ("orig_message", Optional[str]),\n            ("log_message_id", Optional[str]),\n            ("log_timestamp", Optional[str]),\n            ("dagster_event", Optional[Any]),\n        ],\n    )\n):\n    """Internal class used to represent specific attributes about a logged message."""\n\n    def __new__(\n        cls,\n        orig_message: str,\n        log_message_id: Optional[str] = None,\n        log_timestamp: Optional[str] = None,\n        dagster_event: Optional["DagsterEvent"] = None,\n    ):\n        return super().__new__(\n            cls,\n            orig_message=check.str_param(orig_message, "orig_message"),\n            log_message_id=check.opt_str_param(\n                log_message_id, "log_message_id", default=make_new_run_id()\n            ),\n            log_timestamp=check.opt_str_param(\n                log_timestamp,\n                "log_timestamp",\n                default=datetime.datetime.utcnow().isoformat(),\n            ),\n            dagster_event=dagster_event,\n        )\n\n    @property\n    def error_str(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n\n        event_specific_data = self.dagster_event.event_specific_data\n        if not event_specific_data:\n            return None\n\n        error = getattr(event_specific_data, "error", None)\n        if error:\n            return f'\\n\\n{getattr(event_specific_data, "error_display_string", error.to_string())}'\n        return None\n\n    @property\n    def 
pid(self) -> Optional[str]:\n        if self.dagster_event is None or self.dagster_event.pid is None:\n            return None\n        return str(self.dagster_event.pid)\n\n    @property\n    def step_key(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.step_key\n\n    @property\n    def event_type_value(self) -> Optional[str]:\n        if self.dagster_event is None:\n            return None\n        return self.dagster_event.event_type_value\n\n\nclass DagsterLoggingMetadata(\n    NamedTuple(\n        "_DagsterLoggingMetadata",\n        [\n            ("run_id", Optional[str]),\n            ("job_name", Optional[str]),\n            ("job_tags", Mapping[str, str]),\n            ("step_key", Optional[str]),\n            ("op_name", Optional[str]),\n            ("resource_name", Optional[str]),\n            ("resource_fn_name", Optional[str]),\n        ],\n    )\n):\n    """Internal class used to represent the context in which a given message was logged (i.e. 
the\n    step, pipeline run, resource, etc.).\n    """\n\n    def __new__(\n        cls,\n        run_id: Optional[str] = None,\n        job_name: Optional[str] = None,\n        job_tags: Optional[Mapping[str, str]] = None,\n        step_key: Optional[str] = None,\n        op_name: Optional[str] = None,\n        resource_name: Optional[str] = None,\n        resource_fn_name: Optional[str] = None,\n    ):\n        return super().__new__(\n            cls,\n            run_id=run_id,\n            job_name=job_name,\n            job_tags=job_tags or {},\n            step_key=step_key,\n            op_name=op_name,\n            resource_name=resource_name,\n            resource_fn_name=resource_fn_name,\n        )\n\n    @property\n    def log_source(self) -> str:\n        if self.resource_name is None:\n            return self.job_name or "system"\n        return f"resource:{self.resource_name}"\n\n    def all_tags(self) -> Mapping[str, str]:\n        # converts all values into strings\n        return {k: str(v) for k, v in self._asdict().items()}\n\n    def event_tags(self) -> Mapping[str, str]:\n        # Exclude pipeline_tags since it can be quite large and can be found on the run\n        return {k: str(v) for k, v in self._asdict().items() if k != "job_tags"}\n\n\ndef construct_log_string(\n    logging_metadata: DagsterLoggingMetadata, message_props: DagsterMessageProps\n) -> str:\n    from dagster._core.events import EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n\n    event_type_str = (\n        EVENT_TYPE_VALUE_TO_DISPLAY_STRING[message_props.event_type_value]\n        if message_props.event_type_value in EVENT_TYPE_VALUE_TO_DISPLAY_STRING\n        else message_props.event_type_value\n    )\n    return " - ".join(\n        filter(\n            None,\n            (\n                logging_metadata.log_source,\n                logging_metadata.run_id,\n                message_props.pid,\n                logging_metadata.step_key,\n                event_type_str,\n         
       message_props.orig_message,\n            ),\n        )\n    ) + (message_props.error_str or "")\n\n\ndef get_dagster_meta_dict(\n    logging_metadata: DagsterLoggingMetadata, dagster_message_props: DagsterMessageProps\n) -> Mapping[str, object]:\n    # combine all dagster meta information into a single dictionary\n    meta_dict = {\n        **logging_metadata._asdict(),\n        **dagster_message_props._asdict(),\n    }\n    # step-level events can be logged from a pipeline context. for these cases, pull the step\n    # key from the underlying DagsterEvent\n    if meta_dict["step_key"] is None:\n        meta_dict["step_key"] = dagster_message_props.step_key\n\n    return meta_dict\n\n\nclass DagsterLogHandler(logging.Handler):\n    """Internal class used to turn regular logs into Dagster logs by adding Dagster-specific\n    metadata (such as pipeline_name or step_key), as well as reformatting the underlying message.\n\n    Note: The `loggers` argument will be populated with the set of @loggers supplied to the current\n    pipeline run. 
These essentially work as handlers (they do not create their own log messages,\n    they simply re-log messages that are created from context.log.x() calls), which is why they are\n    referenced from within this handler class.\n    """\n\n    def __init__(\n        self,\n        logging_metadata: DagsterLoggingMetadata,\n        loggers: Sequence[logging.Logger],\n        handlers: Sequence[logging.Handler],\n    ):\n        self._logging_metadata = logging_metadata\n        self._loggers = loggers\n        self._handlers = handlers\n        self._should_capture = True\n        super().__init__()\n\n    @property\n    def logging_metadata(self) -> DagsterLoggingMetadata:\n        return self._logging_metadata\n\n    def with_tags(self, **new_tags: str) -> "DagsterLogHandler":\n        return DagsterLogHandler(\n            logging_metadata=self.logging_metadata._replace(**new_tags),\n            loggers=self._loggers,\n            handlers=self._handlers,\n        )\n\n    def _extract_extra(self, record: logging.LogRecord) -> Mapping[str, Any]:\n        """In the logging.Logger log() implementation, the elements of the `extra` dictionary\n        argument are smashed into the __dict__ of the underlying logging.LogRecord.\n        This function figures out what the original `extra` values of the log call were by\n        comparing the set of attributes in the received record to those of a default record.\n        """\n        ref_attrs = list(logging.makeLogRecord({}).__dict__.keys()) + [\n            "message",\n            "asctime",\n        ]\n        return {k: v for k, v in record.__dict__.items() if k not in ref_attrs}\n\n    def _convert_record(self, record: logging.LogRecord) -> DagsterLogRecord:\n        # we store the originating DagsterEvent in the DAGSTER_META_KEY field, if applicable\n        dagster_meta = getattr(record, DAGSTER_META_KEY, None)\n\n        # generate some properties for this specific record\n        dagster_message_props = 
DagsterMessageProps(\n            orig_message=record.getMessage(), dagster_event=dagster_meta\n        )\n\n        # set the dagster meta info for the record\n        setattr(\n            record,\n            DAGSTER_META_KEY,\n            get_dagster_meta_dict(self._logging_metadata, dagster_message_props),\n        )\n\n        # update the message to be formatted like other dagster logs\n        record.msg = construct_log_string(self._logging_metadata, dagster_message_props)\n        record.args = ()\n\n        # DagsterLogRecord is a LogRecord with a `dagster_meta` field\n        return cast(DagsterLogRecord, record)\n\n    def filter(self, record: logging.LogRecord) -> bool:\n        """If you list multiple levels of a python logging hierarchy as managed loggers, and do not\n        set the propagate attribute to False, this will result in that record getting logged\n        multiple times, as the DagsterLogHandler will be invoked at each level of the hierarchy as\n        the message is propagated. 
This filter prevents this from happening.\n        """\n        return self._should_capture and not isinstance(\n            getattr(record, DAGSTER_META_KEY, None), dict\n        )\n\n    def emit(self, record: logging.LogRecord) -> None:\n        """For any received record, add Dagster metadata, and have handlers handle it."""\n        try:\n            # to prevent the potential for infinite loops in which a handler produces log messages\n            # which are then captured and then handled by that same handler (etc.), do not capture\n            # any log messages while one is currently being emitted\n            self._should_capture = False\n            dagster_record = self._convert_record(record)\n            # built-in handlers\n            for handler in self._handlers:\n                if dagster_record.levelno >= handler.level:\n                    handler.handle(dagster_record)\n            # user-defined @loggers\n            for logger in self._loggers:\n                logger.log(\n                    dagster_record.levelno,\n                    dagster_record.msg,\n                    exc_info=dagster_record.exc_info,\n                    extra=self._extract_extra(record),\n                )\n        finally:\n            self._should_capture = True\n\n\n
[docs]class DagsterLogManager(logging.Logger):\n """Centralized dispatch for logging from user code.\n\n Handles the construction of uniform structured log messages and passes them through to the\n underlying loggers/handlers.\n\n An instance of the log manager is made available to ops as ``context.log``. Users should not\n initialize instances of the log manager directly. To configure custom loggers, set the\n ``logger_defs`` argument in an `@job` decorator or when calling the `to_job()` method on a\n :py:class:`GraphDefinition`.\n\n The log manager inherits standard convenience methods like those exposed by the Python standard\n library :py:mod:`python:logging` module (i.e., within the body of an op,\n ``context.log.{debug, info, warning, warn, error, critical, fatal}``).\n\n The underlying integer API can also be called directly using, e.g.\n ``context.log.log(5, msg)``, and the log manager will delegate to the ``log`` method\n defined on each of the loggers it manages.\n\n User-defined custom log levels are not supported, and calls to, e.g.,\n ``context.log.trace`` or ``context.log.notice`` will result in hard exceptions **at runtime**.\n """\n\n def __init__(\n self,\n dagster_handler: DagsterLogHandler,\n level: int = logging.NOTSET,\n managed_loggers: Optional[Sequence[logging.Logger]] = None,\n ):\n super().__init__(name="dagster", level=coerce_valid_log_level(level))\n self._managed_loggers = check.opt_sequence_param(\n managed_loggers, "managed_loggers", of_type=logging.Logger\n )\n self._dagster_handler = dagster_handler\n self.addHandler(dagster_handler)\n\n @classmethod\n def create(\n cls,\n loggers: Sequence[logging.Logger],\n handlers: Optional[Sequence[logging.Handler]] = None,\n instance: Optional["DagsterInstance"] = None,\n dagster_run: Optional["DagsterRun"] = None,\n ) -> "DagsterLogManager":\n """Create a DagsterLogManager with a set of subservient loggers."""\n handlers = check.opt_sequence_param(handlers, "handlers", 
of_type=logging.Handler)\n\n managed_loggers = [get_dagster_logger()]\n python_log_level = logging.NOTSET\n\n if instance:\n handlers = [*handlers, *instance.get_handlers()]\n managed_loggers += [\n logging.getLogger(lname) if lname != "root" else logging.getLogger()\n for lname in instance.managed_python_loggers\n ]\n if instance.python_log_level is not None:\n python_log_level = coerce_valid_log_level(instance.python_log_level)\n\n # set all loggers to the declared logging level\n for logger in managed_loggers:\n logger.setLevel(python_log_level)\n\n if dagster_run:\n logging_metadata = DagsterLoggingMetadata(\n run_id=dagster_run.run_id,\n job_name=dagster_run.job_name,\n job_tags=dagster_run.tags,\n )\n else:\n logging_metadata = DagsterLoggingMetadata()\n\n return cls(\n dagster_handler=DagsterLogHandler(\n logging_metadata=logging_metadata,\n loggers=loggers,\n handlers=handlers,\n ),\n level=python_log_level,\n managed_loggers=managed_loggers,\n )\n\n @property\n def logging_metadata(self) -> DagsterLoggingMetadata:\n return self._dagster_handler.logging_metadata\n\n def begin_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.addHandler(self._dagster_handler)\n\n def end_python_log_capture(self) -> None:\n for logger in self._managed_loggers:\n logger.removeHandler(self._dagster_handler)\n\n def log_dagster_event(\n self, level: Union[str, int], msg: str, dagster_event: "DagsterEvent"\n ) -> None:\n """Log a DagsterEvent at the given level. 
Attributes about the context it was logged in\n (such as the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): message describing the event\n dagster_event (DagsterEvent): DagsterEvent that will be logged\n """\n self.log(level=level, msg=msg, extra={DAGSTER_META_KEY: dagster_event})\n\n def log(self, level: Union[str, int], msg: object, *args: Any, **kwargs: Any) -> None:\n """Log a message at the given level. Attributes about the context it was logged in (such as\n the solid name or pipeline name) will be automatically attached to the created record.\n\n Args:\n level (str, int): either a string representing the desired log level ("INFO", "WARN"),\n or an integer level such as logging.INFO or logging.DEBUG.\n msg (str): the message to be logged\n *args: the logged message will be msg % args\n """\n level = coerce_valid_log_level(level)\n # log DagsterEvents regardless of level\n if self.isEnabledFor(level) or ("extra" in kwargs and DAGSTER_META_KEY in kwargs["extra"]):\n self._log(level, msg, args, **kwargs)\n\n def with_tags(self, **new_tags: str) -> "DagsterLogManager":\n """Add new tags in "new_tags" to the set of tags attached to this log manager instance, and\n return a new DagsterLogManager with the merged set of tags.\n\n Args:\n new_tags (Dict[str,str]): Dictionary of tags\n\n Returns:\n DagsterLogManager: a new DagsterLogManager namedtuple with updated tags for the same\n run ID and loggers.\n """\n return DagsterLogManager(\n dagster_handler=self._dagster_handler.with_tags(**new_tags),\n managed_loggers=self._managed_loggers,\n level=self.level,\n )
\n
", "current_page_name": "_modules/dagster/_core/log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.log_manager"}, "run_coordinator": {"default_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.default_run_coordinator

\nimport logging\nfrom typing import Mapping, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\n
[docs]class DefaultRunCoordinator(RunCoordinator, ConfigurableClass):\n """Immediately send runs to the run launcher."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._logger = logging.getLogger("dagster.run_coordinator.default_run_coordinator")\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, object]\n ) -> Self:\n return cls(inst_data=inst_data, **config_value)\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n self._instance.launch_run(dagster_run.run_id, context.workspace)\n else:\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping launch."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/default_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.default_run_coordinator"}, "queued_run_coordinator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.run_coordinator.queued_run_coordinator

\nimport logging\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    IntSource,\n    String,\n    _check as check,\n)\nfrom dagster._builtins import Bool\nfrom dagster._config import Array, Field, Noneable, ScalarUnion, Shape\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .base import RunCoordinator, SubmitRunContext\n\n\nclass RunQueueConfig(\n    NamedTuple(\n        "_RunQueueConfig",\n        [\n            ("max_concurrent_runs", int),\n            ("tag_concurrency_limits", Sequence[Mapping[str, Any]]),\n            ("max_user_code_failure_retries", int),\n            ("user_code_failure_retry_delay", int),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        max_concurrent_runs: int,\n        tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]],\n        max_user_code_failure_retries: int = 0,\n        user_code_failure_retry_delay: int = 60,\n    ):\n        return super(RunQueueConfig, cls).__new__(\n            cls,\n            check.int_param(max_concurrent_runs, "max_concurrent_runs"),\n            check.opt_sequence_param(tag_concurrency_limits, "tag_concurrency_limits"),\n            check.int_param(max_user_code_failure_retries, "max_user_code_failure_retries"),\n            check.int_param(user_code_failure_retry_delay, "user_code_failure_retry_delay"),\n        )\n\n\n
[docs]class QueuedRunCoordinator(RunCoordinator[T_DagsterInstance], ConfigurableClass):\n """Enqueues runs via the run storage, to be deqeueued by the Dagster Daemon process. Requires\n the Dagster Daemon process to be alive in order for runs to be launched.\n """\n\n def __init__(\n self,\n max_concurrent_runs: Optional[int] = None,\n tag_concurrency_limits: Optional[Sequence[Mapping[str, Any]]] = None,\n dequeue_interval_seconds: Optional[int] = None,\n dequeue_use_threads: Optional[bool] = None,\n dequeue_num_workers: Optional[int] = None,\n max_user_code_failure_retries: Optional[int] = None,\n user_code_failure_retry_delay: Optional[int] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data: Optional[ConfigurableClassData] = check.opt_inst_param(\n inst_data, "inst_data", ConfigurableClassData\n )\n self._max_concurrent_runs: int = check.opt_int_param(\n max_concurrent_runs, "max_concurrent_runs", 10\n )\n check.invariant(\n self._max_concurrent_runs >= -1,\n "Negative values other than -1 (which disables the limit) for max_concurrent_runs"\n " are disallowed.",\n )\n self._tag_concurrency_limits: Sequence[Mapping[str, Any]] = check.opt_list_param(\n tag_concurrency_limits,\n "tag_concurrency_limits",\n )\n self._dequeue_interval_seconds: int = check.opt_int_param(\n dequeue_interval_seconds, "dequeue_interval_seconds", 5\n )\n self._dequeue_use_threads: bool = check.opt_bool_param(\n dequeue_use_threads, "dequeue_use_threads", False\n )\n self._dequeue_num_workers: Optional[int] = check.opt_int_param(\n dequeue_num_workers, "dequeue_num_workers"\n )\n self._max_user_code_failure_retries: int = check.opt_int_param(\n max_user_code_failure_retries, "max_user_code_failure_retries", 0\n )\n self._user_code_failure_retry_delay: int = check.opt_int_param(\n user_code_failure_retry_delay, "user_code_failure_retry_delay", 60\n )\n self._logger = logging.getLogger("dagster.run_coordinator.queued_run_coordinator")\n super().__init__()\n\n 
@property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_run_queue_config(self) -> RunQueueConfig:\n return RunQueueConfig(\n max_concurrent_runs=self._max_concurrent_runs,\n tag_concurrency_limits=self._tag_concurrency_limits,\n max_user_code_failure_retries=self._max_user_code_failure_retries,\n user_code_failure_retry_delay=self._user_code_failure_retry_delay,\n )\n\n @property\n def dequeue_interval_seconds(self) -> int:\n return self._dequeue_interval_seconds\n\n @property\n def dequeue_use_threads(self) -> bool:\n return self._dequeue_use_threads\n\n @property\n def dequeue_num_workers(self) -> Optional[int]:\n return self._dequeue_num_workers\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "max_concurrent_runs": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The maximum number of runs that are allowed to be in progress at once."\n " Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs"\n " from launching. Any other negative values are disallowed."\n ),\n ),\n "tag_concurrency_limits": Field(\n config=Noneable(\n Array(\n Shape(\n {\n "key": String,\n "value": Field(\n ScalarUnion(\n scalar_type=String,\n non_scalar_schema=Shape({"applyLimitPerUniqueValue": Bool}),\n ),\n is_required=False,\n ),\n "limit": Field(int),\n }\n )\n )\n ),\n is_required=False,\n description=(\n "A set of limits that are applied to runs with particular tags. If a value is"\n " set, the limit is applied to only that key-value pair. If no value is set,"\n " the limit is applied across all values of that key. 
If the value is set to a"\n " dict with `applyLimitPerUniqueValue: true`, the limit will apply to the"\n " number of unique values for that key."\n ),\n ),\n "dequeue_interval_seconds": Field(\n config=IntSource,\n is_required=False,\n description=(\n "The interval in seconds at which the Dagster Daemon "\n "should periodically check the run queue for new runs to launch."\n ),\n ),\n "dequeue_use_threads": Field(\n config=bool,\n is_required=False,\n description=(\n "Whether or not to use threads for concurrency when launching dequeued runs."\n ),\n ),\n "dequeue_num_workers": Field(\n config=IntSource,\n is_required=False,\n description=(\n "If dequeue_use_threads is true, limit the number of concurrent worker threads."\n ),\n ),\n "max_user_code_failure_retries": Field(\n config=IntSource,\n is_required=False,\n default_value=0,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how many times to retry the dequeue before failing it. The only run launcher"\n " that requires the gRPC server to be running is the DefaultRunLauncher, so"\n " setting this will have no effect unless that run launcher is being used."\n ),\n ),\n "user_code_failure_retry_delay": Field(\n config=IntSource,\n is_required=False,\n default_value=60,\n description=(\n "If there is an error reaching a Dagster gRPC server while dequeuing the run,"\n " how long to wait before retrying any runs from that same code location. 
The"\n " only run launcher that requires the gRPC server to be running is the"\n " DefaultRunLauncher, so setting this will have no effect unless that run"\n " launcher is being used."\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return cls(\n inst_data=inst_data,\n max_concurrent_runs=config_value.get("max_concurrent_runs"),\n tag_concurrency_limits=config_value.get("tag_concurrency_limits"),\n dequeue_interval_seconds=config_value.get("dequeue_interval_seconds"),\n dequeue_use_threads=config_value.get("dequeue_use_threads"),\n dequeue_num_workers=config_value.get("dequeue_num_workers"),\n max_user_code_failure_retries=config_value.get("max_user_code_failure_retries"),\n user_code_failure_retry_delay=config_value.get("user_code_failure_retry_delay"),\n )\n\n def submit_run(self, context: SubmitRunContext) -> DagsterRun:\n dagster_run = context.dagster_run\n\n if dagster_run.status == DagsterRunStatus.NOT_STARTED:\n enqueued_event = DagsterEvent(\n event_type_value=DagsterEventType.PIPELINE_ENQUEUED.value,\n job_name=dagster_run.job_name,\n )\n self._instance.report_dagster_event(enqueued_event, run_id=dagster_run.run_id)\n else:\n # the run was already submitted, this is a no-op\n self._logger.warning(\n f"submit_run called for run {dagster_run.run_id} with status "\n f"{dagster_run.status.value}, skipping enqueue."\n )\n\n run = self._instance.get_run_by_id(dagster_run.run_id)\n if run is None:\n check.failed(f"Failed to reload run {dagster_run.run_id}")\n return run\n\n def cancel_run(self, run_id: str) -> bool:\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n # NOTE: possible race condition if the dequeuer acts on this run at the same time\n # https://github.com/dagster-io/dagster/issues/3323\n if run.status == DagsterRunStatus.QUEUED:\n self._instance.report_run_canceling(\n run,\n message="Canceling run from the queue.",\n )\n 
self._instance.report_run_canceled(run)\n return True\n else:\n return self._instance.run_launcher.terminate(run_id)
\n
", "current_page_name": "_modules/dagster/_core/run_coordinator/queued_run_coordinator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.run_coordinator.queued_run_coordinator"}}, "scheduler": {"scheduler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.scheduler.scheduler

\nimport abc\nimport os\nfrom typing import Any, Mapping, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._config import Field, IntSource\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterError\nfrom dagster._core.host_representation import ExternalSchedule\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    InstigatorState,\n    InstigatorStatus,\n    ScheduleInstigatorData,\n)\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._seven import get_current_datetime_in_utc\nfrom dagster._utils import mkdir_p\n\n\nclass DagsterSchedulerError(DagsterError):\n    """Base class for all Dagster Scheduler errors."""\n\n\nclass DagsterScheduleDoesNotExist(DagsterSchedulerError):\n    """Errors raised when fetching a schedule."""\n\n\nclass SchedulerDebugInfo(\n    NamedTuple(\n        "SchedulerDebugInfo",\n        [\n            ("errors", Sequence[str]),\n            ("scheduler_config_info", str),\n            ("scheduler_info", str),\n            ("schedule_storage", Sequence[str]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        errors: Sequence[str],\n        scheduler_config_info: str,\n        scheduler_info: str,\n        schedule_storage: Sequence[str],\n    ):\n        return super(SchedulerDebugInfo, cls).__new__(\n            cls,\n            errors=check.sequence_param(errors, "errors", of_type=str),\n            scheduler_config_info=check.str_param(scheduler_config_info, "scheduler_config_info"),\n            scheduler_info=check.str_param(scheduler_info, "scheduler_info"),\n            schedule_storage=check.sequence_param(\n                schedule_storage, "schedule_storage", of_type=str\n            ),\n        )\n\n\n
[docs]class Scheduler(abc.ABC):\n """Abstract base class for a scheduler. This component is responsible for interfacing with\n an external system such as cron to ensure scheduled repeated execution according.\n """\n\n def start_schedule(\n self, instance: DagsterInstance, external_schedule: ExternalSchedule\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.RUNNING` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n instance (DagsterInstance): The current instance.\n external_schedule (ExternalSchedule): The schedule to start\n\n """\n check.inst_param(instance, "instance", DagsterInstance)\n check.inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(\n external_schedule.get_external_origin_id(), external_schedule.selector_id\n )\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n if computed_state.is_running:\n return computed_state\n\n new_instigator_data = ScheduleInstigatorData(\n external_schedule.cron_schedule,\n get_current_datetime_in_utc().timestamp(),\n )\n\n if not stored_state:\n started_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.RUNNING,\n new_instigator_data,\n )\n instance.add_instigator_state(started_state)\n else:\n started_state = stored_state.with_status(InstigatorStatus.RUNNING).with_data(\n new_instigator_data\n )\n instance.update_instigator_state(started_state)\n return started_state\n\n def stop_schedule(\n self,\n instance: DagsterInstance,\n schedule_origin_id: str,\n schedule_selector_id: str,\n external_schedule: Optional[ExternalSchedule],\n ) -> InstigatorState:\n """Updates the status of the given schedule to `InstigatorStatus.STOPPED` in schedule storage,.\n\n This should not be overridden by subclasses.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to stop running.\n """\n 
check.str_param(schedule_origin_id, "schedule_origin_id")\n check.opt_inst_param(external_schedule, "external_schedule", ExternalSchedule)\n\n stored_state = instance.get_instigator_state(schedule_origin_id, schedule_selector_id)\n\n if not external_schedule:\n computed_state = stored_state\n else:\n computed_state = external_schedule.get_current_instigator_state(stored_state)\n\n if computed_state and not computed_state.is_running:\n return computed_state\n\n if not stored_state:\n assert external_schedule\n stopped_state = InstigatorState(\n external_schedule.get_external_origin(),\n InstigatorType.SCHEDULE,\n InstigatorStatus.STOPPED,\n ScheduleInstigatorData(\n external_schedule.cron_schedule,\n ),\n )\n instance.add_instigator_state(stopped_state)\n else:\n stopped_state = stored_state.with_status(InstigatorStatus.STOPPED).with_data(\n ScheduleInstigatorData(\n cron_schedule=computed_state.instigator_data.cron_schedule, # type: ignore\n )\n )\n instance.update_instigator_state(stopped_state)\n\n return stopped_state\n\n @abc.abstractmethod\n def debug_info(self) -> str:\n """Returns debug information about the scheduler."""\n\n @abc.abstractmethod\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n """Get path to store logs for schedule.\n\n Args:\n schedule_origin_id (string): The id of the schedule target to retrieve the log path for\n """
\n\n\nDEFAULT_MAX_CATCHUP_RUNS = 5\n\n\n
[docs]class DagsterDaemonScheduler(Scheduler, ConfigurableClass):\n """Default scheduler implementation that submits runs from the `dagster-daemon`\n long-lived process. Periodically checks each running schedule for execution times that don't\n have runs yet and launches them.\n """\n\n def __init__(\n self,\n max_catchup_runs: int = DEFAULT_MAX_CATCHUP_RUNS,\n max_tick_retries: int = 0,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self.max_catchup_runs = check.opt_int_param(\n max_catchup_runs, "max_catchup_runs", DEFAULT_MAX_CATCHUP_RUNS\n )\n self.max_tick_retries = check.opt_int_param(max_tick_retries, "max_tick_retries", 0)\n self._inst_data = inst_data\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "max_catchup_runs": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_MAX_CATCHUP_RUNS,\n description="""For partitioned schedules, controls the maximum number of past\n partitions for each schedule that will be considered when looking for missing\n runs . Generally this parameter will only come into play if the scheduler\n falls behind or launches after experiencing downtime. 
This parameter will not be checked for\n schedules without partition sets (for example, schedules created using the @schedule\n decorator) - only the most recent execution time will be considered for those schedules.\n\n Note that no matter what this value is, the scheduler will never launch a run from a time\n before the schedule was turned on (even if the start_date on the schedule is earlier) - if\n you want to launch runs for earlier partitions, launch a backfill.\n """,\n ),\n "max_tick_retries": Field(\n IntSource,\n default_value=0,\n is_required=False,\n description=(\n "For each schedule tick that raises an error, how many times to retry that tick"\n ),\n ),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DagsterDaemonScheduler(inst_data=inst_data, **config_value)\n\n def debug_info(self) -> str:\n return ""\n\n def wipe(self, instance: DagsterInstance) -> None:\n pass\n\n def _get_or_create_logs_directory(\n self, instance: DagsterInstance, schedule_origin_id: str\n ) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)\n if not os.path.isdir(logs_directory):\n mkdir_p(logs_directory)\n\n return logs_directory\n\n def get_logs_path(self, instance: DagsterInstance, schedule_origin_id: str) -> str:\n check.inst_param(instance, "instance", DagsterInstance)\n check.str_param(schedule_origin_id, "schedule_origin_id")\n\n logs_directory = self._get_or_create_logs_directory(instance, schedule_origin_id)\n return os.path.join(logs_directory, "scheduler.log")
\n
", "current_page_name": "_modules/dagster/_core/scheduler/scheduler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.scheduler.scheduler"}}, "storage": {"asset_value_loader": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.asset_value_loader

\nfrom contextlib import ExitStack\nfrom typing import Any, Dict, Mapping, Optional, Type, cast\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._core.definitions.assets import AssetsDefinition\nfrom dagster._core.definitions.events import AssetKey, CoercibleToAssetKey\nfrom dagster._core.definitions.job_definition import (\n    default_job_io_manager_with_fs_io_manager_schema,\n)\nfrom dagster._core.definitions.partition_key_range import PartitionKeyRange\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.definitions.source_asset import SourceAsset\nfrom dagster._core.definitions.utils import DEFAULT_IO_MANAGER_KEY\nfrom dagster._core.execution.build_resources import build_resources, get_mapped_resource_config\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.output import build_output_context\nfrom dagster._core.execution.resources_init import get_transitive_required_resource_keys\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.config import is_dagster_home_set\nfrom dagster._core.types.dagster_type import resolve_dagster_type\nfrom dagster._utils.merger import merge_dicts\n\nfrom .io_manager import IOManager\n\n\n
[docs]class AssetValueLoader:\n """Caches resource definitions that are used to load asset values across multiple load\n invocations.\n\n Should not be instantiated directly. Instead, use\n :py:meth:`~dagster.RepositoryDefinition.get_asset_value_loader`.\n """\n\n def __init__(\n self,\n assets_defs_by_key: Mapping[AssetKey, AssetsDefinition],\n source_assets_by_key: Mapping[AssetKey, SourceAsset],\n instance: Optional[DagsterInstance] = None,\n ):\n self._assets_defs_by_key = assets_defs_by_key\n self._source_assets_by_key = source_assets_by_key\n self._resource_instance_cache: Dict[str, object] = {}\n self._exit_stack: ExitStack = ExitStack().__enter__()\n if not instance and is_dagster_home_set():\n self._instance = self._exit_stack.enter_context(DagsterInstance.get())\n else:\n self._instance = instance\n\n def _ensure_resource_instances_in_cache(\n self,\n resource_defs: Mapping[str, ResourceDefinition],\n resource_config: Optional[Mapping[str, Any]] = None,\n ):\n for built_resource_key, built_resource in (\n self._exit_stack.enter_context(\n build_resources(\n resources={\n resource_key: self._resource_instance_cache.get(resource_key, resource_def)\n for resource_key, resource_def in resource_defs.items()\n },\n instance=self._instance,\n resource_config=resource_config,\n )\n )\n ._asdict()\n .items()\n ):\n self._resource_instance_cache[built_resource_key] = built_resource\n\n
[docs] @public\n def load_asset_value(\n self,\n asset_key: CoercibleToAssetKey,\n *,\n python_type: Optional[Type[object]] = None,\n partition_key: Optional[str] = None,\n metadata: Optional[Dict[str, Any]] = None,\n resource_config: Optional[Mapping[str, Any]] = None,\n ) -> object:\n """Loads the contents of an asset as a Python object.\n\n Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\n\n Args:\n asset_key (Union[AssetKey, Sequence[str], str]): The key of the asset to load.\n python_type (Optional[Type]): The python type to load the asset as. This is what will\n be returned inside `load_input` by `context.dagster_type.typing_type`.\n partition_key (Optional[str]): The partition of the asset to load.\n metadata (Optional[Dict[str, Any]]): Input metadata to pass to the :py:class:`IOManager`\n (is equivalent to setting the metadata argument in `In` or `AssetIn`).\n resource_config (Optional[Any]): A dictionary of resource configurations to be passed\n to the :py:class:`IOManager`.\n\n Returns:\n The contents of an asset as a Python object.\n """\n asset_key = AssetKey.from_coercible(asset_key)\n resource_config = resource_config or {}\n output_metadata = {}\n\n if asset_key in self._assets_defs_by_key:\n assets_def = self._assets_defs_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: default_job_io_manager_with_fs_io_manager_schema},\n assets_def.resource_defs,\n )\n io_manager_key = assets_def.get_io_manager_key_for_asset_key(asset_key)\n io_manager_def = resource_defs[io_manager_key]\n name = assets_def.get_output_name_for_asset_key(asset_key)\n output_metadata = assets_def.metadata_by_key[asset_key]\n op_def = assets_def.get_op_def_for_asset_key(asset_key)\n asset_partitions_def = assets_def.partitions_def\n elif asset_key in self._source_assets_by_key:\n source_asset = self._source_assets_by_key[asset_key]\n\n resource_defs = merge_dicts(\n {DEFAULT_IO_MANAGER_KEY: 
default_job_io_manager_with_fs_io_manager_schema},\n source_asset.resource_defs,\n )\n io_manager_key = source_asset.get_io_manager_key()\n io_manager_def = resource_defs[io_manager_key]\n name = asset_key.path[-1]\n output_metadata = source_asset.raw_metadata\n op_def = None\n asset_partitions_def = source_asset.partitions_def\n else:\n check.failed(f"Asset key {asset_key} not found")\n\n required_resource_keys = get_transitive_required_resource_keys(\n io_manager_def.required_resource_keys, resource_defs\n ) | {io_manager_key}\n\n self._ensure_resource_instances_in_cache(\n {k: v for k, v in resource_defs.items() if k in required_resource_keys},\n resource_config=resource_config,\n )\n io_manager = cast(IOManager, self._resource_instance_cache[io_manager_key])\n\n io_config = resource_config.get(io_manager_key)\n io_resource_config = {io_manager_key: io_config} if io_config else {}\n\n io_manager_config = get_mapped_resource_config(\n {io_manager_key: io_manager_def}, io_resource_config\n )\n\n input_context = build_input_context(\n name=None,\n asset_key=asset_key,\n dagster_type=resolve_dagster_type(python_type),\n upstream_output=build_output_context(\n name=name,\n metadata=output_metadata,\n asset_key=asset_key,\n op_def=op_def,\n resource_config=resource_config,\n ),\n resources=self._resource_instance_cache,\n resource_config=io_manager_config[io_manager_key].config,\n partition_key=partition_key,\n asset_partition_key_range=(\n PartitionKeyRange(partition_key, partition_key)\n if partition_key is not None\n else None\n ),\n asset_partitions_def=asset_partitions_def,\n instance=self._instance,\n metadata=metadata,\n )\n\n return io_manager.load_input(input_context)
\n\n def __enter__(self):\n return self\n\n def __exit__(self, *exc):\n self._exit_stack.close()
\n
", "current_page_name": "_modules/dagster/_core/storage/asset_value_loader", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.asset_value_loader"}, "base_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.base_storage

\nfrom abc import ABC, abstractmethod\n\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\n\nfrom .event_log.base import EventLogStorage\nfrom .runs.base import RunStorage\nfrom .schedules.base import ScheduleStorage\n\n\n
[docs]class DagsterStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for Dagster persistent storage, for reading and writing data for runs,\n events, and schedule/sensor state.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-daemon`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @property\n @abstractmethod\n def event_log_storage(self) -> EventLogStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def run_storage(self) -> RunStorage[T_DagsterInstance]:\n raise NotImplementedError()\n\n @property\n @abstractmethod\n def schedule_storage(self) -> ScheduleStorage[T_DagsterInstance]:\n raise NotImplementedError()
\n
", "current_page_name": "_modules/dagster/_core/storage/base_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.base_storage"}, "captured_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.captured_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import IO, Callable, Generator, Iterator, NamedTuple, Optional, Sequence\n\nfrom typing_extensions import Final, Self\n\nimport dagster._check as check\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\n\nMAX_BYTES_CHUNK_READ: Final = 4194304  # 4 MB\n\n\nclass CapturedLogContext(\n    NamedTuple(\n        "_CapturedLogContext",\n        [\n            ("log_key", Sequence[str]),\n            ("external_url", Optional[str]),\n            ("external_stdout_url", Optional[str]),\n            ("external_stderr_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing the context in which logs are captured.  Can be used by external logging\n    sidecar implementations to point the Dagster UI to an external url to view compute logs instead of a\n    Dagster-managed location.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        external_stdout_url: Optional[str] = None,\n        external_stderr_url: Optional[str] = None,\n        external_url: Optional[str] = None,\n    ):\n        if external_url and (external_stdout_url or external_stderr_url):\n            check.failed(\n                "Cannot specify both `external_url` and one of"\n                " `external_stdout_url`/`external_stderr_url`"\n            )\n\n        return super(CapturedLogContext, cls).__new__(\n            cls,\n            log_key,\n            external_stdout_url=external_stdout_url,\n            external_stderr_url=external_stderr_url,\n            external_url=external_url,\n        )\n\n\nclass CapturedLogData(\n    NamedTuple(\n        "_CapturedLogData",\n        [\n            ("log_key", Sequence[str]),\n            ("stdout", Optional[bytes]),\n            ("stderr", Optional[bytes]),\n            ("cursor", Optional[str]),\n        ],\n    )\n):\n    """Object representing captured log data, either a partial chunk of the log data 
or the full\n    capture.  Contains the raw bytes and optionally the cursor offset for the partial chunk.\n    """\n\n    def __new__(\n        cls,\n        log_key: Sequence[str],\n        stdout: Optional[bytes] = None,\n        stderr: Optional[bytes] = None,\n        cursor: Optional[str] = None,\n    ):\n        return super(CapturedLogData, cls).__new__(cls, log_key, stdout, stderr, cursor)\n\n\nclass CapturedLogMetadata(\n    NamedTuple(\n        "_CapturedLogMetadata",\n        [\n            ("stdout_location", Optional[str]),\n            ("stderr_location", Optional[str]),\n            ("stdout_download_url", Optional[str]),\n            ("stderr_download_url", Optional[str]),\n        ],\n    )\n):\n    """Object representing metadata info for the captured log data, containing a display string for\n    the location of the log data and a URL for direct download of the captured log data.\n    """\n\n    def __new__(\n        cls,\n        stdout_location: Optional[str] = None,\n        stderr_location: Optional[str] = None,\n        stdout_download_url: Optional[str] = None,\n        stderr_download_url: Optional[str] = None,\n    ):\n        return super(CapturedLogMetadata, cls).__new__(\n            cls,\n            stdout_location=stdout_location,\n            stderr_location=stderr_location,\n            stdout_download_url=stdout_download_url,\n            stderr_download_url=stderr_download_url,\n        )\n\n\nclass CapturedLogSubscription:\n    def __init__(\n        self, manager: "CapturedLogManager", log_key: Sequence[str], cursor: Optional[str]\n    ):\n        self._manager = manager\n        self._log_key = log_key\n        self._cursor = cursor\n        self._observer: Optional[Callable[[CapturedLogData], None]] = None\n        self.is_complete = False\n\n    def __call__(self, observer: Optional[Callable[[CapturedLogData], None]]) -> Self:\n        self._observer = observer\n        self.fetch()\n        if 
self._manager.is_capture_complete(self._log_key):\n            self.complete()\n        return self\n\n    @property\n    def log_key(self) -> Sequence[str]:\n        return self._log_key\n\n    def dispose(self) -> None:\n        self._observer = None\n        self._manager.unsubscribe(self)\n\n    def fetch(self) -> None:\n        if not self._observer:\n            return\n\n        should_fetch = True\n        while should_fetch:\n            log_data = self._manager.get_log_data(\n                self._log_key,\n                self._cursor,\n                max_bytes=MAX_BYTES_CHUNK_READ,\n            )\n            if not self._cursor or log_data.cursor != self._cursor:\n                self._observer(log_data)\n                self._cursor = log_data.cursor\n            should_fetch = _has_max_data(log_data.stdout) or _has_max_data(log_data.stderr)\n\n    def complete(self) -> None:\n        self.is_complete = True\n\n\ndef _has_max_data(chunk: Optional[bytes]) -> bool:\n    # function is used as predicate but does not actually return a boolean\n    return chunk and len(chunk) >= MAX_BYTES_CHUNK_READ  # type: ignore\n\n\n
[docs]class CapturedLogManager(ABC):\n """Abstract base class for capturing the unstructured logs (stdout/stderr) in the current\n process, stored / retrieved with a provided log_key.\n """\n\n @abstractmethod\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n """Context manager for capturing the stdout/stderr within the current process, and persisting\n it under the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO[bytes]]]:\n """Context manager for providing an IO stream that enables the caller to write to a log stream\n managed by the captured log manager, to be read later using the given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n """\n\n @abstractmethod\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n """Flag indicating when the log capture for a given log key has completed.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n """Returns a chunk of the captured stdout logs for a given log key.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[str]): A cursor representing the position of the log chunk to fetch\n max_bytes (Optional[int]): A limit on the size of the log chunk to fetch\n\n Returns:\n CapturedLogData\n """\n\n @abstractmethod\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n """Returns the metadata of the captured logs for a given log key, including\n displayable information on where the logs are persisted.\n\n Args:\n log_key (List[String]): 
The log key identifying the captured logs\n\n Returns:\n CapturedLogMetadata\n """\n\n @abstractmethod\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ) -> None:\n """Deletes the captured logs for a given log key.\n\n Args:\n log_key(Optional[List[String]]): The log key of the logs to delete\n prefix(Optional[List[String]]): The prefix of the log keys to delete\n """\n\n @abstractmethod\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n """Registers an observable object for log data.\n\n Args:\n log_key (List[String]): The log key identifying the captured logs\n cursor (Optional[String]): The string cursor marking the position within the log stream\n Returns:\n ComputeLogSubscription\n """\n\n @abstractmethod\n def unsubscribe(self, subscription: CapturedLogSubscription) -> None:\n """Deregisters an observable object from receiving log updates.\n\n Args:\n subscription (CapturedLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def build_log_key_for_run(self, run_id: str, step_key: str) -> Sequence[str]:\n """Legacy adapter to translate run_id/key to captured log manager-based log_key."""\n return [run_id, "compute_logs", step_key]
\n
", "current_page_name": "_modules/dagster/_core/storage/captured_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.captured_log_manager"}, "compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.compute_log_manager

\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom enum import Enum\nfrom typing import Callable, Iterator, NamedTuple, Optional\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.dagster_run import DagsterRun\n\nMAX_BYTES_FILE_READ = 33554432  # 32 MB\nMAX_BYTES_CHUNK_READ = 4194304  # 4 MB\n\n\nclass ComputeIOType(Enum):\n    STDOUT = "stdout"\n    STDERR = "stderr"\n\n\nclass ComputeLogFileData(\n    NamedTuple(\n        "ComputeLogFileData",\n        [\n            ("path", str),\n            ("data", Optional[str]),\n            ("cursor", int),\n            ("size", int),\n            ("download_url", Optional[str]),\n        ],\n    )\n):\n    """Representation of a chunk of compute execution log data."""\n\n    def __new__(\n        cls, path: str, data: Optional[str], cursor: int, size: int, download_url: Optional[str]\n    ):\n        return super(ComputeLogFileData, cls).__new__(\n            cls,\n            path=check.str_param(path, "path"),\n            data=check.opt_str_param(data, "data"),\n            cursor=check.int_param(cursor, "cursor"),\n            size=check.int_param(size, "size"),\n            download_url=check.opt_str_param(download_url, "download_url"),\n        )\n\n\n
[docs]class ComputeLogManager(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\n steps of pipeline solids.\n """\n\n @contextmanager\n def watch(self, dagster_run: DagsterRun, step_key: Optional[str] = None) -> Iterator[None]:\n """Watch the stdout/stderr for a given execution for a given run_id / step_key and persist it.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n if not self.enabled(dagster_run, step_key):\n yield\n return\n\n self.on_watch_start(dagster_run, step_key)\n with self._watch_logs(dagster_run, step_key):\n yield\n self.on_watch_finish(dagster_run, step_key)\n\n @contextmanager\n @abstractmethod\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n """Method to watch the stdout/stderr logs for a given run_id / step_key. Kept separate from\n blessed `watch` method, which triggers all the start/finish hooks that are necessary to\n implement the different remote implementations.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get the local path of the logfile for a given execution step. This determines the\n location on the local filesystem to which stdout/stderr will be rerouted.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either ComputeIOType.STDOUT or\n ComputeIOType.STDERR\n\n Returns:\n str\n """\n ...\n\n @abstractmethod\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n """Flag indicating when computation for a given execution step has completed.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n\n Returns:\n Boolean\n """\n\n @abstractmethod\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when starting to watch compute logs.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str]) -> None:\n """Hook called when computation for a given execution step is finished.\n\n Args:\n dagster_run (DagsterRun): The run config\n step_key (Optional[String]): The step_key for a compute step\n """\n\n @abstractmethod\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Get a URL where the logs can be downloaded.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. `solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n\n Returns:\n String\n """\n\n @abstractmethod\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n """Get compute log data for a given compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n max_bytes (Optional[Int]): Maximum number of bytes to be read and returned\n\n Returns:\n ComputeLogFileData\n """\n\n def enabled(self, _dagster_run: DagsterRun, _step_key: Optional[str]) -> bool:\n """Hook for disabling compute log capture.\n\n Args:\n _step_key (Optional[String]): The step_key for a compute step\n\n Returns:\n Boolean\n """\n return True\n\n @abstractmethod\n def on_subscribe(self, subscription: "ComputeLogSubscription") -> None:\n """Hook for managing streaming subscriptions for log data from `dagster-webserver`.\n\n Args:\n subscription (ComputeLogSubscription): subscription object which manages when to send\n back data to the subscriber\n """\n\n def on_unsubscribe(self, subscription: "ComputeLogSubscription") -> None:\n pass\n\n def observable(\n self, run_id: str, key: str, io_type: ComputeIOType, cursor: Optional[str] = None\n ) -> "ComputeLogSubscription":\n """Return a ComputeLogSubscription which streams back log data from the execution logs for a given\n compute step.\n\n Args:\n run_id (str): The id of the pipeline run.\n key (str): The unique descriptor of the execution step (e.g. 
`solid_invocation.compute`)\n io_type (ComputeIOType): Flag indicating the I/O type, either stdout or stderr\n cursor (Optional[Int]): Starting cursor (byte) of log file\n\n Returns:\n Observable\n """\n check.str_param(run_id, "run_id")\n check.str_param(key, "key")\n check.inst_param(io_type, "io_type", ComputeIOType)\n check.opt_str_param(cursor, "cursor")\n\n if cursor:\n cursor = int(cursor) # type: ignore # (var reassigned diff type)\n else:\n cursor = 0 # type: ignore # (var reassigned diff type)\n\n subscription = ComputeLogSubscription(self, run_id, key, io_type, cursor) # type: ignore # (var reassigned diff type)\n self.on_subscribe(subscription)\n return subscription\n\n def dispose(self):\n pass
\n\n\nclass ComputeLogSubscription:\n """Observable object that generates ComputeLogFileData objects as compute step execution logs\n are written.\n """\n\n def __init__(\n self,\n manager: ComputeLogManager,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int,\n ):\n self.manager = manager\n self.run_id = run_id\n self.key = key\n self.io_type = io_type\n self.cursor = cursor\n self.observer: Optional[Callable[[ComputeLogFileData], None]] = None\n self.is_complete = False\n\n def __call__(self, observer: Callable[[ComputeLogFileData], None]) -> Self:\n self.observer = observer\n self.fetch()\n if self.manager.is_watch_completed(self.run_id, self.key):\n self.complete()\n return self\n\n def dispose(self) -> None:\n # called when the connection gets closed, allowing the observer to get GC'ed\n self.observer = None\n self.manager.on_unsubscribe(self)\n\n def fetch(self) -> None:\n if not self.observer:\n return\n\n should_fetch = True\n while should_fetch:\n update = self.manager.read_logs_file(\n self.run_id,\n self.key,\n self.io_type,\n self.cursor,\n max_bytes=MAX_BYTES_CHUNK_READ,\n )\n if not self.cursor or update.cursor != self.cursor:\n self.observer(update)\n self.cursor = update.cursor\n should_fetch = update.data and len(update.data.encode("utf-8")) >= MAX_BYTES_CHUNK_READ\n\n def complete(self) -> None:\n self.is_complete = True\n if not self.observer:\n return\n
", "current_page_name": "_modules/dagster/_core/storage/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.compute_log_manager"}, "dagster_run": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.dagster_run

\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Union,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._annotations import PublicAttr, public\nfrom dagster._core.definitions.asset_check_spec import AssetCheckKey\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.tags import PARENT_RUN_ID_TAG, ROOT_RUN_ID_TAG\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._serdes.serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nfrom .tags import (\n    BACKFILL_ID_TAG,\n    REPOSITORY_LABEL_TAG,\n    RESUME_RETRY_TAG,\n    SCHEDULE_NAME_TAG,\n    SENSOR_NAME_TAG,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.external import ExternalSchedule, ExternalSensor\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\n
[docs]@whitelist_for_serdes(storage_name="PipelineRunStatus")\nclass DagsterRunStatus(Enum):\n """The status of run execution."""\n\n # Runs waiting to be launched by the Dagster Daemon.\n QUEUED = "QUEUED"\n\n # Runs that have been created, but not yet launched or enqueued.\n NOT_STARTED = "NOT_STARTED"\n\n # Runs that are managed outside of the Dagster control plane.\n MANAGED = "MANAGED"\n\n # Runs that have been launched, but execution has not yet started.\n STARTING = "STARTING"\n\n # Runs that have been launched and execution has started.\n STARTED = "STARTED"\n\n # Runs that have successfully completed.\n SUCCESS = "SUCCESS"\n\n # Runs that have failed to complete.\n FAILURE = "FAILURE"\n\n # Runs that are in-progress and pending to be canceled.\n CANCELING = "CANCELING"\n\n # Runs that have been canceled before completion.\n CANCELED = "CANCELED"
\n\n\n# These statuses indicate that a run may be using compute resources\nIN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.STARTING,\n DagsterRunStatus.STARTED,\n DagsterRunStatus.CANCELING,\n]\n\n# This serves as an explicit list of run statuses that indicate that the run is not using compute\n# resources. This and the enum above should cover all run statuses.\nNON_IN_PROGRESS_RUN_STATUSES = [\n DagsterRunStatus.QUEUED,\n DagsterRunStatus.NOT_STARTED,\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.MANAGED,\n DagsterRunStatus.CANCELED,\n]\n\nFINISHED_STATUSES = [\n DagsterRunStatus.SUCCESS,\n DagsterRunStatus.FAILURE,\n DagsterRunStatus.CANCELED,\n]\n\n# Run statuses for runs that can be safely canceled.\n# Does not include the other unfinished statuses for the following reasons:\n# STARTING: Control has been ceded to the run worker, which will eventually move the run to STARTED.\n# NOT_STARTED: Mostly replaced with STARTING. Runs are only here in the brief window between\n# creating the run and launching or enqueueing it.\nCANCELABLE_RUN_STATUSES = [DagsterRunStatus.STARTED, DagsterRunStatus.QUEUED]\n\n\n@whitelist_for_serdes(storage_name="PipelineRunStatsSnapshot")\nclass DagsterRunStatsSnapshot(\n NamedTuple(\n "_DagsterRunStatsSnapshot",\n [\n ("run_id", str),\n ("steps_succeeded", int),\n ("steps_failed", int),\n ("materializations", int),\n ("expectations", int),\n ("enqueued_time", Optional[float]),\n ("launch_time", Optional[float]),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n steps_succeeded: int,\n steps_failed: int,\n materializations: int,\n expectations: int,\n enqueued_time: Optional[float],\n launch_time: Optional[float],\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(DagsterRunStatsSnapshot, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n steps_succeeded=check.int_param(steps_succeeded, 
"steps_succeeded"),\n steps_failed=check.int_param(steps_failed, "steps_failed"),\n materializations=check.int_param(materializations, "materializations"),\n expectations=check.int_param(expectations, "expectations"),\n enqueued_time=check.opt_float_param(enqueued_time, "enqueued_time"),\n launch_time=check.opt_float_param(launch_time, "launch_time"),\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )\n\n\nclass DagsterRunSerializer(NamedTupleSerializer["DagsterRun"]):\n # serdes log\n # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve\n # * added pipeline_snapshot_id\n # * renamed previous_run_id -> parent_run_id, added root_run_id\n # * added execution_plan_snapshot_id\n # * removed selector\n # * added solid_subset\n # * renamed solid_subset -> solid_selection, added solids_to_execute\n # * renamed environment_dict -> run_config\n # * added asset_selection\n # * added has_repository_load_data\n def before_unpack(self, context, unpacked_dict: Dict[str, Any]) -> Dict[str, Any]:\n # back compat for environment dict => run_config\n if "environment_dict" in unpacked_dict:\n check.invariant(\n unpacked_dict.get("run_config") is None,\n "Cannot set both run_config and environment_dict. 
Use run_config parameter.",\n )\n unpacked_dict["run_config"] = unpacked_dict["environment_dict"]\n del unpacked_dict["environment_dict"]\n\n # back compat for previous_run_id => parent_run_id, root_run_id\n if "previous_run_id" in unpacked_dict and not (\n "parent_run_id" in unpacked_dict and "root_run_id" in unpacked_dict\n ):\n unpacked_dict["parent_run_id"] = unpacked_dict["previous_run_id"]\n unpacked_dict["root_run_id"] = unpacked_dict["previous_run_id"]\n del unpacked_dict["previous_run_id"]\n\n # back compat for selector => pipeline_name, solids_to_execute\n if "selector" in unpacked_dict:\n selector = unpacked_dict["selector"]\n\n if not isinstance(selector, ExecutionSelector):\n check.failed(f"unexpected entry for 'select', {selector}")\n selector_name = selector.name\n selector_subset = selector.solid_subset\n\n job_name = unpacked_dict.get("pipeline_name")\n check.invariant(\n job_name is None or selector_name == job_name,\n f"Conflicting pipeline name {job_name} in arguments to PipelineRun: "\n f"selector was passed with pipeline {selector_name}",\n )\n if job_name is None:\n unpacked_dict["pipeline_name"] = selector_name\n\n solids_to_execute = unpacked_dict.get("solids_to_execute")\n check.invariant(\n solids_to_execute is None\n or (selector_subset and set(selector_subset) == solids_to_execute),\n f"Conflicting solids_to_execute {solids_to_execute} in arguments to"\n f" PipelineRun: selector was passed with subset {selector_subset}",\n )\n # for old runs that only have selector but no solids_to_execute\n if solids_to_execute is None:\n solids_to_execute = frozenset(selector_subset) if selector_subset else None\n\n # back compat for solid_subset => solids_to_execute\n if "solid_subset" in unpacked_dict:\n unpacked_dict["solids_to_execute"] = unpacked_dict["solid_subset"]\n del unpacked_dict["solid_subset"]\n\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=DagsterRunSerializer,\n # DagsterRun is serialized as PipelineRun so that it can be read by older (pre 0.13.x) version\n # of Dagster, but is read back in as a DagsterRun.\n storage_name="PipelineRun",\n old_fields={"mode": None},\n storage_field_names={\n "job_name": "pipeline_name",\n "job_snapshot_id": "pipeline_snapshot_id",\n "external_job_origin": "external_pipeline_origin",\n "job_code_origin": "pipeline_code_origin",\n "op_selection": "solid_selection",\n "resolved_op_selection": "solids_to_execute",\n },\n)\nclass DagsterRun(\n NamedTuple(\n "_DagsterRun",\n [\n ("job_name", PublicAttr[str]),\n ("run_id", str),\n ("run_config", Mapping[str, object]),\n ("asset_selection", Optional[AbstractSet[AssetKey]]),\n ("asset_check_selection", Optional[AbstractSet[AssetCheckKey]]),\n ("op_selection", Optional[Sequence[str]]),\n ("resolved_op_selection", Optional[AbstractSet[str]]),\n ("step_keys_to_execute", Optional[Sequence[str]]),\n ("status", DagsterRunStatus),\n ("tags", Mapping[str, str]),\n ("root_run_id", Optional[str]),\n ("parent_run_id", Optional[str]),\n ("job_snapshot_id", Optional[str]),\n ("execution_plan_snapshot_id", Optional[str]),\n ("external_job_origin", Optional["ExternalJobOrigin"]),\n ("job_code_origin", Optional[JobPythonOrigin]),\n ("has_repository_load_data", bool),\n ],\n )\n):\n """Serializable internal representation of a dagster run, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n """\n\n def __new__(\n cls,\n job_name: str,\n run_id: Optional[str] = None,\n run_config: Optional[Mapping[str, object]] = None,\n asset_selection: Optional[AbstractSet[AssetKey]] = None,\n asset_check_selection: Optional[AbstractSet[AssetCheckKey]] = None,\n op_selection: Optional[Sequence[str]] = None,\n resolved_op_selection: Optional[AbstractSet[str]] = None,\n step_keys_to_execute: Optional[Sequence[str]] = None,\n status: Optional[DagsterRunStatus] = None,\n tags: Optional[Mapping[str, 
str]] = None,\n root_run_id: Optional[str] = None,\n parent_run_id: Optional[str] = None,\n job_snapshot_id: Optional[str] = None,\n execution_plan_snapshot_id: Optional[str] = None,\n external_job_origin: Optional["ExternalJobOrigin"] = None,\n job_code_origin: Optional[JobPythonOrigin] = None,\n has_repository_load_data: Optional[bool] = None,\n ):\n check.invariant(\n (root_run_id is not None and parent_run_id is not None)\n or (root_run_id is None and parent_run_id is None),\n "Must set both root_run_id and parent_run_id when creating a PipelineRun that "\n "belongs to a run group",\n )\n # a set which contains the names of the ops to execute\n resolved_op_selection = check.opt_nullable_set_param(\n resolved_op_selection, "resolved_op_selection", of_type=str\n )\n # a list of op queries provided by the user\n # possible to be None when resolved_op_selection is set by the user directly\n op_selection = check.opt_nullable_sequence_param(op_selection, "op_selection", of_type=str)\n check.opt_nullable_sequence_param(step_keys_to_execute, "step_keys_to_execute", of_type=str)\n\n asset_selection = check.opt_nullable_set_param(\n asset_selection, "asset_selection", of_type=AssetKey\n )\n asset_check_selection = check.opt_nullable_set_param(\n asset_check_selection, "asset_check_selection", of_type=AssetCheckKey\n )\n\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n if status == DagsterRunStatus.QUEUED:\n check.inst_param(\n external_job_origin,\n "external_job_origin",\n ExternalJobOrigin,\n "external_job_origin is required for queued runs",\n )\n\n if run_id is None:\n run_id = make_new_run_id()\n\n return super(DagsterRun, cls).__new__(\n cls,\n job_name=check.str_param(job_name, "job_name"),\n run_id=check.str_param(run_id, "run_id"),\n run_config=check.opt_mapping_param(run_config, "run_config", key_type=str),\n 
op_selection=op_selection,\n asset_selection=asset_selection,\n asset_check_selection=asset_check_selection,\n resolved_op_selection=resolved_op_selection,\n step_keys_to_execute=step_keys_to_execute,\n status=check.opt_inst_param(\n status, "status", DagsterRunStatus, DagsterRunStatus.NOT_STARTED\n ),\n tags=check.opt_mapping_param(tags, "tags", key_type=str, value_type=str),\n root_run_id=check.opt_str_param(root_run_id, "root_run_id"),\n parent_run_id=check.opt_str_param(parent_run_id, "parent_run_id"),\n job_snapshot_id=check.opt_str_param(job_snapshot_id, "job_snapshot_id"),\n execution_plan_snapshot_id=check.opt_str_param(\n execution_plan_snapshot_id, "execution_plan_snapshot_id"\n ),\n external_job_origin=check.opt_inst_param(\n external_job_origin, "external_job_origin", ExternalJobOrigin\n ),\n job_code_origin=check.opt_inst_param(\n job_code_origin, "job_code_origin", JobPythonOrigin\n ),\n has_repository_load_data=check.opt_bool_param(\n has_repository_load_data, "has_repository_load_data", default=False\n ),\n )\n\n def with_status(self, status: DagsterRunStatus) -> Self:\n if status == DagsterRunStatus.QUEUED:\n # Placing this with the other imports causes a cyclic import\n # https://github.com/dagster-io/dagster/issues/3181\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst(\n self.external_job_origin,\n ExternalJobOrigin,\n "external_pipeline_origin is required for queued runs",\n )\n\n return self._replace(status=status)\n\n def with_job_origin(self, origin: "ExternalJobOrigin") -> Self:\n from dagster._core.host_representation.origin import ExternalJobOrigin\n\n check.inst_param(origin, "origin", ExternalJobOrigin)\n return self._replace(external_job_origin=origin)\n\n def with_tags(self, tags: Mapping[str, str]) -> Self:\n return self._replace(tags=tags)\n\n def get_root_run_id(self) -> Optional[str]:\n return self.tags.get(ROOT_RUN_ID_TAG)\n\n def get_parent_run_id(self) -> Optional[str]:\n return 
self.tags.get(PARENT_RUN_ID_TAG)\n\n def tags_for_storage(self) -> Mapping[str, str]:\n repository_tags = {}\n if self.external_job_origin:\n # tag the run with a label containing the repository name / location name, to allow for\n # per-repository filtering of runs from the Dagster UI.\n repository_tags[REPOSITORY_LABEL_TAG] = (\n self.external_job_origin.external_repository_origin.get_label()\n )\n\n if not self.tags:\n return repository_tags\n\n return {**repository_tags, **self.tags}\n\n @public\n @property\n def is_finished(self) -> bool:\n """bool: If this run has completely finished execution."""\n return self.status in FINISHED_STATUSES\n\n @public\n @property\n def is_success(self) -> bool:\n """bool: If this run has successfully finished executing."""\n return self.status == DagsterRunStatus.SUCCESS\n\n @public\n @property\n def is_failure(self) -> bool:\n """bool: If this run has failed."""\n return self.status == DagsterRunStatus.FAILURE\n\n @public\n @property\n def is_failure_or_canceled(self) -> bool:\n """bool: If this run has either failed or was canceled."""\n return self.status == DagsterRunStatus.FAILURE or self.status == DagsterRunStatus.CANCELED\n\n @public\n @property\n def is_resume_retry(self) -> bool:\n """bool: If this run was created from retrying another run from the point of failure."""\n return self.tags.get(RESUME_RETRY_TAG) == "true"\n\n @property\n def previous_run_id(self) -> Optional[str]:\n # Compat\n return self.parent_run_id\n\n @staticmethod\n def tags_for_schedule(schedule) -> Mapping[str, str]:\n return {SCHEDULE_NAME_TAG: schedule.name}\n\n @staticmethod\n def tags_for_sensor(sensor) -> Mapping[str, str]:\n return {SENSOR_NAME_TAG: sensor.name}\n\n @staticmethod\n def tags_for_backfill_id(backfill_id: str) -> Mapping[str, str]:\n return {BACKFILL_ID_TAG: backfill_id}
\n\n\nclass RunsFilterSerializer(NamedTupleSerializer["RunsFilter"]):\n def before_unpack(\n self,\n context,\n unpacked_dict: Dict[str, Any],\n ) -> Dict[str, Any]:\n # We store empty run ids as [] but only accept None\n if "run_ids" in unpacked_dict and unpacked_dict["run_ids"] == []:\n unpacked_dict["run_ids"] = None\n return unpacked_dict\n\n\n
[docs]@whitelist_for_serdes(\n serializer=RunsFilterSerializer,\n old_storage_names={"PipelineRunsFilter"},\n storage_field_names={"job_name": "pipeline_name"},\n)\nclass RunsFilter(\n NamedTuple(\n "_RunsFilter",\n [\n ("run_ids", Sequence[str]),\n ("job_name", Optional[str]),\n ("statuses", Sequence[DagsterRunStatus]),\n ("tags", Mapping[str, Union[str, Sequence[str]]]),\n ("snapshot_id", Optional[str]),\n ("updated_after", Optional[datetime]),\n ("updated_before", Optional[datetime]),\n ("created_after", Optional[datetime]),\n ("created_before", Optional[datetime]),\n ],\n )\n):\n """Defines a filter across job runs, for use when querying storage directly.\n\n Each field of the RunsFilter represents a logical AND with each other. For\n example, if you specify job_name and tags, then you will receive only runs\n with the specified job_name AND the specified tags. If left blank, then\n all values will be permitted for that field.\n\n Args:\n run_ids (Optional[List[str]]): A list of job run_id values.\n job_name (Optional[str]):\n Name of the job to query for. If blank, all job_names will be accepted.\n statuses (Optional[List[DagsterRunStatus]]):\n A list of run statuses to filter by. If blank, all run statuses will be allowed.\n tags (Optional[Dict[str, Union[str, List[str]]]]):\n A dictionary of run tags to query by. All tags specified here must be present for a given run to pass the filter.\n snapshot_id (Optional[str]): The ID of the job snapshot to query for. 
Intended for internal use.\n updated_after (Optional[DateTime]): Filter by runs that were last updated after this datetime.\n created_before (Optional[DateTime]): Filter by runs that were created before this datetime.\n\n """\n\n def __new__(\n cls,\n run_ids: Optional[Sequence[str]] = None,\n job_name: Optional[str] = None,\n statuses: Optional[Sequence[DagsterRunStatus]] = None,\n tags: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,\n snapshot_id: Optional[str] = None,\n updated_after: Optional[datetime] = None,\n updated_before: Optional[datetime] = None,\n created_after: Optional[datetime] = None,\n created_before: Optional[datetime] = None,\n ):\n check.invariant(run_ids != [], "When filtering on run ids, a non-empty list must be used.")\n\n return super(RunsFilter, cls).__new__(\n cls,\n run_ids=check.opt_sequence_param(run_ids, "run_ids", of_type=str),\n job_name=check.opt_str_param(job_name, "job_name"),\n statuses=check.opt_sequence_param(statuses, "statuses", of_type=DagsterRunStatus),\n tags=check.opt_mapping_param(tags, "tags", key_type=str),\n snapshot_id=check.opt_str_param(snapshot_id, "snapshot_id"),\n updated_after=check.opt_inst_param(updated_after, "updated_after", datetime),\n updated_before=check.opt_inst_param(updated_before, "updated_before", datetime),\n created_after=check.opt_inst_param(created_after, "created_after", datetime),\n created_before=check.opt_inst_param(created_before, "created_before", datetime),\n )\n\n @staticmethod\n def for_schedule(schedule: "ExternalSchedule") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_schedule(schedule))\n\n @staticmethod\n def for_sensor(sensor: "ExternalSensor") -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_sensor(sensor))\n\n @staticmethod\n def for_backfill(backfill_id: str) -> "RunsFilter":\n return RunsFilter(tags=DagsterRun.tags_for_backfill_id(backfill_id))
\n\n\nclass JobBucket(NamedTuple):\n job_names: List[str]\n bucket_limit: Optional[int]\n\n\nclass TagBucket(NamedTuple):\n tag_key: str\n tag_values: List[str]\n bucket_limit: Optional[int]\n\n\n
[docs]class RunRecord(\n NamedTuple(\n "_RunRecord",\n [\n ("storage_id", int),\n ("dagster_run", DagsterRun),\n ("create_timestamp", datetime),\n ("update_timestamp", datetime),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n """Internal representation of a run record, as stored in a\n :py:class:`~dagster._core.storage.runs.RunStorage`.\n\n Users should not invoke this class directly.\n """\n\n def __new__(\n cls,\n storage_id: int,\n dagster_run: DagsterRun,\n create_timestamp: datetime,\n update_timestamp: datetime,\n start_time: Optional[float] = None,\n end_time: Optional[float] = None,\n ):\n return super(RunRecord, cls).__new__(\n cls,\n storage_id=check.int_param(storage_id, "storage_id"),\n dagster_run=check.inst_param(dagster_run, "dagster_run", DagsterRun),\n create_timestamp=check.inst_param(create_timestamp, "create_timestamp", datetime),\n update_timestamp=check.inst_param(update_timestamp, "update_timestamp", datetime),\n # start_time and end_time fields will be populated once the run has started and ended, respectively, but will be None beforehand.\n start_time=check.opt_float_param(start_time, "start_time"),\n end_time=check.opt_float_param(end_time, "end_time"),\n )
\n\n\n@whitelist_for_serdes\nclass RunPartitionData(\n NamedTuple(\n "_RunPartitionData",\n [\n ("run_id", str),\n ("partition", str),\n ("status", DagsterRunStatus),\n ("start_time", Optional[float]),\n ("end_time", Optional[float]),\n ],\n )\n):\n def __new__(\n cls,\n run_id: str,\n partition: str,\n status: DagsterRunStatus,\n start_time: Optional[float],\n end_time: Optional[float],\n ):\n return super(RunPartitionData, cls).__new__(\n cls,\n run_id=check.str_param(run_id, "run_id"),\n partition=check.str_param(partition, "partition"),\n status=check.inst_param(status, "status", DagsterRunStatus),\n start_time=check.opt_inst(start_time, float),\n end_time=check.opt_inst(end_time, float),\n )\n\n\n###################################################################################################\n# GRAVEYARD\n#\n# -|-\n# |\n# _-'~~~~~`-_\n# .' '.\n# | R I P |\n# | |\n# | Execution |\n# | Selector |\n# | |\n# | |\n###################################################################################################\n\n\n@whitelist_for_serdes\nclass ExecutionSelector(\n NamedTuple("_ExecutionSelector", [("name", str), ("solid_subset", Optional[Sequence[str]])])\n):\n """Kept here to maintain loading of PipelineRuns from when it was still alive."""\n\n def __new__(cls, name: str, solid_subset: Optional[Sequence[str]] = None):\n return super(ExecutionSelector, cls).__new__(\n cls,\n name=check.str_param(name, "name"),\n solid_subset=(\n None\n if solid_subset is None\n else check.sequence_param(solid_subset, "solid_subset", of_type=str)\n ),\n )\n
", "current_page_name": "_modules/dagster/_core/storage/dagster_run", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.dagster_run"}, "event_log": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.base

\nimport base64\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom typing import (\n    TYPE_CHECKING,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n)\n\nimport dagster._check as check\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.event_api import EventHandlerFn, EventLogRecord, EventRecordsFilter\nfrom dagster._core.events import DagsterEventType\nfrom dagster._core.execution.stats import (\n    RunStepKeyStatsSnapshot,\n    build_run_stats_from_events,\n    build_run_step_stats_from_events,\n)\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.storage.asset_check_execution_record import AssetCheckExecutionRecord\nfrom dagster._core.storage.dagster_run import DagsterRunStatsSnapshot\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._seven import json\nfrom dagster._utils import PrintFn\nfrom dagster._utils.concurrency import ConcurrencyClaimStatus, ConcurrencyKeyInfo\n\nif TYPE_CHECKING:\n    from dagster._core.events.log import EventLogEntry\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n\nclass EventLogConnection(NamedTuple):\n    records: Sequence[EventLogRecord]\n    cursor: str\n    has_more: bool\n\n\nclass EventLogCursorType(Enum):\n    OFFSET = "OFFSET"\n    STORAGE_ID = "STORAGE_ID"\n\n\nclass EventLogCursor(NamedTuple):\n    """Representation of an event record cursor, keeping track of the log query state."""\n\n    cursor_type: EventLogCursorType\n    value: int\n\n    def is_offset_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.OFFSET\n\n    def is_id_cursor(self) -> bool:\n        return self.cursor_type == EventLogCursorType.STORAGE_ID\n\n    def offset(self) -> int:\n        check.invariant(self.cursor_type == EventLogCursorType.OFFSET)\n        return max(0, 
int(self.value))\n\n    def storage_id(self) -> int:\n        check.invariant(self.cursor_type == EventLogCursorType.STORAGE_ID)\n        return int(self.value)\n\n    def __str__(self) -> str:\n        return self.to_string()\n\n    def to_string(self) -> str:\n        raw = json.dumps({"type": self.cursor_type.value, "value": self.value})\n        return base64.b64encode(bytes(raw, encoding="utf-8")).decode("utf-8")\n\n    @staticmethod\n    def parse(cursor_str: str) -> "EventLogCursor":\n        raw = json.loads(base64.b64decode(cursor_str).decode("utf-8"))\n        return EventLogCursor(EventLogCursorType(raw["type"]), raw["value"])\n\n    @staticmethod\n    def from_offset(offset: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.OFFSET, offset)\n\n    @staticmethod\n    def from_storage_id(storage_id: int) -> "EventLogCursor":\n        return EventLogCursor(EventLogCursorType.STORAGE_ID, storage_id)\n\n\nclass AssetEntry(\n    NamedTuple(\n        "_AssetEntry",\n        [\n            ("asset_key", AssetKey),\n            ("last_materialization_record", Optional[EventLogRecord]),\n            ("last_run_id", Optional[str]),\n            ("asset_details", Optional[AssetDetails]),\n            ("cached_status", Optional["AssetStatusCacheValue"]),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        asset_key: AssetKey,\n        last_materialization_record: Optional[EventLogRecord] = None,\n        last_run_id: Optional[str] = None,\n        asset_details: Optional[AssetDetails] = None,\n        cached_status: Optional["AssetStatusCacheValue"] = None,\n    ):\n        from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n        return super(AssetEntry, cls).__new__(\n            cls,\n            asset_key=check.inst_param(asset_key, "asset_key", AssetKey),\n            last_materialization_record=check.opt_inst_param(\n                last_materialization_record, "last_materialization_record", 
EventLogRecord\n            ),\n            last_run_id=check.opt_str_param(last_run_id, "last_run_id"),\n            asset_details=check.opt_inst_param(asset_details, "asset_details", AssetDetails),\n            cached_status=check.opt_inst_param(\n                cached_status, "cached_status", AssetStatusCacheValue\n            ),\n        )\n\n    @property\n    def last_materialization(self) -> Optional["EventLogEntry"]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.event_log_entry\n\n    @property\n    def last_materialization_storage_id(self) -> Optional[int]:\n        if self.last_materialization_record is None:\n            return None\n        return self.last_materialization_record.storage_id\n\n\n
[docs]class AssetRecord(NamedTuple):\n """Internal representation of an asset record, as stored in a :py:class:`~dagster._core.storage.event_log.EventLogStorage`.\n\n Users should not invoke this class directly.\n """\n\n storage_id: int\n asset_entry: AssetEntry
\n\n\n
[docs]class EventLogStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract base class for storing structured event logs from pipeline runs.\n\n Note that event log storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.event_log.SqlEventLogStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n def get_logs_for_run(\n self,\n run_id: str,\n cursor: Optional[Union[str, int]] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> Sequence["EventLogEntry"]:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[Union[str, int]]): Cursor value to track paginated queries. 
Legacy\n support for integer offset cursors.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n if isinstance(cursor, int):\n cursor = EventLogCursor.from_offset(cursor + 1).to_string()\n records = self.get_records_for_run(\n run_id, cursor, of_type, limit, ascending=ascending\n ).records\n return [record.event_log_entry for record in records]\n\n @abstractmethod\n def get_records_for_run(\n self,\n run_id: str,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the event log records corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n cursor (Optional[str]): Cursor value to track paginated queries.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): Max number of records to return.\n """\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n """Get a summary of events that have occurred in a run."""\n return build_run_stats_from_events(run_id, self.get_logs_for_run(run_id))\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n """Get per-step stats for a pipeline run."""\n logs = self.get_logs_for_run(run_id)\n if step_keys:\n logs = [\n event\n for event in logs\n if event.is_dagster_event and event.get_dagster_event().step_key in step_keys\n ]\n\n return build_run_step_stats_from_events(run_id, logs)\n\n @abstractmethod\n def store_event(self, event: "EventLogEntry") -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n\n @abstractmethod\n def delete_events(self, run_id: str) -> None:\n """Remove events for a given run id."""\n\n 
@abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the event_log tables."""\n\n @abstractmethod\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset tables."""\n\n @abstractmethod\n def wipe(self) -> None:\n """Clear the log storage."""\n\n @abstractmethod\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n """Call this method to start watching."""\n\n @abstractmethod\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n """Call this method to stop watching."""\n\n @property\n @abstractmethod\n def is_persistent(self) -> bool:\n """bool: Whether the storage is persistent."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n @abstractmethod\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n pass\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, "EventLogEntry"]:\n """Get event records across all runs. 
Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n def get_maximum_record_id(self) -> Optional[int]:\n """Get the current greatest record id in the event log. Only supported for non sharded sql storage."""\n raise NotImplementedError()\n\n @abstractmethod\n def can_cache_asset_status_data(self) -> bool:\n pass\n\n @abstractmethod\n def wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n pass\n\n @abstractmethod\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n pass\n\n @abstractmethod\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n pass\n\n @abstractmethod\n def all_asset_keys(self) -> Sequence[AssetKey]:\n pass\n\n @abstractmethod\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n pass\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n # base implementation of get_asset_keys, using the existing `all_asset_keys` and doing the\n # filtering in-memory\n asset_keys = sorted(self.all_asset_keys(), key=str)\n if prefix:\n asset_keys = [\n asset_key for asset_key in asset_keys if asset_key.path[: len(prefix)] == prefix\n ]\n if cursor:\n cursor_asset = AssetKey.from_db_string(cursor)\n if cursor_asset and cursor_asset in asset_keys:\n idx = asset_keys.index(cursor_asset)\n asset_keys = asset_keys[idx + 1 :]\n if limit:\n asset_keys = asset_keys[:limit]\n return asset_keys\n\n @abstractmethod\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional["EventLogEntry"]]:\n pass\n\n def supports_add_asset_event_tags(self) -> bool:\n return False\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n raise NotImplementedError()\n\n 
@abstractmethod\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n pass\n\n @abstractmethod\n def wipe_asset(self, asset_key: AssetKey) -> None:\n """Remove asset index history from event log for given asset_key."""\n\n @abstractmethod\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n pass\n\n @abstractmethod\n def get_materialization_count_by_partition(\n self, asset_keys: Sequence[AssetKey], after_cursor: Optional[int] = None\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n pass\n\n @abstractmethod\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n pass\n\n @abstractmethod\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n pass\n\n @abstractmethod\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n pass\n\n @abstractmethod\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a dynamic partitions definition."""\n raise NotImplementedError()\n\n @abstractmethod\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n """Check if a dynamic partition exists."""\n raise NotImplementedError()\n\n @abstractmethod\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n """Add a partition for the specified dynamic partitions definition."""\n raise 
NotImplementedError()\n\n @abstractmethod\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n """Delete a partition for the specified dynamic partitions definition."""\n raise NotImplementedError()\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @property\n def is_run_sharded(self) -> bool:\n """Indicates that the EventLogStorage is sharded."""\n return False\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n """Indicates that the EventLogStorage supports global concurrency limits."""\n return False\n\n @abstractmethod\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate concurrency slots for the given concurrency key."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get concurrency info for key."""\n raise NotImplementedError()\n\n @abstractmethod\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] = None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slots for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n """Check the status of a concurrency slot claim for step."""\n raise NotImplementedError()\n\n @abstractmethod\n def get_concurrency_run_ids(self) -> Set[str]:\n """Get a list of run_ids that are occupying or waiting for a concurrency key slot."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n """Frees concurrency slots for a given run."""\n raise NotImplementedError()\n\n @abstractmethod\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> 
None:\n """Frees concurrency slots for a given run/step."""\n raise NotImplementedError()\n\n @property\n def supports_asset_checks(self):\n return True\n\n def get_asset_check_executions(\n self,\n asset_key: AssetKey,\n check_name: str,\n limit: int,\n cursor: Optional[int] = None,\n materialization_event_storage_id: Optional[int] = None,\n include_planned: bool = True,\n ) -> Sequence[AssetCheckExecutionRecord]:\n """Get the executions for an asset check, sorted by recency. If materialization_event_storage_id\n is set and include_planned is True, the returned Sequence will include executions that are planned\n but do not have a target materialization yet (since we don't set the target until the check is executed).\n """\n raise NotImplementedError()
\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.base"}, "sql_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sql_event_log

\nimport logging\nfrom abc import abstractmethod\nfrom collections import OrderedDict, defaultdict\nfrom datetime import datetime\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    ContextManager,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._core.assets import AssetDetails\nfrom dagster._core.definitions.asset_check_evaluation import (\n    AssetCheckEvaluation,\n    AssetCheckEvaluationPlanned,\n)\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.errors import (\n    DagsterEventLogInvalidForRun,\n    DagsterInvalidInvocationError,\n    DagsterInvariantViolationError,\n)\nfrom dagster._core.event_api import RunShardedEventsCursor\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS, MARKER_EVENTS, DagsterEventType\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.stats import RunStepKeyStatsSnapshot, build_run_step_stats_from_events\nfrom dagster._core.storage.asset_check_execution_record import (\n    AssetCheckExecutionRecord,\n    AssetCheckExecutionRecordStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_case,\n    db_fetch_mappings,\n    db_select,\n    db_subquery,\n)\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._utils import (\n    PrintFn,\n    datetime_as_float,\n    utc_datetime_from_naive,\n    utc_datetime_from_timestamp,\n)\nfrom dagster._utils.concurrency import (\n    ConcurrencyClaimStatus,\n    ConcurrencyKeyInfo,\n    
ConcurrencySlotStatus,\n)\n\nfrom ..dagster_run import DagsterRunStatsSnapshot\nfrom .base import (\n    AssetEntry,\n    AssetRecord,\n    EventLogConnection,\n    EventLogCursor,\n    EventLogRecord,\n    EventLogStorage,\n    EventRecordsFilter,\n)\nfrom .migration import ASSET_DATA_MIGRATIONS, ASSET_KEY_INDEX_COLS, EVENT_LOG_DATA_MIGRATIONS\nfrom .schema import (\n    AssetCheckExecutionsTable,\n    AssetEventTagsTable,\n    AssetKeyTable,\n    ConcurrencySlotsTable,\n    DynamicPartitionsTable,\n    PendingStepsTable,\n    SecondaryIndexMigrationTable,\n    SqlEventLogStorageTable,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\nMAX_CONCURRENCY_SLOTS = 1000\nMIN_ASSET_ROWS = 25\n\n# We are using third-party library objects for DB connections-- at this time, these libraries are\n# untyped. When/if we upgrade to typed variants, the `Any` here can be replaced or the alias as a\n# whole can be dropped.\nSqlDbConnection: TypeAlias = Any\n\n\n
[docs]class SqlEventLogStorage(EventLogStorage):\n """Base class for SQL backed event log storages.\n\n Distinguishes between run-based connections and index connections in order to support run-level\n sharding, while maintaining the ability to do cross-run queries\n """\n\n @abstractmethod\n def run_connection(self, run_id: Optional[str]) -> ContextManager[Connection]:\n """Context manager yielding a connection to access the event logs for a specific run.\n\n Args:\n run_id (Optional[str]): Enables those storages which shard based on run_id, e.g.,\n SqliteEventLogStorage, to connect appropriately.\n """\n\n @abstractmethod\n def index_connection(self) -> ContextManager[Connection]:\n """Context manager yielding a connection to access cross-run indexed tables."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n @abstractmethod\n def has_table(self, table_name: str) -> bool:\n """This method checks if a table exists in the database."""\n\n def prepare_insert_event(self, event):\n """Helper method for preparing the event log SQL insertion statement. Abstracted away to\n have a single place for the logical table representation of the event, while having a way\n for SQL backends to implement different execution implementations for `store_event`. 
See\n the `dagster-postgres` implementation which overrides the generic SQL implementation of\n `store_event`.\n """\n dagster_event_type = None\n asset_key_str = None\n partition = None\n step_key = event.step_key\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value\n step_key = event.dagster_event.step_key\n if event.dagster_event.asset_key:\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n if event.dagster_event.partition:\n partition = event.dagster_event.partition\n\n # https://stackoverflow.com/a/54386260/324449\n return SqlEventLogStorageTable.insert().values(\n run_id=event.run_id,\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n # Postgres requires a datetime that is in UTC but has no timezone info set\n # in order to be stored correctly\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=step_key,\n asset_key=asset_key_str,\n partition=partition,\n )\n\n def has_asset_key_col(self, column_name: str) -> bool:\n with self.index_connection() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(AssetKeyTable.name)]\n return column_name in column_names\n\n def has_asset_key_index_cols(self) -> bool:\n return self.has_asset_key_col("last_materialization_timestamp")\n\n def store_asset_event(self, event: EventLogEntry, event_id: int):\n check.inst_param(event, "event", EventLogEntry)\n\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming 
`last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n values = self._get_asset_entry_values(event, event_id, self.has_asset_key_index_cols())\n insert_statement = AssetKeyTable.insert().values(\n asset_key=event.dagster_event.asset_key.to_string(), **values\n )\n update_statement = (\n AssetKeyTable.update()\n .values(**values)\n .where(\n AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),\n )\n )\n\n with self.index_connection() as conn:\n try:\n conn.execute(insert_statement)\n except db_exc.IntegrityError:\n conn.execute(update_statement)\n\n def _get_asset_entry_values(\n self, event: EventLogEntry, event_id: int, has_asset_key_index_cols: bool\n ) -> Dict[str, Any]:\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n entry_values: Dict[str, Any] = {}\n dagster_event = check.not_none(event.dagster_event)\n if dagster_event.is_step_materialization:\n entry_values.update(\n {\n "last_materialization": serialize_value(\n EventLogRecord(\n storage_id=event_id,\n event_log_entry=event,\n )\n ),\n "last_run_id": event.run_id,\n }\n )\n if has_asset_key_index_cols:\n 
entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_materialization_planned:\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n entry_values.update({"last_run_id": event.run_id})\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n elif dagster_event.is_asset_observation:\n if has_asset_key_index_cols:\n entry_values.update(\n {\n "last_materialization_timestamp": utc_datetime_from_timestamp(\n event.timestamp\n ),\n }\n )\n\n return entry_values\n\n def supports_add_asset_event_tags(self) -> bool:\n return self.has_table(AssetEventTagsTable.name)\n\n def add_asset_event_tags(\n self,\n event_id: int,\n event_timestamp: float,\n asset_key: AssetKey,\n new_tags: Mapping[str, str],\n ) -> None:\n check.int_param(event_id, "event_id")\n check.float_param(event_timestamp, "event_timestamp")\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n if not self.supports_add_asset_event_tags():\n raise DagsterInvalidInvocationError(\n "In order to add asset event tags, you must run `dagster instance migrate` to "\n "create the AssetEventTags table."\n )\n\n current_tags_list = self.get_event_tags_for_asset(asset_key, filter_event_id=event_id)\n\n asset_key_str = asset_key.to_string()\n\n if len(current_tags_list) == 0:\n current_tags: Mapping[str, str] = {}\n else:\n current_tags = current_tags_list[0]\n\n with self.index_connection() as conn:\n current_tags_set = 
set(current_tags.keys())\n new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n AssetEventTagsTable.update()\n .where(\n db.and_(\n AssetEventTagsTable.c.event_id == event_id,\n AssetEventTagsTable.c.asset_key == asset_key_str,\n AssetEventTagsTable.c.key == tag,\n )\n )\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=tag,\n value=new_tags[tag],\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event_timestamp),\n )\n for tag in added_tags\n ],\n )\n\n def store_asset_event_tags(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.int_param(event_id, "event_id")\n\n if event.dagster_event and event.dagster_event.asset_key:\n if event.dagster_event.is_step_materialization:\n tags = event.dagster_event.step_materialization_data.materialization.tags\n elif event.dagster_event.is_asset_observation:\n tags = event.dagster_event.asset_observation_data.asset_observation.tags\n else:\n tags = None\n\n if not tags or not self.has_table(AssetEventTagsTable.name):\n # If tags table does not exist, silently exit. 
This is to support OSS\n # users who have not yet run the migration to create the table.\n # On read, we will throw an error if the table does not exist.\n return\n\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey)\n asset_key_str = event.dagster_event.asset_key.to_string()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetEventTagsTable.insert(),\n [\n dict(\n event_id=event_id,\n asset_key=asset_key_str,\n key=key,\n value=value,\n # Postgres requires a datetime that is in UTC but has no timezone info\n # set in order to be stored correctly\n event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n )\n for key, value in tags.items()\n ],\n )\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a pipeline run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n event_id = None\n\n with self.run_connection(run_id) as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def get_records_for_run(\n self,\n run_id,\n cursor: Optional[str] = None,\n of_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ascending: bool = True,\n ) -> EventLogConnection:\n """Get all of the logs corresponding to a run.\n\n Args:\n run_id (str): The id of the run for which to fetch logs.\n 
cursor (Optional[str]): Serialized EventLogCursor (offset or storage id) to resume from.\n If None, no cursor restriction is applied to the query.\n of_type (Optional[DagsterEventType]): the dagster event type to filter the logs.\n limit (Optional[int]): the maximum number of events to fetch\n """\n check.str_param(run_id, "run_id")\n check.opt_str_param(cursor, "cursor")\n\n check.invariant(not of_type or isinstance(of_type, (DagsterEventType, frozenset, set)))\n\n dagster_event_types = (\n {of_type}\n if isinstance(of_type, DagsterEventType)\n else check.opt_set_param(of_type, "dagster_event_type", of_type=DagsterEventType)\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .order_by(\n SqlEventLogStorageTable.c.id.asc()\n if ascending\n else SqlEventLogStorageTable.c.id.desc()\n )\n )\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n # adjust 0 based index cursor to SQL offset\n if cursor is not None:\n cursor_obj = EventLogCursor.parse(cursor)\n if cursor_obj.is_offset_cursor():\n query = query.offset(cursor_obj.offset())\n elif cursor_obj.is_id_cursor():\n if ascending:\n query = query.where(SqlEventLogStorageTable.c.id > cursor_obj.storage_id())\n else:\n query = query.where(SqlEventLogStorageTable.c.id < cursor_obj.storage_id())\n\n if limit:\n query = query.limit(limit)\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n last_record_id = None\n try:\n records = []\n for (\n record_id,\n json_str,\n ) in results:\n records.append(\n EventLogRecord(\n storage_id=record_id,\n event_log_entry=deserialize_value(json_str, EventLogEntry),\n )\n )\n last_record_id = record_id\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) 
from err\n\n if last_record_id is not None:\n next_cursor = EventLogCursor.from_storage_id(last_record_id).to_string()\n elif cursor:\n # record fetch returned no new logs, return the same cursor\n next_cursor = cursor\n else:\n # rely on the fact that all storage ids will be positive integers\n next_cursor = EventLogCursor.from_storage_id(-1).to_string()\n\n return EventLogConnection(\n records=records,\n cursor=next_cursor,\n has_more=bool(limit and len(results) == limit),\n )\n\n def get_stats_for_run(self, run_id: str) -> DagsterRunStatsSnapshot:\n check.str_param(run_id, "run_id")\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n db.func.count().label("n_events_of_type"),\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("last_event_timestamp"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.run_id == run_id,\n SqlEventLogStorageTable.c.dagster_event_type != None, # noqa: E711\n )\n )\n .group_by("dagster_event_type")\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n try:\n counts = {}\n times = {}\n for result in results:\n (dagster_event_type, n_events_of_type, last_event_timestamp) = result\n check.invariant(dagster_event_type is not None)\n counts[dagster_event_type] = n_events_of_type\n times[dagster_event_type] = last_event_timestamp\n\n enqueued_time = times.get(DagsterEventType.PIPELINE_ENQUEUED.value, None)\n launch_time = times.get(DagsterEventType.PIPELINE_STARTING.value, None)\n start_time = times.get(DagsterEventType.PIPELINE_START.value, None)\n end_time = times.get(\n DagsterEventType.PIPELINE_SUCCESS.value,\n times.get(\n DagsterEventType.PIPELINE_FAILURE.value,\n times.get(DagsterEventType.PIPELINE_CANCELED.value, None),\n ),\n )\n\n return DagsterRunStatsSnapshot(\n run_id=run_id,\n steps_succeeded=counts.get(DagsterEventType.STEP_SUCCESS.value, 0),\n steps_failed=counts.get(DagsterEventType.STEP_FAILURE.value, 0),\n 
materializations=counts.get(DagsterEventType.ASSET_MATERIALIZATION.value, 0),\n expectations=counts.get(DagsterEventType.STEP_EXPECTATION_RESULT.value, 0),\n enqueued_time=datetime_as_float(enqueued_time) if enqueued_time else None,\n launch_time=datetime_as_float(launch_time) if launch_time else None,\n start_time=datetime_as_float(start_time) if start_time else None,\n end_time=datetime_as_float(end_time) if end_time else None,\n )\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def get_step_stats_for_run(\n self, run_id: str, step_keys: Optional[Sequence[str]] = None\n ) -> Sequence[RunStepKeyStatsSnapshot]:\n check.str_param(run_id, "run_id")\n check.opt_list_param(step_keys, "step_keys", of_type=str)\n\n # Originally, this was two different queries:\n # 1) one query which aggregated top-level step stats by grouping by event type / step_key in\n # a single query, using pure SQL (e.g. start_time, end_time, status, attempt counts).\n # 2) one query which fetched all the raw events for a specific event type and then inspected\n # the deserialized event object to aggregate stats derived from sequences of events.\n # (e.g. marker events, materializations, expectations resuls, attempts timing, etc.)\n #\n # For simplicity, we now just do the second type of query and derive the stats in Python\n # from the raw events. This has the benefit of being easier to read and also the benefit of\n # being able to share code with the in-memory event log storage implementation. 
We may\n # choose to revisit this in the future, especially if we are able to do JSON-column queries\n # in SQL as a way of bypassing the serdes layer in all cases.\n raw_event_query = (\n db_select([SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.run_id == run_id)\n .where(SqlEventLogStorageTable.c.step_key != None) # noqa: E711\n .where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [\n DagsterEventType.STEP_START.value,\n DagsterEventType.STEP_SUCCESS.value,\n DagsterEventType.STEP_SKIPPED.value,\n DagsterEventType.STEP_FAILURE.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.ASSET_MATERIALIZATION.value,\n DagsterEventType.STEP_EXPECTATION_RESULT.value,\n DagsterEventType.STEP_RESTARTED.value,\n DagsterEventType.STEP_UP_FOR_RETRY.value,\n ]\n + [marker_event.value for marker_event in MARKER_EVENTS]\n )\n )\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n if step_keys:\n raw_event_query = raw_event_query.where(\n SqlEventLogStorageTable.c.step_key.in_(step_keys)\n )\n\n with self.run_connection(run_id) as conn:\n results = conn.execute(raw_event_query).fetchall()\n\n try:\n records = [deserialize_value(json_str, EventLogEntry) for (json_str,) in results]\n return build_run_step_stats_from_events(run_id, records)\n except (seven.JSONDecodeError, DeserializationError) as err:\n raise DagsterEventLogInvalidForRun(run_id=run_id) from err\n\n def _apply_migration(self, migration_name, migration_fn, print_fn, force):\n if self.has_secondary_index(migration_name):\n if not force:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n return\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.enable_secondary_index(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def reindex_events(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any 
data migrations across the event_log table."""\n for migration_name, migration_fn in EVENT_LOG_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def reindex_assets(self, print_fn: Optional[PrintFn] = None, force: bool = False) -> None:\n """Call this method to run any data migrations across the asset_keys table."""\n for migration_name, migration_fn in ASSET_DATA_MIGRATIONS.items():\n self._apply_migration(migration_name, migration_fn, print_fn, force)\n\n def wipe(self) -> None:\n """Clears the event log storage."""\n # Should be overridden by SqliteEventLogStorage and other storages that shard based on\n # run_id\n\n # https://stackoverflow.com/a/54386260/324449\n with self.run_connection(run_id=None) as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n self._wipe_index()\n\n def _wipe_index(self):\n with self.index_connection() as conn:\n conn.execute(SqlEventLogStorageTable.delete())\n conn.execute(AssetKeyTable.delete())\n\n if self.has_table("asset_event_tags"):\n conn.execute(AssetEventTagsTable.delete())\n\n if self.has_table("dynamic_partitions"):\n conn.execute(DynamicPartitionsTable.delete())\n\n if self.has_table("concurrency_slots"):\n conn.execute(ConcurrencySlotsTable.delete())\n\n if self.has_table("pending_steps"):\n conn.execute(PendingStepsTable.delete())\n\n if self.has_table("asset_check_executions"):\n conn.execute(AssetCheckExecutionsTable.delete())\n\n def delete_events(self, run_id: str) -> None:\n 
with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n self.free_concurrency_slots_for_run(run_id)\n\n def delete_events_for_run(self, conn: Connection, run_id: str) -> None:\n check.str_param(run_id, "run_id")\n conn.execute(\n SqlEventLogStorageTable.delete().where(SqlEventLogStorageTable.c.run_id == run_id)\n )\n\n @property\n def is_persistent(self) -> bool:\n return True\n\n def update_event_log_record(self, record_id: int, event: EventLogEntry) -> None:\n """Utility method for migration scripts to update SQL representation of event records."""\n check.int_param(record_id, "record_id")\n check.inst_param(event, "event", EventLogEntry)\n dagster_event_type = None\n asset_key_str = None\n if event.is_dagster_event:\n dagster_event_type = event.dagster_event.event_type_value # type: ignore\n if event.dagster_event.asset_key: # type: ignore\n check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey) # type: ignore\n asset_key_str = event.dagster_event.asset_key.to_string() # type: ignore\n\n with self.run_connection(run_id=event.run_id) as conn:\n conn.execute(\n SqlEventLogStorageTable.update()\n .where(SqlEventLogStorageTable.c.id == record_id)\n .values(\n event=serialize_value(event),\n dagster_event_type=dagster_event_type,\n timestamp=datetime.utcfromtimestamp(event.timestamp),\n step_key=event.step_key,\n asset_key=asset_key_str,\n )\n )\n\n def get_event_log_table_data(self, run_id: str, record_id: int) -> Optional[SqlAlchemyRow]:\n """Utility method to test representation of the record in the SQL table. 
Returns all of\n the columns stored in the event log storage (as opposed to the deserialized `EventLogEntry`).\n This allows checking that certain fields are extracted to support performant lookups (e.g.\n extracting `step_key` for fast filtering).\n """\n with self.run_connection(run_id=run_id) as conn:\n query = (\n db_select([SqlEventLogStorageTable])\n .where(SqlEventLogStorageTable.c.id == record_id)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n return conn.execute(query).fetchone()\n\n def has_secondary_index(self, name: str) -> bool:\n """This method uses a checkpoint migration table to see if summary data has been constructed\n in a secondary index table. Can be used to checkpoint event_log data migrations.\n """\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def enable_secondary_index(self, name: str) -> None:\n """This method marks an event_log data migration as complete, to indicate that a summary\n data migration is complete.\n """\n query = SecondaryIndexMigrationTable.insert().values(\n name=name,\n migration_completed=datetime.now(),\n )\n with self.index_connection() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == name)\n .values(migration_completed=datetime.now())\n )\n\n def _apply_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n event_records_filter: EventRecordsFilter,\n asset_details: Optional[AssetDetails] = None,\n apply_cursor_filters: bool = True,\n ) -> SqlAlchemyQuery:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type == event_records_filter.event_type.value\n )\n\n if event_records_filter.asset_key:\n query = query.where(\n 
SqlEventLogStorageTable.c.asset_key == event_records_filter.asset_key.to_string(),\n )\n\n if event_records_filter.asset_partitions:\n query = query.where(\n SqlEventLogStorageTable.c.partition.in_(event_records_filter.asset_partitions)\n )\n\n if asset_details and asset_details.last_wipe_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if apply_cursor_filters:\n # allow the run-sharded sqlite implementation to disable this cursor filtering so that\n # it can implement its own custom cursor logic, as cursor ids are not unique across run\n # shards\n if event_records_filter.before_cursor is not None:\n before_cursor_id = (\n event_records_filter.before_cursor.id\n if isinstance(event_records_filter.before_cursor, RunShardedEventsCursor)\n else event_records_filter.before_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor_id)\n\n if event_records_filter.after_cursor is not None:\n after_cursor_id = (\n event_records_filter.after_cursor.id\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else event_records_filter.after_cursor\n )\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor_id)\n\n if event_records_filter.before_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n < datetime.utcfromtimestamp(event_records_filter.before_timestamp)\n )\n\n if event_records_filter.after_timestamp:\n query = query.where(\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(event_records_filter.after_timestamp)\n )\n\n if event_records_filter.storage_ids:\n query = query.where(SqlEventLogStorageTable.c.id.in_(event_records_filter.storage_ids))\n\n if event_records_filter.tags and self.has_table(AssetEventTagsTable.name):\n # If we don't have the tags table, we'll filter the results after the query\n check.invariant(\n isinstance(event_records_filter.asset_key, AssetKey),\n "Asset key 
must be set in event records filter to filter by tags.",\n )\n if self.supports_intersect:\n intersections = [\n db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key\n == event_records_filter.asset_key.to_string(), # type: ignore # (bad sig?)\n AssetEventTagsTable.c.key == key,\n (\n AssetEventTagsTable.c.value == value\n if isinstance(value, str)\n else AssetEventTagsTable.c.value.in_(value)\n ),\n )\n )\n for key, value in event_records_filter.tags.items()\n ]\n query = query.where(SqlEventLogStorageTable.c.id.in_(db.intersect(*intersections)))\n\n return query\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n asset_key: Optional[AssetKey],\n ) -> db.Table:\n event_id_col = table.c.id if table == SqlEventLogStorageTable else table.c.event_id\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = db_subquery(\n db_select([AssetEventTagsTable]), f"asset_event_tags_subquery_{i}"\n )\n table = table.join(\n tags_table,\n db.and_(\n event_id_col == tags_table.c.event_id,\n not asset_key or tags_table.c.asset_key == asset_key.to_string(),\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Sequence[EventLogRecord]:\n """Returns a list of (record_id, record)."""\n check.inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if (\n event_records_filter.tags\n and not self.supports_intersect\n and self.has_table(AssetEventTagsTable.name)\n ):\n table = 
self._apply_tags_table_joins(\n SqlEventLogStorageTable, event_records_filter.tags, event_records_filter.asset_key\n )\n else:\n table = SqlEventLogStorageTable\n\n query = db_select(\n [SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event]\n ).select_from(table)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n )\n if limit:\n query = query.limit(limit)\n\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.id.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.id.desc())\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n event_records = []\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, NamedTuple)\n if not isinstance(event_record, EventLogEntry):\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n continue\n\n if event_records_filter.tags and not self.has_table(AssetEventTagsTable.name):\n # If we can't filter tags via the tags table, filter the returned records\n if limit is not None:\n raise DagsterInvalidInvocationError(\n "Cannot filter events on tags with a limit, without the asset event "\n "tags table. 
To fix, run `dagster instance migrate`."\n )\n\n event_record_tags = event_record.tags\n if not event_record_tags or any(\n event_record_tags.get(k) != v for k, v in event_records_filter.tags.items()\n ):\n continue\n\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n return event_records\n\n def supports_event_consumer_queries(self) -> bool:\n return True\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def get_logs_for_all_runs_by_log_id(\n self,\n after_cursor: int = -1,\n dagster_event_type: Optional[Union[DagsterEventType, Set[DagsterEventType]]] = None,\n limit: Optional[int] = None,\n ) -> Mapping[int, EventLogEntry]:\n check.int_param(after_cursor, "after_cursor")\n check.invariant(\n after_cursor >= -1,\n f"Don't know what to do with negative cursor {after_cursor}",\n )\n dagster_event_types = (\n {dagster_event_type}\n if isinstance(dagster_event_type, DagsterEventType)\n else check.opt_set_param(\n dagster_event_type, "dagster_event_type", of_type=DagsterEventType\n )\n )\n\n query = (\n db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n .where(SqlEventLogStorageTable.c.id > after_cursor)\n .order_by(SqlEventLogStorageTable.c.id.asc())\n )\n\n if dagster_event_types:\n query = query.where(\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [dagster_event_type.value for dagster_event_type in dagster_event_types]\n )\n )\n\n if limit:\n query = query.limit(limit)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n events = {}\n record_id = None\n try:\n for (\n record_id,\n json_str,\n ) in results:\n events[record_id] = deserialize_value(json_str, EventLogEntry)\n except (seven.JSONDecodeError, DeserializationError):\n logging.warning("Could not parse event record id `%s`.", record_id)\n\n return events\n\n def 
get_maximum_record_id(self) -> Optional[int]:\n with self.index_connection() as conn:\n result = conn.execute(db_select([db.func.max(SqlEventLogStorageTable.c.id)])).fetchone()\n return result[0] # type: ignore\n\n def _construct_asset_record_from_row(\n self,\n row,\n last_materialization_record: Optional[EventLogRecord],\n can_cache_asset_status_data: bool,\n ) -> AssetRecord:\n from dagster._core.storage.partition_status_cache import AssetStatusCacheValue\n\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n return AssetRecord(\n storage_id=row["id"],\n asset_entry=AssetEntry(\n asset_key=asset_key,\n last_materialization_record=last_materialization_record,\n last_run_id=row["last_run_id"],\n asset_details=AssetDetails.from_db_string(row["asset_details"]),\n cached_status=(\n AssetStatusCacheValue.from_db_string(row["cached_status_data"])\n if can_cache_asset_status_data\n else None\n ),\n ),\n )\n else:\n check.failed("Row did not contain asset key.")\n\n def _get_latest_materialization_records(\n self, raw_asset_rows\n ) -> Mapping[AssetKey, Optional[EventLogRecord]]:\n # Given a list of raw asset rows, returns a mapping of asset key to latest asset materialization\n # event log entry. 
Fetches backcompat EventLogEntry records when the last_materialization\n # in the raw asset row is an AssetMaterialization.\n to_backcompat_fetch = set()\n results: Dict[AssetKey, Optional[EventLogRecord]] = {}\n for row in raw_asset_rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if not asset_key:\n continue\n event_or_materialization = (\n deserialize_value(row["last_materialization"], NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(event_or_materialization, EventLogRecord):\n results[asset_key] = event_or_materialization\n else:\n to_backcompat_fetch.add(asset_key)\n\n latest_event_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in to_backcompat_fetch]\n ),\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key),\n "latest_event_subquery",\n )\n backcompat_query = db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.id,\n SqlEventLogStorageTable.c.event,\n ]\n ).select_from(\n latest_event_subquery.join(\n SqlEventLogStorageTable,\n db.and_(\n SqlEventLogStorageTable.c.asset_key == latest_event_subquery.c.asset_key,\n SqlEventLogStorageTable.c.id == latest_event_subquery.c.id,\n ),\n )\n )\n with self.index_connection() as conn:\n event_rows = db_fetch_mappings(conn, backcompat_query)\n\n for row in event_rows:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row["asset_key"]))\n if asset_key:\n results[asset_key] = EventLogRecord(\n storage_id=cast(int, row["id"]),\n event_log_entry=deserialize_value(cast(str, row["event"]), EventLogEntry),\n )\n return results\n\n def can_cache_asset_status_data(self) -> bool:\n return self.has_asset_key_col("cached_status_data")\n\n def 
wipe_asset_cached_status(self, asset_key: AssetKey) -> None:\n if self.can_cache_asset_status_data():\n check.inst_param(asset_key, "asset_key", AssetKey)\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(dict(cached_status_data=None))\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_asset_records(\n self, asset_keys: Optional[Sequence[AssetKey]] = None\n ) -> Sequence[AssetRecord]:\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n latest_materialization_records = self._get_latest_materialization_records(rows)\n can_cache_asset_status_data = self.can_cache_asset_status_data()\n\n asset_records: List[AssetRecord] = []\n for row in rows:\n asset_key = AssetKey.from_db_string(row["asset_key"])\n if asset_key:\n asset_records.append(\n self._construct_asset_record_from_row(\n row,\n latest_materialization_records.get(asset_key),\n can_cache_asset_status_data,\n )\n )\n\n return asset_records\n\n def has_asset_key(self, asset_key: AssetKey) -> bool:\n check.inst_param(asset_key, "asset_key", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=[asset_key])\n return bool(rows)\n\n def all_asset_keys(self):\n rows = self._fetch_asset_rows()\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_asset_keys(\n self,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[AssetKey]:\n rows = self._fetch_asset_rows(prefix=prefix, limit=limit, cursor=cursor)\n asset_keys = [\n AssetKey.from_db_string(row["asset_key"])\n for row in sorted(rows, key=lambda x: x["asset_key"])\n ]\n return [asset_key for asset_key in asset_keys if asset_key]\n\n def get_latest_materialization_events(\n self, asset_keys: Iterable[AssetKey]\n ) -> Mapping[AssetKey, Optional[EventLogEntry]]:\n 
check.iterable_param(asset_keys, "asset_keys", AssetKey)\n rows = self._fetch_asset_rows(asset_keys=asset_keys)\n return {\n asset_key: event_log_record.event_log_entry if event_log_record is not None else None\n for asset_key, event_log_record in self._get_latest_materialization_records(\n rows\n ).items()\n }\n\n def _fetch_asset_rows(\n self,\n asset_keys=None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> Sequence[SqlAlchemyRow]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments.\n #\n # Differs from _fetch_raw_asset_rows, in that it loops through to make sure enough rows are\n # returned to satisfy the limit.\n #\n # returns a list of rows where each row is a tuple of serialized asset_key, materialization,\n # and asset_details\n should_query = True\n current_cursor = cursor\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # if we have migrated, we can limit using SQL\n fetch_limit = limit\n else:\n # if we haven't migrated, overfetch in case the first N results are wiped\n fetch_limit = max(limit, MIN_ASSET_ROWS) if limit else None\n result = []\n\n while should_query:\n rows, has_more, current_cursor = self._fetch_raw_asset_rows(\n asset_keys=asset_keys, prefix=prefix, limit=fetch_limit, cursor=current_cursor\n )\n result.extend(rows)\n should_query = bool(has_more) and bool(limit) and len(result) < cast(int, limit)\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if not is_partial_query and self._can_mark_assets_as_migrated(rows): # type: ignore\n self.enable_secondary_index(ASSET_KEY_INDEX_COLS)\n\n return result[:limit] if limit else result\n\n def _fetch_raw_asset_rows(\n self,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix: Optional[Sequence[str]] = None,\n limit: Optional[int] = None,\n cursor=None,\n ) -> 
Tuple[Iterable[SqlAlchemyRow], bool, Optional[str]]:\n # fetches rows containing asset_key, last_materialization, and asset_details from the DB,\n # applying the filters specified in the arguments. Does not guarantee that the number of\n # rows returned will match the limit specified. This helper function is used to fetch a\n # chunk of asset key rows, which may or may not be wiped.\n #\n # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized\n # asset_key, materialization, and asset_details\n # TODO update comment\n\n columns = [\n AssetKeyTable.c.id,\n AssetKeyTable.c.asset_key,\n AssetKeyTable.c.last_materialization,\n AssetKeyTable.c.last_run_id,\n AssetKeyTable.c.asset_details,\n ]\n if self.can_cache_asset_status_data():\n columns.extend([AssetKeyTable.c.cached_status_data])\n\n is_partial_query = asset_keys is not None or bool(prefix) or bool(limit) or bool(cursor)\n if self.has_asset_key_index_cols() and not is_partial_query:\n # if the schema has been migrated, fetch the last_materialization_timestamp to see if\n # we can lazily migrate the data table\n columns.append(AssetKeyTable.c.last_materialization_timestamp)\n columns.append(AssetKeyTable.c.wipe_timestamp)\n\n query = db_select(columns).order_by(AssetKeyTable.c.asset_key.asc())\n query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n query = query.where(\n db.or_(\n AssetKeyTable.c.wipe_timestamp.is_(None),\n AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,\n )\n )\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n return rows, False, None\n\n with self.index_connection() as conn:\n rows = db_fetch_mappings(conn, query)\n\n wiped_timestamps_by_asset_key: Dict[AssetKey, float] = {}\n row_by_asset_key: Dict[AssetKey, SqlAlchemyRow] = OrderedDict()\n\n for row in rows:\n asset_key = AssetKey.from_db_string(cast(str, 
row["asset_key"]))\n if not asset_key:\n continue\n asset_details = AssetDetails.from_db_string(row["asset_details"])\n if not asset_details or not asset_details.last_wipe_timestamp:\n row_by_asset_key[asset_key] = row\n continue\n materialization_or_event_or_record = (\n deserialize_value(cast(str, row["last_materialization"]), NamedTuple)\n if row["last_materialization"]\n else None\n )\n if isinstance(materialization_or_event_or_record, (EventLogRecord, EventLogEntry)):\n if isinstance(materialization_or_event_or_record, EventLogRecord):\n event_timestamp = materialization_or_event_or_record.event_log_entry.timestamp\n else:\n event_timestamp = materialization_or_event_or_record.timestamp\n\n if asset_details.last_wipe_timestamp > event_timestamp:\n # this asset has not been materialized since being wiped, skip\n continue\n else:\n # add the key\n row_by_asset_key[asset_key] = row\n else:\n row_by_asset_key[asset_key] = row\n wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp\n\n if wiped_timestamps_by_asset_key:\n materialization_times = self._fetch_backcompat_materialization_times(\n wiped_timestamps_by_asset_key.keys() # type: ignore\n )\n for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():\n materialization_time = materialization_times.get(asset_key)\n if not materialization_time or utc_datetime_from_naive(\n materialization_time\n ) < utc_datetime_from_timestamp(wiped_timestamp):\n # remove rows that have not been materialized since being wiped\n row_by_asset_key.pop(asset_key)\n\n has_more = limit and len(rows) == limit\n new_cursor = rows[-1]["id"] if rows else None\n\n return row_by_asset_key.values(), has_more, new_cursor # type: ignore\n\n def update_asset_cached_status_data(\n self, asset_key: AssetKey, cache_values: "AssetStatusCacheValue"\n ) -> None:\n if self.can_cache_asset_status_data():\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .where(\n 
AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n .values(cached_status_data=serialize_value(cache_values))\n )\n\n def _fetch_backcompat_materialization_times(\n self, asset_keys: Sequence[AssetKey]\n ) -> Mapping[AssetKey, datetime]:\n # fetches the latest materialization timestamp for the given asset_keys. Uses the (slower)\n # raw event log table.\n backcompat_query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n db.func.max(SqlEventLogStorageTable.c.timestamp).label("timestamp"),\n ]\n )\n .where(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key)\n .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())\n )\n with self.index_connection() as conn:\n backcompat_rows = db_fetch_mappings(conn, backcompat_query)\n return {AssetKey.from_db_string(row["asset_key"]): row["timestamp"] for row in backcompat_rows} # type: ignore\n\n def _can_mark_assets_as_migrated(self, rows):\n if not self.has_asset_key_index_cols():\n return False\n\n if self.has_secondary_index(ASSET_KEY_INDEX_COLS):\n # we have already migrated\n return False\n\n for row in rows:\n if not _get_from_row(row, "last_materialization_timestamp"):\n return False\n\n if _get_from_row(row, "asset_details") and not _get_from_row(row, "wipe_timestamp"):\n return False\n\n return True\n\n def _apply_asset_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n asset_keys: Optional[Sequence[AssetKey]] = None,\n prefix=None,\n limit: Optional[int] = None,\n cursor: Optional[str] = None,\n ) -> SqlAlchemyQuery:\n if asset_keys is not None:\n query = query.where(\n AssetKeyTable.c.asset_key.in_([asset_key.to_string() for asset_key in asset_keys])\n )\n\n if prefix:\n prefix_str = seven.dumps(prefix)[:-1]\n query = query.where(AssetKeyTable.c.asset_key.startswith(prefix_str))\n\n if cursor:\n query = query.where(AssetKeyTable.c.asset_key > cursor)\n\n if limit:\n query = 
query.limit(limit)\n return query\n\n def _get_assets_details(\n self, asset_keys: Sequence[AssetKey]\n ) -> Sequence[Optional[AssetDetails]]:\n check.sequence_param(asset_keys, "asset_key", AssetKey)\n rows = None\n with self.index_connection() as conn:\n rows = db_fetch_mappings(\n conn,\n db_select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details]).where(\n AssetKeyTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n ),\n )\n\n asset_key_to_details = {\n cast(str, row["asset_key"]): (\n deserialize_value(cast(str, row["asset_details"]), AssetDetails)\n if row["asset_details"]\n else None\n )\n for row in rows\n }\n\n # returns a list of the corresponding asset_details to provided asset_keys\n return [\n asset_key_to_details.get(asset_key.to_string(), None) for asset_key in asset_keys\n ]\n\n def _add_assets_wipe_filter_to_query(\n self,\n query: SqlAlchemyQuery,\n assets_details: Sequence[Optional[AssetDetails]],\n asset_keys: Sequence[AssetKey],\n ) -> SqlAlchemyQuery:\n check.invariant(\n len(assets_details) == len(asset_keys),\n "asset_details and asset_keys must be the same length",\n )\n for i in range(len(assets_details)):\n asset_key, asset_details = asset_keys[i], assets_details[i]\n if asset_details and asset_details.last_wipe_timestamp:\n asset_key_in_row = SqlEventLogStorageTable.c.asset_key == asset_key.to_string()\n # If asset key is in row, keep the row if the timestamp > wipe timestamp, else remove the row.\n # If asset key is not in row, keep the row.\n query = query.where(\n db.or_(\n db.and_(\n asset_key_in_row,\n SqlEventLogStorageTable.c.timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp),\n ),\n db.not_(asset_key_in_row),\n )\n )\n\n return query\n\n def get_event_tags_for_asset(\n self,\n asset_key: AssetKey,\n filter_tags: Optional[Mapping[str, str]] = None,\n filter_event_id: Optional[int] = None,\n ) -> Sequence[Mapping[str, str]]:\n """Fetches asset event tags for the given 
asset key.\n\n If filter_tags is provided, searches for events containing all of the filter tags. Then,\n returns all tags for those events. This enables searching for multipartitioned asset\n partition tags with a fixed dimension value, e.g. all of the tags for events where\n "country" == "US".\n\n If filter_event_id is provided, fetches only tags applied to the given event.\n\n Returns a list of dicts, where each dict is a mapping of tag key to tag value for a\n single event.\n """\n asset_key = check.inst_param(asset_key, "asset_key", AssetKey)\n filter_tags = check.opt_mapping_param(\n filter_tags, "filter_tags", key_type=str, value_type=str\n )\n filter_event_id = check.opt_int_param(filter_event_id, "filter_event_id")\n\n if not self.has_table(AssetEventTagsTable.name):\n raise DagsterInvalidInvocationError(\n "In order to search for asset event tags, you must run "\n "`dagster instance migrate` to create the AssetEventTags table."\n )\n\n asset_details = self._get_assets_details([asset_key])[0]\n if not filter_tags:\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(AssetEventTagsTable.c.asset_key == asset_key.to_string())\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n elif self.supports_intersect:\n\n def get_tag_filter_query(tag_key, tag_value):\n filter_query = db_select([AssetEventTagsTable.c.event_id]).where(\n db.and_(\n AssetEventTagsTable.c.asset_key == asset_key.to_string(),\n AssetEventTagsTable.c.key == tag_key,\n AssetEventTagsTable.c.value == tag_value,\n )\n )\n if asset_details and asset_details.last_wipe_timestamp:\n filter_query = filter_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n return filter_query\n\n intersections = [\n 
get_tag_filter_query(tag_key, tag_value)\n for tag_key, tag_value in filter_tags.items()\n ]\n\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).where(\n db.and_(\n AssetEventTagsTable.c.event_id.in_(db.intersect(*intersections)),\n )\n )\n else:\n table = self._apply_tags_table_joins(AssetEventTagsTable, filter_tags, asset_key)\n tags_query = db_select(\n [\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n AssetEventTagsTable.c.event_id,\n ]\n ).select_from(table)\n\n if asset_details and asset_details.last_wipe_timestamp:\n tags_query = tags_query.where(\n AssetEventTagsTable.c.event_timestamp\n > datetime.utcfromtimestamp(asset_details.last_wipe_timestamp)\n )\n\n if filter_event_id is not None:\n tags_query = tags_query.where(AssetEventTagsTable.c.event_id == filter_event_id)\n\n with self.index_connection() as conn:\n results = conn.execute(tags_query).fetchall()\n\n tags_by_event_id: Dict[int, Dict[str, str]] = defaultdict(dict)\n for row in results:\n key, value, event_id = row\n tags_by_event_id[event_id][key] = value\n\n return list(tags_by_event_id.values())\n\n def _asset_materialization_from_json_column(\n self, json_str: str\n ) -> Optional[AssetMaterialization]:\n if not json_str:\n return None\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # pipeline, run_id, etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. 
For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n event_or_materialization = deserialize_value(json_str, NamedTuple)\n if isinstance(event_or_materialization, AssetMaterialization):\n return event_or_materialization\n\n if (\n not isinstance(event_or_materialization, EventLogEntry)\n or not event_or_materialization.is_dagster_event\n or not event_or_materialization.dagster_event.asset_key # type: ignore\n ):\n return None\n\n return event_or_materialization.dagster_event.step_materialization_data.materialization # type: ignore\n\n def _get_asset_key_values_on_wipe(self) -> Mapping[str, Any]:\n wipe_timestamp = pendulum.now("UTC").timestamp()\n values = {\n "asset_details": serialize_value(AssetDetails(last_wipe_timestamp=wipe_timestamp)),\n "last_run_id": None,\n }\n if self.has_asset_key_index_cols():\n values.update(\n dict(\n wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp),\n )\n )\n if self.can_cache_asset_status_data():\n values.update(dict(cached_status_data=None))\n return values\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n check.inst_param(asset_key, "asset_key", AssetKey)\n wiped_values = self._get_asset_key_values_on_wipe()\n\n with self.index_connection() as conn:\n conn.execute(\n AssetKeyTable.update()\n .values(**wiped_values)\n .where(\n AssetKeyTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def get_materialized_partitions(\n self,\n asset_key: AssetKey,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Set[str]:\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.partition)\n )\n\n 
assets_details = self._get_assets_details([asset_key])\n query = self._add_assets_wipe_filter_to_query(query, assets_details, [asset_key])\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n if before_cursor:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return set([cast(str, row[0]) for row in results])\n\n def get_materialization_count_by_partition(\n self,\n asset_keys: Sequence[AssetKey],\n after_cursor: Optional[int] = None,\n before_cursor: Optional[int] = None,\n ) -> Mapping[AssetKey, Mapping[str, int]]:\n check.sequence_param(asset_keys, "asset_keys", AssetKey)\n\n query = (\n db_select(\n [\n SqlEventLogStorageTable.c.asset_key,\n SqlEventLogStorageTable.c.partition,\n db.func.count(SqlEventLogStorageTable.c.id),\n ]\n )\n .where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key.in_(\n [asset_key.to_string() for asset_key in asset_keys]\n ),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value,\n )\n )\n .group_by(SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.partition)\n )\n\n assets_details = self._get_assets_details(asset_keys)\n query = self._add_assets_wipe_filter_to_query(query, assets_details, asset_keys)\n\n if after_cursor:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n materialization_count_by_partition: Dict[AssetKey, Dict[str, int]] = {\n asset_key: {} for asset_key in asset_keys\n }\n for row in results:\n asset_key = AssetKey.from_db_string(cast(Optional[str], row[0]))\n if asset_key:\n materialization_count_by_partition[asset_key][cast(str, row[1])] = cast(int, row[2])\n\n return materialization_count_by_partition\n\n def _latest_event_ids_by_partition_subquery(\n 
self,\n asset_key: AssetKey,\n event_types: Sequence[DagsterEventType],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ):\n """Subquery for locating the latest event ids by partition for a given asset key and set\n of event types.\n """\n query = db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n db.func.max(SqlEventLogStorageTable.c.id).label("id"),\n ]\n ).where(\n db.and_(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n SqlEventLogStorageTable.c.partition != None, # noqa: E711\n SqlEventLogStorageTable.c.dagster_event_type.in_(\n [event_type.value for event_type in event_types]\n ),\n )\n )\n if asset_partitions is not None:\n query = query.where(SqlEventLogStorageTable.c.partition.in_(asset_partitions))\n if before_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id < before_cursor)\n if after_cursor is not None:\n query = query.where(SqlEventLogStorageTable.c.id > after_cursor)\n\n latest_event_ids_subquery = query.group_by(\n SqlEventLogStorageTable.c.dagster_event_type, SqlEventLogStorageTable.c.partition\n )\n\n assets_details = self._get_assets_details([asset_key])\n return db_subquery(\n self._add_assets_wipe_filter_to_query(\n latest_event_ids_subquery, assets_details, [asset_key]\n ),\n "latest_event_ids_by_partition_subquery",\n )\n\n def get_latest_storage_id_by_partition(\n self, asset_key: AssetKey, event_type: DagsterEventType\n ) -> Mapping[str, int]:\n """Fetch the latest materialzation storage id for each partition for a given asset key.\n\n Returns a mapping of partition to storage id.\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_by_partition_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key, [event_type]\n )\n latest_event_ids_by_partition = db_select(\n [\n latest_event_ids_by_partition_subquery.c.partition,\n 
latest_event_ids_by_partition_subquery.c.id,\n ]\n )\n\n with self.index_connection() as conn:\n rows = conn.execute(latest_event_ids_by_partition).fetchall()\n\n latest_materialization_storage_id_by_partition: Dict[str, int] = {}\n for row in rows:\n latest_materialization_storage_id_by_partition[cast(str, row[0])] = cast(int, row[1])\n return latest_materialization_storage_id_by_partition\n\n def get_latest_tags_by_partition(\n self,\n asset_key: AssetKey,\n event_type: DagsterEventType,\n tag_keys: Sequence[str],\n asset_partitions: Optional[Sequence[str]] = None,\n before_cursor: Optional[int] = None,\n after_cursor: Optional[int] = None,\n ) -> Mapping[str, Mapping[str, str]]:\n check.inst_param(asset_key, "asset_key", AssetKey)\n check.inst_param(event_type, "event_type", DagsterEventType)\n check.sequence_param(tag_keys, "tag_keys", of_type=str)\n check.opt_nullable_sequence_param(asset_partitions, "asset_partitions", of_type=str)\n check.opt_int_param(before_cursor, "before_cursor")\n check.opt_int_param(after_cursor, "after_cursor")\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key=asset_key,\n event_types=[event_type],\n asset_partitions=asset_partitions,\n before_cursor=before_cursor,\n after_cursor=after_cursor,\n )\n\n latest_tags_by_partition_query = (\n db_select(\n [\n latest_event_ids_subquery.c.partition,\n AssetEventTagsTable.c.key,\n AssetEventTagsTable.c.value,\n ]\n )\n .select_from(\n latest_event_ids_subquery.join(\n AssetEventTagsTable,\n AssetEventTagsTable.c.event_id == latest_event_ids_subquery.c.id,\n )\n )\n .where(AssetEventTagsTable.c.key.in_(tag_keys))\n )\n\n latest_tags_by_partition: Dict[str, Dict[str, str]] = defaultdict(dict)\n with self.index_connection() as conn:\n rows = conn.execute(latest_tags_by_partition_query).fetchall()\n\n for row in rows:\n latest_tags_by_partition[cast(str, row[0])][cast(str, row[1])] = cast(str, row[2])\n\n # convert defaultdict to dict\n return 
dict(latest_tags_by_partition)\n\n def get_latest_asset_partition_materialization_attempts_without_materializations(\n self, asset_key: AssetKey\n ) -> Mapping[str, Tuple[str, int]]:\n """Fetch the latest materialzation and materialization planned events for each partition of the given asset.\n Return the partitions that have a materialization planned event but no matching (same run) materialization event.\n These materializations could be in progress, or they could have failed. A separate query checking the run status\n is required to know.\n\n Returns a mapping of partition to [run id, event id].\n """\n check.inst_param(asset_key, "asset_key", AssetKey)\n\n latest_event_ids_subquery = self._latest_event_ids_by_partition_subquery(\n asset_key,\n [\n DagsterEventType.ASSET_MATERIALIZATION,\n DagsterEventType.ASSET_MATERIALIZATION_PLANNED,\n ],\n )\n\n latest_events_subquery = db_subquery(\n db_select(\n [\n SqlEventLogStorageTable.c.dagster_event_type,\n SqlEventLogStorageTable.c.partition,\n SqlEventLogStorageTable.c.run_id,\n SqlEventLogStorageTable.c.id,\n ]\n ).select_from(\n latest_event_ids_subquery.join(\n SqlEventLogStorageTable,\n SqlEventLogStorageTable.c.id == latest_event_ids_subquery.c.id,\n ),\n ),\n "latest_events_subquery",\n )\n\n materialization_planned_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n latest_events_subquery.c.id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION_PLANNED.value\n )\n\n materialization_events = db_select(\n [\n latest_events_subquery.c.dagster_event_type,\n latest_events_subquery.c.partition,\n latest_events_subquery.c.run_id,\n ]\n ).where(\n latest_events_subquery.c.dagster_event_type\n == DagsterEventType.ASSET_MATERIALIZATION.value\n )\n\n with self.index_connection() as conn:\n materialization_planned_rows = db_fetch_mappings(conn, 
materialization_planned_events)\n materialization_rows = db_fetch_mappings(conn, materialization_events)\n\n materialization_planned_rows_by_partition = {\n cast(str, row["partition"]): (cast(str, row["run_id"]), cast(int, row["id"]))\n for row in materialization_planned_rows\n }\n for row in materialization_rows:\n if (\n row["partition"] in materialization_planned_rows_by_partition\n and materialization_planned_rows_by_partition[cast(str, row["partition"])][0]\n == row["run_id"]\n ):\n materialization_planned_rows_by_partition.pop(cast(str, row["partition"]))\n\n return materialization_planned_rows_by_partition\n\n def _check_partitions_table(self) -> None:\n # Guards against cases where the user is not running the latest migration for\n # partitions storage. Should be updated when the partitions storage schema changes.\n if not self.has_table("dynamic_partitions"):\n raise DagsterInvalidInvocationError(\n "Using dynamic partitions definitions requires the dynamic partitions table, which"\n " currently does not exist. 
Add this table by running `dagster"\n " instance migrate`."\n )\n\n def get_dynamic_partitions(self, partitions_def_name: str) -> Sequence[str]:\n """Get the list of partition keys for a partition definition."""\n self._check_partitions_table()\n columns = [\n DynamicPartitionsTable.c.partitions_def_name,\n DynamicPartitionsTable.c.partition,\n ]\n query = (\n db_select(columns)\n .where(DynamicPartitionsTable.c.partitions_def_name == partitions_def_name)\n .order_by(DynamicPartitionsTable.c.id)\n )\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return [cast(str, row[1]) for row in rows]\n\n def has_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> bool:\n self._check_partitions_table()\n query = (\n db_select([DynamicPartitionsTable.c.partition])\n .where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n .limit(1)\n )\n with self.index_connection() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n self._check_partitions_table()\n with self.index_connection() as conn:\n existing_rows = conn.execute(\n db_select([DynamicPartitionsTable.c.partition]).where(\n db.and_(\n DynamicPartitionsTable.c.partition.in_(partition_keys),\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n )\n )\n ).fetchall()\n existing_keys = set([row[0] for row in existing_rows])\n new_keys = [\n partition_key\n for partition_key in partition_keys\n if partition_key not in existing_keys\n ]\n\n if new_keys:\n conn.execute(\n DynamicPartitionsTable.insert(),\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in new_keys\n ],\n )\n\n def delete_dynamic_partition(self, partitions_def_name: str, partition_key: str) -> None:\n 
self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n DynamicPartitionsTable.delete().where(\n db.and_(\n DynamicPartitionsTable.c.partitions_def_name == partitions_def_name,\n DynamicPartitionsTable.c.partition == partition_key,\n )\n )\n )\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return self.has_table(ConcurrencySlotsTable.name)\n\n def set_concurrency_slots(self, concurrency_key: str, num: int) -> None:\n """Allocate a set of concurrency slots.\n\n Args:\n concurrency_key (str): The key to allocate the slots for.\n num (int): The number of slots to allocate.\n """\n if num > MAX_CONCURRENCY_SLOTS:\n raise DagsterInvalidInvocationError(\n f"Cannot have more than {MAX_CONCURRENCY_SLOTS} slots per concurrency key."\n )\n if num < 0:\n raise DagsterInvalidInvocationError("Cannot have a negative number of slots.")\n\n keys_to_assign = None\n with self.index_connection() as conn:\n count_row = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n existing = cast(int, count_row[0]) if count_row else 0\n\n if existing > num:\n # need to delete some slots, favoring ones where the slot is unallocated\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .order_by(\n db_case([(ConcurrencySlotsTable.c.run_id.is_(None), 1)], else_=0).desc(),\n ConcurrencySlotsTable.c.id.desc(),\n )\n .limit(existing - num)\n ).fetchall()\n\n if rows:\n # mark rows as deleted\n conn.execute(\n ConcurrencySlotsTable.update()\n .values(deleted=True)\n .where(ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]))\n )\n\n # actually delete 
rows that are marked as deleted and are not claimed... the rest\n # will be deleted when the slots are released by the free_concurrency_slots\n conn.execute(\n ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n ConcurrencySlotsTable.c.run_id == None, # noqa: E711\n )\n )\n )\n elif num > existing:\n # need to add some slots\n rows = [\n {\n "concurrency_key": concurrency_key,\n "run_id": None,\n "step_key": None,\n "deleted": False,\n }\n for _ in range(existing, num)\n ]\n conn.execute(ConcurrencySlotsTable.insert().values(rows))\n keys_to_assign = [concurrency_key for _ in range(existing, num)]\n\n if keys_to_assign:\n # we've added some slots... if there are any pending steps, we can assign them now or\n # they will be unutilized until free_concurrency_slots is called\n self.assign_pending_steps(keys_to_assign)\n\n def has_unassigned_slots(self, concurrency_key: str) -> bool:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.assigned_timestamp != None, # noqa: E711\n )\n )\n ).fetchone()\n slots = conn.execute(\n db_select([db.func.count()])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n ).fetchone()\n pending_count = cast(int, pending_row[0]) if pending_row else 0\n slots_count = cast(int, slots[0]) if slots else 0\n return slots_count > pending_count\n\n def check_concurrency_claim(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencyClaimStatus:\n with self.index_connection() as conn:\n pending_row = conn.execute(\n db_select(\n [\n PendingStepsTable.c.assigned_timestamp,\n PendingStepsTable.c.priority,\n PendingStepsTable.c.create_timestamp,\n ]\n ).where(\n 
db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n\n if not pending_row:\n # no pending step pending_row exists, the slot is blocked and the enqueued timestamp is None\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=None,\n assigned_timestamp=None,\n enqueued_timestamp=None,\n )\n\n priority = cast(int, pending_row[1]) if pending_row[1] else None\n assigned_timestamp = cast(datetime, pending_row[0]) if pending_row[0] else None\n create_timestamp = cast(datetime, pending_row[2]) if pending_row[2] else None\n if assigned_timestamp is None:\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=ConcurrencySlotStatus.BLOCKED,\n priority=priority,\n assigned_timestamp=None,\n enqueued_timestamp=create_timestamp,\n )\n\n # pending step is assigned, check to see if it's been claimed\n slot_row = conn.execute(\n db_select([db.func.count()]).where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n\n return ConcurrencyClaimStatus(\n concurrency_key=concurrency_key,\n slot_status=(\n ConcurrencySlotStatus.CLAIMED\n if slot_row and slot_row[0]\n else ConcurrencySlotStatus.BLOCKED\n ),\n priority=priority,\n assigned_timestamp=assigned_timestamp,\n enqueued_timestamp=create_timestamp,\n )\n\n def can_claim_from_pending(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([PendingStepsTable.c.assigned_timestamp]).where(\n db.and_(\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n PendingStepsTable.c.concurrency_key == concurrency_key,\n )\n )\n ).fetchone()\n return row and row[0] is not None\n\n def 
has_pending_step(self, concurrency_key: str, run_id: str, step_key: str):\n with self.index_connection() as conn:\n row = conn.execute(\n db_select([db.func.count()])\n .select_from(PendingStepsTable)\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == concurrency_key,\n PendingStepsTable.c.run_id == run_id,\n PendingStepsTable.c.step_key == step_key,\n )\n )\n ).fetchone()\n return row and cast(int, row[0]) > 0\n\n def assign_pending_steps(self, concurrency_keys: Sequence[str]):\n if not concurrency_keys:\n return\n\n with self.index_connection() as conn:\n for key in concurrency_keys:\n row = conn.execute(\n db_select([PendingStepsTable.c.id])\n .where(\n db.and_(\n PendingStepsTable.c.concurrency_key == key,\n PendingStepsTable.c.assigned_timestamp == None, # noqa: E711\n )\n )\n .order_by(\n PendingStepsTable.c.priority.desc(),\n PendingStepsTable.c.create_timestamp.asc(),\n )\n .limit(1)\n ).fetchone()\n if row:\n conn.execute(\n PendingStepsTable.update()\n .where(PendingStepsTable.c.id == row[0])\n .values(assigned_timestamp=db.func.now())\n )\n\n def add_pending_step(\n self,\n concurrency_key: str,\n run_id: str,\n step_key: str,\n priority: Optional[int] = None,\n should_assign: bool = False,\n ):\n with self.index_connection() as conn:\n try:\n conn.execute(\n PendingStepsTable.insert().values(\n [\n dict(\n run_id=run_id,\n step_key=step_key,\n concurrency_key=concurrency_key,\n priority=priority or 0,\n assigned_timestamp=db.func.now() if should_assign else None,\n )\n ]\n )\n )\n except db_exc.IntegrityError:\n # do nothing\n pass\n\n def _remove_pending_steps(self, run_id: str, step_key: Optional[str] = None):\n query = PendingStepsTable.delete().where(PendingStepsTable.c.run_id == run_id)\n if step_key:\n query = query.where(PendingStepsTable.c.step_key == step_key)\n with self.index_connection() as conn:\n conn.execute(query)\n\n def claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str, priority: Optional[int] 
= None\n ) -> ConcurrencyClaimStatus:\n """Claim concurrency slot for step.\n\n Args:\n concurrency_keys (str): The concurrency key to claim.\n run_id (str): The run id to claim for.\n step_key (str): The step key to claim for.\n """\n # first, register the step by adding to pending queue\n if not self.has_pending_step(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n ):\n has_unassigned_slots = self.has_unassigned_slots(concurrency_key)\n self.add_pending_step(\n concurrency_key=concurrency_key,\n run_id=run_id,\n step_key=step_key,\n priority=priority,\n should_assign=has_unassigned_slots,\n )\n\n # if the step is not assigned (i.e. has not been popped from queue), block the claim\n claim_status = self.check_concurrency_claim(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n if claim_status.is_claimed or not claim_status.is_assigned:\n return claim_status\n\n # attempt to claim a concurrency slot... this should generally work because we only assign\n # based on the number of unclaimed slots, but this should act as a safeguard, using the slot\n # rows as a semaphore\n slot_status = self._claim_concurrency_slot(\n concurrency_key=concurrency_key, run_id=run_id, step_key=step_key\n )\n return claim_status.with_slot_status(slot_status)\n\n def _claim_concurrency_slot(\n self, concurrency_key: str, run_id: str, step_key: str\n ) -> ConcurrencySlotStatus:\n """Claim a concurrency slot for the step. 
Helper method that is called for steps that are\n popped off the priority queue.\n\n Args:\n concurrency_key (str): The concurrency key to claim.\n run_id (str): The run id to claim a slot for.\n step_key (str): The step key to claim a slot for.\n """\n with self.index_connection() as conn:\n result = conn.execute(\n db_select([ConcurrencySlotsTable.c.id])\n .select_from(ConcurrencySlotsTable)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.concurrency_key == concurrency_key,\n ConcurrencySlotsTable.c.step_key == None, # noqa: E711\n ConcurrencySlotsTable.c.deleted == False, # noqa: E712\n )\n )\n .with_for_update(skip_locked=True)\n .limit(1)\n ).fetchone()\n if not result or not result[0]:\n return ConcurrencySlotStatus.BLOCKED\n if not conn.execute(\n ConcurrencySlotsTable.update()\n .values(run_id=run_id, step_key=step_key)\n .where(ConcurrencySlotsTable.c.id == result[0])\n ).rowcount:\n return ConcurrencySlotStatus.BLOCKED\n\n return ConcurrencySlotStatus.CLAIMED\n\n def get_concurrency_keys(self) -> Set[str]:\n """Get the set of concurrency limited keys."""\n with self.index_connection() as conn:\n rows = conn.execute(\n db_select([ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.deleted == False) # noqa: E712\n .distinct()\n ).fetchall()\n return {cast(str, row[0]) for row in rows}\n\n def get_concurrency_info(self, concurrency_key: str) -> ConcurrencyKeyInfo:\n """Get the list of concurrency slots for a given concurrency key.\n\n Args:\n concurrency_key (str): The concurrency key to get the slots for.\n\n Returns:\n List[Tuple[str, int]]: A list of tuples of run_id and the number of slots it is\n occupying for the given concurrency key.\n """\n with self.index_connection() as conn:\n slot_query = (\n db_select(\n [\n ConcurrencySlotsTable.c.run_id,\n ConcurrencySlotsTable.c.deleted,\n db.func.count().label("count"),\n ]\n )\n .select_from(ConcurrencySlotsTable)\n 
.where(ConcurrencySlotsTable.c.concurrency_key == concurrency_key)\n .group_by(ConcurrencySlotsTable.c.run_id, ConcurrencySlotsTable.c.deleted)\n )\n slot_rows = db_fetch_mappings(conn, slot_query)\n pending_query = (\n db_select(\n [\n PendingStepsTable.c.run_id,\n db_case(\n [(PendingStepsTable.c.assigned_timestamp.is_(None), False)],\n else_=True,\n ).label("is_assigned"),\n db.func.count().label("count"),\n ]\n )\n .select_from(PendingStepsTable)\n .where(PendingStepsTable.c.concurrency_key == concurrency_key)\n .group_by(PendingStepsTable.c.run_id, "is_assigned")\n )\n pending_rows = db_fetch_mappings(conn, pending_query)\n\n return ConcurrencyKeyInfo(\n concurrency_key=concurrency_key,\n slot_count=sum(\n [\n cast(int, slot_row["count"])\n for slot_row in slot_rows\n if not slot_row["deleted"]\n ]\n ),\n active_slot_count=sum(\n [cast(int, slot_row["count"]) for slot_row in slot_rows if slot_row["run_id"]]\n ),\n active_run_ids={\n cast(str, slot_row["run_id"]) for slot_row in slot_rows if slot_row["run_id"]\n },\n pending_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if not row["is_assigned"]]\n ),\n pending_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if not row["is_assigned"]\n },\n assigned_step_count=sum(\n [cast(int, row["count"]) for row in pending_rows if row["is_assigned"]]\n ),\n assigned_run_ids={\n cast(str, row["run_id"]) for row in pending_rows if row["is_assigned"]\n },\n )\n\n def get_concurrency_run_ids(self) -> Set[str]:\n with self.index_connection() as conn:\n rows = conn.execute(db_select([PendingStepsTable.c.run_id]).distinct()).fetchall()\n return set([cast(str, row[0]) for row in rows])\n\n def free_concurrency_slots_for_run(self, run_id: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id)\n self._remove_pending_steps(run_id=run_id)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n 
self.assign_pending_steps(freed_concurrency_keys)\n\n def free_concurrency_slot_for_step(self, run_id: str, step_key: str) -> None:\n freed_concurrency_keys = self._free_concurrency_slots(run_id=run_id, step_key=step_key)\n self._remove_pending_steps(run_id=run_id, step_key=step_key)\n if freed_concurrency_keys:\n # assign any pending steps that can now claim a slot\n self.assign_pending_steps(freed_concurrency_keys)\n\n def _free_concurrency_slots(self, run_id: str, step_key: Optional[str] = None) -> Sequence[str]:\n """Frees concurrency slots for a given run/step.\n\n Args:\n run_id (str): The run id to free the slots for.\n step_key (Optional[str]): The step key to free the slots for. If not provided, all the\n slots for all the steps of the run will be freed.\n """\n with self.index_connection() as conn:\n # first delete any rows that apply and are marked as deleted. This happens when the\n # configured number of slots has been reduced, and some of the pruned slots included\n # ones that were already allocated to the run/step\n delete_query = ConcurrencySlotsTable.delete().where(\n db.and_(\n ConcurrencySlotsTable.c.run_id == run_id,\n ConcurrencySlotsTable.c.deleted == True, # noqa: E712\n )\n )\n if step_key:\n delete_query = delete_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n conn.execute(delete_query)\n\n # next, fetch the slots to free up, while grabbing the concurrency keys so that we can\n # allocate any pending steps from the queue for the freed slots, if necessary\n select_query = (\n db_select([ConcurrencySlotsTable.c.id, ConcurrencySlotsTable.c.concurrency_key])\n .select_from(ConcurrencySlotsTable)\n .where(ConcurrencySlotsTable.c.run_id == run_id)\n .with_for_update(skip_locked=True)\n )\n if step_key:\n select_query = select_query.where(ConcurrencySlotsTable.c.step_key == step_key)\n rows = conn.execute(select_query).fetchall()\n if not rows:\n return []\n\n # now, actually free the slots\n conn.execute(\n 
ConcurrencySlotsTable.update()\n .values(run_id=None, step_key=None)\n .where(\n db.and_(\n ConcurrencySlotsTable.c.id.in_([row[0] for row in rows]),\n )\n )\n )\n\n # return the concurrency keys for the freed slots\n return [cast(str, row[1]) for row in rows]\n\n def store_asset_check_event(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n check.inst_param(event, "event", EventLogEntry)\n check.opt_int_param(event_id, "event_id")\n\n check.invariant(\n self.supports_asset_checks,\n "Asset checks require a database schema migration. Run `dagster instance migrate`.",\n )\n\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION_PLANNED:\n self._store_asset_check_evaluation_planned(event, event_id)\n if event.dagster_event_type == DagsterEventType.ASSET_CHECK_EVALUATION:\n if event.run_id == "" or event.run_id is None:\n self._store_runless_asset_check_evaluation(event, event_id)\n else:\n self._update_asset_check_evaluation(event, event_id)\n\n def _store_asset_check_evaluation_planned(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n planned = cast(\n AssetCheckEvaluationPlanned, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=planned.asset_key.to_string(),\n check_name=planned.check_name,\n run_id=event.run_id,\n execution_status=AssetCheckExecutionRecordStatus.PLANNED.value,\n )\n )\n\n def _store_runless_asset_check_evaluation(\n self, event: EventLogEntry, event_id: Optional[int]\n ) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n conn.execute(\n AssetCheckExecutionsTable.insert().values(\n asset_key=evaluation.asset_key.to_string(),\n check_name=evaluation.check_name,\n run_id=event.run_id,\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if 
evaluation.passed\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n )\n\n def _update_asset_check_evaluation(self, event: EventLogEntry, event_id: Optional[int]) -> None:\n evaluation = cast(\n AssetCheckEvaluation, check.not_none(event.dagster_event).event_specific_data\n )\n with self.index_connection() as conn:\n rows_updated = conn.execute(\n AssetCheckExecutionsTable.update()\n .where(\n # (asset_key, check_name, run_id) uniquely identifies the row created for the planned event\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == evaluation.asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == evaluation.check_name,\n AssetCheckExecutionsTable.c.run_id == event.run_id,\n )\n )\n .values(\n execution_status=(\n AssetCheckExecutionRecordStatus.SUCCEEDED.value\n if evaluation.passed\n else AssetCheckExecutionRecordStatus.FAILED.value\n ),\n evaluation_event=serialize_value(event),\n evaluation_event_timestamp=datetime.utcfromtimestamp(event.timestamp),\n evaluation_event_storage_id=event_id,\n materialization_event_storage_id=(\n evaluation.target_materialization_data.storage_id\n if evaluation.target_materialization_data\n else None\n ),\n )\n ).rowcount\n if rows_updated != 1:\n raise DagsterInvariantViolationError(\n "Expected to update one row for asset check evaluation, but updated"\n f" {rows_updated}."\n )\n\n def get_asset_check_executions(\n self,\n asset_key: AssetKey,\n check_name: str,\n limit: int,\n cursor: Optional[int] = None,\n materialization_event_storage_id: Optional[int] = None,\n include_planned: bool = True,\n ) -> Sequence[AssetCheckExecutionRecord]:\n query = (\n db_select(\n [\n 
AssetCheckExecutionsTable.c.id,\n AssetCheckExecutionsTable.c.run_id,\n AssetCheckExecutionsTable.c.execution_status,\n AssetCheckExecutionsTable.c.evaluation_event,\n AssetCheckExecutionsTable.c.create_timestamp,\n ]\n )\n .where(\n db.and_(\n AssetCheckExecutionsTable.c.asset_key == asset_key.to_string(),\n AssetCheckExecutionsTable.c.check_name == check_name,\n )\n )\n .order_by(AssetCheckExecutionsTable.c.id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetCheckExecutionsTable.c.id < cursor)\n if not include_planned:\n query = query.where(\n AssetCheckExecutionsTable.c.execution_status\n != AssetCheckExecutionRecordStatus.PLANNED.value\n )\n if materialization_event_storage_id:\n if include_planned:\n # rows in PLANNED status are not associated with a materialization event yet\n query = query.where(\n db.or_(\n AssetCheckExecutionsTable.c.materialization_event_storage_id\n == materialization_event_storage_id,\n AssetCheckExecutionsTable.c.execution_status\n == AssetCheckExecutionRecordStatus.PLANNED.value,\n )\n )\n else:\n query = query.where(\n AssetCheckExecutionsTable.c.materialization_event_storage_id\n == materialization_event_storage_id\n )\n\n with self.index_connection() as conn:\n rows = conn.execute(query).fetchall()\n\n return [\n AssetCheckExecutionRecord(\n id=cast(int, row[0]),\n run_id=cast(str, row[1]),\n status=AssetCheckExecutionRecordStatus(row[2]),\n evaluation_event=(\n deserialize_value(cast(str, row[3]), EventLogEntry) if row[3] else None\n ),\n create_timestamp=datetime_as_float(cast(datetime, row[4])),\n )\n for row in rows\n ]\n\n @property\n def supports_asset_checks(self):\n return self.has_table(AssetCheckExecutionsTable.name)
\n\n\ndef _get_from_row(row: SqlAlchemyRow, column: str) -> object:\n """Utility function for extracting a column from a sqlalchemy row proxy, since '_asdict' is not\n supported in sqlalchemy 1.3.\n """\n if column not in row.keys():\n return None\n return row[column]\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sql_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sql_event_log"}, "sqlite": {"consolidated_sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log

\nimport logging\nimport os\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional\n\nimport sqlalchemy as db\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nfrom dagster._config import StringSource\nfrom dagster._core.storage.dagster_run import DagsterRunStatus\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.sql import (\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata\nfrom ..sql_event_log import SqlDbConnection, SqlEventLogStorage\n\nSQLITE_EVENT_LOG_FILENAME = "event_log"\n\n\n
[docs]class ConsolidatedSqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed consolidated event log storage intended for test cases only.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\n the following to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.event_log\n class: ConsolidatedSqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the database.\n """\n\n def __init__(self, base_dir, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = check.str_param(base_dir, "base_dir")\n self._conn_string = create_db_conn_string(base_dir, SQLITE_EVENT_LOG_FILENAME)\n self._secondary_index_cache = {}\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._watchers = defaultdict(dict)\n self._obs = None\n\n if not os.path.exists(self.get_db_path()):\n self._init_db()\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return ConsolidatedSqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def _init_db(self):\n mkdir_p(self._base_dir)\n engine = create_engine(self._conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n if should_mark_indexes:\n # mark all secondary indexes\n self.reindex_events()\n self.reindex_assets()\n\n @contextmanager\n def _connect(self):\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def run_connection(self, run_id: Optional[str]) -> SqlDbConnection:\n return self._connect()\n\n def index_connection(self):\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n return bool(engine.dialect.has_table(engine.connect(), table_name))\n\n def get_db_path(self):\n return os.path.join(self._base_dir, f"{SQLITE_EVENT_LOG_FILENAME}.db")\n\n def upgrade(self):\n alembic_config = get_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_secondary_index(self, name):\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n ConsolidatedSqliteEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name):\n super(ConsolidatedSqliteEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id, cursor, callback):\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n self._obs.schedule(\n ConsolidatedSqliteEventLogStorageWatchdog(self), self._base_dir, True\n )\n\n self._watchers[run_id][callback] = cursor\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False\n\n def on_modified(self):\n keys = [\n (run_id, callback)\n for 
run_id, callback_dict in self._watchers.items()\n for callback, _ in callback_dict.items()\n ]\n for run_id, callback in keys:\n cursor = self._watchers[run_id][callback]\n\n # fetch events\n connection = self.get_records_for_run(run_id, cursor)\n\n # update cursor\n if connection.cursor:\n self._watchers[run_id][callback] = connection.cursor\n\n for record in connection.records:\n status = None\n try:\n status = callback(\n record.event_log_entry,\n str(EventLogCursor.from_storage_id(record.storage_id)),\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self.end_watch(run_id, callback)\n\n def end_watch(self, run_id, handler):\n if run_id in self._watchers and handler in self._watchers[run_id]:\n del self._watchers[run_id][handler]\n\n def dispose(self):\n if self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)
\n\n\nclass ConsolidatedSqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(self, event_log_storage, **kwargs):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", ConsolidatedSqliteEventLogStorage\n )\n self._log_path = event_log_storage.get_db_path()\n super(ConsolidatedSqliteEventLogStorageWatchdog, self).__init__(\n patterns=[self._log_path], **kwargs\n )\n\n def on_modified(self, event):\n check.invariant(event.src_path == self._log_path)\n self._event_log_storage.on_modified()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/consolidated_sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.consolidated_sqlite_event_log"}, "sqlite_event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.event_log.sqlite.sqlite_event_log

\nimport contextlib\nimport glob\nimport logging\nimport os\nimport re\nimport sqlite3\nimport threading\nimport time\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Any, ContextManager, Iterable, Iterator, Optional, Sequence\n\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection, Engine\nfrom sqlalchemy.pool import NullPool\nfrom tqdm import tqdm\nfrom watchdog.events import FileSystemEvent, PatternMatchingEventHandler\nfrom watchdog.observers import Observer\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._config import StringSource\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS, EVENT_TYPE_TO_PIPELINE_RUN_STATUS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.dagster_run import DagsterRunStatus, RunsFilter\nfrom dagster._core.storage.event_log.base import EventLogCursor, EventLogRecord, EventRecordsFilter\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import (\n    ConfigurableClass,\n    ConfigurableClassData,\n)\nfrom dagster._serdes.errors import DeserializationError\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import SqlEventLogStorageMetadata, SqlEventLogStorageTable\nfrom ..sql_event_log import RunShardedEventsCursor, SqlEventLogStorage\n\nif TYPE_CHECKING:\n    from 
dagster._core.storage.sqlite_storage import SqliteStorageConfig\nINDEX_SHARD_NAME = "index"\n\n\n
[docs]class SqliteEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """SQLite-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file insqliteve\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default event log storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for event log storage, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. code-block:: YAML\n\n event_log_storage:\n module: dagster._core.storage.event_log\n class: SqliteEventLogStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the event log storage where on disk to store the databases. To\n improve concurrent performance, event logs are stored in a separate SQLite database for each\n run.\n """\n\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n """Note that idempotent initialization of the SQLite database is done on a per-run_id\n basis in the body of connect, since each run is stored in a separate database.\n """\n self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))\n mkdir_p(self._base_dir)\n\n self._obs = None\n\n self._watchers = defaultdict(dict)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n # Used to ensure that each run ID attempts to initialize its DB the first time it connects,\n # ensuring that the database will be created if it doesn't exist\n self._initialized_dbs = set()\n\n # Ensure that multiple threads (like the event log watcher) interact safely with each other\n self._db_lock = threading.Lock()\n\n if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = 
create_engine(conn_string, poolclass=NullPool)\n self._initdb(engine)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def upgrade(self) -> None:\n all_run_ids = self.get_all_run_ids()\n print(f"Updating event log storage for {len(all_run_ids)} runs on disk...") # noqa: T201\n alembic_config = get_alembic_config(__file__)\n if all_run_ids:\n for run_id in tqdm(all_run_ids):\n with self.run_connection(run_id) as conn:\n run_alembic_upgrade(alembic_config, conn, run_id)\n\n print("Updating event log storage for index db on disk...") # noqa: T201\n with self.index_connection() as conn:\n run_alembic_upgrade(alembic_config, conn, "index")\n\n self._initialized_dbs = set()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteEventLogStorage":\n return SqliteEventLogStorage(inst_data=inst_data, **config_value)\n\n def get_all_run_ids(self) -> Sequence[str]:\n all_filenames = glob.glob(os.path.join(self._base_dir, "*.db"))\n return [\n os.path.splitext(os.path.basename(filename))[0]\n for filename in all_filenames\n if os.path.splitext(os.path.basename(filename))[0] != INDEX_SHARD_NAME\n ]\n\n def has_table(self, table_name: str) -> bool:\n conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)\n engine = create_engine(conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n return bool(engine.dialect.has_table(conn, table_name))\n\n def path_for_shard(self, run_id: str) -> str:\n return os.path.join(self._base_dir, f"{run_id}.db")\n\n def conn_string_for_shard(self, shard_name: str) -> str:\n check.str_param(shard_name, "shard_name")\n return create_db_conn_string(self._base_dir, shard_name)\n\n def _initdb(self, engine: Engine) -> None:\n 
alembic_config = get_alembic_config(__file__)\n\n retry_limit = 10\n\n while True:\n try:\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n\n if not (db_revision and head_revision):\n SqlEventLogStorageMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n\n break\n except (db_exc.DatabaseError, sqlite3.DatabaseError, sqlite3.OperationalError) as exc:\n # This is SQLite-specific handling for concurrency issues that can arise when\n # multiple processes (e.g. the dagster-webserver process and user code process) contend with\n # each other to init the db. When we hit the following errors, we know that another\n # process is on the case and we should retry.\n err_msg = str(exc)\n\n if not (\n re.search(r"table [A-Za-z_]* already exists", err_msg)\n or "database is locked" in err_msg\n or "UNIQUE constraint failed: alembic_version.version_num" in err_msg\n ):\n raise\n\n if retry_limit == 0:\n raise\n else:\n logging.info(\n "SqliteEventLogStorage._initdb: Encountered apparent concurrent init, "\n "retrying (%s retries left). 
Exception: %s",\n retry_limit,\n err_msg,\n )\n time.sleep(0.2)\n retry_limit -= 1\n\n @contextmanager\n def _connect(self, shard: str) -> Iterator[Connection]:\n with self._db_lock:\n check.str_param(shard, "shard")\n\n conn_string = self.conn_string_for_shard(shard)\n engine = create_engine(conn_string, poolclass=NullPool)\n\n if shard not in self._initialized_dbs:\n self._initdb(engine)\n self._initialized_dbs.add(shard)\n\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n engine.dispose()\n\n def run_connection(self, run_id: Optional[str] = None) -> Any:\n return self._connect(run_id) # type: ignore # bad sig\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect(INDEX_SHARD_NAME)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Overridden method to replicate asset events in a central assets.db sqlite shard, enabling\n cross-run asset queries.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event)\n run_id = event.run_id\n\n with self.run_connection(run_id) as conn:\n conn.execute(insert_event_statement)\n\n if event.is_dagster_event and event.dagster_event.asset_key: # type: ignore\n check.invariant(\n event.dagster_event_type in ASSET_EVENTS,\n "Can only store asset materializations, materialization_planned, and"\n " observations in index database",\n )\n\n event_id = None\n\n # mirror the event in the cross-run index database\n with self.index_connection() as conn:\n result = conn.execute(insert_event_statement)\n event_id = result.inserted_primary_key[0]\n\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, 
None)\n\n if event.is_dagster_event and event.dagster_event_type in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n # should mirror run status change events in the index shard\n with self.index_connection() as conn:\n result = conn.execute(insert_event_statement)\n\n def get_event_records(\n self,\n event_records_filter: EventRecordsFilter,\n limit: Optional[int] = None,\n ascending: bool = False,\n ) -> Iterable[EventLogRecord]:\n """Overridden method to enable cross-run event queries in sqlite.\n\n The record id in sqlite does not auto increment cross runs, so instead of fetching events\n after record id, we only fetch events whose runs updated after update_timestamp.\n """\n check.opt_inst_param(event_records_filter, "event_records_filter", EventRecordsFilter)\n check.opt_int_param(limit, "limit")\n check.bool_param(ascending, "ascending")\n\n is_asset_query = event_records_filter and event_records_filter.event_type in ASSET_EVENTS\n if is_asset_query:\n # asset materializations, observations and materialization planned events get mirrored\n # into the index shard, so no custom run shard-aware cursor logic needed\n return super(SqliteEventLogStorage, self).get_event_records(\n event_records_filter=event_records_filter, limit=limit, ascending=ascending\n )\n\n query = db_select([SqlEventLogStorageTable.c.id, SqlEventLogStorageTable.c.event])\n if event_records_filter.asset_key:\n asset_details = next(iter(self._get_assets_details([event_records_filter.asset_key])))\n else:\n asset_details = None\n\n if event_records_filter.after_cursor is not None and not isinstance(\n event_records_filter.after_cursor, RunShardedEventsCursor\n ):\n raise Exception("""\n Called `get_event_records` on a run-sharded event log storage with a cursor that\n is not run-aware. Add a RunShardedEventsCursor to your query filter\n or switch your instance configuration to use a non-run-sharded event log storage\n (e.g. 
PostgresEventLogStorage, ConsolidatedSqliteEventLogStorage)\n """)\n\n query = self._apply_filter_to_query(\n query=query,\n event_records_filter=event_records_filter,\n asset_details=asset_details,\n apply_cursor_filters=False, # run-sharded cursor filters don't really make sense\n )\n if limit:\n query = query.limit(limit)\n if ascending:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())\n else:\n query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())\n\n # workaround for the run-shard sqlite to enable cross-run queries: get a list of run_ids\n # whose events may qualify the query, and then open run_connection per run_id at a time.\n run_updated_after = (\n event_records_filter.after_cursor.run_updated_after\n if isinstance(event_records_filter.after_cursor, RunShardedEventsCursor)\n else None\n )\n run_records = self._instance.get_run_records(\n filters=RunsFilter(updated_after=run_updated_after),\n order_by="update_timestamp",\n ascending=ascending,\n )\n\n event_records = []\n for run_record in run_records:\n run_id = run_record.dagster_run.run_id\n with self.run_connection(run_id) as conn:\n results = conn.execute(query).fetchall()\n\n for row_id, json_str in results:\n try:\n event_record = deserialize_value(json_str, EventLogEntry)\n event_records.append(\n EventLogRecord(storage_id=row_id, event_log_entry=event_record)\n )\n if limit and len(event_records) >= limit:\n break\n except DeserializationError:\n logging.warning(\n "Could not resolve event record as EventLogEntry for id `%s`.", row_id\n )\n except seven.JSONDecodeError:\n logging.warning("Could not parse event record id `%s`.", row_id)\n\n if limit and len(event_records) >= limit:\n break\n\n return event_records[:limit]\n\n def supports_event_consumer_queries(self) -> bool:\n return False\n\n def delete_events(self, run_id: str) -> None:\n with self.run_connection(run_id) as conn:\n self.delete_events_for_run(conn, run_id)\n\n # delete the mirrored event in the 
cross-run index database\n with self.index_connection() as conn:\n self.delete_events_for_run(conn, run_id)\n\n def wipe(self) -> None:\n # should delete all the run-sharded db files and drop the contents of the index\n for filename in (\n glob.glob(os.path.join(self._base_dir, "*.db"))\n + glob.glob(os.path.join(self._base_dir, "*.db-wal"))\n + glob.glob(os.path.join(self._base_dir, "*.db-shm"))\n ):\n if (\n not filename.endswith(f"{INDEX_SHARD_NAME}.db")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-wal")\n and not filename.endswith(f"{INDEX_SHARD_NAME}.db-shm")\n ):\n with contextlib.suppress(FileNotFoundError):\n os.unlink(filename)\n\n self._initialized_dbs = set()\n self._wipe_index()\n\n def _delete_mirrored_events_for_asset_key(self, asset_key: AssetKey) -> None:\n with self.index_connection() as conn:\n conn.execute(\n SqlEventLogStorageTable.delete().where(\n SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),\n )\n )\n\n def wipe_asset(self, asset_key: AssetKey) -> None:\n # default implementation will update the event_logs in the sharded dbs, and the asset_key\n # table in the asset shard, but will not remove the mirrored event_log events in the asset\n # shard\n super(SqliteEventLogStorage, self).wipe_asset(asset_key)\n self._delete_mirrored_events_for_asset_key(asset_key)\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if not self._obs:\n self._obs = Observer()\n self._obs.start()\n\n watchdog = SqliteEventLogStorageWatchdog(self, run_id, callback, cursor)\n self._watchers[run_id][callback] = (\n watchdog,\n self._obs.schedule(watchdog, self._base_dir, True),\n )\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n if handler in self._watchers[run_id]:\n event_handler, watch = self._watchers[run_id][handler]\n self._obs.remove_handler_for_watch(event_handler, watch) # type: ignore # (possible none)\n del self._watchers[run_id][handler]\n\n def dispose(self) -> None:\n if 
self._obs:\n self._obs.stop()\n self._obs.join(timeout=15)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.index_connection() as conn:\n return check_alembic_revision(alembic_config, conn)\n\n @property\n def is_run_sharded(self) -> bool:\n return True\n\n @property\n def supports_global_concurrency_limits(self) -> bool:\n return False
\n\n\nclass SqliteEventLogStorageWatchdog(PatternMatchingEventHandler):\n def __init__(\n self,\n event_log_storage: SqliteEventLogStorage,\n run_id: str,\n callback: EventHandlerFn,\n cursor: Optional[str],\n **kwargs: Any,\n ):\n self._event_log_storage = check.inst_param(\n event_log_storage, "event_log_storage", SqliteEventLogStorage\n )\n self._run_id = check.str_param(run_id, "run_id")\n self._cb = check.callable_param(callback, "callback")\n self._log_path = event_log_storage.path_for_shard(run_id)\n self._cursor = cursor\n super(SqliteEventLogStorageWatchdog, self).__init__(patterns=[self._log_path], **kwargs)\n\n def _process_log(self) -> None:\n connection = self._event_log_storage.get_records_for_run(self._run_id, self._cursor)\n if connection.cursor:\n self._cursor = connection.cursor\n for record in connection.records:\n status = None\n try:\n status = self._cb(\n record.event_log_entry, str(EventLogCursor.from_storage_id(record.storage_id))\n )\n except Exception:\n logging.exception("Exception in callback for event watch on run %s.", self._run_id)\n\n if (\n status == DagsterRunStatus.SUCCESS\n or status == DagsterRunStatus.FAILURE\n or status == DagsterRunStatus.CANCELED\n ):\n self._event_log_storage.end_watch(self._run_id, self._cb)\n\n def on_modified(self, event: FileSystemEvent) -> None:\n check.invariant(event.src_path == self._log_path)\n self._process_log()\n
", "current_page_name": "_modules/dagster/_core/storage/event_log/sqlite/sqlite_event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.event_log.sqlite.sqlite_event_log"}}}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.file_manager

\nimport io\nimport os\nimport shutil\nimport uuid\nfrom abc import ABC, abstractmethod\nfrom contextlib import contextmanager\nfrom typing import BinaryIO, ContextManager, Iterator, Optional, TextIO, Union\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource, resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._utils import mkdir_p\n\nfrom .temp_file_manager import TempfileManager\n\nIOStream: TypeAlias = Union[TextIO, BinaryIO]\n\n\n
[docs]class FileHandle(ABC):\n """A reference to a file as manipulated by a FileManager.\n\n Subclasses may handle files that are resident on the local file system, in an object store, or\n in any arbitrary place where a file can be stored.\n\n This exists to handle the very common case where you wish to write a computation that reads,\n transforms, and writes files, but where you also want the same code to work in local development\n as well as on a cluster where the files will be stored in a globally available object store\n such as S3.\n """\n\n @public\n @property\n @abstractmethod\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n raise NotImplementedError()
\n\n\n
[docs]class LocalFileHandle(FileHandle):\n """A reference to a file on a local filesystem."""\n\n def __init__(self, path: str):\n self._path = check.str_param(path, "path")\n\n @public\n @property\n def path(self) -> str:\n """The file's path."""\n return self._path\n\n @public\n @property\n def path_desc(self) -> str:\n """A representation of the file path for display purposes only."""\n return self._path
\n\n\n
[docs]class FileManager(ABC):\n """Base class for all file managers in dagster.\n\n The file manager is an interface that can be implemented by resources to provide abstract\n access to a file system such as local disk, S3, or other cloud storage.\n\n For examples of usage, see the documentation of the concrete file manager implementations.\n """\n\n
[docs] @public\n @abstractmethod\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n """Copy a file represented by a file handle to a temp file.\n\n In an implementation built around an object store such as S3, this method would be expected\n to download the file from S3 to local filesystem in a location assigned by the standard\n library's :py:mod:`python:tempfile` module.\n\n Temp files returned by this method are *not* guaranteed to be reusable across solid\n boundaries. For files that must be available across solid boundaries, use the\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.read_data`,\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write`, and\n :py:meth:`~dagster._core.storage.file_manager.FileManager.write_data` methods.\n\n Args:\n file_handle (FileHandle): The handle to the file to make available as a local temp file.\n\n Returns:\n str: Path to the local temp file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def delete_local_temp(self) -> None:\n """Delete all local temporary files created by previous calls to\n :py:meth:`~dagster._core.storage.file_manager.FileManager.copy_handle_to_local_temp`.\n\n Should typically only be called by framework implementors.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read(self, file_handle: FileHandle, mode: str = "rb") -> ContextManager[IOStream]:\n """Return a file-like stream for the file handle.\n\n This may incur an expensive network call for file managers backed by object stores\n such as S3.\n\n Args:\n file_handle (FileHandle): The file handle to make available as a stream.\n mode (str): The mode in which to open the file. Default: ``"rb"``.\n\n Returns:\n Union[TextIO, BinaryIO]: A file-like stream.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def read_data(self, file_handle: FileHandle) -> bytes:\n """Return the bytes for a given file handle. This may incur an expensive network\n call for file managers backed by object stores such as s3.\n\n Args:\n file_handle (FileHandle): The file handle for which to return bytes.\n\n Returns:\n bytes: Bytes for a given file handle.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write(self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None) -> FileHandle:\n """Write the bytes contained within the given file object into the file manager.\n\n Args:\n file_obj (Union[TextIO, BinaryIO]): A file-like object.\n mode (Optional[str]): The mode in which to write the file into the file manager.\n Default: ``"wb"``.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n
[docs] @public\n @abstractmethod\n def write_data(self, data: bytes, ext: Optional[str] = None) -> FileHandle:\n """Write raw bytes into the file manager.\n\n Args:\n data (bytes): The bytes to write into the file manager.\n ext (Optional[str]): For file managers that support file extensions, the extension with\n which to write the file. Default: ``None``.\n\n Returns:\n FileHandle: A handle to the newly created file.\n """\n raise NotImplementedError()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema={"base_dir": Field(StringSource, is_required=False)})\ndef local_file_manager(init_context: InitResourceContext) -> "LocalFileManager":\n """FileManager that provides abstract access to a local filesystem.\n\n By default, files will be stored in `<local_artifact_storage>/storage/file_manager` where\n `<local_artifact_storage>` can be configured in the ``dagster.yaml`` file in ``$DAGSTER_HOME``.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n\n Examples:\n .. code-block:: python\n\n import tempfile\n\n from dagster import job, local_file_manager, op\n\n\n @op(required_resource_keys={"file_manager"})\n def write_files(context):\n fh_1 = context.resources.file_manager.write_data(b"foo")\n\n with tempfile.NamedTemporaryFile("w+") as fd:\n fd.write("bar")\n fd.seek(0)\n fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n return (fh_1, fh_2)\n\n\n @op(required_resource_keys={"file_manager"})\n def read_files(context, file_handles):\n fh_1, fh_2 = file_handles\n assert context.resources.file_manager.read_data(fh_1) == b"foo"\n fd = context.resources.file_manager.read(fh_2, mode="r")\n assert fd.read() == "bar"\n fd.close()\n\n\n @job(resource_defs={"file_manager": local_file_manager})\n def files_pipeline():\n read_files(write_files())\n\n Or to specify the file directory:\n\n .. code-block:: python\n\n @job(\n resource_defs={\n "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n }\n )\n def files_pipeline():\n read_files(write_files())\n """\n return LocalFileManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "file_manager") # type: ignore # (possible none)\n )\n )
\n\n\ndef check_file_like_obj(obj: object) -> None:\n check.invariant(obj and hasattr(obj, "read") and hasattr(obj, "write"))\n\n\nclass LocalFileManager(FileManager):\n def __init__(self, base_dir: str):\n self.base_dir = base_dir\n self._base_dir_ensured = False\n self._temp_file_manager = TempfileManager()\n\n @staticmethod\n def for_instance(instance: DagsterInstance, run_id: str) -> "LocalFileManager":\n check.inst_param(instance, "instance", DagsterInstance)\n return LocalFileManager(instance.file_manager_directory(run_id))\n\n def ensure_base_dir_exists(self) -> None:\n if self._base_dir_ensured:\n return\n\n mkdir_p(self.base_dir)\n\n self._base_dir_ensured = True\n\n def copy_handle_to_local_temp(self, file_handle: FileHandle) -> str:\n check.inst_param(file_handle, "file_handle", FileHandle)\n with self.read(file_handle, "rb") as handle_obj: # type: ignore # (??)\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_file_obj.write(handle_obj.read())\n temp_name = temp_file_obj.name\n temp_file_obj.close()\n return temp_name\n\n @contextmanager\n def read(self, file_handle: LocalFileHandle, mode: str = "rb") -> Iterator[IOStream]:\n check.inst_param(file_handle, "file_handle", LocalFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n encoding = None if mode == "rb" else "utf8"\n with open(file_handle.path, mode, encoding=encoding) as file_obj:\n yield file_obj # type: ignore # (??)\n\n def read_data(self, file_handle: LocalFileHandle) -> bytes:\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read() # type: ignore # (??)\n\n def write_data(self, data: bytes, ext: Optional[str] = None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(\n self, file_obj: IOStream, mode: str = "wb", ext: Optional[str] = None\n ) -> LocalFileHandle:\n check_file_like_obj(file_obj)\n check.opt_str_param(ext, "ext")\n\n 
self.ensure_base_dir_exists()\n\n dest_file_path = os.path.join(\n self.base_dir, str(uuid.uuid4()) + (("." + ext) if ext is not None else "")\n )\n\n encoding = None if "b" in mode else "utf8"\n with open(dest_file_path, mode, encoding=encoding) as dest_file_obj:\n shutil.copyfileobj(file_obj, dest_file_obj) # type: ignore # (??)\n return LocalFileHandle(dest_file_path)\n\n def delete_local_temp(self) -> None:\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster/_core/storage/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.file_manager"}, "fs_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.fs_io_manager

\nimport os\nimport pickle\nfrom typing import TYPE_CHECKING, Any, Optional\n\nfrom pydantic import Field\n\nimport dagster._check as check\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Field as DagsterField,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import StringSource\nfrom dagster._config.pythonic_config import ConfigurableIOManagerFactory\nfrom dagster._core.definitions.events import AssetKey, AssetMaterialization\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\nif TYPE_CHECKING:\n    from typing_extensions import Literal\n    from upath import UPath\n\n\n
[docs]class FilesystemIOManager(ConfigurableIOManagerFactory["PickledObjectFilesystemIOManager"]):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, FilesystemIOManager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n },\n )\n\n\n 2. Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. 
code-block:: python\n\n from dagster import FilesystemIOManager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import FilesystemIOManager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": FilesystemIOManager()})\n def job():\n op_b(op_a())\n\n """\n\n base_dir: Optional[str] = Field(default=None, description="Base directory for storing files.")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n base_dir = self.base_dir or check.not_none(context.instance).storage_directory()\n return PickledObjectFilesystemIOManager(base_dir=base_dir)
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=FilesystemIOManager.to_config_schema(),\n description="Built-in filesystem IO manager that stores and retrieves values using pickling.",\n)\ndef fs_io_manager(init_context: InitResourceContext) -> "PickledObjectFilesystemIOManager":\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n\n The base directory that the pickle files live inside is determined by:\n\n * The IO manager's "base_dir" configuration value, if specified. Otherwise...\n * A "storage/" directory underneath the value for "local_artifact_storage" in your dagster.yaml\n file, if specified. Otherwise...\n * A "storage/" directory underneath the directory that the DAGSTER_HOME environment variable\n points to, if that environment variable is specified. Otherwise...\n * A temporary directory.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n So, with a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n\n 1. Attach an IO manager to a set of assets using the reserved resource key ``"io_manager"``.\n\n .. code-block:: python\n\n from dagster import Definitions, asset, fs_io_manager\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n },\n )\n\n\n 2. 
Specify a job-level IO manager using the reserved resource key ``"io_manager"``,\n which will set the given IO manager on all ops in a job.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op\n\n @op\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(\n resource_defs={\n "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n }\n )\n def job():\n op_b(op_a())\n\n\n 3. Specify IO manager on :py:class:`Out`, which allows you to set different IO managers on\n different step outputs.\n\n .. code-block:: python\n\n from dagster import fs_io_manager, job, op, Out\n\n @op(out=Out(io_manager_key="my_io_manager"))\n def op_a():\n # create df ...\n return df\n\n @op\n def op_b(df):\n return df[:5]\n\n @job(resource_defs={"my_io_manager": fs_io_manager})\n def job():\n op_b(op_a())\n\n """\n return FilesystemIOManager.from_resource_context(init_context)
\n\n\nclass PickledObjectFilesystemIOManager(UPathIOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling.\n Is compatible with local and remote filesystems via `universal-pathlib` and `fsspec`.\n Learn more about how to use remote filesystems here: https://github.com/fsspec/universal_pathlib.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n **kwargs: additional keyword arguments for `universal_pathlib.UPath`.\n """\n\n extension: str = "" # TODO: maybe change this to .pickle? Leaving blank for compatibility.\n\n def __init__(self, base_dir=None, **kwargs):\n from upath import UPath\n\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n\n super().__init__(base_path=UPath(base_dir, **kwargs))\n\n def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):\n try:\n with path.open("wb") as file:\n pickle.dump(obj, file, PICKLE_PROTOCOL)\n except (AttributeError, RecursionError, ImportError, pickle.PicklingError) as e:\n executor = context.step_context.job_def.executor_def\n\n if isinstance(e, RecursionError):\n # if obj can't be pickled because of RecursionError then __str__() will also\n # throw a RecursionError\n obj_repr = f"{obj.__class__} exceeds recursion limit and"\n else:\n obj_repr = obj.__str__()\n\n raise DagsterInvariantViolationError(\n f"Object {obj_repr} is not picklable. You are currently using the "\n f"fs_io_manager and the {executor.name}. You will need to use a different "\n "io manager to continue using this output. 
For example, you can use the "\n "mem_io_manager with the in_process_executor.\\n"\n "For more information on io managers, visit "\n "https://docs.dagster.io/concepts/io-management/io-managers \\n"\n "For more information on executors, visit "\n "https://docs.dagster.io/deployment/executors#overview"\n ) from e\n\n def load_from_path(self, context: InputContext, path: "UPath") -> Any:\n with path.open("rb") as file:\n return pickle.load(file)\n\n\nclass CustomPathPickledObjectFilesystemIOManager(IOManager):\n """Built-in filesystem IO manager that stores and retrieves values using pickling and\n allows users to specify file path for outputs.\n\n Args:\n base_dir (Optional[str]): base directory where all the step outputs which use this object\n manager will be stored in.\n """\n\n def __init__(self, base_dir: Optional[str] = None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode: Literal["wb"] = "wb"\n self.read_mode: Literal["rb"] = "rb"\n\n def _get_path(self, path: str) -> str:\n return os.path.join(self.base_dir, path) # type: ignore # (possible none)\n\n def handle_output(self, context: OutputContext, obj: object):\n """Pickle the data and store the object to a custom file path.\n\n This method emits an AssetMaterialization event so the assets will be tracked by the\n Asset Catalog.\n """\n check.inst_param(context, "context", OutputContext)\n metadata = context.metadata\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n\n filepath = self._get_path(path)\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n context.log.debug(f"Writing file at: {filepath}")\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n return AssetMaterialization(\n asset_key=AssetKey([context.job_name, context.step_key, context.name]),\n metadata={"path": MetadataValue.path(os.path.abspath(filepath))},\n )\n\n def load_input(self, context: 
InputContext) -> object:\n """Unpickle the file from a given file path and Load it to a data object."""\n check.inst_param(context, "context", InputContext)\n metadata = context.upstream_output.metadata # type: ignore # (possible none)\n path = check.str_param(metadata.get("path"), "metadata.path") # type: ignore # (possible none)\n filepath = self._get_path(path)\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": DagsterField(StringSource, is_required=True)})\n@experimental\ndef custom_path_fs_io_manager(\n init_context: InitResourceContext,\n) -> CustomPathPickledObjectFilesystemIOManager:\n """Built-in IO manager that allows users to custom output file path per output definition.\n\n It requires users to specify a base directory where all the step output will be stored in. It\n serializes and deserializes output values (assets) using pickling and stores the pickled object\n in the user-provided file paths.\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import custom_path_fs_io_manager, job, op\n\n @op(out=Out(metadata={"path": "path/to/sample_output"}))\n def sample_data(df):\n return df[:5]\n\n my_custom_path_fs_io_manager = custom_path_fs_io_manager.configured(\n {"base_dir": "path/to/basedir"}\n )\n\n @job(resource_defs={"io_manager": my_custom_path_fs_io_manager})\n def my_job():\n sample_data()\n\n """\n return CustomPathPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get("base_dir")\n )\n
", "current_page_name": "_modules/dagster/_core/storage/fs_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.fs_io_manager"}, "input_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.input_manager

\nfrom abc import ABC, abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Callable, Optional, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import has_at_least_one_parameter\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition, ResourceFunction\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.input import InputContext\n\nInputLoadFn: TypeAlias = Union[\n    Callable[["InputContext"], object],\n    Callable[[], object],\n]\n\n\n
[docs]class InputManager(ABC):\n """Base interface for classes that are responsible for loading solid inputs."""\n\n @abstractmethod\n def load_input(self, context: "InputContext") -> object:\n """The user-defined read method that loads an input to a solid.\n\n Args:\n context (InputContext): The input context.\n\n Returns:\n Any: The data object.\n """
\n\n\nclass IInputManagerDefinition:\n @property\n @abstractmethod\n def input_config_schema(self) -> IDefinitionConfigSchema:\n """The schema for per-input configuration for inputs that are managed by this\n input manager.\n """\n\n\n
[docs]class InputManagerDefinition(ResourceDefinition, IInputManagerDefinition):\n """Definition of an input manager resource.\n\n Input managers load op inputs.\n\n An InputManagerDefinition is a :py:class:`ResourceDefinition` whose resource_fn returns an\n :py:class:`InputManager`.\n\n The easiest way to create an InputManagerDefinition is with the\n :py:func:`@input_manager <input_manager>` decorator.\n """\n\n def __init__(\n self,\n resource_fn: ResourceFunction,\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n super(InputManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "InputManagerDefinition":\n return InputManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n )
\n\n\n@overload\ndef input_manager(\n config_schema: InputLoadFn,\n) -> InputManagerDefinition: ...\n\n\n@overload\ndef input_manager(\n config_schema: Optional[CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[InputLoadFn], InputManagerDefinition]: ...\n\n\n
[docs]def input_manager(\n config_schema: Union[InputLoadFn, Optional[CoercableToConfigSchema]] = None,\n description: Optional[str] = None,\n input_config_schema: Optional[CoercableToConfigSchema] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n) -> Union[InputManagerDefinition, Callable[[InputLoadFn], InputManagerDefinition]]:\n """Define an input manager.\n\n Input managers load op inputs, either from upstream outputs or by providing default values.\n\n The decorated function should accept a :py:class:`InputContext` and resource config, and return\n a loaded object that will be passed into one of the inputs of an op.\n\n The decorator produces an :py:class:`InputManagerDefinition`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource-level config. If not\n set, Dagster will accept any config provided.\n description (Optional[str]): A human-readable description of the resource.\n input_config_schema (Optional[ConfigSchema]): A schema for the input-level config. Each\n input that uses this input manager can be configured separately using this config.\n If not set, Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the input\n manager.\n version (Optional[str]): (Experimental) the version of the input manager definition.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import input_manager, op, job, In\n\n @input_manager\n def csv_loader(_):\n return read_csv("some/path")\n\n @op(ins={"input1": In(input_manager_key="csv_loader_key")})\n def my_op(_, input1):\n do_stuff(input1)\n\n @job(resource_defs={"csv_loader_key": csv_loader})\n def my_job():\n my_op()\n\n @input_manager(config_schema={"base_dir": str})\n def csv_loader(context):\n return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n @input_manager(input_config_schema={"path": str})\n def csv_loader(context):\n return read_csv(context.config["path"])\n """\n if _is_input_load_fn(config_schema):\n return _InputManagerDecoratorCallable()(config_schema)\n\n def _wrap(load_fn: InputLoadFn) -> InputManagerDefinition:\n return _InputManagerDecoratorCallable(\n config_schema=cast(CoercableToConfigSchema, config_schema),\n description=description,\n version=version,\n input_config_schema=input_config_schema,\n required_resource_keys=required_resource_keys,\n )(load_fn)\n\n return _wrap
\n\n\ndef _is_input_load_fn(obj: Union[InputLoadFn, CoercableToConfigSchema]) -> TypeGuard[InputLoadFn]:\n return callable(obj) and not is_callable_valid_config_arg(obj)\n\n\nclass InputManagerWrapper(InputManager):\n def __init__(self, load_fn: InputLoadFn):\n self._load_fn = load_fn\n\n def load_input(self, context: "InputContext") -> object:\n # the @input_manager decorated function (self._load_fn) may return a direct value that\n # should be used or an instance of an InputManager. So we call self._load_fn and see if the\n # result is an InputManager. If so we call its load_input method\n intermediate = (\n # type-ignore because function being used as attribute\n self._load_fn(context)\n if has_at_least_one_parameter(self._load_fn)\n else self._load_fn() # type: ignore # (strict type guard)\n )\n\n if isinstance(intermediate, InputManager):\n return intermediate.load_input(context)\n return intermediate\n\n\nclass _InputManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n ):\n self.config_schema = config_schema\n self.description = check.opt_str_param(description, "description")\n self.version = check.opt_str_param(version, "version")\n self.input_config_schema = input_config_schema\n self.required_resource_keys = required_resource_keys\n\n def __call__(self, load_fn: InputLoadFn) -> InputManagerDefinition:\n check.callable_param(load_fn, "load_fn")\n\n def _resource_fn(_):\n return InputManagerWrapper(load_fn)\n\n input_manager_def = InputManagerDefinition(\n resource_fn=_resource_fn,\n config_schema=self.config_schema,\n description=self.description,\n version=self.version,\n input_config_schema=self.input_config_schema,\n required_resource_keys=self.required_resource_keys,\n )\n\n # `update_wrapper` typing cannot currently 
handle a Union of Callables correctly\n update_wrapper(input_manager_def, wrapped=load_fn) # type: ignore\n\n return input_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/input_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.input_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.io_manager

\nfrom abc import abstractmethod\nfrom functools import update_wrapper\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Optional, Set, Union, cast, overload\n\nfrom typing_extensions import TypeAlias, TypeGuard\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._config import UserConfigSchema\nfrom dagster._core.definitions.config import is_callable_valid_config_arg\nfrom dagster._core.definitions.definition_config_schema import (\n    CoercableToConfigSchema,\n    IDefinitionConfigSchema,\n    convert_user_facing_definition_config_schema,\n)\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.storage.input_manager import IInputManagerDefinition, InputManager\nfrom dagster._core.storage.output_manager import IOutputManagerDefinition, OutputManager\n\nfrom ..decorator_utils import get_function_params\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.init import InitResourceContext\n    from dagster._core.execution.context.input import InputContext\n    from dagster._core.execution.context.output import OutputContext\n\nIOManagerFunctionWithContext = Callable[["InitResourceContext"], "IOManager"]\nIOManagerFunction: TypeAlias = Union[\n    IOManagerFunctionWithContext,\n    Callable[[], "IOManager"],\n]\n\n\ndef is_io_manager_context_provided(\n    fn: IOManagerFunction,\n) -> TypeGuard[IOManagerFunctionWithContext]:\n    return len(get_function_params(fn)) >= 1\n\n\n
[docs]class IOManagerDefinition(ResourceDefinition, IInputManagerDefinition, IOutputManagerDefinition):\n """Definition of an IO manager resource.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n An IOManagerDefinition is a :py:class:`ResourceDefinition` whose `resource_fn` returns an\n :py:class:`IOManager`.\n\n The easiest way to create an IOManagerDefinition is with the :py:func:`@io_manager <io_manager>`\n decorator.\n """\n\n def __init__(\n self,\n resource_fn: IOManagerFunction,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n version: Optional[str] = None,\n input_config_schema: CoercableToConfigSchema = None,\n output_config_schema: CoercableToConfigSchema = None,\n ):\n self._input_config_schema = convert_user_facing_definition_config_schema(\n input_config_schema\n )\n # Unlike other configurable objects, whose config schemas default to Any,\n # output_config_schema defaults to None. 
This the because IOManager input / output config\n # shares config namespace with dagster type loaders.\n self._output_config_schema = (\n convert_user_facing_definition_config_schema(output_config_schema)\n if output_config_schema is not None\n else None\n )\n super(IOManagerDefinition, self).__init__(\n resource_fn=resource_fn,\n config_schema=config_schema,\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n )\n\n @property\n def input_config_schema(self) -> IDefinitionConfigSchema:\n return self._input_config_schema\n\n @property\n def output_config_schema(self) -> Optional[IDefinitionConfigSchema]:\n return self._output_config_schema\n\n def copy_for_configured(\n self,\n description: Optional[str],\n config_schema: CoercableToConfigSchema,\n ) -> "IOManagerDefinition":\n io_def = IOManagerDefinition(\n config_schema=config_schema,\n description=description or self.description,\n resource_fn=self.resource_fn,\n required_resource_keys=self.required_resource_keys,\n input_config_schema=self.input_config_schema,\n output_config_schema=self.output_config_schema,\n )\n\n io_def._dagster_maintained = self._is_dagster_maintained() # noqa: SLF001\n\n return io_def\n\n
[docs] @public\n @staticmethod\n def hardcoded_io_manager(\n value: "IOManager", description: Optional[str] = None\n ) -> "IOManagerDefinition":\n """A helper function that creates an ``IOManagerDefinition`` with a hardcoded IOManager.\n\n Args:\n value (IOManager): A hardcoded IO Manager which helps mock the definition.\n description ([Optional[str]]): The description of the IO Manager. Defaults to None.\n\n Returns:\n [IOManagerDefinition]: A hardcoded resource.\n """\n check.inst_param(value, "value", IOManager)\n return IOManagerDefinition(resource_fn=lambda _init_context: value, description=description)
\n\n\n
[docs]class IOManager(InputManager, OutputManager):\n """Base class for user-provided IO managers.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n Extend this class to handle how objects are loaded and stored. Users should implement\n ``handle_output`` to store an object and ``load_input`` to retrieve an object.\n """\n\n
[docs] @public\n @abstractmethod\n def load_input(self, context: "InputContext") -> Any:\n """User-defined method that loads an input to an op.\n\n Args:\n context (InputContext): The input context, which describes the input that's being loaded\n and the upstream output that's being loaded from.\n\n Returns:\n Any: The data object.\n """
\n\n
[docs] @public\n @abstractmethod\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n """User-defined method that stores an output of an op.\n\n Args:\n context (OutputContext): The context of the step output that produces this object.\n obj (Any): The object, returned by the op, to be stored.\n """
\n\n\n@overload\ndef io_manager(config_schema: IOManagerFunction) -> IOManagerDefinition: ...\n\n\n@overload\ndef io_manager(\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Callable[[IOManagerFunction], IOManagerDefinition]: ...\n\n\n
[docs]def io_manager(\n config_schema: Union[IOManagerFunction, CoercableToConfigSchema] = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n) -> Union[IOManagerDefinition, Callable[[IOManagerFunction], IOManagerDefinition],]:\n """Define an IO manager.\n\n IOManagers are used to store op outputs and load them as inputs to downstream ops.\n\n The decorated function should accept an :py:class:`InitResourceContext` and return an\n :py:class:`IOManager`.\n\n Args:\n config_schema (Optional[ConfigSchema]): The schema for the resource config. Configuration\n data available in `init_context.resource_config`. If not set, Dagster will accept any\n config provided.\n description(Optional[str]): A human-readable description of the resource.\n output_config_schema (Optional[ConfigSchema]): The schema for per-output config. If not set,\n no per-output configuration will be allowed.\n input_config_schema (Optional[ConfigSchema]): The schema for per-input config. If not set,\n Dagster will accept any config provided.\n required_resource_keys (Optional[Set[str]]): Keys for the resources required by the object\n manager.\n version (Optional[str]): (Experimental) The version of a resource function. Two wrapped\n resource functions should only have the same version if they produce the same resource\n definition when provided with the same inputs.\n\n **Examples:**\n\n .. 
code-block:: python\n\n class MyIOManager(IOManager):\n def handle_output(self, context, obj):\n write_csv("some/path")\n\n def load_input(self, context):\n return read_csv("some/path")\n\n @io_manager\n def my_io_manager(init_context):\n return MyIOManager()\n\n @op(out=Out(io_manager_key="my_io_manager_key"))\n def my_op(_):\n return do_stuff()\n\n @job(resource_defs={"my_io_manager_key": my_io_manager})\n def my_job():\n my_op()\n\n """\n if callable(config_schema) and not is_callable_valid_config_arg(config_schema):\n config_schema = cast(IOManagerFunction, config_schema)\n return _IOManagerDecoratorCallable()(config_schema)\n\n def _wrap(resource_fn: IOManagerFunction) -> IOManagerDefinition:\n return _IOManagerDecoratorCallable(\n config_schema=cast(Optional[UserConfigSchema], config_schema),\n description=description,\n required_resource_keys=required_resource_keys,\n version=version,\n output_config_schema=output_config_schema,\n input_config_schema=input_config_schema,\n )(resource_fn)\n\n return _wrap
\n\n\ndef dagster_maintained_io_manager(io_manager_def: IOManagerDefinition) -> IOManagerDefinition:\n io_manager_def._dagster_maintained = True # noqa: SLF001\n return io_manager_def\n\n\nclass _IOManagerDecoratorCallable:\n def __init__(\n self,\n config_schema: CoercableToConfigSchema = None,\n description: Optional[str] = None,\n output_config_schema: CoercableToConfigSchema = None,\n input_config_schema: CoercableToConfigSchema = None,\n required_resource_keys: Optional[Set[str]] = None,\n version: Optional[str] = None,\n ):\n # type validation happens in IOManagerDefinition\n self.config_schema = config_schema\n self.description = description\n self.required_resource_keys = required_resource_keys\n self.version = version\n self.output_config_schema = output_config_schema\n self.input_config_schema = input_config_schema\n\n def __call__(self, fn: IOManagerFunction) -> IOManagerDefinition:\n check.callable_param(fn, "fn")\n\n io_manager_def = IOManagerDefinition(\n resource_fn=fn,\n config_schema=self.config_schema,\n description=self.description,\n required_resource_keys=self.required_resource_keys,\n version=self.version,\n output_config_schema=self.output_config_schema,\n input_config_schema=self.input_config_schema,\n )\n\n # `update_wrapper` typing cannot currently handle a Union of Callables correctly\n update_wrapper(io_manager_def, wrapped=fn) # type: ignore\n\n return io_manager_def\n
", "current_page_name": "_modules/dagster/_core/storage/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.io_manager"}, "local_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.local_compute_log_manager

\nimport hashlib\nimport os\nimport shutil\nimport sys\nfrom collections import defaultdict\nfrom contextlib import contextmanager\nfrom typing import IO, TYPE_CHECKING, Generator, Iterator, Mapping, Optional, Sequence, Tuple\n\nfrom typing_extensions import Final\nfrom watchdog.events import PatternMatchingEventHandler\nfrom watchdog.observers.polling import PollingObserver\n\nfrom dagster import (\n    Field,\n    Float,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.execution.compute_logs import mirror_stream_to_file\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._seven import json\nfrom dagster._utils import ensure_dir, ensure_file, touch_file\n\nfrom .captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n    ComputeLogSubscription,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.storage.cloud_storage_compute_log_manager import LogSubscription\n\nDEFAULT_WATCHDOG_POLLING_TIMEOUT: Final = 2.5\n\nIO_TYPE_EXTENSION: Final[Mapping[ComputeIOType, str]] = {\n    ComputeIOType.STDOUT: "out",\n    ComputeIOType.STDERR: "err",\n}\n\nMAX_FILENAME_LENGTH: Final = 255\n\n\n
[docs]class LocalComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """Stores copies of stdout & stderr for each compute step locally on disk."""\n\n def __init__(\n self,\n base_dir: str,\n polling_timeout: Optional[float] = None,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._base_dir = base_dir\n self._polling_timeout = check.opt_float_param(\n polling_timeout, "polling_timeout", DEFAULT_WATCHDOG_POLLING_TIMEOUT\n )\n self._subscription_manager = LocalComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def polling_timeout(self) -> float:\n return self._polling_timeout\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {\n "base_dir": StringSource,\n "polling_timeout": Field(Float, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value\n ) -> "LocalComputeLogManager":\n return LocalComputeLogManager(inst_data=inst_data, **config_value)\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n outpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT])\n errpath = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR])\n with mirror_stream_to_file(sys.stdout, outpath), mirror_stream_to_file(sys.stderr, errpath):\n yield CapturedLogContext(log_key)\n\n # leave artifact on filesystem so that we know the capture is completed\n touch_file(self.complete_artifact_path(log_key))\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Iterator[Optional[IO]]:\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n with open(path, "+a", encoding="utf-8") as 
f:\n yield f\n\n def is_capture_complete(self, log_key: Sequence[str]) -> bool:\n return os.path.exists(self.complete_artifact_path(log_key))\n\n def get_log_data(\n self, log_key: Sequence[str], cursor: Optional[str] = None, max_bytes: Optional[int] = None\n ) -> CapturedLogData:\n stdout_cursor, stderr_cursor = self.parse_cursor(cursor)\n stdout, stdout_offset = self._read_bytes(\n log_key, ComputeIOType.STDOUT, offset=stdout_cursor, max_bytes=max_bytes\n )\n stderr, stderr_offset = self._read_bytes(\n log_key, ComputeIOType.STDERR, offset=stderr_cursor, max_bytes=max_bytes\n )\n return CapturedLogData(\n log_key=log_key,\n stdout=stdout,\n stderr=stderr,\n cursor=self.build_cursor(stdout_offset, stderr_offset),\n )\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata(\n stdout_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]\n ),\n stderr_location=self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]\n ),\n stdout_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDOUT),\n stderr_download_url=self.get_captured_log_download_url(log_key, ComputeIOType.STDERR),\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n if log_key:\n paths = [\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n self.get_captured_local_path(log_key, "complete"),\n ]\n for path in paths:\n if os.path.exists(path) and os.path.isfile(path):\n os.remove(path)\n elif prefix:\n dir_to_delete = os.path.join(self._base_dir, *prefix)\n if os.path.exists(dir_to_delete) and 
os.path.isdir(dir_to_delete):\n # recursively delete all files in dir\n shutil.rmtree(dir_to_delete)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def _read_bytes(\n self,\n log_key: Sequence[str],\n io_type: ComputeIOType,\n offset: Optional[int] = 0,\n max_bytes: Optional[int] = None,\n ):\n path = self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n return self.read_path(path, offset or 0, max_bytes)\n\n def parse_cursor(self, cursor: Optional[str] = None) -> Tuple[int, int]:\n # Translates a string cursor into a set of byte offsets for stdout, stderr\n if not cursor:\n return 0, 0\n\n parts = cursor.split(":")\n if not parts or len(parts) != 2:\n return 0, 0\n\n stdout, stderr = [int(_) for _ in parts]\n return stdout, stderr\n\n def build_cursor(self, stdout_offset: int, stderr_offset: int) -> str:\n return f"{stdout_offset}:{stderr_offset}"\n\n def complete_artifact_path(self, log_key):\n return self.get_captured_local_path(log_key, "complete")\n\n def read_path(\n self,\n path: str,\n offset: int = 0,\n max_bytes: Optional[int] = None,\n ):\n if not os.path.exists(path) or not os.path.isfile(path):\n return None, offset\n\n with open(path, "rb") as f:\n f.seek(offset, os.SEEK_SET)\n if max_bytes is None:\n data = f.read()\n else:\n data = f.read(max_bytes)\n new_offset = f.tell()\n return data, new_offset\n\n def get_captured_log_download_url(self, log_key, io_type):\n check.inst_param(io_type, "io_type", ComputeIOType)\n url = "/logs"\n for part in log_key:\n url = f"{url}/{part}"\n\n return f"{url}/{IO_TYPE_EXTENSION[io_type]}"\n\n def get_captured_local_path(self, log_key: Sequence[str], extension: str, partial=False):\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n if len(filename) > MAX_FILENAME_LENGTH:\n filename = "{}.{}".format(hashlib.md5(filebase.encode("utf-8")).hexdigest(), extension)\n return 
os.path.join(self._base_dir, *namespace, filename)\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n subscription = CapturedLogSubscription(self, log_key, cursor)\n self.on_subscribe(subscription)\n return subscription\n\n def unsubscribe(self, subscription):\n self.on_unsubscribe(subscription)\n\n ###############################################\n #\n # Methods for the ComputeLogManager interface\n #\n ###############################################\n @contextmanager\n def _watch_logs(\n self, dagster_run: DagsterRun, step_key: Optional[str] = None\n ) -> Iterator[None]:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n with self.capture_logs(log_key):\n yield\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n """Legacy adapter from compute log manager to more generic captured log manager API."""\n check.inst_param(io_type, "io_type", ComputeIOType)\n log_key = self.build_log_key_for_run(run_id, key)\n return self.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n def read_logs_file(\n self,\n run_id: str,\n key: str,\n io_type: ComputeIOType,\n cursor: int = 0,\n max_bytes: int = MAX_BYTES_FILE_READ,\n ) -> ComputeLogFileData:\n path = self.get_local_path(run_id, key, io_type)\n\n if not os.path.exists(path) or not os.path.isfile(path):\n return ComputeLogFileData(path=path, data=None, cursor=0, size=0, download_url=None)\n\n # See: https://docs.python.org/2/library/stdtypes.html#file.tell for Windows behavior\n with open(path, "rb") as f:\n f.seek(cursor, os.SEEK_SET)\n data = f.read(max_bytes)\n cursor = f.tell()\n stats = os.fstat(f.fileno())\n\n # local download path\n download_url = self.download_url(run_id, key, io_type)\n return ComputeLogFileData(\n path=path,\n data=data.decode("utf-8"),\n 
cursor=cursor,\n size=stats.st_size,\n download_url=download_url,\n )\n\n def get_key(self, dagster_run: DagsterRun, step_key: Optional[str]):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n return step_key or dagster_run.job_name\n\n def is_watch_completed(self, run_id: str, key: str) -> bool:\n log_key = self.build_log_key_for_run(run_id, key)\n return self.is_capture_complete(log_key)\n\n def on_watch_start(self, dagster_run: DagsterRun, step_key: Optional[str]):\n pass\n\n def on_watch_finish(self, dagster_run: DagsterRun, step_key: Optional[str] = None):\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n check.opt_str_param(step_key, "step_key")\n log_key = self.build_log_key_for_run(dagster_run.run_id, step_key or dagster_run.job_name)\n touchpath = self.complete_artifact_path(log_key)\n touch_file(touchpath)\n\n def download_url(self, run_id: str, key: str, io_type: ComputeIOType):\n check.inst_param(io_type, "io_type", ComputeIOType)\n return f"/download/{run_id}/{key}/{io_type.value}"\n\n def on_subscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription: "LogSubscription") -> None:\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self) -> None:\n self._subscription_manager.dispose()
\n\n\nclass LocalComputeLogSubscriptionManager:\n def __init__(self, manager):\n self._manager = manager\n self._subscriptions = defaultdict(list)\n self._watchers = {}\n self._observer = None\n\n def add_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if self.is_complete(subscription):\n subscription.fetch()\n subscription.complete()\n else:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n self._subscriptions[watch_key].append(subscription)\n self.watch(subscription)\n\n def is_complete(self, subscription: "LogSubscription") -> bool:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.is_watch_completed(subscription.run_id, subscription.key)\n return self._manager.is_capture_complete(subscription.log_key)\n\n def remove_subscription(self, subscription: "LogSubscription") -> None:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if subscription in self._subscriptions[watch_key]:\n self._subscriptions[watch_key].remove(subscription)\n subscription.complete()\n\n def _log_key(self, subscription: "LogSubscription") -> Sequence[str]:\n check.inst_param(\n subscription, "subscription", (ComputeLogSubscription, CapturedLogSubscription)\n )\n\n if isinstance(subscription, ComputeLogSubscription):\n return self._manager.build_log_key_for_run(subscription.run_id, subscription.key)\n return subscription.log_key\n\n def _watch_key(self, log_key: Sequence[str]) -> str:\n return json.dumps(log_key)\n\n def remove_all_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in 
self._subscriptions.pop(watch_key, []):\n subscription.complete()\n\n def watch(self, subscription: "LogSubscription") -> None:\n log_key = self._log_key(subscription)\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n return\n\n update_paths = [\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT]),\n self._manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR]),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDOUT], partial=True\n ),\n self._manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[ComputeIOType.STDERR], partial=True\n ),\n ]\n complete_paths = [self._manager.complete_artifact_path(log_key)]\n directory = os.path.dirname(\n self._manager.get_captured_local_path(log_key, ComputeIOType.STDERR),\n )\n\n if not self._observer:\n self._observer = PollingObserver(self._manager.polling_timeout)\n self._observer.start()\n\n ensure_dir(directory)\n\n self._watchers[watch_key] = self._observer.schedule(\n LocalComputeLogFilesystemEventHandler(self, log_key, update_paths, complete_paths),\n str(directory),\n )\n\n def notify_subscriptions(self, log_key: Sequence[str]) -> None:\n watch_key = self._watch_key(log_key)\n for subscription in self._subscriptions[watch_key]:\n subscription.fetch()\n\n def unwatch(self, log_key: Sequence[str], handler) -> None:\n watch_key = self._watch_key(log_key)\n if watch_key in self._watchers:\n self._observer.remove_handler_for_watch(handler, self._watchers[watch_key]) # type: ignore\n del self._watchers[watch_key]\n\n def dispose(self) -> None:\n if self._observer:\n self._observer.stop()\n self._observer.join(15)\n\n\nclass LocalComputeLogFilesystemEventHandler(PatternMatchingEventHandler):\n def __init__(self, manager, log_key, update_paths, complete_paths):\n self.manager = manager\n self.log_key = log_key\n self.update_paths = update_paths\n self.complete_paths = complete_paths\n patterns = 
update_paths + complete_paths\n super(LocalComputeLogFilesystemEventHandler, self).__init__(patterns=patterns)\n\n def on_created(self, event):\n if event.src_path in self.complete_paths:\n self.manager.remove_all_subscriptions(self.log_key)\n self.manager.unwatch(self.log_key, self)\n\n def on_modified(self, event):\n if event.src_path in self.update_paths:\n self.manager.notify_subscriptions(self.log_key)\n
", "current_page_name": "_modules/dagster/_core/storage/local_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.local_compute_log_manager"}, "mem_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.mem_io_manager

\nfrom typing import Dict, Tuple\n\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\n\n\n
[docs]class InMemoryIOManager(IOManager):\n """I/O manager that stores and retrieves values in memory. After execution is complete, the values will\n be garbage-collected. Note that this means that each run will not have access to values from previous runs.\n """\n\n def __init__(self):\n self.values: Dict[Tuple[object, ...], object] = {}\n\n def handle_output(self, context: OutputContext, obj: object):\n keys = tuple(context.get_identifier())\n self.values[keys] = obj\n\n def load_input(self, context: InputContext) -> object:\n keys = tuple(context.get_identifier())\n return self.values[keys]
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(description="Built-in IO manager that stores and retrieves values in memory.")\ndef mem_io_manager(_) -> InMemoryIOManager:\n """Built-in IO manager that stores and retrieves values in memory."""\n return InMemoryIOManager()
\n
", "current_page_name": "_modules/dagster/_core/storage/mem_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.mem_io_manager"}, "memoizable_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.memoizable_io_manager

\nimport os\nimport pickle\nfrom abc import abstractmethod\nfrom typing import Union\n\nimport dagster._check as check\nfrom dagster._annotations import experimental, public\nfrom dagster._config import Field, StringSource\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import IOManager, dagster_maintained_io_manager, io_manager\nfrom dagster._utils import PICKLE_PROTOCOL, mkdir_p\n\n\n
[docs]class MemoizableIOManager(IOManager):\n """Base class for IO manager enabled to work with memoized execution. Users should implement\n the ``load_input`` and ``handle_output`` methods described in the ``IOManager`` API, and the\n ``has_output`` method, which returns a boolean representing whether a data object can be found.\n """\n\n
[docs] @public\n @abstractmethod\n def has_output(self, context: OutputContext) -> bool:\n """The user-defined method that returns whether data exists given the metadata.\n\n Args:\n context (OutputContext): The context of the step performing this check.\n\n Returns:\n bool: True if there is data present that matches the provided context. False otherwise.\n """
\n\n\nclass VersionedPickledObjectFilesystemIOManager(MemoizableIOManager):\n def __init__(self, base_dir=None):\n self.base_dir = check.opt_str_param(base_dir, "base_dir")\n self.write_mode = "wb"\n self.read_mode = "rb"\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> str:\n output_context: OutputContext\n\n if isinstance(context, OutputContext):\n output_context = context\n else:\n if context.upstream_output is None:\n raise DagsterInvariantViolationError(\n "Missing value of InputContext.upstream_output. Cannot compute the input path."\n )\n\n output_context = context.upstream_output\n\n # automatically construct filepath\n step_key = check.str_param(output_context.step_key, "context.step_key")\n output_name = check.str_param(output_context.name, "context.name")\n version = check.str_param(output_context.version, "context.version")\n\n return os.path.join(self.base_dir, step_key, output_name, version)\n\n def handle_output(self, context, obj):\n """Pickle the data with the associated version, and store the object to a file.\n\n This method omits the AssetMaterialization event so assets generated by it won't be tracked\n by the Asset Catalog.\n """\n filepath = self._get_path(context)\n\n context.log.debug(f"Writing file at: {filepath}")\n\n # Ensure path exists\n mkdir_p(os.path.dirname(filepath))\n\n with open(filepath, self.write_mode) as write_obj:\n pickle.dump(obj, write_obj, PICKLE_PROTOCOL)\n\n def load_input(self, context):\n """Unpickle the file and Load it to a data object."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Loading file from: {filepath}")\n\n with open(filepath, self.read_mode) as read_obj:\n return pickle.load(read_obj)\n\n def has_output(self, context):\n """Returns true if data object exists with the associated version, False otherwise."""\n filepath = self._get_path(context)\n\n context.log.debug(f"Checking for file at: {filepath}")\n\n return os.path.exists(filepath) and not 
os.path.isdir(filepath)\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema={"base_dir": Field(StringSource, is_required=False)})\n@experimental\ndef versioned_filesystem_io_manager(init_context):\n """Filesystem IO manager that utilizes versioning of stored objects.\n\n It requires users to specify a base directory where all the step outputs will be stored in. It\n serializes and deserializes output values (assets) using pickling and automatically constructs\n the filepaths for the assets using the provided directory, and the version for a provided step\n output.\n """\n return VersionedPickledObjectFilesystemIOManager(\n base_dir=init_context.resource_config.get(\n "base_dir", os.path.join(init_context.instance.storage_directory(), "versioned_outputs")\n )\n )\n
", "current_page_name": "_modules/dagster/_core/storage/memoizable_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.memoizable_io_manager"}, "noop_compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.noop_compute_log_manager

\nfrom contextlib import contextmanager\nfrom typing import IO, Any, Generator, Mapping, Optional, Sequence\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._core.storage.captured_log_manager import (\n    CapturedLogContext,\n    CapturedLogData,\n    CapturedLogManager,\n    CapturedLogMetadata,\n    CapturedLogSubscription,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\nfrom .compute_log_manager import (\n    MAX_BYTES_FILE_READ,\n    ComputeIOType,\n    ComputeLogFileData,\n    ComputeLogManager,\n)\n\n\n
[docs]class NoOpComputeLogManager(CapturedLogManager, ComputeLogManager, ConfigurableClass):\n """When enabled for a Dagster instance, stdout and stderr will not be available for any step."""\n\n def __init__(self, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {}\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return NoOpComputeLogManager(inst_data=inst_data, **config_value)\n\n def enabled(self, _dagster_run, _step_key):\n return False\n\n def _watch_logs(self, dagster_run, step_key=None):\n pass\n\n def get_local_path(self, run_id: str, key: str, io_type: ComputeIOType) -> str:\n raise NotImplementedError()\n\n def is_watch_completed(self, run_id, key):\n return True\n\n def on_watch_start(self, dagster_run, step_key):\n pass\n\n def on_watch_finish(self, dagster_run, step_key):\n pass\n\n def download_url(self, run_id, key, io_type):\n return None\n\n def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ):\n return ComputeLogFileData(\n path=f"{key}.{io_type}", data=None, cursor=0, size=0, download_url=None\n )\n\n def on_subscribe(self, subscription):\n pass\n\n def on_unsubscribe(self, subscription):\n pass\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Generator[CapturedLogContext, None, None]:\n yield CapturedLogContext(log_key=log_key)\n\n def is_capture_complete(self, log_key: Sequence[str]):\n return True\n\n @contextmanager\n def open_log_stream(\n self, log_key: Sequence[str], io_type: ComputeIOType\n ) -> Generator[Optional[IO], None, None]:\n yield None\n\n def get_log_data(\n self,\n log_key: Sequence[str],\n cursor: Optional[str] = None,\n max_bytes: Optional[int] = None,\n ) -> CapturedLogData:\n return 
CapturedLogData(log_key=log_key)\n\n def get_log_metadata(self, log_key: Sequence[str]) -> CapturedLogMetadata:\n return CapturedLogMetadata()\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n pass\n\n def subscribe(\n self, log_key: Sequence[str], cursor: Optional[str] = None\n ) -> CapturedLogSubscription:\n return CapturedLogSubscription(self, log_key, cursor)\n\n def unsubscribe(self, subscription: CapturedLogSubscription):\n pass
\n
", "current_page_name": "_modules/dagster/_core/storage/noop_compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.noop_compute_log_manager"}, "root": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.root

\nimport os\nfrom tempfile import TemporaryDirectory\nfrom typing import Optional\n\nfrom typing_extensions import TypedDict\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\n\n\nclass LocalArtifactStorageConfig(TypedDict):\n    base_dir: str\n\n\n
[docs]class LocalArtifactStorage(ConfigurableClass):\n def __init__(self, base_dir: str, inst_data: Optional[ConfigurableClassData] = None):\n self._base_dir = base_dir\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @property\n def base_dir(self) -> str:\n return self._base_dir\n\n def file_manager_dir(self, run_id: str) -> str:\n check.str_param(run_id, "run_id")\n return os.path.join(self.base_dir, "storage", run_id, "files")\n\n @property\n def storage_dir(self) -> str:\n return os.path.join(self.base_dir, "storage")\n\n @property\n def schedules_dir(self) -> str:\n return os.path.join(self.base_dir, "schedules")\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: LocalArtifactStorageConfig\n ) -> "LocalArtifactStorage":\n return LocalArtifactStorage(inst_data=inst_data, **config_value)\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n def dispose(self):\n pass
\n\n\nclass TemporaryLocalArtifactStorage(LocalArtifactStorage):\n """Used by ephemeral DagsterInstances, defers directory creation til\n access since many uses of ephemeral instance do not require artifact directory.\n """\n\n def __init__(self):\n self._temp_dir = None\n\n @property\n def base_dir(self):\n if self._temp_dir is None:\n self._temp_dir = TemporaryDirectory()\n return self._temp_dir.name\n\n def dispose(self):\n if self._temp_dir:\n self._temp_dir.cleanup()\n
", "current_page_name": "_modules/dagster/_core/storage/root", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.root"}, "runs": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.base

\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, Mapping, Optional, Sequence, Set, Tuple, Union\n\nfrom typing_extensions import TypedDict\n\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.snap import ExecutionPlanSnapshot, JobSnapshot\nfrom dagster._core.storage.dagster_run import (\n    DagsterRun,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._utils import PrintFn\n\nfrom ..daemon_cursor import DaemonCursorStorage\n\nif TYPE_CHECKING:\n    from dagster._core.host_representation.origin import ExternalJobOrigin\n\n\nclass RunGroupInfo(TypedDict):\n    count: int\n    runs: Sequence[DagsterRun]\n\n\n
[docs]class RunStorage(ABC, MayHaveInstanceWeakref[T_DagsterInstance], DaemonCursorStorage):\n """Abstract base class for storing pipeline run history.\n\n Note that run storages using SQL databases as backing stores should implement\n :py:class:`~dagster._core.storage.runs.SqlRunStorage`.\n\n Users should not directly instantiate concrete subclasses of this class; they are instantiated\n by internal machinery when ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the\n ``dagster.yaml`` file in ``$DAGSTER_HOME``. Configuration of concrete subclasses of this class\n should be done by setting values in that file.\n """\n\n @abstractmethod\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n """Add a run to storage.\n\n If a run already exists with the same ID, raise DagsterRunAlreadyExists\n If the run's snapshot ID does not exist raise DagsterSnapshotDoesNotExist\n\n Args:\n dagster_run (DagsterRun): The run to add.\n """\n\n @abstractmethod\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n """Update run storage in accordance to a pipeline run related DagsterEvent.\n\n Args:\n run_id (str)\n event (DagsterEvent)\n """\n\n @abstractmethod\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n """Return all the runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. 
Defaults to infinite.\n\n Returns:\n List[PipelineRun]\n """\n\n @abstractmethod\n def get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n """Return all the run IDs for runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.RunsFilter` by which to filter\n runs\n cursor (Optional[str]): Starting cursor (run_id) of range of runs\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n\n Returns:\n Sequence[str]\n """\n\n @abstractmethod\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n """Return the number of runs present in the storage that match the given filters.\n\n Args:\n filters (Optional[RunsFilter]) -- The\n :py:class:`~dagster._core.storage.pipeline_run.PipelineRunFilter` by which to filter\n runs\n\n Returns:\n int: The number of runs that match the given filters.\n """\n\n @abstractmethod\n def get_run_group(self, run_id: str) -> Optional[Tuple[str, Sequence[DagsterRun]]]:\n """Get the run group to which a given run belongs.\n\n Args:\n run_id (str): If the corresponding run is the descendant of some root run (i.e., there\n is a root_run_id on the :py:class:`PipelineRun`), that root run and all of its\n descendants are returned; otherwise, the group will consist only of the given run\n (a run that does not descend from any root is its own root).\n\n Returns:\n Optional[Tuple[string, List[PipelineRun]]]: If there is a corresponding run group, tuple\n whose first element is the root_run_id and whose second element is a list of all the\n descendent runs. 
Otherwise `None`.\n """\n\n @abstractmethod\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n """Return a list of run records stored in the run storage, sorted by the given column in given order.\n\n Args:\n filters (Optional[RunsFilter]): the filter by which to filter runs.\n limit (Optional[int]): Number of results to get. Defaults to infinite.\n order_by (Optional[str]): Name of the column to sort by. Defaults to id.\n ascending (Optional[bool]): Sort the result in ascending order if True, descending\n otherwise. Defaults to descending.\n\n Returns:\n List[RunRecord]: List of run records stored in the run storage.\n """\n\n @abstractmethod\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n """Get a list of tag keys and the values that have been associated with them.\n\n Args:\n tag_keys (Optional[Sequence[str]]): tag keys to filter by.\n\n Returns:\n List[Tuple[str, Set[str]]]\n """\n\n @abstractmethod\n def get_run_tag_keys(self) -> Sequence[str]:\n """Get a list of tag keys.\n\n Returns:\n List[str]\n """\n\n @abstractmethod\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n """Add additional tags for a pipeline run.\n\n Args:\n run_id (str)\n new_tags (Dict[string, string])\n """\n\n @abstractmethod\n def has_run(self, run_id: str) -> bool:\n """Check if the storage contains a run.\n\n Args:\n run_id (str): The id of the run\n\n Returns:\n bool\n """\n\n def add_snapshot(\n self,\n snapshot: Union[JobSnapshot, ExecutionPlanSnapshot],\n snapshot_id: Optional[str] = None,\n ) -> None:\n """Add a snapshot to the storage.\n\n Args:\n snapshot (Union[PipelineSnapshot, 
ExecutionPlanSnapshot])\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n """\n if isinstance(snapshot, JobSnapshot):\n self.add_job_snapshot(snapshot, snapshot_id)\n else:\n self.add_execution_plan_snapshot(snapshot, snapshot_id)\n\n def has_snapshot(self, snapshot_id: str):\n return self.has_job_snapshot(snapshot_id) or self.has_execution_plan_snapshot(snapshot_id)\n\n @abstractmethod\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n """Check to see if storage contains a pipeline snapshot.\n\n Args:\n pipeline_snapshot_id (str): The id of the run.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n """Add a pipeline snapshot to the run store.\n\n Pipeline snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n job_snapshot (PipelineSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The job_snapshot_id\n """\n\n @abstractmethod\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n job_snapshot_id (str)\n\n Returns:\n PipelineSnapshot\n """\n\n @abstractmethod\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n """Check to see if storage contains an execution plan snapshot.\n\n Args:\n execution_plan_snapshot_id (str): The id of the execution plan.\n\n Returns:\n bool\n """\n\n @abstractmethod\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n """Add an execution plan snapshot to the run store.\n\n Execution plan snapshots are content-addressable, meaning\n that the ID for a snapshot is a hash based on the\n body of the snapshot. This function returns\n that snapshot ID.\n\n Args:\n execution_plan_snapshot (ExecutionPlanSnapshot)\n snapshot_id (Optional[str]): [Internal] The id of the snapshot. If not provided, the\n snapshot id will be generated from a hash of the snapshot. 
This should only be used\n in debugging, where we might want to import a historical run whose snapshots were\n calculated using a different hash function than the current code.\n\n Return:\n str: The execution_plan_snapshot_id\n """\n\n @abstractmethod\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n """Fetch a snapshot by ID.\n\n Args:\n execution_plan_snapshot_id (str)\n\n Returns:\n ExecutionPlanSnapshot\n """\n\n @abstractmethod\n def wipe(self) -> None:\n """Clears the run storage."""\n\n @abstractmethod\n def delete_run(self, run_id: str) -> None:\n """Remove a run from storage."""\n\n @property\n def supports_bucket_queries(self) -> bool:\n return False\n\n @abstractmethod\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n """Get run partition data for a given partitioned job."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n # Daemon Heartbeat Storage\n #\n # Holds heartbeats from the Dagster Daemon so that other system components can alert when it's not\n # alive.\n # This is temporarily placed along with run storage to avoid adding a new instance concept. 
It\n # should be split out once all metadata storages are configured together.\n\n @abstractmethod\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n """Called on a regular interval by the daemon."""\n\n @abstractmethod\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n """Latest heartbeats of all daemon types."""\n\n @abstractmethod\n def wipe_daemon_heartbeats(self) -> None:\n """Wipe all daemon heartbeats."""\n\n # Backfill storage\n @abstractmethod\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n """Get a list of partition backfills."""\n\n @abstractmethod\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n """Get the partition backfill of the given backfill id."""\n\n @abstractmethod\n def add_backfill(self, partition_backfill: PartitionBackfill):\n """Add partition backfill to run storage."""\n\n @abstractmethod\n def update_backfill(self, partition_backfill: PartitionBackfill):\n """Update a partition backfill in run storage."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n @abstractmethod\n def replace_job_origin(self, run: "DagsterRun", job_origin: "ExternalJobOrigin") -> None: ...
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.base"}, "sql_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sql_run_storage

\nimport logging\nimport uuid\nimport zlib\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom enum import Enum\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Iterable,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.errors import (\n    DagsterInvariantViolationError,\n    DagsterRunAlreadyExists,\n    DagsterRunNotFoundError,\n    DagsterSnapshotDoesNotExist,\n)\nfrom dagster._core.events import EVENT_TYPE_TO_PIPELINE_RUN_STATUS, DagsterEvent, DagsterEventType\nfrom dagster._core.execution.backfill import BulkActionStatus, PartitionBackfill\nfrom dagster._core.host_representation.origin import ExternalJobOrigin\nfrom dagster._core.snap import (\n    ExecutionPlanSnapshot,\n    JobSnapshot,\n    create_execution_plan_snapshot_id,\n    create_job_snapshot_id,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery\nfrom dagster._core.storage.sqlalchemy_compat import (\n    db_fetch_mappings,\n    db_scalar_subquery,\n    db_select,\n    db_subquery,\n)\nfrom dagster._core.storage.tags import (\n    PARTITION_NAME_TAG,\n    PARTITION_SET_TAG,\n    REPOSITORY_LABEL_TAG,\n    ROOT_RUN_ID_TAG,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import (\n    deserialize_value,\n    serialize_value,\n)\nfrom dagster._seven import JSONDecodeError\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..dagster_run import (\n    DagsterRun,\n    DagsterRunStatus,\n    JobBucket,\n    RunPartitionData,\n    RunRecord,\n    RunsFilter,\n    TagBucket,\n)\nfrom .base import RunStorage\nfrom .migration import (\n    OPTIONAL_DATA_MIGRATIONS,\n    
REQUIRED_DATA_MIGRATIONS,\n    RUN_PARTITIONS,\n    MigrationFn,\n)\nfrom .schema import (\n    BulkActionsTable,\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    KeyValueStoreTable,\n    RunsTable,\n    RunTagsTable,\n    SecondaryIndexMigrationTable,\n    SnapshotsTable,\n)\n\n\nclass SnapshotType(Enum):\n    PIPELINE = "PIPELINE"\n    EXECUTION_PLAN = "EXECUTION_PLAN"\n\n\n
[docs]class SqlRunStorage(RunStorage):\n """Base class for SQL based run storages."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n @abstractmethod\n def upgrade(self) -> None:\n """This method should perform any schema or data migrations necessary to bring an\n out-of-date instance of the storage up to date.\n """\n\n def fetchall(self, query: SqlAlchemyQuery) -> Sequence[Any]:\n with self.connect() as conn:\n return db_fetch_mappings(conn, query)\n\n def fetchone(self, query: SqlAlchemyQuery) -> Optional[Any]:\n with self.connect() as conn:\n if db.__version__.startswith("2."):\n return conn.execute(query).mappings().first()\n else:\n return conn.execute(query).fetchone()\n\n def add_run(self, dagster_run: DagsterRun) -> DagsterRun:\n check.inst_param(dagster_run, "dagster_run", DagsterRun)\n\n if dagster_run.job_snapshot_id and not self.has_job_snapshot(dagster_run.job_snapshot_id):\n raise DagsterSnapshotDoesNotExist(\n f"Snapshot {dagster_run.job_snapshot_id} does not exist in run storage"\n )\n\n has_tags = dagster_run.tags and len(dagster_run.tags) > 0\n partition = dagster_run.tags.get(PARTITION_NAME_TAG) if has_tags else None\n partition_set = dagster_run.tags.get(PARTITION_SET_TAG) if has_tags else None\n\n runs_insert = RunsTable.insert().values(\n run_id=dagster_run.run_id,\n pipeline_name=dagster_run.job_name,\n status=dagster_run.status.value,\n run_body=serialize_value(dagster_run),\n snapshot_id=dagster_run.job_snapshot_id,\n partition=partition,\n partition_set=partition_set,\n )\n with self.connect() as conn:\n try:\n conn.execute(runs_insert)\n except db_exc.IntegrityError as exc:\n raise DagsterRunAlreadyExists from exc\n\n tags_to_insert = dagster_run.tags_for_storage()\n if tags_to_insert:\n conn.execute(\n RunTagsTable.insert(),\n [\n dict(run_id=dagster_run.run_id, key=k, value=v)\n for k, v in tags_to_insert.items()\n ],\n )\n\n return 
dagster_run\n\n def handle_run_event(self, run_id: str, event: DagsterEvent) -> None:\n check.str_param(run_id, "run_id")\n check.inst_param(event, "event", DagsterEvent)\n\n if event.event_type not in EVENT_TYPE_TO_PIPELINE_RUN_STATUS:\n return\n\n run = self._get_run_by_id(run_id)\n if not run:\n # TODO log?\n return\n\n new_job_status = EVENT_TYPE_TO_PIPELINE_RUN_STATUS[event.event_type]\n\n run_stats_cols_in_index = self.has_run_stats_index_cols()\n\n kwargs = {}\n\n # consider changing the `handle_run_event` signature to get timestamp off of the\n # EventLogEntry instead of the DagsterEvent, for consistency\n now = pendulum.now("UTC")\n\n if run_stats_cols_in_index and event.event_type == DagsterEventType.PIPELINE_START:\n kwargs["start_time"] = now.timestamp()\n\n if run_stats_cols_in_index and event.event_type in {\n DagsterEventType.PIPELINE_CANCELED,\n DagsterEventType.PIPELINE_FAILURE,\n DagsterEventType.PIPELINE_SUCCESS,\n }:\n kwargs["end_time"] = now.timestamp()\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_status(new_job_status)),\n status=new_job_status.value,\n update_timestamp=now,\n **kwargs,\n )\n )\n\n def _row_to_run(self, row: Dict) -> DagsterRun:\n run = deserialize_value(row["run_body"], DagsterRun)\n status = DagsterRunStatus(row["status"])\n # NOTE: the status column is more trustworthy than the status in the run body, since concurrent\n # writes (e.g. 
handle_run_event and add_tags) can cause the status in the body to be out of\n # overriden with an old value.\n return run.with_status(status)\n\n def _rows_to_runs(self, rows: Iterable[Dict]) -> Sequence[DagsterRun]:\n return list(map(self._row_to_run, rows))\n\n def _add_cursor_limit_to_query(\n self,\n query: SqlAlchemyQuery,\n cursor: Optional[str],\n limit: Optional[int],\n order_by: Optional[str],\n ascending: Optional[bool],\n ) -> SqlAlchemyQuery:\n """Helper function to deal with cursor/limit pagination args."""\n if cursor:\n cursor_query = db_select([RunsTable.c.id]).where(RunsTable.c.run_id == cursor)\n query = query.where(RunsTable.c.id < db_scalar_subquery(cursor_query))\n\n if limit:\n query = query.limit(limit)\n\n sorting_column = getattr(RunsTable.c, order_by) if order_by else RunsTable.c.id\n direction = db.asc if ascending else db.desc\n query = query.order_by(direction(sorting_column))\n\n return query\n\n @property\n def supports_intersect(self) -> bool:\n return True\n\n def _add_filters_to_query(self, query: SqlAlchemyQuery, filters: RunsFilter) -> SqlAlchemyQuery:\n check.inst_param(filters, "filters", RunsFilter)\n\n if filters.run_ids:\n query = query.where(RunsTable.c.run_id.in_(filters.run_ids))\n\n if filters.job_name:\n query = query.where(RunsTable.c.pipeline_name == filters.job_name)\n\n if filters.statuses:\n query = query.where(\n RunsTable.c.status.in_([status.value for status in filters.statuses])\n )\n\n if filters.snapshot_id:\n query = query.where(RunsTable.c.snapshot_id == filters.snapshot_id)\n\n if filters.updated_after:\n query = query.where(RunsTable.c.update_timestamp > filters.updated_after)\n\n if filters.updated_before:\n query = query.where(RunsTable.c.update_timestamp < filters.updated_before)\n\n if filters.created_after:\n query = query.where(RunsTable.c.create_timestamp > filters.created_after)\n\n if filters.created_before:\n query = query.where(RunsTable.c.create_timestamp < filters.created_before)\n\n return 
query\n\n def _runs_query(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n columns: Optional[Sequence[str]] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> SqlAlchemyQuery:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_str_param(cursor, "cursor")\n check.opt_int_param(limit, "limit")\n check.opt_sequence_param(columns, "columns")\n check.opt_str_param(order_by, "order_by")\n check.opt_bool_param(ascending, "ascending")\n\n if columns is None:\n columns = ["run_body", "status"]\n\n if filters.tags:\n table = self._apply_tags_table_joins(RunsTable, filters.tags)\n else:\n table = RunsTable\n\n base_query = db_select([getattr(RunsTable.c, column) for column in columns]).select_from(\n table\n )\n base_query = self._add_filters_to_query(base_query, filters)\n return self._add_cursor_limit_to_query(base_query, cursor, limit, order_by, ascending)\n\n def _apply_tags_table_joins(\n self,\n table: db.Table,\n tags: Mapping[str, Union[str, Sequence[str]]],\n ) -> db.Table:\n multi_join = len(tags) > 1\n i = 0\n for key, value in tags.items():\n i += 1\n tags_table = (\n db_subquery(db_select([RunTagsTable]), f"run_tags_subquery_{i}")\n if multi_join\n else RunTagsTable\n )\n table = table.join(\n tags_table,\n db.and_(\n RunsTable.c.run_id == tags_table.c.run_id,\n tags_table.c.key == key,\n (\n tags_table.c.value == value\n if isinstance(value, str)\n else tags_table.c.value.in_(value)\n ),\n ),\n )\n return table\n\n def get_runs(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[DagsterRun]:\n query = self._runs_query(filters, cursor, limit, bucket_by=bucket_by)\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n def 
get_run_ids(\n self,\n filters: Optional[RunsFilter] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[str]:\n query = self._runs_query(filters=filters, cursor=cursor, limit=limit, columns=["run_id"])\n rows = self.fetchall(query)\n return [row["run_id"] for row in rows]\n\n def get_runs_count(self, filters: Optional[RunsFilter] = None) -> int:\n subquery = db_subquery(self._runs_query(filters=filters))\n query = db_select([db.func.count().label("count")]).select_from(subquery)\n row = self.fetchone(query)\n count = row["count"] if row else 0\n return count\n\n def _get_run_by_id(self, run_id: str) -> Optional[DagsterRun]:\n check.str_param(run_id, "run_id")\n\n query = db_select([RunsTable.c.run_body, RunsTable.c.status]).where(\n RunsTable.c.run_id == run_id\n )\n rows = self.fetchall(query)\n return self._row_to_run(rows[0]) if rows else None\n\n def get_run_records(\n self,\n filters: Optional[RunsFilter] = None,\n limit: Optional[int] = None,\n order_by: Optional[str] = None,\n ascending: bool = False,\n cursor: Optional[str] = None,\n bucket_by: Optional[Union[JobBucket, TagBucket]] = None,\n ) -> Sequence[RunRecord]:\n filters = check.opt_inst_param(filters, "filters", RunsFilter, default=RunsFilter())\n check.opt_int_param(limit, "limit")\n\n columns = ["id", "run_body", "status", "create_timestamp", "update_timestamp"]\n\n if self.has_run_stats_index_cols():\n columns += ["start_time", "end_time"]\n # only fetch columns we use to build RunRecord\n query = self._runs_query(\n filters=filters,\n limit=limit,\n columns=columns,\n order_by=order_by,\n ascending=ascending,\n cursor=cursor,\n bucket_by=bucket_by,\n )\n\n rows = self.fetchall(query)\n return [\n RunRecord(\n storage_id=check.int_param(row["id"], "id"),\n dagster_run=self._row_to_run(row),\n create_timestamp=check.inst(row["create_timestamp"], datetime),\n update_timestamp=check.inst(row["update_timestamp"], datetime),\n start_time=(\n 
check.opt_inst(row["start_time"], float) if "start_time" in row else None\n ),\n end_time=check.opt_inst(row["end_time"], float) if "end_time" in row else None,\n )\n for row in rows\n ]\n\n def get_run_tags(\n self,\n tag_keys: Optional[Sequence[str]] = None,\n value_prefix: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[Tuple[str, Set[str]]]:\n result = defaultdict(set)\n query = (\n db_select([RunTagsTable.c.key, RunTagsTable.c.value])\n .distinct()\n .order_by(RunTagsTable.c.key, RunTagsTable.c.value)\n )\n if tag_keys:\n query = query.where(RunTagsTable.c.key.in_(tag_keys))\n if value_prefix:\n query = query.where(RunTagsTable.c.value.startswith(value_prefix))\n if limit:\n query = query.limit(limit)\n rows = self.fetchall(query)\n for r in rows:\n result[r["key"]].add(r["value"])\n return sorted(list([(k, v) for k, v in result.items()]), key=lambda x: x[0])\n\n def get_run_tag_keys(self) -> Sequence[str]:\n query = db_select([RunTagsTable.c.key]).distinct().order_by(RunTagsTable.c.key)\n rows = self.fetchall(query)\n return sorted([r["key"] for r in rows])\n\n def add_run_tags(self, run_id: str, new_tags: Mapping[str, str]) -> None:\n check.str_param(run_id, "run_id")\n check.mapping_param(new_tags, "new_tags", key_type=str, value_type=str)\n\n run = self._get_run_by_id(run_id)\n if not run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n current_tags = run.tags if run.tags else {}\n\n all_tags = merge_dicts(current_tags, new_tags)\n partition = all_tags.get(PARTITION_NAME_TAG)\n partition_set = all_tags.get(PARTITION_SET_TAG)\n\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run_id)\n .values(\n run_body=serialize_value(run.with_tags(merge_dicts(current_tags, new_tags))),\n partition=partition,\n partition_set=partition_set,\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n current_tags_set = set(current_tags.keys())\n 
new_tags_set = set(new_tags.keys())\n\n existing_tags = current_tags_set & new_tags_set\n added_tags = new_tags_set.difference(existing_tags)\n\n for tag in existing_tags:\n conn.execute(\n RunTagsTable.update()\n .where(db.and_(RunTagsTable.c.run_id == run_id, RunTagsTable.c.key == tag))\n .values(value=new_tags[tag])\n )\n\n if added_tags:\n conn.execute(\n RunTagsTable.insert(),\n [dict(run_id=run_id, key=tag, value=new_tags[tag]) for tag in added_tags],\n )\n\n def get_run_group(self, run_id: str) -> Tuple[str, Sequence[DagsterRun]]:\n check.str_param(run_id, "run_id")\n dagster_run = self._get_run_by_id(run_id)\n if not dagster_run:\n raise DagsterRunNotFoundError(\n f"Run {run_id} was not found in instance.", invalid_run_id=run_id\n )\n\n # find root_run\n root_run_id = dagster_run.root_run_id if dagster_run.root_run_id else dagster_run.run_id\n root_run = self._get_run_by_id(root_run_id)\n if not root_run:\n raise DagsterRunNotFoundError(\n f"Run id {root_run_id} set as root run id for run {run_id} was not found in"\n " instance.",\n invalid_run_id=root_run_id,\n )\n\n # root_run_id to run_id 1:1 mapping\n # https://github.com/dagster-io/dagster/issues/2495\n # Note: we currently use tags to persist the run group info\n root_to_run = db_subquery(\n db_select(\n [RunTagsTable.c.value.label("root_run_id"), RunTagsTable.c.run_id.label("run_id")]\n ).where(\n db.and_(RunTagsTable.c.key == ROOT_RUN_ID_TAG, RunTagsTable.c.value == root_run_id)\n ),\n "root_to_run",\n )\n # get run group\n run_group_query = db_select([RunsTable.c.run_body, RunsTable.c.status]).select_from(\n root_to_run.join(\n RunsTable,\n root_to_run.c.run_id == RunsTable.c.run_id,\n isouter=True,\n )\n )\n\n res = self.fetchall(run_group_query)\n run_group = self._rows_to_runs(res)\n\n return (root_run_id, [root_run, *run_group])\n\n def has_run(self, run_id: str) -> bool:\n check.str_param(run_id, "run_id")\n return bool(self._get_run_by_id(run_id))\n\n def delete_run(self, run_id: str) -> 
None:\n check.str_param(run_id, "run_id")\n query = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(query)\n\n def has_job_snapshot(self, job_snapshot_id: str) -> bool:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._has_snapshot_id(job_snapshot_id)\n\n def add_job_snapshot(self, job_snapshot: JobSnapshot, snapshot_id: Optional[str] = None) -> str:\n check.inst_param(job_snapshot, "job_snapshot", JobSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_job_snapshot_id(job_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=job_snapshot,\n snapshot_type=SnapshotType.PIPELINE,\n )\n\n def get_job_snapshot(self, job_snapshot_id: str) -> JobSnapshot:\n check.str_param(job_snapshot_id, "job_snapshot_id")\n return self._get_snapshot(job_snapshot_id) # type: ignore # (allowed to return None?)\n\n def has_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> bool:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return bool(self.get_execution_plan_snapshot(execution_plan_snapshot_id))\n\n def add_execution_plan_snapshot(\n self, execution_plan_snapshot: ExecutionPlanSnapshot, snapshot_id: Optional[str] = None\n ) -> str:\n check.inst_param(execution_plan_snapshot, "execution_plan_snapshot", ExecutionPlanSnapshot)\n check.opt_str_param(snapshot_id, "snapshot_id")\n\n if not snapshot_id:\n snapshot_id = create_execution_plan_snapshot_id(execution_plan_snapshot)\n\n return self._add_snapshot(\n snapshot_id=snapshot_id,\n snapshot_obj=execution_plan_snapshot,\n snapshot_type=SnapshotType.EXECUTION_PLAN,\n )\n\n def get_execution_plan_snapshot(self, execution_plan_snapshot_id: str) -> ExecutionPlanSnapshot:\n check.str_param(execution_plan_snapshot_id, "execution_plan_snapshot_id")\n return self._get_snapshot(execution_plan_snapshot_id) # type: ignore # (allowed to return 
None?)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n check.str_param(snapshot_id, "snapshot_id")\n check.not_none_param(snapshot_obj, "snapshot_obj")\n check.inst_param(snapshot_type, "snapshot_type", SnapshotType)\n\n with self.connect() as conn:\n snapshot_insert = SnapshotsTable.insert().values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n try:\n conn.execute(snapshot_insert)\n except db_exc.IntegrityError:\n # on_conflict_do_nothing equivalent\n pass\n\n return snapshot_id\n\n def get_run_storage_id(self) -> str:\n query = db_select([InstanceInfo.c.run_storage_id])\n row = self.fetchone(query)\n if not row:\n run_storage_id = str(uuid.uuid4())\n with self.connect() as conn:\n conn.execute(InstanceInfo.insert().values(run_storage_id=run_storage_id))\n return run_storage_id\n else:\n return row["run_storage_id"]\n\n def _has_snapshot_id(self, snapshot_id: str) -> bool:\n query = db_select([SnapshotsTable.c.snapshot_id]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return bool(row)\n\n def _get_snapshot(self, snapshot_id: str) -> Optional[JobSnapshot]:\n query = db_select([SnapshotsTable.c.snapshot_body]).where(\n SnapshotsTable.c.snapshot_id == snapshot_id\n )\n\n row = self.fetchone(query)\n\n return defensively_unpack_execution_plan_snapshot_query(logging, [row["snapshot_body"]]) if row else None # type: ignore\n\n def get_run_partition_data(self, runs_filter: RunsFilter) -> Sequence[RunPartitionData]:\n if self.has_built_index(RUN_PARTITIONS) and self.has_run_stats_index_cols():\n query = self._runs_query(\n filters=runs_filter,\n columns=["run_id", "status", "start_time", "end_time", "partition"],\n )\n rows = self.fetchall(query)\n\n # dedup by partition\n _partition_data_by_partition = {}\n for row in rows:\n if not row["partition"] or row["partition"] in 
_partition_data_by_partition:\n continue\n\n _partition_data_by_partition[row["partition"]] = RunPartitionData(\n run_id=row["run_id"],\n partition=row["partition"],\n status=DagsterRunStatus[row["status"]],\n start_time=row["start_time"],\n end_time=row["end_time"],\n )\n\n return list(_partition_data_by_partition.values())\n else:\n query = self._runs_query(filters=runs_filter)\n rows = self.fetchall(query)\n _partition_data_by_partition = {}\n for row in rows:\n run = self._row_to_run(row)\n partition = run.tags.get(PARTITION_NAME_TAG)\n if not partition or partition in _partition_data_by_partition:\n continue\n\n _partition_data_by_partition[partition] = RunPartitionData(\n run_id=run.run_id,\n partition=partition,\n status=run.status,\n start_time=None,\n end_time=None,\n )\n\n return list(_partition_data_by_partition.values())\n\n def _get_partition_runs(\n self, partition_set_name: str, partition_name: str\n ) -> Sequence[DagsterRun]:\n # utility method to help test reads off of the partition column\n if not self.has_built_index(RUN_PARTITIONS):\n # query by tags\n return self.get_runs(\n filters=RunsFilter(\n tags={\n PARTITION_SET_TAG: partition_set_name,\n PARTITION_NAME_TAG: partition_name,\n }\n )\n )\n else:\n query = (\n self._runs_query()\n .where(RunsTable.c.partition == partition_name)\n .where(RunsTable.c.partition_set == partition_set_name)\n )\n rows = self.fetchall(query)\n return self._rows_to_runs(rows)\n\n # Tracking data migrations over secondary indexes\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[[], MigrationFn]],\n print_fn: Optional[PrintFn] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied data migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n 
migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(REQUIRED_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(OPTIONAL_DATA_MIGRATIONS, print_fn, force_rebuild_all)\n\n def has_built_index(self, migration_name: str) -> bool:\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n results = self.fetchall(query)\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .values(migration_completed=datetime.now())\n )\n\n # Checking for migrations\n\n def has_run_stats_index_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [x.get("name") for x in db.inspect(conn).get_columns(RunsTable.name)]\n return "start_time" in column_names and "end_time" in column_names\n\n def has_bulk_actions_selector_cols(self) -> bool:\n with self.connect() as conn:\n column_names = [\n x.get("name") for x in db.inspect(conn).get_columns(BulkActionsTable.name)\n ]\n return "selector_id" in column_names\n\n # Daemon heartbeats\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert, or update if already present\n try:\n conn.execute(\n 
DaemonHeartbeatsTable.insert().values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n DaemonHeartbeatsTable.update()\n .where(DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def get_daemon_heartbeats(self) -> Mapping[str, DaemonHeartbeat]:\n rows = self.fetchall(db_select([DaemonHeartbeatsTable.c.body]))\n heartbeats = []\n for row in rows:\n heartbeats.append(deserialize_value(row["body"], DaemonHeartbeat))\n return {heartbeat.daemon_type: heartbeat for heartbeat in heartbeats}\n\n def wipe(self) -> None:\n """Clears the run storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(RunsTable.delete())\n conn.execute(RunTagsTable.delete())\n conn.execute(SnapshotsTable.delete())\n conn.execute(DaemonHeartbeatsTable.delete())\n conn.execute(BulkActionsTable.delete())\n\n def wipe_daemon_heartbeats(self) -> None:\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(DaemonHeartbeatsTable.delete())\n\n def get_backfills(\n self,\n status: Optional[BulkActionStatus] = None,\n cursor: Optional[str] = None,\n limit: Optional[int] = None,\n ) -> Sequence[PartitionBackfill]:\n check.opt_inst_param(status, "status", BulkActionStatus)\n query = db_select([BulkActionsTable.c.body])\n if status:\n query = query.where(BulkActionsTable.c.status == status.value)\n if cursor:\n cursor_query = db_select([BulkActionsTable.c.id]).where(\n BulkActionsTable.c.key == cursor\n )\n query = query.where(BulkActionsTable.c.id < cursor_query)\n if limit:\n query = query.limit(limit)\n query = 
query.order_by(BulkActionsTable.c.id.desc())\n rows = self.fetchall(query)\n return [deserialize_value(row["body"], PartitionBackfill) for row in rows]\n\n def get_backfill(self, backfill_id: str) -> Optional[PartitionBackfill]:\n check.str_param(backfill_id, "backfill_id")\n query = db_select([BulkActionsTable.c.body]).where(BulkActionsTable.c.key == backfill_id)\n row = self.fetchone(query)\n return deserialize_value(row["body"], PartitionBackfill) if row else None\n\n def add_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n values: Dict[str, Any] = dict(\n key=partition_backfill.backfill_id,\n status=partition_backfill.status.value,\n timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),\n body=serialize_value(cast(NamedTuple, partition_backfill)),\n )\n\n if self.has_bulk_actions_selector_cols():\n values["selector_id"] = partition_backfill.selector_id\n values["action_type"] = partition_backfill.bulk_action_type.value\n\n with self.connect() as conn:\n conn.execute(BulkActionsTable.insert().values(**values))\n\n def update_backfill(self, partition_backfill: PartitionBackfill) -> None:\n check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)\n backfill_id = partition_backfill.backfill_id\n if not self.get_backfill(backfill_id):\n raise DagsterInvariantViolationError(\n f"Backfill {backfill_id} is not present in storage"\n )\n with self.connect() as conn:\n conn.execute(\n BulkActionsTable.update()\n .where(BulkActionsTable.c.key == backfill_id)\n .values(\n status=partition_backfill.status.value,\n body=serialize_value(partition_backfill),\n )\n )\n\n def get_cursor_values(self, keys: Set[str]) -> Mapping[str, str]:\n check.set_param(keys, "keys", of_type=str)\n\n rows = self.fetchall(\n db_select([KeyValueStoreTable.c.key, KeyValueStoreTable.c.value]).where(\n KeyValueStoreTable.c.key.in_(keys)\n ),\n )\n return 
{row["key"]: row["value"] for row in rows}\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n try:\n conn.execute(KeyValueStoreTable.insert().values(db_values))\n except db_exc.IntegrityError:\n conn.execute(\n KeyValueStoreTable.update()\n .where(KeyValueStoreTable.c.key.in_(pairs.keys()))\n .values(value=db.sql.case(pairs, value=KeyValueStoreTable.c.key))\n )\n\n # Migrating run history\n def replace_job_origin(self, run: DagsterRun, job_origin: ExternalJobOrigin) -> None:\n new_label = job_origin.external_repository_origin.get_label()\n with self.connect() as conn:\n conn.execute(\n RunsTable.update()\n .where(RunsTable.c.run_id == run.run_id)\n .values(\n run_body=serialize_value(run.with_job_origin(job_origin)),\n )\n )\n conn.execute(\n RunTagsTable.update()\n .where(RunTagsTable.c.run_id == run.run_id)\n .where(RunTagsTable.c.key == REPOSITORY_LABEL_TAG)\n .values(value=new_label)\n )
\n\n\nGET_PIPELINE_SNAPSHOT_QUERY_ID = "get-pipeline-snapshot"\n\n\ndef defensively_unpack_execution_plan_snapshot_query(\n logger: logging.Logger, row: Sequence[Any]\n) -> Optional[Union[ExecutionPlanSnapshot, JobSnapshot]]:\n # minimal checking here because sqlalchemy returns a different type based on what version of\n # SqlAlchemy you are using\n\n def _warn(msg: str) -> None:\n logger.warning(f"get-pipeline-snapshot: {msg}")\n\n if not isinstance(row[0], bytes):\n _warn("First entry in row is not a binary type.")\n return None\n\n try:\n uncompressed_bytes = zlib.decompress(row[0])\n except zlib.error:\n _warn("Could not decompress bytes stored in snapshot table.")\n return None\n\n try:\n decoded_str = uncompressed_bytes.decode("utf-8")\n except UnicodeDecodeError:\n _warn("Could not unicode decode decompressed bytes stored in snapshot table.")\n return None\n\n try:\n return deserialize_value(decoded_str, (ExecutionPlanSnapshot, JobSnapshot))\n except JSONDecodeError:\n _warn("Could not parse json in snapshot table.")\n return None\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sql_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sql_run_storage"}, "sqlite": {"sqlite_run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.runs.sqlite.sqlite_run_storage

\nimport os\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Iterator, Optional\nfrom urllib.parse import urljoin, urlparse\n\nimport sqlalchemy as db\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\nfrom typing_extensions import Self\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_downgrade,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import InstanceInfo, RunsTable, RunStorageSqlMetadata, RunTagsTable\nfrom ..sql_run_storage import SqlRunStorage\n\nif TYPE_CHECKING:\n    from dagster._core.storage.sqlite_storage import SqliteStorageConfig\nMINIMUM_SQLITE_BUCKET_VERSION = [3, 25, 0]\n\n\n
[docs]class SqliteRunStorage(SqlRunStorage, ConfigurableClass):\n """SQLite-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n This is the default run storage when none is specified in the ``dagster.yaml``.\n\n To explicitly specify SQLite for run storage, you can add a block such as the following to your\n ``dagster.yaml``:\n\n .. code-block:: YAML\n\n run_storage:\n module: dagster._core.storage.runs\n class: SqliteRunStorage\n config:\n base_dir: /path/to/dir\n\n The ``base_dir`` param tells the run storage where on disk to store the database.\n """\n\n def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: "SqliteStorageConfig"\n ) -> "SqliteRunStorage":\n return SqliteRunStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None) -> Self:\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "runs")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_mark_indexes = False\n with engine.connect() as connection:\n db_revision, head_revision = 
check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n RunStorageSqlMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_mark_indexes = True\n\n table_names = db.inspect(engine).get_table_names()\n if "instance_info" not in table_names:\n InstanceInfo.create(engine)\n\n run_storage = cls(conn_string, inst_data)\n\n if should_mark_indexes:\n run_storage.migrate()\n run_storage.optimize()\n\n return run_storage\n\n @contextmanager\n def connect(self) -> Iterator[Connection]:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n def _alembic_upgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn, rev=rev)\n\n def _alembic_downgrade(self, rev: str = "head") -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_downgrade(alembic_config, conn, rev=rev)\n\n def upgrade(self) -> None:\n self._check_for_version_066_migration_and_perform()\n self._alembic_upgrade()\n\n # In version 0.6.6, we changed the layout of the of the sqllite dbs on disk\n # to move from the root of DAGSTER_HOME/runs.db to DAGSTER_HOME/history/runs.bd\n # This function checks for that condition and does the move\n def _check_for_version_066_migration_and_perform(self) -> None:\n old_conn_string = "sqlite://" + urljoin(urlparse(self._conn_string).path, "../runs.db")\n path_to_old_db = urlparse(old_conn_string).path\n # sqlite URLs look like `sqlite:///foo/bar/baz on Unix/Mac` but on Windows they look like\n # `sqlite:///D:/foo/bar/baz` (or `sqlite:///D:\\foo\\bar\\baz`)\n if os.name == "nt":\n path_to_old_db = path_to_old_db.lstrip("/")\n if os.path.exists(path_to_old_db):\n old_storage = SqliteRunStorage(old_conn_string)\n old_runs = 
old_storage.get_runs()\n for run in old_runs:\n self.add_run(run)\n os.unlink(path_to_old_db)\n\n def delete_run(self, run_id: str) -> None:\n """Override the default sql delete run implementation until we can get full\n support on cascading deletes.\n """\n check.str_param(run_id, "run_id")\n remove_tags = db.delete(RunTagsTable).where(RunTagsTable.c.run_id == run_id)\n remove_run = db.delete(RunsTable).where(RunsTable.c.run_id == run_id)\n with self.connect() as conn:\n conn.execute(remove_tags)\n conn.execute(remove_run)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/runs/sqlite/sqlite_run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.runs.sqlite.sqlite_run_storage"}}}, "schedules": {"base": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.base

\nimport abc\nfrom typing import Mapping, Optional, Sequence, Set\n\nfrom dagster import AssetKey\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.instance import MayHaveInstanceWeakref, T_DagsterInstance\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import AlembicVersion\nfrom dagster._utils import PrintFn\n\n\n
[docs]class ScheduleStorage(abc.ABC, MayHaveInstanceWeakref[T_DagsterInstance]):\n """Abstract class for managing persistance of scheduler artifacts."""\n\n @abc.abstractmethod\n def wipe(self) -> None:\n """Delete all schedules from storage."""\n\n @abc.abstractmethod\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n """Return all InstigationStates present in storage.\n\n Args:\n repository_origin_id (Optional[str]): The ExternalRepository target id to scope results to\n repository_selector_id (Optional[str]): The repository selector id to scope results to\n instigator_type (Optional[InstigatorType]): The InstigatorType to scope results to\n instigator_statuses (Optional[Set[InstigatorStatus]]): The InstigatorStatuses to scope results to\n """\n\n @abc.abstractmethod\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n """Return the instigator state for the given id.\n\n Args:\n origin_id (str): The unique instigator identifier\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Add an instigator state to storage.\n\n Args:\n state (InstigatorState): The state to add\n """\n\n @abc.abstractmethod\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n """Update an instigator state in storage.\n\n Args:\n state (InstigatorState): The state to update\n """\n\n @abc.abstractmethod\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n """Delete a state in storage.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n """\n\n @property\n def 
supports_batch_queries(self) -> bool:\n return False\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n raise NotImplementedError()\n\n @abc.abstractmethod\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n """Get the ticks for a given instigator.\n\n Args:\n origin_id (str): The id of the instigator target\n selector_id (str): The logical instigator identifier\n """\n\n @abc.abstractmethod\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n """Add a tick to storage.\n\n Args:\n tick_data (TickData): The tick to add\n """\n\n @abc.abstractmethod\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n """Update a tick already in storage.\n\n Args:\n tick (InstigatorTick): The tick to update\n """\n\n @abc.abstractmethod\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n """Wipe ticks for an instigator for a certain status and timestamp.\n\n Args:\n origin_id (str): The id of the instigator target to delete\n selector_id (str): The logical instigator identifier\n before (datetime): All ticks before this datetime will get purged\n tick_statuses (Optional[List[TickStatus]]): The tick statuses to wipe\n """\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n return True\n\n @abc.abstractmethod\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ) -> None:\n """Add asset policy evaluations to storage."""\n\n @abc.abstractmethod\n def get_auto_materialize_asset_evaluations(\n self, asset_key: 
AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n """Get the policy evaluations for a given asset.\n\n Args:\n asset_key (AssetKey): The asset key to query\n limit (Optional[int]): The maximum number of evaluations to return\n cursor (Optional[int]): The cursor to paginate from\n """\n\n @abc.abstractmethod\n def purge_asset_evaluations(self, before: float) -> None:\n """Wipe evaluations before a certain timestamp.\n\n Args:\n before (datetime): All evaluations before this datetime will get purged\n """\n\n @abc.abstractmethod\n def upgrade(self) -> None:\n """Perform any needed migrations."""\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any required data migrations."""\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n """Call this method to run any optional data migrations for optimized reads."""\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n """Allows for optimizing database connection / use in the context of a long lived webserver process."""\n\n def alembic_version(self) -> Optional[AlembicVersion]:\n return None\n\n def dispose(self) -> None:\n """Explicit lifecycle management."""
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/base", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.base"}, "sql_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sql_schedule_storage

\nfrom abc import abstractmethod\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom typing import (\n    Any,\n    Callable,\n    ContextManager,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Type,\n    TypeVar,\n)\n\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.exc as db_exc\nfrom sqlalchemy.engine import Connection\n\nimport dagster._check as check\nfrom dagster._core.definitions.auto_materialize_rule import AutoMaterializeAssetEvaluation\nfrom dagster._core.definitions.events import AssetKey\nfrom dagster._core.definitions.run_request import InstigatorType\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.scheduler.instigation import (\n    AutoMaterializeAssetEvaluationRecord,\n    InstigatorState,\n    InstigatorStatus,\n    InstigatorTick,\n    TickData,\n    TickStatus,\n)\nfrom dagster._core.storage.sql import SqlAlchemyQuery, SqlAlchemyRow\nfrom dagster._core.storage.sqlalchemy_compat import db_fetch_mappings, db_select, db_subquery\nfrom dagster._serdes import serialize_value\nfrom dagster._serdes.serdes import deserialize_value\nfrom dagster._utils import PrintFn, utc_datetime_from_timestamp\n\nfrom .base import ScheduleStorage\nfrom .migration import (\n    OPTIONAL_SCHEDULE_DATA_MIGRATIONS,\n    REQUIRED_SCHEDULE_DATA_MIGRATIONS,\n    SCHEDULE_JOBS_SELECTOR_ID,\n    SCHEDULE_TICKS_SELECTOR_ID,\n)\nfrom .schema import (\n    AssetDaemonAssetEvaluationsTable,\n    InstigatorsTable,\n    JobTable,\n    JobTickTable,\n    SecondaryIndexMigrationTable,\n)\n\nT_NamedTuple = TypeVar("T_NamedTuple", bound=NamedTuple)\n\n\n
[docs]class SqlScheduleStorage(ScheduleStorage):\n """Base class for SQL backed schedule storage."""\n\n @abstractmethod\n def connect(self) -> ContextManager[Connection]:\n """Context manager yielding a sqlalchemy.engine.Connection."""\n\n def execute(self, query: SqlAlchemyQuery) -> Sequence[SqlAlchemyRow]:\n with self.connect() as conn:\n result_proxy = conn.execute(query)\n res = result_proxy.fetchall()\n result_proxy.close()\n\n return res\n\n def _deserialize_rows(\n self, rows: Sequence[SqlAlchemyRow], as_type: Type[T_NamedTuple]\n ) -> Sequence[T_NamedTuple]:\n return list(map(lambda r: deserialize_value(r[0], as_type), rows))\n\n def all_instigator_state(\n self,\n repository_origin_id: Optional[str] = None,\n repository_selector_id: Optional[str] = None,\n instigator_type: Optional[InstigatorType] = None,\n instigator_statuses: Optional[Set[InstigatorStatus]] = None,\n ) -> Sequence[InstigatorState]:\n check.opt_inst_param(instigator_type, "instigator_type", InstigatorType)\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = db_select([InstigatorsTable.c.instigator_body]).select_from(InstigatorsTable)\n if repository_selector_id:\n query = query.where(\n InstigatorsTable.c.repository_selector_id == repository_selector_id\n )\n if instigator_type:\n query = query.where(InstigatorsTable.c.instigator_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n InstigatorsTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n else:\n query = db_select([JobTable.c.job_body]).select_from(JobTable)\n if repository_origin_id:\n query = query.where(JobTable.c.repository_origin_id == repository_origin_id)\n if instigator_type:\n query = query.where(JobTable.c.job_type == instigator_type.value)\n if instigator_statuses:\n query = query.where(\n JobTable.c.status.in_([status.value for status in instigator_statuses])\n )\n\n rows = self.execute(query)\n return 
self._deserialize_rows(rows, InstigatorState)\n\n def get_instigator_state(self, origin_id: str, selector_id: str) -> Optional[InstigatorState]:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if self.has_instigators_table() and self.has_built_index(SCHEDULE_JOBS_SELECTOR_ID):\n query = (\n db_select([InstigatorsTable.c.instigator_body])\n .select_from(InstigatorsTable)\n .where(InstigatorsTable.c.selector_id == selector_id)\n )\n else:\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.job_origin_id == origin_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1], InstigatorState)[0] if len(rows) else None\n\n def _has_instigator_state_by_selector(self, selector_id: str) -> bool:\n check.str_param(selector_id, "selector_id")\n\n query = (\n db_select([JobTable.c.job_body])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n\n rows = self.execute(query)\n return self._deserialize_rows(rows[:1])[0] if len(rows) else None # type: ignore\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n try:\n conn.execute(\n InstigatorsTable.insert().values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError:\n conn.execute(\n InstigatorsTable.update()\n .where(InstigatorsTable.c.selector_id == selector_id)\n .values(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def add_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n with self.connect() as conn:\n try:\n conn.execute(\n 
JobTable.insert().values(\n job_origin_id=state.instigator_origin_id,\n repository_origin_id=state.repository_origin_id,\n status=state.status.value,\n job_type=state.instigator_type.value,\n job_body=serialize_value(state),\n )\n )\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is already present in storage"\n ) from exc\n\n # try writing to the instigators table\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def update_instigator_state(self, state: InstigatorState) -> InstigatorState:\n check.inst_param(state, "state", InstigatorState)\n if not self.get_instigator_state(state.instigator_origin_id, state.selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {state.instigator_origin_id} is not present in storage"\n )\n\n values = {\n "status": state.status.value,\n "job_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n }\n if self.has_instigators_table():\n values["selector_id"] = state.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTable.update()\n .where(JobTable.c.job_origin_id == state.instigator_origin_id)\n .values(**values)\n )\n if self._has_instigators_table(conn):\n self._add_or_update_instigators_table(conn, state)\n\n return state\n\n def delete_instigator_state(self, origin_id: str, selector_id: str) -> None:\n check.str_param(origin_id, "origin_id")\n check.str_param(selector_id, "selector_id")\n\n if not self.get_instigator_state(origin_id, selector_id):\n raise DagsterInvariantViolationError(\n f"InstigatorState {origin_id} is not present in storage"\n )\n\n with self.connect() as conn:\n conn.execute(JobTable.delete().where(JobTable.c.job_origin_id == origin_id))\n\n if self._has_instigators_table(conn):\n if not self._jobs_has_selector_state(conn, selector_id):\n conn.execute(\n InstigatorsTable.delete().where(\n 
InstigatorsTable.c.selector_id == selector_id\n )\n )\n\n def _jobs_has_selector_state(self, conn: Connection, selector_id: str) -> bool:\n query = (\n db_select([db.func.count()])\n .select_from(JobTable)\n .where(JobTable.c.selector_id == selector_id)\n )\n result = conn.execute(query)\n row = result.fetchone()\n result.close()\n return row[0] > 0 # type: ignore # (possible none)\n\n def _add_filter_limit(\n self,\n query: SqlAlchemyQuery,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses=None,\n ) -> SqlAlchemyQuery:\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n if before:\n query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))\n if after:\n query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))\n if limit:\n query = query.limit(limit)\n if statuses:\n query = query.where(JobTickTable.c.status.in_([status.value for status in statuses]))\n return query\n\n @property\n def supports_batch_queries(self) -> bool:\n return self.has_instigators_table() and self.has_built_index(SCHEDULE_TICKS_SELECTOR_ID)\n\n def has_instigators_table(self) -> bool:\n with self.connect() as conn:\n return self._has_instigators_table(conn)\n\n def _has_instigators_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "instigators" in table_names\n\n def _has_asset_daemon_asset_evaluations_table(self, conn: Connection) -> bool:\n table_names = db.inspect(conn).get_table_names()\n return "asset_daemon_asset_evaluations" in table_names\n\n def get_batch_ticks(\n self,\n selector_ids: Sequence[str],\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Mapping[str, Sequence[InstigatorTick]]:\n check.sequence_param(selector_ids, "selector_ids", of_type=str)\n 
check.opt_int_param(limit, "limit")\n check.opt_sequence_param(statuses, "statuses", of_type=TickStatus)\n\n bucket_rank_column = (\n db.func.rank()\n .over(\n order_by=db.desc(JobTickTable.c.timestamp),\n partition_by=JobTickTable.c.selector_id,\n )\n .label("rank")\n )\n subquery = db_subquery(\n db_select(\n [\n JobTickTable.c.id,\n JobTickTable.c.selector_id,\n JobTickTable.c.tick_body,\n bucket_rank_column,\n ]\n )\n .select_from(JobTickTable)\n .where(JobTickTable.c.selector_id.in_(selector_ids))\n )\n if statuses:\n subquery = subquery.where(\n JobTickTable.c.status.in_([status.value for status in statuses])\n )\n\n query = (\n db_select([subquery.c.id, subquery.c.selector_id, subquery.c.tick_body])\n .order_by(subquery.c.rank.asc())\n .where(subquery.c.rank <= limit)\n )\n\n rows = self.execute(query)\n results = defaultdict(list)\n for row in rows:\n tick_id = row[0]\n selector_id = row[1]\n tick_data = deserialize_value(row[2], TickData)\n results[selector_id].append(InstigatorTick(tick_id, tick_data))\n return results\n\n def get_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: Optional[float] = None,\n after: Optional[float] = None,\n limit: Optional[int] = None,\n statuses: Optional[Sequence[TickStatus]] = None,\n ) -> Sequence[InstigatorTick]:\n check.str_param(origin_id, "origin_id")\n check.opt_float_param(before, "before")\n check.opt_float_param(after, "after")\n check.opt_int_param(limit, "limit")\n check.opt_list_param(statuses, "statuses", of_type=TickStatus)\n\n base_query = (\n db_select([JobTickTable.c.id, JobTickTable.c.tick_body])\n .select_from(JobTickTable)\n .order_by(JobTickTable.c.timestamp.desc())\n )\n if self.has_instigators_table():\n query = base_query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = base_query.where(JobTickTable.c.job_origin_id == origin_id)\n\n query = 
self._add_filter_limit(\n query, before=before, after=after, limit=limit, statuses=statuses\n )\n\n rows = self.execute(query)\n return list(map(lambda r: InstigatorTick(r[0], deserialize_value(r[1], TickData)), rows))\n\n def create_tick(self, tick_data: TickData) -> InstigatorTick:\n check.inst_param(tick_data, "tick_data", TickData)\n\n values = {\n "job_origin_id": tick_data.instigator_origin_id,\n "status": tick_data.status.value,\n "type": tick_data.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),\n "tick_body": serialize_value(tick_data),\n }\n if self.has_instigators_table() and tick_data.selector_id:\n values["selector_id"] = tick_data.selector_id\n\n with self.connect() as conn:\n try:\n tick_insert = JobTickTable.insert().values(**values)\n result = conn.execute(tick_insert)\n tick_id = result.inserted_primary_key[0]\n return InstigatorTick(tick_id, tick_data)\n except db_exc.IntegrityError as exc:\n raise DagsterInvariantViolationError(\n f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in"\n " storage"\n ) from exc\n\n def update_tick(self, tick: InstigatorTick) -> InstigatorTick:\n check.inst_param(tick, "tick", InstigatorTick)\n\n values = {\n "status": tick.status.value,\n "type": tick.instigator_type.value,\n "timestamp": utc_datetime_from_timestamp(tick.timestamp),\n "tick_body": serialize_value(tick.tick_data),\n }\n if self.has_instigators_table() and tick.selector_id:\n values["selector_id"] = tick.selector_id\n\n with self.connect() as conn:\n conn.execute(\n JobTickTable.update().where(JobTickTable.c.id == tick.tick_id).values(**values)\n )\n\n return tick\n\n def purge_ticks(\n self,\n origin_id: str,\n selector_id: str,\n before: float,\n tick_statuses: Optional[Sequence[TickStatus]] = None,\n ) -> None:\n check.str_param(origin_id, "origin_id")\n check.float_param(before, "before")\n check.opt_list_param(tick_statuses, "tick_statuses", of_type=TickStatus)\n\n utc_before = 
utc_datetime_from_timestamp(before)\n\n query = JobTickTable.delete().where(JobTickTable.c.timestamp < utc_before)\n if tick_statuses:\n query = query.where(\n JobTickTable.c.status.in_([tick_status.value for tick_status in tick_statuses])\n )\n\n if self.has_instigators_table():\n query = query.where(\n db.or_(\n JobTickTable.c.selector_id == selector_id,\n db.and_(\n JobTickTable.c.selector_id.is_(None),\n JobTickTable.c.job_origin_id == origin_id,\n ),\n )\n )\n else:\n query = query.where(JobTickTable.c.job_origin_id == origin_id)\n\n with self.connect() as conn:\n conn.execute(query)\n\n @property\n def supports_auto_materialize_asset_evaluations(self) -> bool:\n with self.connect() as conn:\n return self._has_asset_daemon_asset_evaluations_table(conn)\n\n def add_auto_materialize_asset_evaluations(\n self,\n evaluation_id: int,\n asset_evaluations: Sequence[AutoMaterializeAssetEvaluation],\n ):\n if not asset_evaluations:\n return\n\n with self.connect() as conn:\n bulk_insert = AssetDaemonAssetEvaluationsTable.insert().values(\n [\n {\n "evaluation_id": evaluation_id,\n "asset_key": evaluation.asset_key.to_string(),\n "asset_evaluation_body": serialize_value(evaluation),\n "num_requested": evaluation.num_requested,\n "num_skipped": evaluation.num_skipped,\n "num_discarded": evaluation.num_discarded,\n }\n for evaluation in asset_evaluations\n ]\n )\n conn.execute(bulk_insert)\n\n def get_auto_materialize_asset_evaluations(\n self, asset_key: AssetKey, limit: int, cursor: Optional[int] = None\n ) -> Sequence[AutoMaterializeAssetEvaluationRecord]:\n with self.connect() as conn:\n query = (\n db_select(\n [\n AssetDaemonAssetEvaluationsTable.c.id,\n AssetDaemonAssetEvaluationsTable.c.asset_evaluation_body,\n AssetDaemonAssetEvaluationsTable.c.evaluation_id,\n AssetDaemonAssetEvaluationsTable.c.create_timestamp,\n ]\n )\n .where(AssetDaemonAssetEvaluationsTable.c.asset_key == asset_key.to_string())\n 
.order_by(AssetDaemonAssetEvaluationsTable.c.evaluation_id.desc())\n ).limit(limit)\n\n if cursor:\n query = query.where(AssetDaemonAssetEvaluationsTable.c.evaluation_id < cursor)\n\n rows = db_fetch_mappings(conn, query)\n return [AutoMaterializeAssetEvaluationRecord.from_db_row(row) for row in rows]\n\n def purge_asset_evaluations(self, before: float):\n check.float_param(before, "before")\n\n utc_before = utc_datetime_from_timestamp(before)\n query = AssetDaemonAssetEvaluationsTable.delete().where(\n AssetDaemonAssetEvaluationsTable.c.create_timestamp < utc_before\n )\n\n with self.connect() as conn:\n conn.execute(query)\n\n def wipe(self) -> None:\n """Clears the schedule storage."""\n with self.connect() as conn:\n # https://stackoverflow.com/a/54386260/324449\n conn.execute(JobTable.delete())\n conn.execute(JobTickTable.delete())\n if self._has_instigators_table(conn):\n conn.execute(InstigatorsTable.delete())\n if self._has_asset_daemon_asset_evaluations_table(conn):\n conn.execute(AssetDaemonAssetEvaluationsTable.delete())\n\n # MIGRATIONS\n\n def has_secondary_index_table(self) -> bool:\n with self.connect() as conn:\n return "secondary_indexes" in db.inspect(conn).get_table_names()\n\n def has_built_index(self, migration_name: str) -> bool:\n if not self.has_secondary_index_table():\n return False\n\n query = (\n db_select([1])\n .where(SecondaryIndexMigrationTable.c.name == migration_name)\n .where(SecondaryIndexMigrationTable.c.migration_completed != None) # noqa: E711\n .limit(1)\n )\n with self.connect() as conn:\n results = conn.execute(query).fetchall()\n\n return len(results) > 0\n\n def mark_index_built(self, migration_name: str) -> None:\n query = SecondaryIndexMigrationTable.insert().values(\n name=migration_name,\n migration_completed=datetime.now(),\n )\n with self.connect() as conn:\n try:\n conn.execute(query)\n except db_exc.IntegrityError:\n conn.execute(\n SecondaryIndexMigrationTable.update()\n .where(SecondaryIndexMigrationTable.c.name 
== migration_name)\n .values(migration_completed=datetime.now())\n )\n\n def _execute_data_migrations(\n self,\n migrations: Mapping[str, Callable[..., Any]],\n print_fn: Optional[Callable] = None,\n force_rebuild_all: bool = False,\n ) -> None:\n for migration_name, migration_fn in migrations.items():\n if self.has_built_index(migration_name):\n if not force_rebuild_all:\n if print_fn:\n print_fn(f"Skipping already applied migration: {migration_name}")\n continue\n if print_fn:\n print_fn(f"Starting data migration: {migration_name}")\n migration_fn()(self, print_fn)\n self.mark_index_built(migration_name)\n if print_fn:\n print_fn(f"Finished data migration: {migration_name}")\n\n def migrate(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n REQUIRED_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )\n\n def optimize(self, print_fn: Optional[PrintFn] = None, force_rebuild_all: bool = False) -> None:\n self._execute_data_migrations(\n OPTIONAL_SCHEDULE_DATA_MIGRATIONS, print_fn, force_rebuild_all\n )
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sql_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sql_schedule_storage"}, "sqlite": {"sqlite_schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.schedules.sqlite.sqlite_schedule_storage

\nfrom contextlib import contextmanager\nfrom typing import Iterator, Optional\n\nimport sqlalchemy as db\nfrom packaging.version import parse\nfrom sqlalchemy.engine import Connection\nfrom sqlalchemy.pool import NullPool\n\nfrom dagster import (\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    get_alembic_config,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlite import create_db_conn_string, get_sqlite_version\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import mkdir_p\n\nfrom ..schema import ScheduleStorageSqlMetadata\nfrom ..sql_schedule_storage import SqlScheduleStorage\n\nMINIMUM_SQLITE_BATCH_VERSION = "3.25.0"\n\n\n
[docs]class SqliteScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Local SQLite backed schedule storage."""\n\n def __init__(self, conn_string: str, inst_data: Optional[ConfigurableClassData] = None):\n check.str_param(conn_string, "conn_string")\n self._conn_string = conn_string\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n super().__init__()\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return {"base_dir": StringSource}\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value\n ) -> "SqliteScheduleStorage":\n return SqliteScheduleStorage.from_local(inst_data=inst_data, **config_value)\n\n @classmethod\n def from_local(\n cls, base_dir: str, inst_data: Optional[ConfigurableClassData] = None\n ) -> "SqliteScheduleStorage":\n check.str_param(base_dir, "base_dir")\n mkdir_p(base_dir)\n conn_string = create_db_conn_string(base_dir, "schedules")\n engine = create_engine(conn_string, poolclass=NullPool)\n alembic_config = get_alembic_config(__file__)\n\n should_migrate_data = False\n with engine.connect() as connection:\n db_revision, head_revision = check_alembic_revision(alembic_config, connection)\n if not (db_revision and head_revision):\n ScheduleStorageSqlMetadata.create_all(engine)\n connection.execute(db.text("PRAGMA journal_mode=WAL;"))\n stamp_alembic_rev(alembic_config, connection)\n should_migrate_data = True\n\n schedule_storage = cls(conn_string, inst_data)\n if should_migrate_data:\n schedule_storage.migrate()\n schedule_storage.optimize()\n\n return schedule_storage\n\n @contextmanager\n def connect(self) -> Iterator[Connection]:\n engine = create_engine(self._conn_string, poolclass=NullPool)\n with engine.connect() as conn:\n with conn.begin():\n yield conn\n\n @property\n def supports_batch_queries(self) -> bool:\n if not 
super().supports_batch_queries:\n return False\n\n return super().supports_batch_queries and parse(get_sqlite_version()) >= parse(\n MINIMUM_SQLITE_BATCH_VERSION\n )\n\n def upgrade(self) -> None:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = get_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster/_core/storage/schedules/sqlite/sqlite_schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.schedules.sqlite.sqlite_schedule_storage"}}}, "upath_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.storage.upath_io_manager

\nimport asyncio\nimport inspect\nfrom abc import abstractmethod\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union\n\nfrom fsspec import AbstractFileSystem\nfrom fsspec.implementations.local import LocalFileSystem\n\nfrom dagster import (\n    InputContext,\n    MetadataValue,\n    MultiPartitionKey,\n    OutputContext,\n    _check as check,\n)\nfrom dagster._core.storage.memoizable_io_manager import MemoizableIOManager\n\nif TYPE_CHECKING:\n    from upath import UPath\n\n\n
[docs]class UPathIOManager(MemoizableIOManager):\n """Abstract IOManager base class compatible with local and cloud storage via `universal-pathlib` and `fsspec`.\n\n Features:\n - handles partitioned assets\n - handles loading a single upstream partition\n - handles loading multiple upstream partitions (with respect to :py:class:`PartitionMapping`)\n - supports loading multiple partitions concurrently with async `load_from_path` method\n - the `get_metadata` method can be customized to add additional metadata to the output\n - the `allow_missing_partitions` metadata value can be set to `True` to skip missing partitions\n (the default behavior is to raise an error)\n\n """\n\n extension: Optional[str] = None # override in child class\n\n def __init__(\n self,\n base_path: Optional["UPath"] = None,\n ):\n from upath import UPath\n\n assert not self.extension or "." in self.extension\n self._base_path = base_path or UPath(".")\n\n @abstractmethod\n def dump_to_path(self, context: OutputContext, obj: Any, path: "UPath"):\n """Child classes should override this method to write the object to the filesystem."""\n\n @abstractmethod\n def load_from_path(self, context: InputContext, path: "UPath") -> Any:\n """Child classes should override this method to load the object from the filesystem."""\n\n @property\n def fs(self) -> AbstractFileSystem:\n """Utility function to get the IOManager filesystem.\n\n Returns:\n AbstractFileSystem: fsspec filesystem.\n\n """\n from upath import UPath\n\n if isinstance(self._base_path, UPath):\n return self._base_path.fs\n elif isinstance(self._base_path, Path):\n return LocalFileSystem()\n else:\n raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")\n\n @property\n def storage_options(self) -> Dict[str, Any]:\n """Utility function to get the fsspec storage_options which are often consumed by various I/O functions.\n\n Returns:\n Dict[str, Any]: fsspec storage_options.\n """\n from upath import UPath\n\n if 
isinstance(self._base_path, UPath):\n return self._base_path._kwargs.copy() # noqa\n elif isinstance(self._base_path, Path):\n return {}\n else:\n raise ValueError(f"Unsupported base_path type: {type(self._base_path)}")\n\n def get_metadata(\n self,\n context: OutputContext,\n obj: Any,\n ) -> Dict[str, MetadataValue]:\n """Child classes should override this method to add custom metadata to the outputs."""\n return {}\n\n # Read/write operations on paths can generally be handled by methods on the\n # UPath class, but when the backend requires credentials, this isn't\n # always possible. Override these path_* methods to provide custom\n # implementations for targeting backends that require authentication.\n\n def unlink(self, path: "UPath") -> None:\n """Remove the file or object at the provided path."""\n path.unlink()\n\n def path_exists(self, path: "UPath") -> bool:\n """Check if a file or object exists at the provided path."""\n return path.exists()\n\n def make_directory(self, path: "UPath"):\n """Create a directory at the provided path.\n\n Override as a no-op if the target backend doesn't use directories.\n """\n path.mkdir(parents=True, exist_ok=True)\n\n def has_output(self, context: OutputContext) -> bool:\n return self.path_exists(self._get_path(context))\n\n def _with_extension(self, path: "UPath") -> "UPath":\n return path.with_suffix(path.suffix + self.extension) if self.extension else path\n\n def _get_path_without_extension(self, context: Union[InputContext, OutputContext]) -> "UPath":\n if context.has_asset_key:\n context_path = self.get_asset_relative_path(context)\n else:\n # we are dealing with an op output\n context_path = self.get_op_output_relative_path(context)\n\n return self._base_path.joinpath(context_path)\n\n def get_asset_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n from upath import UPath\n\n # we are not using context.get_asset_identifier() because it already includes the partition_key\n return 
UPath(*context.asset_key.path)\n\n def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n from upath import UPath\n\n return UPath(*context.get_identifier())\n\n def get_loading_input_log_message(self, path: "UPath") -> str:\n return f"Loading file from: {path} using {self.__class__.__name__}..."\n\n def get_writing_output_log_message(self, path: "UPath") -> str:\n return f"Writing file at: {path} using {self.__class__.__name__}..."\n\n def get_loading_input_partition_log_message(self, path: "UPath", partition_key: str) -> str:\n return f"Loading partition {partition_key} from {path} using {self.__class__.__name__}..."\n\n def get_missing_partition_log_message(self, partition_key: str) -> str:\n return (\n f"Couldn't load partition {partition_key} and skipped it "\n "because the input metadata includes allow_missing_partitions=True"\n )\n\n def _get_path(self, context: Union[InputContext, OutputContext]) -> "UPath":\n """Returns the I/O path for a given context.\n Should not be used with partitions (use `_get_paths_for_partitions` instead).\n """\n path = self._get_path_without_extension(context)\n return self._with_extension(path)\n\n def get_path_for_partition(\n self, context: Union[InputContext, OutputContext], path: "UPath", partition: str\n ) -> "UPath":\n """Override this method if you want to use a different partitioning scheme\n (for example, if the saving function handles partitioning instead).\n The extension will be added later.\n\n Args:\n context (Union[InputContext, OutputContext]): The context for the I/O operation.\n path (UPath): The path to the file or object.\n partition (str): Formatted partition/multipartition key\n\n Returns:\n UPath: The path to the file with the partition key appended.\n """\n return path / partition\n\n def _get_paths_for_partitions(\n self, context: Union[InputContext, OutputContext]\n ) -> Dict[str, "UPath"]:\n """Returns a dict of partition_keys into I/O paths for a given 
context."""\n if not context.has_asset_partitions:\n raise TypeError(\n f"Detected {context.dagster_type.typing_type} input type "\n "but the asset is not partitioned"\n )\n\n def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str:\n ordered_dimension_keys = [\n key[1]\n for key in sorted(partition_key.keys_by_dimension.items(), key=lambda x: x[0])\n ]\n return "/".join(ordered_dimension_keys)\n\n formatted_partition_keys = {\n partition_key: (\n _formatted_multipartitioned_path(partition_key)\n if isinstance(partition_key, MultiPartitionKey)\n else partition_key\n )\n for partition_key in context.asset_partition_keys\n }\n\n asset_path = self._get_path_without_extension(context)\n return {\n partition_key: self._with_extension(\n self.get_path_for_partition(context, asset_path, partition)\n )\n for partition_key, partition in formatted_partition_keys.items()\n }\n\n def _get_multipartition_backcompat_paths(\n self, context: Union[InputContext, OutputContext]\n ) -> Mapping[str, "UPath"]:\n if not context.has_asset_partitions:\n raise TypeError(\n f"Detected {context.dagster_type.typing_type} input type "\n "but the asset is not partitioned"\n )\n\n partition_keys = context.asset_partition_keys\n\n asset_path = self._get_path_without_extension(context)\n return {\n partition_key: self._with_extension(asset_path / partition_key)\n for partition_key in partition_keys\n if isinstance(partition_key, MultiPartitionKey)\n }\n\n def _load_single_input(\n self, path: "UPath", context: InputContext, backcompat_path: Optional["UPath"] = None\n ) -> Any:\n context.log.debug(self.get_loading_input_log_message(path))\n try:\n obj = self.load_from_path(context=context, path=path)\n if asyncio.iscoroutine(obj):\n obj = asyncio.run(obj)\n except FileNotFoundError as e:\n if backcompat_path is not None:\n try:\n obj = self.load_from_path(context=context, path=backcompat_path)\n if asyncio.iscoroutine(obj):\n obj = asyncio.run(obj)\n\n context.log.debug(\n 
f"File not found at {path}. Loaded instead from backcompat path:"\n f" {backcompat_path}"\n )\n except FileNotFoundError:\n raise e\n else:\n raise e\n\n context.add_input_metadata({"path": MetadataValue.path(str(path))})\n return obj\n\n def _load_partition_from_path(\n self,\n context: InputContext,\n partition_key: str,\n path: "UPath",\n backcompat_path: Optional["UPath"] = None,\n ) -> Any:\n """1. Try to load the partition from the normal path.\n 2. If it was not found, try to load it from the backcompat path.\n 3. If allow_missing_partitions metadata is True, skip the partition if it was not found in any of the paths.\n Otherwise, raise an error.\n\n Args:\n context (InputContext): IOManager Input context\n partition_key (str): the partition key corresponding to the partition being loaded\n path (UPath): The path to the partition.\n backcompat_path (Optional[UPath]): The path to the partition in the backcompat location.\n\n Returns:\n Any: The object loaded from the partition.\n """\n allow_missing_partitions = (\n context.metadata.get("allow_missing_partitions", False)\n if context.metadata is not None\n else False\n )\n\n try:\n context.log.debug(self.get_loading_input_partition_log_message(path, partition_key))\n obj = self.load_from_path(context=context, path=path)\n return obj\n except FileNotFoundError as e:\n if backcompat_path is not None:\n try:\n obj = self.load_from_path(context=context, path=path)\n context.log.debug(\n f"File not found at {path}. 
Loaded instead from backcompat path:"\n f" {backcompat_path}"\n )\n return obj\n except FileNotFoundError as e:\n if allow_missing_partitions:\n context.log.warning(self.get_missing_partition_log_message(partition_key))\n return None\n else:\n raise e\n if allow_missing_partitions:\n context.log.warning(self.get_missing_partition_log_message(partition_key))\n return None\n else:\n raise e\n\n def _load_multiple_inputs(self, context: InputContext) -> Dict[str, Any]:\n # load multiple partitions\n paths = self._get_paths_for_partitions(context) # paths for normal partitions\n backcompat_paths = self._get_multipartition_backcompat_paths(\n context\n ) # paths for multipartitions\n\n context.log.debug(f"Loading {len(paths)} partitions...")\n\n objs = {}\n\n if not inspect.iscoroutinefunction(self.load_from_path):\n for partition_key in context.asset_partition_keys:\n obj = self._load_partition_from_path(\n context,\n partition_key,\n paths[partition_key],\n backcompat_paths.get(partition_key),\n )\n if obj is not None: # in case some partitions were skipped\n objs[partition_key] = obj\n return objs\n else:\n # load_from_path returns a coroutine, so we need to await the results\n\n async def collect():\n loop = asyncio.get_running_loop()\n\n tasks = []\n\n for partition_key in context.asset_partition_keys:\n tasks.append(\n loop.create_task(\n self._load_partition_from_path(\n context,\n partition_key,\n paths[partition_key],\n backcompat_paths.get(partition_key),\n )\n )\n )\n\n results = await asyncio.gather(*tasks, return_exceptions=True)\n\n # need to handle missing partitions here because exceptions don't get propagated from async calls\n allow_missing_partitions = (\n context.metadata.get("allow_missing_partitions", False)\n if context.metadata is not None\n else False\n )\n\n results_without_errors = []\n found_errors = False\n for partition_key, result in zip(context.asset_partition_keys, results):\n if isinstance(result, FileNotFoundError):\n if 
allow_missing_partitions:\n context.log.warning(\n self.get_missing_partition_log_message(partition_key)\n )\n else:\n context.log.error(str(result))\n found_errors = True\n elif isinstance(result, Exception):\n context.log.error(str(result))\n found_errors = True\n else:\n results_without_errors.append(result)\n\n if found_errors:\n raise RuntimeError(\n f"{len(paths) - len(results_without_errors)} partitions could not be loaded"\n )\n\n return results_without_errors\n\n awaited_objects = asyncio.get_event_loop().run_until_complete(collect())\n\n return {\n partition_key: awaited_object\n for partition_key, awaited_object in zip(\n context.asset_partition_keys, awaited_objects\n )\n if awaited_object is not None\n }\n\n def load_input(self, context: InputContext) -> Union[Any, Dict[str, Any]]:\n # If no asset key, we are dealing with an op output which is always non-partitioned\n if not context.has_asset_key or not context.has_asset_partitions:\n path = self._get_path(context)\n return self._load_single_input(path, context)\n else:\n asset_partition_keys = context.asset_partition_keys\n if len(asset_partition_keys) == 0:\n return None\n elif len(asset_partition_keys) == 1:\n paths = self._get_paths_for_partitions(context)\n check.invariant(len(paths) == 1, f"Expected 1 path, but got {len(paths)}")\n path = next(iter(paths.values()))\n backcompat_paths = self._get_multipartition_backcompat_paths(context)\n backcompat_path = (\n None if not backcompat_paths else next(iter(backcompat_paths.values()))\n )\n\n return self._load_single_input(path, context, backcompat_path)\n else: # we are dealing with multiple partitions of an asset\n type_annotation = context.dagster_type.typing_type\n if type_annotation != Any and not is_dict_type(type_annotation):\n check.failed(\n "Loading an input that corresponds to multiple partitions, but the"\n " type annotation on the op input is not a dict, Dict, Mapping, or"\n f" Any: is '{type_annotation}'."\n )\n\n return 
self._load_multiple_inputs(context)\n\n def handle_output(self, context: OutputContext, obj: Any):\n if context.dagster_type.typing_type == type(None):\n check.invariant(\n obj is None,\n "Output had Nothing type or 'None' annotation, but handle_output received"\n f" value that was not None and was of type {type(obj)}.",\n )\n return None\n\n if context.has_asset_partitions:\n paths = self._get_paths_for_partitions(context)\n\n check.invariant(\n len(paths) == 1,\n f"The current IO manager {type(self)} does not support persisting an output"\n " associated with multiple partitions. This error is likely occurring because a"\n " backfill was launched using the 'single run' option. Instead, launch the"\n " backfill with the 'multiple runs' option.",\n )\n\n path = next(iter(paths.values()))\n else:\n path = self._get_path(context)\n self.make_directory(path.parent)\n context.log.debug(self.get_writing_output_log_message(path))\n self.dump_to_path(context=context, obj=obj, path=path)\n\n metadata = {"path": MetadataValue.path(str(path))}\n custom_metadata = self.get_metadata(context=context, obj=obj)\n metadata.update(custom_metadata) # type: ignore\n\n context.add_output_metadata(metadata)
\n\n\ndef is_dict_type(type_obj) -> bool:\n if type_obj == dict:\n return True\n\n if hasattr(type_obj, "__origin__") and type_obj.__origin__ in (dict, Dict, Mapping):\n return True\n\n return False\n
", "current_page_name": "_modules/dagster/_core/storage/upath_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.storage.upath_io_manager"}}, "types": {"config_schema": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.config_schema

\nimport hashlib\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Callable, Iterator, Optional, cast\n\nfrom typing_extensions import TypeAlias\n\nimport dagster._check as check\nfrom dagster._annotations import experimental_param\nfrom dagster._config import ConfigType\nfrom dagster._core.decorator_utils import get_function_params, validate_expected_params\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nfrom ..definitions.resource_requirement import (\n    ResourceRequirement,\n    TypeLoaderResourceRequirement,\n)\n\nif TYPE_CHECKING:\n    from dagster._core.execution.context.system import (\n        DagsterTypeLoaderContext,\n    )\n\n\n
[docs]class DagsterTypeLoader(ABC):\n """Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\n to.\n\n The recommended way to define a type loader is with the\n :py:func:`@dagster_type_loader <dagster_type_loader>` decorator.\n """\n\n @property\n @abstractmethod\n def schema_type(self) -> ConfigType:\n pass\n\n @property\n def loader_version(self) -> Optional[str]:\n return None\n\n def compute_loaded_input_version(self, _config_value: object) -> Optional[str]:\n return None\n\n def construct_from_config_value(\n self, _context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n """How to create a runtime value from config data."""\n return config_value\n\n def required_resource_keys(self) -> AbstractSet[str]:\n return frozenset()\n\n def get_resource_requirements(\n self, outer_context: Optional[object] = None\n ) -> Iterator[ResourceRequirement]:\n type_display_name = cast(str, outer_context)\n for resource_key in sorted(list(self.required_resource_keys())):\n yield TypeLoaderResourceRequirement(\n key=resource_key, type_display_name=type_display_name\n )
\n\n\n@experimental_param(param="loader_version")\n@experimental_param(param="external_version_fn")\nclass DagsterTypeLoaderFromDecorator(DagsterTypeLoader):\n def __init__(\n self,\n config_type,\n func,\n required_resource_keys,\n loader_version=None,\n external_version_fn=None,\n ):\n self._config_type = check.inst_param(config_type, "config_type", ConfigType)\n self._func = check.callable_param(func, "func")\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys, "required_resource_keys", of_type=str\n )\n self._loader_version = check.opt_str_param(loader_version, "loader_version")\n self._external_version_fn = check.opt_callable_param(\n external_version_fn, "external_version_fn"\n )\n\n @property\n def schema_type(self) -> ConfigType:\n return self._config_type\n\n @property\n def loader_version(self) -> Optional[str]:\n return self._loader_version\n\n def compute_loaded_input_version(self, config_value: object) -> Optional[str]:\n """Compute the type-loaded input from a given config_value.\n\n Args:\n config_value (object): Config value to be ingested by the external version\n loading function.\n\n Returns:\n Optional[str]: Hash of concatenated loader version and external input version if both\n are provided, else None.\n """\n version = ""\n if self.loader_version:\n version += str(self.loader_version)\n if self._external_version_fn:\n ext_version = self._external_version_fn(config_value)\n version += str(ext_version)\n\n if version == "":\n return None # Sentinel value for no version provided.\n else:\n return hashlib.sha1(version.encode("utf-8")).hexdigest()\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ):\n return self._func(context, config_value)\n\n def required_resource_keys(self):\n return frozenset(self._required_resource_keys)\n\n\ndef _create_type_loader_for_decorator(\n config_type: ConfigType,\n func,\n required_resource_keys: Optional[AbstractSet[str]],\n 
loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n):\n return DagsterTypeLoaderFromDecorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n\nDagsterTypeLoaderFn: TypeAlias = Callable[["DagsterTypeLoaderContext", Any], Any]\n\n\n
[docs]def dagster_type_loader(\n config_schema: object,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n loader_version: Optional[str] = None,\n external_version_fn: Optional[Callable[[object], str]] = None,\n) -> Callable[[DagsterTypeLoaderFn], DagsterTypeLoaderFromDecorator]:\n """Create an dagster type loader that maps config data to a runtime value.\n\n The decorated function should take the execution context and parsed config value and return the\n appropriate runtime value.\n\n Args:\n config_schema (ConfigSchema): The schema for the config that's passed to the decorated\n function.\n loader_version (str): (Experimental) The version of the decorated compute function. Two\n loading functions should have the same version if and only if they deterministically\n produce the same outputs when provided the same inputs.\n external_version_fn (Callable): (Experimental) A function that takes in the same parameters as the loader\n function (config_value) and returns a representation of the version of the external\n asset (str). Two external assets with identical versions are treated as identical to one\n another.\n\n Examples:\n .. code-block:: python\n\n @dagster_type_loader(Permissive())\n def load_dict(_context, value):\n return value\n """\n from dagster._config import resolve_to_config_type\n\n config_type = resolve_to_config_type(config_schema)\n assert isinstance(\n config_type, ConfigType\n ), f"{config_schema} could not be resolved to config type"\n EXPECTED_POSITIONALS = ["context", "*"]\n\n def wrapper(func: DagsterTypeLoaderFn) -> DagsterTypeLoaderFromDecorator:\n params = get_function_params(func)\n missing_positional = validate_expected_params(params, EXPECTED_POSITIONALS)\n if missing_positional:\n raise DagsterInvalidDefinitionError(\n f"@dagster_type_loader '{func.__name__}' decorated function does not have required"\n f" positional parameter '{missing_positional}'. 
@dagster_type_loader decorated"\n " functions should only have keyword arguments that match input names and a first"\n " positional parameter named 'context'."\n )\n\n return _create_type_loader_for_decorator(\n config_type, func, required_resource_keys, loader_version, external_version_fn\n )\n\n return wrapper
\n
", "current_page_name": "_modules/dagster/_core/types/config_schema", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.config_schema"}, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.dagster_type

\nimport typing as t\nfrom abc import abstractmethod\nfrom enum import Enum as PythonEnum\nfrom functools import partial\nfrom typing import (\n    AbstractSet as TypingAbstractSet,\n    AnyStr,\n    Iterator as TypingIterator,\n    Mapping,\n    Optional as TypingOptional,\n    Sequence,\n    Type as TypingType,\n    cast,\n)\n\nfrom typing_extensions import get_args, get_origin\n\nimport dagster._check as check\nfrom dagster._annotations import public\nfrom dagster._builtins import BuiltinEnum\nfrom dagster._config import (\n    Array,\n    ConfigType,\n    Noneable as ConfigNoneable,\n)\nfrom dagster._core.definitions.events import DynamicOutput, Output, TypeCheck\nfrom dagster._core.definitions.metadata import (\n    MetadataValue,\n    RawMetadataValue,\n    normalize_metadata,\n)\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError\nfrom dagster._serdes import whitelist_for_serdes\nfrom dagster._seven import is_subclass\n\nfrom ..definitions.resource_requirement import (\n    RequiresResources,\n    ResourceRequirement,\n    TypeResourceRequirement,\n)\nfrom .builtin_config_schemas import BuiltinSchemas\nfrom .config_schema import DagsterTypeLoader\n\nif t.TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n    from dagster._core.execution.context.system import DagsterTypeLoaderContext, TypeCheckContext\n\nTypeCheckFn = t.Callable[["TypeCheckContext", AnyStr], t.Union[TypeCheck, bool]]\n\n\n@whitelist_for_serdes\nclass DagsterTypeKind(PythonEnum):\n    ANY = "ANY"\n    SCALAR = "SCALAR"\n    LIST = "LIST"\n    NOTHING = "NOTHING"\n    NULLABLE = "NULLABLE"\n    REGULAR = "REGULAR"\n\n\n
[docs]class DagsterType(RequiresResources):\n """Define a type in dagster. These can be used in the inputs and outputs of ops.\n\n Args:\n type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]):\n The function that defines the type check. It takes the value flowing\n through the input or output of the op. If it passes, return either\n ``True`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``True``. If it fails,\n return either ``False`` or a :py:class:`~dagster.TypeCheck` with ``success`` set to ``False``.\n The first argument must be named ``context`` (or, if unused, ``_``, ``_context``, or ``context_``).\n Use ``required_resource_keys`` for access to resources.\n key (Optional[str]): The unique key to identify types programmatically.\n The key property always has a value. If you omit key to the argument\n to the init function, it instead receives the value of ``name``. If\n neither ``key`` nor ``name`` is provided, a ``CheckError`` is thrown.\n\n In the case of a generic type such as ``List`` or ``Optional``, this is\n generated programmatically based on the type parameters.\n\n For most use cases, name should be set and the key argument should\n not be specified.\n name (Optional[str]): A unique name given by a user. If ``key`` is ``None``, ``key``\n becomes this value. Name is not given in a case where the user does\n not specify a unique name for this type, such as a generic class.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n required_resource_keys (Optional[Set[str]]): Resource keys required by the ``type_check_fn``.\n is_builtin (bool): Defaults to False. This is used by tools to display or\n filter built-in types (such as :py:class:`~dagster.String`, :py:class:`~dagster.Int`) to visually distinguish\n them from user-defined types. Meant for internal use.\n kind (DagsterTypeKind): Defaults to None. This is used to determine the kind of runtime type\n for InputDefinition and OutputDefinition type checking.\n typing_type: Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\n value contained within the DagsterType. Meant for internal use.\n """\n\n def __init__(\n self,\n type_check_fn: TypeCheckFn,\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n loader: t.Optional[DagsterTypeLoader] = None,\n required_resource_keys: t.Optional[t.Set[str]] = None,\n kind: DagsterTypeKind = DagsterTypeKind.REGULAR,\n typing_type: t.Any = t.Any,\n metadata: t.Optional[t.Mapping[str, RawMetadataValue]] = None,\n ):\n check.opt_str_param(key, "key")\n check.opt_str_param(name, "name")\n\n check.invariant(not (name is None and key is None), "Must set key or name")\n if name is None:\n key = check.not_none(\n key,\n "If name is not provided, must provide key.",\n )\n self.key, self._name = key, None\n elif key is None:\n name = check.not_none(\n name,\n "If key is not provided, must provide name.",\n )\n self.key, self._name = name, name\n else:\n check.invariant(key and name)\n self.key, self._name = key, name\n\n self._description = check.opt_str_param(description, "description")\n self._loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)\n\n self._required_resource_keys = check.opt_set_param(\n required_resource_keys,\n "required_resource_keys",\n 
)\n\n self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")\n _validate_type_check_fn(self._type_check_fn, self._name)\n\n self.is_builtin = check.bool_param(is_builtin, "is_builtin")\n check.invariant(\n self.display_name is not None,\n f"All types must have a valid display name, got None for key {key}",\n )\n\n self.kind = check.inst_param(kind, "kind", DagsterTypeKind)\n\n self._typing_type = typing_type\n\n self._metadata = normalize_metadata(\n check.opt_mapping_param(metadata, "metadata", key_type=str),\n )\n\n
[docs] @public\n def type_check(self, context: "TypeCheckContext", value: object) -> TypeCheck:\n """Type check the value against the type.\n\n Args:\n context (TypeCheckContext): The context of the type check.\n value (Any): The value to check.\n\n Returns:\n TypeCheck: The result of the type check.\n """\n retval = self._type_check_fn(context, value)\n\n if not isinstance(retval, (bool, TypeCheck)):\n raise DagsterInvariantViolationError(\n f"You have returned {retval!r} of type {type(retval)} from the type "\n f'check function of type "{self.key}". Return value must be instance '\n "of TypeCheck or a bool."\n )\n\n return TypeCheck(success=retval) if isinstance(retval, bool) else retval
\n\n def __eq__(self, other):\n return isinstance(other, DagsterType) and self.key == other.key\n\n def __ne__(self, other):\n return not self.__eq__(other)\n\n def __hash__(self):\n return hash(self.key)\n\n @staticmethod\n def from_builtin_enum(builtin_enum) -> "DagsterType":\n check.invariant(BuiltinEnum.contains(builtin_enum), "must be member of BuiltinEnum")\n return _RUNTIME_MAP[builtin_enum]\n\n @property\n def metadata(self) -> t.Mapping[str, MetadataValue]:\n return self._metadata\n\n @public\n @property\n def required_resource_keys(self) -> TypingAbstractSet[str]:\n """AbstractSet[str]: Set of resource keys required by the type check function."""\n return self._required_resource_keys\n\n @public\n @property\n def display_name(self) -> str:\n """Either the name or key (if name is `None`) of the type, overridden in many subclasses."""\n return cast(str, self._name or self.key)\n\n @public\n @property\n def unique_name(self) -> t.Optional[str]:\n """The unique name of this type. Can be None if the type is not unique, such as container types."""\n # TODO: docstring and body inconsistent-- can this be None or not?\n check.invariant(\n self._name is not None,\n f"unique_name requested but is None for type {self.display_name}",\n )\n return self._name\n\n @public\n @property\n def has_unique_name(self) -> bool:\n """bool: Whether the type has a unique name."""\n return self._name is not None\n\n @public\n @property\n def typing_type(self) -> t.Any:\n """Any: The python typing type for this type."""\n return self._typing_type\n\n @public\n @property\n def loader(self) -> t.Optional[DagsterTypeLoader]:\n """Optional[DagsterTypeLoader]: Loader for this type, if any."""\n return self._loader\n\n @public\n @property\n def description(self) -> t.Optional[str]:\n """Optional[str]: Description of the type, or None if not provided."""\n return self._description\n\n @property\n def inner_types(self) -> t.Sequence["DagsterType"]:\n return []\n\n @property\n def 
loader_schema_key(self) -> t.Optional[str]:\n return self.loader.schema_type.key if self.loader else None\n\n @property\n def type_param_keys(self) -> t.Sequence[str]:\n return []\n\n @property\n def is_nothing(self) -> bool:\n return self.kind == DagsterTypeKind.NOTHING\n\n @property\n def supports_fan_in(self) -> bool:\n return False\n\n def get_inner_type_for_fan_in(self) -> "DagsterType":\n check.failed(\n "DagsterType {name} does not support fan-in, should have checked supports_fan_in before"\n " calling getter.".format(name=self.display_name)\n )\n\n def get_resource_requirements(\n self, _outer_context: TypingOptional[object] = None\n ) -> TypingIterator[ResourceRequirement]:\n for resource_key in sorted(list(self.required_resource_keys)):\n yield TypeResourceRequirement(key=resource_key, type_display_name=self.display_name)\n if self.loader:\n yield from self.loader.get_resource_requirements(outer_context=self.display_name)
\n\n\ndef _validate_type_check_fn(fn: t.Callable, name: t.Optional[str]) -> bool:\n from dagster._seven import get_arg_names\n\n args = get_arg_names(fn)\n\n # py2 doesn't filter out self\n if len(args) >= 1 and args[0] == "self":\n args = args[1:]\n\n if len(args) == 2:\n possible_names = {\n "_",\n "context",\n "_context",\n "context_",\n }\n if args[0] not in possible_names:\n DagsterInvalidDefinitionError(\n f'type_check function on type "{name}" must have first '\n 'argument named "context" (or _, _context, context_).'\n )\n return True\n\n raise DagsterInvalidDefinitionError(\n f'type_check_fn argument on type "{name}" must take 2 arguments, received {len(args)}.'\n )\n\n\nclass BuiltinScalarDagsterType(DagsterType):\n def __init__(self, name: str, type_check_fn: TypeCheckFn, typing_type: t.Type, **kwargs):\n super(BuiltinScalarDagsterType, self).__init__(\n key=name,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=type_check_fn,\n is_builtin=True,\n typing_type=typing_type,\n **kwargs,\n )\n\n # This is passed to the constructor of subclasses as the argument `type_check_fn`-- that's why\n # it exists together with the `type_check_fn` arg.\n def type_check_fn(self, _context, value) -> TypeCheck:\n return self.type_check_scalar_value(value)\n\n @abstractmethod\n def type_check_scalar_value(self, _value) -> TypeCheck:\n raise NotImplementedError()\n\n\ndef _typemismatch_error_str(value: object, expected_type_desc: str) -> str:\n return 'Value "{value}" of python type "{python_type}" must be a {type_desc}.'.format(\n value=value, python_type=type(value).__name__, type_desc=expected_type_desc\n )\n\n\ndef _fail_if_not_of_type(\n value: object, value_type: t.Type[t.Any], value_type_desc: str\n) -> TypeCheck:\n if not isinstance(value, value_type):\n return TypeCheck(success=False, description=_typemismatch_error_str(value, value_type_desc))\n\n return TypeCheck(success=True)\n\n\nclass _Int(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Int, 
self).__init__(\n name="Int",\n loader=BuiltinSchemas.INT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=int,\n )\n\n def type_check_scalar_value(self, value) -> TypeCheck:\n return _fail_if_not_of_type(value, int, "int")\n\n\nclass _String(BuiltinScalarDagsterType):\n def __init__(self):\n super(_String, self).__init__(\n name="String",\n loader=BuiltinSchemas.STRING_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=str,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\nclass _Float(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Float, self).__init__(\n name="Float",\n loader=BuiltinSchemas.FLOAT_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=float,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, float, "float")\n\n\nclass _Bool(BuiltinScalarDagsterType):\n def __init__(self):\n super(_Bool, self).__init__(\n name="Bool",\n loader=BuiltinSchemas.BOOL_INPUT,\n type_check_fn=self.type_check_fn,\n typing_type=bool,\n )\n\n def type_check_scalar_value(self, value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, bool, "bool")\n\n\nclass Anyish(DagsterType):\n def __init__(\n self,\n key: t.Optional[str],\n name: t.Optional[str],\n loader: t.Optional[DagsterTypeLoader] = None,\n is_builtin: bool = False,\n description: t.Optional[str] = None,\n ):\n super(Anyish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.ANY,\n loader=loader,\n is_builtin=is_builtin,\n type_check_fn=self.type_check_method,\n description=description,\n typing_type=t.Any,\n )\n\n def type_check_method(self, _context: "TypeCheckContext", _value: object) -> TypeCheck:\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n # Anyish all the way down\n return self\n\n\nclass _Any(Anyish):\n def 
__init__(self):\n super(_Any, self).__init__(\n key="Any",\n name="Any",\n loader=BuiltinSchemas.ANY_INPUT,\n is_builtin=True,\n )\n\n\ndef create_any_type(\n name: str,\n loader: t.Optional[DagsterTypeLoader] = None,\n description: t.Optional[str] = None,\n) -> Anyish:\n return Anyish(\n key=name,\n name=name,\n description=description,\n loader=loader,\n )\n\n\nclass _Nothing(DagsterType):\n def __init__(self):\n super(_Nothing, self).__init__(\n key="Nothing",\n name="Nothing",\n kind=DagsterTypeKind.NOTHING,\n loader=None,\n type_check_fn=self.type_check_method,\n is_builtin=True,\n typing_type=type(None),\n )\n\n def type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n if value is not None:\n return TypeCheck(\n success=False,\n description=f"Value must be None, got a {type(value)}",\n )\n\n return TypeCheck(success=True)\n\n @property\n def supports_fan_in(self) -> bool:\n return True\n\n def get_inner_type_for_fan_in(self) -> DagsterType:\n return self\n\n\ndef isinstance_type_check_fn(\n expected_python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n dagster_type_name: str,\n expected_python_type_str: str,\n) -> TypeCheckFn:\n def type_check(_context: "TypeCheckContext", value: object) -> TypeCheck:\n if not isinstance(value, expected_python_type):\n return TypeCheck(\n success=False,\n description=(\n f"Value of type {type(value)} failed type check for Dagster type"\n f" {dagster_type_name}, expected value to be of Python type"\n f" {expected_python_type_str}."\n ),\n )\n\n return TypeCheck(success=True)\n\n return type_check\n\n\n
[docs]class PythonObjectDagsterType(DagsterType):\n """Define a type in dagster whose typecheck is an isinstance check.\n\n Specifically, the type can either be a single python type (e.g. int),\n or a tuple of types (e.g. (int, float)) which is treated as a union.\n\n Examples:\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=int)\n assert ntype.name == 'int'\n assert_success(ntype, 1)\n assert_failure(ntype, 'a')\n\n .. code-block:: python\n\n ntype = PythonObjectDagsterType(python_type=(int, float))\n assert ntype.name == 'Union[int, float]'\n assert_success(ntype, 1)\n assert_success(ntype, 1.5)\n assert_failure(ntype, 'a')\n\n\n Args:\n python_type (Union[Type, Tuple[Type, ...]): The dagster typecheck function calls instanceof on\n this type.\n name (Optional[str]): Name the type. Defaults to the name of ``python_type``.\n key (Optional[str]): Key of the type. Defaults to name.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. 
As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n """\n\n def __init__(\n self,\n python_type: t.Union[t.Type, t.Tuple[t.Type, ...]],\n key: t.Optional[str] = None,\n name: t.Optional[str] = None,\n **kwargs,\n ):\n if isinstance(python_type, tuple):\n self.python_type = check.tuple_param(\n python_type, "python_type", of_shape=tuple(type for item in python_type)\n )\n self.type_str = "Union[{}]".format(\n ", ".join(python_type.__name__ for python_type in python_type)\n )\n typing_type = t.Union[python_type] # type: ignore\n\n else:\n self.python_type = check.class_param(python_type, "python_type")\n self.type_str = cast(str, python_type.__name__)\n typing_type = self.python_type\n name = check.opt_str_param(name, "name", self.type_str)\n key = check.opt_str_param(key, "key", name)\n super(PythonObjectDagsterType, self).__init__(\n key=key,\n name=name,\n type_check_fn=isinstance_type_check_fn(python_type, name, self.type_str),\n typing_type=typing_type,\n **kwargs,\n )
\n\n\nclass NoneableInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type: DagsterType):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n self._inner_loader = check.not_none_param(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = ConfigNoneable(self._inner_loader.schema_type)\n\n @property\n def schema_type(self) -> ConfigType:\n return self._schema_type\n\n def construct_from_config_value(\n self, context: "DagsterTypeLoaderContext", config_value: object\n ) -> object:\n if config_value is None:\n return None\n return self._inner_loader.construct_from_config_value(context, config_value)\n\n\ndef _create_nullable_input_schema(inner_type: DagsterType) -> t.Optional[DagsterTypeLoader]:\n if not inner_type.loader:\n return None\n\n return NoneableInputSchema(inner_type)\n\n\nclass OptionalType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n inner_type = resolve_dagster_type(inner_type)\n\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError(\n "Type Nothing can not be wrapped in List or Optional"\n )\n\n key = "Optional." 
+ cast(str, inner_type.key)\n self.inner_type = inner_type\n super(OptionalType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.NULLABLE,\n type_check_fn=self.type_check_method,\n loader=_create_nullable_input_schema(inner_type),\n # This throws a type error with Py\n typing_type=t.Optional[inner_type.typing_type],\n )\n\n @property\n def display_name(self) -> str:\n return self.inner_type.display_name + "?"\n\n def type_check_method(self, context, value):\n return (\n TypeCheck(success=True) if value is None else self.inner_type.type_check(context, value)\n )\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return self.inner_type.supports_fan_in\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type.get_inner_type_for_fan_in()\n\n\nclass ListInputSchema(DagsterTypeLoader):\n def __init__(self, inner_dagster_type):\n self._inner_dagster_type = check.inst_param(\n inner_dagster_type, "inner_dagster_type", DagsterType\n )\n check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")\n self._schema_type = Array(inner_dagster_type.loader.schema_type)\n\n @property\n def schema_type(self):\n return self._schema_type\n\n def construct_from_config_value(self, context, config_value):\n convert_item = partial(self._inner_dagster_type.loader.construct_from_config_value, context)\n return list(map(convert_item, config_value))\n\n\ndef _create_list_input_schema(inner_type):\n if not inner_type.loader:\n return None\n\n return ListInputSchema(inner_type)\n\n\nclass ListType(DagsterType):\n def __init__(self, inner_type: DagsterType):\n key = "List." 
+ inner_type.key\n self.inner_type = inner_type\n super(ListType, self).__init__(\n key=key,\n name=None,\n kind=DagsterTypeKind.LIST,\n type_check_fn=self.type_check_method,\n loader=_create_list_input_schema(inner_type),\n typing_type=t.List[inner_type.typing_type],\n )\n\n @property\n def display_name(self):\n return "[" + self.inner_type.display_name + "]"\n\n def type_check_method(self, context, value):\n value_check = _fail_if_not_of_type(value, list, "list")\n if not value_check.success:\n return value_check\n\n for item in value:\n item_check = self.inner_type.type_check(context, item)\n if not item_check.success:\n return item_check\n\n return TypeCheck(success=True)\n\n @property\n def inner_types(self):\n return [self.inner_type] + self.inner_type.inner_types\n\n @property\n def type_param_keys(self):\n return [self.inner_type.key]\n\n @property\n def supports_fan_in(self):\n return True\n\n def get_inner_type_for_fan_in(self):\n return self.inner_type\n\n\nclass DagsterListApi:\n def __getitem__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(resolve_dagster_type(inner_type))\n\n def __call__(self, inner_type):\n check.not_none_param(inner_type, "inner_type")\n return _List(inner_type)\n\n\nList: DagsterListApi = DagsterListApi()\n\n\ndef _List(inner_type):\n check.inst_param(inner_type, "inner_type", DagsterType)\n if inner_type is Nothing:\n raise DagsterInvalidDefinitionError("Type Nothing can not be wrapped in List or Optional")\n return ListType(inner_type)\n\n\nclass Stringish(DagsterType):\n def __init__(self, key: t.Optional[str] = None, name: t.Optional[str] = None, **kwargs):\n name = check.opt_str_param(name, "name", type(self).__name__)\n key = check.opt_str_param(key, "key", name)\n super(Stringish, self).__init__(\n key=key,\n name=name,\n kind=DagsterTypeKind.SCALAR,\n type_check_fn=self.type_check_method,\n loader=BuiltinSchemas.STRING_INPUT,\n typing_type=str,\n **kwargs,\n )\n\n def 
type_check_method(self, _context: "TypeCheckContext", value: object) -> TypeCheck:\n return _fail_if_not_of_type(value, str, "string")\n\n\ndef create_string_type(name, description=None):\n return Stringish(name=name, key=name, description=description)\n\n\nAny = _Any()\nBool = _Bool()\nFloat = _Float()\nInt = _Int()\nString = _String()\nNothing = _Nothing()\n\n_RUNTIME_MAP = {\n BuiltinEnum.ANY: Any,\n BuiltinEnum.BOOL: Bool,\n BuiltinEnum.FLOAT: Float,\n BuiltinEnum.INT: Int,\n BuiltinEnum.STRING: String,\n BuiltinEnum.NOTHING: Nothing,\n}\n\n_PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY: t.Dict[type, DagsterType] = {}\n"""Python types corresponding to user-defined RunTime types created using @map_to_dagster_type or\nas_dagster_type are registered here so that we can remap the Python types to runtime types."""\n\n\n
[docs]def make_python_type_usable_as_dagster_type(\n python_type: TypingType[t.Any], dagster_type: DagsterType\n) -> None:\n """Take any existing python type and map it to a dagster type (generally created with\n :py:class:`DagsterType <dagster.DagsterType>`) This can only be called once\n on a given python type.\n """\n check.inst_param(python_type, "python_type", type)\n check.inst_param(dagster_type, "dagster_type", DagsterType)\n registered_dagster_type = _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY.get(python_type)\n\n if registered_dagster_type is None:\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n elif registered_dagster_type is not dagster_type:\n # This would be just a great place to insert a short URL pointing to the type system\n # documentation into the error message\n # https://github.com/dagster-io/dagster/issues/1831\n if isinstance(registered_dagster_type, TypeHintInferredDagsterType):\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f'{python_type}. The Dagster type was "auto-registered" - i.e. a solid definition '\n "used the Python type as an annotation for one of its arguments or for its return "\n "value before make_python_type_usable_as_dagster_type was called, and we "\n "generated a Dagster type to correspond to it. To override the auto-generated "\n "Dagster type, call make_python_type_usable_as_dagster_type before any solid "\n "definitions refer to the Python type."\n )\n else:\n raise DagsterInvalidDefinitionError(\n "A Dagster type has already been registered for the Python type "\n f"{python_type}. make_python_type_usable_as_dagster_type can only "\n "be called once on a python type as it is registering a 1:1 mapping "\n "between that python type and a dagster type."\n )
\n\n\nDAGSTER_INVALID_TYPE_ERROR_MESSAGE = (\n "Invalid type: dagster_type must be an instance of DagsterType or a Python type: "\n "got {dagster_type}{additional_msg}"\n)\n\n\nclass TypeHintInferredDagsterType(DagsterType):\n def __init__(self, python_type: t.Type):\n qualified_name = f"{python_type.__module__}.{python_type.__name__}"\n self.python_type = python_type\n super(TypeHintInferredDagsterType, self).__init__(\n key=f"_TypeHintInferred[{qualified_name}]",\n description=(\n f"DagsterType created from a type hint for the Python type {qualified_name}"\n ),\n type_check_fn=isinstance_type_check_fn(\n python_type, python_type.__name__, qualified_name\n ),\n typing_type=python_type,\n )\n\n @property\n def display_name(self) -> str:\n return self.python_type.__name__\n\n\ndef resolve_dagster_type(dagster_type: object) -> DagsterType:\n # circular dep\n from dagster._utils.typing_api import is_typing_type\n\n from ..definitions.result import MaterializeResult\n from .primitive_mapping import (\n is_supported_runtime_python_builtin,\n remap_python_builtin_for_runtime,\n )\n from .python_dict import (\n Dict as DDict,\n PythonDict,\n )\n from .python_set import DagsterSetApi, PythonSet\n from .python_tuple import DagsterTupleApi, PythonTuple\n from .transform_typing import transform_typing_type\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, DagsterType)),\n f"Do not pass runtime type classes. 
Got {dagster_type}",\n )\n\n # First, check to see if we're using Dagster's generic output type to do the type catching.\n if is_generic_output_annotation(dagster_type):\n type_args = get_args(dagster_type)\n # If no inner type was provided, forward Any type.\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif is_dynamic_output_annotation(dagster_type):\n dynamic_out_annotation = get_args(dagster_type)[0]\n type_args = get_args(dynamic_out_annotation)\n dagster_type = type_args[0] if len(type_args) == 1 else Any\n elif dagster_type == MaterializeResult:\n # convert MaterializeResult type annotation to Nothing until returning\n # scalar values via MaterializeResult is supported\n # https://github.com/dagster-io/dagster/issues/16887\n dagster_type = Nothing\n\n # Then, check to see if it is part of python's typing library\n if is_typing_type(dagster_type):\n dagster_type = transform_typing_type(dagster_type)\n if isinstance(dagster_type, DagsterType):\n return dagster_type\n\n # Test for unhashable objects -- this is if, for instance, someone has passed us an instance of\n # a dict where they meant to pass dict or Dict, etc.\n try:\n hash(dagster_type)\n except TypeError as e:\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n additional_msg=(\n ", which isn't hashable. 
Did you pass an instance of a type instead of "\n "the type?"\n ),\n dagster_type=str(dagster_type),\n )\n ) from e\n\n if BuiltinEnum.contains(dagster_type):\n return DagsterType.from_builtin_enum(dagster_type)\n\n if is_supported_runtime_python_builtin(dagster_type):\n return remap_python_builtin_for_runtime(dagster_type)\n\n if dagster_type is None:\n return Any\n\n if dagster_type is DDict:\n return PythonDict\n if isinstance(dagster_type, DagsterTupleApi):\n return PythonTuple\n if isinstance(dagster_type, DagsterSetApi):\n return PythonSet\n if isinstance(dagster_type, DagsterListApi):\n return List(Any)\n\n if isinstance(dagster_type, type):\n return resolve_python_type_to_dagster_type(dagster_type)\n\n raise DagsterInvalidDefinitionError(\n DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(\n dagster_type=str(dagster_type), additional_msg="."\n )\n )\n\n\ndef is_dynamic_output_annotation(dagster_type: object) -> bool:\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n "Cannot resolve a config type to a runtime type",\n )\n\n check.invariant(\n not (isinstance(dagster_type, type) and is_subclass(dagster_type, ConfigType)),\n f"Do not pass runtime type classes. Got {dagster_type}",\n )\n\n if dagster_type == DynamicOutput or get_origin(dagster_type) == DynamicOutput:\n raise DagsterInvariantViolationError(\n "Op annotated with return type DynamicOutput. DynamicOutputs can only be returned in"\n " the context of a List. 
If only one output is needed, use the Output API."\n )\n\n if get_origin(dagster_type) == list and len(get_args(dagster_type)) == 1:\n list_inner_type = get_args(dagster_type)[0]\n return list_inner_type == DynamicOutput or get_origin(list_inner_type) == DynamicOutput\n return False\n\n\ndef is_generic_output_annotation(dagster_type: object) -> bool:\n return dagster_type == Output or get_origin(dagster_type) == Output\n\n\ndef resolve_python_type_to_dagster_type(python_type: t.Type) -> DagsterType:\n """Resolves a Python type to a Dagster type."""\n check.inst_param(python_type, "python_type", type)\n\n if python_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:\n return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type]\n else:\n dagster_type = TypeHintInferredDagsterType(python_type)\n _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[python_type] = dagster_type\n return dagster_type\n\n\nALL_RUNTIME_BUILTINS = list(_RUNTIME_MAP.values())\n\n\ndef construct_dagster_type_dictionary(\n node_defs: Sequence["NodeDefinition"],\n) -> Mapping[str, DagsterType]:\n from dagster._core.definitions.graph_definition import GraphDefinition\n\n type_dict_by_name = {t.unique_name: t for t in ALL_RUNTIME_BUILTINS}\n type_dict_by_key = {t.key: t for t in ALL_RUNTIME_BUILTINS}\n\n def process_node_def(node_def: "NodeDefinition"):\n input_output_types = list(node_def.all_input_output_types())\n for dagster_type in input_output_types:\n # We don't do uniqueness check on key because with classes\n # like Array, Noneable, etc, those are ephemeral objects\n # and it is perfectly fine to have many of them.\n type_dict_by_key[dagster_type.key] = dagster_type\n\n if not dagster_type.has_unique_name:\n continue\n\n if dagster_type.unique_name not in type_dict_by_name:\n type_dict_by_name[dagster_type.unique_name] = dagster_type\n continue\n\n if type_dict_by_name[dagster_type.unique_name] is not dagster_type:\n raise DagsterInvalidDefinitionError(\n (\n 'You have created two 
dagster types with the same name "{type_name}". '\n "Dagster types have must have unique names."\n ).format(type_name=dagster_type.display_name)\n )\n\n if isinstance(node_def, GraphDefinition):\n for child_node_def in node_def.node_defs:\n process_node_def(child_node_def)\n\n for node_def in node_defs:\n process_node_def(node_def)\n\n return type_dict_by_key\n\n\nclass DagsterOptionalApi:\n def __getitem__(self, inner_type: t.Union[t.Type, DagsterType]) -> OptionalType:\n inner_type = resolve_dagster_type(check.not_none_param(inner_type, "inner_type"))\n return OptionalType(inner_type)\n\n\nOptional: DagsterOptionalApi = DagsterOptionalApi()\n
", "current_page_name": "_modules/dagster/_core/types/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.dagster_type"}, "decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._core.types.decorator

\nfrom typing import TYPE_CHECKING, Callable, Optional, Type, TypeVar, Union, overload\n\nimport dagster._check as check\n\nfrom .dagster_type import PythonObjectDagsterType, make_python_type_usable_as_dagster_type\n\nif TYPE_CHECKING:\n    from dagster._core.types.config_schema import DagsterTypeLoader\n\nT_Type = TypeVar("T_Type", bound=Type[object])\n\n\n@overload\ndef usable_as_dagster_type(\n    name: Optional[str] = ...,\n    description: Optional[str] = ...,\n    loader: Optional["DagsterTypeLoader"] = ...,\n) -> Callable[[T_Type], T_Type]: ...\n\n\n@overload\ndef usable_as_dagster_type(\n    name: T_Type,\n) -> T_Type: ...\n\n\n
[docs]def usable_as_dagster_type(\n name: Optional[Union[str, T_Type]] = None,\n description: Optional[str] = None,\n loader: Optional["DagsterTypeLoader"] = None,\n) -> Union[T_Type, Callable[[T_Type], T_Type]]:\n """Decorate a Python class to make it usable as a Dagster Type.\n\n This is intended to make it straightforward to annotate existing business logic classes to\n make them dagster types whose typecheck is an isinstance check against that python class.\n\n Args:\n python_type (cls): The python type to make usable as python type.\n name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of\n the ``python_type`` will be used.\n description (Optional[str]): A user-readable description of the type.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`DagsterTypeLoader` and can map config data to a value of\n this type. Specify this argument if you will need to shim values of this type using the\n config machinery. As a rule, you should use the\n :py:func:`@dagster_type_loader <dagster.dagster_type_loader>` decorator to construct\n these arguments.\n\n Examples:\n .. 
code-block:: python\n\n # dagster_aws.s3.file_manager.S3FileHandle\n @usable_as_dagster_type\n class S3FileHandle(FileHandle):\n def __init__(self, s3_bucket, s3_key):\n self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n self._s3_key = check.str_param(s3_key, 's3_key')\n\n @property\n def s3_bucket(self):\n return self._s3_bucket\n\n @property\n def s3_key(self):\n return self._s3_key\n\n @property\n def path_desc(self):\n return self.s3_path\n\n @property\n def s3_path(self):\n return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n """\n # check for no args, no parens case\n if isinstance(name, type):\n bare_cls = name # with no parens, name is actually the decorated class\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(python_type=bare_cls, name=bare_cls.__name__, description=None),\n )\n return bare_cls\n\n def _with_args(bare_cls: T_Type) -> T_Type:\n check.class_param(bare_cls, "bare_cls")\n new_name = check.opt_str_param(name, "name") if name else bare_cls.__name__\n\n make_python_type_usable_as_dagster_type(\n bare_cls,\n PythonObjectDagsterType(\n name=new_name,\n description=description,\n python_type=bare_cls,\n loader=loader,\n ),\n )\n return bare_cls\n\n return _with_args
\n
", "current_page_name": "_modules/dagster/_core/types/decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._core.types.decorator"}}}, "_serdes": {"config_class": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._serdes.config_class

\nimport importlib\nfrom abc import ABC, abstractmethod\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Dict,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Type,\n    TypeVar,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import Self\n\nimport dagster._check as check\nfrom dagster._utils import convert_dagster_submodule_name\nfrom dagster._utils.yaml_utils import load_run_config_yaml\n\nfrom .serdes import (\n    NamedTupleSerializer,\n    whitelist_for_serdes,\n)\n\nif TYPE_CHECKING:\n    from dagster._config.config_schema import UserConfigSchema\n\nT_ConfigurableClass = TypeVar("T_ConfigurableClass")\n\n\nclass ConfigurableClassDataSerializer(NamedTupleSerializer["ConfigurableClassData"]):\n    def after_pack(self, **packed: Any) -> Dict[str, Any]:\n        packed["module_name"] = convert_dagster_submodule_name(packed["module_name"], "public")\n        return packed\n\n\n
[docs]@whitelist_for_serdes(serializer=ConfigurableClassDataSerializer)\nclass ConfigurableClassData(\n NamedTuple(\n "_ConfigurableClassData",\n [\n ("module_name", str),\n ("class_name", str),\n ("config_yaml", str),\n ],\n )\n):\n """Serializable tuple describing where to find a class and the config fragment that should\n be used to instantiate it.\n\n Users should not instantiate this class directly.\n\n Classes intended to be serialized in this way should implement the\n :py:class:`dagster.serdes.ConfigurableClass` mixin.\n """\n\n def __new__(cls, module_name: str, class_name: str, config_yaml: str):\n return super(ConfigurableClassData, cls).__new__(\n cls,\n convert_dagster_submodule_name(check.str_param(module_name, "module_name"), "private"),\n check.str_param(class_name, "class_name"),\n check.str_param(config_yaml, "config_yaml"),\n )\n\n @property\n def config_dict(self) -> Mapping[str, Any]:\n return check.is_dict(load_run_config_yaml(self.config_yaml), key_type=str)\n\n def info_dict(self) -> Mapping[str, Any]:\n return {\n "module": self.module_name,\n "class": self.class_name,\n "config": self.config_dict,\n }\n\n @overload\n def rehydrate(self, as_type: Type[T_ConfigurableClass]) -> T_ConfigurableClass: ...\n\n @overload\n def rehydrate(self, as_type: None = ...) 
-> "ConfigurableClass": ...\n\n def rehydrate(\n self, as_type: Optional[Type[T_ConfigurableClass]] = None\n ) -> Union["ConfigurableClass", T_ConfigurableClass]:\n from dagster._config import process_config, resolve_to_config_type\n from dagster._core.errors import DagsterInvalidConfigError\n\n try:\n module = importlib.import_module(self.module_name)\n except ModuleNotFoundError:\n check.failed(\n f"Couldn't import module {self.module_name} when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n try:\n klass = getattr(module, self.class_name)\n except AttributeError:\n check.failed(\n f"Couldn't find class {self.class_name} in module when attempting to load the "\n f"configurable class {self.module_name}.{self.class_name}"\n )\n\n if not issubclass(klass, as_type or ConfigurableClass):\n raise check.CheckError(\n klass,\n f"class {self.class_name} in module {self.module_name}",\n ConfigurableClass,\n )\n\n config_dict = self.config_dict\n result = process_config(resolve_to_config_type(klass.config_type()), config_dict)\n if not result.success:\n raise DagsterInvalidConfigError(\n f"Errors whilst loading configuration for {klass.config_type()}.",\n result.errors,\n config_dict,\n )\n return klass.from_config_value(self, check.not_none(result.value))
\n\n\n
[docs]class ConfigurableClass(ABC):\n """Abstract mixin for classes that can be loaded from config.\n\n This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\n of conditional imports / optional extras_requires in dagster core and b) a magic directory or\n file in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\n run storage, pluggable with a config chunk like:\n\n .. code-block:: yaml\n\n run_storage:\n module: very_cool_package.run_storage\n class: SplendidRunStorage\n config:\n magic_word: "quux"\n\n This same pattern should eventually be viable for other system components, e.g. engines.\n\n The ``ConfigurableClass`` mixin provides the necessary hooks for classes to be instantiated from\n an instance of ``ConfigurableClassData``.\n\n Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\n type such as:\n\n .. code-block:: python\n\n {'module': str, 'class': str, 'config': Field(Permissive())}\n\n """\n\n @property\n @abstractmethod\n def inst_data(self) -> Optional[ConfigurableClassData]:\n """Subclass must be able to return the inst_data as a property if it has been constructed\n through the from_config_value code path.\n """\n\n @classmethod\n @abstractmethod\n def config_type(cls) -> "UserConfigSchema":\n """Get the config type against which to validate a config yaml fragment.\n\n The only place config values matching this type are used is inside `from_config_value`. This\n is an alternative constructor for a class. 
It is a common pattern for the config type to\n match constructor arguments, so `from_config_value`\n\n The config type against which to validate a config yaml fragment\n serialized in an instance of ``ConfigurableClassData``.\n """\n ...\n # We need to raise `NotImplementedError` here because nothing prevents abstract class\n # methods from being called.\n raise NotImplementedError(f"{cls.__name__} must implement the config_type classmethod")\n\n @classmethod\n @abstractmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n """Create an instance of the ConfigurableClass from a validated config value.\n\n The config value used here should be derived from the accompanying `inst_data` argument.\n `inst_data` contains the yaml-serialized config-- this must be parsed and\n validated/normalized, then passed to this method for object instantiation. This is done in\n ConfigurableClassData.rehydrate.\n\n Args:\n config_value (dict): The validated config value to use. Typically this should be the\n ``value`` attribute of a\n :py:class:`~dagster._core.types.evaluator.evaluation.EvaluateValueResult`.\n\n\n A common pattern is for the implementation to align the config_value with the signature\n of the ConfigurableClass's constructor:\n\n .. code-block:: python\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return MyConfigurableClass(inst_data=inst_data, **config_value)\n\n """
\n\n\ndef class_from_code_pointer(module_name: str, class_name: str) -> Type[object]:\n try:\n module = importlib.import_module(module_name)\n except ModuleNotFoundError:\n check.failed(\n "Couldn't import module {module_name} when attempting to load the class {klass}".format(\n module_name=module_name,\n klass=module_name + "." + class_name,\n )\n )\n try:\n return getattr(module, class_name)\n except AttributeError:\n check.failed(\n "Couldn't find class {class_name} in module when attempting to load the "\n "class {klass}".format(\n class_name=class_name,\n klass=module_name + "." + class_name,\n )\n )\n
", "current_page_name": "_modules/dagster/_serdes/config_class", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._serdes.config_class"}}, "_utils": {"alabaster_version": "0.7.13", "alert": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.alert

\nimport datetime\nimport smtplib\nimport ssl\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.sensor_definition import DefaultSensorStatus, SensorDefinition\nfrom dagster._core.errors import DagsterInvalidDefinitionError\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.graph_definition import GraphDefinition\n    from dagster._core.definitions.job_definition import JobDefinition\n    from dagster._core.definitions.run_status_sensor_definition import RunFailureSensorContext\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n    from dagster._core.definitions.unresolved_asset_job_definition import (\n        UnresolvedAssetJobDefinition,\n    )\n\n\ndef _default_failure_email_body(context: "RunFailureSensorContext") -> str:\n    from dagster._core.host_representation.external_data import DEFAULT_MODE_NAME\n\n    return "<br>".join(\n        [\n            f"Pipeline {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Mode: {DEFAULT_MODE_NAME}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\ndef _default_failure_email_subject(context) -> str:\n    return f"Dagster Run Failed: {context.pipeline_run.job_name}"\n\n\nEMAIL_MESSAGE = """From: {email_from}\nTo: {email_to}\nMIME-Version: 1.0\nContent-type: text/html\nSubject: {email_subject}\n\n{email_body}\n\n<!-- this ensures Gmail doesn't trim the email -->\n<span style="opacity: 0"> {randomness} </span>\n"""\n\n\ndef send_email_via_ssl(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP_SSL(smtp_host, smtp_port, context=context) as server:\n        server.login(email_from, email_password)\n        server.sendmail(email_from, 
email_to, message)\n\n\ndef send_email_via_starttls(\n    email_from: str,\n    email_password: str,\n    email_to: Sequence[str],\n    message: str,\n    smtp_host: str,\n    smtp_port: int,\n):\n    context = ssl.create_default_context()\n    with smtplib.SMTP(smtp_host, smtp_port) as server:\n        server.starttls(context=context)\n        server.login(email_from, email_password)\n        server.sendmail(email_from, email_to, message)\n\n\n
[docs]@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_email_on_run_failure_sensor(\n email_from: str,\n email_password: str,\n email_to: Sequence[str],\n email_body_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_body,\n email_subject_fn: Callable[["RunFailureSensorContext"], str] = _default_failure_email_subject,\n smtp_host: str = "smtp.gmail.com",\n smtp_type: str = "SSL",\n smtp_port: Optional[int] = None,\n name: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n "JobDefinition",\n "GraphDefinition",\n "UnresolvedAssetJobDefinition",\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n) -> SensorDefinition:\n """Create a job failure sensor that sends email via the SMTP protocol.\n\n Args:\n email_from (str): The sender email address to send the message from.\n email_password (str): The password of the sender.\n email_to (List[str]): The receipt email addresses to send the message to.\n email_body_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email body you want to send.\n Defaults to the plain text that contains error message, job name, and run ID.\n email_subject_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` outputs the email subject you want to send.\n Defaults to "Dagster Run Failed: <job_name>".\n smtp_host (str): The hostname of the SMTP server. 
Defaults to "smtp.gmail.com".\n smtp_type (str): The protocol; either "SSL" or "STARTTLS". Defaults to SSL.\n smtp_port (Optional[int]): The SMTP port. Defaults to 465 for SSL, 587 for STARTTLS.\n name: (Optional[str]): The name of the sensor. Defaults to "email_on_job_failure".\n webserver_base_url: (Optional[str]): The base url of your dagster-webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n The jobs that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails. To monitor jobs in external repositories,\n use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]):\n (deprecated in favor of monitored_jobs) The jobs that will be monitored by this failure\n sensor. Defaults to None, which means the alert will be sent when any job in the repository fails.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from the Dagster UI or via the GraphQL API.\n\n Examples:\n .. code-block:: python\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n )\n\n @repository\n def my_repo():\n return [my_job + email_on_run_failure]\n\n .. 
code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.pipeline_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n email_on_run_failure = make_email_on_run_failure_sensor(\n email_from="no-reply@example.com",\n email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n email_to=["xxx@example.com"],\n email_body_fn=my_message_fn,\n email_subject_fn=lambda _: "Dagster Alert",\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n from dagster._core.definitions.run_status_sensor_definition import (\n RunFailureSensorContext,\n run_failure_sensor,\n )\n\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n monitored_jobs=jobs,\n default_status=default_status,\n monitor_all_repositories=monitor_all_repositories,\n )\n def email_on_run_failure(context: RunFailureSensorContext):\n email_body = email_body_fn(context)\n if webserver_base_url:\n email_body += (\n f'<p><a href="{webserver_base_url}/runs/{context.dagster_run.run_id}">View in'\n " the Dagster UI</a></p>"\n )\n\n message = EMAIL_MESSAGE.format(\n email_to=",".join(email_to),\n email_from=email_from,\n email_subject=email_subject_fn(context),\n email_body=email_body,\n randomness=datetime.datetime.now(),\n )\n\n if smtp_type == "SSL":\n send_email_via_ssl(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 465\n )\n elif smtp_type == "STARTTLS":\n send_email_via_starttls(\n email_from, email_password, email_to, message, smtp_host, smtp_port=smtp_port or 587\n )\n else:\n raise DagsterInvalidDefinitionError(f'smtp_type "{smtp_type}" is not supported.')\n\n return email_on_run_failure
\n
", "current_page_name": "_modules/dagster/_utils/alert", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.alert"}, "body": "

Source code for dagster._utils

\nimport _thread as thread\nimport contextlib\nimport contextvars\nimport datetime\nimport errno\nimport functools\nimport inspect\nimport multiprocessing\nimport os\nimport re\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport tempfile\nimport threading\nimport time\nfrom collections import OrderedDict\nfrom datetime import timezone\nfrom enum import Enum\nfrom signal import Signals\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Callable,\n    ContextManager,\n    Dict,\n    Generator,\n    Generic,\n    Hashable,\n    Iterator,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Type,\n    TypeVar,\n    Union,\n    cast,\n    overload,\n)\n\nimport packaging.version\nfrom typing_extensions import Literal, TypeAlias, TypeGuard\n\nimport dagster._check as check\nimport dagster._seven as seven\n\nfrom .internal_init import IHasInternalInit as IHasInternalInit\n\nif sys.version_info > (3,):\n    from pathlib import Path\nelse:\n    from pathlib2 import Path\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.definitions_class import Definitions\n    from dagster._core.definitions.repository_definition.repository_definition import (\n        RepositoryDefinition,\n    )\n    from dagster._core.events import DagsterEvent\n\nK = TypeVar("K")\nT = TypeVar("T")\nU = TypeVar("U")\nV = TypeVar("V")\n\nEPOCH = datetime.datetime.utcfromtimestamp(0)\n\nPICKLE_PROTOCOL = 4\n\n\nDEFAULT_WORKSPACE_YAML_FILENAME = "workspace.yaml"\n\nPrintFn: TypeAlias = Callable[[Any], None]\n\nSingleInstigatorDebugCrashFlags: TypeAlias = Mapping[str, int]\nDebugCrashFlags: TypeAlias = Mapping[str, SingleInstigatorDebugCrashFlags]\n\n\n# Use this to get the "library version" (pre-1.0 version) from the "core version" (post 1.0\n# version). 
16 is from the 0.16.0 that library versions stayed on when core went to 1.0.0.\ndef library_version_from_core_version(core_version: str) -> str:\n    parsed_version = parse_package_version(core_version)\n\n    release = parsed_version.release\n    if release[0] >= 1:\n        library_version = ".".join(["0", str(16 + release[1]), str(release[2])])\n\n        if parsed_version.is_prerelease:\n            library_version = library_version + "".join(\n                [str(pre) for pre in check.not_none(parsed_version.pre)]\n            )\n\n        if parsed_version.is_postrelease:\n            library_version = library_version + "post" + str(parsed_version.post)\n\n        return library_version\n    else:\n        return core_version\n\n\ndef parse_package_version(version_str: str) -> packaging.version.Version:\n    parsed_version = packaging.version.parse(version_str)\n    assert isinstance(parsed_version, packaging.version.Version)\n    return parsed_version\n\n\ndef convert_dagster_submodule_name(name: str, mode: Literal["private", "public"]) -> str:\n    """This function was introduced when all Dagster submodules were marked private by\n    underscore-prefixing the root submodules (e.g. `dagster._core`). The function provides\n    backcompatibility by converting modules between the old and new (i.e. public and private) forms.\n    This is needed when reading older data or communicating with older versions of Dagster.\n    """\n    if mode == "private":\n        return re.sub(r"^dagster\\.([^_])", r"dagster._\\1", name)\n    elif mode == "public":\n        return re.sub(r"^dagster._", "dagster.", name)\n    else:\n        check.failed("`mode` must be 'private' or 'public'")\n\n\n
[docs]def file_relative_path(dunderfile: str, relative_path: str) -> str:\n """Get a path relative to the currently executing Python file.\n\n This function is useful when one needs to load a file that is relative to the position of\n the current file. (Such as when you encode a configuration file path in source file and want\n in runnable in any current working directory)\n\n Args:\n dunderfile (str): Should always be ``__file__``.\n relative_path (str): Path to get relative to the currently executing file.\n\n **Examples**:\n\n .. code-block:: python\n\n file_relative_path(__file__, 'path/relative/to/file')\n\n """\n check.str_param(dunderfile, "dunderfile")\n check.str_param(relative_path, "relative_path")\n\n return os.path.join(os.path.dirname(dunderfile), relative_path)
\n\n\ndef script_relative_path(file_path: str) -> str:\n """Useful for testing with local files. Use a path relative to where the\n test resides and this function will return the absolute path\n of that file. Otherwise it will be relative to script that\n ran the test.\n\n Note: this is function is very, very expensive (on the order of 1\n millisecond per invocation) so this should only be used in performance\n insensitive contexts. Prefer file_relative_path for anything with\n performance constraints.\n\n """\n # from http://bit.ly/2snyC6s\n\n check.str_param(file_path, "file_path")\n scriptdir = inspect.stack()[1][1]\n return os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path))\n\n\n# Adapted from https://github.com/okunishinishi/python-stringcase/blob/master/stringcase.py\ndef camelcase(string: str) -> str:\n check.str_param(string, "string")\n\n string = re.sub(r"^[\\-_\\.]", "", str(string))\n if not string:\n return string\n return str(string[0]).upper() + re.sub(\n r"[\\-_\\.\\s]([a-z])", lambda matched: str(matched.group(1)).upper(), string[1:]\n )\n\n\ndef ensure_single_item(ddict: Mapping[T, U]) -> Tuple[T, U]:\n check.mapping_param(ddict, "ddict")\n check.param_invariant(len(ddict) == 1, "ddict", "Expected dict with single item")\n return next(iter(ddict.items()))\n\n\n@contextlib.contextmanager\ndef pushd(path: str) -> Iterator[str]:\n old_cwd = os.getcwd()\n os.chdir(path)\n try:\n yield path\n finally:\n os.chdir(old_cwd)\n\n\ndef safe_isfile(path: str) -> bool:\n """Backport of Python 3.8 os.path.isfile behavior.\n\n This is intended to backport https://docs.python.org/dev/whatsnew/3.8.html#os-path. I'm not\n sure that there are other ways to provoke this behavior on Unix other than the null byte,\n but there are certainly other ways to do it on Windows. 
Afaict, we won't mask other\n ValueErrors, and the behavior in the status quo ante is rough because we risk throwing an\n unexpected, uncaught ValueError from very deep in our logic.\n """\n try:\n return os.path.isfile(path)\n except ValueError:\n return False\n\n\ndef mkdir_p(path: str) -> str:\n try:\n os.makedirs(path)\n return path\n except OSError as exc: # Python >2.5\n if exc.errno == errno.EEXIST and os.path.isdir(path):\n return path\n else:\n raise\n\n\ndef hash_collection(\n collection: Union[\n Mapping[Hashable, Any], Sequence[Any], AbstractSet[Any], Tuple[Any, ...], NamedTuple\n ]\n) -> int:\n """Hash a mutable collection or immutable collection containing mutable elements.\n\n This is useful for hashing Dagster-specific NamedTuples that contain mutable lists or dicts.\n The default NamedTuple __hash__ function assumes the contents of the NamedTuple are themselves\n hashable, and will throw an error if they are not. This can occur when trying to e.g. compute a\n cache key for the tuple for use with `lru_cache`.\n\n This alternative implementation will recursively process collection elements to convert basic\n lists and dicts to tuples prior to hashing. It is recommended to cache the result:\n\n Example:\n .. 
code-block:: python\n\n def __hash__(self):\n if not hasattr(self, '_hash'):\n self._hash = hash_named_tuple(self)\n return self._hash\n """\n assert isinstance(\n collection, (list, dict, set, tuple)\n ), f"Cannot hash collection of type {type(collection)}"\n return hash(make_hashable(collection))\n\n\n@overload\ndef make_hashable(value: Union[List[Any], Set[Any]]) -> Tuple[Any, ...]: ...\n\n\n@overload\ndef make_hashable(value: Dict[Any, Any]) -> Tuple[Tuple[Any, Any]]: ...\n\n\n@overload\ndef make_hashable(value: Any) -> Any: ...\n\n\ndef make_hashable(value: Any) -> Any:\n if isinstance(value, dict):\n return tuple(sorted((key, make_hashable(value)) for key, value in value.items()))\n elif isinstance(value, (list, tuple, set)):\n return tuple([make_hashable(x) for x in value])\n else:\n return value\n\n\ndef get_prop_or_key(elem, key):\n if isinstance(elem, Mapping):\n return elem.get(key)\n else:\n return getattr(elem, key)\n\n\ndef list_pull(alist, key):\n return list(map(lambda elem: get_prop_or_key(elem, key), alist))\n\n\ndef all_none(kwargs):\n for value in kwargs.values():\n if value is not None:\n return False\n return True\n\n\ndef check_script(path, return_code=0):\n try:\n subprocess.check_output([sys.executable, path])\n except subprocess.CalledProcessError as exc:\n if return_code != 0:\n if exc.returncode == return_code:\n return\n raise\n\n\ndef check_cli_execute_file_job(path, pipeline_fn_name, env_file=None):\n from dagster._core.test_utils import instance_for_test\n\n with instance_for_test():\n cli_cmd = [\n sys.executable,\n "-m",\n "dagster",\n "pipeline",\n "execute",\n "-f",\n path,\n "-a",\n pipeline_fn_name,\n ]\n\n if env_file:\n cli_cmd.append("-c")\n cli_cmd.append(env_file)\n\n try:\n subprocess.check_output(cli_cmd)\n except subprocess.CalledProcessError as cpe:\n print(cpe) # noqa: T201\n raise cpe\n\n\ndef safe_tempfile_path_unmanaged() -> str:\n # This gets a valid temporary file path in the safest possible way, although there 
is still no\n # guarantee that another process will not create a file at this path. The NamedTemporaryFile is\n # deleted when the context manager exits and the file object is closed.\n #\n # This is preferable to using NamedTemporaryFile as a context manager and passing the name\n # attribute of the file object around because NamedTemporaryFiles cannot be opened a second time\n # if already open on Windows NT or later:\n # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile\n # https://github.com/dagster-io/dagster/issues/1582\n with tempfile.NamedTemporaryFile() as fd:\n path = fd.name\n return Path(path).as_posix()\n\n\n@contextlib.contextmanager\ndef safe_tempfile_path() -> Iterator[str]:\n path = None\n try:\n path = safe_tempfile_path_unmanaged()\n yield path\n finally:\n if path is not None and os.path.exists(path):\n os.unlink(path)\n\n\n@overload\ndef ensure_gen(thing_or_gen: Generator[T, Any, Any]) -> Generator[T, Any, Any]:\n pass\n\n\n@overload\ndef ensure_gen(thing_or_gen: T) -> Generator[T, Any, Any]:\n pass\n\n\ndef ensure_gen(\n thing_or_gen: Union[T, Iterator[T], Generator[T, Any, Any]]\n) -> Generator[T, Any, Any]:\n if not inspect.isgenerator(thing_or_gen):\n thing_or_gen = cast(T, thing_or_gen)\n\n def _gen_thing():\n yield thing_or_gen\n\n return _gen_thing()\n\n return thing_or_gen\n\n\ndef ensure_dir(file_path: str) -> str:\n try:\n os.makedirs(file_path)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise\n return file_path\n\n\ndef ensure_file(path: str) -> str:\n ensure_dir(os.path.dirname(path))\n if not os.path.exists(path):\n touch_file(path)\n return path\n\n\ndef touch_file(path):\n ensure_dir(os.path.dirname(path))\n with open(path, "a", encoding="utf8"):\n os.utime(path, None)\n\n\ndef _kill_on_event(termination_event):\n termination_event.wait()\n send_interrupt()\n\n\ndef send_interrupt():\n if seven.IS_WINDOWS:\n # This will raise a KeyboardInterrupt in python land - meaning this wont be able 
to\n # interrupt things like sleep()\n thread.interrupt_main()\n else:\n # If on unix send an os level signal to interrupt any situation we may be stuck in\n os.kill(os.getpid(), signal.SIGINT)\n\n\n# Function to be invoked by daemon thread in processes which seek to be cancellable.\n# The motivation for this approach is to be able to exit cleanly on Windows. An alternative\n# path is to change how the processes are opened and send CTRL_BREAK signals, which at\n# the time of authoring seemed a more costly approach.\n#\n# Reading for the curious:\n# * https://stackoverflow.com/questions/35772001/how-to-handle-the-signal-in-python-on-windows-machine\n# * https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/\ndef start_termination_thread(termination_event):\n check.inst_param(termination_event, "termination_event", ttype=type(multiprocessing.Event()))\n\n int_thread = threading.Thread(\n target=_kill_on_event, args=(termination_event,), name="kill-on-event"\n )\n int_thread.daemon = True\n int_thread.start()\n\n\n# Executes the next() function within an instance of the supplied context manager class\n# (leaving the context before yielding each result)\ndef iterate_with_context(\n context_fn: Callable[[], ContextManager[Any]], iterator: Iterator[T]\n) -> Iterator[T]:\n while True:\n # Allow interrupts during user code so that we can terminate slow/hanging steps\n with context_fn():\n try:\n next_output = next(iterator)\n except StopIteration:\n return\n\n yield next_output\n\n\ndef datetime_as_float(dt: datetime.datetime) -> float:\n check.inst_param(dt, "dt", datetime.datetime)\n return float((dt - EPOCH).total_seconds())\n\n\nT_GeneratedContext = TypeVar("T_GeneratedContext")\n\n\nclass EventGenerationManager(Generic[T_GeneratedContext]):\n """Utility class that wraps an event generator function, that also yields a single instance of\n a typed object. 
All events yielded before the typed object are yielded through the method\n `generate_setup_events` and all events yielded after the typed object are yielded through the\n method `generate_teardown_events`.\n\n This is used to help replace the context managers used in pipeline initialization with\n generators so that we can begin emitting initialization events AND construct a pipeline context\n object, while managing explicit setup/teardown.\n\n This does require calling `generate_setup_events` AND `generate_teardown_events` in order to\n get the typed object.\n """\n\n def __init__(\n self,\n generator: Iterator[Union["DagsterEvent", T_GeneratedContext]],\n object_cls: Type[T_GeneratedContext],\n require_object: Optional[bool] = True,\n ):\n self.generator = check.generator(generator)\n self.object_cls: Type[T_GeneratedContext] = check.class_param(object_cls, "object_cls")\n self.require_object = check.bool_param(require_object, "require_object")\n self.object: Optional[T_GeneratedContext] = None\n self.did_setup = False\n self.did_teardown = False\n\n def generate_setup_events(self) -> Iterator["DagsterEvent"]:\n self.did_setup = True\n try:\n while self.object is None:\n obj = next(self.generator)\n if isinstance(obj, self.object_cls):\n self.object = obj\n else:\n yield obj\n except StopIteration:\n if self.require_object:\n check.inst_param(\n self.object,\n "self.object",\n self.object_cls,\n f"generator never yielded object of type {self.object_cls.__name__}",\n )\n\n def get_object(self) -> T_GeneratedContext:\n if not self.did_setup:\n check.failed("Called `get_object` before `generate_setup_events`")\n return cast(T_GeneratedContext, self.object)\n\n def generate_teardown_events(self) -> Iterator["DagsterEvent"]:\n self.did_teardown = True\n if self.object:\n yield from self.generator\n\n\ndef utc_datetime_from_timestamp(timestamp: float) -> datetime.datetime:\n tz = timezone.utc\n return datetime.datetime.fromtimestamp(timestamp, tz=tz)\n\n\ndef 
utc_datetime_from_naive(dt: datetime.datetime) -> datetime.datetime:\n tz = timezone.utc\n return dt.replace(tzinfo=tz)\n\n\ndef is_enum_value(value: object) -> bool:\n return False if value is None else issubclass(value.__class__, Enum)\n\n\ndef git_repository_root() -> str:\n return subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()\n\n\ndef segfault() -> None:\n """Reliable cross-Python version segfault.\n\n https://bugs.python.org/issue1215#msg143236\n """\n import ctypes\n\n ctypes.string_at(0)\n\n\ndef find_free_port() -> int:\n with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:\n s.bind(("", 0))\n s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n return s.getsockname()[1]\n\n\ndef is_port_in_use(host, port) -> bool:\n # Similar to the socket options that uvicorn uses to bind ports:\n # https://github.com/encode/uvicorn/blob/62f19c1c39929c84968712c371c9b7b96a041dec/uvicorn/config.py#L565-L566\n sock = socket.socket(family=socket.AF_INET)\n sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n try:\n sock.bind((host, port))\n return False\n except socket.error as e:\n return e.errno == errno.EADDRINUSE\n finally:\n sock.close()\n\n\n@contextlib.contextmanager\ndef alter_sys_path(to_add: Sequence[str], to_remove: Sequence[str]) -> Iterator[None]:\n to_restore = [path for path in sys.path]\n\n # remove paths\n for path in to_remove:\n if path in sys.path:\n sys.path.remove(path)\n\n # add paths\n for path in to_add:\n sys.path.insert(0, path)\n\n try:\n yield\n finally:\n sys.path = to_restore\n\n\n@contextlib.contextmanager\ndef restore_sys_modules() -> Iterator[None]:\n sys_modules = {k: v for k, v in sys.modules.items()}\n try:\n yield\n finally:\n to_delete = set(sys.modules) - set(sys_modules)\n for key in to_delete:\n del sys.modules[key]\n\n\ndef process_is_alive(pid: int) -> bool:\n if seven.IS_WINDOWS:\n import psutil\n\n return psutil.pid_exists(pid=pid)\n else:\n try:\n 
subprocess.check_output(["ps", str(pid)])\n except subprocess.CalledProcessError as exc:\n assert exc.returncode == 1\n return False\n return True\n\n\ndef compose(*args):\n """Compose python functions args such that compose(f, g)(x) is equivalent to f(g(x)).""" # noqa: D402\n # reduce using functional composition over all the arguments, with the identity function as\n # initializer\n return functools.reduce(lambda f, g: lambda x: f(g(x)), args, lambda x: x)\n\n\ndef dict_without_keys(ddict, *keys):\n return {key: value for key, value in ddict.items() if key not in set(keys)}\n\n\nclass Counter:\n def __init__(self):\n self._lock = threading.Lock()\n self._counts = OrderedDict()\n super(Counter, self).__init__()\n\n def increment(self, key: str):\n with self._lock:\n self._counts[key] = self._counts.get(key, 0) + 1\n\n def counts(self) -> Mapping[str, int]:\n with self._lock:\n copy = {k: v for k, v in self._counts.items()}\n return copy\n\n\ntraced_counter = contextvars.ContextVar("traced_counts", default=Counter())\n\nT_Callable = TypeVar("T_Callable", bound=Callable)\n\n\ndef traced(func: T_Callable) -> T_Callable:\n """A decorator that keeps track of how many times a function is called."""\n\n @functools.wraps(func)\n def inner(*args, **kwargs):\n counter = traced_counter.get()\n if counter and isinstance(counter, Counter):\n counter.increment(func.__qualname__)\n\n return func(*args, **kwargs)\n\n return cast(T_Callable, inner)\n\n\ndef get_terminate_signal():\n if sys.platform == "win32":\n return signal.SIGTERM\n return signal.SIGKILL\n\n\ndef get_run_crash_explanation(prefix: str, exit_code: int):\n # As per https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess.returncode\n # negative exit code means a posix signal\n if exit_code < 0 and -exit_code in [signal.value for signal in Signals]:\n posix_signal = -exit_code\n signal_str = Signals(posix_signal).name\n exit_clause = f"was terminated by signal {posix_signal} ({signal_str})."\n 
if posix_signal == get_terminate_signal():\n exit_clause = (\n exit_clause\n + " This usually indicates that the process was"\n " killed by the operating system due to running out of"\n " memory. Possible solutions include increasing the"\n " amount of memory available to the run, reducing"\n " the amount of memory used by the ops in the run, or"\n " configuring the executor to run fewer ops concurrently."\n )\n else:\n exit_clause = f"unexpectedly exited with code {exit_code}."\n\n return prefix + " " + exit_clause\n\n\ndef last_file_comp(path: str) -> str:\n return os.path.basename(os.path.normpath(path))\n\n\ndef is_named_tuple_instance(obj: object) -> TypeGuard[NamedTuple]:\n return isinstance(obj, tuple) and hasattr(obj, "_fields")\n\n\ndef is_named_tuple_subclass(klass: Type[object]) -> TypeGuard[Type[NamedTuple]]:\n return isinstance(klass, type) and issubclass(klass, tuple) and hasattr(klass, "_fields")\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[True] = ...,\n) -> "RepositoryDefinition": ...\n\n\n@overload\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = ...,\n repository: Optional["RepositoryDefinition"] = ...,\n error_on_none: Literal[False] = ...,\n) -> Optional["RepositoryDefinition"]: ...\n\n\ndef normalize_to_repository(\n definitions_or_repository: Optional[Union["Definitions", "RepositoryDefinition"]] = None,\n repository: Optional["RepositoryDefinition"] = None,\n error_on_none: bool = True,\n) -> Optional["RepositoryDefinition"]:\n """Normalizes the arguments that take a RepositoryDefinition or Definitions object to a\n RepositoryDefinition.\n\n This is intended to handle both the case where a single argument takes a\n `Union[RepositoryDefinition, Definitions]` or separate keyword arguments accept\n 
`RepositoryDefinition` or `Definitions`.\n """\n from dagster._core.definitions.definitions_class import Definitions\n\n if (definitions_or_repository and repository) or (\n error_on_none and not (definitions_or_repository or repository)\n ):\n check.failed("Exactly one of `definitions` or `repository_def` must be provided.")\n elif isinstance(definitions_or_repository, Definitions):\n return definitions_or_repository.get_repository_def()\n elif definitions_or_repository:\n return definitions_or_repository\n elif repository:\n return repository\n else:\n return None\n\n\ndef xor(a, b):\n return bool(a) != bool(b)\n\n\ndef tail_file(path_or_fd: Union[str, int], should_stop: Callable[[], bool]) -> Iterator[str]:\n with open(path_or_fd, "r") as output_stream:\n while True:\n line = output_stream.readline()\n if line:\n yield line\n elif should_stop():\n break\n else:\n time.sleep(0.01)\n
", "current_page_name": "_modules/dagster/_utils", "customsidebar": null, "dagster_type": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.dagster_type

\nfrom typing import Any\n\nfrom dagster._core.definitions.events import Failure, TypeCheck\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.execution.api import create_execution_plan\nfrom dagster._core.execution.context_creation_job import scoped_job_context\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.types.dagster_type import resolve_dagster_type\n\nfrom .typing_api import is_typing_type\n\n\n
[docs]def check_dagster_type(dagster_type: Any, value: Any) -> TypeCheck:\n """Test a custom Dagster type.\n\n Args:\n dagster_type (Any): The Dagster type to test. Should be one of the\n :ref:`built-in types <builtin>`, a dagster type explicitly constructed with\n :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or\n :py:func:`PythonObjectDagsterType`, or a Python type.\n value (Any): The runtime value to test.\n\n Returns:\n TypeCheck: The result of the type check.\n\n\n Examples:\n .. code-block:: python\n\n assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n """\n if is_typing_type(dagster_type):\n raise DagsterInvariantViolationError(\n f"Must pass in a type from dagster module. You passed {dagster_type} "\n "which is part of python's typing module."\n )\n\n dagster_type = resolve_dagster_type(dagster_type)\n\n job = InMemoryJob(GraphDefinition(node_defs=[], name="empty").to_job())\n job_def = job.get_definition()\n\n instance = DagsterInstance.ephemeral()\n execution_plan = create_execution_plan(job)\n dagster_run = instance.create_run_for_job(job_def)\n with scoped_job_context(execution_plan, job, {}, dagster_run, instance) as context:\n type_check_context = context.for_type(dagster_type)\n try:\n type_check = dagster_type.type_check(type_check_context, value)\n except Failure as failure:\n return TypeCheck(success=False, description=failure.description)\n\n if not isinstance(type_check, TypeCheck):\n raise DagsterInvariantViolationError(\n "Type checks can only return TypeCheck. Type {type_name} returned {value}.".format(\n type_name=dagster_type.display_name, value=repr(type_check)\n )\n )\n return type_check
\n
", "current_page_name": "_modules/dagster/_utils/dagster_type", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.dagster_type"}, "favicon_url": null, "forked_pdb": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.forked_pdb

\nimport pdb\nimport sys\n\n\n# From https://stackoverflow.com/questions/4716533/how-to-attach-debugger-to-a-python-subproccess\n
[docs]class ForkedPdb(pdb.Pdb):\n """A pdb subclass that may be used from a forked multiprocessing child.\n\n **Examples**:\n\n .. code-block:: python\n\n from dagster._utils.forked_pdb import ForkedPdb\n\n @solid\n def complex_solid(_):\n # some complicated stuff\n\n ForkedPdb().set_trace()\n\n # some other complicated stuff\n\n You can initiate pipeline execution via the webserver and use the pdb debugger to examine/step through\n execution at the breakpoint.\n """\n\n def interaction(self, frame, traceback):\n _stdin = sys.stdin\n try:\n sys.stdin = open("/dev/stdin", encoding="utf8")\n pdb.Pdb.interaction(self, frame, traceback)\n finally:\n sys.stdin = _stdin
\n
", "current_page_name": "_modules/dagster/_utils/forked_pdb", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.forked_pdb"}, "log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.log

\nimport copy\nimport logging\nimport sys\nimport traceback\nfrom typing import Mapping, NamedTuple, Optional\n\nimport coloredlogs\n\nimport dagster._check as check\nimport dagster._seven as seven\nfrom dagster._annotations import deprecated\nfrom dagster._config import Enum, EnumValue\nfrom dagster._core.definitions.logger_definition import logger\nfrom dagster._core.utils import PYTHON_LOGGING_LEVELS_MAPPING, coerce_valid_log_level\n\nLogLevelEnum = Enum("log_level", list(map(EnumValue, PYTHON_LOGGING_LEVELS_MAPPING.keys())))\n\n\nclass JsonFileHandler(logging.Handler):\n    def __init__(self, json_path: str):\n        super(JsonFileHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            log_dict = copy.copy(record.__dict__)\n\n            # This horrific monstrosity is to maintain backwards compatability\n            # with the old behavior of the JsonFileHandler, which the clarify\n            # project has a dependency on. 
It relied on the dagster-defined\n            # properties smashing all the properties of the LogRecord object\n            # and uploads all of those properties to a redshift table for\n            # in order to do analytics on the log\n\n            if "dagster_meta" in log_dict:\n                dagster_meta_dict = log_dict["dagster_meta"]\n                del log_dict["dagster_meta"]\n            else:\n                dagster_meta_dict = {}\n\n            log_dict.update(dagster_meta_dict)\n\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(log_dict)\n                ff.write(text_line + "\\n")\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerMessage(\n    NamedTuple(\n        "_StructuredLoggerMessage",\n        [\n            ("name", str),\n            ("message", str),\n            ("level", int),\n            ("meta", Mapping[object, object]),\n            ("record", logging.LogRecord),\n        ],\n    )\n):\n    def __new__(\n        cls,\n        name: str,\n        message: str,\n        level: int,\n        meta: Mapping[object, object],\n        record: logging.LogRecord,\n    ):\n        return super(StructuredLoggerMessage, cls).__new__(\n            cls,\n            check.str_param(name, "name"),\n            check.str_param(message, "message"),\n            coerce_valid_log_level(level),\n            check.mapping_param(meta, "meta"),\n            check.inst_param(record, "record", logging.LogRecord),\n        )\n\n\nclass JsonEventLoggerHandler(logging.Handler):\n    def __init__(self, json_path: str, construct_event_record):\n        super(JsonEventLoggerHandler, self).__init__()\n        self.json_path = check.str_param(json_path, "json_path")\n        self.construct_event_record = 
construct_event_record\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            event_record = self.construct_event_record(record)\n            with open(self.json_path, "a", encoding="utf8") as ff:\n                text_line = seven.json.dumps(event_record.to_dict())\n                ff.write(text_line + "\\n")\n\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\nclass StructuredLoggerHandler(logging.Handler):\n    def __init__(self, callback):\n        super(StructuredLoggerHandler, self).__init__()\n        self.callback = check.is_callable(callback, "callback")\n\n    def emit(self, record: logging.LogRecord) -> None:\n        try:\n            self.callback(\n                StructuredLoggerMessage(\n                    name=record.name,\n                    message=record.msg,\n                    level=record.levelno,\n                    meta=record.dagster_meta,  # type: ignore\n                    record=record,\n                )\n            )\n        # Need to catch Exception here, so disabling lint\n        except Exception as e:\n            logging.critical("[%s] Error during logging!", self.__class__.__name__)\n            logging.exception(str(e))\n\n\ndef construct_single_handler_logger(name, level, handler):\n    check.str_param(name, "name")\n    check.inst_param(handler, "handler", logging.Handler)\n\n    level = coerce_valid_log_level(level)\n\n    @logger\n    def single_handler_logger(_init_context):\n        klass = logging.getLoggerClass()\n        logger_ = klass(name, level=level)\n        logger_.addHandler(handler)\n        handler.setLevel(level)\n        return logger_\n\n    return single_handler_logger\n\n\n# Base python logger whose messages will be captured as structured Dagster log messages.\nBASE_DAGSTER_LOGGER = 
logging.getLogger(name="dagster")\n\n\n
[docs]def get_dagster_logger(name: Optional[str] = None) -> logging.Logger:\n """Creates a python logger whose output messages will be captured and converted into Dagster log\n messages. This means they will have structured information such as the step_key, run_id, etc.\n embedded into them, and will show up in the Dagster event log.\n\n This can be used as a more convenient alternative to `context.log` in most cases. If log level\n is not set explicitly, defaults to DEBUG.\n\n Args:\n name (Optional[str]): If supplied, will create a logger with the name "dagster.builtin.{name}",\n with properties inherited from the base Dagster logger. If omitted, the returned logger\n will be named "dagster.builtin".\n\n Returns:\n :class:`logging.Logger`: A logger whose output will be captured by Dagster.\n\n Example:\n .. code-block:: python\n\n from dagster import get_dagster_logger, op\n\n @op\n def hello_op():\n log = get_dagster_logger()\n for i in range(5):\n # do something\n log.info(f"Did {i+1} things!")\n\n """\n # enforce that the parent logger will always have a DEBUG log level\n BASE_DAGSTER_LOGGER.setLevel(logging.DEBUG)\n base_builtin = BASE_DAGSTER_LOGGER.getChild("builtin")\n if name:\n return base_builtin.getChild(name)\n return base_builtin
\n\n\ndef define_structured_logger(name, callback, level):\n check.str_param(name, "name")\n check.callable_param(callback, "callback")\n level = coerce_valid_log_level(level)\n\n return construct_single_handler_logger(name, level, StructuredLoggerHandler(callback))\n\n\ndef define_json_file_logger(name, json_path, level):\n check.str_param(name, "name")\n check.str_param(json_path, "json_path")\n level = coerce_valid_log_level(level)\n\n stream_handler = JsonFileHandler(json_path)\n stream_handler.setFormatter(define_default_formatter())\n return construct_single_handler_logger(name, level, stream_handler)\n\n\ndef get_stack_trace_array(exception):\n check.inst_param(exception, "exception", Exception)\n if hasattr(exception, "__traceback__"):\n tb = exception.__traceback__\n else:\n _exc_type, _exc_value, tb = sys.exc_info()\n return traceback.format_tb(tb)\n\n\ndef default_format_string():\n return "%(asctime)s - %(name)s - %(levelname)s - %(message)s"\n\n\ndef default_date_format_string():\n return "%Y-%m-%d %H:%M:%S %z"\n\n\ndef define_default_formatter():\n return logging.Formatter(default_format_string(), default_date_format_string())\n\n\n@deprecated(\n breaking_version="2.0",\n subject="loggers.dagit",\n emit_runtime_warning=False,\n)\ndef configure_loggers(handler="default", log_level="INFO"):\n LOGGING_CONFIG = {\n "version": 1,\n "disable_existing_loggers": False,\n "formatters": {\n "colored": {\n "()": coloredlogs.ColoredFormatter,\n "fmt": default_format_string(),\n "datefmt": default_date_format_string(),\n "field_styles": {"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n "level_styles": {"debug": {}, "error": {"color": "red"}},\n },\n },\n "handlers": {\n "default": {\n "formatter": "colored",\n "class": "logging.StreamHandler",\n "stream": sys.stdout,\n "level": log_level,\n },\n "null": {\n "class": "logging.NullHandler",\n },\n },\n "loggers": {\n "dagster": {\n "handlers": [handler],\n "level": log_level,\n },\n # Only one of 
dagster or dagster-webserver will be used at a time. We configure them\n # both here to avoid a dependency on the dagster-webserver package.\n "dagit": {\n "handlers": [handler],\n "level": log_level,\n },\n "dagster-webserver": {\n "handlers": [handler],\n "level": log_level,\n },\n },\n }\n\n logging.config.dictConfig(LOGGING_CONFIG)\n\n\ndef create_console_logger(name, level):\n klass = logging.getLoggerClass()\n handler = klass(name, level=level)\n coloredlogs.install(\n logger=handler,\n level=level,\n fmt=default_format_string(),\n datefmt=default_date_format_string(),\n field_styles={"levelname": {"color": "blue"}, "asctime": {"color": "green"}},\n level_styles={"debug": {}, "error": {"color": "red"}},\n )\n return handler\n
", "current_page_name": "_modules/dagster/_utils/log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.log"}, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils", "warnings": {"alabaster_version": "0.7.13", "body": "

Source code for dagster._utils.warnings

\nimport warnings\nfrom contextlib import contextmanager\nfrom typing import Callable, Iterator, Optional, TypeVar\n\nimport dagster._check as check\nfrom dagster._core.decorator_utils import (\n    Decoratable,\n    apply_context_manager_decorator,\n)\n\nT = TypeVar("T")\n\n# ########################\n# ##### DEPRECATED\n# ########################\n\n\ndef normalize_renamed_param(\n    new_val: T,\n    new_arg: str,\n    old_val: T,\n    old_arg: str,\n    coerce_old_to_new: Optional[Callable[[T], T]] = None,\n) -> T:\n    """Utility for managing backwards compatibility of a renamed parameter.\n\n    .. code-block::\n\n       # The name of param `old_flag` is being updated to `new_flag`, but we are temporarily\n       # accepting either param.\n       def is_new(old_flag=None, new_flag=None):\n           return canonicalize_backcompat_args(\n               new_val=new_flag,\n               new_arg='new_flag',\n               old_val=old_flag,\n               old_arg='old_flag',\n               breaking_version='0.9.0',\n               coerce_old_to_new=lambda val: not val,\n           )\n\n    In the above example, if the caller sets both new_flag and old_flag, it will fail by throwing\n    a CheckError. If the caller sets the new_flag, it's returned unaltered. 
If the caller sets\n    old_flag, it will return the old_flag run through the coercion function.\n    """\n    check.str_param(new_arg, "new_arg")\n    check.str_param(old_arg, "old_arg")\n    check.opt_callable_param(coerce_old_to_new, "coerce_old_to_new")\n    if new_val is not None and old_val is not None:\n        check.failed(f'Do not use deprecated "{old_arg}" now that you are using "{new_arg}".')\n    elif old_val is not None:\n        return coerce_old_to_new(old_val) if coerce_old_to_new else old_val\n    else:\n        return new_val\n\n\ndef deprecation_warning(\n    subject: str,\n    breaking_version: str,\n    additional_warn_text: Optional[str] = None,\n    stacklevel: int = 3,\n):\n    warnings.warn(\n        f"{subject} is deprecated and will be removed in {breaking_version}."\n        + ((" " + additional_warn_text) if additional_warn_text else ""),\n        category=DeprecationWarning,\n        stacklevel=stacklevel,\n    )\n\n\n# ########################\n# ##### EXPERIMENTAL\n# ########################\n\nEXPERIMENTAL_WARNING_HELP = (\n    "To mute warnings for experimental functionality, invoke"\n    ' warnings.filterwarnings("ignore", category=dagster.ExperimentalWarning) or use'\n    " one of the other methods described at"\n    " https://docs.python.org/3/library/warnings.html#describing-warning-filters."\n)\n\n\n
[docs]class ExperimentalWarning(Warning):\n pass
\n\n\ndef experimental_warning(\n subject: str, additional_warn_text: Optional[str] = None, stacklevel: int = 3\n) -> None:\n extra_text = f" {additional_warn_text}" if additional_warn_text else ""\n warnings.warn(\n f"{subject} is experimental. It may break in future versions, even between dot"\n f" releases.{extra_text} {EXPERIMENTAL_WARNING_HELP}",\n ExperimentalWarning,\n stacklevel=stacklevel,\n )\n\n\n# ########################\n# ##### DISABLE DAGSTER WARNINGS\n# ########################\n\n\n@contextmanager\ndef disable_dagster_warnings() -> Iterator[None]:\n with warnings.catch_warnings():\n warnings.simplefilter("ignore", category=DeprecationWarning)\n warnings.simplefilter("ignore", category=ExperimentalWarning)\n yield\n\n\nT_Decoratable = TypeVar("T_Decoratable", bound=Decoratable)\n\n\ndef suppress_dagster_warnings(__obj: T_Decoratable) -> T_Decoratable:\n """Mark a method/function as ignoring Dagster-generated warnings. This suppresses any\n `ExperimentalWarnings` or `DeprecationWarnings` when the function is called.\n\n Usage:\n\n .. code-block:: python\n\n @suppress_dagster_warnings\n def invokes_some_experimental_stuff(my_arg):\n my_experimental_function(my_arg)\n """\n return apply_context_manager_decorator(__obj, disable_dagster_warnings)\n
", "current_page_name": "_modules/dagster/_utils/warnings", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}, {"link": "../", "title": "dagster._utils"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster._utils.warnings"}}}, "dagster_airbyte": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.asset_defs

\nimport hashlib\nimport inspect\nimport os\nimport re\nfrom abc import abstractmethod\nfrom functools import partial\nfrom itertools import chain\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport yaml\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    Nothing,\n    Output,\n    ResourceDefinition,\n    SourceAsset,\n    _check as check,\n)\nfrom dagster._core.definitions import AssetsDefinition, multi_asset\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataValue, TableSchemaMetadataValue\nfrom dagster._core.definitions.metadata.table import TableSchema\nfrom dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError\nfrom dagster._core.execution.context.init import build_init_resource_context\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_airbyte.resources import AirbyteCloudResource, AirbyteResource, BaseAirbyteResource\nfrom dagster_airbyte.types import AirbyteTableMetadata\nfrom dagster_airbyte.utils import (\n    generate_materializations,\n    generate_table_schema,\n    is_basic_normalization_operation,\n)\n\n\ndef _table_to_output_name_fn(table: str) -> str:\n    return table.replace("-", "_")\n\n\ndef _build_airbyte_asset_defn_metadata(\n    connection_id: str,\n    destination_tables: Sequence[str],\n    table_to_asset_key_fn: Callable[[str], AssetKey],\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n    upstream_assets: Optional[Iterable[AssetKey]] = None,\n    group_name: Optional[str] = None,\n    
io_manager_key: Optional[str] = None,\n    schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n    freshness_policy: Optional[FreshnessPolicy] = None,\n    auto_materialize_policy: Optional[AutoMaterializePolicy] = None,\n) -> AssetsDefinitionCacheableData:\n    asset_key_prefix = (\n        check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str) or []\n    )\n\n    # Generate a list of outputs, the set of destination tables plus any affiliated\n    # normalization tables\n    tables = list(\n        chain.from_iterable(\n            chain(\n                [destination_tables], normalization_tables.values() if normalization_tables else []\n            )\n        )\n    )\n\n    outputs = {\n        _table_to_output_name_fn(table): AssetKey(\n            [*asset_key_prefix, *table_to_asset_key_fn(table).path]\n        )\n        for table in tables\n    }\n\n    internal_deps: Dict[str, Set[AssetKey]] = {}\n\n    metadata_encodable_normalization_tables = (\n        {k: list(v) for k, v in normalization_tables.items()} if normalization_tables else {}\n    )\n\n    # If normalization tables are specified, we need to add a dependency from the destination table\n    # to the affilitated normalization table\n    if len(metadata_encodable_normalization_tables) > 0:\n        for base_table, derived_tables in metadata_encodable_normalization_tables.items():\n            for derived_table in derived_tables:\n                internal_deps[derived_table] = {\n                    AssetKey([*asset_key_prefix, *table_to_asset_key_fn(base_table).path])\n                }\n\n    # All non-normalization tables depend on any user-provided upstream assets\n    for table in destination_tables:\n        internal_deps[table] = set(upstream_assets or [])\n\n    return AssetsDefinitionCacheableData(\n        keys_by_input_name=(\n            {asset_key.path[-1]: asset_key for asset_key in upstream_assets}\n            if upstream_assets\n            else 
{}\n        ),\n        keys_by_output_name=outputs,\n        internal_asset_deps=internal_deps,\n        group_name=group_name,\n        key_prefix=asset_key_prefix,\n        can_subset=False,\n        metadata_by_output_name=(\n            {\n                table: {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n                for table in tables\n            }\n            if schema_by_table_name\n            else None\n        ),\n        freshness_policies_by_output_name=(\n            {output: freshness_policy for output in outputs} if freshness_policy else None\n        ),\n        auto_materialize_policies_by_output_name=(\n            {output: auto_materialize_policy for output in outputs}\n            if auto_materialize_policy\n            else None\n        ),\n        extra_metadata={\n            "connection_id": connection_id,\n            "group_name": group_name,\n            "destination_tables": destination_tables,\n            "normalization_tables": metadata_encodable_normalization_tables,\n            "io_manager_key": io_manager_key,\n        },\n    )\n\n\ndef _build_airbyte_assets_from_metadata(\n    assets_defn_meta: AssetsDefinitionCacheableData,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]],\n) -> AssetsDefinition:\n    metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n    connection_id = cast(str, metadata["connection_id"])\n    group_name = cast(Optional[str], metadata["group_name"])\n    destination_tables = cast(List[str], metadata["destination_tables"])\n    normalization_tables = cast(Mapping[str, List[str]], metadata["normalization_tables"])\n    io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n    @multi_asset(\n        name=f"airbyte_sync_{connection_id[:5]}",\n        deps=list((assets_defn_meta.keys_by_input_name or {}).values()),\n        outs={\n            k: AssetOut(\n                key=v,\n                metadata=(\n                    
{\n                        k: cast(TableSchemaMetadataValue, v)\n                        for k, v in assets_defn_meta.metadata_by_output_name.get(k, {}).items()\n                    }\n                    if assets_defn_meta.metadata_by_output_name\n                    else None\n                ),\n                io_manager_key=io_manager_key,\n                freshness_policy=(\n                    assets_defn_meta.freshness_policies_by_output_name.get(k)\n                    if assets_defn_meta.freshness_policies_by_output_name\n                    else None\n                ),\n                dagster_type=Nothing,\n            )\n            for k, v in (assets_defn_meta.keys_by_output_name or {}).items()\n        },\n        internal_asset_deps={\n            k: set(v) for k, v in (assets_defn_meta.internal_asset_deps or {}).items()\n        },\n        compute_kind="airbyte",\n        group_name=group_name,\n        resource_defs=resource_defs,\n    )\n    def _assets(context, airbyte: AirbyteResource):\n        ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n        for materialization in generate_materializations(\n            ab_output, assets_defn_meta.key_prefix or []\n        ):\n            table_name = materialization.asset_key.path[-1]\n            if table_name in destination_tables:\n                yield Output(\n                    value=None,\n                    output_name=_table_to_output_name_fn(table_name),\n                    metadata=materialization.metadata,\n                )\n                # Also materialize any normalization tables affiliated with this destination\n                # e.g. 
nested objects, lists etc\n                if normalization_tables:\n                    for dependent_table in normalization_tables.get(table_name, set()):\n                        yield Output(\n                            value=None,\n                            output_name=_table_to_output_name_fn(dependent_table),\n                        )\n            else:\n                yield materialization\n\n    return _assets\n\n\n
[docs]def build_airbyte_assets(\n connection_id: str,\n destination_tables: Sequence[str],\n asset_key_prefix: Optional[Sequence[str]] = None,\n group_name: Optional[str] = None,\n normalization_tables: Optional[Mapping[str, Set[str]]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n upstream_assets: Optional[Set[AssetKey]] = None,\n schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,\n freshness_policy: Optional[FreshnessPolicy] = None,\n stream_to_asset_map: Optional[Mapping[str, str]] = None,\n) -> Sequence[AssetsDefinition]:\n """Builds a set of assets representing the tables created by an Airbyte sync operation.\n\n Args:\n connection_id (str): The Airbyte Connection ID that this op will sync. You can retrieve this\n value from the "Connections" tab of a given connector in the Airbyte UI.\n destination_tables (List[str]): The names of the tables that you want to be represented\n in the Dagster asset graph for this sync. This will generally map to the name of the\n stream in Airbyte, unless a stream prefix has been specified in Airbyte.\n normalization_tables (Optional[Mapping[str, List[str]]]): If you are using Airbyte's\n normalization feature, you may specify a mapping of destination table to a list of\n derived tables that will be created by the normalization process.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([table_name])`.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, str, AssetKey]]]):\n A list of assets to add as sources.\n upstream_assets (Optional[Set[AssetKey]]): Deprecated, use deps instead. 
A list of assets to add as sources.\n freshness_policy (Optional[FreshnessPolicy]): A freshness policy to apply to the assets\n stream_to_asset_map (Optional[Mapping[str, str]]): A mapping of an Airbyte stream name to a Dagster asset.\n This allows the use of the "prefix" setting in Airbyte with special characters that aren't valid asset names.\n """\n if upstream_assets is not None and deps is not None:\n raise DagsterInvalidDefinitionError(\n "Cannot specify both deps and upstream_assets to build_airbyte_assets. Use only deps"\n " instead."\n )\n\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n # Generate a list of outputs, the set of destination tables plus any affiliated\n # normalization tables\n tables = chain.from_iterable(\n chain([destination_tables], normalization_tables.values() if normalization_tables else [])\n )\n outputs = {\n table: AssetOut(\n key=AssetKey([*asset_key_prefix, table]),\n metadata=(\n {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}\n if schema_by_table_name\n else None\n ),\n freshness_policy=freshness_policy,\n )\n for table in tables\n }\n\n internal_deps = {}\n\n # If normalization tables are specified, we need to add a dependency from the destination table\n # to the affilitated normalization table\n if normalization_tables:\n for base_table, derived_tables in normalization_tables.items():\n for derived_table in derived_tables:\n internal_deps[derived_table] = {AssetKey([*asset_key_prefix, base_table])}\n\n upstream_deps = deps\n if upstream_assets is not None:\n upstream_deps = list(upstream_assets)\n\n # All non-normalization tables depend on any user-provided upstream assets\n for table in destination_tables:\n internal_deps[table] = set(upstream_deps) if upstream_deps else set()\n\n @multi_asset(\n name=f"airbyte_sync_{connection_id[:5]}",\n deps=upstream_deps,\n outs=outputs,\n internal_asset_deps=internal_deps,\n compute_kind="airbyte",\n 
group_name=group_name,\n )\n def _assets(context, airbyte: BaseAirbyteResource):\n ab_output = airbyte.sync_and_poll(connection_id=connection_id)\n\n # No connection details (e.g. using Airbyte Cloud) means we just assume\n # that the outputs were produced\n if len(ab_output.connection_details) == 0:\n for table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n )\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n for materialization in generate_materializations(\n ab_output, asset_key_prefix, stream_to_asset_map\n ):\n table_name = materialization.asset_key.path[-1]\n if table_name in destination_tables:\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(table_name),\n metadata=materialization.metadata,\n )\n # Also materialize any normalization tables affiliated with this destination\n # e.g. nested objects, lists etc\n if normalization_tables:\n for dependent_table in normalization_tables.get(table_name, set()):\n yield Output(\n value=None,\n output_name=_table_to_output_name_fn(dependent_table),\n )\n else:\n yield materialization\n\n return [_assets]
\n\n\ndef _get_schema_types(schema: Mapping[str, Any]) -> Sequence[str]:\n """Given a schema definition, return a list of data types that are valid for this schema."""\n types = schema.get("types") or schema.get("type")\n if not types:\n return []\n if isinstance(types, str):\n return [types]\n return types\n\n\ndef _get_sub_schemas(schema: Mapping[str, Any]) -> Sequence[Mapping[str, Any]]:\n """Returns a list of sub-schema definitions for a given schema. This is used to handle union types."""\n return schema.get("anyOf") or schema.get("oneOf") or [schema]\n\n\ndef _get_normalization_tables_for_schema(\n key: str, schema: Mapping[str, Any], prefix: str = ""\n) -> Mapping[str, AirbyteTableMetadata]:\n """Recursively traverses a schema, returning metadata for the tables that will be created by the Airbyte\n normalization process.\n\n For example, a table `cars` with a nested object field `limited_editions` will produce the tables\n `cars` and `cars_limited_editions`.\n\n For more information on Airbyte's normalization process, see:\n https://docs.airbyte.com/understanding-airbyte/basic-normalization/#nesting\n """\n out: Dict[str, AirbyteTableMetadata] = {}\n # Object types are broken into a new table, as long as they have children\n\n sub_schemas = _get_sub_schemas(schema)\n\n for sub_schema in sub_schemas:\n schema_types = _get_schema_types(sub_schema)\n if not schema_types:\n continue\n\n if "object" in schema_types and len(sub_schema.get("properties", {})) > 0:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("properties", {}))\n )\n for k, v in sub_schema["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n # Array types are also broken into a new table\n elif "array" in schema_types:\n out[prefix + key] = AirbyteTableMetadata(\n schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {}))\n )\n if sub_schema.get("items", 
{}).get("properties"):\n for k, v in sub_schema["items"]["properties"].items():\n out = merge_dicts(\n out, _get_normalization_tables_for_schema(k, v, f"{prefix}{key}_")\n )\n\n return out\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\nclass AirbyteConnectionMetadata(\n NamedTuple(\n "_AirbyteConnectionMetadata",\n [\n ("name", str),\n ("stream_prefix", str),\n ("has_basic_normalization", bool),\n ("stream_data", List[Mapping[str, Any]]),\n ],\n )\n):\n """Contains information about an Airbyte connection.\n\n Attributes:\n name (str): The name of the connection.\n stream_prefix (str): A prefix to add to all stream names.\n has_basic_normalization (bool): Whether or not the connection has basic normalization enabled.\n stream_data (List[Mapping[str, Any]]): Unparsed list of dicts with information about each stream.\n """\n\n @classmethod\n def from_api_json(\n cls, contents: Mapping[str, Any], operations: Mapping[str, Any]\n ) -> "AirbyteConnectionMetadata":\n return cls(\n name=contents["name"],\n stream_prefix=contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operatorConfiguration", {}))\n for op in operations.get("operations", [])\n ),\n stream_data=contents.get("syncCatalog", {}).get("streams", []),\n )\n\n @classmethod\n def from_config(cls, contents: Mapping[str, Any]) -> "AirbyteConnectionMetadata":\n config_contents = cast(Mapping[str, Any], contents.get("configuration"))\n check.invariant(\n config_contents is not None, "Airbyte connection config is missing 'configuration' key"\n )\n\n return cls(\n name=contents["resource_name"],\n stream_prefix=config_contents.get("prefix", ""),\n has_basic_normalization=any(\n is_basic_normalization_operation(op.get("operator_configuration", {}))\n for op in config_contents.get("operations", [])\n ),\n stream_data=config_contents.get("sync_catalog", 
{}).get("streams", []),\n )\n\n def parse_stream_tables(\n self, return_normalization_tables: bool = False\n ) -> Mapping[str, AirbyteTableMetadata]:\n """Parses the stream data and returns a mapping, with keys representing destination\n tables associated with each enabled stream and values representing any affiliated\n tables created by Airbyte's normalization process, if enabled.\n """\n tables: Dict[str, AirbyteTableMetadata] = {}\n\n enabled_streams = [\n stream for stream in self.stream_data if stream.get("config", {}).get("selected", False)\n ]\n\n for stream in enabled_streams:\n name = cast(str, stream.get("stream", {}).get("name"))\n prefixed_name = f"{self.stream_prefix}{name}"\n\n schema = (\n stream["stream"]["json_schema"]\n if "json_schema" in stream["stream"]\n else stream["stream"]["jsonSchema"]\n )\n normalization_tables: Dict[str, AirbyteTableMetadata] = {}\n schema_props = schema.get("properties", schema.get("items", {}).get("properties", {}))\n if self.has_basic_normalization and return_normalization_tables:\n for k, v in schema_props.items():\n for normalization_table_name, meta in _get_normalization_tables_for_schema(\n k, v, f"{name}_"\n ).items():\n prefixed_norm_table_name = f"{self.stream_prefix}{normalization_table_name}"\n normalization_tables[prefixed_norm_table_name] = meta\n tables[prefixed_name] = AirbyteTableMetadata(\n schema=generate_table_schema(schema_props),\n normalization_tables=normalization_tables,\n )\n\n return tables\n\n\ndef _get_schema_by_table_name(\n stream_table_metadata: Mapping[str, AirbyteTableMetadata]\n) -> Mapping[str, TableSchema]:\n schema_by_base_table_name = [(k, v.schema) for k, v in stream_table_metadata.items()]\n schema_by_normalization_table_name = list(\n chain.from_iterable(\n [\n [\n (k, v.schema)\n for k, v in cast(\n Dict[str, AirbyteTableMetadata], meta.normalization_tables\n ).items()\n ]\n for meta in stream_table_metadata.values()\n ]\n )\n )\n\n return dict(schema_by_normalization_table_name 
+ schema_by_base_table_name)\n\n\nclass AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n self._key_prefix = key_prefix\n self._create_assets_for_normalization_tables = create_assets_for_normalization_tables\n self._connection_to_group_fn = connection_to_group_fn\n self._connection_to_io_manager_key_fn = connection_to_io_manager_key_fn\n self._connection_filter = connection_filter\n self._connection_to_asset_key_fn: Callable[[AirbyteConnectionMetadata, str], AssetKey] = (\n connection_to_asset_key_fn or (lambda _, table: AssetKey(path=[table]))\n )\n self._connection_to_freshness_policy_fn = connection_to_freshness_policy_fn or (\n lambda _: None\n )\n self._connection_to_auto_materialize_policy_fn = (\n connection_to_auto_materialize_policy_fn or (lambda _: None)\n )\n\n contents = hashlib.sha1() # so that hexdigest is 40, not 64 bytes\n contents.update(",".join(key_prefix).encode("utf-8"))\n contents.update(str(create_assets_for_normalization_tables).encode("utf-8"))\n if connection_filter:\n contents.update(inspect.getsource(connection_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"airbyte-{contents.hexdigest()}")\n\n @abstractmethod\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n pass\n\n def compute_cacheable_data(self) -> 
Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connection_id, connection in self._get_connections():\n stream_table_metadata = connection.parse_stream_tables(\n self._create_assets_for_normalization_tables\n )\n schema_by_table_name = _get_schema_by_table_name(stream_table_metadata)\n\n table_to_asset_key = partial(self._connection_to_asset_key_fn, connection)\n asset_data_for_conn = _build_airbyte_asset_defn_metadata(\n connection_id=connection_id,\n destination_tables=list(stream_table_metadata.keys()),\n normalization_tables={\n table: set(metadata.normalization_tables.keys())\n for table, metadata in stream_table_metadata.items()\n },\n asset_key_prefix=self._key_prefix,\n group_name=(\n self._connection_to_group_fn(connection.name)\n if self._connection_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connection_to_io_manager_key_fn(connection.name)\n if self._connection_to_io_manager_key_fn\n else None\n ),\n schema_by_table_name=schema_by_table_name,\n table_to_asset_key_fn=table_to_asset_key,\n freshness_policy=self._connection_to_freshness_policy_fn(connection),\n auto_materialize_policy=self._connection_to_auto_materialize_policy_fn(connection),\n )\n\n asset_defn_data.append(asset_data_for_conn)\n\n return asset_defn_data\n\n def _build_definitions_with_resources(\n self,\n data: Sequence[AssetsDefinitionCacheableData],\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n ) -> Sequence[AssetsDefinition]:\n return [_build_airbyte_assets_from_metadata(meta, resource_defs) for meta in data]\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return self._build_definitions_with_resources(data)\n\n\nclass AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n airbyte_resource_def: Union[ResourceDefinition, AirbyteResource],\n workspace_id: Optional[str],\n 
key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._airbyte_instance: AirbyteResource = (\n airbyte_resource_def.process_config_and_initialize()\n if isinstance(airbyte_resource_def, AirbyteResource)\n else airbyte_resource_def(build_init_resource_context())\n )\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n workspace_id = self._workspace_id\n if not workspace_id:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(\n self._airbyte_instance.make_request(endpoint="/workspaces/list", data={})\n ).get("workspaces", []),\n )\n\n check.invariant(len(workspaces) <= 1, "Airbyte instance has more than one workspace")\n check.invariant(len(workspaces) > 0, "Airbyte instance has no workspaces")\n\n workspace_id = workspaces[0].get("workspaceId")\n\n connections = cast(\n List[Dict[str, Any]],\n 
check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/connections/list", data={"workspaceId": workspace_id}\n )\n ).get("connections", []),\n )\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n for connection_json in connections:\n connection_id = cast(str, connection_json.get("connectionId"))\n\n operations_json = cast(\n Dict[str, Any],\n check.not_none(\n self._airbyte_instance.make_request(\n endpoint="/operations/list",\n data={"connectionId": connection_id},\n )\n ),\n )\n connection = AirbyteConnectionMetadata.from_api_json(connection_json, operations_json)\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return super()._build_definitions_with_resources(\n data, {"airbyte": self._airbyte_instance.get_resource_definition()}\n )\n\n\nclass AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition):\n def __init__(\n self,\n project_dir: str,\n workspace_id: Optional[str],\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],\n connection_directories: Optional[Sequence[str]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n ):\n super().__init__(\n 
key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )\n self._workspace_id = workspace_id\n self._project_dir = project_dir\n self._connection_directories = connection_directories\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n connections_dir = os.path.join(self._project_dir, "connections")\n\n output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []\n\n connection_directories = self._connection_directories or os.listdir(connections_dir)\n for connection_name in connection_directories:\n connection_dir = os.path.join(connections_dir, connection_name)\n with open(os.path.join(connection_dir, "configuration.yaml"), encoding="utf-8") as f:\n connection = AirbyteConnectionMetadata.from_config(yaml.safe_load(f.read()))\n\n # Filter out connections that don't match the filter function\n if self._connection_filter and not self._connection_filter(connection):\n continue\n\n if self._workspace_id:\n state_file = f"state_{self._workspace_id}.yaml"\n check.invariant(\n state_file in os.listdir(connection_dir),\n f"Workspace state file {state_file} not found",\n )\n else:\n state_files = [\n filename\n for filename in os.listdir(connection_dir)\n if filename.startswith("state_")\n ]\n check.invariant(\n len(state_files) > 0,\n f"No state files found for connection {connection_name} in {connection_dir}",\n )\n check.invariant(\n len(state_files) <= 1,\n "More than one state file found for connection {} in {}, specify a workspace_id"\n " to disambiguate".format(connection_name, connection_dir),\n )\n state_file = 
state_files[0]\n\n with open(os.path.join(connection_dir, cast(str, state_file)), encoding="utf-8") as f:\n state = yaml.safe_load(f.read())\n connection_id = state.get("resource_id")\n\n output_connections.append((connection_id, connection))\n return output_connections\n\n\n
[docs]def load_assets_from_airbyte_instance(\n airbyte: Union[AirbyteResource, ResourceDefinition],\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads Airbyte connection assets from a configured AirbyteResource instance. This fetches information\n about defined connections at initialization time, and will error on workspace load if the Airbyte\n instance is not reachable.\n\n Args:\n airbyte (ResourceDefinition): An AirbyteResource configured with the appropriate connection\n details.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspaces exist in your instance.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. If None, no groups will be created. 
Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which takes\n in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function\n which takes in connection metadata and returns a freshness policy for the connection's assets. If None, no freshness policies\n will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]): Optional\n function which takes in connection metadata and returns an auto materialization policy for the connection's assets. If None, no\n auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. 
code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(airbyte_instance)\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\n airbyte_instance = airbyte_resource.configured(\n {\n "host": "localhost",\n "port": "8000",\n }\n )\n airbyte_assets = load_assets_from_airbyte_instance(\n airbyte_instance,\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(airbyte, AirbyteCloudResource):\n raise DagsterInvalidInvocationError(\n "load_assets_from_airbyte_instance is not yet supported for AirbyteCloudResource"\n )\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteInstanceCacheableAssetsDefinition(\n airbyte_resource_def=airbyte,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=connection_filter,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n\n\n
[docs]def load_assets_from_airbyte_project(\n project_dir: str,\n workspace_id: Optional[str] = None,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,\n connection_directories: Optional[Sequence[str]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n connection_to_auto_materialize_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads an Airbyte project into a set of Dagster assets.\n\n Point to the root folder of an Airbyte project synced using the Octavia CLI. For\n more information, see https://github.com/airbytehq/airbyte/tree/master/octavia-cli#octavia-import-all.\n\n Args:\n project_dir (str): The path to the root of your Airbyte project, containing sources, destinations,\n and connections folders.\n workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only\n required if multiple workspace state YAMLfiles exist in the project.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. 
If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which\n takes in connection metadata and returns False if the connection should be excluded from the output assets.\n connection_directories (Optional[List[str]]): Optional list of connection directories to load assets from.\n If omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter\n if the project has many connections or if the connection yaml files are large.\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. 
Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]):\n Optional function which takes in connection metadata and returns a freshness policy for the connection's assets.\n If None, no freshness policies will be applied to the assets.\n connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]):\n Optional function which takes in connection metadata and returns an auto materialization policy for the connection's assets.\n If None, no auto materialization policies will be applied to the assets.\n\n **Examples:**\n\n Loading all Airbyte connections as assets:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n )\n\n Filtering the set of loaded connections:\n\n .. code-block:: python\n\n from dagster_airbyte import load_assets_from_airbyte_project\n\n airbyte_assets = load_assets_from_airbyte_project(\n project_dir="path/to/airbyte/project",\n connection_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteYAMLCacheableAssetsDefinition(\n project_dir=project_dir,\n workspace_id=workspace_id,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n 
connection_filter=connection_filter,\n connection_directories=connection_directories,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.asset_defs"}, "managed": {"generated": {"destinations": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.destinations

\n# ruff: noqa: A001, A002\nfrom typing import Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteDestination\n\n\n
[docs]class DynamodbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n dynamodb_table_name_prefix: str,\n dynamodb_region: str,\n access_key_id: str,\n secret_access_key: str,\n dynamodb_endpoint: Optional[str] = None,\n ):\n """Airbyte Destination for Dynamodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/dynamodb\n\n Args:\n name (str): The name of the destination.\n dynamodb_endpoint (Optional[str]): This is your DynamoDB endpoint url.(if you are working with AWS DynamoDB, just leave empty).\n dynamodb_table_name_prefix (str): The prefix to use when naming DynamoDB tables.\n dynamodb_region (str): The region of the DynamoDB.\n access_key_id (str): The access key id to access the DynamoDB. Airbyte requires Read and Write permissions to the DynamoDB.\n secret_access_key (str): The corresponding secret to the access key id.\n """\n self.dynamodb_endpoint = check.opt_str_param(dynamodb_endpoint, "dynamodb_endpoint")\n self.dynamodb_table_name_prefix = check.str_param(\n dynamodb_table_name_prefix, "dynamodb_table_name_prefix"\n )\n self.dynamodb_region = check.str_param(dynamodb_region, "dynamodb_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n super().__init__("Dynamodb", name)
\n\n\n
[docs]class BigqueryDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_location: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDestination.StandardInserts", "BigqueryDestination.GCSStaging"\n ],\n credentials_json: Optional[str] = None,\n transformation_priority: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_location (str): The location of the dataset. Warning: Changes made after creation will not be applied. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n transformation_priority (Optional[str]): Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type here. 
Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don`t count towards your concurrent rate limit. Read more about batch queries here. The default "interactive" value is used if not set explicitly.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_location = check.str_param(dataset_location, "dataset_location")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.transformation_priority = check.opt_str_param(\n transformation_priority, "transformation_priority"\n )\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery", name)
\n\n\n
[docs]class RabbitmqDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n routing_key: str,\n ssl: Optional[bool] = None,\n port: Optional[int] = None,\n virtual_host: Optional[str] = None,\n username: Optional[str] = None,\n password: Optional[str] = None,\n exchange: Optional[str] = None,\n ):\n """Airbyte Destination for Rabbitmq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rabbitmq\n\n Args:\n name (str): The name of the destination.\n ssl (Optional[bool]): SSL enabled.\n host (str): The RabbitMQ host name.\n port (Optional[int]): The RabbitMQ port.\n virtual_host (Optional[str]): The RabbitMQ virtual host name.\n username (Optional[str]): The username to connect.\n password (Optional[str]): The password to connect.\n exchange (Optional[str]): The exchange name.\n routing_key (str): The routing key.\n """\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.virtual_host = check.opt_str_param(virtual_host, "virtual_host")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.exchange = check.opt_str_param(exchange, "exchange")\n self.routing_key = check.str_param(routing_key, "routing_key")\n super().__init__("Rabbitmq", name)
\n\n\n
[docs]class KvdbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, bucket_id: str, secret_key: str):\n """Airbyte Destination for Kvdb.\n\n Documentation can be found at https://kvdb.io/docs/api/\n\n Args:\n name (str): The name of the destination.\n bucket_id (str): The ID of your KVdb bucket.\n secret_key (str): Your bucket Secret Key.\n """\n self.bucket_id = check.str_param(bucket_id, "bucket_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n super().__init__("Kvdb", name)
\n\n\n
[docs]class ClickhouseDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Destination for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): HTTP port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class AmazonSqsDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n message_delay: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n message_body_key: Optional[str] = None,\n message_group_id: Optional[str] = None,\n ):\n """Airbyte Destination for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n message_delay (Optional[int]): Modify the Message Delay of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for sending messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for sending messages\n message_body_key (Optional[str]): Use this property to extract the contents of the named key in the input record to use as the SQS message body. If not set, the entire content of the input record data is used as the message body.\n message_group_id (Optional[str]): The tag that specifies that a message belongs to a specific message group. This parameter applies only to, and is REQUIRED by, FIFO queues.\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.message_delay = check.opt_int_param(message_delay, "message_delay")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n self.message_body_key = check.opt_str_param(message_body_key, "message_body_key")\n self.message_group_id = check.opt_str_param(message_group_id, "message_group_id")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class MariadbColumnstoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mariadb Columnstore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mariadb-columnstore\n\n Args:\n name (str): The name of the destination.\n host (str): The Hostname of the database.\n port (int): The Port of the database.\n database (str): Name of the database.\n username (str): The Username which is used to access the database.\n password (Optional[str]): The Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mariadb Columnstore", name)
\n\n\n
[docs]class KinesisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n region: str,\n shardCount: int,\n accessKey: str,\n privateKey: str,\n bufferSize: int,\n ):\n """Airbyte Destination for Kinesis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kinesis\n\n Args:\n name (str): The name of the destination.\n endpoint (str): AWS Kinesis endpoint.\n region (str): AWS region. Your account determines the Regions that are available to you.\n shardCount (int): Number of shards to which the data should be streamed.\n accessKey (str): Generate the AWS Access Key for current user.\n privateKey (str): The AWS Private Key - a string of numbers and letters that are unique for each account, also known as a "recovery phrase".\n bufferSize (int): Buffer size for storing kinesis records before being batch streamed.\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.region = check.str_param(region, "region")\n self.shardCount = check.int_param(shardCount, "shardCount")\n self.accessKey = check.str_param(accessKey, "accessKey")\n self.privateKey = check.str_param(privateKey, "privateKey")\n self.bufferSize = check.int_param(bufferSize, "bufferSize")\n super().__init__("Kinesis", name)
\n\n\n
[docs]class AzureBlobStorageDestination(GeneratedAirbyteDestination):\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(self, flattening: str):\n self.format_type = "CSV"\n self.flattening = check.str_param(flattening, "flattening")
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n ):\n self.format_type = "JSONL"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_account_key: str,\n format: Union[\n "AzureBlobStorageDestination.CSVCommaSeparatedValues",\n "AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON",\n ],\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n azure_blob_storage_container_name: Optional[str] = None,\n azure_blob_storage_output_buffer_size: Optional[int] = None,\n ):\n """Airbyte Destination for Azure Blob Storage.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/azureblobstorage\n\n Args:\n name (str): The name of the destination.\n azure_blob_storage_endpoint_domain_name (Optional[str]): This is Azure Blob Storage endpoint domain name. Leave default value (or leave it empty if run container from command line) to use Microsoft native from example.\n azure_blob_storage_container_name (Optional[str]): The name of the Azure blob storage container. If not exists - will be created automatically. May be empty, then will be created automatically airbytecontainer+timestamp\n azure_blob_storage_account_name (str): The account's name of the Azure Blob Storage.\n azure_blob_storage_account_key (str): The Azure blob storage account key.\n azure_blob_storage_output_buffer_size (Optional[int]): The amount of megabytes to buffer for the output stream to Azure. 
This will impact memory footprint on workers, but may need adjustment for performance and appropriate block size in Azure.\n format (Union[AzureBlobStorageDestination.CSVCommaSeparatedValues, AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON]): Output data format\n """\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_container_name = check.opt_str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_account_key = check.str_param(\n azure_blob_storage_account_key, "azure_blob_storage_account_key"\n )\n self.azure_blob_storage_output_buffer_size = check.opt_int_param(\n azure_blob_storage_output_buffer_size, "azure_blob_storage_output_buffer_size"\n )\n self.format = check.inst_param(\n format,\n "format",\n (\n AzureBlobStorageDestination.CSVCommaSeparatedValues,\n AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n super().__init__("Azure Blob Storage", name)
\n\n\n
[docs]class KafkaDestination(GeneratedAirbyteDestination):\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n bootstrap_servers: str,\n topic_pattern: str,\n protocol: Union[\n "KafkaDestination.PLAINTEXT",\n "KafkaDestination.SASLPLAINTEXT",\n "KafkaDestination.SASLSSL",\n ],\n acks: str,\n enable_idempotence: bool,\n compression_type: str,\n batch_size: int,\n linger_ms: str,\n max_in_flight_requests_per_connection: int,\n client_dns_lookup: str,\n buffer_memory: str,\n max_request_size: int,\n retries: int,\n socket_connection_setup_timeout_ms: str,\n socket_connection_setup_timeout_max_ms: str,\n max_block_ms: str,\n request_timeout_ms: int,\n delivery_timeout_ms: int,\n send_buffer_bytes: int,\n receive_buffer_bytes: int,\n test_topic: Optional[str] = None,\n sync_producer: Optional[bool] = None,\n client_id: Optional[str] = None,\n ):\n """Airbyte Destination for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/kafka\n\n Args:\n name (str): The name of the destination.\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n test_topic (Optional[str]): Topic to test if Airbyte can produce messages.\n sync_producer (Optional[bool]): Wait synchronously until the record has been sent to Kafka.\n protocol (Union[KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL]): Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n acks (str): The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the durability of records that are sent.\n enable_idempotence (bool): When set to 'true', the producer will ensure that exactly one copy of each message is written in the stream. If 'false', producer retries due to broker failures, etc., may write duplicates of the retried message in the stream.\n compression_type (str): The compression type for all data generated by the producer.\n batch_size (int): The producer will attempt to batch records together into fewer requests whenever multiple records are being sent to the same partition.\n linger_ms (str): The producer groups together any records that arrive in between request transmissions into a single batched request.\n max_in_flight_requests_per_connection (int): The maximum number of unacknowledged requests the client will send on a single connection before blocking. Can be greater than 1, and the maximum value supported with idempotency is 5.\n client_dns_lookup (str): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. 
Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n buffer_memory (str): The total bytes of memory the producer can use to buffer records waiting to be sent to the server.\n max_request_size (int): The maximum size of a request in bytes.\n retries (int): Setting a value greater than zero will cause the client to resend any record whose send fails with a potentially transient error.\n socket_connection_setup_timeout_ms (str): The amount of time the client will wait for the socket connection to be established.\n socket_connection_setup_timeout_max_ms (str): The maximum amount of time the client will wait for the socket connection to be established. The connection setup timeout will increase exponentially for each consecutive connection failure up to this maximum.\n max_block_ms (str): The configuration controls how long the KafkaProducer's send(), partitionsFor(), initTransactions(), sendOffsetsToTransaction(), commitTransaction() and abortTransaction() methods will block.\n request_timeout_ms (int): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n delivery_timeout_ms (int): An upper bound on the time to report success or failure after a call to 'send()' returns.\n send_buffer_bytes (int): The size of the TCP send buffer (SO_SNDBUF) to use when sending data. 
If the value is -1, the OS default will be used.\n receive_buffer_bytes (int): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.\n """\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.sync_producer = check.opt_bool_param(sync_producer, "sync_producer")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.acks = check.str_param(acks, "acks")\n self.enable_idempotence = check.bool_param(enable_idempotence, "enable_idempotence")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.batch_size = check.int_param(batch_size, "batch_size")\n self.linger_ms = check.str_param(linger_ms, "linger_ms")\n self.max_in_flight_requests_per_connection = check.int_param(\n max_in_flight_requests_per_connection, "max_in_flight_requests_per_connection"\n )\n self.client_dns_lookup = check.str_param(client_dns_lookup, "client_dns_lookup")\n self.buffer_memory = check.str_param(buffer_memory, "buffer_memory")\n self.max_request_size = check.int_param(max_request_size, "max_request_size")\n self.retries = check.int_param(retries, "retries")\n self.socket_connection_setup_timeout_ms = check.str_param(\n socket_connection_setup_timeout_ms, "socket_connection_setup_timeout_ms"\n )\n self.socket_connection_setup_timeout_max_ms = check.str_param(\n socket_connection_setup_timeout_max_ms, "socket_connection_setup_timeout_max_ms"\n )\n self.max_block_ms = check.str_param(max_block_ms, "max_block_ms")\n self.request_timeout_ms = check.int_param(request_timeout_ms, "request_timeout_ms")\n self.delivery_timeout_ms = 
check.int_param(delivery_timeout_ms, "delivery_timeout_ms")\n self.send_buffer_bytes = check.int_param(send_buffer_bytes, "send_buffer_bytes")\n self.receive_buffer_bytes = check.int_param(receive_buffer_bytes, "receive_buffer_bytes")\n super().__init__("Kafka", name)
\n\n\n
[docs]class ElasticsearchDestination(GeneratedAirbyteDestination):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchDestination.None_",\n "ElasticsearchDestination.ApiKeySecret",\n "ElasticsearchDestination.UsernamePassword",\n ],\n upsert: Optional[bool] = None,\n ):\n r"""Airbyte Destination for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n upsert (Optional[bool]): If a primary key identifier is defined in the source, an upsert will be performed using the primary key value as the elasticsearch doc id. Does not support composite primary keys.\n authenticationMethod (Union[ElasticsearchDestination.None\\\\_, ElasticsearchDestination.ApiKeySecret, ElasticsearchDestination.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.upsert = check.opt_bool_param(upsert, "upsert")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchDestination.None_,\n ElasticsearchDestination.ApiKeySecret,\n ElasticsearchDestination.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class MysqlDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Mysql", name)
\n\n\n
[docs]class SftpJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n username: str,\n password: str,\n destination_path: str,\n port: Optional[int] = None,\n ):\n """Airbyte Destination for Sftp Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sftp-json\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the SFTP server.\n port (Optional[int]): Port of the SFTP server.\n username (str): Username to use to access the SFTP server.\n password (str): Password associated with the username.\n destination_path (str): Path to the directory where json files will be written.\n """\n self.host = check.str_param(host, "host")\n self.port = check.opt_int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sftp Json", name)
\n\n\n
[docs]class GcsDestination(GeneratedAirbyteDestination):\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, credential_type: str, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = check.str_param(credential_type, "credential_type")\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: Optional[int] = None):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self,\n codec: str,\n compression_level: Optional[int] = None,\n include_checksum: Optional[bool] = None,\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.opt_int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "GcsDestination.NoCompression",\n "GcsDestination.Deflate",\n "GcsDestination.Bzip2",\n "GcsDestination.Xz",\n "GcsDestination.Zstandard",\n "GcsDestination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n GcsDestination.NoCompression,\n GcsDestination.Deflate,\n GcsDestination.Bzip2,\n GcsDestination.Xz,\n GcsDestination.Zstandard,\n GcsDestination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n flattening: Optional[str] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.opt_str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["GcsDestination.NoCompression", "GcsDestination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (GcsDestination.NoCompression, GcsDestination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n credential: "GcsDestination.HMACKey",\n format: Union[\n "GcsDestination.AvroApacheAvro",\n "GcsDestination.CSVCommaSeparatedValues",\n "GcsDestination.JSONLinesNewlineDelimitedJSON",\n "GcsDestination.ParquetColumnarStorage",\n ],\n gcs_bucket_region: Optional[str] = None,\n ):\n """Airbyte Destination for Gcs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/gcs\n\n Args:\n name (str): The name of the destination.\n gcs_bucket_name (str): You can find the bucket name in the App Engine Admin console Application Settings page, under the label Google Cloud Storage Bucket. Read more here.\n gcs_bucket_path (str): GCS Bucket Path string Subdirectory under the above bucket to sync the data into.\n gcs_bucket_region (Optional[str]): Select a Region of the GCS Bucket. Read more here.\n credential (GcsDestination.HMACKey): An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more here.\n format (Union[GcsDestination.AvroApacheAvro, GcsDestination.CSVCommaSeparatedValues, GcsDestination.JSONLinesNewlineDelimitedJSON, GcsDestination.ParquetColumnarStorage]): Output data format. One of the following formats must be selected - AVRO format, PARQUET format, CSV format, or JSONL format.\n """\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.gcs_bucket_region = check.opt_str_param(gcs_bucket_region, "gcs_bucket_region")\n self.credential = check.inst_param(credential, "credential", GcsDestination.HMACKey)\n self.format = check.inst_param(\n format,\n "format",\n (\n GcsDestination.AvroApacheAvro,\n GcsDestination.CSVCommaSeparatedValues,\n GcsDestination.JSONLinesNewlineDelimitedJSON,\n GcsDestination.ParquetColumnarStorage,\n ),\n )\n super().__init__("Gcs", name)
\n\n\n
[docs]class CassandraDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n datacenter: Optional[str] = None,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Cassandra.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/cassandra\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Cassandra keyspace to create data in.\n username (str): Username to use to access Cassandra.\n password (str): Password associated with Cassandra.\n address (str): Address to connect to.\n port (int): Port of Cassandra.\n datacenter (Optional[str]): Datacenter of the cassandra cluster.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.datacenter = check.opt_str_param(datacenter, "datacenter")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Cassandra", name)
\n\n\n
[docs]class FireboltDestination(GeneratedAirbyteDestination):\n
[docs] class SQLInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "SQL"
\n\n
[docs] class ExternalTableViaS3:\n
[docs] @public\n def __init__(self, s3_bucket: str, s3_region: str, aws_key_id: str, aws_key_secret: str):\n self.method = "S3"\n self.s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self.s3_region = check.str_param(s3_region, "s3_region")\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_key_secret = check.str_param(aws_key_secret, "aws_key_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n loading_method: Union[\n "FireboltDestination.SQLInserts", "FireboltDestination.ExternalTableViaS3"\n ],\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Destination for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n loading_method (Union[FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3]): Loading method used to select the way data will be uploaded to Firebolt\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3),\n )\n super().__init__("Firebolt", name)
\n\n\n
[docs]class GoogleSheetsDestination(GeneratedAirbyteDestination):\n
[docs] class AuthenticationViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: "GoogleSheetsDestination.AuthenticationViaGoogleOAuth",\n ):\n """Airbyte Destination for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): The link to your spreadsheet. See this guide for more details.\n credentials (GoogleSheetsDestination.AuthenticationViaGoogleOAuth): Google API Credentials for connecting to Google Sheets and Google Drive APIs\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleSheetsDestination.AuthenticationViaGoogleOAuth\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DatabricksDestination(GeneratedAirbyteDestination):\n
[docs] class AmazonS3:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n s3_access_key_id: str,\n s3_secret_access_key: str,\n file_name_pattern: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.s3_access_key_id = check.str_param(s3_access_key_id, "s3_access_key_id")\n self.s3_secret_access_key = check.str_param(\n s3_secret_access_key, "s3_secret_access_key"\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class AzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n data_source_type: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.data_source_type = check.str_param(data_source_type, "data_source_type")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n accept_terms: bool,\n databricks_server_hostname: str,\n databricks_http_path: str,\n databricks_personal_access_token: str,\n data_source: Union[\n "DatabricksDestination.AmazonS3", "DatabricksDestination.AzureBlobStorage"\n ],\n databricks_port: Optional[str] = None,\n database_schema: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n """Airbyte Destination for Databricks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/databricks\n\n Args:\n name (str): The name of the destination.\n accept_terms (bool): You must agree to the Databricks JDBC Driver Terms & Conditions to use this connector.\n databricks_server_hostname (str): Databricks Cluster Server Hostname.\n databricks_http_path (str): Databricks Cluster HTTP Path.\n databricks_port (Optional[str]): Databricks Cluster Port.\n databricks_personal_access_token (str): Databricks Personal Access Token for making authenticated requests.\n database_schema (Optional[str]): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n data_source (Union[DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage]): Storage on which the delta lake is built.\n purge_staging_data (Optional[bool]): Default to 'true'. 
Switch it to 'false' for debugging purpose.\n """\n self.accept_terms = check.bool_param(accept_terms, "accept_terms")\n self.databricks_server_hostname = check.str_param(\n databricks_server_hostname, "databricks_server_hostname"\n )\n self.databricks_http_path = check.str_param(databricks_http_path, "databricks_http_path")\n self.databricks_port = check.opt_str_param(databricks_port, "databricks_port")\n self.databricks_personal_access_token = check.str_param(\n databricks_personal_access_token, "databricks_personal_access_token"\n )\n self.database_schema = check.opt_str_param(database_schema, "database_schema")\n self.data_source = check.inst_param(\n data_source,\n "data_source",\n (DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage),\n )\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n super().__init__("Databricks", name)
\n\n\n
[docs]class BigqueryDenormalizedDestination(GeneratedAirbyteDestination):\n
[docs] class StandardInserts:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class HMACKey:\n
[docs] @public\n def __init__(self, hmac_key_access_id: str, hmac_key_secret: str):\n self.credential_type = "HMAC_KEY"\n self.hmac_key_access_id = check.str_param(hmac_key_access_id, "hmac_key_access_id")\n self.hmac_key_secret = check.str_param(hmac_key_secret, "hmac_key_secret")
\n\n
[docs] class GCSStaging:\n
[docs] @public\n def __init__(\n self,\n credential: "BigqueryDenormalizedDestination.HMACKey",\n gcs_bucket_name: str,\n gcs_bucket_path: str,\n keep_files_in_gcs_bucket: Optional[str] = None,\n ):\n self.method = "GCS Staging"\n self.credential = check.inst_param(\n credential, "credential", BigqueryDenormalizedDestination.HMACKey\n )\n self.gcs_bucket_name = check.str_param(gcs_bucket_name, "gcs_bucket_name")\n self.gcs_bucket_path = check.str_param(gcs_bucket_path, "gcs_bucket_path")\n self.keep_files_in_gcs_bucket = check.opt_str_param(\n keep_files_in_gcs_bucket, "keep_files_in_gcs_bucket"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n project_id: str,\n dataset_id: str,\n loading_method: Union[\n "BigqueryDenormalizedDestination.StandardInserts",\n "BigqueryDenormalizedDestination.GCSStaging",\n ],\n credentials_json: Optional[str] = None,\n dataset_location: Optional[str] = None,\n big_query_client_buffer_size_mb: Optional[int] = None,\n ):\n """Airbyte Destination for Bigquery Denormalized.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset. Read more here.\n dataset_id (str): The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.\n loading_method (Union[BigqueryDenormalizedDestination.StandardInserts, BigqueryDenormalizedDestination.GCSStaging]): Loading method used to send select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n dataset_location (Optional[str]): The location of the dataset. Warning: Changes made after creation will not be applied. The default "US" value is used if not set explicitly. Read more here.\n big_query_client_buffer_size_mb (Optional[int]): Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. 
Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.str_param(dataset_id, "dataset_id")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n BigqueryDenormalizedDestination.StandardInserts,\n BigqueryDenormalizedDestination.GCSStaging,\n ),\n )\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n self.dataset_location = check.opt_str_param(dataset_location, "dataset_location")\n self.big_query_client_buffer_size_mb = check.opt_int_param(\n big_query_client_buffer_size_mb, "big_query_client_buffer_size_mb"\n )\n super().__init__("Bigquery Denormalized", name)
\n\n\n
[docs]class SqliteDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Sqlite.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/sqlite\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the sqlite.db file. The file will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Sqlite", name)
\n\n\n
[docs]class MongodbDestination(GeneratedAirbyteDestination):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.authorization = "none"
\n\n
[docs] class LoginPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.authorization = "login/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbDestination.StandaloneMongoDbInstance",\n "MongodbDestination.ReplicaSet",\n "MongodbDestination.MongoDBAtlas",\n ],\n database: str,\n auth_type: Union["MongodbDestination.None_", "MongodbDestination.LoginPassword"],\n ):\n r"""Airbyte Destination for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mongodb\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbDestination.StandaloneMongoDbInstance, MongodbDestination.ReplicaSet, MongodbDestination.MongoDBAtlas]): MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): Name of the database.\n auth_type (Union[MongodbDestination.None\\\\_, MongodbDestination.LoginPassword]): Authorization type.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbDestination.StandaloneMongoDbInstance,\n MongodbDestination.ReplicaSet,\n MongodbDestination.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.auth_type = check.inst_param(\n auth_type, "auth_type", (MongodbDestination.None_, MongodbDestination.LoginPassword)\n )\n super().__init__("Mongodb", name)
\n\n\n
[docs]class RocksetDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, api_key: str, workspace: str, api_server: Optional[str] = None):\n """Airbyte Destination for Rockset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/rockset\n\n Args:\n name (str): The name of the destination.\n api_key (str): Rockset api key\n workspace (str): The Rockset workspace in which collections will be created + written to.\n api_server (Optional[str]): Rockset api URL\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.workspace = check.str_param(workspace, "workspace")\n self.api_server = check.opt_str_param(api_server, "api_server")\n super().__init__("Rockset", name)
\n\n\n
[docs]class OracleDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n sid: str,\n username: str,\n encryption: Union[\n "OracleDestination.Unencrypted",\n "OracleDestination.NativeNetworkEncryptionNNE",\n "OracleDestination.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n sid (str): The System Identifier uniquely distinguishes the instance from any other instance on the same computer.\n username (str): The username to access the database. This user must have CREATE USER privileges in the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n schema (Optional[str]): The default schema is used as the target schema for all statements issued from the connection that do not explicitly specify a schema name. The usual value for this field is "airbyte". 
In Oracle, schemas and users are the same thing, so the "user" parameter is used as the login credentials and this is used for the default Airbyte message schema.\n encryption (Union[OracleDestination.Unencrypted, OracleDestination.NativeNetworkEncryptionNNE, OracleDestination.TLSEncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.sid = check.str_param(sid, "sid")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.schema = check.opt_str_param(schema, "schema")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleDestination.Unencrypted,\n OracleDestination.NativeNetworkEncryptionNNE,\n OracleDestination.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class CsvDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Csv.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-csv\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where csv files will be written. The destination uses the local mount "/local" and any data files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Csv", name)
\n\n\n
[docs]class S3Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "S3Destination.NoCompression",\n "S3Destination.Deflate",\n "S3Destination.Bzip2",\n "S3Destination.Xz",\n "S3Destination.Zstandard",\n "S3Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n S3Destination.NoCompression,\n S3Destination.Deflate,\n S3Destination.Bzip2,\n S3Destination.Xz,\n S3Destination.Zstandard,\n S3Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["S3Destination.NoCompression", "S3Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (S3Destination.NoCompression, S3Destination.GZIP)\n )
\n\n
[docs] class ParquetColumnarStorage:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Optional[str] = None,\n block_size_mb: Optional[int] = None,\n max_padding_size_mb: Optional[int] = None,\n page_size_kb: Optional[int] = None,\n dictionary_page_size_kb: Optional[int] = None,\n dictionary_encoding: Optional[bool] = None,\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.opt_str_param(compression_codec, "compression_codec")\n self.block_size_mb = check.opt_int_param(block_size_mb, "block_size_mb")\n self.max_padding_size_mb = check.opt_int_param(\n max_padding_size_mb, "max_padding_size_mb"\n )\n self.page_size_kb = check.opt_int_param(page_size_kb, "page_size_kb")\n self.dictionary_page_size_kb = check.opt_int_param(\n dictionary_page_size_kb, "dictionary_page_size_kb"\n )\n self.dictionary_encoding = check.opt_bool_param(\n dictionary_encoding, "dictionary_encoding"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n s3_bucket_region: str,\n format: Union[\n "S3Destination.AvroApacheAvro",\n "S3Destination.CSVCommaSeparatedValues",\n "S3Destination.JSONLinesNewlineDelimitedJSON",\n "S3Destination.ParquetColumnarStorage",\n ],\n access_key_id: Optional[str] = None,\n secret_access_key: Optional[str] = None,\n s3_endpoint: Optional[str] = None,\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/s3\n\n Args:\n name (str): The name of the destination.\n access_key_id (Optional[str]): The access key ID to access the S3 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (Optional[str]): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the S3 bucket. Read more here.\n s3_bucket_path (str): Directory under the S3 bucket where data will be written. Read more here\n s3_bucket_region (str): The region of the S3 bucket. See here for all region codes.\n format (Union[S3Destination.AvroApacheAvro, S3Destination.CSVCommaSeparatedValues, S3Destination.JSONLinesNewlineDelimitedJSON, S3Destination.ParquetColumnarStorage]): Format of the data output. See here for more details\n s3_endpoint (Optional[str]): Your S3 endpoint url. Read more here\n s3_path_format (Optional[str]): Format string on how data will be organized inside the S3 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the S3 staging file(s)\n """\n self.access_key_id = check.opt_str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.opt_str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.format = check.inst_param(\n format,\n "format",\n (\n S3Destination.AvroApacheAvro,\n S3Destination.CSVCommaSeparatedValues,\n S3Destination.JSONLinesNewlineDelimitedJSON,\n S3Destination.ParquetColumnarStorage,\n ),\n )\n self.s3_endpoint = check.opt_str_param(s3_endpoint, "s3_endpoint")\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("S3", name)
\n\n\n
[docs]class AwsDatalakeDestination(GeneratedAirbyteDestination):\n
[docs] class IAMRole:\n
[docs] @public\n def __init__(self, role_arn: str):\n self.credentials_title = "IAM Role"\n self.role_arn = check.str_param(role_arn, "role_arn")
\n\n
[docs] class IAMUser:\n
[docs] @public\n def __init__(self, aws_access_key_id: str, aws_secret_access_key: str):\n self.credentials_title = "IAM User"\n self.aws_access_key_id = check.str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n credentials: Union["AwsDatalakeDestination.IAMRole", "AwsDatalakeDestination.IAMUser"],\n bucket_name: str,\n bucket_prefix: str,\n aws_account_id: Optional[str] = None,\n lakeformation_database_name: Optional[str] = None,\n ):\n """Airbyte Destination for Aws Datalake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/aws-datalake\n\n Args:\n name (str): The name of the destination.\n aws_account_id (Optional[str]): target aws account id\n region (str): Region name\n credentials (Union[AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser]): Choose How to Authenticate to AWS.\n bucket_name (str): Name of the bucket\n bucket_prefix (str): S3 prefix\n lakeformation_database_name (Optional[str]): Which database to use\n """\n self.aws_account_id = check.opt_str_param(aws_account_id, "aws_account_id")\n self.region = check.str_param(region, "region")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser),\n )\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.bucket_prefix = check.str_param(bucket_prefix, "bucket_prefix")\n self.lakeformation_database_name = check.opt_str_param(\n lakeformation_database_name, "lakeformation_database_name"\n )\n super().__init__("Aws Datalake", name)
\n\n\n
[docs]class MssqlDestination(GeneratedAirbyteDestination):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_method: Union[\n "MssqlDestination.Unencrypted",\n "MssqlDestination.EncryptedTrustServerCertificate",\n "MssqlDestination.EncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the MSSQL database.\n port (int): The port of the MSSQL database.\n database (str): The name of the MSSQL database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlDestination.Unencrypted, MssqlDestination.EncryptedTrustServerCertificate, MssqlDestination.EncryptedVerifyCertificate]): The encryption method which is used to communicate with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlDestination.Unencrypted,\n MssqlDestination.EncryptedTrustServerCertificate,\n MssqlDestination.EncryptedVerifyCertificate,\n ),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class PubsubDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, topic_id: str, credentials_json: str):\n """Airbyte Destination for Pubsub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pubsub\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target PubSub.\n topic_id (str): The PubSub topic ID in the given GCP project ID.\n credentials_json (str): The contents of the JSON service account key. Check out the docs if you need help generating this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.topic_id = check.str_param(topic_id, "topic_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Pubsub", name)
\n\n\n
[docs]class R2Destination(GeneratedAirbyteDestination):\n
[docs] class NoCompression:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class Deflate:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Bzip2:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class Xz:\n
[docs] @public\n def __init__(self, codec: str, compression_level: int):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")
\n\n
[docs] class Zstandard:\n
[docs] @public\n def __init__(\n self, codec: str, compression_level: int, include_checksum: Optional[bool] = None\n ):\n self.codec = check.str_param(codec, "codec")\n self.compression_level = check.int_param(compression_level, "compression_level")\n self.include_checksum = check.opt_bool_param(include_checksum, "include_checksum")
\n\n
[docs] class Snappy:\n
[docs] @public\n def __init__(self, codec: str):\n self.codec = check.str_param(codec, "codec")
\n\n
[docs] class AvroApacheAvro:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression_codec: Union[\n "R2Destination.NoCompression",\n "R2Destination.Deflate",\n "R2Destination.Bzip2",\n "R2Destination.Xz",\n "R2Destination.Zstandard",\n "R2Destination.Snappy",\n ],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression_codec = check.inst_param(\n compression_codec,\n "compression_codec",\n (\n R2Destination.NoCompression,\n R2Destination.Deflate,\n R2Destination.Bzip2,\n R2Destination.Xz,\n R2Destination.Zstandard,\n R2Destination.Snappy,\n ),\n )
\n\n
[docs] class GZIP:\n
[docs] @public\n def __init__(self, compression_type: Optional[str] = None):\n self.compression_type = check.opt_str_param(compression_type, "compression_type")
\n\n
[docs] class CSVCommaSeparatedValues:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n flattening: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.flattening = check.str_param(flattening, "flattening")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] class JSONLinesNewlineDelimitedJSON:\n
[docs] @public\n def __init__(\n self,\n format_type: str,\n compression: Union["R2Destination.NoCompression", "R2Destination.GZIP"],\n ):\n self.format_type = check.str_param(format_type, "format_type")\n self.compression = check.inst_param(\n compression, "compression", (R2Destination.NoCompression, R2Destination.GZIP)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n access_key_id: str,\n secret_access_key: str,\n s3_bucket_name: str,\n s3_bucket_path: str,\n format: Union[\n "R2Destination.AvroApacheAvro",\n "R2Destination.CSVCommaSeparatedValues",\n "R2Destination.JSONLinesNewlineDelimitedJSON",\n ],\n s3_path_format: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n ):\n """Airbyte Destination for R2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/r2\n\n Args:\n name (str): The name of the destination.\n account_id (str): Cloudflare account ID\n access_key_id (str): The access key ID to access the R2 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.\n secret_access_key (str): The corresponding secret to the access key ID. Read more here\n s3_bucket_name (str): The name of the R2 bucket. Read more here.\n s3_bucket_path (str): Directory under the R2 bucket where data will be written.\n format (Union[R2Destination.AvroApacheAvro, R2Destination.CSVCommaSeparatedValues, R2Destination.JSONLinesNewlineDelimitedJSON]): Format of the data output. See here for more details\n s3_path_format (Optional[str]): Format string on how data will be organized inside the R2 bucket directory. 
Read more here\n file_name_pattern (Optional[str]): The pattern allows you to set the file-name format for the R2 staging file(s)\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.str_param(s3_bucket_path, "s3_bucket_path")\n self.format = check.inst_param(\n format,\n "format",\n (\n R2Destination.AvroApacheAvro,\n R2Destination.CSVCommaSeparatedValues,\n R2Destination.JSONLinesNewlineDelimitedJSON,\n ),\n )\n self.s3_path_format = check.opt_str_param(s3_path_format, "s3_path_format")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n super().__init__("R2", name)
\n\n\n
[docs]class JdbcDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n schema: Optional[str] = None,\n ):\n """Airbyte Destination for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted url. See the standard here.\n schema (Optional[str]): If you leave the schema unspecified, JDBC defaults to a schema named "public".\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.schema = check.opt_str_param(schema, "schema")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class KeenDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, api_key: str, infer_timestamp: Optional[bool] = None\n ):\n """Airbyte Destination for Keen.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/keen\n\n Args:\n name (str): The name of the destination.\n project_id (str): To get Keen Project ID, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n api_key (str): To get Keen Master API Key, navigate to the Access tab from the left-hand, side panel and check the Project Details section.\n infer_timestamp (Optional[bool]): Allow connector to guess keen.timestamp value based on the streamed data.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.api_key = check.str_param(api_key, "api_key")\n self.infer_timestamp = check.opt_bool_param(infer_timestamp, "infer_timestamp")\n super().__init__("Keen", name)
\n\n\n
[docs]class TidbDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Tidb", name)
\n\n\n
[docs]class FirestoreDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, project_id: str, credentials_json: Optional[str] = None):\n """Airbyte Destination for Firestore.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/firestore\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n credentials_json (Optional[str]): The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.credentials_json = check.opt_str_param(credentials_json, "credentials_json")\n super().__init__("Firestore", name)
\n\n\n
[docs]class ScyllaDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n keyspace: str,\n username: str,\n password: str,\n address: str,\n port: int,\n replication: Optional[int] = None,\n ):\n """Airbyte Destination for Scylla.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scylla\n\n Args:\n name (str): The name of the destination.\n keyspace (str): Default Scylla keyspace to create data in.\n username (str): Username to use to access Scylla.\n password (str): Password associated with Scylla.\n address (str): Address to connect to.\n port (int): Port of Scylla.\n replication (Optional[int]): Indicates to how many nodes the data should be replicated to.\n """\n self.keyspace = check.str_param(keyspace, "keyspace")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.address = check.str_param(address, "address")\n self.port = check.int_param(port, "port")\n self.replication = check.opt_int_param(replication, "replication")\n super().__init__("Scylla", name)
\n\n\n
[docs]class RedisDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self, name: str, host: str, port: int, username: str, password: str, cache_type: str\n ):\n """Airbyte Destination for Redis.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redis\n\n Args:\n name (str): The name of the destination.\n host (str): Redis host to connect to.\n port (int): Port of Redis.\n username (str): Username associated with Redis.\n password (str): Password associated with Redis.\n cache_type (str): Redis cache type to store data in.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.cache_type = check.str_param(cache_type, "cache_type")\n super().__init__("Redis", name)
\n\n\n
[docs]class MqttDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n broker_host: str,\n broker_port: int,\n use_tls: bool,\n topic_pattern: str,\n publisher_sync: bool,\n connect_timeout: int,\n automatic_reconnect: bool,\n clean_session: bool,\n message_retained: bool,\n message_qos: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n topic_test: Optional[str] = None,\n client: Optional[str] = None,\n ):\n """Airbyte Destination for Mqtt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mqtt\n\n Args:\n name (str): The name of the destination.\n broker_host (str): Host of the broker to connect to.\n broker_port (int): Port of the broker.\n use_tls (bool): Whether to use TLS encryption on the connection.\n username (Optional[str]): User name to use for the connection.\n password (Optional[str]): Password to use for the connection.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. 
Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n client (Optional[str]): A client identifier that is unique on the server being connected to.\n publisher_sync (bool): Wait synchronously until the record has been sent to the broker.\n connect_timeout (int): Maximum time interval (in seconds) the client will wait for the network connection to the MQTT server to be established.\n automatic_reconnect (bool): Whether the client will automatically attempt to reconnect to the server if the connection is lost.\n clean_session (bool): Whether the client and server should remember state across restarts and reconnects.\n message_retained (bool): Whether or not the publish message should be retained by the messaging engine.\n message_qos (str): Quality of service used for each message to be delivered.\n """\n self.broker_host = check.str_param(broker_host, "broker_host")\n self.broker_port = check.int_param(broker_port, "broker_port")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.client = check.opt_str_param(client, "client")\n self.publisher_sync = check.bool_param(publisher_sync, "publisher_sync")\n self.connect_timeout = check.int_param(connect_timeout, "connect_timeout")\n self.automatic_reconnect = check.bool_param(automatic_reconnect, "automatic_reconnect")\n self.clean_session = check.bool_param(clean_session, "clean_session")\n self.message_retained = check.bool_param(message_retained, "message_retained")\n self.message_qos = check.str_param(message_qos, "message_qos")\n super().__init__("Mqtt", name)
\n\n\n
[docs]class RedshiftDestination(GeneratedAirbyteDestination):\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class S3Staging:\n
[docs] @public\n def __init__(\n self,\n s3_bucket_name: str,\n s3_bucket_region: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "RedshiftDestination.NoEncryption", "RedshiftDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_path: Optional[str] = None,\n file_name_pattern: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n ):\n self.method = "S3 Staging"\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_path = check.opt_str_param(s3_bucket_path, "s3_bucket_path")\n self.s3_bucket_region = check.str_param(s3_bucket_region, "s3_bucket_region")\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (RedshiftDestination.NoEncryption, RedshiftDestination.AESCBCEnvelopeEncryption),\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n username: str,\n password: str,\n database: str,\n schema: str,\n uploading_method: Union["RedshiftDestination.Standard", "RedshiftDestination.S3Staging"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com)\n port (int): Port of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is "public".\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n uploading_method (Union[RedshiftDestination.Standard, RedshiftDestination.S3Staging]): The method how the data will be uploaded to the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.uploading_method = check.inst_param(\n uploading_method,\n "uploading_method",\n (RedshiftDestination.Standard, RedshiftDestination.S3Staging),\n )\n super().__init__("Redshift", name)
\n\n\n
[docs]class PulsarDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(\n self,\n name: str,\n brokers: str,\n use_tls: bool,\n topic_type: str,\n topic_tenant: str,\n topic_namespace: str,\n topic_pattern: str,\n compression_type: str,\n send_timeout_ms: int,\n max_pending_messages: int,\n max_pending_messages_across_partitions: int,\n batching_enabled: bool,\n batching_max_messages: int,\n batching_max_publish_delay: int,\n block_if_queue_full: bool,\n topic_test: Optional[str] = None,\n producer_name: Optional[str] = None,\n producer_sync: Optional[bool] = None,\n ):\n """Airbyte Destination for Pulsar.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/pulsar\n\n Args:\n name (str): The name of the destination.\n brokers (str): A list of host/port pairs to use for establishing the initial connection to the Pulsar cluster.\n use_tls (bool): Whether to use TLS encryption on the connection.\n topic_type (str): It identifies type of topic. Pulsar supports two kind of topics: persistent and non-persistent. In persistent topic, all messages are durably persisted on disk (that means on multiple disks unless the broker is standalone), whereas non-persistent topic does not persist message into storage disk.\n topic_tenant (str): The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters.\n topic_namespace (str): The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the namespace level. Each tenant has one or multiple namespaces.\n topic_pattern (str): Topic pattern in which the records will be sent. You can use patterns like '{namespace}' and/or '{stream}' to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.\n topic_test (Optional[str]): Topic to test if Airbyte can produce messages.\n producer_name (Optional[str]): Name for the producer. 
If not filled, the system will generate a globally unique name which can be accessed with.\n producer_sync (Optional[bool]): Wait synchronously until the record has been sent to Pulsar.\n compression_type (str): Compression type for the producer.\n send_timeout_ms (int): If a message is not acknowledged by a server before the send-timeout expires, an error occurs (in ms).\n max_pending_messages (int): The maximum size of a queue holding pending messages.\n max_pending_messages_across_partitions (int): The maximum number of pending messages across partitions.\n batching_enabled (bool): Control whether automatic batching of messages is enabled for the producer.\n batching_max_messages (int): Maximum number of messages permitted in a batch.\n batching_max_publish_delay (int): Time period in milliseconds within which the messages sent will be batched.\n block_if_queue_full (bool): If the send operation should block when the outgoing message queue is full.\n """\n self.brokers = check.str_param(brokers, "brokers")\n self.use_tls = check.bool_param(use_tls, "use_tls")\n self.topic_type = check.str_param(topic_type, "topic_type")\n self.topic_tenant = check.str_param(topic_tenant, "topic_tenant")\n self.topic_namespace = check.str_param(topic_namespace, "topic_namespace")\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")\n self.topic_test = check.opt_str_param(topic_test, "topic_test")\n self.producer_name = check.opt_str_param(producer_name, "producer_name")\n self.producer_sync = check.opt_bool_param(producer_sync, "producer_sync")\n self.compression_type = check.str_param(compression_type, "compression_type")\n self.send_timeout_ms = check.int_param(send_timeout_ms, "send_timeout_ms")\n self.max_pending_messages = check.int_param(max_pending_messages, "max_pending_messages")\n self.max_pending_messages_across_partitions = check.int_param(\n max_pending_messages_across_partitions, "max_pending_messages_across_partitions"\n )\n self.batching_enabled 
= check.bool_param(batching_enabled, "batching_enabled")\n self.batching_max_messages = check.int_param(batching_max_messages, "batching_max_messages")\n self.batching_max_publish_delay = check.int_param(\n batching_max_publish_delay, "batching_max_publish_delay"\n )\n self.block_if_queue_full = check.bool_param(block_if_queue_full, "block_if_queue_full")\n super().__init__("Pulsar", name)
\n\n\n
[docs]class SnowflakeDestination(GeneratedAirbyteDestination):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class KeyPairAuthentication:\n
[docs] @public\n def __init__(\n self,\n private_key: str,\n auth_type: Optional[str] = None,\n private_key_password: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.private_key = check.str_param(private_key, "private_key")\n self.private_key_password = check.opt_str_param(\n private_key_password, "private_key_password"\n )
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, password: str):\n self.password = check.str_param(password, "password")
\n\n
[docs] class SelectAnotherOption:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class RecommendedInternalStaging:\n
[docs] @public\n def __init__(self, method: str):\n self.method = check.str_param(method, "method")
\n\n
[docs] class NoEncryption:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_type = "none"
\n\n
[docs] class AESCBCEnvelopeEncryption:\n
[docs] @public\n def __init__(self, key_encrypting_key: Optional[str] = None):\n self.encryption_type = "aes_cbc_envelope"\n self.key_encrypting_key = check.opt_str_param(key_encrypting_key, "key_encrypting_key")
\n\n
[docs] class AWSS3Staging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n s3_bucket_name: str,\n access_key_id: str,\n secret_access_key: str,\n encryption: Union[\n "SnowflakeDestination.NoEncryption", "SnowflakeDestination.AESCBCEnvelopeEncryption"\n ],\n s3_bucket_region: Optional[str] = None,\n purge_staging_data: Optional[bool] = None,\n file_name_pattern: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.s3_bucket_name = check.str_param(s3_bucket_name, "s3_bucket_name")\n self.s3_bucket_region = check.opt_str_param(s3_bucket_region, "s3_bucket_region")\n self.access_key_id = check.str_param(access_key_id, "access_key_id")\n self.secret_access_key = check.str_param(secret_access_key, "secret_access_key")\n self.purge_staging_data = check.opt_bool_param(purge_staging_data, "purge_staging_data")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (SnowflakeDestination.NoEncryption, SnowflakeDestination.AESCBCEnvelopeEncryption),\n )\n self.file_name_pattern = check.opt_str_param(file_name_pattern, "file_name_pattern")
\n\n
[docs] class GoogleCloudStorageStaging:\n
[docs] @public\n def __init__(self, method: str, project_id: str, bucket_name: str, credentials_json: str):\n self.method = check.str_param(method, "method")\n self.project_id = check.str_param(project_id, "project_id")\n self.bucket_name = check.str_param(bucket_name, "bucket_name")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] class AzureBlobStorageStaging:\n
[docs] @public\n def __init__(\n self,\n method: str,\n azure_blob_storage_account_name: str,\n azure_blob_storage_container_name: str,\n azure_blob_storage_sas_token: str,\n azure_blob_storage_endpoint_domain_name: Optional[str] = None,\n ):\n self.method = check.str_param(method, "method")\n self.azure_blob_storage_endpoint_domain_name = check.opt_str_param(\n azure_blob_storage_endpoint_domain_name, "azure_blob_storage_endpoint_domain_name"\n )\n self.azure_blob_storage_account_name = check.str_param(\n azure_blob_storage_account_name, "azure_blob_storage_account_name"\n )\n self.azure_blob_storage_container_name = check.str_param(\n azure_blob_storage_container_name, "azure_blob_storage_container_name"\n )\n self.azure_blob_storage_sas_token = check.str_param(\n azure_blob_storage_sas_token, "azure_blob_storage_sas_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n username: str,\n credentials: Union[\n "SnowflakeDestination.OAuth20",\n "SnowflakeDestination.KeyPairAuthentication",\n "SnowflakeDestination.UsernameAndPassword",\n ],\n loading_method: Union[\n "SnowflakeDestination.SelectAnotherOption",\n "SnowflakeDestination.RecommendedInternalStaging",\n "SnowflakeDestination.AWSS3Staging",\n "SnowflakeDestination.GoogleCloudStorageStaging",\n "SnowflakeDestination.AzureBlobStorageStaging",\n ],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): Enter your Snowflake account's locator (in the format ...snowflakecomputing.com)\n role (str): Enter the role that you want to use to access Snowflake\n warehouse (str): Enter the name of the warehouse that you want to sync data into\n database (str): Enter the name of the database you want to sync data into\n schema (str): Enter the name of the default schema\n username (str): Enter the name of the user you want to use to access the database\n jdbc_url_params (Optional[str]): Enter the additional properties to pass to the JDBC URL string when connecting to the database (formatted as key=value pairs separated by the symbol &). 
Example: key1=value1&key2=value2&key3=value3\n loading_method (Union[SnowflakeDestination.SelectAnotherOption, SnowflakeDestination.RecommendedInternalStaging, SnowflakeDestination.AWSS3Staging, SnowflakeDestination.GoogleCloudStorageStaging, SnowflakeDestination.AzureBlobStorageStaging]): Select a data staging method\n """\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n SnowflakeDestination.OAuth20,\n SnowflakeDestination.KeyPairAuthentication,\n SnowflakeDestination.UsernameAndPassword,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.loading_method = check.inst_param(\n loading_method,\n "loading_method",\n (\n SnowflakeDestination.SelectAnotherOption,\n SnowflakeDestination.RecommendedInternalStaging,\n SnowflakeDestination.AWSS3Staging,\n SnowflakeDestination.GoogleCloudStorageStaging,\n SnowflakeDestination.AzureBlobStorageStaging,\n ),\n )\n super().__init__("Snowflake", name)
\n\n\n
[docs]class PostgresDestination(GeneratedAirbyteDestination):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(self, ca_certificate: str, client_key_password: Optional[str] = None):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: str,\n client_key: str,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.str_param(client_certificate, "client_certificate")\n self.client_key = check.str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n schema: str,\n username: str,\n ssl_mode: Union[\n "PostgresDestination.Disable",\n "PostgresDestination.Allow",\n "PostgresDestination.Prefer",\n "PostgresDestination.Require",\n "PostgresDestination.VerifyCa",\n "PostgresDestination.VerifyFull",\n ],\n password: Optional[str] = None,\n ssl: Optional[bool] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Destination for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schema (str): The default schema tables are written to if the source does not specify a namespace. The usual value for this field is "public".\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresDestination.Disable, PostgresDestination.Allow, PostgresDestination.Prefer, PostgresDestination.Require, PostgresDestination.VerifyCa, PostgresDestination.VerifyFull]): SSL connection modes. disable - Chose this mode to disable encryption of communication between Airbyte and destination database allow - Chose this mode to enable encryption only when required by the source database prefer - Chose this mode to allow unencrypted connection only if the source database does not support encryption require - Chose this mode to always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Chose this mode to always require encryption and to verify that the source database server has a valid SSL certificate verify-full - This is the most secure mode. 
Chose this mode to always require encryption and to verify the identity of the source database server See more information - in the docs.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresDestination.Disable,\n PostgresDestination.Allow,\n PostgresDestination.Prefer,\n PostgresDestination.Require,\n PostgresDestination.VerifyCa,\n PostgresDestination.VerifyFull,\n ),\n )\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Postgres", name)
\n\n\n
[docs]class ScaffoldDestinationPythonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, TODO: Optional[str] = None):\n """Airbyte Destination for Scaffold Destination Python.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/scaffold-destination-python\n\n Args:\n name (str): The name of the destination.\n TODO (Optional[str]): FIX ME\n """\n self.TODO = check.opt_str_param(TODO, "TODO")\n super().__init__("Scaffold Destination Python", name)
\n\n\n
[docs]class LocalJsonDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, destination_path: str):\n """Airbyte Destination for Local Json.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-json\n\n Args:\n name (str): The name of the destination.\n destination_path (str): Path to the directory where json files will be written. The files will be placed inside that local mount. For more information check out our docs\n """\n self.destination_path = check.str_param(destination_path, "destination_path")\n super().__init__("Local Json", name)
\n\n\n
[docs]class MeilisearchDestination(GeneratedAirbyteDestination):\n
[docs] @public\n def __init__(self, name: str, host: str, api_key: Optional[str] = None):\n """Airbyte Destination for Meilisearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/meilisearch\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the MeiliSearch instance.\n api_key (Optional[str]): MeiliSearch API Key. See the docs for more information on how to obtain this key.\n """\n self.host = check.str_param(host, "host")\n self.api_key = check.opt_str_param(api_key, "api_key")\n super().__init__("Meilisearch", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/destinations", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.destinations"}, "sources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.generated.sources

\n# ruff: noqa: A001, A002\nfrom typing import List, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\nfrom dagster_airbyte.managed.types import GeneratedAirbyteSource\n\n\n
[docs]class StravaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n athlete_id: int,\n start_date: str,\n auth_type: Optional[str] = None,\n ):\n """Airbyte Source for Strava.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/strava\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Strava developer application.\n client_secret (str): The Client Secret of your Strava developer application.\n refresh_token (str): The Refresh Token with the activity: read_all permissions.\n athlete_id (int): The Athlete ID of your Strava developer application.\n start_date (str): UTC date and time. Any data before this date will not be replicated.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.athlete_id = check.int_param(athlete_id, "athlete_id")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Strava", name)
\n\n\n
[docs]class AppsflyerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n app_id: str,\n api_token: str,\n start_date: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Appsflyer.\n\n Args:\n name (str): The name of the destination.\n app_id (str): App identifier as found in AppsFlyer.\n api_token (str): Pull API token for authentication. If you change the account admin, the token changes, and you must update scripts with the new token. Get the API token in the Dashboard.\n start_date (str): The default value to use if no bookmark exists for an endpoint. Raw Reports historical lookback is limited to 90 days.\n timezone (Optional[str]): Time zone in which date times are stored. The project timezone may be found in the App settings in the AppsFlyer console.\n """\n self.app_id = check.str_param(app_id, "app_id")\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n super().__init__("Appsflyer", name)
\n\n\n
[docs]class GoogleWorkspaceAdminReportsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, credentials_json: str, email: str, lookback: Optional[int] = None\n ):\n """Airbyte Source for Google Workspace Admin Reports.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-workspace-admin-reports\n\n Args:\n name (str): The name of the destination.\n credentials_json (str): The contents of the JSON service account key. See the docs for more information on how to generate this key.\n email (str): The email of the user, who has permissions to access the Google Workspace Admin APIs.\n lookback (Optional[int]): Sets the range of time shown in the report. The maximum value allowed by the Google API is 180 days.\n """\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")\n self.lookback = check.opt_int_param(lookback, "lookback")\n super().__init__("Google Workspace Admin Reports", name)
\n\n\n
[docs]class CartSource(GeneratedAirbyteSource):\n
[docs] class CentralAPIRouter:\n
[docs] @public\n def __init__(self, user_name: str, user_secret: str, site_id: str):\n self.auth_type = "CENTRAL_API_ROUTER"\n self.user_name = check.str_param(user_name, "user_name")\n self.user_secret = check.str_param(user_secret, "user_secret")\n self.site_id = check.str_param(site_id, "site_id")
\n\n
[docs] class SingleStoreAccessToken:\n
[docs] @public\n def __init__(self, access_token: str, store_name: str):\n self.auth_type = "SINGLE_STORE_ACCESS_TOKEN"\n self.access_token = check.str_param(access_token, "access_token")\n self.store_name = check.str_param(store_name, "store_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["CartSource.CentralAPIRouter", "CartSource.SingleStoreAccessToken"],\n start_date: str,\n ):\n """Airbyte Source for Cart.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cart\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (CartSource.CentralAPIRouter, CartSource.SingleStoreAccessToken),\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Cart", name)
\n\n\n
[docs]class LinkedinAdsSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["LinkedinAdsSource.OAuth20", "LinkedinAdsSource.AccessToken"],\n start_date: str,\n account_ids: Optional[List[int]] = None,\n ):\n """Airbyte Source for Linkedin Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-ads\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2020-09-17. Any data before this date will not be replicated.\n account_ids (Optional[List[int]]): Specify the account IDs separated by a space, to pull the data from. Leave empty, if you want to pull the data from all associated accounts. See the LinkedIn Ads docs for more info.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (LinkedinAdsSource.OAuth20, LinkedinAdsSource.AccessToken)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.account_ids = check.opt_nullable_list_param(account_ids, "account_ids", int)\n super().__init__("Linkedin Ads", name)
\n\n\n
[docs]class MongodbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n user: str,\n password: str,\n auth_source: str,\n replica_set: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mongodb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb\n\n Args:\n name (str): The name of the destination.\n host (str): Host of a Mongo database to be replicated.\n port (int): Port of a Mongo database to be replicated.\n database (str): Database to be replicated.\n user (str): User\n password (str): Password\n auth_source (str): Authentication source where user information is stored. See the Mongo docs for more info.\n replica_set (Optional[str]): The name of the set to filter servers by, when connecting to a replica set (Under this condition, the 'TLS connection' value automatically becomes 'true'). See the Mongo docs for more info.\n ssl (Optional[bool]): If this switch is enabled, TLS connections will be used to connect to MongoDB.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.user = check.str_param(user, "user")\n self.password = check.str_param(password, "password")\n self.auth_source = check.str_param(auth_source, "auth_source")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Mongodb", name)
\n\n\n
[docs]class TimelySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, account_id: str, start_date: str, bearer_token: str):\n """Airbyte Source for Timely.\n\n Args:\n name (str): The name of the destination.\n account_id (str): Timely account id\n start_date (str): start date\n bearer_token (str): Timely bearer token\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.bearer_token = check.str_param(bearer_token, "bearer_token")\n super().__init__("Timely", name)
\n\n\n
[docs]class StockTickerApiTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, stock_ticker: str, api_key: str):\n """Airbyte Source for Stock Ticker Api Tutorial.\n\n Documentation can be found at https://polygon.io/docs/stocks/get_v2_aggs_grouped_locale_us_market_stocks__date\n\n Args:\n name (str): The name of the destination.\n stock_ticker (str): The stock ticker to track\n api_key (str): The Polygon.io Stocks API key to use to hit the API.\n """\n self.stock_ticker = check.str_param(stock_ticker, "stock_ticker")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Stock Ticker Api Tutorial", name)
\n\n\n
[docs]class WrikeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, wrike_instance: str, start_date: Optional[str] = None\n ):\n """Airbyte Source for Wrike.\n\n Args:\n name (str): The name of the destination.\n access_token (str): Permanent access token. You can find documentation on how to acquire a permanent access token here\n wrike_instance (str): Wrike's instance such as `app-us2.wrike.com`\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Only comments after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.wrike_instance = check.str_param(wrike_instance, "wrike_instance")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Wrike", name)
\n\n\n
[docs]class CommercetoolsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n region: str,\n host: str,\n start_date: str,\n project_key: str,\n client_id: str,\n client_secret: str,\n ):\n """Airbyte Source for Commercetools.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/commercetools\n\n Args:\n name (str): The name of the destination.\n region (str): The region of the platform.\n host (str): The cloud provider your shop is hosted. See: https://docs.commercetools.com/api/authorization\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n project_key (str): The project key\n client_id (str): Id of API Client.\n client_secret (str): The password of secret of API Client.\n """\n self.region = check.str_param(region, "region")\n self.host = check.str_param(host, "host")\n self.start_date = check.str_param(start_date, "start_date")\n self.project_key = check.str_param(project_key, "project_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Commercetools", name)
\n\n\n
[docs]class GutendexSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n author_year_start: Optional[str] = None,\n author_year_end: Optional[str] = None,\n copyright: Optional[str] = None,\n languages: Optional[str] = None,\n search: Optional[str] = None,\n sort: Optional[str] = None,\n topic: Optional[str] = None,\n ):\n """Airbyte Source for Gutendex.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gutendex\n\n Args:\n name (str): The name of the destination.\n author_year_start (Optional[str]): (Optional) Defines the minimum birth year of the authors. Books by authors born prior to the start year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n author_year_end (Optional[str]): (Optional) Defines the maximum birth year of the authors. Books by authors born after the end year will not be returned. Supports both positive (CE) or negative (BCE) integer values\n copyright (Optional[str]): (Optional) Use this to find books with a certain copyright status - true for books with existing copyrights, false for books in the public domain in the USA, or null for books with no available copyright information.\n languages (Optional[str]): (Optional) Use this to find books in any of a list of languages. They must be comma-separated, two-character language codes.\n search (Optional[str]): (Optional) Use this to search author names and book titles with given words. They must be separated by a space (i.e. 
%20 in URL-encoded format) and are case-insensitive.\n sort (Optional[str]): (Optional) Use this to sort books - ascending for Project Gutenberg ID numbers from lowest to highest, descending for IDs highest to lowest, or popular (the default) for most popular to least popular by number of downloads.\n topic (Optional[str]): (Optional) Use this to search for a case-insensitive key-phrase in books' bookshelves or subjects.\n """\n self.author_year_start = check.opt_str_param(author_year_start, "author_year_start")\n self.author_year_end = check.opt_str_param(author_year_end, "author_year_end")\n self.copyright = check.opt_str_param(copyright, "copyright")\n self.languages = check.opt_str_param(languages, "languages")\n self.search = check.opt_str_param(search, "search")\n self.sort = check.opt_str_param(sort, "sort")\n self.topic = check.opt_str_param(topic, "topic")\n super().__init__("Gutendex", name)
\n\n\n
[docs]class IterableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Iterable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/iterable\n\n Args:\n name (str): The name of the destination.\n api_key (str): Iterable API Key. See the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for Iterable, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Iterable", name)
\n\n\n
[docs]class QuickbooksSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n realm_id: str,\n user_agent: str,\n start_date: str,\n sandbox: bool,\n ):\n """Airbyte Source for Quickbooks Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/quickbooks\n\n Args:\n name (str): The name of the destination.\n client_id (str): Identifies which app is making the request. Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n client_secret (str): Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.\n refresh_token (str): A token used when refreshing the access token.\n realm_id (str): Labeled Company ID. The Make API Calls panel is populated with the realm id and the current access token.\n user_agent (str): Process and email for API logging purposes. Example: tap-quickbooks .\n start_date (str): The default value to use if no bookmark exists for an endpoint (rfc3339 date string). E.g, 2021-03-20T00:00:00Z. Any data before this date will not be replicated.\n sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.realm_id = check.str_param(realm_id, "realm_id")\n self.user_agent = check.str_param(user_agent, "user_agent")\n self.start_date = check.str_param(start_date, "start_date")\n self.sandbox = check.bool_param(sandbox, "sandbox")\n super().__init__("Quickbooks Singer", name)
\n\n\n
[docs]class BigcommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, store_hash: str, access_token: str):\n """Airbyte Source for Bigcommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigcommerce\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n store_hash (str): The hash code of the store. For https://api.bigcommerce.com/stores/HASH_CODE/v3/, The store's hash code is 'HASH_CODE'.\n access_token (str): Access Token for making authenticated requests.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.store_hash = check.str_param(store_hash, "store_hash")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Bigcommerce", name)
\n\n\n
[docs]class ShopifySource(GeneratedAirbyteSource):\n
[docs] class APIPassword:\n
[docs] @public\n def __init__(self, api_password: str):\n self.auth_method = "api_password"\n self.api_password = check.str_param(api_password, "api_password")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n credentials: Union["ShopifySource.APIPassword", "ShopifySource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Shopify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/shopify\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of your Shopify store found in the URL. For example, if your URL was https://NAME.myshopify.com, then the name would be 'NAME'.\n credentials (Union[ShopifySource.APIPassword, ShopifySource.OAuth20]): The authorization method to use to retrieve data from Shopify\n start_date (str): The date you would like to replicate data from. Format: YYYY-MM-DD. Any data before this date will not be replicated.\n """\n self.shop = check.str_param(shop, "shop")\n self.credentials = check.inst_param(\n credentials, "credentials", (ShopifySource.APIPassword, ShopifySource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shopify", name)
\n\n\n
[docs]class AppstoreSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, key_id: str, private_key: str, issuer_id: str, vendor: str, start_date: str\n ):\n """Airbyte Source for Appstore Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appstore\n\n Args:\n name (str): The name of the destination.\n key_id (str): Appstore Key ID. See the docs for more information on how to obtain this key.\n private_key (str): Appstore Private Key. See the docs for more information on how to obtain this key.\n issuer_id (str): Appstore Issuer ID. See the docs for more information on how to obtain this ID.\n vendor (str): Appstore Vendor ID. See the docs for more information on how to obtain this ID.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.key_id = check.str_param(key_id, "key_id")\n self.private_key = check.str_param(private_key, "private_key")\n self.issuer_id = check.str_param(issuer_id, "issuer_id")\n self.vendor = check.str_param(vendor, "vendor")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Appstore Singer", name)
\n\n\n
[docs]class GreenhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Greenhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/greenhouse\n\n Args:\n name (str): The name of the destination.\n api_key (str): Greenhouse API Key. See the docs for more information on how to generate this key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Greenhouse", name)
\n\n\n
[docs]class ZoomSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, jwt: str):\n """Airbyte Source for Zoom Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoom\n\n Args:\n name (str): The name of the destination.\n jwt (str): Zoom JWT Token. See the docs for more information on how to obtain this key.\n """\n self.jwt = check.str_param(jwt, "jwt")\n super().__init__("Zoom Singer", name)
\n\n\n
[docs]class TiktokMarketingSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self, app_id: str, secret: str, access_token: str, auth_type: Optional[str] = None\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.app_id = check.str_param(app_id, "app_id")\n self.secret = check.str_param(secret, "secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class SandboxAccessToken:\n
[docs] @public\n def __init__(self, advertiser_id: str, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.advertiser_id = check.str_param(advertiser_id, "advertiser_id")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "TiktokMarketingSource.OAuth20", "TiktokMarketingSource.SandboxAccessToken"\n ],\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n report_granularity: Optional[str] = None,\n ):\n """Airbyte Source for Tiktok Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tiktok-marketing\n\n Args:\n name (str): The name of the destination.\n credentials (Union[TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken]): Authentication method\n start_date (Optional[str]): The Start Date in format: YYYY-MM-DD. Any data before this date will not be replicated. If this parameter is not set, all data will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DD. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the data till the current date.\n report_granularity (Optional[str]): The granularity used for aggregating performance data in reports. See the docs.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken),\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.report_granularity = check.opt_str_param(report_granularity, "report_granularity")\n super().__init__("Tiktok Marketing", name)
\n\n\n
[docs]class ZendeskChatSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.credentials = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["ZendeskChatSource.OAuth20", "ZendeskChatSource.AccessToken"],\n subdomain: Optional[str] = None,\n ):\n """Airbyte Source for Zendesk Chat.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-chat\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Chat API, in the format YYYY-MM-DDT00:00:00Z.\n subdomain (Optional[str]): Required if you access Zendesk Chat from a Zendesk Support subdomain.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskChatSource.OAuth20, ZendeskChatSource.AccessToken)\n )\n super().__init__("Zendesk Chat", name)
\n\n\n
[docs]class AwsCloudtrailSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, aws_key_id: str, aws_secret_key: str, aws_region_name: str, start_date: str\n ):\n """Airbyte Source for Aws Cloudtrail.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/aws-cloudtrail\n\n Args:\n name (str): The name of the destination.\n aws_key_id (str): AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.\n aws_secret_key (str): AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.\n aws_region_name (str): The default AWS Region to use, for example, us-west-1 or us-west-2. When specifying a Region inline during client initialization, this property is named region_name.\n start_date (str): The date you would like to replicate data. Data in AWS CloudTrail is available for last 90 days only. Format: YYYY-MM-DD.\n """\n self.aws_key_id = check.str_param(aws_key_id, "aws_key_id")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.aws_region_name = check.str_param(aws_region_name, "aws_region_name")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Aws Cloudtrail", name)
\n\n\n
[docs]class OktaSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["OktaSource.OAuth20", "OktaSource.APIToken"],\n domain: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Okta.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/okta\n\n Args:\n name (str): The name of the destination.\n domain (Optional[str]): The Okta domain. See the docs for instructions on how to find it.\n start_date (Optional[str]): UTC date and time in the format YYYY-MM-DDTHH:MM:SSZ. Any data before this date will not be replicated.\n """\n self.domain = check.opt_str_param(domain, "domain")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (OktaSource.OAuth20, OktaSource.APIToken)\n )\n super().__init__("Okta", name)
\n\n\n
[docs]class InsightlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: Optional[str] = None, start_date: Optional[str] = None):\n """Airbyte Source for Insightly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/insightly\n\n Args:\n name (str): The name of the destination.\n token (Optional[str]): Your Insightly API token.\n start_date (Optional[str]): The date from which you'd like to replicate data for Insightly in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only for incremental streams.\n """\n self.token = check.opt_str_param(token, "token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Insightly", name)
\n\n\n
[docs]class LinkedinPagesSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_method: Optional[str] = None,\n ):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, auth_method: Optional[str] = None):\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n org_id: int,\n credentials: Union["LinkedinPagesSource.OAuth20", "LinkedinPagesSource.AccessToken"],\n ):\n """Airbyte Source for Linkedin Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-pages/\n\n Args:\n name (str): The name of the destination.\n org_id (int): Specify the Organization ID\n """\n self.org_id = check.int_param(org_id, "org_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (LinkedinPagesSource.OAuth20, LinkedinPagesSource.AccessToken),\n )\n super().__init__("Linkedin Pages", name)
\n\n\n
[docs]class PersistiqSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Persistiq.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/persistiq\n\n Args:\n name (str): The name of the destination.\n api_key (str): PersistIq API Key. See the docs for more information on where to find that key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Persistiq", name)
\n\n\n
[docs]class FreshcallerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n start_date: str,\n requests_per_minute: Optional[int] = None,\n sync_lag_minutes: Optional[int] = None,\n ):\n """Airbyte Source for Freshcaller.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshcaller\n\n Args:\n name (str): The name of the destination.\n domain (str): Used to construct Base URL for the Freshcaller APIs\n api_key (str): Freshcaller API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (str): UTC date and time. Any data created after this date will be replicated.\n sync_lag_minutes (Optional[int]): Lag in minutes for each sync, i.e., at time T, data for the time range [prev_sync_time, T-30] will be fetched\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.str_param(start_date, "start_date")\n self.sync_lag_minutes = check.opt_int_param(sync_lag_minutes, "sync_lag_minutes")\n super().__init__("Freshcaller", name)
\n\n\n
[docs]class AppfollowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, ext_id: str, cid: str, api_secret: str, country: str):\n """Airbyte Source for Appfollow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/appfollow\n\n Args:\n name (str): The name of the destination.\n ext_id (str): for App Store \u2014 this is 9-10 digits identification number; for Google Play \u2014 this is bundle name;\n cid (str): client id provided by Appfollow\n api_secret (str): api secret provided by Appfollow\n country (str): getting data by Country\n """\n self.ext_id = check.str_param(ext_id, "ext_id")\n self.cid = check.str_param(cid, "cid")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.country = check.str_param(country, "country")\n super().__init__("Appfollow", name)
\n\n\n
[docs]class FacebookPagesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, access_token: str, page_id: str):\n """Airbyte Source for Facebook Pages.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-pages\n\n Args:\n name (str): The name of the destination.\n access_token (str): Facebook Page Access Token\n page_id (str): Page ID\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.page_id = check.str_param(page_id, "page_id")\n super().__init__("Facebook Pages", name)
\n\n\n
[docs]class JiraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n domain: str,\n email: str,\n projects: Optional[List[str]] = None,\n start_date: Optional[str] = None,\n additional_fields: Optional[List[str]] = None,\n expand_issue_changelog: Optional[bool] = None,\n render_fields: Optional[bool] = None,\n enable_experimental_streams: Optional[bool] = None,\n ):\n """Airbyte Source for Jira.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/jira\n\n Args:\n name (str): The name of the destination.\n api_token (str): Jira API Token. See the docs for more information on how to generate this key.\n domain (str): The Domain for your Jira account, e.g. airbyteio.atlassian.net\n email (str): The user email for your Jira account.\n projects (Optional[List[str]]): List of Jira project keys to replicate data for.\n start_date (Optional[str]): The date from which you'd like to replicate data for Jira in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only in the following incremental streams: issues.\n additional_fields (Optional[List[str]]): List of additional fields to include in replicating issues.\n expand_issue_changelog (Optional[bool]): Expand the changelog when replicating issues.\n render_fields (Optional[bool]): Render issue fields in HTML format in addition to Jira JSON-like format.\n enable_experimental_streams (Optional[bool]): Allow the use of experimental streams which rely on undocumented Jira API endpoints. 
See https://docs.airbyte.com/integrations/sources/jira#experimental-tables for more info.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain = check.str_param(domain, "domain")\n self.email = check.str_param(email, "email")\n self.projects = check.opt_nullable_list_param(projects, "projects", str)\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.additional_fields = check.opt_nullable_list_param(\n additional_fields, "additional_fields", str\n )\n self.expand_issue_changelog = check.opt_bool_param(\n expand_issue_changelog, "expand_issue_changelog"\n )\n self.render_fields = check.opt_bool_param(render_fields, "render_fields")\n self.enable_experimental_streams = check.opt_bool_param(\n enable_experimental_streams, "enable_experimental_streams"\n )\n super().__init__("Jira", name)
\n\n\n
[docs]class GoogleSheetsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n spreadsheet_id: str,\n credentials: Union[\n "GoogleSheetsSource.AuthenticateViaGoogleOAuth",\n "GoogleSheetsSource.ServiceAccountKeyAuthentication",\n ],\n row_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Google Sheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-sheets\n\n Args:\n name (str): The name of the destination.\n spreadsheet_id (str): Enter the link to the Google spreadsheet you want to sync\n row_batch_size (Optional[int]): Number of rows fetched when making a Google Sheet API call. Defaults to 200.\n credentials (Union[GoogleSheetsSource.AuthenticateViaGoogleOAuth, GoogleSheetsSource.ServiceAccountKeyAuthentication]): Credentials for connecting to the Google Sheets API\n """\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.row_batch_size = check.opt_int_param(row_batch_size, "row_batch_size")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleSheetsSource.AuthenticateViaGoogleOAuth,\n GoogleSheetsSource.ServiceAccountKeyAuthentication,\n ),\n )\n super().__init__("Google Sheets", name)
\n\n\n
[docs]class DockerhubSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, docker_username: str):\n """Airbyte Source for Dockerhub.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dockerhub\n\n Args:\n name (str): The name of the destination.\n docker_username (str): Username of DockerHub person or organization (for https://hub.docker.com/v2/repositories/USERNAME/ API call)\n """\n self.docker_username = check.str_param(docker_username, "docker_username")\n super().__init__("Dockerhub", name)
\n\n\n
[docs]class UsCensusSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, query_path: str, api_key: str, query_params: Optional[str] = None\n ):\n """Airbyte Source for Us Census.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/us-census\n\n Args:\n name (str): The name of the destination.\n query_params (Optional[str]): The query parameters portion of the GET request, without the api key\n query_path (str): The path portion of the GET request\n api_key (str): Your API Key. Get your key here.\n """\n self.query_params = check.opt_str_param(query_params, "query_params")\n self.query_path = check.str_param(query_path, "query_path")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Us Census", name)
\n\n\n
[docs]class KustomerSingerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, start_date: str):\n """Airbyte Source for Kustomer Singer.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kustomer\n\n Args:\n name (str): The name of the destination.\n api_token (str): Kustomer API Token. See the docs on how to obtain this\n start_date (str): The date from which you'd like to replicate the data\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Kustomer Singer", name)
\n\n\n
[docs]class AzureTableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n storage_account_name: str,\n storage_access_key: str,\n storage_endpoint_suffix: Optional[str] = None,\n ):\n """Airbyte Source for Azure Table.\n\n Args:\n name (str): The name of the destination.\n storage_account_name (str): The name of your storage account.\n storage_access_key (str): Azure Table Storage Access Key. See the docs for more information on how to obtain this key.\n storage_endpoint_suffix (Optional[str]): Azure Table Storage service account URL suffix. See the docs for more information on how to obtain endpoint suffix\n """\n self.storage_account_name = check.str_param(storage_account_name, "storage_account_name")\n self.storage_access_key = check.str_param(storage_access_key, "storage_access_key")\n self.storage_endpoint_suffix = check.opt_str_param(\n storage_endpoint_suffix, "storage_endpoint_suffix"\n )\n super().__init__("Azure Table", name)
\n\n\n
[docs]class ScaffoldJavaJdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n replication_method: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Scaffold Java Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/scaffold_java_jdbc\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n replication_method (str): Replication method to use for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses the Binlog to detect inserts, updates, and deletes. This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.replication_method = check.str_param(replication_method, "replication_method")\n super().__init__("Scaffold Java Jdbc", name)
\n\n\n
[docs]class TidbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Tidb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tidb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Tidb", name)
\n\n\n
[docs]class QualarooSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n survey_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Qualaroo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/qualaroo\n\n Args:\n name (str): The name of the destination.\n token (str): A Qualaroo token. See the docs for instructions on how to generate it.\n key (str): A Qualaroo token. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all surveys to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Qualaroo", name)
\n\n\n
[docs]class YahooFinancePriceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, tickers: str, interval: Optional[str] = None, range: Optional[str] = None\n ):\n """Airbyte Source for Yahoo Finance Price.\n\n Args:\n name (str): The name of the destination.\n tickers (str): Comma-separated identifiers for the stocks to be queried. Whitespaces are allowed.\n interval (Optional[str]): The interval of between prices queried.\n range (Optional[str]): The range of prices to be queried.\n """\n self.tickers = check.str_param(tickers, "tickers")\n self.interval = check.opt_str_param(interval, "interval")\n self.range = check.opt_str_param(range, "range")\n super().__init__("Yahoo Finance Price", name)
\n\n\n
[docs]class GoogleAnalyticsV4Source(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication",\n ],\n start_date: str,\n view_id: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics V4.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-universal-analytics\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth, GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication]): Credentials for the service\n start_date (str): The date in the format YYYY-MM-DD. Any data before this date will not be replicated.\n view_id (str): The ID for the Google Analytics View you want to fetch data from. This can be found from the Google Analytics Account Explorer.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth,\n GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication,\n ),\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.view_id = check.str_param(view_id, "view_id")\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics V4", name)
\n\n\n
[docs]class JdbcSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n jdbc_url: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Jdbc.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url (str): JDBC formatted URL. See the standard here.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url = check.str_param(jdbc_url, "jdbc_url")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Jdbc", name)
\n\n\n
[docs]class FakerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n count: int,\n seed: Optional[int] = None,\n records_per_sync: Optional[int] = None,\n records_per_slice: Optional[int] = None,\n ):\n """Airbyte Source for Faker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/faker\n\n Args:\n name (str): The name of the destination.\n count (int): How many users should be generated in total. This setting does not apply to the purchases or products stream.\n seed (Optional[int]): Manually control the faker random seed to return the same values on subsequent runs (leave -1 for random)\n records_per_sync (Optional[int]): How many fake records will be returned for each sync, for each stream? By default, it will take 2 syncs to create the requested 1000 records.\n records_per_slice (Optional[int]): How many fake records will be in each page (stream slice), before a state message is emitted?\n """\n self.count = check.int_param(count, "count")\n self.seed = check.opt_int_param(seed, "seed")\n self.records_per_sync = check.opt_int_param(records_per_sync, "records_per_sync")\n self.records_per_slice = check.opt_int_param(records_per_slice, "records_per_slice")\n super().__init__("Faker", name)
\n\n\n
[docs]class TplcentralSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n url_base: str,\n client_id: str,\n client_secret: str,\n user_login_id: Optional[int] = None,\n user_login: Optional[str] = None,\n tpl_key: Optional[str] = None,\n customer_id: Optional[int] = None,\n facility_id: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Tplcentral.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/tplcentral\n\n Args:\n name (str): The name of the destination.\n user_login_id (Optional[int]): User login ID and/or name is required\n user_login (Optional[str]): User login ID and/or name is required\n start_date (Optional[str]): Date and time together in RFC 3339 format, for example, 2018-11-13T20:20:39+00:00.\n """\n self.url_base = check.str_param(url_base, "url_base")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.user_login_id = check.opt_int_param(user_login_id, "user_login_id")\n self.user_login = check.opt_str_param(user_login, "user_login")\n self.tpl_key = check.opt_str_param(tpl_key, "tpl_key")\n self.customer_id = check.opt_int_param(customer_id, "customer_id")\n self.facility_id = check.opt_int_param(facility_id, "facility_id")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Tplcentral", name)
\n\n\n
[docs]class ClickhouseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Clickhouse.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse\n\n Args:\n name (str): The name of the destination.\n host (str): The host endpoint of the Clickhouse cluster.\n port (int): The port of the database.\n database (str): The name of the database.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Clickhouse", name)
\n\n\n
[docs]class FreshserviceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str, start_date: str):\n """Airbyte Source for Freshservice.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshservice\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The name of your Freshservice domain\n api_key (str): Freshservice API Key. See here. The key is case sensitive.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Freshservice", name)
\n\n\n
[docs]class ZenloopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n date_from: Optional[str] = None,\n survey_id: Optional[str] = None,\n survey_group_id: Optional[str] = None,\n ):\n """Airbyte Source for Zenloop.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zenloop\n\n Args:\n name (str): The name of the destination.\n api_token (str): Zenloop API Token. You can get the API token in settings page here\n date_from (Optional[str]): Zenloop date_from. Format: 2021-10-24T03:30:30Z or 2021-10-24. Leave empty if only data from current data should be synced\n survey_id (Optional[str]): Zenloop Survey ID. Can be found here. Leave empty to pull answers from all surveys\n survey_group_id (Optional[str]): Zenloop Survey Group ID. Can be found by pulling All Survey Groups via SurveyGroups stream. Leave empty to pull answers from all survey groups\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.date_from = check.opt_str_param(date_from, "date_from")\n self.survey_id = check.opt_str_param(survey_id, "survey_id")\n self.survey_group_id = check.opt_str_param(survey_group_id, "survey_group_id")\n super().__init__("Zenloop", name)
\n\n\n
[docs]class OracleSource(GeneratedAirbyteSource):\n
[docs] class ServiceName:\n
[docs] @public\n def __init__(self, service_name: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.service_name = check.str_param(service_name, "service_name")
\n\n
[docs] class SystemIDSID:\n
[docs] @public\n def __init__(self, sid: str, connection_type: Optional[str] = None):\n self.connection_type = check.opt_str_param(connection_type, "connection_type")\n self.sid = check.str_param(sid, "sid")
\n\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class NativeNetworkEncryptionNNE:\n
[docs] @public\n def __init__(self, encryption_algorithm: Optional[str] = None):\n self.encryption_method = "client_nne"\n self.encryption_algorithm = check.opt_str_param(\n encryption_algorithm, "encryption_algorithm"\n )
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n connection_data: Union["OracleSource.ServiceName", "OracleSource.SystemIDSID"],\n username: str,\n encryption: Union[\n "OracleSource.Unencrypted",\n "OracleSource.NativeNetworkEncryptionNNE",\n "OracleSource.TLSEncryptedVerifyCertificate",\n ],\n password: Optional[str] = None,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Oracle.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/oracle\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database. Oracle Corporations recommends the following port numbers: 1521 - Default listening port for client connections to the listener. 2484 - Recommended and officially registered listening port for client connections to the listener using TCP/IP with SSL\n connection_data (Union[OracleSource.ServiceName, OracleSource.SystemIDSID]): Connect data that will be used for DB connection\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. 
(example: key1=value1&key2=value2&key3=value3).\n encryption (Union[OracleSource.Unencrypted, OracleSource.NativeNetworkEncryptionNNE, OracleSource.TLSEncryptedVerifyCertificate]): The encryption method with is used when communicating with the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.connection_data = check.inst_param(\n connection_data, "connection_data", (OracleSource.ServiceName, OracleSource.SystemIDSID)\n )\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (\n OracleSource.Unencrypted,\n OracleSource.NativeNetworkEncryptionNNE,\n OracleSource.TLSEncryptedVerifyCertificate,\n ),\n )\n super().__init__("Oracle", name)
\n\n\n
[docs]class KlaviyoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Klaviyo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/klaviyo\n\n Args:\n name (str): The name of the destination.\n api_key (str): Klaviyo API Key. See our docs if you need help finding this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Klaviyo", name)
\n\n\n
[docs]class GoogleDirectorySource(GeneratedAirbyteSource):\n
[docs] class SignInViaGoogleOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n credentials_title: Optional[str] = None,\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKey:\n
[docs] @public\n def __init__(\n self, credentials_json: str, email: str, credentials_title: Optional[str] = None\n ):\n self.credentials_title = check.opt_str_param(credentials_title, "credentials_title")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "GoogleDirectorySource.SignInViaGoogleOAuth", "GoogleDirectorySource.ServiceAccountKey"\n ],\n ):\n """Airbyte Source for Google Directory.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-directory\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey]): Google APIs use the OAuth 2.0 protocol for authentication and authorization. The Source supports Web server application and Service accounts scenarios.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (GoogleDirectorySource.SignInViaGoogleOAuth, GoogleDirectorySource.ServiceAccountKey),\n )\n super().__init__("Google Directory", name)
\n\n\n
[docs]class InstagramSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Instagram.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/instagram\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for User Insights, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n access_token (str): The value of the access token generated. See the docs for more information\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Instagram", name)
\n\n\n
[docs]class ShortioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_id: str, secret_key: str, start_date: str):\n """Airbyte Source for Shortio.\n\n Documentation can be found at https://developers.short.io/reference\n\n Args:\n name (str): The name of the destination.\n secret_key (str): Short.io Secret Key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_id = check.str_param(domain_id, "domain_id")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Shortio", name)
\n\n\n
[docs]class SquareSource(GeneratedAirbyteSource):\n
[docs] class OauthAuthentication:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Oauth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.auth_type = "Apikey"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n is_sandbox: bool,\n credentials: Union["SquareSource.OauthAuthentication", "SquareSource.APIKey"],\n start_date: Optional[str] = None,\n include_deleted_objects: Optional[bool] = None,\n ):\n """Airbyte Source for Square.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/square\n\n Args:\n name (str): The name of the destination.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n start_date (Optional[str]): UTC date in the format YYYY-MM-DD. Any data before this date will not be replicated. If not set, all data will be replicated.\n include_deleted_objects (Optional[bool]): In some streams there is an option to include deleted objects (Items, Categories, Discounts, Taxes)\n """\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.include_deleted_objects = check.opt_bool_param(\n include_deleted_objects, "include_deleted_objects"\n )\n self.credentials = check.inst_param(\n credentials, "credentials", (SquareSource.OauthAuthentication, SquareSource.APIKey)\n )\n super().__init__("Square", name)
\n\n\n
[docs]class DelightedSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, since: str, api_key: str):\n """Airbyte Source for Delighted.\n\n Args:\n name (str): The name of the destination.\n since (str): The date from which you'd like to replicate the data\n api_key (str): A Delighted API key.\n """\n self.since = check.str_param(since, "since")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Delighted", name)
\n\n\n
[docs]class AmazonSqsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n queue_url: str,\n region: str,\n delete_messages: bool,\n max_batch_size: Optional[int] = None,\n max_wait_time: Optional[int] = None,\n attributes_to_return: Optional[str] = None,\n visibility_timeout: Optional[int] = None,\n access_key: Optional[str] = None,\n secret_key: Optional[str] = None,\n ):\n """Airbyte Source for Amazon Sqs.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-sqs\n\n Args:\n name (str): The name of the destination.\n queue_url (str): URL of the SQS Queue\n region (str): AWS Region of the SQS Queue\n delete_messages (bool): If Enabled, messages will be deleted from the SQS Queue after being read. If Disabled, messages are left in the queue and can be read more than once. WARNING: Enabling this option can result in data loss in cases of failure, use with caution, see documentation for more detail.\n max_batch_size (Optional[int]): Max amount of messages to get in one batch (10 max)\n max_wait_time (Optional[int]): Max amount of time in seconds to wait for messages in a single poll (20 max)\n attributes_to_return (Optional[str]): Comma separated list of Mesage Attribute names to return\n visibility_timeout (Optional[int]): Modify the Visibility Timeout of the individual message from the Queue's default (seconds).\n access_key (Optional[str]): The Access Key ID of the AWS IAM Role to use for pulling messages\n secret_key (Optional[str]): The Secret Key of the AWS IAM Role to use for pulling messages\n """\n self.queue_url = check.str_param(queue_url, "queue_url")\n self.region = check.str_param(region, "region")\n self.delete_messages = check.bool_param(delete_messages, "delete_messages")\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n self.max_wait_time = check.opt_int_param(max_wait_time, "max_wait_time")\n self.attributes_to_return = check.opt_str_param(\n attributes_to_return, "attributes_to_return"\n )\n 
self.visibility_timeout = check.opt_int_param(visibility_timeout, "visibility_timeout")\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.secret_key = check.opt_str_param(secret_key, "secret_key")\n super().__init__("Amazon Sqs", name)
\n\n\n
[docs]class YoutubeAnalyticsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaOAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(self, name: str, credentials: "YoutubeAnalyticsSource.AuthenticateViaOAuth20"):\n """Airbyte Source for Youtube Analytics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/youtube-analytics\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", YoutubeAnalyticsSource.AuthenticateViaOAuth20\n )\n super().__init__("Youtube Analytics", name)
\n\n\n
[docs]class ScaffoldSourcePythonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, fix_me: Optional[str] = None):\n """Airbyte Source for Scaffold Source Python.\n\n Args:\n name (str): The name of the destination.\n fix_me (Optional[str]): describe me\n """\n self.fix_me = check.opt_str_param(fix_me, "fix_me")\n super().__init__("Scaffold Source Python", name)
\n\n\n
[docs]class LookerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n client_id: str,\n client_secret: str,\n run_look_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Looker.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/looker\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain for your Looker account, e.g. airbyte.cloud.looker.com,looker.[clientname].com,IP address\n client_id (str): The Client ID is first part of an API3 key that is specific to each Looker user. See the docs for more information on how to generate this key.\n client_secret (str): The Client Secret is second part of an API3 key.\n run_look_ids (Optional[List[str]]): The IDs of any Looks to run\n """\n self.domain = check.str_param(domain, "domain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.run_look_ids = check.opt_nullable_list_param(run_look_ids, "run_look_ids", str)\n super().__init__("Looker", name)
\n\n\n
[docs]class GitlabSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_url: str,\n private_token: str,\n start_date: str,\n groups: Optional[str] = None,\n projects: Optional[str] = None,\n ):\n """Airbyte Source for Gitlab.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gitlab\n\n Args:\n name (str): The name of the destination.\n api_url (str): Please enter your basic URL from GitLab instance.\n private_token (str): Log into your GitLab account and then generate a personal Access Token.\n groups (Optional[str]): Space-delimited list of groups. e.g. airbyte.io.\n projects (Optional[str]): Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.\n start_date (str): The date from which you'd like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.api_url = check.str_param(api_url, "api_url")\n self.private_token = check.str_param(private_token, "private_token")\n self.groups = check.opt_str_param(groups, "groups")\n self.projects = check.opt_str_param(projects, "projects")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gitlab", name)
\n\n\n
[docs]class ExchangeRatesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n access_key: str,\n base: Optional[str] = None,\n ignore_weekends: Optional[bool] = None,\n ):\n """Airbyte Source for Exchange Rates.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start getting data from that date.\n access_key (str): Your API Key. See here. The key is case sensitive.\n base (Optional[str]): ISO reference currency. See here. Free plan doesn't support Source Currency Switching, default base currency is EUR\n ignore_weekends (Optional[bool]): Ignore weekends? (Exchanges don't run on weekends)\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_key = check.str_param(access_key, "access_key")\n self.base = check.opt_str_param(base, "base")\n self.ignore_weekends = check.opt_bool_param(ignore_weekends, "ignore_weekends")\n super().__init__("Exchange Rates", name)
\n\n\n
[docs]class AmazonAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n region: Optional[str] = None,\n report_wait_timeout: Optional[int] = None,\n report_generation_max_retries: Optional[int] = None,\n start_date: Optional[str] = None,\n profiles: Optional[List[int]] = None,\n state_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Amazon Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-ads\n\n Args:\n name (str): The name of the destination.\n client_id (str): The client ID of your Amazon Ads developer application. See the docs for more information.\n client_secret (str): The client secret of your Amazon Ads developer application. See the docs for more information.\n refresh_token (str): Amazon Ads refresh token. See the docs for more information on how to obtain this token.\n region (Optional[str]): Region to pull data from (EU/NA/FE). See docs for more details.\n report_wait_timeout (Optional[int]): Timeout duration in minutes for Reports. Default is 60 minutes.\n report_generation_max_retries (Optional[int]): Maximum retries Airbyte will attempt for fetching report data. Default is 5.\n start_date (Optional[str]): The Start date for collecting reports, should not be more than 60 days in the past. In YYYY-MM-DD format\n profiles (Optional[List[int]]): Profile IDs you want to fetch data for. See docs for more details.\n state_filter (Optional[List[str]]): Reflects the state of the Display, Product, and Brand Campaign streams as enabled, paused, or archived. 
If you do not populate this field, it will be ignored completely.\n """\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.region = check.opt_str_param(region, "region")\n self.report_wait_timeout = check.opt_int_param(report_wait_timeout, "report_wait_timeout")\n self.report_generation_max_retries = check.opt_int_param(\n report_generation_max_retries, "report_generation_max_retries"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.profiles = check.opt_nullable_list_param(profiles, "profiles", int)\n self.state_filter = check.opt_nullable_list_param(state_filter, "state_filter", str)\n super().__init__("Amazon Ads", name)
\n\n\n
[docs]class MixpanelSource(GeneratedAirbyteSource):\n
[docs] class ServiceAccount:\n
[docs] @public\n def __init__(self, username: str, secret: str):\n self.username = check.str_param(username, "username")\n self.secret = check.str_param(secret, "secret")
\n\n
[docs] class ProjectSecret:\n
[docs] @public\n def __init__(self, api_secret: str):\n self.api_secret = check.str_param(api_secret, "api_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["MixpanelSource.ServiceAccount", "MixpanelSource.ProjectSecret"],\n project_id: Optional[int] = None,\n attribution_window: Optional[int] = None,\n project_timezone: Optional[str] = None,\n select_properties_by_default: Optional[bool] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n region: Optional[str] = None,\n date_window_size: Optional[int] = None,\n ):\n """Airbyte Source for Mixpanel.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mixpanel\n\n Args:\n name (str): The name of the destination.\n credentials (Union[MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret]): Choose how to authenticate to Mixpanel\n project_id (Optional[int]): Your project ID number. See the docs for more information on how to obtain this.\n attribution_window (Optional[int]): A period of time for attributing results to ads and the lookback period after those actions occur during which ad results are counted. Default attribution window is 5 days.\n project_timezone (Optional[str]): Time zone in which integer date times are stored. The project timezone may be found in the project settings in the Mixpanel console.\n select_properties_by_default (Optional[bool]): Setting this config parameter to TRUE ensures that new properties on events and engage records are captured. Otherwise new properties will be ignored.\n start_date (Optional[str]): The date in the format YYYY-MM-DD. Any data before this date will not be replicated. If this option is not set, the connector will replicate data from up to one year ago by default.\n end_date (Optional[str]): The date in the format YYYY-MM-DD. Any data after this date will not be replicated. 
Left empty to always sync to most recent date\n region (Optional[str]): The region of mixpanel domain instance either US or EU.\n date_window_size (Optional[int]): Defines window size in days, that used to slice through data. You can reduce it, if amount of data in each window is too big for your environment.\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret),\n )\n self.project_id = check.opt_int_param(project_id, "project_id")\n self.attribution_window = check.opt_int_param(attribution_window, "attribution_window")\n self.project_timezone = check.opt_str_param(project_timezone, "project_timezone")\n self.select_properties_by_default = check.opt_bool_param(\n select_properties_by_default, "select_properties_by_default"\n )\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.region = check.opt_str_param(region, "region")\n self.date_window_size = check.opt_int_param(date_window_size, "date_window_size")\n super().__init__("Mixpanel", name)
\n\n\n
[docs]class OrbitSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, workspace: str, start_date: Optional[str] = None):\n """Airbyte Source for Orbit.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/orbit\n\n Args:\n name (str): The name of the destination.\n api_token (str): Authorizes you to work with Orbit workspaces associated with the token.\n workspace (str): The unique name of the workspace that your API token is associated with.\n start_date (Optional[str]): Date in the format 2022-06-26. Only load members whose last activities are after this date.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.workspace = check.str_param(workspace, "workspace")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Orbit", name)
\n\n\n
[docs]class AmazonSellerPartnerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lwa_app_id: str,\n lwa_client_secret: str,\n refresh_token: str,\n aws_access_key: str,\n aws_secret_key: str,\n role_arn: str,\n replication_start_date: str,\n aws_environment: str,\n region: str,\n app_id: Optional[str] = None,\n auth_type: Optional[str] = None,\n replication_end_date: Optional[str] = None,\n period_in_days: Optional[int] = None,\n report_options: Optional[str] = None,\n max_wait_seconds: Optional[int] = None,\n ):\n """Airbyte Source for Amazon Seller Partner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-seller-partner\n\n Args:\n name (str): The name of the destination.\n app_id (Optional[str]): Your Amazon App ID\n lwa_app_id (str): Your Login with Amazon Client ID.\n lwa_client_secret (str): Your Login with Amazon Client Secret.\n refresh_token (str): The Refresh Token obtained via OAuth flow authorization.\n aws_access_key (str): Specifies the AWS access key used as part of the credentials to authenticate the user.\n aws_secret_key (str): Specifies the AWS secret key used as part of the credentials to authenticate the user.\n role_arn (str): Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. (Needs permission to 'Assume Role' STS).\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n replication_end_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data after this date will not be replicated.\n period_in_days (Optional[int]): Will be used for stream slicing for initial full_refresh sync when no updated state is present for reports that support sliced incremental sync.\n report_options (Optional[str]): Additional information passed to reports. This varies by report type. 
Must be a valid json string.\n max_wait_seconds (Optional[int]): Sometimes report can take up to 30 minutes to generate. This will set the limit for how long to wait for a successful report.\n aws_environment (str): An enumeration.\n region (str): An enumeration.\n """\n self.app_id = check.opt_str_param(app_id, "app_id")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.lwa_app_id = check.str_param(lwa_app_id, "lwa_app_id")\n self.lwa_client_secret = check.str_param(lwa_client_secret, "lwa_client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.aws_access_key = check.str_param(aws_access_key, "aws_access_key")\n self.aws_secret_key = check.str_param(aws_secret_key, "aws_secret_key")\n self.role_arn = check.str_param(role_arn, "role_arn")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.replication_end_date = check.opt_str_param(\n replication_end_date, "replication_end_date"\n )\n self.period_in_days = check.opt_int_param(period_in_days, "period_in_days")\n self.report_options = check.opt_str_param(report_options, "report_options")\n self.max_wait_seconds = check.opt_int_param(max_wait_seconds, "max_wait_seconds")\n self.aws_environment = check.str_param(aws_environment, "aws_environment")\n self.region = check.str_param(region, "region")\n super().__init__("Amazon Seller Partner", name)
\n\n\n
[docs]class CourierSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Courier.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/courier\n\n Args:\n name (str): The name of the destination.\n api_key (str): Courier API Key to retrieve your data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Courier", name)
\n\n\n
[docs]class CloseComSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: Optional[str] = None):\n r"""Airbyte Source for Close Com.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/close-com\n\n Args:\n name (str): The name of the destination.\n api_key (str): Close.com API key (usually starts with 'api\\\\_'; find yours here).\n start_date (Optional[str]): The start date to sync data. Leave blank for full sync. Format: YYYY-MM-DD.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Close Com", name)
\n\n\n
[docs]class BingAdsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n refresh_token: str,\n developer_token: str,\n reports_start_date: str,\n auth_method: Optional[str] = None,\n tenant_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n """Airbyte Source for Bing Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bing-ads\n\n Args:\n name (str): The name of the destination.\n tenant_id (Optional[str]): The Tenant ID of your Microsoft Advertising developer application. Set this to "common" unless you know you need a different value.\n client_id (str): The Client ID of your Microsoft Advertising developer application.\n client_secret (Optional[str]): The Client Secret of your Microsoft Advertising developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n developer_token (str): Developer token associated with user. See more info in the docs.\n reports_start_date (str): The start date from which to begin replicating report data. Any data generated before this date will not be replicated in reports. This is a UTC date in YYYY-MM-DD format.\n """\n self.auth_method = check.opt_str_param(auth_method, "auth_method")\n self.tenant_id = check.opt_str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.reports_start_date = check.str_param(reports_start_date, "reports_start_date")\n super().__init__("Bing Ads", name)
\n\n\n
[docs]class PrimetricSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, client_id: str, client_secret: str):\n """Airbyte Source for Primetric.\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Primetric developer application. The Client ID is visible here.\n client_secret (str): The Client Secret of your Primetric developer application. You can manage your client's credentials here.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Primetric", name)
\n\n\n
[docs]class PivotalTrackerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Pivotal Tracker.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Pivotal Tracker API token\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Pivotal Tracker", name)
\n\n\n
[docs]class ElasticsearchSource(GeneratedAirbyteSource):\n
[docs] class None_:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "none"
\n\n
[docs] class ApiKeySecret:\n
[docs] @public\n def __init__(self, apiKeyId: str, apiKeySecret: str):\n self.method = "secret"\n self.apiKeyId = check.str_param(apiKeyId, "apiKeyId")\n self.apiKeySecret = check.str_param(apiKeySecret, "apiKeySecret")
\n\n
[docs] class UsernamePassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.method = "basic"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n endpoint: str,\n authenticationMethod: Union[\n "ElasticsearchSource.None_",\n "ElasticsearchSource.ApiKeySecret",\n "ElasticsearchSource.UsernamePassword",\n ],\n ):\n r"""Airbyte Source for Elasticsearch.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/elasticsearch\n\n Args:\n name (str): The name of the destination.\n endpoint (str): The full url of the Elasticsearch server\n authenticationMethod (Union[ElasticsearchSource.None\\\\_, ElasticsearchSource.ApiKeySecret, ElasticsearchSource.UsernamePassword]): The type of authentication to be used\n """\n self.endpoint = check.str_param(endpoint, "endpoint")\n self.authenticationMethod = check.inst_param(\n authenticationMethod,\n "authenticationMethod",\n (\n ElasticsearchSource.None_,\n ElasticsearchSource.ApiKeySecret,\n ElasticsearchSource.UsernamePassword,\n ),\n )\n super().__init__("Elasticsearch", name)
\n\n\n
[docs]class BigquerySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, project_id: str, credentials_json: str, dataset_id: Optional[str] = None\n ):\n """Airbyte Source for Bigquery.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bigquery\n\n Args:\n name (str): The name of the destination.\n project_id (str): The GCP project ID for the project containing the target BigQuery dataset.\n dataset_id (Optional[str]): The dataset ID to search for tables and views. If you are only loading data from one dataset, setting this option could result in much faster schema discovery.\n credentials_json (str): The contents of your Service Account Key JSON file. See the docs for more information on how to obtain this key.\n """\n self.project_id = check.str_param(project_id, "project_id")\n self.dataset_id = check.opt_str_param(dataset_id, "dataset_id")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")\n super().__init__("Bigquery", name)
\n\n\n
[docs]class WoocommerceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n shop: str,\n start_date: str,\n api_key: str,\n api_secret: str,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Woocommerce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/woocommerce\n\n Args:\n name (str): The name of the destination.\n shop (str): The name of the store. For https://EXAMPLE.com, the shop name is 'EXAMPLE.com'.\n start_date (str): The date you would like to replicate data. Format: YYYY-MM-DD.\n api_key (str): The CUSTOMER KEY for API in WooCommerce shop.\n api_secret (str): The CUSTOMER SECRET for API in WooCommerce shop.\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads.\n """\n self.shop = check.str_param(shop, "shop")\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.api_secret = check.str_param(api_secret, "api_secret")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Woocommerce", name)
\n\n\n
[docs]class SearchMetricsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_key: str, client_secret: str, country_code: str, start_date: str\n ):\n """Airbyte Source for Search Metrics.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/seacrh-metrics\n\n Args:\n name (str): The name of the destination.\n country_code (str): The region of the S3 staging bucket to use if utilising a copy strategy.\n start_date (str): Data generated in SearchMetrics after this date will be replicated. This date must be specified in the format YYYY-MM-DDT00:00:00Z.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.country_code = check.str_param(country_code, "country_code")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Search Metrics", name)
\n\n\n
[docs]class TypeformSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, start_date: str, token: str, form_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Typeform.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/typeform\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format: YYYY-MM-DDTHH:mm:ss[Z]. Any data before this date will not be replicated.\n token (str): The API Token for a Typeform account.\n form_ids (Optional[List[str]]): When this parameter is set, the connector will replicate data only from the input forms. Otherwise, all forms in your Typeform account will be replicated. You can find form IDs in your form URLs. For example, in the URL "https://mysite.typeform.com/to/u6nXL7" the form_id is u6nXL7. You can find form URLs on Share panel\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.token = check.str_param(token, "token")\n self.form_ids = check.opt_nullable_list_param(form_ids, "form_ids", str)\n super().__init__("Typeform", name)
\n\n\n
[docs]class WebflowSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, site_id: str, api_key: str):\n """Airbyte Source for Webflow.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/webflow\n\n Args:\n name (str): The name of the destination.\n site_id (str): The id of the Webflow site you are requesting data from. See https://developers.webflow.com/#sites\n api_key (str): The API token for authenticating to Webflow. See https://university.webflow.com/lesson/intro-to-the-webflow-api\n """\n self.site_id = check.str_param(site_id, "site_id")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Webflow", name)
\n\n\n
[docs]class FireboltSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n username: str,\n password: str,\n database: str,\n account: Optional[str] = None,\n host: Optional[str] = None,\n engine: Optional[str] = None,\n ):\n """Airbyte Source for Firebolt.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/firebolt\n\n Args:\n name (str): The name of the destination.\n username (str): Firebolt email address you use to login.\n password (str): Firebolt password.\n account (Optional[str]): Firebolt account to login.\n host (Optional[str]): The host name of your Firebolt database.\n database (str): The database to connect to.\n engine (Optional[str]): Engine name or url to connect to.\n """\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.account = check.opt_str_param(account, "account")\n self.host = check.opt_str_param(host, "host")\n self.database = check.str_param(database, "database")\n self.engine = check.opt_str_param(engine, "engine")\n super().__init__("Firebolt", name)
\n\n\n
[docs]class FaunaSource(GeneratedAirbyteSource):\n
[docs] class Disabled:\n
[docs] @public\n def __init__(\n self,\n ):\n self.deletion_mode = "ignore"
\n\n
[docs] class Enabled:\n
[docs] @public\n def __init__(self, column: str):\n self.deletion_mode = "deleted_field"\n self.column = check.str_param(column, "column")
\n\n
[docs] class Collection:\n
[docs] @public\n def __init__(\n self, page_size: int, deletions: Union["FaunaSource.Disabled", "FaunaSource.Enabled"]\n ):\n self.page_size = check.int_param(page_size, "page_size")\n self.deletions = check.inst_param(\n deletions, "deletions", (FaunaSource.Disabled, FaunaSource.Enabled)\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n port: int,\n scheme: str,\n secret: str,\n collection: "FaunaSource.Collection",\n ):\n """Airbyte Source for Fauna.\n\n Documentation can be found at https://github.com/fauna/airbyte/blob/source-fauna/docs/integrations/sources/fauna.md\n\n Args:\n name (str): The name of the destination.\n domain (str): Domain of Fauna to query. Defaults db.fauna.com. See the docs.\n port (int): Endpoint port.\n scheme (str): URL scheme.\n secret (str): Fauna secret, used when authenticating with the database.\n collection (FaunaSource.Collection): Settings for the Fauna Collection.\n """\n self.domain = check.str_param(domain, "domain")\n self.port = check.int_param(port, "port")\n self.scheme = check.str_param(scheme, "scheme")\n self.secret = check.str_param(secret, "secret")\n self.collection = check.inst_param(collection, "collection", FaunaSource.Collection)\n super().__init__("Fauna", name)
\n\n\n
[docs]class IntercomSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Intercom.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/intercom\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n access_token (str): Access token for making authenticated requests. See the Intercom docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Intercom", name)
\n\n\n
[docs]class FreshsalesSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, domain_name: str, api_key: str):\n """Airbyte Source for Freshsales.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshsales\n\n Args:\n name (str): The name of the destination.\n domain_name (str): The Name of your Freshsales domain\n api_key (str): Freshsales API Key. See here. The key is case sensitive.\n """\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Freshsales", name)
\n\n\n
[docs]class AdjustSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_token: str,\n dimensions: List[str],\n ingest_start: str,\n metrics: List[str],\n additional_metrics: Optional[List[str]] = None,\n until_today: Optional[bool] = None,\n ):\n """Airbyte Source for Adjust.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/adjust\n\n Args:\n name (str): The name of the destination.\n additional_metrics (Optional[List[str]]): Metrics names that are not pre-defined, such as cohort metrics or app specific metrics.\n api_token (str): Adjust API key, see https://help.adjust.com/en/article/report-service-api-authentication\n dimensions (List[str]): Dimensions allow a user to break down metrics into groups using one or several parameters. For example, the number of installs by date, country and network. See https://help.adjust.com/en/article/reports-endpoint#dimensions for more information about the dimensions.\n ingest_start (str): Data ingest start date.\n metrics (List[str]): Select at least one metric to query.\n until_today (Optional[bool]): Syncs data up until today. Useful when running daily incremental syncs, and duplicates are not desired.\n """\n self.additional_metrics = check.opt_nullable_list_param(\n additional_metrics, "additional_metrics", str\n )\n self.api_token = check.str_param(api_token, "api_token")\n self.dimensions = check.list_param(dimensions, "dimensions", str)\n self.ingest_start = check.str_param(ingest_start, "ingest_start")\n self.metrics = check.list_param(metrics, "metrics", str)\n self.until_today = check.opt_bool_param(until_today, "until_today")\n super().__init__("Adjust", name)
\n\n\n
[docs]class BambooHrSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n api_key: str,\n custom_reports_fields: Optional[str] = None,\n custom_reports_include_default_fields: Optional[bool] = None,\n ):\n """Airbyte Source for Bamboo Hr.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/bamboo-hr\n\n Args:\n name (str): The name of the destination.\n subdomain (str): Sub Domain of bamboo hr\n api_key (str): Api key of bamboo hr\n custom_reports_fields (Optional[str]): Comma-separated list of fields to include in custom reports.\n custom_reports_include_default_fields (Optional[bool]): If true, the custom reports endpoint will include the default fields defined here: https://documentation.bamboohr.com/docs/list-of-field-names.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.api_key = check.str_param(api_key, "api_key")\n self.custom_reports_fields = check.opt_str_param(\n custom_reports_fields, "custom_reports_fields"\n )\n self.custom_reports_include_default_fields = check.opt_bool_param(\n custom_reports_include_default_fields, "custom_reports_include_default_fields"\n )\n super().__init__("Bamboo Hr", name)
\n\n\n
[docs]class GoogleAdsSource(GeneratedAirbyteSource):\n
[docs] class GoogleCredentials:\n
[docs] @public\n def __init__(\n self,\n developer_token: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.developer_token = check.str_param(developer_token, "developer_token")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class CustomGAQLQueriesEntry:\n
[docs] @public\n def __init__(self, query: str, table_name: str):\n self.query = check.str_param(query, "query")\n self.table_name = check.str_param(table_name, "table_name")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "GoogleAdsSource.GoogleCredentials",\n customer_id: str,\n start_date: str,\n end_date: Optional[str] = None,\n custom_queries: Optional[List[CustomGAQLQueriesEntry]] = None,\n login_customer_id: Optional[str] = None,\n conversion_window_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Ads.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-ads\n\n Args:\n name (str): The name of the destination.\n customer_id (str): Comma separated list of (client) customer IDs. Each customer ID must be specified as a 10-digit number without dashes. More instruction on how to find this value in our docs. Metrics streams like AdGroupAdReport cannot be requested for a manager account.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n login_customer_id (Optional[str]): If your access to the customer account is through a manager account, this field is required and must be set to the customer ID of the manager account (10-digit number without dashes). More information about this field you can see here\n conversion_window_days (Optional[int]): A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads. 
For more information, see Google's documentation.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", GoogleAdsSource.GoogleCredentials\n )\n self.customer_id = check.str_param(customer_id, "customer_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.custom_queries = check.opt_nullable_list_param(\n custom_queries, "custom_queries", GoogleAdsSource.CustomGAQLQueriesEntry\n )\n self.login_customer_id = check.opt_str_param(login_customer_id, "login_customer_id")\n self.conversion_window_days = check.opt_int_param(\n conversion_window_days, "conversion_window_days"\n )\n super().__init__("Google Ads", name)
\n\n\n
[docs]class HellobatonSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, company: str):\n """Airbyte Source for Hellobaton.\n\n Args:\n name (str): The name of the destination.\n api_key (str): authentication key required to access the api endpoints\n company (str): Company name that generates your base api url\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.company = check.str_param(company, "company")\n super().__init__("Hellobaton", name)
\n\n\n
[docs]class SendgridSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, apikey: str, start_time: Union[int, str]):\n """Airbyte Source for Sendgrid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sendgrid\n\n Args:\n name (str): The name of the destination.\n apikey (str): API Key, use admin to generate this key.\n start_time (Union[int, str]): Start time in ISO8601 format. Any data before this time point will not be replicated.\n """\n self.apikey = check.str_param(apikey, "apikey")\n self.start_time = check.inst_param(start_time, "start_time", (int, str))\n super().__init__("Sendgrid", name)
\n\n\n
[docs]class MondaySource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n subdomain: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.subdomain = check.opt_str_param(subdomain, "subdomain")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "api_token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MondaySource.OAuth20", "MondaySource.APIToken"]\n ):\n """Airbyte Source for Monday.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/monday\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MondaySource.OAuth20, MondaySource.APIToken)\n )\n super().__init__("Monday", name)
\n\n\n
[docs]class DixaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, api_token: str, start_date: str, batch_size: Optional[int] = None\n ):\n """Airbyte Source for Dixa.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/dixa\n\n Args:\n name (str): The name of the destination.\n api_token (str): Dixa API token\n start_date (str): The connector pulls records updated from this date onwards.\n batch_size (Optional[int]): Number of days to batch into one request. Max 31.\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n super().__init__("Dixa", name)
\n\n\n
[docs]class SalesforceSource(GeneratedAirbyteSource):\n
[docs] class FilterSalesforceObjectsEntry:\n
[docs] @public\n def __init__(self, criteria: str, value: str):\n self.criteria = check.str_param(criteria, "criteria")\n self.value = check.str_param(value, "value")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n is_sandbox: Optional[bool] = None,\n auth_type: Optional[str] = None,\n start_date: Optional[str] = None,\n streams_criteria: Optional[List[FilterSalesforceObjectsEntry]] = None,\n ):\n """Airbyte Source for Salesforce.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesforce\n\n Args:\n name (str): The name of the destination.\n is_sandbox (Optional[bool]): Toggle if you're using a Salesforce Sandbox\n client_id (str): Enter your Salesforce developer application's Client ID\n client_secret (str): Enter your Salesforce developer application's Client secret\n refresh_token (str): Enter your application's Salesforce Refresh Token used for Airbyte to access your Salesforce account.\n start_date (Optional[str]): Enter the date in the YYYY-MM-DD format. Airbyte will replicate the data added on and after this date. If this field is blank, Airbyte will replicate all data.\n streams_criteria (Optional[List[SalesforceSource.FilterSalesforceObjectsEntry]]): Filter streams relevant to you\n """\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.streams_criteria = check.opt_nullable_list_param(\n streams_criteria, "streams_criteria", SalesforceSource.FilterSalesforceObjectsEntry\n )\n super().__init__("Salesforce", name)
\n\n\n
[docs]class PipedriveSource(GeneratedAirbyteSource):\n
[docs] class SignInViaPipedriveOAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKeyAuthentication:\n
[docs] @public\n def __init__(self, api_token: str):\n self.auth_type = "Token"\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n authorization: Union[\n "PipedriveSource.SignInViaPipedriveOAuth", "PipedriveSource.APIKeyAuthentication"\n ],\n replication_start_date: str,\n ):\n """Airbyte Source for Pipedrive.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pipedrive\n\n Args:\n name (str): The name of the destination.\n authorization (Union[PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication]): Choose one of the possible authorization method\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. When specified and not None, then stream will behave as incremental\n """\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication),\n )\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n super().__init__("Pipedrive", name)
\n\n\n
[docs]class FileSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class LocalFilesystemLimited:\n
[docs] @public\n def __init__(\n self,\n ):\n self.storage = "local"
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSource.HTTPSPublicWeb",\n "FileSource.GCSGoogleCloudStorage",\n "FileSource.S3AmazonWebServices",\n "FileSource.AzBlobAzureBlobStorage",\n "FileSource.SSHSecureShell",\n "FileSource.SCPSecureCopyProtocol",\n "FileSource.SFTPSecureFileTransferProtocol",\n "FileSource.LocalFilesystemLimited",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSource.HTTPSPublicWeb, FileSource.GCSGoogleCloudStorage, FileSource.S3AmazonWebServices, FileSource.AzBlobAzureBlobStorage, FileSource.SSHSecureShell, FileSource.SCPSecureCopyProtocol, FileSource.SFTPSecureFileTransferProtocol, FileSource.LocalFilesystemLimited]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSource.HTTPSPublicWeb,\n FileSource.GCSGoogleCloudStorage,\n FileSource.S3AmazonWebServices,\n FileSource.AzBlobAzureBlobStorage,\n FileSource.SSHSecureShell,\n 
FileSource.SCPSecureCopyProtocol,\n FileSource.SFTPSecureFileTransferProtocol,\n FileSource.LocalFilesystemLimited,\n ),\n )\n super().__init__("File", name)
\n\n\n
[docs]class GlassfrogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Glassfrog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/glassfrog\n\n Args:\n name (str): The name of the destination.\n api_key (str): API key provided by Glassfrog\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Glassfrog", name)
\n\n\n
[docs]class ChartmogulSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str, interval: str):\n """Airbyte Source for Chartmogul.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chartmogul\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chartmogul API key\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. When feasible, any data before this date will not be replicated.\n interval (str): Some APIs such as Metrics require intervals to cluster data.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.interval = check.str_param(interval, "interval")\n super().__init__("Chartmogul", name)
\n\n\n
[docs]class OrbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n start_date: Optional[str] = None,\n lookback_window_days: Optional[int] = None,\n string_event_properties_keys: Optional[List[str]] = None,\n numeric_event_properties_keys: Optional[List[str]] = None,\n ):\n """Airbyte Source for Orb.\n\n Documentation can be found at https://docs.withorb.com/\n\n Args:\n name (str): The name of the destination.\n api_key (str): Orb API Key, issued from the Orb admin console.\n start_date (Optional[str]): UTC date and time in the format 2022-03-01T00:00:00Z. Any data with created_at before this data will not be synced.\n lookback_window_days (Optional[int]): When set to N, the connector will always refresh resources created within the past N days. By default, updated objects that are not newly created are not incrementally synced.\n string_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n numeric_event_properties_keys (Optional[List[str]]): Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.string_event_properties_keys = check.opt_nullable_list_param(\n string_event_properties_keys, "string_event_properties_keys", str\n )\n self.numeric_event_properties_keys = check.opt_nullable_list_param(\n numeric_event_properties_keys, "numeric_event_properties_keys", str\n )\n super().__init__("Orb", name)
\n\n\n
[docs]class CockroachdbSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Cockroachdb.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/cockroachdb\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n username (str): Username to use to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt client/server communications for increased security.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n super().__init__("Cockroachdb", name)
\n\n\n
[docs]class ConfluenceSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str, domain_name: str, email: str):\n """Airbyte Source for Confluence.\n\n Args:\n name (str): The name of the destination.\n api_token (str): Please follow the Jira confluence for generating an API token: https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/\n domain_name (str): Your Confluence domain name\n email (str): Your Confluence login email\n """\n self.api_token = check.str_param(api_token, "api_token")\n self.domain_name = check.str_param(domain_name, "domain_name")\n self.email = check.str_param(email, "email")\n super().__init__("Confluence", name)
\n\n\n
[docs]class PlaidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n api_key: str,\n client_id: str,\n plaid_env: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Plaid.\n\n Documentation can be found at https://plaid.com/docs/api/\n\n Args:\n name (str): The name of the destination.\n access_token (str): The end-user's Link access token.\n api_key (str): The Plaid API key to use to hit the API.\n client_id (str): The Plaid client id\n plaid_env (str): The Plaid environment\n start_date (Optional[str]): The date from which you'd like to replicate data for Plaid in the format YYYY-MM-DD. All data generated after this date will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.api_key = check.str_param(api_key, "api_key")\n self.client_id = check.str_param(client_id, "client_id")\n self.plaid_env = check.str_param(plaid_env, "plaid_env")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Plaid", name)
\n\n\n
[docs]class SnapchatMarketingSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Snapchat Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snapchat-marketing\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Snapchat developer application.\n client_secret (str): The Client Secret of your Snapchat developer application.\n refresh_token (str): Refresh Token to renew the expired Access Token.\n start_date (Optional[str]): Date in the format 2022-01-01. Any data before this date will not be replicated.\n end_date (Optional[str]): Date in the format 2017-01-25. Any data after this date will not be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Snapchat Marketing", name)
\n\n\n
[docs]class MicrosoftTeamsSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaMicrosoftOAuth20:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateViaMicrosoft:\n
[docs] @public\n def __init__(\n self,\n tenant_id: str,\n client_id: str,\n client_secret: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.tenant_id = check.str_param(tenant_id, "tenant_id")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n period: str,\n credentials: Union[\n "MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20",\n "MicrosoftTeamsSource.AuthenticateViaMicrosoft",\n ],\n ):\n """Airbyte Source for Microsoft Teams.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/microsoft-teams\n\n Args:\n name (str): The name of the destination.\n period (str): Specifies the length of time over which the Team Device Report stream is aggregated. The supported values are: D7, D30, D90, and D180.\n credentials (Union[MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20, MicrosoftTeamsSource.AuthenticateViaMicrosoft]): Choose how to authenticate to Microsoft\n """\n self.period = check.str_param(period, "period")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20,\n MicrosoftTeamsSource.AuthenticateViaMicrosoft,\n ),\n )\n super().__init__("Microsoft Teams", name)
\n\n\n
[docs]class LeverHiringSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n auth_type: Optional[str] = None,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "LeverHiringSource.OAuthCredentials",\n start_date: str,\n environment: Optional[str] = None,\n ):\n """Airbyte Source for Lever Hiring.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lever-hiring\n\n Args:\n name (str): The name of the destination.\n credentials (LeverHiringSource.OAuthCredentials): Choose how to authenticate to Lever Hiring.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Note that it will be used only in the following incremental streams: comments, commits, and issues.\n environment (Optional[str]): The environment in which you'd like to replicate data for Lever. This is used to determine which Lever API endpoint to use.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", LeverHiringSource.OAuthCredentials\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.environment = check.opt_str_param(environment, "environment")\n super().__init__("Lever Hiring", name)
\n\n\n
[docs]class TwilioSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_sid: str,\n auth_token: str,\n start_date: str,\n lookback_window: Optional[int] = None,\n ):\n """Airbyte Source for Twilio.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/twilio\n\n Args:\n name (str): The name of the destination.\n account_sid (str): Twilio account SID\n auth_token (str): Twilio Auth Token.\n start_date (str): UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (Optional[int]): How far into the past to look for records. (in minutes)\n """\n self.account_sid = check.str_param(account_sid, "account_sid")\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.opt_int_param(lookback_window, "lookback_window")\n super().__init__("Twilio", name)
\n\n\n
[docs]class StripeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n client_secret: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n slice_range: Optional[int] = None,\n ):\n r"""Airbyte Source for Stripe.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/stripe\n\n Args:\n name (str): The name of the destination.\n account_id (str): Your Stripe account ID (starts with 'acct\\\\_', find yours here).\n client_secret (str): Stripe API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Only data generated after this date will be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always re-export data from the past N days, where N is the value set here. This is useful if your data is frequently updated after creation. More info here\n slice_range (Optional[int]): The time increment used by the connector when requesting data from the Stripe API. The bigger the value is, the less requests will be made and faster the sync will be. On the other hand, the more seldom the state is persisted.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n self.slice_range = check.opt_int_param(slice_range, "slice_range")\n super().__init__("Stripe", name)
\n\n\n
[docs]class Db2Source(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.encryption_method = "unencrypted"
\n\n
[docs] class TLSEncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, ssl_certificate: str, key_store_password: Optional[str] = None):\n self.encryption_method = "encrypted_verify_certificate"\n self.ssl_certificate = check.str_param(ssl_certificate, "ssl_certificate")\n self.key_store_password = check.opt_str_param(key_store_password, "key_store_password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n db: str,\n username: str,\n password: str,\n encryption: Union["Db2Source.Unencrypted", "Db2Source.TLSEncryptedVerifyCertificate"],\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Db2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/db2\n\n Args:\n name (str): The name of the destination.\n host (str): Host of the Db2.\n port (int): Port of the database.\n db (str): Name of the database.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n encryption (Union[Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate]): Encryption method to use when communicating with the database\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.db = check.str_param(db, "db")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.encryption = check.inst_param(\n encryption,\n "encryption",\n (Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate),\n )\n super().__init__("Db2", name)
\n\n\n
[docs]class SlackSource(GeneratedAirbyteSource):\n
[docs] class DefaultOAuth20Authorization:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: Optional[str] = None,\n ):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class APITokenCredentials:\n
[docs] @public\n def __init__(self, api_token: str):\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n lookback_window: int,\n join_channels: bool,\n credentials: Union[\n "SlackSource.DefaultOAuth20Authorization", "SlackSource.APITokenCredentials"\n ],\n channel_filter: Optional[List[str]] = None,\n ):\n """Airbyte Source for Slack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/slack\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window (int): How far into the past to look for messages in threads.\n join_channels (bool): Whether to join all channels or to sync data only from channels the bot is already in. If false, you'll need to manually add the bot to all the channels from which you'd like to sync messages.\n channel_filter (Optional[List[str]]): A channel name list (without leading '#' char) which limit the channels from which you'd like to sync. Empty list means no filter.\n credentials (Union[SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials]): Choose how to authenticate into Slack\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window = check.int_param(lookback_window, "lookback_window")\n self.join_channels = check.bool_param(join_channels, "join_channels")\n self.channel_filter = check.opt_nullable_list_param(channel_filter, "channel_filter", str)\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials),\n )\n super().__init__("Slack", name)
\n\n\n
[docs]class RechargeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, access_token: str):\n """Airbyte Source for Recharge.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recharge\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Recharge API, in the format YYYY-MM-DDT00:00:00Z. Any data before this date will not be replicated.\n access_token (str): The value of the Access Token generated. See the docs for more information.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.access_token = check.str_param(access_token, "access_token")\n super().__init__("Recharge", name)
\n\n\n
[docs]class OpenweatherSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n lat: str,\n lon: str,\n appid: str,\n units: Optional[str] = None,\n lang: Optional[str] = None,\n ):\n """Airbyte Source for Openweather.\n\n Args:\n name (str): The name of the destination.\n lat (str): Latitude for which you want to get weather condition from. (min -90, max 90)\n lon (str): Longitude for which you want to get weather condition from. (min -180, max 180)\n appid (str): Your OpenWeather API Key. See here. The key is case sensitive.\n units (Optional[str]): Units of measurement. standard, metric and imperial units are available. If you do not use the units parameter, standard units will be applied by default.\n lang (Optional[str]): You can use lang parameter to get the output in your language. The contents of the description field will be translated. See here for the list of supported languages.\n """\n self.lat = check.str_param(lat, "lat")\n self.lon = check.str_param(lon, "lon")\n self.appid = check.str_param(appid, "appid")\n self.units = check.opt_str_param(units, "units")\n self.lang = check.opt_str_param(lang, "lang")\n super().__init__("Openweather", name)
\n\n\n
[docs]class RetentlySource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaRetentlyOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithAPIToken:\n
[docs] @public\n def __init__(self, api_key: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union[\n "RetentlySource.AuthenticateViaRetentlyOAuth", "RetentlySource.AuthenticateWithAPIToken"\n ],\n ):\n """Airbyte Source for Retently.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken]): Choose how to authenticate to Retently\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (RetentlySource.AuthenticateViaRetentlyOAuth, RetentlySource.AuthenticateWithAPIToken),\n )\n super().__init__("Retently", name)
\n\n\n
[docs]class ScaffoldSourceHttpSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, TODO: str):\n """Airbyte Source for Scaffold Source Http.\n\n Args:\n name (str): The name of the destination.\n TODO (str): describe me\n """\n self.TODO = check.str_param(TODO, "TODO")\n super().__init__("Scaffold Source Http", name)
\n\n\n
[docs]class YandexMetricaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, auth_token: str, counter_id: str, start_date: str, end_date: str):\n """Airbyte Source for Yandex Metrica.\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Your Yandex Metrica API access token\n counter_id (str): Counter ID\n start_date (str): UTC date and time in the format YYYY-MM-DD.\n end_date (str): UTC date and time in the format YYYY-MM-DD.\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.counter_id = check.str_param(counter_id, "counter_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.str_param(end_date, "end_date")\n super().__init__("Yandex Metrica", name)
\n\n\n
[docs]class TalkdeskExploreSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n auth_url: str,\n api_key: str,\n timezone: Optional[str] = None,\n ):\n """Airbyte Source for Talkdesk Explore.\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Talkdesk Explore API, in the format YYYY-MM-DDT00:00:00. All data generated after this date will be replicated.\n timezone (Optional[str]): Timezone to use when generating reports. Only IANA timezones are supported (https://nodatime.org/TimeZones)\n auth_url (str): Talkdesk Auth URL. Only 'client_credentials' auth type supported at the moment.\n api_key (str): Talkdesk API key.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.timezone = check.opt_str_param(timezone, "timezone")\n self.auth_url = check.str_param(auth_url, "auth_url")\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Talkdesk Explore", name)
\n\n\n
[docs]class ChargifySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, domain: str):\n """Airbyte Source for Chargify.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/chargify\n\n Args:\n name (str): The name of the destination.\n api_key (str): Chargify API Key.\n domain (str): Chargify domain. Normally this domain follows the following format companyname.chargify.com\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.domain = check.str_param(domain, "domain")\n super().__init__("Chargify", name)
\n\n\n
[docs]class RkiCovidSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str):\n """Airbyte Source for Rki Covid.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/rki-covid\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n """\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Rki Covid", name)
\n\n\n
[docs]class PostgresSource(GeneratedAirbyteSource):\n
[docs] class Disable:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "disable"
\n\n
[docs] class Allow:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "allow"
\n\n
[docs] class Prefer:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "prefer"
\n\n
[docs] class Require:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "require"
\n\n
[docs] class VerifyCa:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyFull:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify-full"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "Standard"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n replication_slot: str,\n publication: str,\n plugin: Optional[str] = None,\n initial_waiting_seconds: Optional[int] = None,\n ):\n self.method = "CDC"\n self.plugin = check.opt_str_param(plugin, "plugin")\n self.replication_slot = check.str_param(replication_slot, "replication_slot")\n self.publication = check.str_param(publication, "publication")\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )
\n\n
[docs] class NoTunnel:\n
[docs] @public\n def __init__(\n self,\n ):\n self.tunnel_method = "NO_TUNNEL"
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, tunnel_host: str, tunnel_port: int, tunnel_user: str, ssh_key: str):\n self.tunnel_method = "SSH_KEY_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.ssh_key = check.str_param(ssh_key, "ssh_key")
\n\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(\n self, tunnel_host: str, tunnel_port: int, tunnel_user: str, tunnel_user_password: str\n ):\n self.tunnel_method = "SSH_PASSWORD_AUTH"\n self.tunnel_host = check.str_param(tunnel_host, "tunnel_host")\n self.tunnel_port = check.int_param(tunnel_port, "tunnel_port")\n self.tunnel_user = check.str_param(tunnel_user, "tunnel_user")\n self.tunnel_user_password = check.str_param(\n tunnel_user_password, "tunnel_user_password"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "PostgresSource.Disable",\n "PostgresSource.Allow",\n "PostgresSource.Prefer",\n "PostgresSource.Require",\n "PostgresSource.VerifyCa",\n "PostgresSource.VerifyFull",\n ],\n replication_method: Union[\n "PostgresSource.Standard", "PostgresSource.LogicalReplicationCDC"\n ],\n tunnel_method: Union[\n "PostgresSource.NoTunnel",\n "PostgresSource.SSHKeyAuthentication",\n "PostgresSource.PasswordAuthentication",\n ],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Postgres.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres\n\n Args:\n name (str): The name of the destination.\n host (str): Hostname of the database.\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas (case sensitive) to sync from. Defaults to public.\n username (str): Username to access the database.\n password (Optional[str]): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL. When activating SSL, please select one of the connection modes.\n ssl_mode (Union[PostgresSource.Disable, PostgresSource.Allow, PostgresSource.Prefer, PostgresSource.Require, PostgresSource.VerifyCa, PostgresSource.VerifyFull]): SSL connection modes. 
disable - Disables encryption of communication between Airbyte and source database allow - Enables encryption only when required by the source database prefer - allows unencrypted connection only if the source database does not support encryption require - Always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Always require encryption and verifies that the source database server has a valid SSL certificate verify-full - This is the most secure mode. Always require encryption and verifies the identity of the source database server Read more in the docs.\n replication_method (Union[PostgresSource.Standard, PostgresSource.LogicalReplicationCDC]): Replication method for extracting data from the database.\n tunnel_method (Union[PostgresSource.NoTunnel, PostgresSource.SSHKeyAuthentication, PostgresSource.PasswordAuthentication]): Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n PostgresSource.Disable,\n PostgresSource.Allow,\n PostgresSource.Prefer,\n PostgresSource.Require,\n PostgresSource.VerifyCa,\n PostgresSource.VerifyFull,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (PostgresSource.Standard, PostgresSource.LogicalReplicationCDC),\n )\n self.tunnel_method = check.inst_param(\n tunnel_method,\n "tunnel_method",\n (\n PostgresSource.NoTunnel,\n 
PostgresSource.SSHKeyAuthentication,\n PostgresSource.PasswordAuthentication,\n ),\n )\n super().__init__("Postgres", name)
\n\n\n
[docs]class TrelloSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n token: str,\n key: str,\n start_date: str,\n board_ids: Optional[List[str]] = None,\n ):\n """Airbyte Source for Trello.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/trello\n\n Args:\n name (str): The name of the destination.\n token (str): Trello v API token. See the docs for instructions on how to generate it.\n key (str): Trello API key. See the docs for instructions on how to generate it.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n board_ids (Optional[List[str]]): IDs of the boards to replicate data from. If left empty, data from all boards to which you have access will be replicated.\n """\n self.token = check.str_param(token, "token")\n self.key = check.str_param(key, "key")\n self.start_date = check.str_param(start_date, "start_date")\n self.board_ids = check.opt_nullable_list_param(board_ids, "board_ids", str)\n super().__init__("Trello", name)
\n\n\n
[docs]class PrestashopSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, url: str, access_key: str):\n """Airbyte Source for Prestashop.\n\n Args:\n name (str): The name of the destination.\n url (str): Shop URL without trailing slash (domain name or IP address)\n access_key (str): Your PrestaShop access key. See the docs for info on how to obtain this.\n """\n self.url = check.str_param(url, "url")\n self.access_key = check.str_param(access_key, "access_key")\n super().__init__("Prestashop", name)
\n\n\n
[docs]class PaystackSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n secret_key: str,\n start_date: str,\n lookback_window_days: Optional[int] = None,\n ):\n r"""Airbyte Source for Paystack.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paystack\n\n Args:\n name (str): The name of the destination.\n secret_key (str): The Paystack API key (usually starts with 'sk_live\\\\_'; find yours here).\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n lookback_window_days (Optional[int]): When set, the connector will always reload data from the past N days, where N is the value set here. This is useful if your data is updated after creation.\n """\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.lookback_window_days = check.opt_int_param(\n lookback_window_days, "lookback_window_days"\n )\n super().__init__("Paystack", name)
\n\n\n
[docs]class S3Source(GeneratedAirbyteSource):\n
[docs] class CSV:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n delimiter: Optional[str] = None,\n infer_datatypes: Optional[bool] = None,\n quote_char: Optional[str] = None,\n escape_char: Optional[str] = None,\n encoding: Optional[str] = None,\n double_quote: Optional[bool] = None,\n newlines_in_values: Optional[bool] = None,\n additional_reader_options: Optional[str] = None,\n advanced_options: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.delimiter = check.opt_str_param(delimiter, "delimiter")\n self.infer_datatypes = check.opt_bool_param(infer_datatypes, "infer_datatypes")\n self.quote_char = check.opt_str_param(quote_char, "quote_char")\n self.escape_char = check.opt_str_param(escape_char, "escape_char")\n self.encoding = check.opt_str_param(encoding, "encoding")\n self.double_quote = check.opt_bool_param(double_quote, "double_quote")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.additional_reader_options = check.opt_str_param(\n additional_reader_options, "additional_reader_options"\n )\n self.advanced_options = check.opt_str_param(advanced_options, "advanced_options")\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class Parquet:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n columns: Optional[List[str]] = None,\n batch_size: Optional[int] = None,\n buffer_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.columns = check.opt_nullable_list_param(columns, "columns", str)\n self.batch_size = check.opt_int_param(batch_size, "batch_size")\n self.buffer_size = check.opt_int_param(buffer_size, "buffer_size")
\n\n
[docs] class Avro:\n
[docs] @public\n def __init__(self, filetype: Optional[str] = None):\n self.filetype = check.opt_str_param(filetype, "filetype")
\n\n
[docs] class Jsonl:\n
[docs] @public\n def __init__(\n self,\n filetype: Optional[str] = None,\n newlines_in_values: Optional[bool] = None,\n unexpected_field_behavior: Optional[str] = None,\n block_size: Optional[int] = None,\n ):\n self.filetype = check.opt_str_param(filetype, "filetype")\n self.newlines_in_values = check.opt_bool_param(newlines_in_values, "newlines_in_values")\n self.unexpected_field_behavior = check.opt_str_param(\n unexpected_field_behavior, "unexpected_field_behavior"\n )\n self.block_size = check.opt_int_param(block_size, "block_size")
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n bucket: str,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n path_prefix: Optional[str] = None,\n endpoint: Optional[str] = None,\n ):\n self.bucket = check.str_param(bucket, "bucket")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n self.path_prefix = check.opt_str_param(path_prefix, "path_prefix")\n self.endpoint = check.opt_str_param(endpoint, "endpoint")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset: str,\n path_pattern: str,\n format: Union["S3Source.CSV", "S3Source.Parquet", "S3Source.Avro", "S3Source.Jsonl"],\n provider: "S3Source.S3AmazonWebServices",\n schema: Optional[str] = None,\n ):\n """Airbyte Source for S3.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/s3\n\n Args:\n name (str): The name of the destination.\n dataset (str): The name of the stream you would like this source to output. Can contain letters, numbers, or underscores.\n path_pattern (str): A regular expression which tells the connector which files to replicate. All files which match this pattern will be replicated. Use | to separate multiple patterns. See this page to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern ** to pick up all files.\n format (Union[S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl]): The format of the files you'd like to replicate\n schema (Optional[str]): Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of { "column" : "type" }, where types are valid JSON Schema datatypes. Leave as {} to auto-infer the schema.\n provider (S3Source.S3AmazonWebServices): Use this to load files from S3 or S3-compatible services\n """\n self.dataset = check.str_param(dataset, "dataset")\n self.path_pattern = check.str_param(path_pattern, "path_pattern")\n self.format = check.inst_param(\n format, "format", (S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl)\n )\n self.schema = check.opt_str_param(schema, "schema")\n self.provider = check.inst_param(provider, "provider", S3Source.S3AmazonWebServices)\n super().__init__("S3", name)
\n\n\n
[docs]class SnowflakeSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n self.auth_type = "OAuth"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")
\n\n
[docs] class UsernameAndPassword:\n
[docs] @public\n def __init__(self, username: str, password: str):\n self.auth_type = "username/password"\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["SnowflakeSource.OAuth20", "SnowflakeSource.UsernameAndPassword"],\n host: str,\n role: str,\n warehouse: str,\n database: str,\n schema: str,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Snowflake.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/snowflake\n\n Args:\n name (str): The name of the destination.\n host (str): The host domain of the snowflake instance (must include the account, region, cloud environment, and end with snowflakecomputing.com).\n role (str): The role you created for Airbyte to access Snowflake.\n warehouse (str): The warehouse you created for Airbyte to access data.\n database (str): The database you created for Airbyte to access data.\n schema (str): The source Snowflake schema tables.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SnowflakeSource.OAuth20, SnowflakeSource.UsernameAndPassword),\n )\n self.host = check.str_param(host, "host")\n self.role = check.str_param(role, "role")\n self.warehouse = check.str_param(warehouse, "warehouse")\n self.database = check.str_param(database, "database")\n self.schema = check.str_param(schema, "schema")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Snowflake", name)
\n\n\n
[docs]class AmplitudeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, secret_key: str, start_date: str):\n """Airbyte Source for Amplitude.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/amplitude\n\n Args:\n name (str): The name of the destination.\n api_key (str): Amplitude API Key. See the setup guide for more information on how to obtain this key.\n secret_key (str): Amplitude Secret Key. See the setup guide for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.secret_key = check.str_param(secret_key, "secret_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Amplitude", name)
\n\n\n
[docs]class PosthogSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, api_key: str, base_url: Optional[str] = None):\n """Airbyte Source for Posthog.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/posthog\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate the data. Any data before this date will not be replicated.\n api_key (str): API Key. See the docs for information on how to generate this key.\n base_url (Optional[str]): Base PostHog url. Defaults to PostHog Cloud (https://app.posthog.com).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.api_key = check.str_param(api_key, "api_key")\n self.base_url = check.opt_str_param(base_url, "base_url")\n super().__init__("Posthog", name)
\n\n\n
[docs]class PaypalTransactionSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n is_sandbox: bool,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n refresh_token: Optional[str] = None,\n ):\n """Airbyte Source for Paypal Transaction.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/paypal-transactions\n\n Args:\n name (str): The name of the destination.\n client_id (Optional[str]): The Client ID of your Paypal developer application.\n client_secret (Optional[str]): The Client Secret of your Paypal developer application.\n refresh_token (Optional[str]): The key to refresh the expired access token.\n start_date (str): Start Date for data extraction in ISO format. Date must be in range from 3 years till 12 hrs before present time.\n is_sandbox (bool): Determines whether to use the sandbox or production environment.\n """\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.opt_str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.is_sandbox = check.bool_param(is_sandbox, "is_sandbox")\n super().__init__("Paypal Transaction", name)
\n\n\n
[docs]class MssqlSource(GeneratedAirbyteSource):\n
[docs] class Unencrypted:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "unencrypted"
\n\n
[docs] class EncryptedTrustServerCertificate:\n
[docs] @public\n def __init__(\n self,\n ):\n self.ssl_method = "encrypted_trust_server_certificate"
\n\n
[docs] class EncryptedVerifyCertificate:\n
[docs] @public\n def __init__(self, hostNameInCertificate: Optional[str] = None):\n self.ssl_method = "encrypted_verify_certificate"\n self.hostNameInCertificate = check.opt_str_param(\n hostNameInCertificate, "hostNameInCertificate"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self, data_to_sync: Optional[str] = None, snapshot_isolation: Optional[str] = None\n ):\n self.method = "CDC"\n self.data_to_sync = check.opt_str_param(data_to_sync, "data_to_sync")\n self.snapshot_isolation = check.opt_str_param(snapshot_isolation, "snapshot_isolation")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_method: Union[\n "MssqlSource.Unencrypted",\n "MssqlSource.EncryptedTrustServerCertificate",\n "MssqlSource.EncryptedVerifyCertificate",\n ],\n replication_method: Union["MssqlSource.Standard", "MssqlSource.LogicalReplicationCDC"],\n schemas: Optional[List[str]] = None,\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Mssql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql\n\n Args:\n name (str): The name of the destination.\n host (str): The hostname of the database.\n port (int): The port of the database.\n database (str): The name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Defaults to user. Case sensitive.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n ssl_method (Union[MssqlSource.Unencrypted, MssqlSource.EncryptedTrustServerCertificate, MssqlSource.EncryptedVerifyCertificate]): The encryption method which is used when communicating with the database.\n replication_method (Union[MssqlSource.Standard, MssqlSource.LogicalReplicationCDC]): The replication method used for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses {TBC} to detect inserts, updates, and deletes. 
This needs to be configured on the source database itself.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl_method = check.inst_param(\n ssl_method,\n "ssl_method",\n (\n MssqlSource.Unencrypted,\n MssqlSource.EncryptedTrustServerCertificate,\n MssqlSource.EncryptedVerifyCertificate,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MssqlSource.Standard, MssqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mssql", name)
\n\n\n
[docs]class ZohoCrmSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n dc_region: str,\n environment: str,\n edition: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Zoho Crm.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zoho-crm\n\n Args:\n name (str): The name of the destination.\n client_id (str): OAuth2.0 Client ID\n client_secret (str): OAuth2.0 Client Secret\n refresh_token (str): OAuth2.0 Refresh Token\n dc_region (str): Please choose the region of your Data Center location. More info by this Link\n environment (str): Please choose the environment\n start_datetime (Optional[str]): ISO 8601, for instance: `YYYY-MM-DD`, `YYYY-MM-DD HH:MM:SS+HH:MM`\n edition (str): Choose your Edition of Zoho CRM to determine API Concurrency Limits\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.dc_region = check.str_param(dc_region, "dc_region")\n self.environment = check.str_param(environment, "environment")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n self.edition = check.str_param(edition, "edition")\n super().__init__("Zoho Crm", name)
\n\n\n
[docs]class RedshiftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n password: str,\n schemas: Optional[List[str]] = None,\n jdbc_url_params: Optional[str] = None,\n ):\n """Airbyte Source for Redshift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift\n\n Args:\n name (str): The name of the destination.\n host (str): Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com).\n port (int): Port of the database.\n database (str): Name of the database.\n schemas (Optional[List[str]]): The list of schemas to sync from. Specify one or more explicitly or keep empty to process all schemas. Schema names are case sensitive.\n username (str): Username to use to access the database.\n password (str): Password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3).\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.schemas = check.opt_nullable_list_param(schemas, "schemas", str)\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n super().__init__("Redshift", name)
\n\n\n
[docs]class AsanaSource(GeneratedAirbyteSource):\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["AsanaSource.PATCredentials", "AsanaSource.OAuthCredentials"],\n ):\n """Airbyte Source for Asana.\n\n Args:\n name (str): The name of the destination.\n credentials (Union[AsanaSource.PATCredentials, AsanaSource.OAuthCredentials]): Choose how to authenticate to Github\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (AsanaSource.PATCredentials, AsanaSource.OAuthCredentials)\n )\n super().__init__("Asana", name)
\n\n\n
[docs]class SmartsheetsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n spreadsheet_id: str,\n start_datetime: Optional[str] = None,\n ):\n """Airbyte Source for Smartsheets.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/smartsheets\n\n Args:\n name (str): The name of the destination.\n access_token (str): The access token to use for accessing your data from Smartsheets. This access token must be generated by a user with at least read access to the data you'd like to replicate. Generate an access token in the Smartsheets main menu by clicking Account > Apps & Integrations > API Access. See the setup guide for information on how to obtain this token.\n spreadsheet_id (str): The spreadsheet ID. Find it by opening the spreadsheet then navigating to File > Properties\n start_datetime (Optional[str]): Only rows modified after this date/time will be replicated. This should be an ISO 8601 string, for instance: `2000-01-01T13:00:00`\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.spreadsheet_id = check.str_param(spreadsheet_id, "spreadsheet_id")\n self.start_datetime = check.opt_str_param(start_datetime, "start_datetime")\n super().__init__("Smartsheets", name)
\n\n\n
[docs]class MailchimpSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_type = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, apikey: str):\n self.auth_type = "apikey"\n self.apikey = check.str_param(apikey, "apikey")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["MailchimpSource.OAuth20", "MailchimpSource.APIKey"]\n ):\n """Airbyte Source for Mailchimp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailchimp\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (MailchimpSource.OAuth20, MailchimpSource.APIKey)\n )\n super().__init__("Mailchimp", name)
\n\n\n
[docs]class SentrySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n auth_token: str,\n organization: str,\n project: str,\n hostname: Optional[str] = None,\n discover_fields: Optional[List[str]] = None,\n ):\n """Airbyte Source for Sentry.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/sentry\n\n Args:\n name (str): The name of the destination.\n auth_token (str): Log into Sentry and then create authentication tokens.For self-hosted, you can find or create authentication tokens by visiting "{instance_url_prefix}/settings/account/api/auth-tokens/"\n hostname (Optional[str]): Host name of Sentry API server.For self-hosted, specify your host name here. Otherwise, leave it empty.\n organization (str): The slug of the organization the groups belong to.\n project (str): The name (slug) of the Project you want to sync.\n discover_fields (Optional[List[str]]): Fields to retrieve when fetching discover events\n """\n self.auth_token = check.str_param(auth_token, "auth_token")\n self.hostname = check.opt_str_param(hostname, "hostname")\n self.organization = check.str_param(organization, "organization")\n self.project = check.str_param(project, "project")\n self.discover_fields = check.opt_nullable_list_param(\n discover_fields, "discover_fields", str\n )\n super().__init__("Sentry", name)
\n\n\n
[docs]class MailgunSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n private_key: str,\n domain_region: Optional[str] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Mailgun.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mailgun\n\n Args:\n name (str): The name of the destination.\n private_key (str): Primary account API key to access your Mailgun data.\n domain_region (Optional[str]): Domain region code. 'EU' or 'US' are possible values. The default is 'US'.\n start_date (Optional[str]): UTC date and time in the format 2020-10-01 00:00:00. Any data before this date will not be replicated. If omitted, defaults to 3 days ago.\n """\n self.private_key = check.str_param(private_key, "private_key")\n self.domain_region = check.opt_str_param(domain_region, "domain_region")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Mailgun", name)
\n\n\n
[docs]class OnesignalSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, user_auth_key: str, start_date: str, outcome_names: str):\n """Airbyte Source for Onesignal.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/onesignal\n\n Args:\n name (str): The name of the destination.\n user_auth_key (str): OneSignal User Auth Key, see the docs for more information on how to obtain this key.\n start_date (str): The date from which you'd like to replicate data for OneSignal API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n outcome_names (str): Comma-separated list of names and the value (sum/count) for the returned outcome data. See the docs for more details\n """\n self.user_auth_key = check.str_param(user_auth_key, "user_auth_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.outcome_names = check.str_param(outcome_names, "outcome_names")\n super().__init__("Onesignal", name)
\n\n\n
[docs]class PythonHttpTutorialSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, start_date: str, base: str, access_key: Optional[str] = None):\n """Airbyte Source for Python Http Tutorial.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi\n\n Args:\n name (str): The name of the destination.\n access_key (Optional[str]): API access key used to retrieve data from the Exchange Rates API.\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.\n base (str): ISO reference currency. See here.\n """\n self.access_key = check.opt_str_param(access_key, "access_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.base = check.str_param(base, "base")\n super().__init__("Python Http Tutorial", name)
\n\n\n
[docs]class AirtableSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, base_id: str, tables: List[str]):\n """Airbyte Source for Airtable.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/airtable\n\n Args:\n name (str): The name of the destination.\n api_key (str): The API Key for the Airtable account. See the Support Guide for more information on how to obtain this key.\n base_id (str): The Base ID to integrate the data from. You can find the Base ID following the link Airtable API, log in to your account, select the base you need and find Base ID in the docs.\n tables (List[str]): The list of Tables to integrate.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.base_id = check.str_param(base_id, "base_id")\n self.tables = check.list_param(tables, "tables", str)\n super().__init__("Airtable", name)
\n\n\n
[docs]class MongodbV2Source(GeneratedAirbyteSource):\n
[docs] class StandaloneMongoDbInstance:\n
[docs] @public\n def __init__(self, instance: str, host: str, port: int, tls: Optional[bool] = None):\n self.instance = check.str_param(instance, "instance")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.tls = check.opt_bool_param(tls, "tls")
\n\n
[docs] class ReplicaSet:\n
[docs] @public\n def __init__(self, instance: str, server_addresses: str, replica_set: Optional[str] = None):\n self.instance = check.str_param(instance, "instance")\n self.server_addresses = check.str_param(server_addresses, "server_addresses")\n self.replica_set = check.opt_str_param(replica_set, "replica_set")
\n\n
[docs] class MongoDBAtlas:\n
[docs] @public\n def __init__(self, instance: str, cluster_url: str):\n self.instance = check.str_param(instance, "instance")\n self.cluster_url = check.str_param(cluster_url, "cluster_url")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_type: Union[\n "MongodbV2Source.StandaloneMongoDbInstance",\n "MongodbV2Source.ReplicaSet",\n "MongodbV2Source.MongoDBAtlas",\n ],\n database: str,\n user: Optional[str] = None,\n password: Optional[str] = None,\n auth_source: Optional[str] = None,\n ):\n """Airbyte Source for Mongodb V2.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb-v2\n\n Args:\n name (str): The name of the destination.\n instance_type (Union[MongodbV2Source.StandaloneMongoDbInstance, MongodbV2Source.ReplicaSet, MongodbV2Source.MongoDBAtlas]): The MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.\n database (str): The database you want to replicate.\n user (Optional[str]): The username which is used to access the database.\n password (Optional[str]): The password associated with this username.\n auth_source (Optional[str]): The authentication source where the user information is stored.\n """\n self.instance_type = check.inst_param(\n instance_type,\n "instance_type",\n (\n MongodbV2Source.StandaloneMongoDbInstance,\n MongodbV2Source.ReplicaSet,\n MongodbV2Source.MongoDBAtlas,\n ),\n )\n self.database = check.str_param(database, "database")\n self.user = check.opt_str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.auth_source = check.opt_str_param(auth_source, "auth_source")\n super().__init__("Mongodb V2", name)
\n\n\n
[docs]class FileSecureSource(GeneratedAirbyteSource):\n
[docs] class HTTPSPublicWeb:\n
[docs] @public\n def __init__(self, user_agent: Optional[bool] = None):\n self.storage = "HTTPS"\n self.user_agent = check.opt_bool_param(user_agent, "user_agent")
\n\n
[docs] class GCSGoogleCloudStorage:\n
[docs] @public\n def __init__(self, service_account_json: Optional[str] = None):\n self.storage = "GCS"\n self.service_account_json = check.opt_str_param(\n service_account_json, "service_account_json"\n )
\n\n
[docs] class S3AmazonWebServices:\n
[docs] @public\n def __init__(\n self,\n aws_access_key_id: Optional[str] = None,\n aws_secret_access_key: Optional[str] = None,\n ):\n self.storage = "S3"\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )
\n\n
[docs] class AzBlobAzureBlobStorage:\n
[docs] @public\n def __init__(\n self,\n storage_account: str,\n sas_token: Optional[str] = None,\n shared_key: Optional[str] = None,\n ):\n self.storage = "AzBlob"\n self.storage_account = check.str_param(storage_account, "storage_account")\n self.sas_token = check.opt_str_param(sas_token, "sas_token")\n self.shared_key = check.opt_str_param(shared_key, "shared_key")
\n\n
[docs] class SSHSecureShell:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SSH"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SCPSecureCopyProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SCP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] class SFTPSecureFileTransferProtocol:\n
[docs] @public\n def __init__(\n self, user: str, host: str, password: Optional[str] = None, port: Optional[str] = None\n ):\n self.storage = "SFTP"\n self.user = check.str_param(user, "user")\n self.password = check.opt_str_param(password, "password")\n self.host = check.str_param(host, "host")\n self.port = check.opt_str_param(port, "port")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n dataset_name: str,\n format: str,\n url: str,\n provider: Union[\n "FileSecureSource.HTTPSPublicWeb",\n "FileSecureSource.GCSGoogleCloudStorage",\n "FileSecureSource.S3AmazonWebServices",\n "FileSecureSource.AzBlobAzureBlobStorage",\n "FileSecureSource.SSHSecureShell",\n "FileSecureSource.SCPSecureCopyProtocol",\n "FileSecureSource.SFTPSecureFileTransferProtocol",\n ],\n reader_options: Optional[str] = None,\n ):\n """Airbyte Source for File Secure.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/file\n\n Args:\n name (str): The name of the destination.\n dataset_name (str): The Name of the final table to replicate this file into (should include letters, numbers dash and underscores only).\n format (str): The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs).\n reader_options (Optional[str]): This should be a string in JSON format. It depends on the chosen file format to provide additional options and tune its behavior.\n url (str): The URL path to access the file which should be replicated.\n provider (Union[FileSecureSource.HTTPSPublicWeb, FileSecureSource.GCSGoogleCloudStorage, FileSecureSource.S3AmazonWebServices, FileSecureSource.AzBlobAzureBlobStorage, FileSecureSource.SSHSecureShell, FileSecureSource.SCPSecureCopyProtocol, FileSecureSource.SFTPSecureFileTransferProtocol]): The storage Provider or Location of the file(s) which should be replicated.\n """\n self.dataset_name = check.str_param(dataset_name, "dataset_name")\n self.format = check.str_param(format, "format")\n self.reader_options = check.opt_str_param(reader_options, "reader_options")\n self.url = check.str_param(url, "url")\n self.provider = check.inst_param(\n provider,\n "provider",\n (\n FileSecureSource.HTTPSPublicWeb,\n FileSecureSource.GCSGoogleCloudStorage,\n FileSecureSource.S3AmazonWebServices,\n FileSecureSource.AzBlobAzureBlobStorage,\n 
FileSecureSource.SSHSecureShell,\n FileSecureSource.SCPSecureCopyProtocol,\n FileSecureSource.SFTPSecureFileTransferProtocol,\n ),\n )\n super().__init__("File Secure", name)
\n\n\n
[docs]class ZendeskSupportSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n subdomain: str,\n credentials: Union["ZendeskSupportSource.OAuth20", "ZendeskSupportSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Support.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-support\n\n Args:\n name (str): The name of the destination.\n start_date (str): The date from which you'd like to replicate data for Zendesk Support API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken),\n )\n super().__init__("Zendesk Support", name)
\n\n\n
[docs]class TempoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_token: str):\n """Airbyte Source for Tempo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/\n\n Args:\n name (str): The name of the destination.\n api_token (str): Tempo API Token. Go to Tempo>Settings, scroll down to Data Access and select API integration.\n """\n self.api_token = check.str_param(api_token, "api_token")\n super().__init__("Tempo", name)
\n\n\n
[docs]class BraintreeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n merchant_id: str,\n public_key: str,\n private_key: str,\n environment: str,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Braintree.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/braintree\n\n Args:\n name (str): The name of the destination.\n merchant_id (str): The unique identifier for your entire gateway account. See the docs for more information on how to obtain this ID.\n public_key (str): Braintree Public Key. See the docs for more information on how to obtain this key.\n private_key (str): Braintree Private Key. See the docs for more information on how to obtain this key.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n environment (str): Environment specifies where the data will come from.\n """\n self.merchant_id = check.str_param(merchant_id, "merchant_id")\n self.public_key = check.str_param(public_key, "public_key")\n self.private_key = check.str_param(private_key, "private_key")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.environment = check.str_param(environment, "environment")\n super().__init__("Braintree", name)
\n\n\n
[docs]class SalesloftSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, client_id: str, client_secret: str, refresh_token: str, start_date: str\n ):\n """Airbyte Source for Salesloft.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/salesloft\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Salesloft developer application.\n client_secret (str): The Client Secret of your Salesloft developer application.\n refresh_token (str): The token for obtaining a new access token.\n start_date (str): The date from which you'd like to replicate data for Salesloft API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Salesloft", name)
\n\n\n
[docs]class LinnworksSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, application_id: str, application_secret: str, token: str, start_date: str\n ):\n """Airbyte Source for Linnworks.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/linnworks\n\n Args:\n name (str): The name of the destination.\n application_id (str): Linnworks Application ID\n application_secret (str): Linnworks Application Secret\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.application_id = check.str_param(application_id, "application_id")\n self.application_secret = check.str_param(application_secret, "application_secret")\n self.token = check.str_param(token, "token")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Linnworks", name)
\n\n\n
[docs]class ChargebeeSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, site: str, site_api_key: str, start_date: str, product_catalog: str\n ):\n """Airbyte Source for Chargebee.\n\n Documentation can be found at https://apidocs.chargebee.com/docs/api\n\n Args:\n name (str): The name of the destination.\n site (str): The site prefix for your Chargebee instance.\n site_api_key (str): Chargebee API Key. See the docs for more information on how to obtain this key.\n start_date (str): UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.\n product_catalog (str): Product Catalog version of your Chargebee site. Instructions on how to find your version you may find here under `API Version` section.\n """\n self.site = check.str_param(site, "site")\n self.site_api_key = check.str_param(site_api_key, "site_api_key")\n self.start_date = check.str_param(start_date, "start_date")\n self.product_catalog = check.str_param(product_catalog, "product_catalog")\n super().__init__("Chargebee", name)
\n\n\n
[docs]class GoogleAnalyticsDataApiSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaGoogleOauth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n access_token: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.access_token = check.opt_str_param(access_token, "access_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, credentials_json: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.credentials_json = check.str_param(credentials_json, "credentials_json")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n property_id: str,\n credentials: Union[\n "GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth",\n "GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication",\n ],\n date_ranges_start_date: str,\n custom_reports: Optional[str] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Google Analytics Data Api.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-v4\n\n Args:\n name (str): The name of the destination.\n property_id (str): A Google Analytics GA4 property identifier whose events are tracked. Specified in the URL path and not the body\n credentials (Union[GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth, GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication]): Credentials for the service\n date_ranges_start_date (str): The start date. One of the values Ndaysago, yesterday, today or in the format YYYY-MM-DD\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.\n window_in_days (Optional[int]): The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. 
The minimum allowed value for this field is 1, and the maximum is 364.\n """\n self.property_id = check.str_param(property_id, "property_id")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth,\n GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.date_ranges_start_date = check.str_param(\n date_ranges_start_date, "date_ranges_start_date"\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Google Analytics Data Api", name)
\n\n\n
[docs]class OutreachSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n redirect_uri: str,\n start_date: str,\n ):\n """Airbyte Source for Outreach.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/outreach\n\n Args:\n name (str): The name of the destination.\n client_id (str): The Client ID of your Outreach developer application.\n client_secret (str): The Client Secret of your Outreach developer application.\n refresh_token (str): The token for obtaining the new access token.\n redirect_uri (str): A Redirect URI is the location where the authorization server sends the user once the app has been successfully authorized and granted an authorization code or access token.\n start_date (str): The date from which you'd like to replicate data for Outreach API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.redirect_uri = check.str_param(redirect_uri, "redirect_uri")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Outreach", name)
\n\n\n
[docs]class LemlistSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Lemlist.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/lemlist\n\n Args:\n name (str): The name of the destination.\n api_key (str): Lemlist API key.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Lemlist", name)
\n\n\n
[docs]class ApifyDatasetSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, datasetId: str, clean: Optional[bool] = None):\n """Airbyte Source for Apify Dataset.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/apify-dataset\n\n Args:\n name (str): The name of the destination.\n datasetId (str): ID of the dataset you would like to load to Airbyte.\n clean (Optional[bool]): If set to true, only clean items will be downloaded from the dataset. See description of what clean means in Apify API docs. If not sure, set clean to false.\n """\n self.datasetId = check.str_param(datasetId, "datasetId")\n self.clean = check.opt_bool_param(clean, "clean")\n super().__init__("Apify Dataset", name)
\n\n\n
[docs]class RecurlySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n api_key: str,\n begin_time: Optional[str] = None,\n end_time: Optional[str] = None,\n ):\n """Airbyte Source for Recurly.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/recurly\n\n Args:\n name (str): The name of the destination.\n api_key (str): Recurly API Key. See the docs for more information on how to generate this key.\n begin_time (Optional[str]): ISO8601 timestamp from which the replication from Recurly API will start from.\n end_time (Optional[str]): ISO8601 timestamp to which the replication from Recurly API will stop. Records after that date won't be imported.\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.begin_time = check.opt_str_param(begin_time, "begin_time")\n self.end_time = check.opt_str_param(end_time, "end_time")\n super().__init__("Recurly", name)
\n\n\n
[docs]class ZendeskTalkSource(GeneratedAirbyteSource):\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, email: str, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.email = check.str_param(email, "email")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, access_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n credentials: Union["ZendeskTalkSource.APIToken", "ZendeskTalkSource.OAuth20"],\n start_date: str,\n ):\n """Airbyte Source for Zendesk Talk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-talk\n\n Args:\n name (str): The name of the destination.\n subdomain (str): This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.\n credentials (Union[ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20]): Zendesk service provides two authentication methods. Choose between: `OAuth2.0` or `API token`.\n start_date (str): The date from which you'd like to replicate data for Zendesk Talk API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.credentials = check.inst_param(\n credentials, "credentials", (ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20)\n )\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Zendesk Talk", name)
\n\n\n
[docs]class SftpSource(GeneratedAirbyteSource):\n
[docs] class PasswordAuthentication:\n
[docs] @public\n def __init__(self, auth_user_password: str):\n self.auth_method = "SSH_PASSWORD_AUTH"\n self.auth_user_password = check.str_param(auth_user_password, "auth_user_password")
\n\n
[docs] class SSHKeyAuthentication:\n
[docs] @public\n def __init__(self, auth_ssh_key: str):\n self.auth_method = "SSH_KEY_AUTH"\n self.auth_ssh_key = check.str_param(auth_ssh_key, "auth_ssh_key")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n user: str,\n host: str,\n port: int,\n credentials: Union["SftpSource.PasswordAuthentication", "SftpSource.SSHKeyAuthentication"],\n file_types: Optional[str] = None,\n folder_path: Optional[str] = None,\n file_pattern: Optional[str] = None,\n ):\n """Airbyte Source for Sftp.\n\n Documentation can be found at https://docs.airbyte.com/integrations/source/sftp\n\n Args:\n name (str): The name of the destination.\n user (str): The server user\n host (str): The server host address\n port (int): The server port\n credentials (Union[SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication]): The server authentication method\n file_types (Optional[str]): Coma separated file types. Currently only 'csv' and 'json' types are supported.\n folder_path (Optional[str]): The directory to search files for sync\n file_pattern (Optional[str]): The regular expression to specify files for sync in a chosen Folder Path\n """\n self.user = check.str_param(user, "user")\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication),\n )\n self.file_types = check.opt_str_param(file_types, "file_types")\n self.folder_path = check.opt_str_param(folder_path, "folder_path")\n self.file_pattern = check.opt_str_param(file_pattern, "file_pattern")\n super().__init__("Sftp", name)
\n\n\n
[docs]class WhiskyHunterSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n ):\n """Airbyte Source for Whisky Hunter.\n\n Documentation can be found at https://docs.airbyte.io/integrations/sources/whisky-hunter\n\n Args:\n name (str): The name of the destination.\n\n """\n super().__init__("Whisky Hunter", name)
\n\n\n
[docs]class FreshdeskSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n api_key: str,\n requests_per_minute: Optional[int] = None,\n start_date: Optional[str] = None,\n ):\n """Airbyte Source for Freshdesk.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/freshdesk\n\n Args:\n name (str): The name of the destination.\n domain (str): Freshdesk domain\n api_key (str): Freshdesk API Key. See the docs for more information on how to obtain this key.\n requests_per_minute (Optional[int]): The number of requests per minute that this source allowed to use. There is a rate limit of 50 requests per minute per app per account.\n start_date (Optional[str]): UTC date and time. Any data created after this date will be replicated. If this parameter is not set, all data will be replicated.\n """\n self.domain = check.str_param(domain, "domain")\n self.api_key = check.str_param(api_key, "api_key")\n self.requests_per_minute = check.opt_int_param(requests_per_minute, "requests_per_minute")\n self.start_date = check.opt_str_param(start_date, "start_date")\n super().__init__("Freshdesk", name)
\n\n\n
[docs]class GocardlessSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n access_token: str,\n gocardless_environment: str,\n gocardless_version: str,\n start_date: str,\n ):\n """Airbyte Source for Gocardless.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/gocardless\n\n Args:\n name (str): The name of the destination.\n access_token (str): Gocardless API TOKEN\n gocardless_environment (str): Environment you are trying to connect to.\n gocardless_version (str): GoCardless version. This is a date. You can find the latest here: https://developer.gocardless.com/api-reference/#api-usage-making-requests\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.gocardless_environment = check.str_param(\n gocardless_environment, "gocardless_environment"\n )\n self.gocardless_version = check.str_param(gocardless_version, "gocardless_version")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Gocardless", name)
\n\n\n
[docs]class ZuoraSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n tenant_endpoint: str,\n data_query: str,\n client_id: str,\n client_secret: str,\n window_in_days: Optional[str] = None,\n ):\n """Airbyte Source for Zuora.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zuora\n\n Args:\n name (str): The name of the destination.\n start_date (str): Start Date in format: YYYY-MM-DD\n window_in_days (Optional[str]): The amount of days for each data-chunk begining from start_date. Bigger the value - faster the fetch. (0.1 - as for couple of hours, 1 - as for a Day; 364 - as for a Year).\n tenant_endpoint (str): Please choose the right endpoint where your Tenant is located. More info by this Link\n data_query (str): Choose between `Live`, or `Unlimited` - the optimized, replicated database at 12 hours freshness for high volume extraction Link\n client_id (str): Your OAuth user Client ID\n client_secret (str): Your OAuth user Client Secret\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.window_in_days = check.opt_str_param(window_in_days, "window_in_days")\n self.tenant_endpoint = check.str_param(tenant_endpoint, "tenant_endpoint")\n self.data_query = check.str_param(data_query, "data_query")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n super().__init__("Zuora", name)
\n\n\n
[docs]class MarketoSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, domain_url: str, client_id: str, client_secret: str, start_date: str\n ):\n """Airbyte Source for Marketo.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/marketo\n\n Args:\n name (str): The name of the destination.\n domain_url (str): Your Marketo Base URL. See the docs for info on how to obtain this.\n client_id (str): The Client ID of your Marketo developer application. See the docs for info on how to obtain this.\n client_secret (str): The Client Secret of your Marketo developer application. See the docs for info on how to obtain this.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n """\n self.domain_url = check.str_param(domain_url, "domain_url")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Marketo", name)
\n\n\n
[docs]class DriftSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n access_token: str,\n refresh_token: str,\n credentials: Optional[str] = None,\n ):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str, credentials: Optional[str] = None):\n self.credentials = check.opt_str_param(credentials, "credentials")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self, name: str, credentials: Union["DriftSource.OAuth20", "DriftSource.AccessToken"]\n ):\n """Airbyte Source for Drift.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/drift\n\n Args:\n name (str): The name of the destination.\n\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (DriftSource.OAuth20, DriftSource.AccessToken)\n )\n super().__init__("Drift", name)
\n\n\n
[docs]class PokeapiSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, pokemon_name: str):\n """Airbyte Source for Pokeapi.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pokeapi\n\n Args:\n name (str): The name of the destination.\n pokemon_name (str): Pokemon requested from the API.\n """\n self.pokemon_name = check.str_param(pokemon_name, "pokemon_name")\n super().__init__("Pokeapi", name)
\n\n\n
[docs]class NetsuiteSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n realm: str,\n consumer_key: str,\n consumer_secret: str,\n token_key: str,\n token_secret: str,\n start_datetime: str,\n object_types: Optional[List[str]] = None,\n window_in_days: Optional[int] = None,\n ):\n """Airbyte Source for Netsuite.\n\n Args:\n name (str): The name of the destination.\n realm (str): Netsuite realm e.g. 2344535, as for `production` or 2344535_SB1, as for the `sandbox`\n consumer_key (str): Consumer key associated with your integration\n consumer_secret (str): Consumer secret associated with your integration\n token_key (str): Access token key\n token_secret (str): Access token secret\n object_types (Optional[List[str]]): The API names of the Netsuite objects you want to sync. Setting this speeds up the connection setup process by limiting the number of schemas that need to be retrieved from Netsuite.\n start_datetime (str): Starting point for your data replication, in format of "YYYY-MM-DDTHH:mm:ssZ"\n window_in_days (Optional[int]): The amount of days used to query the data with date chunks. Set smaller value, if you have lots of data.\n """\n self.realm = check.str_param(realm, "realm")\n self.consumer_key = check.str_param(consumer_key, "consumer_key")\n self.consumer_secret = check.str_param(consumer_secret, "consumer_secret")\n self.token_key = check.str_param(token_key, "token_key")\n self.token_secret = check.str_param(token_secret, "token_secret")\n self.object_types = check.opt_nullable_list_param(object_types, "object_types", str)\n self.start_datetime = check.str_param(start_datetime, "start_datetime")\n self.window_in_days = check.opt_int_param(window_in_days, "window_in_days")\n super().__init__("Netsuite", name)
\n\n\n
[docs]class HubplannerSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str):\n """Airbyte Source for Hubplanner.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubplanner\n\n Args:\n name (str): The name of the destination.\n api_key (str): Hubplanner API key. See https://github.com/hubplanner/API#authentication for more details.\n """\n self.api_key = check.str_param(api_key, "api_key")\n super().__init__("Hubplanner", name)
\n\n\n
[docs]class Dv360Source(GeneratedAirbyteSource):\n
[docs] class Oauth2Credentials:\n
[docs] @public\n def __init__(\n self,\n access_token: str,\n refresh_token: str,\n token_uri: str,\n client_id: str,\n client_secret: str,\n ):\n self.access_token = check.str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.token_uri = check.str_param(token_uri, "token_uri")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: "Dv360Source.Oauth2Credentials",\n partner_id: int,\n start_date: str,\n end_date: Optional[str] = None,\n filters: Optional[List[str]] = None,\n ):\n """Airbyte Source for Dv 360.\n\n Args:\n name (str): The name of the destination.\n credentials (Dv360Source.Oauth2Credentials): Oauth2 credentials\n partner_id (int): Partner ID\n start_date (str): UTC date and time in the format 2017-01-25. Any data before this date will not be replicated\n end_date (Optional[str]): UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.\n filters (Optional[List[str]]): filters for the dimensions. each filter object had 2 keys: 'type' for the name of the dimension to be used as. and 'value' for the value of the filter\n """\n self.credentials = check.inst_param(\n credentials, "credentials", Dv360Source.Oauth2Credentials\n )\n self.partner_id = check.int_param(partner_id, "partner_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.filters = check.opt_nullable_list_param(filters, "filters", str)\n super().__init__("Dv 360", name)
\n\n\n
[docs]class NotionSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_type = "OAuth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, token: str):\n self.auth_type = "token"\n self.token = check.str_param(token, "token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["NotionSource.OAuth20", "NotionSource.AccessToken"],\n ):\n """Airbyte Source for Notion.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/notion\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00.000Z. Any data before this date will not be replicated.\n credentials (Union[NotionSource.OAuth20, NotionSource.AccessToken]): Pick an authentication method.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (NotionSource.OAuth20, NotionSource.AccessToken)\n )\n super().__init__("Notion", name)
\n\n\n
[docs]class ZendeskSunshineSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, access_token: str):\n self.auth_method = "oauth2.0"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class APIToken:\n
[docs] @public\n def __init__(self, api_token: str, email: str):\n self.auth_method = "api_token"\n self.api_token = check.str_param(api_token, "api_token")\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n subdomain: str,\n start_date: str,\n credentials: Union["ZendeskSunshineSource.OAuth20", "ZendeskSunshineSource.APIToken"],\n ):\n """Airbyte Source for Zendesk Sunshine.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk_sunshine\n\n Args:\n name (str): The name of the destination.\n subdomain (str): The subdomain for your Zendesk Account.\n start_date (str): The date from which you'd like to replicate data for Zendesk Sunshine API, in the format YYYY-MM-DDT00:00:00Z.\n """\n self.subdomain = check.str_param(subdomain, "subdomain")\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (ZendeskSunshineSource.OAuth20, ZendeskSunshineSource.APIToken),\n )\n super().__init__("Zendesk Sunshine", name)
\n\n\n
[docs]class PinterestSource(GeneratedAirbyteSource):\n
[docs] class OAuth20:\n
[docs] @public\n def __init__(\n self,\n refresh_token: str,\n client_id: Optional[str] = None,\n client_secret: Optional[str] = None,\n ):\n self.auth_method = "oauth2.0"\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.client_secret = check.opt_str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AccessToken:\n
[docs] @public\n def __init__(self, access_token: str):\n self.auth_method = "access_token"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union["PinterestSource.OAuth20", "PinterestSource.AccessToken"],\n ):\n """Airbyte Source for Pinterest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/pinterest\n\n Args:\n name (str): The name of the destination.\n start_date (str): A date in the format YYYY-MM-DD. If you have not set a date, it would be defaulted to latest allowed date by api (914 days from today).\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials, "credentials", (PinterestSource.OAuth20, PinterestSource.AccessToken)\n )\n super().__init__("Pinterest", name)
\n\n\n
[docs]class MetabaseSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n instance_api_url: str,\n username: Optional[str] = None,\n password: Optional[str] = None,\n session_token: Optional[str] = None,\n ):\n r"""Airbyte Source for Metabase.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/metabase\n\n Args:\n name (str): The name of the destination.\n instance_api_url (str): URL to your metabase instance API\n session_token (Optional[str]): To generate your session token, you need to run the following command: ``` curl -X POST \\\\ -H "Content-Type: application/json" \\\\ -d '{"username": "person@metabase.com", "password": "fakepassword"}' \\\\ http://localhost:3000/api/session ``` Then copy the value of the `id` field returned by a successful call to that API. Note that by default, sessions are good for 14 days and needs to be regenerated.\n """\n self.instance_api_url = check.str_param(instance_api_url, "instance_api_url")\n self.username = check.opt_str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.session_token = check.opt_str_param(session_token, "session_token")\n super().__init__("Metabase", name)
\n\n\n
[docs]class HubspotSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(self, client_id: str, client_secret: str, refresh_token: str):\n self.credentials_title = "OAuth Credentials"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class APIKey:\n
[docs] @public\n def __init__(self, api_key: str):\n self.credentials_title = "API Key Credentials"\n self.api_key = check.str_param(api_key, "api_key")
\n\n
[docs] class PrivateAPP:\n
[docs] @public\n def __init__(self, access_token: str):\n self.credentials_title = "Private App Credentials"\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n start_date: str,\n credentials: Union[\n "HubspotSource.OAuth", "HubspotSource.APIKey", "HubspotSource.PrivateAPP"\n ],\n ):\n """Airbyte Source for Hubspot.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/hubspot\n\n Args:\n name (str): The name of the destination.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP]): Choose how to authenticate to HubSpot.\n """\n self.start_date = check.str_param(start_date, "start_date")\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP),\n )\n super().__init__("Hubspot", name)
\n\n\n
[docs]class HarvestSource(GeneratedAirbyteSource):\n
[docs] class AuthenticateViaHarvestOAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n auth_type: Optional[str] = None,\n ):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class AuthenticateWithPersonalAccessToken:\n
[docs] @public\n def __init__(self, api_token: str, auth_type: Optional[str] = None):\n self.auth_type = check.opt_str_param(auth_type, "auth_type")\n self.api_token = check.str_param(api_token, "api_token")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n replication_start_date: str,\n credentials: Union[\n "HarvestSource.AuthenticateViaHarvestOAuth",\n "HarvestSource.AuthenticateWithPersonalAccessToken",\n ],\n ):\n """Airbyte Source for Harvest.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/harvest\n\n Args:\n name (str): The name of the destination.\n account_id (str): Harvest account ID. Required for all Harvest requests in pair with Personal Access Token\n replication_start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n credentials (Union[HarvestSource.AuthenticateViaHarvestOAuth, HarvestSource.AuthenticateWithPersonalAccessToken]): Choose how to authenticate to Harvest.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.replication_start_date = check.str_param(\n replication_start_date, "replication_start_date"\n )\n self.credentials = check.inst_param(\n credentials,\n "credentials",\n (\n HarvestSource.AuthenticateViaHarvestOAuth,\n HarvestSource.AuthenticateWithPersonalAccessToken,\n ),\n )\n super().__init__("Harvest", name)
\n\n\n
[docs]class GithubSource(GeneratedAirbyteSource):\n
[docs] class OAuthCredentials:\n
[docs] @public\n def __init__(self, access_token: str):\n self.access_token = check.str_param(access_token, "access_token")
\n\n
[docs] class PATCredentials:\n
[docs] @public\n def __init__(self, personal_access_token: str):\n self.personal_access_token = check.str_param(\n personal_access_token, "personal_access_token"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n credentials: Union["GithubSource.OAuthCredentials", "GithubSource.PATCredentials"],\n start_date: str,\n repository: str,\n branch: Optional[str] = None,\n page_size_for_large_streams: Optional[int] = None,\n ):\n """Airbyte Source for Github.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/github\n\n Args:\n name (str): The name of the destination.\n credentials (Union[GithubSource.OAuthCredentials, GithubSource.PATCredentials]): Choose how to authenticate to GitHub\n start_date (str): The date from which you'd like to replicate data from GitHub in the format YYYY-MM-DDT00:00:00Z. For the streams which support this configuration, only data generated on or after the start date will be replicated. This field doesn't apply to all streams, see the docs for more info\n repository (str): Space-delimited list of GitHub organizations/repositories, e.g. `airbytehq/airbyte` for single repository, `airbytehq/*` for get all repositories from organization and `airbytehq/airbyte airbytehq/another-repo` for multiple repositories.\n branch (Optional[str]): Space-delimited list of GitHub repository branches to pull commits for, e.g. `airbytehq/airbyte/master`. If no branches are specified for a repository, the default branch will be pulled.\n page_size_for_large_streams (Optional[int]): The Github connector contains several streams with a large amount of data. The page size of such streams depends on the size of your repository. 
We recommended that you specify values between 10 and 30.\n """\n self.credentials = check.inst_param(\n credentials, "credentials", (GithubSource.OAuthCredentials, GithubSource.PATCredentials)\n )\n self.start_date = check.str_param(start_date, "start_date")\n self.repository = check.str_param(repository, "repository")\n self.branch = check.opt_str_param(branch, "branch")\n self.page_size_for_large_streams = check.opt_int_param(\n page_size_for_large_streams, "page_size_for_large_streams"\n )\n super().__init__("Github", name)
\n\n\n
[docs]class E2eTestSource(GeneratedAirbyteSource):\n
[docs] class SingleSchema:\n
[docs] @public\n def __init__(\n self, stream_name: str, stream_schema: str, stream_duplication: Optional[int] = None\n ):\n self.type = "SINGLE_STREAM"\n self.stream_name = check.str_param(stream_name, "stream_name")\n self.stream_schema = check.str_param(stream_schema, "stream_schema")\n self.stream_duplication = check.opt_int_param(stream_duplication, "stream_duplication")
\n\n
[docs] class MultiSchema:\n
[docs] @public\n def __init__(self, stream_schemas: str):\n self.type = "MULTI_STREAM"\n self.stream_schemas = check.str_param(stream_schemas, "stream_schemas")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n max_messages: int,\n mock_catalog: Union["E2eTestSource.SingleSchema", "E2eTestSource.MultiSchema"],\n type: Optional[str] = None,\n seed: Optional[int] = None,\n message_interval_ms: Optional[int] = None,\n ):\n """Airbyte Source for E2e Test.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/e2e-test\n\n Args:\n name (str): The name of the destination.\n max_messages (int): Number of records to emit per stream. Min 1. Max 100 billion.\n seed (Optional[int]): When the seed is unspecified, the current time millis will be used as the seed. Range: [0, 1000000].\n message_interval_ms (Optional[int]): Interval between messages in ms. Min 0 ms. Max 60000 ms (1 minute).\n """\n self.type = check.opt_str_param(type, "type")\n self.max_messages = check.int_param(max_messages, "max_messages")\n self.seed = check.opt_int_param(seed, "seed")\n self.message_interval_ms = check.opt_int_param(message_interval_ms, "message_interval_ms")\n self.mock_catalog = check.inst_param(\n mock_catalog, "mock_catalog", (E2eTestSource.SingleSchema, E2eTestSource.MultiSchema)\n )\n super().__init__("E2e Test", name)
\n\n\n
[docs]class MysqlSource(GeneratedAirbyteSource):\n
[docs] class Preferred:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "preferred"
\n\n
[docs] class Required:\n
[docs] @public\n def __init__(\n self,\n ):\n self.mode = "required"
\n\n
[docs] class VerifyCA:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_ca"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class VerifyIdentity:\n
[docs] @public\n def __init__(\n self,\n ca_certificate: str,\n client_certificate: Optional[str] = None,\n client_key: Optional[str] = None,\n client_key_password: Optional[str] = None,\n ):\n self.mode = "verify_identity"\n self.ca_certificate = check.str_param(ca_certificate, "ca_certificate")\n self.client_certificate = check.opt_str_param(client_certificate, "client_certificate")\n self.client_key = check.opt_str_param(client_key, "client_key")\n self.client_key_password = check.opt_str_param(\n client_key_password, "client_key_password"\n )
\n\n
[docs] class Standard:\n
[docs] @public\n def __init__(\n self,\n ):\n self.method = "STANDARD"
\n\n
[docs] class LogicalReplicationCDC:\n
[docs] @public\n def __init__(\n self,\n initial_waiting_seconds: Optional[int] = None,\n server_time_zone: Optional[str] = None,\n ):\n self.method = "CDC"\n self.initial_waiting_seconds = check.opt_int_param(\n initial_waiting_seconds, "initial_waiting_seconds"\n )\n self.server_time_zone = check.opt_str_param(server_time_zone, "server_time_zone")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n host: str,\n port: int,\n database: str,\n username: str,\n ssl_mode: Union[\n "MysqlSource.Preferred",\n "MysqlSource.Required",\n "MysqlSource.VerifyCA",\n "MysqlSource.VerifyIdentity",\n ],\n replication_method: Union["MysqlSource.Standard", "MysqlSource.LogicalReplicationCDC"],\n password: Optional[str] = None,\n jdbc_url_params: Optional[str] = None,\n ssl: Optional[bool] = None,\n ):\n """Airbyte Source for Mysql.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/mysql\n\n Args:\n name (str): The name of the destination.\n host (str): The host name of the database.\n port (int): The port to connect to.\n database (str): The database name.\n username (str): The username which is used to access the database.\n password (Optional[str]): The password associated with the username.\n jdbc_url_params (Optional[str]): Additional properties to pass to the JDBC URL string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.\n ssl (Optional[bool]): Encrypt data using SSL.\n ssl_mode (Union[MysqlSource.Preferred, MysqlSource.Required, MysqlSource.VerifyCA, MysqlSource.VerifyIdentity]): SSL connection modes. preferred - Automatically attempt SSL connection. If the MySQL server does not support SSL, continue with a regular connection.required - Always connect with SSL. If the MySQL server doesn`t support SSL, the connection will not be established. Certificate Authority (CA) and Hostname are not verified.verify-ca - Always connect with SSL. Verifies CA, but allows connection even if Hostname does not match.Verify Identity - Always connect with SSL. 
Verify both CA and Hostname.Read more in the docs.\n replication_method (Union[MysqlSource.Standard, MysqlSource.LogicalReplicationCDC]): Replication method to use for extracting data from the database.\n """\n self.host = check.str_param(host, "host")\n self.port = check.int_param(port, "port")\n self.database = check.str_param(database, "database")\n self.username = check.str_param(username, "username")\n self.password = check.opt_str_param(password, "password")\n self.jdbc_url_params = check.opt_str_param(jdbc_url_params, "jdbc_url_params")\n self.ssl = check.opt_bool_param(ssl, "ssl")\n self.ssl_mode = check.inst_param(\n ssl_mode,\n "ssl_mode",\n (\n MysqlSource.Preferred,\n MysqlSource.Required,\n MysqlSource.VerifyCA,\n MysqlSource.VerifyIdentity,\n ),\n )\n self.replication_method = check.inst_param(\n replication_method,\n "replication_method",\n (MysqlSource.Standard, MysqlSource.LogicalReplicationCDC),\n )\n super().__init__("Mysql", name)
\n\n\n
[docs]class MyHoursSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n email: str,\n password: str,\n start_date: str,\n logs_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for My Hours.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/my-hours\n\n Args:\n name (str): The name of the destination.\n email (str): Your My Hours username\n password (str): The password associated to the username\n start_date (str): Start date for collecting time logs\n logs_batch_size (Optional[int]): Pagination size used for retrieving logs in days\n """\n self.email = check.str_param(email, "email")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.logs_batch_size = check.opt_int_param(logs_batch_size, "logs_batch_size")\n super().__init__("My Hours", name)
\n\n\n
[docs]class KyribaSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n domain: str,\n username: str,\n password: str,\n start_date: str,\n end_date: Optional[str] = None,\n ):\n """Airbyte Source for Kyriba.\n\n Args:\n name (str): The name of the destination.\n domain (str): Kyriba domain\n username (str): Username to be used in basic auth\n password (str): Password to be used in basic auth\n start_date (str): The date the sync should start from.\n end_date (Optional[str]): The date the sync should end. If let empty the sync will run to the current date.\n """\n self.domain = check.str_param(domain, "domain")\n self.username = check.str_param(username, "username")\n self.password = check.str_param(password, "password")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n super().__init__("Kyriba", name)
\n\n\n
[docs]class GoogleSearchConsoleSource(GeneratedAirbyteSource):\n
[docs] class OAuth:\n
[docs] @public\n def __init__(\n self,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n access_token: Optional[str] = None,\n ):\n self.auth_type = "Client"\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.access_token = check.opt_str_param(access_token, "access_token")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")
\n\n
[docs] class ServiceAccountKeyAuthentication:\n
[docs] @public\n def __init__(self, service_account_info: str, email: str):\n self.auth_type = "Service"\n self.service_account_info = check.str_param(\n service_account_info, "service_account_info"\n )\n self.email = check.str_param(email, "email")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n site_urls: List[str],\n start_date: str,\n authorization: Union[\n "GoogleSearchConsoleSource.OAuth",\n "GoogleSearchConsoleSource.ServiceAccountKeyAuthentication",\n ],\n end_date: Optional[str] = None,\n custom_reports: Optional[str] = None,\n ):\n """Airbyte Source for Google Search Console.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/google-search-console\n\n Args:\n name (str): The name of the destination.\n site_urls (List[str]): The URLs of the website property attached to your GSC account. Read more here.\n start_date (str): UTC date in the format 2017-01-25. Any data before this date will not be replicated.\n end_date (Optional[str]): UTC date in the format 2017-01-25. Any data after this date will not be replicated. Must be greater or equal to the start date field.\n custom_reports (Optional[str]): A JSON array describing the custom reports you want to sync from Google Search Console. See the docs for more information about the exact format you can use to fill out this field.\n """\n self.site_urls = check.list_param(site_urls, "site_urls", str)\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.authorization = check.inst_param(\n authorization,\n "authorization",\n (\n GoogleSearchConsoleSource.OAuth,\n GoogleSearchConsoleSource.ServiceAccountKeyAuthentication,\n ),\n )\n self.custom_reports = check.opt_str_param(custom_reports, "custom_reports")\n super().__init__("Google Search Console", name)
\n\n\n
[docs]class FacebookMarketingSource(GeneratedAirbyteSource):\n
[docs] class InsightConfig:\n
[docs] @public\n def __init__(\n self,\n name: str,\n fields: Optional[List[str]] = None,\n breakdowns: Optional[List[str]] = None,\n action_breakdowns: Optional[List[str]] = None,\n time_increment: Optional[int] = None,\n start_date: Optional[str] = None,\n end_date: Optional[str] = None,\n insights_lookback_window: Optional[int] = None,\n ):\n self.name = check.str_param(name, "name")\n self.fields = check.opt_nullable_list_param(fields, "fields", str)\n self.breakdowns = check.opt_nullable_list_param(breakdowns, "breakdowns", str)\n self.action_breakdowns = check.opt_nullable_list_param(\n action_breakdowns, "action_breakdowns", str\n )\n self.time_increment = check.opt_int_param(time_increment, "time_increment")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n account_id: str,\n start_date: str,\n access_token: str,\n end_date: Optional[str] = None,\n include_deleted: Optional[bool] = None,\n fetch_thumbnail_images: Optional[bool] = None,\n custom_insights: Optional[List[InsightConfig]] = None,\n page_size: Optional[int] = None,\n insights_lookback_window: Optional[int] = None,\n max_batch_size: Optional[int] = None,\n ):\n """Airbyte Source for Facebook Marketing.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-marketing\n\n Args:\n name (str): The name of the destination.\n account_id (str): The Facebook Ad account ID to use when pulling data from the Facebook Marketing API.\n start_date (str): The date from which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.\n end_date (Optional[str]): The date until which you'd like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the latest data.\n access_token (str): The value of the access token generated. See the docs for more information\n include_deleted (Optional[bool]): Include data from deleted Campaigns, Ads, and AdSets\n fetch_thumbnail_images (Optional[bool]): In each Ad Creative, fetch the thumbnail_url and store the result in thumbnail_data_url\n custom_insights (Optional[List[FacebookMarketingSource.InsightConfig]]): A list which contains insights entries, each entry must have a name and can contains fields, breakdowns or action_breakdowns)\n page_size (Optional[int]): Page size used when sending requests to Facebook API to specify number of records per page when response has pagination. 
Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n insights_lookback_window (Optional[int]): The attribution window\n max_batch_size (Optional[int]): Maximum batch size used when sending batch requests to Facebook API. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.\n """\n self.account_id = check.str_param(account_id, "account_id")\n self.start_date = check.str_param(start_date, "start_date")\n self.end_date = check.opt_str_param(end_date, "end_date")\n self.access_token = check.str_param(access_token, "access_token")\n self.include_deleted = check.opt_bool_param(include_deleted, "include_deleted")\n self.fetch_thumbnail_images = check.opt_bool_param(\n fetch_thumbnail_images, "fetch_thumbnail_images"\n )\n self.custom_insights = check.opt_nullable_list_param(\n custom_insights, "custom_insights", FacebookMarketingSource.InsightConfig\n )\n self.page_size = check.opt_int_param(page_size, "page_size")\n self.insights_lookback_window = check.opt_int_param(\n insights_lookback_window, "insights_lookback_window"\n )\n self.max_batch_size = check.opt_int_param(max_batch_size, "max_batch_size")\n super().__init__("Facebook Marketing", name)
\n\n\n
[docs]class SurveymonkeySource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self, name: str, access_token: str, start_date: str, survey_ids: Optional[List[str]] = None\n ):\n """Airbyte Source for Surveymonkey.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/surveymonkey\n\n Args:\n name (str): The name of the destination.\n access_token (str): Access Token for making authenticated requests. See the docs for information on how to generate this key.\n start_date (str): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.\n survey_ids (Optional[List[str]]): IDs of the surveys from which you'd like to replicate data. If left empty, data from all boards to which you have access will be replicated.\n """\n self.access_token = check.str_param(access_token, "access_token")\n self.start_date = check.str_param(start_date, "start_date")\n self.survey_ids = check.opt_nullable_list_param(survey_ids, "survey_ids", str)\n super().__init__("Surveymonkey", name)
\n\n\n
[docs]class PardotSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(\n self,\n name: str,\n pardot_business_unit_id: str,\n client_id: str,\n client_secret: str,\n refresh_token: str,\n start_date: Optional[str] = None,\n is_sandbox: Optional[bool] = None,\n ):\n """Airbyte Source for Pardot.\n\n Args:\n name (str): The name of the destination.\n pardot_business_unit_id (str): Pardot Business ID, can be found at Setup > Pardot > Pardot Account Setup\n client_id (str): The Consumer Key that can be found when viewing your app in Salesforce\n client_secret (str): The Consumer Secret that can be found when viewing your app in Salesforce\n refresh_token (str): Salesforce Refresh Token used for Airbyte to access your Salesforce account. If you don't know what this is, follow this guide to retrieve it.\n start_date (Optional[str]): UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Leave blank to skip this filter\n is_sandbox (Optional[bool]): Whether or not the the app is in a Salesforce sandbox. If you do not know what this, assume it is false.\n """\n self.pardot_business_unit_id = check.str_param(\n pardot_business_unit_id, "pardot_business_unit_id"\n )\n self.client_id = check.str_param(client_id, "client_id")\n self.client_secret = check.str_param(client_secret, "client_secret")\n self.refresh_token = check.str_param(refresh_token, "refresh_token")\n self.start_date = check.opt_str_param(start_date, "start_date")\n self.is_sandbox = check.opt_bool_param(is_sandbox, "is_sandbox")\n super().__init__("Pardot", name)
\n\n\n
[docs]class FlexportSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, api_key: str, start_date: str):\n """Airbyte Source for Flexport.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/flexport\n\n Args:\n name (str): The name of the destination.\n\n """\n self.api_key = check.str_param(api_key, "api_key")\n self.start_date = check.str_param(start_date, "start_date")\n super().__init__("Flexport", name)
\n\n\n
[docs]class ZenefitsSource(GeneratedAirbyteSource):\n
[docs] @public\n def __init__(self, name: str, token: str):\n """Airbyte Source for Zenefits.\n\n Args:\n name (str): The name of the destination.\n token (str): Use Sync with Zenefits button on the link given on the readme file, and get the token to access the api\n """\n self.token = check.str_param(token, "token")\n super().__init__("Zenefits", name)
\n\n\n
[docs]class KafkaSource(GeneratedAirbyteSource):\n
[docs] class JSON:\n
[docs] @public\n def __init__(self, deserialization_type: Optional[str] = None):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )
\n\n
[docs] class AVRO:\n
[docs] @public\n def __init__(\n self,\n deserialization_type: Optional[str] = None,\n deserialization_strategy: Optional[str] = None,\n schema_registry_url: Optional[str] = None,\n schema_registry_username: Optional[str] = None,\n schema_registry_password: Optional[str] = None,\n ):\n self.deserialization_type = check.opt_str_param(\n deserialization_type, "deserialization_type"\n )\n self.deserialization_strategy = check.opt_str_param(\n deserialization_strategy, "deserialization_strategy"\n )\n self.schema_registry_url = check.opt_str_param(\n schema_registry_url, "schema_registry_url"\n )\n self.schema_registry_username = check.opt_str_param(\n schema_registry_username, "schema_registry_username"\n )\n self.schema_registry_password = check.opt_str_param(\n schema_registry_password, "schema_registry_password"\n )
\n\n
[docs] class ManuallyAssignAListOfPartitions:\n
[docs] @public\n def __init__(self, topic_partitions: str):\n self.subscription_type = "assign"\n self.topic_partitions = check.str_param(topic_partitions, "topic_partitions")
\n\n
[docs] class SubscribeToAllTopicsMatchingSpecifiedPattern:\n
[docs] @public\n def __init__(self, topic_pattern: str):\n self.subscription_type = "subscribe"\n self.topic_pattern = check.str_param(topic_pattern, "topic_pattern")
\n\n
[docs] class PLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")
\n\n
[docs] class SASLPLAINTEXT:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] class SASLSSL:\n
[docs] @public\n def __init__(self, security_protocol: str, sasl_mechanism: str, sasl_jaas_config: str):\n self.security_protocol = check.str_param(security_protocol, "security_protocol")\n self.sasl_mechanism = check.str_param(sasl_mechanism, "sasl_mechanism")\n self.sasl_jaas_config = check.str_param(sasl_jaas_config, "sasl_jaas_config")
\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n MessageFormat: Union["KafkaSource.JSON", "KafkaSource.AVRO"],\n bootstrap_servers: str,\n subscription: Union[\n "KafkaSource.ManuallyAssignAListOfPartitions",\n "KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern",\n ],\n protocol: Union[\n "KafkaSource.PLAINTEXT", "KafkaSource.SASLPLAINTEXT", "KafkaSource.SASLSSL"\n ],\n test_topic: Optional[str] = None,\n group_id: Optional[str] = None,\n max_poll_records: Optional[int] = None,\n polling_time: Optional[int] = None,\n client_id: Optional[str] = None,\n enable_auto_commit: Optional[bool] = None,\n auto_commit_interval_ms: Optional[int] = None,\n client_dns_lookup: Optional[str] = None,\n retry_backoff_ms: Optional[int] = None,\n request_timeout_ms: Optional[int] = None,\n receive_buffer_bytes: Optional[int] = None,\n auto_offset_reset: Optional[str] = None,\n repeated_calls: Optional[int] = None,\n max_records_process: Optional[int] = None,\n ):\n """Airbyte Source for Kafka.\n\n Documentation can be found at https://docs.airbyte.com/integrations/sources/kafka\n\n Args:\n name (str): The name of the destination.\n MessageFormat (Union[KafkaSource.JSON, KafkaSource.AVRO]): The serialization used based on this\n bootstrap_servers (str): A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping&mdash;this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,.... 
Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).\n subscription (Union[KafkaSource.ManuallyAssignAListOfPartitions, KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern]): You can choose to manually assign a list of partitions, or subscribe to all topics matching specified pattern to get dynamically assigned partitions.\n test_topic (Optional[str]): The Topic to test in case the Airbyte can consume messages.\n group_id (Optional[str]): The Group ID is how you distinguish different consumer groups.\n max_poll_records (Optional[int]): The maximum number of records returned in a single call to poll(). Note, that max_poll_records does not impact the underlying fetching behavior. The consumer will cache the records from each fetch request and returns them incrementally from each poll.\n polling_time (Optional[int]): Amount of time Kafka connector should try to poll for messages.\n protocol (Union[KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL]): The Protocol used to communicate with brokers.\n client_id (Optional[str]): An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.\n enable_auto_commit (Optional[bool]): If true, the consumer's offset will be periodically committed in the background.\n auto_commit_interval_ms (Optional[int]): The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if enable.auto.commit is set to true.\n client_dns_lookup (Optional[str]): Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. 
After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.\n retry_backoff_ms (Optional[int]): The amount of time to wait before attempting to retry a failed request to a given topic partition. This avoids repeatedly sending requests in a tight loop under some failure scenarios.\n request_timeout_ms (Optional[int]): The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.\n receive_buffer_bytes (Optional[int]): The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. 
If the value is -1, the OS default will be used.\n auto_offset_reset (Optional[str]): What to do when there is no initial offset in Kafka or if the current offset does not exist any more on the server - earliest: automatically reset the offset to the earliest offset, latest: automatically reset the offset to the latest offset, none: throw exception to the consumer if no previous offset is found for the consumer's group, anything else: throw exception to the consumer.\n repeated_calls (Optional[int]): The number of repeated calls to poll() if no messages were received.\n max_records_process (Optional[int]): The Maximum to be processed per execution\n """\n self.MessageFormat = check.inst_param(\n MessageFormat, "MessageFormat", (KafkaSource.JSON, KafkaSource.AVRO)\n )\n self.bootstrap_servers = check.str_param(bootstrap_servers, "bootstrap_servers")\n self.subscription = check.inst_param(\n subscription,\n "subscription",\n (\n KafkaSource.ManuallyAssignAListOfPartitions,\n KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern,\n ),\n )\n self.test_topic = check.opt_str_param(test_topic, "test_topic")\n self.group_id = check.opt_str_param(group_id, "group_id")\n self.max_poll_records = check.opt_int_param(max_poll_records, "max_poll_records")\n self.polling_time = check.opt_int_param(polling_time, "polling_time")\n self.protocol = check.inst_param(\n protocol,\n "protocol",\n (KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL),\n )\n self.client_id = check.opt_str_param(client_id, "client_id")\n self.enable_auto_commit = check.opt_bool_param(enable_auto_commit, "enable_auto_commit")\n self.auto_commit_interval_ms = check.opt_int_param(\n auto_commit_interval_ms, "auto_commit_interval_ms"\n )\n self.client_dns_lookup = check.opt_str_param(client_dns_lookup, "client_dns_lookup")\n self.retry_backoff_ms = check.opt_int_param(retry_backoff_ms, "retry_backoff_ms")\n self.request_timeout_ms = check.opt_int_param(request_timeout_ms, 
"request_timeout_ms")\n self.receive_buffer_bytes = check.opt_int_param(\n receive_buffer_bytes, "receive_buffer_bytes"\n )\n self.auto_offset_reset = check.opt_str_param(auto_offset_reset, "auto_offset_reset")\n self.repeated_calls = check.opt_int_param(repeated_calls, "repeated_calls")\n self.max_records_process = check.opt_int_param(max_records_process, "max_records_process")\n super().__init__("Kafka", name)
\n
", "current_page_name": "_modules/dagster_airbyte/managed/generated/sources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.generated.sources"}}, "reconciliation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.reconciliation

\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Iterable,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster import AssetKey\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.cacheable_assets import CacheableAssetsDefinition\nfrom dagster._core.definitions.events import CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.freshness_policy import FreshnessPolicy\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.execution.context.init import build_init_resource_context\nfrom dagster._utils.merger import deep_merge_dicts\nfrom dagster_managed_elements import (\n    ManagedElementCheckResult,\n    ManagedElementDiff,\n    ManagedElementError,\n)\nfrom dagster_managed_elements.types import (\n    SECRET_MASK_VALUE,\n    ManagedElementReconciler,\n    is_key_secret,\n)\nfrom dagster_managed_elements.utils import UNSET, diff_dicts\n\nfrom dagster_airbyte.asset_defs import (\n    AirbyteConnectionMetadata,\n    AirbyteInstanceCacheableAssetsDefinition,\n    _clean_name,\n)\nfrom dagster_airbyte.managed.types import (\n    AirbyteConnection,\n    AirbyteDestination,\n    AirbyteDestinationNamespace,\n    AirbyteSource,\n    AirbyteSyncMode,\n    InitializedAirbyteConnection,\n    InitializedAirbyteDestination,\n    InitializedAirbyteSource,\n)\nfrom dagster_airbyte.resources import AirbyteResource\nfrom dagster_airbyte.utils import is_basic_normalization_operation\n\n\ndef gen_configured_stream_json(\n    source_stream: Mapping[str, Any], user_stream_config: Mapping[str, AirbyteSyncMode]\n) -> Mapping[str, Any]:\n    """Generates an Airbyte API stream defintiion based on the succinct user-provided config and the\n    full stream definition from the source.\n    """\n    config = user_stream_config[source_stream["stream"]["name"]]\n    return deep_merge_dicts(\n        
source_stream,\n        {"config": config.to_json()},\n    )\n\n\ndef _ignore_secrets_compare_fn(k: str, _cv: Any, dv: Any) -> Optional[bool]:\n    if is_key_secret(k):\n        return dv == SECRET_MASK_VALUE\n    return None\n\n\ndef _diff_configs(\n    config_dict: Mapping[str, Any], dst_dict: Mapping[str, Any], ignore_secrets: bool = True\n) -> ManagedElementDiff:\n    return diff_dicts(\n        config_dict=config_dict,\n        dst_dict=dst_dict,\n        custom_compare_fn=_ignore_secrets_compare_fn if ignore_secrets else None,\n    )\n\n\ndef diff_sources(\n    config_src: Optional[AirbyteSource],\n    curr_src: Optional[AirbyteSource],\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteSource objects."""\n    diff = _diff_configs(\n        config_src.source_configuration if config_src else {},\n        curr_src.source_configuration if curr_src else {},\n        ignore_secrets,\n    )\n    if not diff.is_empty():\n        name = config_src.name if config_src else curr_src.name if curr_src else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef diff_destinations(\n    config_dst: Optional[AirbyteDestination],\n    curr_dst: Optional[AirbyteDestination],\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteDestination objects."""\n    diff = _diff_configs(\n        config_dst.destination_configuration if config_dst else {},\n        curr_dst.destination_configuration if curr_dst else {},\n        ignore_secrets,\n    )\n    if not diff.is_empty():\n        name = config_dst.name if config_dst else curr_dst.name if curr_dst else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef conn_dict(conn: Optional[AirbyteConnection]) -> Mapping[str, Any]:\n    if not conn:\n        return {}\n    return {\n        "source": conn.source.name if 
conn.source else "Unknown",\n        "destination": conn.destination.name if conn.destination else "Unknown",\n        "normalize data": conn.normalize_data,\n        "streams": {k: v.to_json() for k, v in conn.stream_config.items()},\n        "destination namespace": (\n            conn.destination_namespace.name\n            if isinstance(conn.destination_namespace, AirbyteDestinationNamespace)\n            else conn.destination_namespace\n        ),\n        "prefix": conn.prefix,\n    }\n\n\nOPTIONAL_STREAM_SETTINGS = ("cursorField", "primaryKey")\n\n\ndef _compare_stream_values(k: str, cv: str, _dv: str):\n    """Don't register a diff for optional stream settings if the value is not set\n    in the user-provided config, this means it will default to the value in the\n    source.\n    """\n    return True if k in OPTIONAL_STREAM_SETTINGS and cv == UNSET else None\n\n\ndef diff_connections(\n    config_conn: Optional[AirbyteConnection], curr_conn: Optional[AirbyteConnection]\n) -> ManagedElementCheckResult:\n    """Utility to diff two AirbyteConnection objects."""\n    diff = diff_dicts(\n        conn_dict(config_conn),\n        conn_dict(curr_conn),\n        custom_compare_fn=_compare_stream_values,\n    )\n    if not diff.is_empty():\n        name = config_conn.name if config_conn else curr_conn.name if curr_conn else "Unknown"\n        return ManagedElementDiff().with_nested(name, diff)\n\n    return ManagedElementDiff()\n\n\ndef reconcile_sources(\n    res: AirbyteResource,\n    config_sources: Mapping[str, AirbyteSource],\n    existing_sources: Mapping[str, InitializedAirbyteSource],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n    ignore_secrets: bool,\n) -> Tuple[Mapping[str, InitializedAirbyteSource], ManagedElementCheckResult]:\n    """Generates a diff of the configured and existing sources and reconciles them to match the\n    configured state if dry_run is False.\n    """\n    diff = ManagedElementDiff()\n\n    
initialized_sources: Dict[str, InitializedAirbyteSource] = {}\n    for source_name in set(config_sources.keys()).union(existing_sources.keys()):\n        configured_source = config_sources.get(source_name)\n        existing_source = existing_sources.get(source_name)\n\n        # Ignore sources not mentioned in the user config unless the user specifies to delete\n        if not should_delete and existing_source and not configured_source:\n            initialized_sources[source_name] = existing_source\n            continue\n\n        diff = diff.join(\n            diff_sources(  # type: ignore\n                configured_source,\n                existing_source.source if existing_source else None,\n                ignore_secrets,\n            )\n        )\n\n        if existing_source and (\n            not configured_source or (configured_source.must_be_recreated(existing_source.source))\n        ):\n            initialized_sources[source_name] = existing_source\n            if not dry_run:\n                res.make_request(\n                    endpoint="/sources/delete",\n                    data={"sourceId": existing_source.source_id},\n                )\n            existing_source = None\n\n        if configured_source:\n            defn_id = check.not_none(\n                res.get_source_definition_by_name(configured_source.source_type)\n            )\n            base_source_defn_dict = {\n                "name": configured_source.name,\n                "connectionConfiguration": configured_source.source_configuration,\n            }\n            source_id = ""\n            if existing_source:\n                source_id = existing_source.source_id\n                if not dry_run:\n                    res.make_request(\n                        endpoint="/sources/update",\n                        data={"sourceId": source_id, **base_source_defn_dict},\n                    )\n            else:\n                if not dry_run:\n                    create_result = 
cast(\n                        Dict[str, str],\n                        check.not_none(\n                            res.make_request(\n                                endpoint="/sources/create",\n                                data={\n                                    "sourceDefinitionId": defn_id,\n                                    "workspaceId": workspace_id,\n                                    **base_source_defn_dict,\n                                },\n                            )\n                        ),\n                    )\n                    source_id = create_result["sourceId"]\n\n            if source_name in initialized_sources:\n                # Preserve to be able to initialize old connection object\n                initialized_sources[f"{source_name}_old"] = initialized_sources[source_name]\n            initialized_sources[source_name] = InitializedAirbyteSource(\n                source=configured_source,\n                source_id=source_id,\n                source_definition_id=defn_id,\n            )\n    return initialized_sources, diff\n\n\ndef reconcile_destinations(\n    res: AirbyteResource,\n    config_destinations: Mapping[str, AirbyteDestination],\n    existing_destinations: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n    ignore_secrets: bool,\n) -> Tuple[Mapping[str, InitializedAirbyteDestination], ManagedElementCheckResult]:\n    """Generates a diff of the configured and existing destinations and reconciles them to match the\n    configured state if dry_run is False.\n    """\n    diff = ManagedElementDiff()\n\n    initialized_destinations: Dict[str, InitializedAirbyteDestination] = {}\n    for destination_name in set(config_destinations.keys()).union(existing_destinations.keys()):\n        configured_destination = config_destinations.get(destination_name)\n        existing_destination = existing_destinations.get(destination_name)\n\n        # Ignore 
destinations not mentioned in the user config unless the user specifies to delete\n        if not should_delete and existing_destination and not configured_destination:\n            initialized_destinations[destination_name] = existing_destination\n            continue\n\n        diff = diff.join(\n            diff_destinations(  # type: ignore\n                configured_destination,\n                existing_destination.destination if existing_destination else None,\n                ignore_secrets,\n            )\n        )\n\n        if existing_destination and (\n            not configured_destination\n            or (configured_destination.must_be_recreated(existing_destination.destination))\n        ):\n            initialized_destinations[destination_name] = existing_destination\n            if not dry_run:\n                res.make_request(\n                    endpoint="/destinations/delete",\n                    data={"destinationId": existing_destination.destination_id},\n                )\n            existing_destination = None\n\n        if configured_destination:\n            defn_id = res.get_destination_definition_by_name(\n                configured_destination.destination_type\n            )\n            base_destination_defn_dict = {\n                "name": configured_destination.name,\n                "connectionConfiguration": configured_destination.destination_configuration,\n            }\n            destination_id = ""\n            if existing_destination:\n                destination_id = existing_destination.destination_id\n                if not dry_run:\n                    res.make_request(\n                        endpoint="/destinations/update",\n                        data={"destinationId": destination_id, **base_destination_defn_dict},\n                    )\n            else:\n                if not dry_run:\n                    create_result = cast(\n                        Dict[str, str],\n                        
check.not_none(\n                            res.make_request(\n                                endpoint="/destinations/create",\n                                data={\n                                    "destinationDefinitionId": defn_id,\n                                    "workspaceId": workspace_id,\n                                    **base_destination_defn_dict,\n                                },\n                            )\n                        ),\n                    )\n                    destination_id = create_result["destinationId"]\n\n            if destination_name in initialized_destinations:\n                # Preserve to be able to initialize old connection object\n                initialized_destinations[f"{destination_name}_old"] = initialized_destinations[\n                    destination_name\n                ]\n            initialized_destinations[destination_name] = InitializedAirbyteDestination(\n                destination=configured_destination,\n                destination_id=destination_id,\n                destination_definition_id=defn_id,\n            )\n    return initialized_destinations, diff\n\n\ndef reconcile_config(\n    res: AirbyteResource,\n    objects: Sequence[AirbyteConnection],\n    dry_run: bool = False,\n    should_delete: bool = False,\n    ignore_secrets: bool = True,\n) -> ManagedElementCheckResult:\n    """Main entry point for the reconciliation process. 
Takes a list of AirbyteConnection objects\n    and a pointer to an Airbyte instance and returns a diff, along with applying the diff\n    if dry_run is False.\n    """\n    with res.cache_requests():\n        config_connections = {conn.name: conn for conn in objects}\n        config_sources = {conn.source.name: conn.source for conn in objects}\n        config_dests = {conn.destination.name: conn.destination for conn in objects}\n\n        workspace_id = res.get_default_workspace()\n\n        existing_sources_raw = cast(\n            Dict[str, List[Dict[str, Any]]],\n            check.not_none(\n                res.make_request(endpoint="/sources/list", data={"workspaceId": workspace_id})\n            ),\n        )\n        existing_dests_raw = cast(\n            Dict[str, List[Dict[str, Any]]],\n            check.not_none(\n                res.make_request(endpoint="/destinations/list", data={"workspaceId": workspace_id})\n            ),\n        )\n\n        existing_sources: Dict[str, InitializedAirbyteSource] = {\n            source_json["name"]: InitializedAirbyteSource.from_api_json(source_json)\n            for source_json in existing_sources_raw.get("sources", [])\n        }\n        existing_dests: Dict[str, InitializedAirbyteDestination] = {\n            destination_json["name"]: InitializedAirbyteDestination.from_api_json(destination_json)\n            for destination_json in existing_dests_raw.get("destinations", [])\n        }\n\n        # First, remove any connections that need to be deleted, so that we can\n        # safely delete any sources/destinations that are no longer referenced\n        # or that need to be recreated.\n        connections_diff = reconcile_connections_pre(\n            res,\n            config_connections,\n            existing_sources,\n            existing_dests,\n            workspace_id,\n            dry_run,\n            should_delete,\n        )\n\n        all_sources, sources_diff = reconcile_sources(\n            res,\n  
          config_sources,\n            existing_sources,\n            workspace_id,\n            dry_run,\n            should_delete,\n            ignore_secrets,\n        )\n        all_dests, dests_diff = reconcile_destinations(\n            res, config_dests, existing_dests, workspace_id, dry_run, should_delete, ignore_secrets\n        )\n\n        # Now that we have updated the set of sources and destinations, we can\n        # recreate or update any connections which depend on them.\n        reconcile_connections_post(\n            res,\n            config_connections,\n            all_sources,\n            all_dests,\n            workspace_id,\n            dry_run,\n        )\n\n        return ManagedElementDiff().join(sources_diff).join(dests_diff).join(connections_diff)  # type: ignore\n\n\ndef reconcile_normalization(\n    res: AirbyteResource,\n    existing_connection_id: Optional[str],\n    destination: InitializedAirbyteDestination,\n    normalization_config: Optional[bool],\n    workspace_id: str,\n) -> Optional[str]:\n    """Reconciles the normalization configuration for a connection.\n\n    If normalization_config is None, then defaults to True on destinations that support normalization\n    and False on destinations that do not.\n    """\n    existing_basic_norm_op_id = None\n    if existing_connection_id:\n        operations = cast(\n            Dict[str, List[Dict[str, str]]],\n            check.not_none(\n                res.make_request(\n                    endpoint="/operations/list",\n                    data={"connectionId": existing_connection_id},\n                )\n            ),\n        )\n        existing_basic_norm_op = next(\n            (\n                operation\n                for operation in operations["operations"]\n                if is_basic_normalization_operation(operation)\n            ),\n            None,\n        )\n        existing_basic_norm_op_id = (\n            existing_basic_norm_op["operationId"] if 
existing_basic_norm_op else None\n        )\n\n    if normalization_config is not False:\n        if destination.destination_definition_id and res.does_dest_support_normalization(\n            destination.destination_definition_id, workspace_id\n        ):\n            if existing_basic_norm_op_id:\n                return existing_basic_norm_op_id\n            else:\n                return cast(\n                    Dict[str, str],\n                    check.not_none(\n                        res.make_request(\n                            endpoint="/operations/create",\n                            data={\n                                "workspaceId": workspace_id,\n                                "name": "Normalization",\n                                "operatorConfiguration": {\n                                    "operatorType": "normalization",\n                                    "normalization": {"option": "basic"},\n                                },\n                            },\n                        )\n                    ),\n                )["operationId"]\n        elif normalization_config is True:\n            raise Exception(\n                f"Destination {destination.destination.name} does not support normalization."\n            )\n\n    return None\n\n\ndef reconcile_connections_pre(\n    res: AirbyteResource,\n    config_connections: Mapping[str, AirbyteConnection],\n    existing_sources: Mapping[str, InitializedAirbyteSource],\n    existing_destinations: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n    should_delete: bool,\n) -> ManagedElementCheckResult:\n    """Generates the diff for connections, and deletes any connections that are not in the config if\n    dry_run is False.\n\n    It's necessary to do this in two steps because we need to remove connections that depend on\n    sources and destinations that are being deleted or recreated before Airbyte will allow us to\n    delete or recreate 
them.\n    """\n    diff = ManagedElementDiff()\n\n    existing_connections_raw = cast(\n        Dict[str, List[Dict[str, Any]]],\n        check.not_none(\n            res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})\n        ),\n    )\n    existing_connections: Dict[str, InitializedAirbyteConnection] = {\n        connection_json["name"]: InitializedAirbyteConnection.from_api_json(\n            connection_json, existing_sources, existing_destinations\n        )\n        for connection_json in existing_connections_raw.get("connections", [])\n    }\n\n    for conn_name in set(config_connections.keys()).union(existing_connections.keys()):\n        config_conn = config_connections.get(conn_name)\n        existing_conn = existing_connections.get(conn_name)\n\n        # Ignore connections not mentioned in the user config unless the user specifies to delete\n        if not should_delete and not config_conn:\n            continue\n\n        diff = diff.join(\n            diff_connections(config_conn, existing_conn.connection if existing_conn else None)  # type: ignore\n        )\n\n        if existing_conn and (\n            not config_conn or config_conn.must_be_recreated(existing_conn.connection)\n        ):\n            if not dry_run:\n                res.make_request(\n                    endpoint="/connections/delete",\n                    data={"connectionId": existing_conn.connection_id},\n                )\n    return diff\n\n\ndef reconcile_connections_post(\n    res: AirbyteResource,\n    config_connections: Mapping[str, AirbyteConnection],\n    init_sources: Mapping[str, InitializedAirbyteSource],\n    init_dests: Mapping[str, InitializedAirbyteDestination],\n    workspace_id: str,\n    dry_run: bool,\n) -> None:\n    """Creates new and modifies existing connections based on the config if dry_run is False."""\n    existing_connections_raw = cast(\n        Dict[str, List[Dict[str, Any]]],\n        check.not_none(\n            
res.make_request(endpoint="/connections/list", data={"workspaceId": workspace_id})\n        ),\n    )\n    existing_connections = {\n        connection_json["name"]: InitializedAirbyteConnection.from_api_json(\n            connection_json, init_sources, init_dests\n        )\n        for connection_json in existing_connections_raw.get("connections", [])\n    }\n\n    for conn_name, config_conn in config_connections.items():\n        existing_conn = existing_connections.get(conn_name)\n\n        normalization_operation_id = None\n        if not dry_run:\n            destination = init_dests[config_conn.destination.name]\n\n            # Enable or disable basic normalization based on config\n            normalization_operation_id = reconcile_normalization(\n                res,\n                existing_connections.get("name", {}).get("connectionId"),\n                destination,\n                config_conn.normalize_data,\n                workspace_id,\n            )\n\n        configured_streams = []\n        if not dry_run:\n            source = init_sources[config_conn.source.name]\n            schema = res.get_source_schema(source.source_id)\n            base_streams = schema["catalog"]["streams"]\n\n            configured_streams = [\n                gen_configured_stream_json(stream, config_conn.stream_config)\n                for stream in base_streams\n                if stream["stream"]["name"] in config_conn.stream_config\n            ]\n\n        connection_base_json = {\n            "name": conn_name,\n            "namespaceDefinition": "source",\n            "namespaceFormat": "${SOURCE_NAMESPACE}",\n            "prefix": "",\n            "operationIds": [normalization_operation_id] if normalization_operation_id else [],\n            "syncCatalog": {"streams": configured_streams},\n            "scheduleType": "manual",\n            "status": "active",\n        }\n\n        if isinstance(config_conn.destination_namespace, 
AirbyteDestinationNamespace):\n            connection_base_json["namespaceDefinition"] = config_conn.destination_namespace.value\n        else:\n            connection_base_json["namespaceDefinition"] = "customformat"\n            connection_base_json["namespaceFormat"] = cast(str, config_conn.destination_namespace)\n\n        if config_conn.prefix:\n            connection_base_json["prefix"] = config_conn.prefix\n\n        if existing_conn:\n            if not dry_run:\n                source = init_sources[config_conn.source.name]\n                res.make_request(\n                    endpoint="/connections/update",\n                    data={\n                        **connection_base_json,\n                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),\n                        "connectionId": existing_conn.connection_id,\n                    },\n                )\n        else:\n            if not dry_run:\n                source = init_sources[config_conn.source.name]\n                destination = init_dests[config_conn.destination.name]\n\n                res.make_request(\n                    endpoint="/connections/create",\n                    data={\n                        **connection_base_json,\n                        "sourceCatalogId": res.get_source_catalog_id(source.source_id),\n                        "sourceId": source.source_id,\n                        "destinationId": destination.destination_id,\n                    },\n                )\n\n\n
[docs]@experimental\nclass AirbyteManagedElementReconciler(ManagedElementReconciler):\n """Reconciles Python-specified Airbyte connections with an Airbyte instance.\n\n Passing the module containing an AirbyteManagedElementReconciler to the dagster-airbyte\n CLI will allow you to check the state of your Python-code-specified Airbyte connections\n against an Airbyte instance, and reconcile them if necessary.\n\n This functionality is experimental and subject to change.\n """\n\n
[docs] @public\n def __init__(\n self,\n airbyte: Union[AirbyteResource, ResourceDefinition],\n connections: Iterable[AirbyteConnection],\n delete_unmentioned_resources: bool = False,\n ):\n """Reconciles Python-specified Airbyte connections with an Airbyte instance.\n\n Args:\n airbyte (Union[AirbyteResource, ResourceDefinition]): The Airbyte resource definition to reconcile against.\n connections (Iterable[AirbyteConnection]): The Airbyte connection objects to reconcile.\n delete_unmentioned_resources (bool): Whether to delete resources that are not mentioned in\n the set of connections provided. When True, all Airbyte instance contents are effectively\n managed by the reconciler. Defaults to False.\n """\n # airbyte = check.inst_param(airbyte, "airbyte", ResourceDefinition)\n\n self._airbyte_instance: AirbyteResource = (\n airbyte\n if isinstance(airbyte, AirbyteResource)\n else airbyte(build_init_resource_context())\n )\n self._connections = list(\n check.iterable_param(connections, "connections", of_type=AirbyteConnection)\n )\n self._delete_unmentioned_resources = check.bool_param(\n delete_unmentioned_resources, "delete_unmentioned_resources"\n )\n\n super().__init__()
\n\n def check(self, **kwargs) -> ManagedElementCheckResult:\n return reconcile_config(\n self._airbyte_instance,\n self._connections,\n dry_run=True,\n should_delete=self._delete_unmentioned_resources,\n ignore_secrets=(not kwargs.get("include_all_secrets", False)),\n )\n\n def apply(self, **kwargs) -> ManagedElementCheckResult:\n return reconcile_config(\n self._airbyte_instance,\n self._connections,\n dry_run=False,\n should_delete=self._delete_unmentioned_resources,\n ignore_secrets=(not kwargs.get("include_all_secrets", False)),\n )
\n\n\nclass AirbyteManagedElementCacheableAssetsDefinition(AirbyteInstanceCacheableAssetsDefinition):\n def __init__(\n self,\n airbyte_resource_def: AirbyteResource,\n key_prefix: Sequence[str],\n create_assets_for_normalization_tables: bool,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connections: Iterable[AirbyteConnection],\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ],\n ):\n defined_conn_names = {conn.name for conn in connections}\n super().__init__(\n airbyte_resource_def=airbyte_resource_def,\n workspace_id=None,\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=create_assets_for_normalization_tables,\n connection_to_group_fn=connection_to_group_fn,\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connection_filter=lambda conn: conn.name in defined_conn_names,\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n )\n self._connections: List[AirbyteConnection] = list(connections)\n\n def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:\n diff = reconcile_config(self._airbyte_instance, self._connections, dry_run=True)\n if isinstance(diff, ManagedElementDiff) and not diff.is_empty():\n raise ValueError(\n "Airbyte connections are not in sync with provided configuration, diff:\\n{}".format(\n str(diff)\n )\n )\n elif isinstance(diff, ManagedElementError):\n raise ValueError(f"Error checking Airbyte connections: {diff}")\n\n return super()._get_connections()\n\n\n
[docs]@experimental\ndef load_assets_from_connections(\n airbyte: Union[AirbyteResource, ResourceDefinition],\n connections: Iterable[AirbyteConnection],\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n create_assets_for_normalization_tables: bool = True,\n connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connection_to_asset_key_fn: Optional[\n Callable[[AirbyteConnectionMetadata, str], AssetKey]\n ] = None,\n connection_to_freshness_policy_fn: Optional[\n Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]\n ] = None,\n) -> CacheableAssetsDefinition:\n """Loads Airbyte connection assets from a configured AirbyteResource instance, checking against a list of AirbyteConnection objects.\n This method will raise an error on repo load if the passed AirbyteConnection objects are not in sync with the Airbyte instance.\n\n Args:\n airbyte (Union[AirbyteResource, ResourceDefinition]): An AirbyteResource configured with the appropriate connection\n details.\n connections (Iterable[AirbyteConnection]): A list of AirbyteConnection objects to build assets for.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n create_assets_for_normalization_tables (bool): If True, assets will be created for tables\n created by Airbyte's normalization feature. If False, only the destination tables\n will be created. Defaults to True.\n connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Airbyte connection name. If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The IO manager key to use for all assets. 
Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.\n connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n IO manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".\n connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which\n takes in connection metadata and table name and returns an asset key for the table. If None, the default asset\n key is based on the table name. Any asset key prefix will be applied to the output of this function.\n connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]): Optional function which\n takes in connection metadata and returns a freshness policy for the connection. If None, no freshness policy will be applied.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster_airbyte import (\n AirbyteConnection,\n AirbyteResource,\n load_assets_from_connections,\n )\n\n airbyte_instance = AirbyteResource(\n host: "localhost",\n port: "8000",\n )\n airbyte_connections = [\n AirbyteConnection(...),\n AirbyteConnection(...)\n ]\n airbyte_assets = load_assets_from_connections(airbyte_instance, airbyte_connections)\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connection_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",\n )\n if not connection_to_io_manager_key_fn:\n connection_to_io_manager_key_fn = lambda _: io_manager_key\n\n return AirbyteManagedElementCacheableAssetsDefinition(\n airbyte_resource_def=(\n airbyte\n if isinstance(airbyte, AirbyteResource)\n else airbyte(build_init_resource_context())\n ),\n key_prefix=key_prefix,\n create_assets_for_normalization_tables=check.bool_param(\n create_assets_for_normalization_tables, "create_assets_for_normalization_tables"\n ),\n connection_to_group_fn=check.opt_callable_param(\n connection_to_group_fn, "connection_to_group_fn"\n ),\n connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,\n connections=check.iterable_param(connections, "connections", of_type=AirbyteConnection),\n connection_to_asset_key_fn=connection_to_asset_key_fn,\n connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,\n )
\n
", "current_page_name": "_modules/dagster_airbyte/managed/reconciliation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.reconciliation"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.managed.types

\nimport json\nfrom abc import ABC\nfrom enum import Enum\nfrom typing import Any, Dict, List, Mapping, Optional, Union\n\nimport dagster._check as check\nfrom dagster._annotations import public\n\n\n
[docs]class AirbyteSyncMode(ABC):\n """Represents the sync mode for a given Airbyte stream, which governs how Airbyte reads\n from a source and writes to a destination.\n\n For more information, see https://docs.airbyte.com/understanding-airbyte/connections/.\n """\n\n def __eq__(self, other: Any) -> bool:\n return isinstance(other, AirbyteSyncMode) and self.to_json() == other.to_json()\n\n def __init__(self, json_repr: Dict[str, Any]):\n self.json_repr = json_repr\n\n def to_json(self) -> Dict[str, Any]:\n return self.json_repr\n\n @classmethod\n def from_json(cls, json_repr: Dict[str, Any]) -> "AirbyteSyncMode":\n return cls(\n {\n k: v\n for k, v in json_repr.items()\n if k in ("syncMode", "destinationSyncMode", "cursorField", "primaryKey")\n }\n )\n\n
[docs] @public\n @classmethod\n def full_refresh_append(cls) -> "AirbyteSyncMode":\n """Syncs the entire data stream from the source, appending rows to the destination.\n\n https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-append/\n """\n return cls({"syncMode": "full_refresh", "destinationSyncMode": "append"})
\n\n
[docs] @public\n @classmethod\n def full_refresh_overwrite(cls) -> "AirbyteSyncMode":\n """Syncs the entire data stream from the source, replaces data in the destination by\n overwriting it.\n\n https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-overwrite\n """\n return cls({"syncMode": "full_refresh", "destinationSyncMode": "overwrite"})
\n\n
[docs] @public\n @classmethod\n def incremental_append(\n cls,\n cursor_field: Optional[str] = None,\n ) -> "AirbyteSyncMode":\n """Syncs only new records from the source, appending rows to the destination.\n May optionally specify the cursor field used to determine which records\n are new.\n\n https://docs.airbyte.com/understanding-airbyte/connections/incremental-append/\n """\n cursor_field = check.opt_str_param(cursor_field, "cursor_field")\n\n return cls(\n {\n "syncMode": "incremental",\n "destinationSyncMode": "append",\n **({"cursorField": [cursor_field]} if cursor_field else {}),\n }\n )
\n\n
[docs] @public\n @classmethod\n def incremental_append_dedup(\n cls,\n cursor_field: Optional[str] = None,\n primary_key: Optional[Union[str, List[str]]] = None,\n ) -> "AirbyteSyncMode":\n """Syncs new records from the source, appending to an append-only history\n table in the destination. Also generates a deduplicated view mirroring the\n source table. May optionally specify the cursor field used to determine\n which records are new, and the primary key used to determine which records\n are duplicates.\n\n https://docs.airbyte.com/understanding-airbyte/connections/incremental-append-dedup/\n """\n cursor_field = check.opt_str_param(cursor_field, "cursor_field")\n if isinstance(primary_key, str):\n primary_key = [primary_key]\n primary_key = check.opt_list_param(primary_key, "primary_key", of_type=str)\n\n return cls(\n {\n "syncMode": "incremental",\n "destinationSyncMode": "append_dedup",\n **({"cursorField": [cursor_field]} if cursor_field else {}),\n **({"primaryKey": [[x] for x in primary_key]} if primary_key else {}),\n }\n )
\n\n\n
[docs]class AirbyteSource:\n """Represents a user-defined Airbyte source.\n\n Args:\n name (str): The display name of the source.\n source_type (str): The type of the source, from Airbyte's list\n of sources https://airbytehq.github.io/category/sources/.\n source_configuration (Mapping[str, Any]): The configuration for the\n source, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(self, name: str, source_type: str, source_configuration: Mapping[str, Any]):\n self.name = check.str_param(name, "name")\n self.source_type = check.str_param(source_type, "source_type")\n self.source_configuration = check.mapping_param(\n source_configuration, "source_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteSource") -> bool:\n return self.name != other.name or self.source_type != other.source_type
\n\n\nclass InitializedAirbyteSource:\n """User-defined Airbyte source bound to actual created Airbyte source."""\n\n def __init__(self, source: AirbyteSource, source_id: str, source_definition_id: Optional[str]):\n self.source = source\n self.source_id = source_id\n self.source_definition_id = source_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n source=AirbyteSource(\n name=api_json["name"],\n source_type=api_json["sourceName"],\n source_configuration=api_json["connectionConfiguration"],\n ),\n source_id=api_json["sourceId"],\n source_definition_id=None,\n )\n\n\n
[docs]class AirbyteDestination:\n """Represents a user-defined Airbyte destination.\n\n Args:\n name (str): The display name of the destination.\n destination_type (str): The type of the destination, from Airbyte's list\n of destinations https://airbytehq.github.io/category/destinations/.\n destination_configuration (Mapping[str, Any]): The configuration for the\n destination, as defined by Airbyte's API.\n """\n\n
[docs] @public\n def __init__(\n self, name: str, destination_type: str, destination_configuration: Mapping[str, Any]\n ):\n self.name = check.str_param(name, "name")\n self.destination_type = check.str_param(destination_type, "destination_type")\n self.destination_configuration = check.mapping_param(\n destination_configuration, "destination_configuration", key_type=str\n )
\n\n def must_be_recreated(self, other: "AirbyteDestination") -> bool:\n return self.name != other.name or self.destination_type != other.destination_type
\n\n\nclass InitializedAirbyteDestination:\n """User-defined Airbyte destination bound to actual created Airbyte destination."""\n\n def __init__(\n self,\n destination: AirbyteDestination,\n destination_id: str,\n destination_definition_id: Optional[str],\n ):\n self.destination = destination\n self.destination_id = destination_id\n self.destination_definition_id = destination_definition_id\n\n @classmethod\n def from_api_json(cls, api_json: Mapping[str, Any]):\n return cls(\n destination=AirbyteDestination(\n name=api_json["name"],\n destination_type=api_json["destinationName"],\n destination_configuration=api_json["connectionConfiguration"],\n ),\n destination_id=api_json["destinationId"],\n destination_definition_id=None,\n )\n\n\nclass AirbyteDestinationNamespace(Enum):\n """Represents the sync mode for a given Airbyte stream."""\n\n SAME_AS_SOURCE = "source"\n DESTINATION_DEFAULT = "destination"\n\n\n
[docs]class AirbyteConnection:\n """A user-defined Airbyte connection, pairing an Airbyte source and destination and configuring\n which streams to sync.\n\n Args:\n name (str): The display name of the connection.\n source (AirbyteSource): The source to sync from.\n destination (AirbyteDestination): The destination to sync to.\n stream_config (Mapping[str, AirbyteSyncMode]): A mapping from stream name to\n the sync mode for that stream, including any additional configuration\n of primary key or cursor field.\n normalize_data (Optional[bool]): Whether to normalize the data in the\n destination.\n destination_namespace (Optional[Union[AirbyteDestinationNamespace, str]]):\n The namespace to sync to in the destination. If set to\n AirbyteDestinationNamespace.SAME_AS_SOURCE, the namespace will be the\n same as the source namespace. If set to\n AirbyteDestinationNamespace.DESTINATION_DEFAULT, the namespace will be\n the default namespace for the destination. If set to a string, the\n namespace will be that string.\n prefix (Optional[str]): A prefix to add to the table names in the destination.\n\n Example:\n .. code-block:: python\n\n from dagster_airbyte.managed.generated.sources import FileSource\n from dagster_airbyte.managed.generated.destinations import LocalJsonDestination\n from dagster_airbyte import AirbyteConnection, AirbyteSyncMode\n\n cereals_csv_source = FileSource(...)\n local_json_destination = LocalJsonDestination(...)\n\n cereals_connection = AirbyteConnection(\n name="download-cereals",\n source=cereals_csv_source,\n destination=local_json_destination,\n stream_config={"cereals": AirbyteSyncMode.full_refresh_overwrite()},\n )\n """\n\n
[docs] @public\n def __init__(\n self,\n name: str,\n source: AirbyteSource,\n destination: AirbyteDestination,\n stream_config: Mapping[str, AirbyteSyncMode],\n normalize_data: Optional[bool] = None,\n destination_namespace: Optional[\n Union[AirbyteDestinationNamespace, str]\n ] = AirbyteDestinationNamespace.SAME_AS_SOURCE,\n prefix: Optional[str] = None,\n ):\n self.name = check.str_param(name, "name")\n self.source = check.inst_param(source, "source", AirbyteSource)\n self.destination = check.inst_param(destination, "destination", AirbyteDestination)\n self.stream_config = check.mapping_param(\n stream_config, "stream_config", key_type=str, value_type=AirbyteSyncMode\n )\n self.normalize_data = check.opt_bool_param(normalize_data, "normalize_data")\n self.destination_namespace = check.opt_inst_param(\n destination_namespace, "destination_namespace", (str, AirbyteDestinationNamespace)\n )\n self.prefix = check.opt_str_param(prefix, "prefix")
\n\n def must_be_recreated(self, other: Optional["AirbyteConnection"]) -> bool:\n return (\n not other\n or self.source.must_be_recreated(other.source)\n or self.destination.must_be_recreated(other.destination)\n )
\n\n\nclass InitializedAirbyteConnection:\n """User-defined Airbyte connection bound to actual created Airbyte connection."""\n\n def __init__(\n self,\n connection: AirbyteConnection,\n connection_id: str,\n ):\n self.connection = connection\n self.connection_id = connection_id\n\n @classmethod\n def from_api_json(\n cls,\n api_dict: Mapping[str, Any],\n init_sources: Mapping[str, InitializedAirbyteSource],\n init_dests: Mapping[str, InitializedAirbyteDestination],\n ):\n source = next(\n (\n source.source\n for source in init_sources.values()\n if source.source_id == api_dict["sourceId"]\n ),\n None,\n )\n dest = next(\n (\n dest.destination\n for dest in init_dests.values()\n if dest.destination_id == api_dict["destinationId"]\n ),\n None,\n )\n\n source = check.not_none(source, f"Could not find source with id {api_dict['sourceId']}")\n dest = check.not_none(\n dest, f"Could not find destination with id {api_dict['destinationId']}"\n )\n\n streams = {\n stream["stream"]["name"]: AirbyteSyncMode.from_json(stream["config"])\n for stream in api_dict["syncCatalog"]["streams"]\n }\n return cls(\n AirbyteConnection(\n name=api_dict["name"],\n source=source,\n destination=dest,\n stream_config=streams,\n normalize_data=len(api_dict["operationIds"]) > 0,\n destination_namespace=(\n api_dict["namespaceFormat"]\n if api_dict["namespaceDefinition"] == "customformat"\n else AirbyteDestinationNamespace(api_dict["namespaceDefinition"])\n ),\n prefix=api_dict["prefix"] if api_dict.get("prefix") else None,\n ),\n api_dict["connectionId"],\n )\n\n\ndef _remove_none_values(obj: Dict[str, Any]) -> Dict[str, Any]:\n return {k: v for k, v in obj.items() if v is not None}\n\n\ndef _dump_class(obj: Any) -> Dict[str, Any]:\n return json.loads(json.dumps(obj, default=lambda o: _remove_none_values(o.__dict__)))\n\n\nclass GeneratedAirbyteSource(AirbyteSource):\n """Base class used by the codegen Airbyte sources. 
This class is not intended to be used directly.\n\n Converts all of its attributes into a source configuration dict which is passed down to the base\n AirbyteSource class.\n """\n\n def __init__(self, source_type: str, name: str):\n source_configuration = _dump_class(self)\n super().__init__(\n name=name, source_type=source_type, source_configuration=source_configuration\n )\n\n\nclass GeneratedAirbyteDestination(AirbyteDestination):\n """Base class used by the codegen Airbyte destinations. This class is not intended to be used directly.\n\n Converts all of its attributes into a destination configuration dict which is passed down to the\n base AirbyteDestination class.\n """\n\n def __init__(self, source_type: str, name: str):\n destination_configuration = _dump_class(self)\n super().__init__(\n name=name,\n destination_type=source_type,\n destination_configuration=destination_configuration,\n )\n
", "current_page_name": "_modules/dagster_airbyte/managed/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.managed.types"}}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.ops

\nfrom typing import Any, Iterable, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom dagster_airbyte.types import AirbyteOutput\nfrom dagster_airbyte.utils import _get_attempt, generate_materializations\n\nfrom .resources import DEFAULT_POLL_INTERVAL_SECONDS, BaseAirbyteResource\n\n\nclass AirbyteSyncConfig(Config):\n    connection_id: str = Field(\n        ...,\n        description=(\n            "Parsed json dictionary representing the details of the Airbyte connector after the"\n            " sync successfully completes. See the [Airbyte API"\n            " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n            " to see detailed information on this response."\n        ),\n    )\n    poll_interval: float = Field(\n        DEFAULT_POLL_INTERVAL_SECONDS,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    poll_timeout: Optional[float] = Field(\n        None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        True,\n        description=(\n            "If True, materializations corresponding to the results of the Airbyte sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        ["airbyte"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n AirbyteOutput,\n description=(\n "Parsed json dictionary representing the details of the Airbyte connector after the"\n " sync successfully completes. See the [Airbyte API"\n " Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview)"\n " to see detailed information on this response."\n ),\n ),\n tags={"kind": "airbyte"},\n)\ndef airbyte_sync_op(\n context, config: AirbyteSyncConfig, airbyte: BaseAirbyteResource\n) -> Iterable[Any]:\n """Executes a Airbyte job sync for a given ``connection_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a AirbyteOutput which contains\n the job details for a given ``connection_id``.\n\n It requires the use of the :py:class:`~dagster_airbyte.airbyte_resource`, which allows it to\n communicate with the Airbyte API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource, airbyte_sync_op\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n }\n )\n\n sync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_simple_airbyte_job():\n sync_foobar()\n\n @job(resource_defs={"airbyte": my_airbyte_resource})\n def my_composed_airbyte_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n airbyte_output = airbyte.sync_and_poll(\n connection_id=config.connection_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n airbyte_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(\n airbyte_output,\n metadata={\n **_get_attempt(airbyte_output.job_details.get("attempts", [{}])[-1]).get(\n "totalStats", {}\n )\n 
},\n )
\n
", "current_page_name": "_modules/dagster_airbyte/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airbyte.resources

\nimport hashlib\nimport json\nimport logging\nimport sys\nimport time\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, List, Mapping, Optional, cast\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom dagster_airbyte.types import AirbyteOutput\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n\n\nclass AirbyteState:\n    RUNNING = "running"\n    SUCCEEDED = "succeeded"\n    CANCELLED = "cancelled"\n    PENDING = "pending"\n    FAILED = "failed"\n    ERROR = "error"\n    INCOMPLETE = "incomplete"\n\n\nclass AirbyteResourceState:\n    def __init__(self) -> None:\n        self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}\n        # Int in case we nest contexts\n        self.cache_enabled = 0\n\n\nclass BaseAirbyteResource(ConfigurableResource):\n    request_max_retries: int = Field(\n        default=3,\n        description=(\n            "The maximum number of times requests to the Airbyte API should be retried "\n            "before failing."\n        ),\n    )\n    request_retry_delay: float = Field(\n        default=0.25,\n        description="Time (in seconds) to wait between each request retry.",\n    )\n    request_timeout: int = Field(\n        default=15,\n        description="Time (in seconds) after which the requests to Airbyte are declared timed out.",\n    )\n    cancel_sync_on_run_termination: bool = Field(\n        default=True,\n        description=(\n            "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. 
This may"\n            " be useful to disable if using Airbyte sources that cannot be cancelled and"\n            " resumed easily, or if your Dagster deployment may experience runner interruptions"\n            " that do not impact your Airbyte deployment."\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL_SECONDS,\n        description="Time (in seconds) to wait between checking a sync's status.",\n    )\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    @property\n    @cached_method\n    def _log(self) -> logging.Logger:\n        return get_dagster_logger()\n\n    @property\n    @abstractmethod\n    def api_base_url(self) -> str:\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        raise NotImplementedError()\n\n    def make_request(\n        self, endpoint: str, data: Optional[Mapping[str, object]] = None, method: str = "POST"\n    ) -> Optional[Mapping[str, object]]:\n        """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n        Args:\n            endpoint (str): The Airbyte API endpoint to send this request to.\n            data (Optional[str]): JSON-formatted data string to be included in the request.\n\n        Returns:\n            Optional[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        url = self.api_base_url + endpoint\n        headers = {"accept": "application/json"}\n\n        num_retries = 0\n        while True:\n            try:\n                request_args: Dict[str, Any] = dict(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    timeout=self.request_timeout,\n                )\n                if data:\n                    request_args["json"] = data\n\n                request_args = deep_merge_dicts(\n                    
request_args,\n                    self.all_additional_request_params,\n                )\n\n                response = requests.request(\n                    **request_args,\n                )\n                response.raise_for_status()\n                if response.status_code == 204:\n                    return None\n                return response.json()\n            except RequestException as e:\n                self._log.error("Request to Airbyte API failed: %s", e)\n                if num_retries == self.request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self.request_retry_delay)\n\n        raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n    @abstractmethod\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        raise NotImplementedError()\n\n    @abstractmethod\n    def cancel_job(self, job_id: int):\n        raise NotImplementedError()\n\n    @property\n    @abstractmethod\n    def _should_forward_logs(self) -> bool:\n        raise NotImplementedError()\n\n    def sync_and_poll(\n        self,\n        connection_id: str,\n        poll_interval: Optional[float] = None,\n        poll_timeout: Optional[float] = None,\n    ) -> AirbyteOutput:\n        """Initializes a sync operation for the given connector, and polls until it completes.\n\n        Args:\n            connection_id (str): The Airbyte Connector ID. 
You can retrieve this value from the\n                "Connection" tab of a given connection in the Arbyte UI.\n            poll_interval (float): The time (in seconds) that will be waited between successive polls.\n            poll_timeout (float): The maximum time that will waited before this operation is timed\n                out. By default, this will never time out.\n\n        Returns:\n            :py:class:`~AirbyteOutput`:\n                Details of the sync job.\n        """\n        connection_details = self.get_connection_details(connection_id)\n        job_details = self.start_sync(connection_id)\n        job_info = cast(Dict[str, object], job_details.get("job", {}))\n        job_id = cast(int, job_info.get("id"))\n\n        self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n        start = time.monotonic()\n        logged_attempts = 0\n        logged_lines = 0\n        state = None\n\n        try:\n            while True:\n                if poll_timeout and start + poll_timeout < time.monotonic():\n                    raise Failure(\n                        f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n                        f" {poll_timeout} seconds"\n                    )\n                time.sleep(poll_interval or self.poll_interval)\n                job_details = self.get_job_status(connection_id, job_id)\n                attempts = cast(List, job_details.get("attempts", []))\n                cur_attempt = len(attempts)\n                # spit out the available Airbyte log info\n                if cur_attempt:\n                    if self._should_forward_logs:\n                        log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n                        for line in log_lines[logged_lines:]:\n                            sys.stdout.write(line + "\\n")\n                            sys.stdout.flush()\n                        logged_lines = len(log_lines)\n\n             
       # if there's a next attempt, this one will have no more log messages\n                    if logged_attempts < cur_attempt - 1:\n                        logged_lines = 0\n                        logged_attempts += 1\n\n                job_info = cast(Dict[str, object], job_details.get("job", {}))\n                state = job_info.get("status")\n\n                if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n                    continue\n                elif state == AirbyteState.SUCCEEDED:\n                    break\n                elif state == AirbyteState.ERROR:\n                    raise Failure(f"Job failed: {job_id}")\n                elif state == AirbyteState.CANCELLED:\n                    raise Failure(f"Job was cancelled: {job_id}")\n                else:\n                    raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n        finally:\n            # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n            # the python process\n            if (\n                state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n                and self.cancel_sync_on_run_termination\n            ):\n                self.cancel_job(job_id)\n\n        return AirbyteOutput(job_details=job_details, connection_details=connection_details)\n\n\nclass AirbyteCloudResource(BaseAirbyteResource):\n    """This resource allows users to programatically interface with the Airbyte Cloud API to launch\n    syncs and monitor their progress.\n\n    **Examples:**\n\n    .. 
code-block:: python\n\n        from dagster import job, EnvVar\n        from dagster_airbyte import AirbyteResource\n\n        my_airbyte_resource = AirbyteCloudResource(\n            api_key=EnvVar("AIRBYTE_API_KEY"),\n        )\n\n        airbyte_assets = build_airbyte_assets(\n            connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n            destination_tables=["releases", "tags", "teams"],\n        )\n\n        defs = Definitions(\n            assets=[airbyte_assets],\n            resources={"airbyte": my_airbyte_resource},\n        )\n    """\n\n    api_key: str = Field(..., description="The Airbyte Cloud API key.")\n\n    @property\n    def api_base_url(self) -> str:\n        return "https://api.airbyte.com/v1"\n\n    @property\n    def all_additional_request_params(self) -> Mapping[str, Any]:\n        return {"headers": {"Authorization": f"Bearer {self.api_key}", "User-Agent": "dagster"}}\n\n    def start_sync(self, connection_id: str) -> Mapping[str, object]:\n        job_sync = check.not_none(\n            self.make_request(\n                endpoint="/jobs",\n                data={\n                    "connectionId": connection_id,\n                    "jobType": "sync",\n                },\n            )\n        )\n        return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}\n\n    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n        return {}\n\n    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n        job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))\n        return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}\n\n    def cancel_job(self, job_id: int):\n        self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")\n\n    @property\n    def _should_forward_logs(self) -> bool:\n        # Airbyte Cloud does not support streaming logs yet\n        return False\n\n\n
[docs]class AirbyteResource(BaseAirbyteResource):\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job, EnvVar\n from dagster_airbyte import AirbyteResource\n\n my_airbyte_resource = AirbyteResource(\n host=EnvVar("AIRBYTE_HOST"),\n port=EnvVar("AIRBYTE_PORT"),\n # If using basic auth\n username=EnvVar("AIRBYTE_USERNAME"),\n password=EnvVar("AIRBYTE_PASSWORD"),\n )\n\n airbyte_assets = build_airbyte_assets(\n connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n destination_tables=["releases", "tags", "teams"],\n )\n\n defs = Definitions(\n assets=[airbyte_assets],\n resources={"airbyte": my_airbyte_resource},\n )\n """\n\n host: str = Field(description="The Airbyte server address.")\n port: str = Field(description="Port used for the Airbyte server.")\n username: Optional[str] = Field(default=None, description="Username if using basic auth.")\n password: Optional[str] = Field(default=None, description="Password if using basic auth.")\n use_https: bool = Field(\n default=False, description="Whether to use HTTPS to connect to the Airbyte server."\n )\n forward_logs: bool = Field(\n default=True,\n description=(\n "Whether to forward Airbyte logs to the compute log, can be expensive for"\n " long-running syncs."\n ),\n )\n request_additional_params: Mapping[str, Any] = Field(\n default=dict(),\n description=(\n "Any additional kwargs to pass to the requests library when making requests to Airbyte."\n ),\n )\n\n @property\n @cached_method\n def _state(self) -> AirbyteResourceState:\n return AirbyteResourceState()\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return (\n ("https://" if self.use_https else "http://")\n + (f"{self.host}:{self.port}" if self.port else self.host)\n + "/api/v1"\n )\n\n @property\n def 
_should_forward_logs(self) -> bool:\n return self.forward_logs\n\n @contextmanager\n def cache_requests(self):\n """Context manager that enables caching certain requests to the Airbyte API,\n cleared when the context is exited.\n """\n self.clear_request_cache()\n self._state.cache_enabled += 1\n try:\n yield\n finally:\n self.clear_request_cache()\n self._state.cache_enabled -= 1\n\n def clear_request_cache(self) -> None:\n self._state.request_cache = {}\n\n def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):\n if not self._state.cache_enabled > 0:\n return self.make_request(endpoint, data)\n data_json = json.dumps(data, sort_keys=True)\n sha = hashlib.sha1()\n sha.update(endpoint.encode("utf-8"))\n sha.update(data_json.encode("utf-8"))\n digest = sha.hexdigest()\n\n if digest not in self._state.request_cache:\n self._state.request_cache[digest] = self.make_request(endpoint, data)\n return self._state.request_cache[digest]\n\n @property\n def all_additional_request_params(self) -> Mapping[str, Any]:\n auth_param = (\n {"auth": (self.username, self.password)} if self.username and self.password else {}\n )\n return {**auth_param, **self.request_additional_params}\n\n def make_request(\n self, endpoint: str, data: Optional[Mapping[str, object]]\n ) -> Optional[Mapping[str, object]]:\n """Creates and sends a request to the desired Airbyte REST API endpoint.\n\n Args:\n endpoint (str): The Airbyte API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Optional[Dict[str, Any]]: Parsed json data from the response to this request\n """\n url = self.api_base_url + endpoint\n headers = {"accept": "application/json"}\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n **deep_merge_dicts( # type: ignore\n dict(\n method="POST",\n url=url,\n headers=headers,\n json=data,\n timeout=self.request_timeout,\n auth=(\n (self.username, self.password)\n 
if self.username and self.password\n else None\n ),\n ),\n self.request_additional_params,\n ),\n )\n response.raise_for_status()\n if response.status_code == 204:\n return None\n return response.json()\n except RequestException as e:\n self._log.error("Request to Airbyte API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def cancel_job(self, job_id: int):\n self.make_request(endpoint="/jobs/cancel", data={"id": job_id})\n\n def get_default_workspace(self) -> str:\n workspaces = cast(\n List[Dict[str, Any]],\n check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(\n "workspaces", []\n ),\n )\n return workspaces[0]["workspaceId"]\n\n def get_source_definition_by_name(self, name: str) -> Optional[str]:\n name_lower = name.lower()\n definitions = self.make_request_cached(endpoint="/source_definitions/list", data={})\n\n return next(\n (\n definition["sourceDefinitionId"]\n for definition in definitions["sourceDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_destination_definition_by_name(self, name: str):\n name_lower = name.lower()\n definitions = cast(\n Dict[str, List[Dict[str, str]]],\n check.not_none(\n self.make_request_cached(endpoint="/destination_definitions/list", data={})\n ),\n )\n return next(\n (\n definition["destinationDefinitionId"]\n for definition in definitions["destinationDefinitions"]\n if definition["name"].lower() == name_lower\n ),\n None,\n )\n\n def get_source_catalog_id(self, source_id: str):\n result = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n return result["catalogId"]\n\n def get_source_schema(self, source_id: str) -> Mapping[str, Any]:\n return cast(\n Dict[str, Any],\n check.not_none(\n 
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})\n ),\n )\n\n def does_dest_support_normalization(\n self, destination_definition_id: str, workspace_id: str\n ) -> bool:\n # Airbyte API changed source of truth for normalization in PR\n # https://github.com/airbytehq/airbyte/pull/21005\n norm_dest_def_spec: bool = cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definition_specifications/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n "workspaceId": workspace_id,\n },\n )\n ),\n ).get("supportsNormalization", False)\n\n norm_dest_def: bool = (\n cast(\n Dict[str, Any],\n check.not_none(\n self.make_request_cached(\n endpoint="/destination_definitions/get",\n data={\n "destinationDefinitionId": destination_definition_id,\n },\n )\n ),\n )\n .get("normalizationConfig", {})\n .get("supported", False)\n )\n\n return any([norm_dest_def_spec, norm_dest_def])\n\n def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:\n if self.forward_logs:\n return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))\n else:\n # the "list all jobs" endpoint doesn't return logs, which actually makes it much more\n # lightweight for long-running syncs with many logs\n out = check.not_none(\n self.make_request(\n endpoint="/jobs/list",\n data={\n "configTypes": ["sync"],\n "configId": connection_id,\n # sync should be the most recent, so pageSize 5 is sufficient\n "pagination": {"pageSize": 5},\n },\n )\n )\n job = next((job for job in cast(List, out["jobs"]) if job["job"]["id"] == job_id), None)\n\n return check.not_none(job)\n\n def start_sync(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})\n )\n\n def get_connection_details(self, connection_id: str) -> Mapping[str, object]:\n return check.not_none(\n 
self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})\n )\n\n def sync_and_poll(\n self,\n connection_id: str,\n poll_interval: Optional[float] = None,\n poll_timeout: Optional[float] = None,\n ) -> AirbyteOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connection_id (str): The Airbyte Connector ID. You can retrieve this value from the\n "Connection" tab of a given connection in the Arbyte UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~AirbyteOutput`:\n Details of the sync job.\n """\n connection_details = self.get_connection_details(connection_id)\n job_details = self.start_sync(connection_id)\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n job_id = cast(int, job_info.get("id"))\n\n self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")\n start = time.monotonic()\n logged_attempts = 0\n logged_lines = 0\n state = None\n\n try:\n while True:\n if poll_timeout and start + poll_timeout < time.monotonic():\n raise Failure(\n f"Timeout: Airbyte job {job_id} is not ready after the timeout"\n f" {poll_timeout} seconds"\n )\n time.sleep(poll_interval or self.poll_interval)\n job_details = self.get_job_status(connection_id, job_id)\n attempts = cast(List, job_details.get("attempts", []))\n cur_attempt = len(attempts)\n # spit out the available Airbyte log info\n if cur_attempt:\n if self.forward_logs:\n log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])\n\n for line in log_lines[logged_lines:]:\n sys.stdout.write(line + "\\n")\n sys.stdout.flush()\n logged_lines = len(log_lines)\n\n # if there's a next attempt, this one will have no more log messages\n if logged_attempts < cur_attempt - 1:\n logged_lines = 0\n 
logged_attempts += 1\n\n job_info = cast(Dict[str, object], job_details.get("job", {}))\n state = job_info.get("status")\n\n if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):\n continue\n elif state == AirbyteState.SUCCEEDED:\n break\n elif state == AirbyteState.ERROR:\n raise Failure(f"Job failed: {job_id}")\n elif state == AirbyteState.CANCELLED:\n raise Failure(f"Job was cancelled: {job_id}")\n else:\n raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")\n finally:\n # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive\n # the python process\n if (\n state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)\n and self.cancel_sync_on_run_termination\n ):\n self.cancel_job(job_id)\n\n return AirbyteOutput(job_details=job_details, connection_details=connection_details)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=AirbyteResource.to_config_schema())\ndef airbyte_resource(context) -> AirbyteResource:\n """This resource allows users to programatically interface with the Airbyte REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Airbyte REST API, including expected response JSON\n schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_airbyte import airbyte_resource\n\n my_airbyte_resource = airbyte_resource.configured(\n {\n "host": {"env": "AIRBYTE_HOST"},\n "port": {"env": "AIRBYTE_PORT"},\n # If using basic auth\n "username": {"env": "AIRBYTE_USERNAME"},\n "password": {"env": "AIRBYTE_PASSWORD"},\n }\n )\n\n @job(resource_defs={"airbyte":my_airbyte_resource})\n def my_airbyte_job():\n ...\n\n """\n return AirbyteResource.from_resource_context(context)
\n\n\n@dagster_maintained_resource\n@resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))\ndef airbyte_cloud_resource(context) -> AirbyteCloudResource:\n """This resource allows users to programatically interface with the Airbyte Cloud REST API to launch\n syncs and monitor their progress. Currently, this resource may only be used with the more basic\n `dagster-airbyte` APIs, including the ops and assets.\n\n """\n return AirbyteCloudResource.from_resource_context(context)\n
", "current_page_name": "_modules/dagster_airbyte/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airbyte.resources"}}, "dagster_airflow": {"dagster_asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_asset_factory

\nfrom typing import AbstractSet, List, Mapping, Optional, Set, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    AssetKey,\n    AssetsDefinition,\n    GraphDefinition,\n    OutputMapping,\n    TimeWindowPartitionsDefinition,\n)\nfrom dagster._core.definitions.graph_definition import create_adjacency_lists\nfrom dagster._utils.schedules import is_valid_cron_schedule\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.utils import (\n    DagsterAirflowError,\n    normalized_name,\n)\n\n\ndef _build_asset_dependencies(\n    dag: DAG,\n    graph: GraphDefinition,\n    task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]],\n    upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]],\n) -> Tuple[AbstractSet[OutputMapping], Mapping[str, AssetKey], Mapping[str, Set[AssetKey]]]:\n    """Builds the asset dependency graph for a given set of airflow task mappings and a dagster graph."""\n    output_mappings = set()\n    keys_by_output_name = {}\n    internal_asset_deps: dict[str, Set[AssetKey]] = {}\n\n    visited_nodes: dict[str, bool] = {}\n    upstream_deps = set()\n\n    def find_upstream_dependency(node_name: str) -> None:\n        """Uses Depth-Firs-Search to find all upstream asset dependencies\n        as described in task_ids_by_asset_key.\n        """\n        # node has been visited\n        if visited_nodes[node_name]:\n            return\n        # mark node as visted\n        visited_nodes[node_name] = True\n        # traverse upstream nodes\n        for output_handle in graph.dependency_structure.all_upstream_outputs_from_node(node_name):\n            forward_node = output_handle.node_name\n            match = False\n            # find any assets produced by upstream nodes and add them to the internal asset deps\n            for asset_key in task_ids_by_asset_key:\n                if (\n                    
forward_node.replace(f"{normalized_name(dag.dag_id)}__", "")\n                    in task_ids_by_asset_key[asset_key]\n                ):\n                    upstream_deps.add(asset_key)\n                    match = True\n            # don't traverse past nodes that have assets\n            if not match:\n                find_upstream_dependency(forward_node)\n\n    # iterate through each asset to find all upstream asset dependencies\n    for asset_key in task_ids_by_asset_key:\n        asset_upstream_deps = set()\n        for task_id in task_ids_by_asset_key[asset_key]:\n            visited_nodes = {s.name: False for s in graph.nodes}\n            upstream_deps = set()\n            find_upstream_dependency(normalized_name(dag.dag_id, task_id))\n            for dep in upstream_deps:\n                asset_upstream_deps.add(dep)\n            keys_by_output_name[f"result_{normalized_name(dag.dag_id, task_id)}"] = asset_key\n            output_mappings.add(\n                OutputMapping(\n                    graph_output_name=f"result_{normalized_name(dag.dag_id, task_id)}",\n                    mapped_node_name=normalized_name(dag.dag_id, task_id),\n                    mapped_node_output_name="airflow_task_complete",  # Default output name\n                )\n            )\n\n        # the tasks for a given asset should have the same internal deps\n        for task_id in task_ids_by_asset_key[asset_key]:\n            if f"result_{normalized_name(dag.dag_id, task_id)}" in internal_asset_deps:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"].update(\n                    asset_upstream_deps\n                )\n            else:\n                internal_asset_deps[f"result_{normalized_name(dag.dag_id, task_id)}"] = (\n                    asset_upstream_deps\n                )\n\n    # add new upstream asset dependencies to the internal deps\n    for asset_key in upstream_dependencies_by_asset_key:\n        for key in 
keys_by_output_name:\n            if keys_by_output_name[key] == asset_key:\n                internal_asset_deps[key].update(upstream_dependencies_by_asset_key[asset_key])\n\n    return (output_mappings, keys_by_output_name, internal_asset_deps)\n\n\n
[docs]def load_assets_from_airflow_dag(\n dag: DAG,\n task_ids_by_asset_key: Mapping[AssetKey, AbstractSet[str]] = {},\n upstream_dependencies_by_asset_key: Mapping[AssetKey, AbstractSet[AssetKey]] = {},\n connections: Optional[List[Connection]] = None,\n) -> List[AssetsDefinition]:\n """[Experimental] Construct Dagster Assets for a given Airflow DAG.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n task_ids_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[str]]]): A mapping from asset\n keys to task ids. Used break up the Airflow Dag into multiple SDAs\n upstream_dependencies_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[AssetKey]]]): A\n mapping from upstream asset keys to assets provided in task_ids_by_asset_key. Used to\n declare new upstream SDA depenencies.\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n List[AssetsDefinition]\n """\n cron_schedule = dag.normalized_schedule_interval\n if cron_schedule is not None and not is_valid_cron_schedule(str(cron_schedule)):\n raise DagsterAirflowError(f"Invalid cron schedule: {cron_schedule} in DAG {dag.dag_id}")\n\n job = make_dagster_job_from_airflow_dag(dag, connections=connections)\n graph = job._graph_def # noqa: SLF001\n start_date = dag.start_date if dag.start_date else dag.default_args.get("start_date")\n if start_date is None:\n raise DagsterAirflowError(f"Invalid start_date: {start_date} in DAG {dag.dag_id}")\n\n # leaf nodes have no downstream nodes\n forward_edges, _ = create_adjacency_lists(graph.nodes, graph.dependency_structure)\n leaf_nodes = {\n node_name.replace(f"{normalized_name(dag.dag_id)}__", "")\n for node_name, downstream_nodes in forward_edges.items()\n if not downstream_nodes\n }\n\n mutated_task_ids_by_asset_key: dict[AssetKey, set[str]] = {}\n\n if task_ids_by_asset_key is None or task_ids_by_asset_key == {}:\n # if no mappings are provided the dag becomes a single SDA\n task_ids_by_asset_key = 
{AssetKey(dag.dag_id): leaf_nodes}\n else:\n # if mappings were provide any unmapped leaf nodes are added to a default asset\n used_nodes: set[str] = set()\n for key in task_ids_by_asset_key:\n used_nodes.update(task_ids_by_asset_key[key])\n\n mutated_task_ids_by_asset_key[AssetKey(dag.dag_id)] = leaf_nodes - used_nodes\n\n for key in task_ids_by_asset_key:\n if key not in mutated_task_ids_by_asset_key:\n mutated_task_ids_by_asset_key[key] = set(task_ids_by_asset_key[key])\n else:\n mutated_task_ids_by_asset_key[key].update(task_ids_by_asset_key[key])\n\n output_mappings, keys_by_output_name, internal_asset_deps = _build_asset_dependencies(\n dag, graph, mutated_task_ids_by_asset_key, upstream_dependencies_by_asset_key\n )\n\n new_graph = graph.copy(\n output_mappings=list(output_mappings),\n )\n\n asset_def = AssetsDefinition.from_graph(\n graph_def=new_graph,\n partitions_def=(\n TimeWindowPartitionsDefinition(\n cron_schedule=str(cron_schedule),\n timezone=dag.timezone.name,\n start=start_date.strftime("%Y-%m-%dT%H:%M:%S"),\n fmt="%Y-%m-%dT%H:%M:%S",\n )\n if cron_schedule is not None\n else None\n ),\n group_name=dag.dag_id,\n keys_by_output_name=keys_by_output_name,\n internal_asset_deps=internal_asset_deps,\n can_subset=True,\n )\n return [asset_def]
\n
", "current_page_name": "_modules/dagster_airflow/dagster_asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_asset_factory"}, "dagster_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_factory

\nimport os\nfrom typing import List, Mapping, Optional, Tuple\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dagbag import DagBag\nfrom dagster import (\n    Definitions,\n    JobDefinition,\n    ResourceDefinition,\n    ScheduleDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.dagster_job_factory import make_dagster_job_from_airflow_dag\nfrom dagster_airflow.dagster_schedule_factory import (\n    _is_dag_is_schedule,\n    make_dagster_schedule_from_airflow_dag,\n)\nfrom dagster_airflow.patch_airflow_example_dag import patch_airflow_example_dag\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.resources.airflow_ephemeral_db import AirflowEphemeralDatabase\nfrom dagster_airflow.resources.airflow_persistent_db import AirflowPersistentDatabase\nfrom dagster_airflow.utils import (\n    is_airflow_2_loaded_in_environment,\n)\n\n\n
[docs]def make_dagster_definitions_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster definition corresponding to Airflow DAGs in DagBag.\n\n Usage:\n Create `make_dagster_definition.py`:\n from dagster_airflow import make_dagster_definition_from_airflow_dag_bag\n from airflow_home import my_dag_bag\n\n def make_definition_from_dag_bag():\n return make_dagster_definition_from_airflow_dag_bag(my_dag_bag)\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definition.py`\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n schedules, jobs = make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )\n\n return Definitions(\n schedules=schedules,\n jobs=jobs,\n resources=resource_defs,\n )
\n\n\n
[docs]def make_dagster_definitions_from_airflow_dags_path(\n dag_path: str,\n safe_mode: bool = True,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository corresponding to Airflow DAGs in dag_path.\n\n Usage:\n Create ``make_dagster_definitions.py``:\n\n .. code-block:: python\n\n from dagster_airflow import make_dagster_definitions_from_airflow_dags_path\n\n def make_definitions_from_dir():\n return make_dagster_definitions_from_airflow_dags_path(\n '/path/to/dags/',\n )\n\n Use RepositoryDefinition as usual, for example:\n ``dagster-webserver -f path/to/make_dagster_repo.py -n make_repo_from_dir``\n\n Args:\n dag_path (str): Path to directory or file that contains Airflow Dags\n include_examples (bool): True to include Airflow's example DAGs. (default: False)\n safe_mode (bool): True to use Airflow's default heuristic to find files that contain DAGs\n (ie find files that contain both b'DAG' and b'airflow') (default: True)\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n Definitions\n """\n check.str_param(dag_path, "dag_path")\n check.bool_param(safe_mode, "safe_mode")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n if (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowEphemeralDatabase"\n ):\n AirflowEphemeralDatabase._initialize_database(connections=connections) # noqa: SLF001\n elif (\n resource_defs["airflow_db"].resource_fn.__qualname__.split(".")[0]\n == "AirflowPersistentDatabase"\n ):\n 
AirflowPersistentDatabase._initialize_database( # noqa: SLF001\n uri=(\n os.getenv("AIRFLOW__DATABASE__SQL_ALCHEMY_CONN", "")\n if is_airflow_2_loaded_in_environment()\n else os.getenv("AIRFLOW__CORE__SQL_ALCHEMY_CONN", "")\n ),\n connections=connections,\n )\n\n dag_bag = DagBag(\n dag_folder=dag_path,\n include_examples=False, # Exclude Airflow example dags\n safe_mode=safe_mode,\n )\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag,\n connections=connections,\n resource_defs=resource_defs,\n )
\n\n\ndef make_dagster_definitions_from_airflow_example_dags(\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Definitions:\n """Construct a Dagster repository for Airflow's example DAGs.\n\n Usage:\n\n Create `make_dagster_definitions.py`:\n from dagster_airflow import make_dagster_definitions_from_airflow_example_dags\n\n def make_airflow_example_dags():\n return make_dagster_definitions_from_airflow_example_dags()\n\n Use Definitions as usual, for example:\n `dagster-webserver -f path/to/make_dagster_definitions.py`\n\n Args:\n resource_defs: Optional[Mapping[str, ResourceDefinition]]\n Resource definitions to be used with the definitions\n\n Returns:\n Definitions\n """\n dag_bag = DagBag(\n dag_folder="some/empty/folder/with/no/dags", # prevent defaulting to settings.DAGS_FOLDER\n include_examples=True,\n )\n\n # There is a bug in Airflow v1 where the python_callable for task\n # 'search_catalog' is missing a required position argument '_'. It is fixed in airflow v2\n patch_airflow_example_dag(dag_bag)\n\n return make_dagster_definitions_from_airflow_dag_bag(\n dag_bag=dag_bag, resource_defs=resource_defs\n )\n\n\n
[docs]def make_schedules_and_jobs_from_airflow_dag_bag(\n dag_bag: DagBag,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> Tuple[List[ScheduleDefinition], List[JobDefinition]]:\n """Construct Dagster Schedules and Jobs corresponding to Airflow DagBag.\n\n Args:\n dag_bag (DagBag): Airflow DagBag Model\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n\n Returns:\n - List[ScheduleDefinition]: The generated Dagster Schedules\n - List[JobDefinition]: The generated Dagster Jobs\n """\n check.inst_param(dag_bag, "dag_bag", DagBag)\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n job_defs = []\n schedule_defs = []\n count = 0\n # To enforce predictable iteration order\n sorted_dag_ids = sorted(dag_bag.dag_ids)\n for dag_id in sorted_dag_ids:\n dag = dag_bag.dags.get(dag_id)\n if not dag:\n continue\n if _is_dag_is_schedule(dag):\n schedule_defs.append(\n make_dagster_schedule_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n else:\n job_defs.append(\n make_dagster_job_from_airflow_dag(\n dag=dag, tags=None, connections=connections, resource_defs=resource_defs\n )\n )\n\n count += 1\n\n return schedule_defs, job_defs
\n
", "current_page_name": "_modules/dagster_airflow/dagster_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_factory"}, "dagster_job_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.dagster_job_factory

\nfrom typing import List, Mapping, Optional\n\nfrom airflow.models.connection import Connection\nfrom airflow.models.dag import DAG\nfrom dagster import (\n    GraphDefinition,\n    JobDefinition,\n    ResourceDefinition,\n    _check as check,\n)\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.instance import IS_AIRFLOW_INGEST_PIPELINE_STR\n\nfrom dagster_airflow.airflow_dag_converter import get_graph_definition_args\nfrom dagster_airflow.resources import (\n    make_ephemeral_airflow_db_resource as make_ephemeral_airflow_db_resource,\n)\nfrom dagster_airflow.utils import (\n    normalized_name,\n)\n\n\n
[docs]def make_dagster_job_from_airflow_dag(\n dag: DAG,\n tags: Optional[Mapping[str, str]] = None,\n connections: Optional[List[Connection]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = {},\n) -> JobDefinition:\n """Construct a Dagster job corresponding to a given Airflow DAG.\n\n Tasks in the resulting job will execute the ``execute()`` method on the corresponding\n Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\n containing your DAG definition must be available in the Python environment within which your\n Dagster solids execute.\n\n To set Airflow's ``execution_date`` for use with Airflow Operator's ``execute()`` methods,\n either:\n\n 1. (Best for ad hoc runs) Execute job directly. This will set execution_date to the\n time (in UTC) of the run.\n\n 2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override\n behavior from (1).\n\n .. code-block:: python\n\n my_dagster_job = make_dagster_job_from_airflow_dag(\n dag=dag,\n tags={'airflow_execution_date': utc_execution_date_str}\n )\n my_dagster_job.execute_in_process()\n\n 3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,\n such as in the Dagster UI. This will override behavior from (1) and (2)\n\n\n We apply normalized_name() to the dag id and task ids when generating job name and op\n names to ensure that names conform to Dagster's naming conventions.\n\n Args:\n dag (DAG): The Airflow DAG to compile into a Dagster job\n tags (Dict[str, Field]): Job tags. 
Optionally include\n `tags={'airflow_execution_date': utc_date_string}` to specify execution_date used within\n execution of Airflow Operators.\n connections (List[Connection]): List of Airflow Connections to be created in the Ephemeral\n Airflow DB, if use_emphemeral_airflow_db is False this will be ignored.\n\n Returns:\n JobDefinition: The generated Dagster job\n\n """\n check.inst_param(dag, "dag", DAG)\n tags = check.opt_mapping_param(tags, "tags")\n connections = check.opt_list_param(connections, "connections", of_type=Connection)\n\n mutated_tags = dict(tags)\n if IS_AIRFLOW_INGEST_PIPELINE_STR not in tags:\n mutated_tags[IS_AIRFLOW_INGEST_PIPELINE_STR] = "true"\n\n mutated_tags = validate_tags(mutated_tags)\n\n node_dependencies, node_defs = get_graph_definition_args(dag=dag)\n\n graph_def = GraphDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n node_defs=node_defs,\n dependencies=node_dependencies,\n tags=mutated_tags,\n )\n\n if resource_defs is None or "airflow_db" not in resource_defs:\n resource_defs = dict(resource_defs) if resource_defs else {}\n resource_defs["airflow_db"] = make_ephemeral_airflow_db_resource(connections=connections)\n\n job_def = JobDefinition(\n name=normalized_name(dag.dag_id),\n description="",\n graph_def=graph_def,\n resource_defs=resource_defs,\n tags=mutated_tags,\n metadata={},\n op_retry_policy=None,\n version_strategy=None,\n )\n return job_def
\n
", "current_page_name": "_modules/dagster_airflow/dagster_job_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.dagster_job_factory"}, "operators": {"dagster_operator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.operators.dagster_operator

\nimport json\n\nfrom airflow.models import BaseOperator\nfrom airflow.utils.decorators import apply_defaults\n\nfrom dagster_airflow.hooks.dagster_hook import DagsterHook\nfrom dagster_airflow.links.dagster_link import LINK_FMT, DagsterLink\nfrom dagster_airflow.utils import is_airflow_2_loaded_in_environment\n\n\n
[docs]class DagsterOperator(BaseOperator):\n """DagsterOperator.\n\n Uses the dagster graphql api to run and monitor dagster jobs on remote dagster infrastructure\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """\n\n template_fields = ["run_config"]\n template_ext = (".yaml", ".yml", ".json")\n ui_color = "#663399"\n ui_fgcolor = "#e0e3fc"\n operator_extra_links = (DagsterLink(),)\n\n @apply_defaults\n def __init__(\n self,\n dagster_conn_id="dagster_default",\n run_config=None,\n repository_name="",\n repostitory_location_name="",\n job_name="",\n # params for airflow < 2.0.0 were custom connections aren't supported\n deployment_name="prod",\n user_token=None,\n organization_id="",\n url="https://dagster.cloud/",\n *args,\n **kwargs,\n ) -> None:\n super().__init__(*args, **kwargs)\n self.run_id = None\n self.dagster_conn_id = dagster_conn_id if is_airflow_2_loaded_in_environment() else None\n self.run_config = run_config or {}\n self.repository_name = repository_name\n self.repostitory_location_name = repostitory_location_name\n self.job_name = job_name\n\n self.user_token = user_token\n self.url = url\n self.organization_id = organization_id\n self.deployment_name = deployment_name\n\n self.hook = DagsterHook(\n dagster_conn_id=self.dagster_conn_id,\n user_token=self.user_token,\n url=f"{self.url}{self.organization_id}/{self.deployment_name}/graphql",\n )\n\n def _is_json(self, blob):\n try:\n json.loads(blob)\n except ValueError:\n 
return False\n return True\n\n def pre_execute(self, context):\n # force re-rendering to ensure run_config renders any templated\n # content from run_config that couldn't be accessed on init\n setattr(\n self,\n "run_config",\n self.render_template(self.run_config, context),\n )\n\n def on_kill(self):\n self.log.info("Terminating Run")\n self.hook.terminate_run(\n run_id=self.run_id,\n )\n\n def execute(self, context):\n try:\n return self._execute(context)\n except Exception as e:\n raise e\n\n def _execute(self, context):\n self.run_id = self.hook.launch_run(\n repository_name=self.repository_name,\n repostitory_location_name=self.repostitory_location_name,\n job_name=self.job_name,\n run_config=self.run_config,\n )\n # save relevant info in xcom for use in links\n context["task_instance"].xcom_push(key="run_id", value=self.run_id)\n context["task_instance"].xcom_push(\n key="organization_id",\n value=self.hook.organization_id if self.dagster_conn_id else self.organization_id,\n )\n context["task_instance"].xcom_push(\n key="deployment_name",\n value=self.hook.deployment_name if self.dagster_conn_id else self.deployment_name,\n )\n\n self.log.info("Run Starting....")\n self.log.info(\n "Run tracking: %s",\n LINK_FMT.format(\n organization_id=self.hook.organization_id,\n deployment_name=self.hook.deployment_name,\n run_id=self.run_id,\n ),\n )\n self.hook.wait_for_run(\n run_id=self.run_id,\n )
\n\n\n
[docs]class DagsterCloudOperator(DagsterOperator):\n """DagsterCloudOperator.\n\n Uses the dagster cloud graphql api to run and monitor dagster jobs on dagster cloud\n\n Parameters:\n repository_name (str): the name of the repository to use\n repostitory_location_name (str): the name of the repostitory location to use\n job_name (str): the name of the job to run\n run_config (Optional[Dict[str, Any]]): the run config to use for the job run\n dagster_conn_id (Optional[str]): the id of the dagster connection, airflow 2.0+ only\n organization_id (Optional[str]): the id of the dagster cloud organization\n deployment_name (Optional[str]): the name of the dagster cloud deployment\n user_token (Optional[str]): the dagster cloud user token to use\n """
\n
", "current_page_name": "_modules/dagster_airflow/operators/dagster_operator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.operators.dagster_operator"}}, "resources": {"airflow_ephemeral_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_ephemeral_db

\nimport importlib\nimport os\nimport tempfile\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom airflow.utils import db\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    Noneable,\n    ResourceDefinition,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    Locker,\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowEphemeralDatabase(AirflowDatabase):\n    """A ephemeral Airflow database Dagster resource."""\n\n    def __init__(\n        self, airflow_home_path: str, dagster_run: DagsterRun, dag_run_config: Optional[dict] = None\n    ):\n        self.airflow_home_path = airflow_home_path\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(\n        airflow_home_path: str = os.path.join(tempfile.gettempdir(), "dagster_airflow"),\n        connections: List[Connection] = [],\n    ):\n        os.environ["AIRFLOW_HOME"] = airflow_home_path\n        os.makedirs(airflow_home_path, exist_ok=True)\n        with Locker(airflow_home_path):\n            airflow_initialized = os.path.exists(f"{airflow_home_path}/airflow.db")\n            # because AIRFLOW_HOME has been overriden airflow needs to be reloaded\n            if is_airflow_2_loaded_in_environment():\n                importlib.reload(airflow.configuration)\n                importlib.reload(airflow.settings)\n                importlib.reload(airflow)\n            else:\n                importlib.reload(airflow)\n            if not airflow_initialized:\n                db.initdb()\n                create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowEphemeralDatabase":\n        airflow_home_path = 
os.path.join(tempfile.gettempdir(), f"dagster_airflow_{context.run_id}")\n        AirflowEphemeralDatabase._initialize_database(\n            airflow_home_path=airflow_home_path,\n            connections=[Connection(**c) for c in context.resource_config["connections"]],\n        )\n        return AirflowEphemeralDatabase(\n            airflow_home_path=airflow_home_path,\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            dag_run_config=context.resource_config.get("dag_run_config"),\n        )\n\n\n
[docs]def make_ephemeral_airflow_db_resource(\n connections: List[Connection] = [], dag_run_config: Optional[dict] = None\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an ephemeral Airflow database.\n\n Args:\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The ephemeral Airflow DB resource\n\n """\n serialized_connections = serialize_connections(connections)\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowEphemeralDatabase.from_resource_context,\n config_schema={\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n Noneable(dict),\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Ephemeral Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_ephemeral_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_ephemeral_db"}, "airflow_persistent_db": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_airflow.resources.airflow_persistent_db

\nimport importlib\nimport os\nfrom typing import List, Optional\n\nimport airflow\nfrom airflow.models.connection import Connection\nfrom dagster import (\n    Array,\n    DagsterRun,\n    Field,\n    InitResourceContext,\n    ResourceDefinition,\n    StringSource,\n    _check as check,\n)\n\nfrom dagster_airflow.resources.airflow_db import AirflowDatabase\nfrom dagster_airflow.utils import (\n    create_airflow_connections,\n    is_airflow_2_loaded_in_environment,\n    serialize_connections,\n)\n\n\nclass AirflowPersistentDatabase(AirflowDatabase):\n    """A persistent Airflow database Dagster resource."""\n\n    def __init__(self, dagster_run: DagsterRun, uri: str, dag_run_config: Optional[dict] = None):\n        self.uri = uri\n        super().__init__(dagster_run=dagster_run, dag_run_config=dag_run_config)\n\n    @staticmethod\n    def _initialize_database(uri: str, connections: List[Connection] = []):\n        if is_airflow_2_loaded_in_environment("2.3.0"):\n            os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow.configuration)\n            importlib.reload(airflow.settings)\n            importlib.reload(airflow)\n        else:\n            os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n            importlib.reload(airflow)\n        create_airflow_connections(connections)\n\n    @staticmethod\n    def from_resource_context(context: InitResourceContext) -> "AirflowPersistentDatabase":\n        uri = context.resource_config["uri"]\n        AirflowPersistentDatabase._initialize_database(\n            uri=uri, connections=[Connection(**c) for c in context.resource_config["connections"]]\n        )\n        return AirflowPersistentDatabase(\n            dagster_run=check.not_none(context.dagster_run, "Context must have run"),\n            uri=uri,\n            dag_run_config=context.resource_config["dag_run_config"],\n        )\n\n\n
[docs]def make_persistent_airflow_db_resource(\n uri: str = "",\n connections: List[Connection] = [],\n dag_run_config: Optional[dict] = {},\n) -> ResourceDefinition:\n """Creates a Dagster resource that provides an persistent Airflow database.\n\n\n Usage:\n .. code-block:: python\n\n from dagster_airflow import (\n make_dagster_definitions_from_airflow_dags_path,\n make_persistent_airflow_db_resource,\n )\n postgres_airflow_db = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"\n airflow_db = make_persistent_airflow_db_resource(uri=postgres_airflow_db)\n definitions = make_dagster_definitions_from_airflow_example_dags(\n '/path/to/dags/',\n resource_defs={"airflow_db": airflow_db}\n )\n\n\n Args:\n uri: SQLAlchemy URI of the Airflow DB to be used\n connections (List[Connection]): List of Airflow Connections to be created in the Airflow DB\n dag_run_config (Optional[dict]): dag_run configuration to be used when creating a DagRun\n\n Returns:\n ResourceDefinition: The persistent Airflow DB resource\n\n """\n if is_airflow_2_loaded_in_environment():\n os.environ["AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"] = uri\n else:\n os.environ["AIRFLOW__CORE__SQL_ALCHEMY_CONN"] = uri\n\n serialized_connections = serialize_connections(connections)\n\n airflow_db_resource_def = ResourceDefinition(\n resource_fn=AirflowPersistentDatabase.from_resource_context,\n config_schema={\n "uri": Field(\n StringSource,\n default_value=uri,\n is_required=False,\n ),\n "connections": Field(\n Array(inner_type=dict),\n default_value=serialized_connections,\n is_required=False,\n ),\n "dag_run_config": Field(\n dict,\n default_value=dag_run_config,\n is_required=False,\n ),\n },\n description="Persistent Airflow DB to be used by dagster-airflow ",\n )\n return airflow_db_resource_def
\n
", "current_page_name": "_modules/dagster_airflow/resources/airflow_persistent_db", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_airflow.resources.airflow_persistent_db"}}}, "dagster_aws": {"ecs": {"launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.ecs.launcher

\nimport json\nimport logging\nimport os\nimport uuid\nimport warnings\nfrom collections import namedtuple\nfrom typing import Any, Dict, List, Mapping, Optional, Sequence\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Array,\n    DagsterRunStatus,\n    Field,\n    Noneable,\n    Permissive,\n    ScalarUnion,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.instance import T_DagsterInstance\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import RUN_WORKER_ID_TAG\nfrom dagster._grpc.types import ExecuteRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom dagster._utils.backoff import backoff\nfrom typing_extensions import Self\n\nfrom ..secretsmanager import get_secrets_from_arns\nfrom .container_context import SHARED_ECS_SCHEMA, SHARED_TASK_DEFINITION_FIELDS, EcsContainerContext\nfrom .tasks import (\n    DagsterEcsTaskDefinitionConfig,\n    get_current_ecs_task,\n    get_current_ecs_task_metadata,\n    get_task_definition_dict_from_current_task,\n    get_task_kwargs_from_current_task,\n)\nfrom .utils import get_task_definition_family, get_task_logs, task_definitions_match\n\nTags = namedtuple("Tags", ["arn", "cluster", "cpu", "memory"])\n\nRUNNING_STATUSES = [\n    "PROVISIONING",\n    "PENDING",\n    "ACTIVATING",\n    "RUNNING",\n    "DEACTIVATING",\n    "STOPPING",\n    "DEPROVISIONING",\n]\nSTOPPED_STATUSES = ["STOPPED"]\n\nDEFAULT_WINDOWS_RESOURCES = {"cpu": "1024", "memory": "2048"}\n\nDEFAULT_LINUX_RESOURCES = {"cpu": "256", "memory": "512"}\n\n\n
[docs]class EcsRunLauncher(RunLauncher[T_DagsterInstance], ConfigurableClass):\n """RunLauncher that starts a task in ECS for each Dagster job run."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n task_definition=None,\n container_name="run",\n secrets=None,\n secrets_tag="dagster",\n env_vars=None,\n include_sidecars=False,\n use_current_ecs_task_config: bool = True,\n run_task_kwargs: Optional[Mapping[str, Any]] = None,\n run_resources: Optional[Dict[str, Any]] = None,\n run_ecs_tags: Optional[List[Dict[str, Optional[str]]]] = None,\n ):\n self._inst_data = inst_data\n self.ecs = boto3.client("ecs")\n self.ec2 = boto3.resource("ec2")\n self.secrets_manager = boto3.client("secretsmanager")\n self.logs = boto3.client("logs")\n\n self.task_definition = None\n self.task_definition_dict = {}\n if isinstance(task_definition, str):\n self.task_definition = task_definition\n elif task_definition and "env" in task_definition:\n check.invariant(\n len(task_definition) == 1,\n "If `task_definition` is set to a dictionary with `env`, `env` must be the only"\n " key.",\n )\n env_var = task_definition["env"]\n self.task_definition = os.getenv(env_var)\n if not self.task_definition:\n raise Exception(\n f"You have attempted to fetch the environment variable {env_var} which is not"\n " set."\n )\n else:\n self.task_definition_dict = task_definition or {}\n\n self.container_name = container_name\n\n self.secrets = check.opt_list_param(secrets, "secrets")\n\n self.env_vars = check.opt_list_param(env_vars, "env_vars")\n\n if self.secrets and all(isinstance(secret, str) for secret in self.secrets):\n warnings.warn(\n "Setting secrets as a list of ARNs is deprecated. 
"\n "Secrets should instead follow the same structure as the ECS API: "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html",\n DeprecationWarning,\n )\n self.secrets = [\n {"name": name, "valueFrom": value_from}\n for name, value_from in get_secrets_from_arns(\n self.secrets_manager, self.secrets\n ).items()\n ]\n\n self.secrets_tags = [secrets_tag] if secrets_tag else []\n self.include_sidecars = include_sidecars\n\n if self.task_definition:\n task_definition = self.ecs.describe_task_definition(taskDefinition=self.task_definition)\n container_names = [\n container.get("name")\n for container in task_definition["taskDefinition"]["containerDefinitions"]\n ]\n check.invariant(\n container_name in container_names,\n f"Cannot override container '{container_name}' in task definition "\n f"'{self.task_definition}' because the container is not defined.",\n )\n self.task_definition = task_definition["taskDefinition"]["taskDefinitionArn"]\n\n self.use_current_ecs_task_config = check.opt_bool_param(\n use_current_ecs_task_config, "use_current_ecs_task_config"\n )\n\n self.run_task_kwargs = check.opt_mapping_param(run_task_kwargs, "run_task_kwargs")\n if run_task_kwargs:\n check.invariant(\n "taskDefinition" not in run_task_kwargs,\n "Use the `taskDefinition` config field to pass in a task definition to run.",\n )\n check.invariant(\n "overrides" not in run_task_kwargs,\n "Task overrides are set by the run launcher and cannot be set in run_task_kwargs.",\n )\n\n expected_keys = [\n key for key in self.ecs.meta.service_model.shape_for("RunTaskRequest").members\n ]\n\n for key in run_task_kwargs:\n check.invariant(\n key in expected_keys, f"Found an unexpected key {key} in run_task_kwargs"\n )\n\n self.run_resources = check.opt_mapping_param(run_resources, "run_resources")\n\n self.run_ecs_tags = check.opt_sequence_param(run_ecs_tags, "run_ecs_tags")\n\n self._current_task_metadata = None\n self._current_task = None\n\n @property\n def 
inst_data(self):\n return self._inst_data\n\n @property\n def task_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("task_role_arn")\n\n @property\n def execution_role_arn(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("execution_role_arn")\n\n @property\n def runtime_platform(self) -> Optional[Mapping[str, Any]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("runtime_platform")\n\n @property\n def mount_points(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("mount_points")\n\n @property\n def volumes(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("volumes")\n\n @property\n def repository_credentials(self) -> Optional[str]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("repository_credentials")\n\n @property\n def run_sidecar_containers(self) -> Optional[Sequence[Mapping[str, Any]]]:\n if not self.task_definition_dict:\n return None\n return self.task_definition_dict.get("sidecar_containers")\n\n @classmethod\n def config_type(cls):\n return {\n "task_definition": Field(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={\n "log_group": Field(StringSource, is_required=False),\n "sidecar_containers": Field(Array(Permissive({})), is_required=False),\n "requires_compatibilities": Field(Array(str), is_required=False),\n "env": Field(\n str,\n is_required=False,\n description=(\n "Backwards-compatibility for when task_definition was a"\n " StringSource.Can be used to source the task_definition scalar"\n " from an environment variable."\n ),\n ),\n **SHARED_TASK_DEFINITION_FIELDS,\n },\n ),\n is_required=False,\n description=(\n "Either the short name of an existing task 
definition to use when launching new"\n " tasks, or a dictionary configuration to use when creating a task definition"\n " for the run.If neither is provided, the task definition will be created based"\n " on the current task's task definition."\n ),\n ),\n "container_name": Field(\n StringSource,\n is_required=False,\n default_value="run",\n description=(\n "The container name to use when launching new tasks. Defaults to 'run'."\n ),\n ),\n "secrets": Field(\n Array(\n ScalarUnion(\n scalar_type=str,\n non_scalar_schema={"name": StringSource, "valueFrom": StringSource},\n )\n ),\n is_required=False,\n description=(\n "An array of AWS Secrets Manager secrets. These secrets will "\n "be mounted as environment variables in the container. See "\n "https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html."\n ),\n ),\n "secrets_tag": Field(\n Noneable(StringSource),\n is_required=False,\n default_value="dagster",\n description=(\n "AWS Secrets Manager secrets with this tag will be mounted as "\n "environment variables in the container. Defaults to 'dagster'."\n ),\n ),\n "include_sidecars": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "Whether each run should use the same sidecars as the task that launches it. "\n "Defaults to False."\n ),\n ),\n "use_current_ecs_task_config": Field(\n bool,\n is_required=False,\n default_value=True,\n description=(\n "Whether to use the run launcher's current ECS task in order to determine "\n "the cluster and networking configuration for the launched task. Defaults to "\n "True. Should only be called if the run launcher is running within an ECS "\n "task."\n ),\n ),\n "run_task_kwargs": Field(\n Permissive(\n {\n "cluster": Field(\n StringSource,\n is_required=False,\n description="Name of the ECS cluster to launch ECS tasks in.",\n ),\n }\n ),\n is_required=False,\n description=(\n "Additional arguments to include while running the task. 
See"\n " https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs.html#ECS.Client.run_task"\n " for the available parameters. The overrides and taskDefinition arguments will"\n " always be set by the run launcher."\n ),\n ),\n **SHARED_ECS_SCHEMA,\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return EcsRunLauncher(inst_data=inst_data, **config_value)\n\n def _set_run_tags(self, run_id: str, cluster: str, task_arn: str):\n tags = {\n "ecs/task_arn": task_arn,\n "ecs/cluster": cluster,\n RUN_WORKER_ID_TAG: str(uuid.uuid4().hex)[0:6],\n }\n self._instance.add_run_tags(run_id, tags)\n\n def build_ecs_tags_for_run_task(self, run, container_context: EcsContainerContext):\n if any(tag["key"] == "dagster/run_id" for tag in container_context.run_ecs_tags):\n raise Exception("Cannot override system ECS tag: dagster/run_id")\n\n return [{"key": "dagster/run_id", "value": run.run_id}, *container_context.run_ecs_tags]\n\n def _get_run_tags(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n tags = run.tags if run else {}\n arn = tags.get("ecs/task_arn")\n cluster = tags.get("ecs/cluster")\n cpu = tags.get("ecs/cpu")\n memory = tags.get("ecs/memory")\n\n return Tags(arn, cluster, cpu, memory)\n\n def _get_command_args(self, run_args: ExecuteRunArgs, context: LaunchRunContext):\n return run_args.get_command_args()\n\n def _get_image_for_run(self, context: LaunchRunContext) -> Optional[str]:\n job_origin = check.not_none(context.job_code_origin)\n return job_origin.repository_origin.container_image\n\n def launch_run(self, context: LaunchRunContext) -> None:\n """Launch a run in an ECS task."""\n run = context.dagster_run\n container_context = EcsContainerContext.create_for_run(run, self)\n\n job_origin = check.not_none(context.job_code_origin)\n\n # ECS limits overrides to 8192 characters including json formatting\n # 
https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_RunTask.html\n # When container_context is serialized as part of the ExecuteRunArgs, we risk\n # going over this limit (for example, if many secrets have been set). This strips\n # the container context off of our job origin because we don't actually need\n # it to launch the run; we only needed it to create the task definition.\n repository_origin = job_origin.repository_origin\n\n stripped_repository_origin = repository_origin._replace(container_context={})\n stripped_job_origin = job_origin._replace(repository_origin=stripped_repository_origin)\n\n args = ExecuteRunArgs(\n job_origin=stripped_job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n )\n command = self._get_command_args(args, context)\n image = self._get_image_for_run(context)\n\n run_task_kwargs = self._run_task_kwargs(run, image, container_context)\n\n # Set cpu or memory overrides\n # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html\n cpu_and_memory_overrides = self.get_cpu_and_memory_overrides(container_context, run)\n\n task_overrides = self._get_task_overrides(container_context, run)\n\n container_overrides: List[Dict[str, Any]] = [\n {\n "name": self._get_container_name(container_context),\n "command": command,\n # containerOverrides expects cpu/memory as integers\n **{k: int(v) for k, v in cpu_and_memory_overrides.items()},\n }\n ]\n\n run_task_kwargs["overrides"] = {\n "containerOverrides": container_overrides,\n # taskOverrides expects cpu/memory as strings\n **cpu_and_memory_overrides,\n **task_overrides,\n }\n run_task_kwargs["tags"] = [\n *run_task_kwargs.get("tags", []),\n *self.build_ecs_tags_for_run_task(run, container_context),\n ]\n\n run_task_kwargs_from_run = self._get_run_task_kwargs_from_run(run)\n run_task_kwargs.update(run_task_kwargs_from_run)\n\n # launchType and capacityProviderStrategy are incompatible - prefer the latter if it is set\n if "launchType" 
in run_task_kwargs and run_task_kwargs.get("capacityProviderStrategy"):\n del run_task_kwargs["launchType"]\n\n # Run a task using the same network configuration as this processes's task.\n response = self.ecs.run_task(**run_task_kwargs)\n\n tasks = response["tasks"]\n\n if not tasks:\n failures = response["failures"]\n failure_messages = []\n for failure in failures:\n arn = failure.get("arn")\n reason = failure.get("reason")\n detail = failure.get("detail")\n\n failure_message = (\n "Task"\n + (f" {arn}" if arn else "")\n + " failed."\n + (f" Failure reason: {reason}" if reason else "")\n + (f" Failure details: {detail}" if detail else "")\n )\n failure_messages.append(failure_message)\n\n raise Exception("\\n".join(failure_messages) if failure_messages else "Task failed.")\n\n arn = tasks[0]["taskArn"]\n cluster_arn = tasks[0]["clusterArn"]\n self._set_run_tags(run.run_id, cluster=cluster_arn, task_arn=arn)\n self.report_launch_events(run, arn, cluster_arn)\n\n def report_launch_events(\n self, run: DagsterRun, arn: Optional[str] = None, cluster: Optional[str] = None\n ):\n # Extracted method to allow for subclasses to customize the launch reporting behavior\n\n metadata = {}\n if arn:\n metadata["ECS Task ARN"] = arn\n if cluster:\n metadata["ECS Cluster"] = cluster\n\n metadata["Run ID"] = run.run_id\n self._instance.report_engine_event(\n message="Launching run in ECS task",\n dagster_run=run,\n engine_event_data=EngineEventData(metadata),\n cls=self.__class__,\n )\n\n def get_cpu_and_memory_overrides(\n self, container_context: EcsContainerContext, run: DagsterRun\n ) -> Mapping[str, str]:\n overrides = {}\n\n cpu = run.tags.get("ecs/cpu", container_context.run_resources.get("cpu"))\n memory = run.tags.get("ecs/memory", container_context.run_resources.get("memory"))\n\n if cpu:\n overrides["cpu"] = cpu\n if memory:\n overrides["memory"] = memory\n\n return overrides\n\n def _get_task_overrides(\n self, container_context: EcsContainerContext, run: 
DagsterRun\n ) -> Mapping[str, Any]:\n tag_overrides = run.tags.get("ecs/task_overrides")\n\n overrides = {}\n\n if tag_overrides:\n overrides = json.loads(tag_overrides)\n\n ephemeral_storage = run.tags.get(\n "ecs/ephemeral_storage", container_context.run_resources.get("ephemeral_storage")\n )\n if ephemeral_storage:\n overrides["ephemeralStorage"] = {"sizeInGiB": int(ephemeral_storage)}\n\n return overrides\n\n def _get_run_task_kwargs_from_run(self, run: DagsterRun) -> Mapping[str, Any]:\n run_task_kwargs = run.tags.get("ecs/run_task_kwargs")\n if run_task_kwargs:\n return json.loads(run_task_kwargs)\n return {}\n\n def terminate(self, run_id):\n tags = self._get_run_tags(run_id)\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n if not (tags.arn and tags.cluster):\n return False\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return False\n\n status = tasks[0].get("lastStatus")\n if status == "STOPPED":\n return False\n\n self.ecs.stop_task(task=tags.arn, cluster=tags.cluster)\n return True\n\n def _get_current_task_metadata(self):\n if self._current_task_metadata is None:\n self._current_task_metadata = get_current_ecs_task_metadata()\n return self._current_task_metadata\n\n def _get_current_task(self):\n if self._current_task is None:\n current_task_metadata = self._get_current_task_metadata()\n self._current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n\n return self._current_task\n\n def _get_run_task_definition_family(self, run: DagsterRun) -> str:\n return get_task_definition_family("run", check.not_none(run.external_job_origin))\n\n def _get_container_name(self, container_context) -> str:\n return container_context.container_name or self.container_name\n\n def _run_task_kwargs(self, run, image, container_context) -> Dict[str, Any]:\n """Return a dictionary of 
args to launch the ECS task, registering a new task\n definition if needed.\n """\n environment = self._environment(container_context)\n environment.append({"name": "DAGSTER_RUN_JOB_NAME", "value": run.job_name})\n\n secrets = self._secrets(container_context)\n\n if container_context.task_definition_arn:\n task_definition = container_context.task_definition_arn\n else:\n family = self._get_run_task_definition_family(run)\n\n if self.task_definition_dict or not self.use_current_ecs_task_config:\n runtime_platform = container_context.runtime_platform\n is_windows = container_context.runtime_platform.get(\n "operatingSystemFamily"\n ) not in {None, "LINUX"}\n\n default_resources = (\n DEFAULT_WINDOWS_RESOURCES if is_windows else DEFAULT_LINUX_RESOURCES\n )\n task_definition_config = DagsterEcsTaskDefinitionConfig(\n family,\n image,\n self._get_container_name(container_context),\n command=None,\n log_configuration=(\n {\n "logDriver": "awslogs",\n "options": {\n "awslogs-group": self.task_definition_dict["log_group"],\n "awslogs-region": self.ecs.meta.region_name,\n "awslogs-stream-prefix": family,\n },\n }\n if self.task_definition_dict.get("log_group")\n else None\n ),\n secrets=secrets if secrets else [],\n environment=environment,\n execution_role_arn=container_context.execution_role_arn,\n task_role_arn=container_context.task_role_arn,\n sidecars=container_context.run_sidecar_containers,\n requires_compatibilities=self.task_definition_dict.get(\n "requires_compatibilities", []\n ),\n cpu=container_context.run_resources.get("cpu", default_resources["cpu"]),\n memory=container_context.run_resources.get(\n "memory", default_resources["memory"]\n ),\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n runtime_platform=runtime_platform,\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n repository_credentials=container_context.repository_credentials,\n )\n task_definition_dict = 
task_definition_config.task_definition_dict()\n else:\n task_definition_dict = get_task_definition_dict_from_current_task(\n self.ecs,\n family,\n self._get_current_task(),\n image,\n self._get_container_name(container_context),\n environment=environment,\n secrets=secrets if secrets else {},\n include_sidecars=self.include_sidecars,\n task_role_arn=container_context.task_role_arn,\n execution_role_arn=container_context.execution_role_arn,\n cpu=container_context.run_resources.get("cpu"),\n memory=container_context.run_resources.get("memory"),\n runtime_platform=container_context.runtime_platform,\n ephemeral_storage=container_context.run_resources.get("ephemeral_storage"),\n volumes=container_context.volumes,\n mount_points=container_context.mount_points,\n additional_sidecars=container_context.run_sidecar_containers,\n repository_credentials=container_context.repository_credentials,\n )\n\n task_definition_config = DagsterEcsTaskDefinitionConfig.from_task_definition_dict(\n task_definition_dict,\n self._get_container_name(container_context),\n )\n\n container_name = self._get_container_name(container_context)\n\n backoff(\n self._reuse_or_register_task_definition,\n retry_on=(Exception,),\n kwargs={\n "desired_task_definition_config": task_definition_config,\n "container_name": container_name,\n "task_definition_dict": task_definition_dict,\n },\n max_retries=5,\n )\n\n task_definition = family\n\n if self.use_current_ecs_task_config:\n current_task_metadata = get_current_ecs_task_metadata()\n current_task = get_current_ecs_task(\n self.ecs, current_task_metadata.task_arn, current_task_metadata.cluster\n )\n task_kwargs = get_task_kwargs_from_current_task(\n self.ec2,\n current_task_metadata.cluster,\n current_task,\n )\n else:\n task_kwargs = {}\n\n return {**task_kwargs, **self.run_task_kwargs, "taskDefinition": task_definition}\n\n def _reuse_task_definition(\n self, desired_task_definition_config: DagsterEcsTaskDefinitionConfig, container_name: str\n ):\n 
family = desired_task_definition_config.family\n\n try:\n existing_task_definition = self.ecs.describe_task_definition(taskDefinition=family)[\n "taskDefinition"\n ]\n except ClientError:\n # task definition does not exist, do not reuse\n return False\n\n return task_definitions_match(\n desired_task_definition_config,\n existing_task_definition,\n container_name=container_name,\n )\n\n def _reuse_or_register_task_definition(\n self,\n desired_task_definition_config: DagsterEcsTaskDefinitionConfig,\n container_name: str,\n task_definition_dict: dict,\n ):\n if not self._reuse_task_definition(desired_task_definition_config, container_name):\n self.ecs.register_task_definition(**task_definition_dict)\n\n def _environment(self, container_context):\n return [\n {"name": key, "value": value}\n for key, value in container_context.get_environment_dict().items()\n ]\n\n def _secrets(self, container_context):\n secrets = container_context.get_secrets_dict(self.secrets_manager)\n return (\n [{"name": key, "valueFrom": value} for key, value in secrets.items()] if secrets else []\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def include_cluster_info_in_failure_messages(self):\n return True\n\n def _is_transient_startup_failure(self, run, task):\n if not task.get("stoppedReason"):\n return False\n return (\n run.status == DagsterRunStatus.STARTING\n and "Timeout waiting for network interface provisioning to complete"\n in task.get("stoppedReason")\n )\n\n def check_run_worker_health(self, run: DagsterRun):\n run_worker_id = run.tags.get(RUN_WORKER_ID_TAG)\n\n tags = self._get_run_tags(run.run_id)\n container_context = EcsContainerContext.create_for_run(run, self)\n\n if not (tags.arn and tags.cluster):\n return CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n tasks = self.ecs.describe_tasks(tasks=[tags.arn], cluster=tags.cluster).get("tasks")\n if not tasks:\n return 
CheckRunHealthResult(WorkerStatus.UNKNOWN, "", run_worker_id=run_worker_id)\n\n t = tasks[0]\n\n if t.get("lastStatus") in RUNNING_STATUSES:\n return CheckRunHealthResult(WorkerStatus.RUNNING, run_worker_id=run_worker_id)\n elif t.get("lastStatus") in STOPPED_STATUSES:\n failed_containers = []\n for c in t.get("containers"):\n if c.get("exitCode") != 0:\n failed_containers.append(c)\n if len(failed_containers) > 0:\n if len(failed_containers) > 1:\n container_str = "Containers"\n else:\n container_str = "Container"\n\n failure_text = []\n\n if self.include_cluster_info_in_failure_messages:\n failure_text.append(\n f"Task {t.get('taskArn')} failed. Stop code: {t.get('stopCode')}. Stop"\n f" reason: {t.get('stoppedReason')}."\n + f" {container_str} {[c.get('name') for c in failed_containers]} failed."\n )\n\n logs = []\n\n try:\n logs = get_task_logs(\n self.ecs,\n logs_client=self.logs,\n cluster=tags.cluster,\n task_arn=tags.arn,\n container_name=self._get_container_name(container_context),\n )\n except:\n logging.exception(f"Error trying to get logs for failed task {tags.arn}")\n\n if logs:\n failure_text.append("Run worker logs:\\n" + "\\n".join(logs))\n\n return CheckRunHealthResult(\n WorkerStatus.FAILED,\n "\\n\\n".join(failure_text),\n transient=self._is_transient_startup_failure(run, t),\n run_worker_id=run_worker_id,\n )\n\n return CheckRunHealthResult(WorkerStatus.SUCCESS, run_worker_id=run_worker_id)\n\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, "ECS task health status is unknown.", run_worker_id=run_worker_id\n )
\n
", "current_page_name": "_modules/dagster_aws/ecs/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.ecs.launcher"}}, "emr": {"emr": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.emr

\n# Portions of this file are copied from the Yelp MRJob project:\n#\n#   https://github.com/Yelp/mrjob\n#\n#\n# Copyright 2009-2013 Yelp, David Marin\n# Copyright 2015 Yelp\n# Copyright 2017 Yelp\n# Copyright 2018 Contributors\n# Copyright 2019 Yelp and Contributors\n#\n# Licensed under the Apache License, Version 2.0 (the "License");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an "AS IS" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport gzip\nimport re\nfrom io import BytesIO\nfrom urllib.parse import urlparse\n\nimport boto3\nimport dagster\nimport dagster._check as check\nfrom botocore.exceptions import WaiterError\n\nfrom dagster_aws.utils.mrjob.utils import _boto3_now, _wrap_aws_client, strip_microseconds\n\nfrom .types import EMR_CLUSTER_TERMINATED_STATES, EmrClusterState, EmrStepState\n\n# if we can't create or find our own service role, use the one\n# created by the AWS console and CLI\n_FALLBACK_SERVICE_ROLE = "EMR_DefaultRole"\n\n# if we can't create or find our own instance profile, use the one\n# created by the AWS console and CLI\n_FALLBACK_INSTANCE_PROFILE = "EMR_EC2_DefaultRole"\n\n\n
[docs]class EmrError(Exception):\n pass
\n\n\n
[docs]class EmrJobRunner:\n def __init__(\n self,\n region,\n check_cluster_every=30,\n aws_access_key_id=None,\n aws_secret_access_key=None,\n ):\n """This object encapsulates various utilities for interacting with EMR clusters and invoking\n steps (jobs) on them.\n\n See also :py:class:`~dagster_aws.emr.EmrPySparkResource`, which wraps this job runner in a\n resource for pyspark workloads.\n\n Args:\n region (str): AWS region to use\n check_cluster_every (int, optional): How frequently to poll boto3 APIs for updates.\n Defaults to 30 seconds.\n aws_access_key_id ([type], optional): AWS access key ID. Defaults to None, which will\n use the default boto3 credentials chain.\n aws_secret_access_key ([type], optional): AWS secret access key. Defaults to None, which\n will use the default boto3 credentials chain.\n """\n self.region = check.str_param(region, "region")\n\n # This is in seconds\n self.check_cluster_every = check.int_param(check_cluster_every, "check_cluster_every")\n self.aws_access_key_id = check.opt_str_param(aws_access_key_id, "aws_access_key_id")\n self.aws_secret_access_key = check.opt_str_param(\n aws_secret_access_key, "aws_secret_access_key"\n )\n\n def make_emr_client(self):\n """Creates a boto3 EMR client. 
Construction is wrapped in retries in case client connection\n fails transiently.\n\n Returns:\n botocore.client.EMR: An EMR client\n """\n raw_emr_client = boto3.client(\n "emr",\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n region_name=self.region,\n )\n return _wrap_aws_client(raw_emr_client, min_backoff=self.check_cluster_every)\n\n def cluster_id_from_name(self, cluster_name):\n """Get a cluster ID in the format "j-123ABC123ABC1" given a cluster name "my cool cluster".\n\n Args:\n cluster_name (str): The name of the cluster for which to find an ID\n\n Returns:\n str: The ID of the cluster\n\n Raises:\n EmrError: No cluster with the specified name exists\n """\n check.str_param(cluster_name, "cluster_name")\n\n response = self.make_emr_client().list_clusters().get("Clusters", [])\n for cluster in response:\n if cluster["Name"] == cluster_name:\n return cluster["Id"]\n\n raise EmrError(f"cluster {cluster_name} not found in region {self.region}")\n\n @staticmethod\n def construct_step_dict_for_command(step_name, command, action_on_failure="CONTINUE"):\n """Construct an EMR step definition which uses command-runner.jar to execute a shell command\n on the EMR master.\n\n Args:\n step_name (str): The name of the EMR step (will show up in the EMR UI)\n command (str): The shell command to execute with command-runner.jar\n action_on_failure (str, optional): Configure action on failure (e.g., continue, or\n terminate the cluster). 
Defaults to 'CONTINUE'.\n\n Returns:\n dict: Step definition dict\n """\n check.str_param(step_name, "step_name")\n check.list_param(command, "command", of_type=str)\n check.str_param(action_on_failure, "action_on_failure")\n\n return {\n "Name": step_name,\n "ActionOnFailure": action_on_failure,\n "HadoopJarStep": {"Jar": "command-runner.jar", "Args": command},\n }\n\n def add_tags(self, log, tags, cluster_id):\n """Add tags in the dict tags to cluster cluster_id.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n tags (dict): Dictionary of {'key': 'value'} tags\n cluster_id (str): The ID of the cluster to tag\n """\n check.dict_param(tags, "tags")\n check.str_param(cluster_id, "cluster_id")\n\n tags_items = sorted(tags.items())\n\n self.make_emr_client().add_tags(\n ResourceId=cluster_id, Tags=[dict(Key=k, Value=v) for k, v in tags_items]\n )\n\n log.info(\n "Added EMR tags to cluster %s: %s"\n % (cluster_id, ", ".join("%s=%s" % (tag, value) for tag, value in tags_items))\n )\n\n def run_job_flow(self, log, cluster_config):\n """Create an empty cluster on EMR, and return the ID of that job flow.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_config (dict): Configuration for this EMR job flow. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html\n\n Returns:\n str: The cluster ID, e.g. 
"j-ZKIY4CKQRX72"\n """\n check.dict_param(cluster_config, "cluster_config")\n\n log.debug("Creating Elastic MapReduce cluster")\n emr_client = self.make_emr_client()\n\n log.debug(\n "Calling run_job_flow(%s)"\n % (", ".join("%s=%r" % (k, v) for k, v in sorted(cluster_config.items())))\n )\n cluster_id = emr_client.run_job_flow(**cluster_config)["JobFlowId"]\n\n log.info("Created new cluster %s" % cluster_id)\n\n # set EMR tags for the cluster\n tags_items = cluster_config.get("Tags", [])\n tags = {k: v for k, v in tags_items}\n tags["__dagster_version"] = dagster.__version__\n self.add_tags(log, tags, cluster_id)\n return cluster_id\n\n def describe_cluster(self, cluster_id):\n """Thin wrapper over boto3 describe_cluster.\n\n Args:\n cluster_id (str): Cluster to inspect\n\n Returns:\n dict: The cluster info. See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeCluster.html\n """\n check.str_param(cluster_id, "cluster_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_cluster(ClusterId=cluster_id)\n\n def describe_step(self, cluster_id, step_id):\n """Thin wrapper over boto3 describe_step.\n\n Args:\n cluster_id (str): Cluster to inspect\n step_id (str): Step ID to describe\n\n Returns:\n dict: The step info. 
See:\n https://docs.aws.amazon.com/emr/latest/APIReference/API_DescribeStep.html\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n emr_client = self.make_emr_client()\n return emr_client.describe_step(ClusterId=cluster_id, StepId=step_id)\n\n def add_job_flow_steps(self, log, cluster_id, step_defs):\n """Submit the constructed job flow steps to EMR for execution.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): The ID of the cluster\n step_defs (List[dict]): List of steps; see also `construct_step_dict_for_command`\n\n Returns:\n List[str]: list of step IDs.\n """\n check.str_param(cluster_id, "cluster_id")\n check.list_param(step_defs, "step_defs", of_type=dict)\n\n emr_client = self.make_emr_client()\n\n steps_kwargs = dict(JobFlowId=cluster_id, Steps=step_defs)\n log.debug(\n "Calling add_job_flow_steps(%s)"\n % ",".join(("%s=%r" % (k, v)) for k, v in steps_kwargs.items())\n )\n return emr_client.add_job_flow_steps(**steps_kwargs)["StepIds"]\n\n def is_emr_step_complete(self, log, cluster_id, emr_step_id):\n step = self.describe_step(cluster_id, emr_step_id)["Step"]\n step_state = EmrStepState(step["Status"]["State"])\n\n if step_state == EmrStepState.Pending:\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n\n log.info("PENDING (cluster is %s%s)" % (cluster["Status"]["State"], reason_desc))\n return False\n\n elif step_state == EmrStepState.Running:\n time_running_desc = ""\n\n start = step["Status"]["Timeline"].get("StartDateTime")\n if start:\n time_running_desc = " for %s" % strip_microseconds(_boto3_now() - start)\n\n log.info("RUNNING%s" % time_running_desc)\n return False\n\n # we're done, will return at the end of this\n elif step_state == EmrStepState.Completed:\n log.info("COMPLETED")\n return True\n else:\n # step has failed somehow. 
*reason* seems to only be set\n # when job is cancelled (e.g. 'Job terminated')\n reason = _get_reason(step)\n reason_desc = (" (%s)" % reason) if reason else ""\n\n log.info("%s%s" % (step_state.value, reason_desc))\n\n # print cluster status; this might give more context\n # why step didn't succeed\n cluster = self.describe_cluster(cluster_id)["Cluster"]\n reason = _get_reason(cluster)\n reason_desc = (": %s" % reason) if reason else ""\n log.info(\n "Cluster %s %s %s%s"\n % (\n cluster["Id"],\n "was" if "ED" in cluster["Status"]["State"] else "is",\n cluster["Status"]["State"],\n reason_desc,\n )\n )\n\n if EmrClusterState(cluster["Status"]["State"]) in EMR_CLUSTER_TERMINATED_STATES:\n # was it caused by IAM roles?\n self._check_for_missing_default_iam_roles(log, cluster)\n\n # TODO: extract logs here to surface failure reason\n # See: https://github.com/dagster-io/dagster/issues/1954\n\n if step_state == EmrStepState.Failed:\n log.error("EMR step %s failed" % emr_step_id)\n\n raise EmrError("EMR step %s failed" % emr_step_id)\n\n def _check_for_missing_default_iam_roles(self, log, cluster):\n """If cluster couldn't start due to missing IAM roles, tell user what to do."""\n check.dict_param(cluster, "cluster")\n\n reason = _get_reason(cluster)\n if any(\n reason.endswith("/%s is invalid" % role)\n for role in (_FALLBACK_INSTANCE_PROFILE, _FALLBACK_SERVICE_ROLE)\n ):\n log.warning(\n "IAM roles are missing. See documentation for IAM roles on EMR here: "\n "https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html"\n )\n\n def log_location_for_cluster(self, cluster_id):\n """EMR clusters are typically launched with S3 logging configured. 
This method inspects a\n cluster using boto3 describe_cluster to retrieve the log URI.\n\n Args:\n cluster_id (str): The cluster to inspect.\n\n Raises:\n EmrError: the log URI was missing (S3 log mirroring not enabled for this cluster)\n\n Returns:\n (str, str): log bucket and key\n """\n check.str_param(cluster_id, "cluster_id")\n\n # The S3 log URI is specified per job flow (cluster)\n log_uri = self.describe_cluster(cluster_id)["Cluster"].get("LogUri", None)\n\n # ugh, seriously boto3?! This will come back as string "None"\n if log_uri == "None" or log_uri is None:\n raise EmrError("Log URI not specified, cannot retrieve step execution logs")\n\n # For some reason the API returns an s3n:// protocol log URI instead of s3://\n log_uri = re.sub("^s3n", "s3", log_uri)\n log_uri_parsed = urlparse(log_uri)\n log_bucket = log_uri_parsed.netloc\n log_key_prefix = log_uri_parsed.path.lstrip("/")\n return log_bucket, log_key_prefix\n\n def retrieve_logs_for_step_id(self, log, cluster_id, step_id):\n """Retrieves stdout and stderr logs for the given step ID.\n\n Args:\n log (DagsterLogManager): Log manager, for logging\n cluster_id (str): EMR cluster ID\n step_id (str): EMR step ID for the job that was submitted.\n\n Returns:\n (str, str): Tuple of stdout log string contents, and stderr log string contents\n """\n check.str_param(cluster_id, "cluster_id")\n check.str_param(step_id, "step_id")\n\n log_bucket, log_key_prefix = self.log_location_for_cluster(cluster_id)\n\n prefix = f"{log_key_prefix}{cluster_id}/steps/{step_id}"\n stdout_log = self.wait_for_log(log, log_bucket, f"{prefix}/stdout.gz")\n stderr_log = self.wait_for_log(log, log_bucket, f"{prefix}/stderr.gz")\n return stdout_log, stderr_log\n\n def wait_for_log(self, log, log_bucket, log_key, waiter_delay=30, waiter_max_attempts=20):\n """Wait for gzipped EMR logs to appear on S3. 
Note that EMR syncs logs to S3 every 5\n minutes, so this may take a long time.\n\n Args:\n log_bucket (str): S3 bucket where log is expected to appear\n log_key (str): S3 key for the log file\n waiter_delay (int): How long to wait between attempts to check S3 for the log file\n waiter_max_attempts (int): Number of attempts before giving up on waiting\n\n Raises:\n EmrError: Raised if we waited the full duration and the logs did not appear\n\n Returns:\n str: contents of the log file\n """\n check.str_param(log_bucket, "log_bucket")\n check.str_param(log_key, "log_key")\n check.int_param(waiter_delay, "waiter_delay")\n check.int_param(waiter_max_attempts, "waiter_max_attempts")\n\n log.info(f"Attempting to get log: s3://{log_bucket}/{log_key}")\n\n s3 = _wrap_aws_client(boto3.client("s3"), min_backoff=self.check_cluster_every)\n waiter = s3.get_waiter("object_exists")\n try:\n waiter.wait(\n Bucket=log_bucket,\n Key=log_key,\n WaiterConfig={"Delay": waiter_delay, "MaxAttempts": waiter_max_attempts},\n )\n except WaiterError as err:\n raise EmrError("EMR log file did not appear on S3 after waiting") from err\n\n obj = BytesIO(s3.get_object(Bucket=log_bucket, Key=log_key)["Body"].read())\n gzip_file = gzip.GzipFile(fileobj=obj)\n return gzip_file.read().decode("utf-8")
\n\n\ndef _get_reason(cluster_or_step):\n """Get state change reason message."""\n # StateChangeReason is {} before the first state change\n return cluster_or_step["Status"]["StateChangeReason"].get("Message", "")\n
", "current_page_name": "_modules/dagster_aws/emr/emr", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.emr"}, "pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.pyspark_step_launcher

\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport time\n\nimport boto3\nfrom botocore.exceptions import ClientError\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher\nfrom dagster._core.errors import DagsterInvariantViolationError, raise_execution_interrupts\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._serdes import deserialize_value\n\nfrom dagster_aws.emr import EmrError, EmrJobRunner, emr_step_main\nfrom dagster_aws.emr.configs_spark import spark_config as get_spark_config\nfrom dagster_aws.utils.mrjob.log4j import parse_hadoop_log4j_records\n\n# On EMR, Spark is installed here\nEMR_SPARK_HOME = "/usr/lib/spark/"\n\nCODE_ZIP_NAME = "code.zip"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "spark_config": get_spark_config(),\n "cluster_id": Field(\n StringSource, description="Name of the job flow (cluster) on which to execute."\n ),\n "region_name": Field(StringSource, description="The AWS region that the cluster is in."),\n "action_on_failure": Field(\n str,\n is_required=False,\n default_value="CANCEL_AND_WAIT",\n description=(\n "The EMR action to take when the cluster step fails: "\n "https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html"\n ),\n ),\n "staging_bucket": Field(\n StringSource,\n is_required=True,\n description=(\n "S3 bucket to use for passing files between the plan process and EMR process."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="emr_staging",\n description=(\n "S3 key prefix inside the staging_bucket to use for files passed the plan "\n "process and EMR process"\n ),\n ),\n "wait_for_logs": Field(\n bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, the system will wait for EMR logs to appear on S3. Note that logs "\n "are copied every 5 minutes, so enabling this will add several minutes to the job "\n "runtime."\n ),\n ),\n "local_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to the package that contains the job definition(s) whose steps will"\n " execute remotely on EMR. This is a path on the local fileystem of the process"\n " executing the job. 
The expectation is that this package will also be available on"\n " the python path of the launched process running the Spark step on EMR, either"\n " deployed on step launch via the deploy_local_job_package option, referenced on s3"\n " via the s3_job_package_path option, or installed on the cluster via bootstrap"\n " actions."\n ),\n ),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "(legacy) Absolute path to the package that contains the pipeline definition(s)"\n " whose steps will execute remotely on EMR. This is a path on the local fileystem"\n " of the process executing the pipeline. The expectation is that this package will"\n " also be available on the python path of the launched process running the Spark"\n " step on EMR, either deployed on step launch via the deploy_local_pipeline_package"\n " option, referenced on s3 via the s3_pipeline_package_path option, or installed on"\n " the cluster via bootstrap actions."\n ),\n ),\n "deploy_local_job_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "deploy_local_pipeline_package": Field(\n bool,\n default_value=False,\n is_required=False,\n description=(\n "(legacy) If set, before every step run, the launcher will zip up all the code in"\n " local_job_package_path, upload it to s3, and pass it to spark-submit's --py-files"\n " option. This gives the remote process access to up-to-date user code. 
If not set,"\n " the assumption is that some other mechanism is used for distributing code to the"\n " EMR cluster. If this option is set to True, s3_job_package_path should not also"\n " be set."\n ),\n ),\n "s3_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_job_package should not be set to True."\n ),\n ),\n "s3_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "If set, this path will be passed to the --py-files option of spark-submit. "\n "This should usually be a path to a zip file. If this option is set, "\n "deploy_local_pipeline_package should not be set to True."\n ),\n ),\n }\n)\ndef emr_pyspark_step_launcher(context):\n # Resolve legacy arguments\n if context.resource_config.get("local_job_package_path") and context.resource_config.get(\n "local_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``local_job_package_path`` and legacy version "\n "``local_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n if not context.resource_config.get(\n "local_job_package_path"\n ) and not context.resource_config.get("local_pipeline_package_path"):\n raise DagsterInvariantViolationError(\n "For resource ``emr_pyspark_step_launcher``, no config value provided for required "\n "schema entry ``local_job_package_path``."\n )\n\n local_job_package_path = context.resource_config.get(\n "local_job_package_path"\n ) or context.resource_config.get("local_pipeline_package_path")\n\n if context.resource_config.get("deploy_local_job_package") and context.resource_config.get(\n "deploy_local_pipeline_package"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``deploy_local_job_package`` and legacy version "\n "``deploy_local_pipeline_package`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. Please choose one or the other."\n )\n\n deploy_local_job_package = context.resource_config.get(\n "deploy_local_job_package"\n ) or context.resource_config.get("deploy_local_pipeline_package")\n\n if context.resource_config.get("s3_job_package_path") and context.resource_config.get(\n "s3_pipeline_package_path"\n ):\n raise DagsterInvariantViolationError(\n "Provided both ``s3_job_package_path`` and legacy version "\n "``s3_pipeline_package_path`` arguments to ``emr_pyspark_step_launcher`` "\n "resource. 
Please choose one or the other."\n )\n\n s3_job_package_path = context.resource_config.get(\n "s3_job_package_path"\n ) or context.resource_config.get("s3_pipeline_package_path")\n\n return EmrPySparkStepLauncher(\n region_name=context.resource_config.get("region_name"),\n staging_bucket=context.resource_config.get("staging_bucket"),\n staging_prefix=context.resource_config.get("staging_prefix"),\n wait_for_logs=context.resource_config.get("wait_for_logs"),\n action_on_failure=context.resource_config.get("action_on_failure"),\n cluster_id=context.resource_config.get("cluster_id"),\n spark_config=context.resource_config.get("spark_config"),\n local_job_package_path=local_job_package_path,\n deploy_local_job_package=deploy_local_job_package,\n s3_job_package_path=s3_job_package_path,\n )
\n\n\nemr_pyspark_step_launcher.__doc__ = "\\n".join(\n "- **" + option + "**: " + (field.description or "")\n for option, field in emr_pyspark_step_launcher.config_schema.config_type.fields.items() # type: ignore\n)\n\n\nclass EmrPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n region_name,\n staging_bucket,\n staging_prefix,\n wait_for_logs,\n action_on_failure,\n cluster_id,\n spark_config,\n local_job_package_path,\n deploy_local_job_package,\n s3_job_package_path=None,\n ):\n self.region_name = check.str_param(region_name, "region_name")\n self.staging_bucket = check.str_param(staging_bucket, "staging_bucket")\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n self.action_on_failure = check.str_param(action_on_failure, "action_on_failure")\n self.cluster_id = check.str_param(cluster_id, "cluster_id")\n self.spark_config = spark_config\n\n check.invariant(\n not deploy_local_job_package or not s3_job_package_path,\n "If deploy_local_job_package is set to True, s3_job_package_path should not "\n "also be set.",\n )\n\n self.local_job_package_path = check.str_param(\n local_job_package_path, "local_job_package_path"\n )\n self.deploy_local_job_package = check.bool_param(\n deploy_local_job_package, "deploy_local_job_package"\n )\n self.s3_job_package_path = check.opt_str_param(s3_job_package_path, "s3_job_package_path")\n\n self.emr_job_runner = EmrJobRunner(region=self.region_name)\n\n def _post_artifacts(self, log, step_run_ref, run_id, step_key):\n """Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.\n\n For the zip file, consider the following toy example:\n\n # Folder: my_pyspark_project/\n # a.py\n def foo():\n print(1)\n\n # b.py\n def bar():\n print(2)\n\n # main.py\n from a import foo\n from b import bar\n\n foo()\n bar()\n\n This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. 
Then, when running\n `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will\n print 1, 2.\n """\n from dagster_pyspark.utils import build_pyspark_zip\n\n with tempfile.TemporaryDirectory() as temp_dir:\n s3 = boto3.client("s3", region_name=self.region_name)\n\n # Upload step run ref\n def _upload_file_to_s3(local_path, s3_filename):\n key = self._artifact_s3_key(run_id, step_key, s3_filename)\n s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)\n log.debug(f"Uploading file {local_path} to {s3_uri}")\n s3.upload_file(Filename=local_path, Bucket=self.staging_bucket, Key=key)\n\n # Upload main file.\n # The remote Dagster installation should also have the file, but locating it there\n # could be a pain.\n main_local_path = self._main_file_local_path()\n _upload_file_to_s3(main_local_path, self._main_file_name())\n\n if self.deploy_local_job_package:\n # Zip and upload package containing job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n\n build_pyspark_zip(zip_local_path, self.local_job_package_path)\n _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)\n\n # Create step run ref pickle file\n step_run_ref_local_path = os.path.join(temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)\n with open(step_run_ref_local_path, "wb") as step_pickle_file:\n pickle.dump(step_run_ref, step_pickle_file)\n\n _upload_file_to_s3(step_run_ref_local_path, PICKLED_STEP_RUN_REF_FILE_NAME)\n\n def launch_step(self, step_context):\n step_run_ref = step_context_to_step_run_ref(step_context, self.local_job_package_path)\n\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._post_artifacts(log, step_run_ref, run_id, step_key)\n\n emr_step_def = self._get_emr_step_def(run_id, step_key, step_context.op.name)\n emr_step_id = self.emr_job_runner.add_job_flow_steps(log, self.cluster_id, [emr_step_def])[\n 0\n ]\n\n yield from self.wait_for_completion_and_log(run_id, step_key, emr_step_id, 
step_context)\n\n def wait_for_completion_and_log(self, run_id, step_key, emr_step_id, step_context):\n s3 = boto3.resource("s3", region_name=self.region_name)\n try:\n for event in self.wait_for_completion(step_context, s3, run_id, step_key, emr_step_id):\n yield event\n except EmrError as emr_error:\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n raise emr_error\n\n if self.wait_for_logs:\n self._log_logs_from_s3(step_context.log, emr_step_id)\n\n def wait_for_completion(\n self, step_context, s3, run_id, step_key, emr_step_id, check_interval=15\n ):\n """We want to wait for the EMR steps to complete, and while that's happening, we want to\n yield any events that have been written to S3 for us by the remote process.\n After the the EMR steps complete, we want a final chance to fetch events before finishing\n the step.\n """\n done = False\n all_events = []\n # If this is being called within a `capture_interrupts` context, allow interrupts\n # while waiting for the pyspark execution to complete, so that we can terminate slow or\n # hanging steps\n while not done:\n with raise_execution_interrupts():\n time.sleep(check_interval) # AWS rate-limits us if we poll it too often\n done = self.emr_job_runner.is_emr_step_complete(\n step_context.log, self.cluster_id, emr_step_id\n )\n\n all_events_new = self.read_events(s3, run_id, step_key)\n\n if len(all_events_new) > len(all_events):\n for i in range(len(all_events), len(all_events_new)):\n event = all_events_new[i]\n # write each event from the EMR instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.dagster_event\n all_events = all_events_new\n\n def read_events(self, s3, run_id, step_key):\n events_s3_obj = s3.Object(\n self.staging_bucket, self._artifact_s3_key(run_id, step_key, PICKLED_EVENTS_FILE_NAME)\n )\n\n try:\n events_data = events_s3_obj.get()["Body"].read()\n return 
deserialize_value(pickle.loads(events_data))\n except ClientError as ex:\n # The file might not be there yet, which is fine\n if ex.response["Error"]["Code"] == "NoSuchKey":\n return []\n else:\n raise ex\n\n def _log_logs_from_s3(self, log, emr_step_id):\n """Retrieves the logs from the remote PySpark process that EMR posted to S3 and logs\n them to the given log.\n """\n stdout_log, stderr_log = self.emr_job_runner.retrieve_logs_for_step_id(\n log, self.cluster_id, emr_step_id\n )\n # Since stderr is YARN / Hadoop Log4J output, parse and reformat those log lines for\n # Dagster's logging system.\n records = parse_hadoop_log4j_records(stderr_log)\n for record in records:\n if record.level:\n log.log(\n level=record.level,\n msg="".join(["Spark Driver stderr: ", record.logger, ": ", record.message]),\n )\n else:\n log.debug(f"Spark Driver stderr: {record.message}")\n\n sys.stdout.write(\n "---------- Spark Driver stdout: ----------\\n"\n + stdout_log\n + "\\n"\n + "---------- End of Spark Driver stdout ----------\\n"\n )\n\n def _get_emr_step_def(self, run_id, step_key, solid_name):\n """From the local Dagster instance, construct EMR steps that will kick off execution on a\n remote EMR cluster.\n """\n from dagster_spark.utils import flatten_dict, format_for_cli\n\n action_on_failure = self.action_on_failure\n\n # Execute Solid via spark-submit\n conf = dict(flatten_dict(self.spark_config))\n conf["spark.app.name"] = conf.get("spark.app.name", solid_name)\n\n check.invariant(\n conf.get("spark.master", "yarn") == "yarn",\n desc=(\n "spark.master is configured as %s; cannot set Spark master on EMR to anything "\n 'other than "yarn"'\n )\n % conf.get("spark.master"),\n )\n\n command = (\n [\n EMR_SPARK_HOME + "bin/spark-submit",\n "--master",\n "yarn",\n "--deploy-mode",\n conf.get("spark.submit.deployMode", "client"),\n ]\n + format_for_cli(list(flatten_dict(conf)))\n + [\n "--py-files",\n self._artifact_s3_uri(run_id, step_key, CODE_ZIP_NAME),\n 
self._artifact_s3_uri(run_id, step_key, self._main_file_name()),\n self.staging_bucket,\n self._artifact_s3_key(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n ]\n )\n\n return EmrJobRunner.construct_step_dict_for_command(\n "Execute Solid/Op %s" % solid_name, command, action_on_failure=action_on_failure\n )\n\n def _main_file_name(self):\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self):\n return emr_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _artifact_s3_uri(self, run_id, step_key, filename):\n key = self._artifact_s3_key(run_id, self._sanitize_step_key(step_key), filename)\n return f"s3://{self.staging_bucket}/{key}"\n\n def _artifact_s3_key(self, run_id, step_key, filename):\n return "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n
", "current_page_name": "_modules/dagster_aws/emr/pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.pyspark_step_launcher"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.emr.types

\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\n\nEbsVolumeType = Enum(\n    name="EbsVolumeType", enum_values=[EnumValue("gp2"), EnumValue("io1"), EnumValue("standard")]\n)\n\n\n
[docs]class EmrClusterState(PyEnum):\n Starting = "STARTING"\n Bootstrapping = "BOOTSTRAPPING"\n Running = "RUNNING"\n Waiting = "WAITING"\n Terminating = "TERMINATING"\n Terminated = "TERMINATED"\n TerminatedWithErrors = "TERMINATED_WITH_ERRORS"
\n\n\nEMR_CLUSTER_TERMINATED_STATES = [\n EmrClusterState.Terminating,\n EmrClusterState.Terminated,\n EmrClusterState.TerminatedWithErrors,\n]\n\nEMR_CLUSTER_DONE_STATES = EMR_CLUSTER_TERMINATED_STATES + [EmrClusterState.Waiting]\n\n\n
[docs]class EmrStepState(PyEnum):\n Pending = "PENDING"\n Running = "RUNNING"\n Continue = "CONTINUE"\n Completed = "COMPLETED"\n Cancelled = "CANCELLED"\n Failed = "FAILED"\n Interrupted = "INTERRUPTED"
\n\n\nEmrActionOnFailure = Enum(\n name="EmrActionOnFailure",\n enum_values=[\n EnumValue("TERMINATE_JOB_FLOW"),\n EnumValue("TERMINATE_CLUSTER"),\n EnumValue("CANCEL_AND_WAIT"),\n EnumValue("CONTINUE"),\n ],\n)\n\nEmrAdjustmentType = Enum(\n name="EmrAdjustmentType",\n enum_values=[\n EnumValue("CHANGE_IN_CAPACITY"),\n EnumValue("PERCENT_CHANGE_IN_CAPACITY"),\n EnumValue("EXACT_CAPACITY"),\n ],\n)\n\nEmrComparisonOperator = Enum(\n name="EmrComparisonOperator",\n enum_values=[\n EnumValue("GREATER_THAN_OR_EQUAL"),\n EnumValue("GREATER_THAN"),\n EnumValue("LESS_THAN"),\n EnumValue("LESS_THAN_OR_EQUAL"),\n ],\n)\n\nEmrInstanceRole = Enum(\n name="EmrInstanceRole", enum_values=[EnumValue("MASTER"), EnumValue("CORE"), EnumValue("TASK")]\n)\n\nEmrMarket = Enum(name="EmrMarket", enum_values=[EnumValue("ON_DEMAND"), EnumValue("SPOT")])\n\nEmrRepoUpgradeOnBoot = Enum(\n name="EmrRepoUpgradeOnBoot", enum_values=[EnumValue("SECURITY"), EnumValue("NONE")]\n)\n\nEmrScaleDownBehavior = Enum(\n name="EmrScaleDownBehavior",\n enum_values=[\n EnumValue("TERMINATE_AT_INSTANCE_HOUR"),\n EnumValue("TERMINATE_AT_TASK_COMPLETION"),\n ],\n)\n\nEmrStatistic = Enum(\n name="EmrStatistic",\n enum_values=[\n EnumValue("SAMPLE_COUNT"),\n EnumValue("AVERAGE"),\n EnumValue("SUM"),\n EnumValue("MINIMUM"),\n EnumValue("MAXIMUM"),\n ],\n)\n\nEmrSupportedProducts = Enum(\n name="EmrSupportedProducts", enum_values=[EnumValue("mapr-m3"), EnumValue("mapr-m5")]\n)\n\nEmrTimeoutAction = Enum(\n name="EmrTimeoutAction",\n enum_values=[EnumValue("SWITCH_TO_ON_DEMAND"), EnumValue("TERMINATE_CLUSTER")],\n)\n\nEmrUnit = Enum(\n name="EmrUnit",\n enum_values=[\n EnumValue("NONE"),\n EnumValue("SECONDS"),\n EnumValue("MICRO_SECONDS"),\n EnumValue("MILLI_SECONDS"),\n EnumValue("BYTES"),\n EnumValue("KILO_BYTES"),\n EnumValue("MEGA_BYTES"),\n EnumValue("GIGA_BYTES"),\n EnumValue("TERA_BYTES"),\n EnumValue("BITS"),\n EnumValue("KILO_BITS"),\n EnumValue("MEGA_BITS"),\n EnumValue("GIGA_BITS"),\n 
EnumValue("TERA_BITS"),\n EnumValue("PERCENT"),\n EnumValue("COUNT"),\n EnumValue("BYTES_PER_SECOND"),\n EnumValue("KILO_BYTES_PER_SECOND"),\n EnumValue("MEGA_BYTES_PER_SECOND"),\n EnumValue("GIGA_BYTES_PER_SECOND"),\n EnumValue("TERA_BYTES_PER_SECOND"),\n EnumValue("BITS_PER_SECOND"),\n EnumValue("KILO_BITS_PER_SECOND"),\n EnumValue("MEGA_BITS_PER_SECOND"),\n EnumValue("GIGA_BITS_PER_SECOND"),\n EnumValue("TERA_BITS_PER_SECOND"),\n EnumValue("COUNT_PER_SECOND"),\n ],\n)\n
", "current_page_name": "_modules/dagster_aws/emr/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.emr.types"}}, "redshift": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.redshift.resources

\nimport abc\nfrom contextlib import contextmanager\nfrom logging import Logger\nfrom typing import Any, Dict, Optional, cast\n\nimport psycopg2\nimport psycopg2.extensions\nfrom dagster import (\n    ConfigurableResource,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\nclass RedshiftError(Exception):\n    pass\n\n\nclass BaseRedshiftClient(abc.ABC):\n    @abc.abstractmethod\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        pass\n\n    @abc.abstractmethod\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        pass\n\n\nclass RedshiftClient(BaseRedshiftClient):\n    def __init__(self, conn_args: Dict[str, Any], autocommit: Optional[bool], log: Logger):\n        # Extract parameters from resource config\n        self.conn_args = conn_args\n\n        self.autocommit = autocommit\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Synchronously execute a single query against Redshift. Will return a list of rows, where\n        each row is a tuple of values, e.g. SELECT 1 will return [(1,)].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. 
Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                try:\n                    self.log.info(f"Executing query '{query}'")\n                    cursor.execute(query)\n\n                    if fetch_results and cursor.rowcount > 0:\n                        return cursor.fetchall()\n                    else:\n                        self.log.info("Empty result from query")\n\n                except Exception as e:\n                    # If autocommit is disabled or not set (it is disabled by default), Redshift\n                    # will be in the middle of a transaction at exception time, and because of\n                    # the failure the current transaction will not accept any further queries.\n                    #\n                    # This conn.commit() call closes the open transaction before handing off\n                    # control to the error callback, so that the 
user can issue additional\n                    # queries. Notably, for e.g. pg_last_copy_id() to work, it requires you to\n                    # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                    # things are in a usable state in the error callback.\n                    if not self.autocommit:\n                        conn.commit()\n\n                    if error_callback is not None:\n                        error_callback(e, cursor, self.log)\n                    else:\n                        raise\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Synchronously execute a list of queries against Redshift. Will return a list of list of\n        rows, where each row is a tuple of values, e.g. ['SELECT 1', 'SELECT 1'] will return\n        [[(1,)], [(1,)]].\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n            cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. 
If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        results = []\n        with self._get_conn() as conn:\n            with self._get_cursor(conn, cursor_factory=cursor_factory) as cursor:\n                for query in queries:\n                    try:\n                        self.log.info(f"Executing query '{query}'")\n                        cursor.execute(query)\n\n                        if fetch_results and cursor.rowcount > 0:\n                            results.append(cursor.fetchall())\n                        else:\n                            results.append([])\n                            self.log.info("Empty result from query")\n\n                    except Exception as e:\n                        # If autocommit is disabled or not set (it is disabled by default), Redshift\n                        # will be in the middle of a transaction at exception time, and because of\n                        # the failure the current transaction will not accept any further queries.\n                        #\n                        # This conn.commit() call closes the open transaction before handing off\n                        # control to the error callback, so that the user can issue additional\n                        # queries. Notably, for e.g. 
pg_last_copy_id() to work, it requires you to\n                        # use the same conn/cursor, so you have to do this conn.commit() to ensure\n                        # things are in a usable state in the error callback.\n                        if not self.autocommit:\n                            conn.commit()\n\n                        if error_callback is not None:\n                            error_callback(e, cursor, self.log)\n                        else:\n                            raise\n\n        if fetch_results:\n            return results\n\n    @contextmanager\n    def _get_conn(self):\n        conn = None\n        try:\n            conn = psycopg2.connect(**self.conn_args)\n            yield conn\n        finally:\n            if conn:\n                conn.close()\n\n    @contextmanager\n    def _get_cursor(self, conn, cursor_factory=None):\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n\n        # Could be none, in which case we should respect the connection default. Otherwise\n        # explicitly set to true/false.\n        if self.autocommit is not None:\n            conn.autocommit = self.autocommit\n\n        with conn:\n            with conn.cursor(cursor_factory=cursor_factory) as cursor:\n                yield cursor\n\n            # If autocommit is set, we'll commit after each and every query execution. 
Otherwise, we\n            # want to do a final commit after we're wrapped up executing the full set of one or more\n            # queries.\n            if not self.autocommit:\n                conn.commit()\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use RedshiftClientResource instead.")\nclass RedshiftResource(RedshiftClient):\n    """This class was used by the function-style Redshift resource."""\n\n\nclass FakeRedshiftClient(BaseRedshiftClient):\n    QUERY_RESULT = [(1,)]\n\n    def __init__(self, log: Logger):\n        # Extract parameters from resource config\n\n        self.log = log\n\n    def execute_query(self, query, fetch_results=False, cursor_factory=None, error_callback=None):\n        """Fake for execute_query; returns [self.QUERY_RESULT].\n\n        Args:\n            query (str): The query to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[Tuple[Any, ...]]]: Results of the query, as a list of tuples, when\n                fetch_results is set. 
Otherwise return None.\n        """\n        check.str_param(query, "query")\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return self.QUERY_RESULT\n\n    def execute_queries(\n        self, queries, fetch_results=False, cursor_factory=None, error_callback=None\n    ):\n        """Fake for execute_queries; returns [self.QUERY_RESULT] * 3.\n\n        Args:\n            queries (List[str]): The queries to execute.\n            fetch_results (Optional[bool]): Whether to return the results of executing the query.\n                Defaults to False, in which case the query will be executed without retrieving the\n                results.\n            cursor_factory (Optional[:py:class:`psycopg2.extensions.cursor`]): An alternative\n                cursor_factory; defaults to None. Will be used when constructing the cursor.\n            error_callback (Optional[Callable[[Exception, Cursor, DagsterLogManager], None]]): A\n                callback function, invoked when an exception is encountered during query execution;\n                this is intended to support executing additional queries to provide diagnostic\n                information, e.g. by querying ``stl_load_errors`` using ``pg_last_copy_id()``. If no\n                function is provided, exceptions during query execution will be raised directly.\n\n        Returns:\n            Optional[List[List[Tuple[Any, ...]]]]: Results of the query, as a list of list of\n                tuples, when fetch_results is set. 
Otherwise return None.\n        """\n        check.list_param(queries, "queries", of_type=str)\n        check.bool_param(fetch_results, "fetch_results")\n        check.opt_class_param(\n            cursor_factory, "cursor_factory", superclass=psycopg2.extensions.cursor\n        )\n        check.opt_callable_param(error_callback, "error_callback")\n\n        for query in queries:\n            self.log.info(f"Executing query '{query}'")\n        if fetch_results:\n            return [self.QUERY_RESULT] * 3\n\n\n@deprecated(breaking_version="2.0", additional_warn_text="Use FakeRedshiftClientResource instead.")\nclass FakeRedshiftResource(FakeRedshiftClient):\n    """This class was used by the function-style fake Redshift resource."""\n\n\n
[docs]class RedshiftClientResource(ConfigurableResource):\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import Definitions, asset, EnvVar\n from dagster_aws.redshift import RedshiftClientResource\n\n @asset\n def example_redshift_asset(context, redshift: RedshiftClientResource):\n redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = RedshiftClientResource(\n host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n port=5439,\n user='dagster',\n password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n database='dev',\n )\n\n defs = Definitions(\n assets=[example_redshift_asset],\n resources={'redshift': redshift_configured},\n )\n\n """\n\n host: str = Field(description="Redshift host")\n port: int = Field(default=5439, description="Redshift port")\n user: Optional[str] = Field(default=None, description="Username for Redshift connection")\n password: Optional[str] = Field(default=None, description="Password for Redshift connection")\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use USE DATABASE to change"\n " the database."\n ),\n )\n autocommit: Optional[bool] = Field(default=None, description="Whether to autocommit queries")\n connect_timeout: int = Field(\n default=5, description="Timeout for connection to Redshift cluster. Defaults to 5 seconds."\n )\n sslmode: str = Field(\n default="require",\n description=(\n "SSL mode to use. 
See the Redshift documentation for reference:"\n " https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> RedshiftClient:\n conn_args = {\n k: getattr(self, k, None)\n for k in (\n "host",\n "port",\n "user",\n "password",\n "database",\n "connect_timeout",\n "sslmode",\n )\n if getattr(self, k, None) is not None\n }\n\n return RedshiftClient(conn_args, self.autocommit, get_dagster_logger())
\n\n\n
[docs]class FakeRedshiftClientResource(RedshiftClientResource):\n def get_client(self) -> FakeRedshiftClient:\n return FakeRedshiftClient(get_dagster_logger())
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=RedshiftClientResource.to_config_schema(),\n description="Resource for connecting to the Redshift data warehouse",\n)\ndef redshift_resource(context) -> RedshiftClient:\n """This resource enables connecting to a Redshift cluster and issuing queries against that\n cluster.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, op\n from dagster_aws.redshift import redshift_resource\n\n @op(required_resource_keys={'redshift'})\n def example_redshift_op(context):\n return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\n redshift_configured = redshift_resource.configured({\n 'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n 'port': 5439,\n 'user': 'dagster',\n 'password': 'dagster',\n 'database': 'dev',\n })\n context = build_op_context(resources={'redshift': redshift_configured})\n assert example_redshift_op(context) == [(1,)]\n\n """\n return RedshiftClientResource.from_resource_context(context).get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=FakeRedshiftClientResource.to_config_schema(),\n description=(\n "Fake resource for connecting to the Redshift data warehouse. Usage is identical "\n "to the real redshift_resource. Will always return [(1,)] for the single query case and "\n "[[(1,)], [(1,)], [(1,)]] for the multi query case."\n ),\n)\ndef fake_redshift_resource(context) -> FakeRedshiftClient:\n return cast(\n FakeRedshiftClient,\n FakeRedshiftClientResource.from_resource_context(context).get_client(),\n )
\n
", "current_page_name": "_modules/dagster_aws/redshift/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.redshift.resources"}}, "s3": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Mapping, Optional, Sequence\n\nimport boto3\nimport dagster._seven as seven\nfrom botocore.errorfactory import ClientError\nfrom dagster import (\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.captured_log_manager import CapturedLogContext\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nPOLLING_INTERVAL = 5\n\n\n
[docs]class S3ComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs compute function stdout and stderr to S3.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_aws.s3.compute_log_manager\n class: S3ComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n use_ssl: true\n verify: true\n verify_cert_path: "/path/to/cert/bundle.pem"\n endpoint_url: "http://alternate-s3-host.io"\n skip_empty_files: true\n upload_interval: 30\n upload_extra_args:\n ServerSideEncryption: "AES256"\n show_url_only: false\n region: "us-west-1"\n\n Args:\n bucket (str): The name of the s3 bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n use_ssl (Optional[bool]): Whether or not to use SSL. Default True.\n verify (Optional[bool]): Whether or not to verify SSL certificates. Default True.\n verify_cert_path (Optional[str]): A filename of the CA cert bundle to use. Only used if\n `verify` set to False.\n endpoint_url (Optional[str]): Override for the S3 endpoint url.\n skip_empty_files: (Optional[bool]): Skip upload of empty log files.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to S3. By default, will only upload when the capture is complete.\n upload_extra_args: (Optional[dict]): Extra args for S3 file upload\n show_url_only: (Optional[bool]): Only show the URL of the log file in the UI, instead of fetching and displaying the full content. Default False.\n region: (Optional[str]): The region of the S3 bucket. 
If not specified, will use the default region of the AWS session.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n use_ssl=True,\n verify=True,\n verify_cert_path=None,\n endpoint_url=None,\n skip_empty_files=False,\n upload_interval=None,\n upload_extra_args=None,\n show_url_only=False,\n region=None,\n ):\n _verify = False if not verify else verify_cert_path\n self._s3_session = boto3.resource(\n "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url\n ).meta.client\n self._s3_bucket = check.str_param(bucket, "bucket")\n self._s3_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n check.opt_dict_param(upload_extra_args, "upload_extra_args")\n self._upload_extra_args = upload_extra_args\n self._show_url_only = show_url_only\n if region is None:\n # if unspecified, use the current session name\n self._region = self._s3_session.meta.region_name\n else:\n self._region = region\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "use_ssl": Field(bool, is_required=False, default_value=True),\n 
"verify": Field(bool, is_required=False, default_value=True),\n "verify_cert_path": Field(StringSource, is_required=False),\n "endpoint_url": Field(StringSource, is_required=False),\n "skip_empty_files": Field(bool, is_required=False, default_value=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n "upload_extra_args": Field(\n Permissive(), is_required=False, description="Extra args for S3 file upload"\n ),\n "show_url_only": Field(bool, is_required=False, default_value=False),\n "region": Field(StringSource, is_required=False),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return S3ComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _s3_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._s3_prefix, "storage", *namespace, filename]\n return "/".join(paths) # s3 path delimiter\n\n @contextmanager\n def capture_logs(self, log_key: Sequence[str]) -> Iterator[CapturedLogContext]:\n with super().capture_logs(log_key) as local_context:\n if not self._show_url_only:\n yield local_context\n else:\n out_key = self._s3_key(log_key, ComputeIOType.STDOUT)\n err_key = self._s3_key(log_key, ComputeIOType.STDERR)\n s3_base = f"https://s3.console.aws.amazon.com/s3/object/{self._s3_bucket}?region={self._region}"\n yield CapturedLogContext(\n local_context.log_key,\n 
external_stdout_url=f"{s3_base}&prefix={out_key}",\n external_stderr_url=f"{s3_base}&prefix={err_key}",\n )\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n\n s3_keys_to_remove = None\n if log_key:\n s3_keys_to_remove = [\n self._s3_key(log_key, ComputeIOType.STDOUT),\n self._s3_key(log_key, ComputeIOType.STDERR),\n self._s3_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._s3_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n elif prefix:\n # add the trailing '' to make sure that ['a'] does not match ['apple']\n s3_prefix = "/".join([self._s3_prefix, "storage", *prefix, ""])\n matching = self._s3_session.list_objects(Bucket=self._s3_bucket, Prefix=s3_prefix)\n s3_keys_to_remove = [obj["Key"] for obj in matching.get("Contents", [])]\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if s3_keys_to_remove:\n to_delete = [{"Key": key} for key in s3_keys_to_remove]\n self._s3_session.delete_objects(Bucket=self._s3_bucket, Delete={"Objects": to_delete})\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n s3_key = self._s3_key(log_key, io_type)\n return self._s3_session.generate_presigned_url(\n ClientMethod="get_object", Params={"Bucket": self._s3_bucket, "Key": s3_key}\n )\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n s3_key = self._s3_key(log_key, io_type)\n return f"s3://{self._s3_bucket}/{s3_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n try: # https://stackoverflow.com/a/38376288/14656695\n self._s3_session.head_object(Bucket=self._s3_bucket, 
Key=s3_key)\n except ClientError:\n return False\n return True\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if (self._skip_empty_files or partial) and os.stat(path).st_size == 0:\n return\n\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n extra_args = {\n "ContentType": "text/plain",\n **(self._upload_extra_args if self._upload_extra_args else {}),\n }\n self._s3_session.upload_fileobj(data, self._s3_bucket, s3_key, ExtraArgs=extra_args)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self._local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n s3_key = self._s3_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._s3_session.download_fileobj(self._s3_bucket, s3_key, fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_aws/s3/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class S3FileHandle(FileHandle):\n """A reference to a file on S3."""\n\n def __init__(self, s3_bucket: str, s3_key: str):\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_key = check.str_param(s3_key, "s3_key")\n\n @property\n def s3_bucket(self) -> str:\n """str: The name of the S3 bucket."""\n return self._s3_bucket\n\n @property\n def s3_key(self) -> str:\n """str: The S3 key."""\n return self._s3_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's S3 URL."""\n return self.s3_path\n\n @property\n def s3_path(self) -> str:\n """str: The file's S3 URL."""\n return f"s3://{self.s3_bucket}/{self.s3_key}"
\n\n\nclass S3FileManager(FileManager):\n def __init__(self, s3_session, s3_bucket, s3_base_key):\n self._s3_session = s3_session\n self._s3_bucket = check.str_param(s3_bucket, "s3_bucket")\n self._s3_base_key = check.str_param(s3_base_key, "s3_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n self._s3_session.download_file(\n Bucket=file_handle.s3_bucket, Key=file_handle.s3_key, Filename=temp_name\n )\n self._local_handle_cache[file_handle.s3_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", S3FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.s3_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.s3_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n s3_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n self._s3_session.put_object(Body=file_obj, Bucket=self._s3_bucket, Key=s3_key)\n return S3FileHandle(self._s3_bucket, s3_key)\n\n def get_full_key(self, file_key):\n return f"{self._s3_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_aws/s3/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.io_manager

\nimport io\nimport pickle\nfrom typing import Any, Dict, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    MetadataValue,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import S3Resource\n\n\nclass PickledObjectS3IOManager(UPathIOManager):\n    def __init__(\n        self,\n        s3_bucket: str,\n        s3_session: Any,\n        s3_prefix: Optional[str] = None,\n    ):\n        self.bucket = check.str_param(s3_bucket, "s3_bucket")\n        check.opt_str_param(s3_prefix, "s3_prefix")\n        self.s3 = s3_session\n        self.s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, MaxKeys=1)\n        base_path = UPath(s3_prefix) if s3_prefix else None\n        super().__init__(base_path=base_path)\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        try:\n            s3_obj = self.s3.get_object(Bucket=self.bucket, Key=str(path))["Body"].read()\n            return pickle.loads(s3_obj)\n        except self.s3.exceptions.NoSuchKey:\n            raise FileNotFoundError(f"Could not find file {path} in S3 bucket {self.bucket}")\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing S3 object: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        pickled_obj_bytes = io.BytesIO(pickled_obj)\n        self.s3.upload_fileobj(pickled_obj_bytes, self.bucket, str(path))\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n       
     self.s3.get_object(Bucket=self.bucket, Key=str(path))\n        except self.s3.exceptions.NoSuchKey:\n            return False\n        return True\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading S3 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing S3 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        self.s3.delete_object(Bucket=self.bucket, Key=str(path))\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in S3\n        return None\n\n    def get_metadata(self, context: OutputContext, obj: Any) -> Dict[str, MetadataValue]:\n        path = self._get_path(context)\n        return {"uri": MetadataValue.path(self._uri_for_path(path))}\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        return UPath("storage", super().get_op_output_relative_path(context))\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"s3://{self.bucket}/{path}"\n\n\n
[docs]class S3PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_aws.s3 import S3PickleIOManager, S3Resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": S3PickleIOManager(\n s3_resource=S3Resource(),\n s3_bucket="my-cool-bucket",\n s3_prefix="my-cool-prefix",\n )\n }\n )\n\n """\n\n s3_resource: ResourceDependency[S3Resource]\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @cached_method\n def inner_io_manager(self) -> PickledObjectS3IOManager:\n return PickledObjectS3IOManager(\n s3_bucket=self.s3_bucket,\n s3_session=self.s3_resource.get_client(),\n s3_prefix=self.s3_prefix,\n )\n\n def load_input(self, context: InputContext) -> 
Any:\n return self.inner_io_manager().load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n return self.inner_io_manager().handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use S3PickleIOManager instead.",\n)\nclass ConfigurablePickledObjectS3IOManager(S3PickleIOManager):\n """Renamed to S3PickleIOManager. See S3PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=S3PickleIOManager.to_config_schema(),\n required_resource_keys={"s3"},\n)\ndef s3_pickle_io_manager(init_context):\n """Persistent IO manager using S3 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for S3 and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n @job(\n resource_defs={\n "io_manager": s3_pickle_io_manager.configured(\n {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n ),\n "s3": s3_resource,\n },\n )\n def my_job():\n ...\n """\n s3_session = init_context.resources.s3\n s3_bucket = init_context.resource_config["s3_bucket"]\n s3_prefix = init_context.resource_config.get("s3_prefix") # s3_prefix is optional\n pickled_io_manager = PickledObjectS3IOManager(s3_bucket, s3_session, s3_prefix=s3_prefix)\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_aws/s3/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.ops

\nfrom typing import Any, Generator, Mapping\n\nfrom dagster import (\n    AssetMaterialization,\n    Field,\n    FileHandle,\n    In,\n    MetadataValue,\n    Out,\n    Output,\n    StringSource,\n    _check as check,\n    dagster_type_loader,\n    op,\n)\nfrom dagster._core.types.dagster_type import PythonObjectDagsterType\n\nfrom .file_manager import S3FileHandle\n\n\ndef dict_with_fields(name: str, fields: Mapping[str, object]):\n    check.str_param(name, "name")\n    check.mapping_param(fields, "fields", key_type=str)\n    field_names = set(fields.keys())\n\n    @dagster_type_loader(fields)\n    def _input_schema(_context, value):\n        check.dict_param(value, "value")\n        check.param_invariant(set(value.keys()) == field_names, "value")\n        return value\n\n    class _DictWithSchema(PythonObjectDagsterType):\n        def __init__(self):\n            super(_DictWithSchema, self).__init__(python_type=dict, name=name, loader=_input_schema)\n\n    return _DictWithSchema()\n\n\nS3Coordinate = dict_with_fields(\n    "S3Coordinate",\n    fields={\n        "bucket": Field(StringSource, description="S3 bucket name"),\n        "key": Field(StringSource, description="S3 key name"),\n    },\n)\n\n\ndef last_key(key: str) -> str:\n    if "/" not in key:\n        return key\n    comps = key.split("/")\n    return comps[-1]\n\n\n@op(\n    config_schema={\n        "Bucket": Field(\n            StringSource, description="The name of the bucket to upload to.", is_required=True\n        ),\n        "Key": Field(\n            StringSource, description="The name of the key to upload to.", is_required=True\n        ),\n    },\n    ins={"file_handle": In(FileHandle, description="The file to upload.")},\n    out={"s3_file_handle": Out(S3FileHandle)},\n    description="""Take a file handle and upload it to s3. 
Returns an S3FileHandle.""",\n    required_resource_keys={"s3", "file_manager"},\n)\ndef file_handle_to_s3(context, file_handle) -> Generator[Any, None, None]:\n    bucket = context.op_config["Bucket"]\n    key = context.op_config["Key"]\n\n    file_manager = context.resources.file_manager\n    s3 = context.resources.s3\n\n    with file_manager.read(file_handle, "rb") as fileobj:\n        s3.upload_fileobj(fileobj, bucket, key)\n        s3_file_handle = S3FileHandle(bucket, key)\n\n        yield AssetMaterialization(\n            asset_key=s3_file_handle.s3_path,\n            metadata={last_key(key): MetadataValue.path(s3_file_handle.s3_path)},\n        )\n\n        yield Output(value=s3_file_handle, output_name="s3_file_handle")\n
", "current_page_name": "_modules/dagster_aws/s3/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.s3.resources

\nfrom typing import Any, Optional, TypeVar\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom .file_manager import S3FileManager\nfrom .utils import construct_s3_client\n\nT = TypeVar("T")\n\n\nclass ResourceWithS3Configuration(ConfigurableResource):\n    use_unsigned_session: bool = Field(\n        default=False, description="Specifies whether to use an unsigned S3 session."\n    )\n    region_name: Optional[str] = Field(\n        default=None, description="Specifies a custom region for the S3 session."\n    )\n    endpoint_url: Optional[str] = Field(\n        default=None, description="Specifies a custom endpoint for the S3 session."\n    )\n    max_attempts: int = Field(\n        default=5,\n        description=(\n            "This provides Boto3's retry handler with a value of maximum retry attempts, where the"\n            " initial call counts toward the max_attempts value that you provide."\n        ),\n    )\n    profile_name: Optional[str] = Field(\n        default=None, description="Specifies a profile to connect that session."\n    )\n    use_ssl: bool = Field(\n        default=True, description="Whether or not to use SSL. By default, SSL is used."\n    )\n    verify: Optional[str] = Field(\n        default=None,\n        description=(\n            "Whether or not to verify SSL certificates. 
By default SSL certificates are verified."\n            " You can also specify this argument if you want to use a different CA cert bundle than"\n            " the one used by botocore."\n        ),\n    )\n    aws_access_key_id: Optional[str] = Field(\n        default=None, description="AWS access key ID to use when creating the boto3 session."\n    )\n    aws_secret_access_key: Optional[str] = Field(\n        default=None, description="AWS secret access key to use when creating the boto3 session."\n    )\n    aws_session_token: str = Field(\n        default=None, description="AWS session token to use when creating the boto3 session."\n    )\n\n\n
[docs]class S3Resource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op, Definitions\n from dagster_aws.s3 import S3Resource\n\n @op\n def example_s3_op(s3: S3Resource):\n return s3.get_client().list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job\n def example_job():\n example_s3_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={'s3': S3Resource(region_name='us-west-1')}\n )\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> Any:\n return construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=S3Resource.to_config_schema())\ndef s3_resource(context) -> Any:\n """Resource that gives access to S3.\n\n The underlying S3 session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is an S3 client, an instance of `botocore.client.S3`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.s3 import s3_resource\n\n @op(required_resource_keys={'s3'})\n def example_s3_op(context):\n return context.resources.s3.list_objects_v2(\n Bucket='my-bucket',\n Prefix='some-key'\n )\n\n @job(resource_defs={'s3': s3_resource})\n def example_job():\n example_s3_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 's3': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n s3:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n # through the ordinary boto credential chain.\n use_unsigned_session: false\n # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: True\n endpoint_url: "http://localhost"\n # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for S3 session. Default is default\n # profile as specified in ~/.aws/credentials file\n use_ssl: true\n # Optional[bool]: Whether or not to use SSL. By default, SSL is used.\n verify: None\n # Optional[str]: Whether or not to verify SSL certificates. 
By default SSL certificates are verified.\n # You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore."\n aws_access_key_id: None\n # Optional[str]: The access key to use when creating the client.\n aws_secret_access_key: None\n # Optional[str]: The secret key to use when creating the client.\n aws_session_token: None\n # Optional[str]: The session token to use when creating the client.\n """\n return S3Resource.from_resource_context(context).get_client()
\n\n\n
[docs]class S3FileManagerResource(ResourceWithS3Configuration, IAttachDifferentObjectToOpContext):\n s3_bucket: str = Field(description="S3 bucket to use for the file manager.")\n s3_prefix: str = Field(\n default="dagster", description="Prefix to use for the S3 bucket for this file manager."\n )\n\n def get_client(self) -> S3FileManager:\n return S3FileManager(\n s3_session=construct_s3_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n endpoint_url=self.endpoint_url,\n use_unsigned_session=self.use_unsigned_session,\n profile_name=self.profile_name,\n use_ssl=self.use_ssl,\n verify=self.verify,\n aws_access_key_id=self.aws_access_key_id,\n aws_secret_access_key=self.aws_secret_access_key,\n aws_session_token=self.aws_session_token,\n ),\n s3_bucket=self.s3_bucket,\n s3_base_key=self.s3_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=S3FileManagerResource.to_config_schema(),\n)\ndef s3_file_manager(context) -> S3FileManager:\n """FileManager that provides abstract access to S3.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return S3FileManagerResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_aws/s3/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.s3.resources"}}, "secretsmanager": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_aws.secretsmanager.resources

\nfrom contextlib import contextmanager\nfrom typing import TYPE_CHECKING, Dict, Generator, List, Optional, cast\n\nfrom dagster import (\n    Field as LegacyDagsterField,\n    resource,\n)\nfrom dagster._config.field_utils import Shape\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.test_utils import environ\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom dagster_aws.utils import ResourceWithBoto3Configuration\n\nfrom .secrets import construct_secretsmanager_client, get_secrets_from_arns, get_tagged_secrets\n\nif TYPE_CHECKING:\n    import botocore\n\n\n
[docs]class SecretsManagerResource(ResourceWithBoto3Configuration):\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import SecretsManagerResource\n\n @op\n def example_secretsmanager_op(secretsmanager: SecretsManagerResource):\n return secretsmanager.get_client().get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job\n def example_job():\n example_secretsmanager_op()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secretsmanager': SecretsManagerResource(\n region_name='us-west-1'\n )\n }\n )\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> "botocore.client.SecretsManager":\n return construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(SecretsManagerResource.to_config_schema())\ndef secretsmanager_resource(context) -> "botocore.client.SecretsManager":\n """Resource that gives access to AWS SecretsManager.\n\n The underlying SecretsManager session is created by calling\n :py:func:`boto3.session.Session(profile_name) <boto3:boto3.session>`.\n The returned resource object is a SecretsManager client, an instance of `botocore.client.SecretsManager`.\n\n Example:\n .. code-block:: python\n\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_resource\n\n @op(required_resource_keys={'secretsmanager'})\n def example_secretsmanager_op(context):\n return context.resources.secretsmanager.get_secret_value(\n SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n )\n\n @job(resource_defs={'secretsmanager': secretsmanager_resource})\n def example_job():\n example_secretsmanager_op()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secretsmanager': {\n 'config': {\n 'region_name': 'us-west-1',\n }\n }\n }\n }\n )\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n # profile as specified in ~/.aws/credentials file\n\n """\n return SecretsManagerResource.from_resource_context(context).get_client()
\n\n\n
[docs]class SecretsManagerSecretsResource(ResourceWithBoto3Configuration):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op, ResourceParam\n from dagster_aws.secretsmanager import SecretsManagerSecretsResource\n\n @op\n def example_secretsmanager_secrets_op(secrets: SecretsManagerSecretsResource):\n return secrets.fetch_secrets().get("my-secret-name")\n\n @op\n def example_secretsmanager_secrets_op_2(secrets: SecretsManagerSecretsResource):\n with secrets.secrets_in_environment():\n return os.getenv("my-other-secret-name")\n\n @job\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n defs = Definitions(\n jobs=[example_job],\n resources={\n 'secrets': SecretsManagerSecretsResource(\n region_name='us-west-1',\n secrets_tag="dagster",\n add_to_environment=True,\n )\n }\n )\n\n Note that your ops must also declare that they require this resource with or it will not be initialized\n for the execution of their compute functions.\n """\n\n secrets: List[str] = Field(\n default=[], description="An array of AWS Secrets Manager secrets arns to fetch."\n )\n secrets_tag: Optional[str] = Field(\n default=None,\n description="AWS Secrets Manager secrets with this tag will be fetched and made available.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def secrets_in_environment(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Generator[Dict[str, str], None, None]:\n """Yields a dict which maps selected SecretsManager secrets to their string values. 
Also\n sets chosen secrets as environment variables.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n secrets_manager = construct_secretsmanager_client(\n max_attempts=self.max_attempts,\n region_name=self.region_name,\n profile_name=self.profile_name,\n )\n\n secrets_tag_to_fetch = secrets_tag if secrets_tag is not None else self.secrets_tag\n secrets_to_fetch = secrets if secrets is not None else self.secrets\n\n secret_arns = merge_dicts(\n (\n get_tagged_secrets(secrets_manager, [secrets_tag_to_fetch])\n if secrets_tag_to_fetch\n else {}\n ),\n get_secrets_from_arns(secrets_manager, secrets_to_fetch),\n )\n\n secrets_map = {\n name: secrets_manager.get_secret_value(SecretId=arn).get("SecretString")\n for name, arn in secret_arns.items()\n }\n with environ(secrets_map):\n yield secrets_map\n\n def fetch_secrets(\n self,\n secrets: Optional[List[str]] = None,\n secrets_tag: Optional[str] = None,\n ) -> Dict[str, str]:\n """Fetches secrets from AWS Secrets Manager and returns them as a dict.\n\n Args:\n secrets (Optional[List[str]]): An array of AWS Secrets Manager secrets arns to fetch.\n Note that this will override the secrets specified in the resource config.\n secrets_tag (Optional[str]): AWS Secrets Manager secrets with this tag will be fetched\n and made available. Note that this will override the secrets_tag specified in the\n resource config.\n """\n with self.secrets_in_environment(secrets=secrets, secrets_tag=secrets_tag) as secret_values:\n return secret_values
\n\n\nLEGACY_SECRETSMANAGER_SECRETS_SCHEMA = {\n **cast(Shape, SecretsManagerSecretsResource.to_config_schema().as_field().config_type).fields,\n "add_to_environment": LegacyDagsterField(\n bool,\n default_value=False,\n description="Whether to add the secrets to the environment. Defaults to False.",\n ),\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=LEGACY_SECRETSMANAGER_SECRETS_SCHEMA)\n@contextmanager\ndef secretsmanager_secrets_resource(context):\n """Resource that provides a dict which maps selected SecretsManager secrets to\n their string values. Also optionally sets chosen secrets as environment variables.\n\n Example:\n .. code-block:: python\n\n import os\n from dagster import build_op_context, job, op\n from dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op(context):\n return context.resources.secrets.get("my-secret-name")\n\n @op(required_resource_keys={'secrets'})\n def example_secretsmanager_secrets_op_2(context):\n return os.getenv("my-other-secret-name")\n\n @job(resource_defs={'secrets': secretsmanager_secrets_resource})\n def example_job():\n example_secretsmanager_secrets_op()\n example_secretsmanager_secrets_op_2()\n\n example_job.execute_in_process(\n run_config={\n 'resources': {\n 'secrets': {\n 'config': {\n 'region_name': 'us-west-1',\n 'secrets_tag': 'dagster',\n 'add_to_environment': True,\n }\n }\n }\n }\n )\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may configure this resource as follows:\n\n .. code-block:: YAML\n\n resources:\n secretsmanager:\n config:\n region_name: "us-west-1"\n # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n # through the ordinary boto credential chain.\n profile_name: "dev"\n # Optional[str]: Specifies a custom profile for SecretsManager session. 
Default is default\n # profile as specified in ~/.aws/credentials file\n secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n secrets_tag: "dagster"\n # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n # from SecretsManager.\n add_to_environment: true\n # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n # to false.\n\n """\n add_to_environment = context.resource_config.get("add_to_environment", False)\n if add_to_environment:\n with SecretsManagerSecretsResource.from_resource_context(\n context\n ).secrets_in_environment() as secrets:\n yield secrets\n else:\n yield SecretsManagerSecretsResource.from_resource_context(context).fetch_secrets()
\n
", "current_page_name": "_modules/dagster_aws/secretsmanager/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_aws.secretsmanager.resources"}}}, "dagster_azure": {"adls2": {"fake_adls2_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.fake_adls2_resource

\nimport io\nimport random\nfrom typing import Any, Dict, Optional\nfrom unittest import mock\n\nfrom dagster import resource\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\n\nfrom dagster_azure.blob import FakeBlobServiceClient\n\nfrom .utils import ResourceNotFoundError\n\n\n@dagster_maintained_resource\n@resource({"account_name": str})\ndef fake_adls2_resource(context):\n    return FakeADLS2Resource(account_name=context.resource_config["account_name"])\n\n\n
[docs]class FakeADLS2Resource(ConfigurableResource):\n """Stateful mock of an ADLS2Resource for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n account_name: str\n storage_account: Optional[str] = None\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def adls2_client(self) -> "FakeADLS2ServiceClient":\n return FakeADLS2ServiceClient(self.account_name)\n\n @property\n @cached_method\n def blob_client(self) -> FakeBlobServiceClient:\n return FakeBlobServiceClient(self.account_name)\n\n @property\n def lease_client_constructor(self) -> Any:\n return FakeLeaseClient
\n\n\nclass FakeLeaseClient:\n def __init__(self, client):\n self.client = client\n self.id = None\n\n # client needs a ref to self to check if a given lease is valid\n self.client._lease = self # noqa: SLF001\n\n def acquire(self, lease_duration=-1):\n if self.id is None:\n self.id = random.randint(0, 2**9)\n else:\n raise Exception("Lease already held")\n\n def release(self):\n self.id = None\n\n def is_valid(self, lease):\n if self.id is None:\n # no lease is held so any operation is valid\n return True\n return lease == self.id\n\n\nclass FakeADLS2ServiceClient:\n """Stateful mock of an ADLS2 service client for testing.\n\n Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.\n """\n\n def __init__(self, account_name, credential="fake-creds"):\n self._account_name = account_name\n self._credential = mock.MagicMock()\n self._credential.account_key = credential\n self._file_systems = {}\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def credential(self):\n return self._credential\n\n @property\n def file_systems(self):\n return self._file_systems\n\n def get_file_system_client(self, file_system):\n return self._file_systems.setdefault(\n file_system, FakeADLS2FilesystemClient(self.account_name, file_system)\n )\n\n def get_file_client(self, file_system, file_path):\n return self.get_file_system_client(file_system).get_file_client(file_path)\n\n\nclass FakeADLS2FilesystemClient:\n """Stateful mock of an ADLS2 filesystem client for testing."""\n\n def __init__(self, account_name, file_system_name):\n self._file_system: Dict[str, FakeADLS2FileClient] = {}\n self._account_name = account_name\n self._file_system_name = file_system_name\n\n @property\n def account_name(self):\n return self._account_name\n\n @property\n def file_system_name(self):\n return self._file_system_name\n\n def keys(self):\n return self._file_system.keys()\n\n def get_file_system_properties(self):\n return {"account_name": 
self.account_name, "file_system_name": self.file_system_name}\n\n def has_file(self, path):\n return bool(self._file_system.get(path))\n\n def get_file_client(self, file_path):\n # pass fileclient a ref to self and its name so the file can delete itself\n self._file_system.setdefault(file_path, FakeADLS2FileClient(self, file_path))\n return self._file_system[file_path]\n\n def create_file(self, file):\n # pass fileclient a ref to self and the file's name so the file can delete itself by\n # accessing the self._file_system dict\n self._file_system.setdefault(file, FakeADLS2FileClient(fs_client=self, name=file))\n return self._file_system[file]\n\n def delete_file(self, file):\n for k in list(self._file_system.keys()):\n if k.startswith(file):\n del self._file_system[k]\n\n\nclass FakeADLS2FileClient:\n """Stateful mock of an ADLS2 file client for testing."""\n\n def __init__(self, name, fs_client):\n self.name = name\n self.contents = None\n self._lease = None\n self.fs_client = fs_client\n\n @property\n def lease(self):\n return self._lease if self._lease is None else self._lease.id\n\n def get_file_properties(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n lease_id = None if self._lease is None else self._lease.id\n return {"lease": lease_id}\n\n def upload_data(self, contents, overwrite=False, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n if self.contents is not None or overwrite is True:\n if isinstance(contents, str):\n self.contents = contents.encode("utf8")\n elif isinstance(contents, io.BytesIO):\n self.contents = contents.read()\n elif isinstance(contents, io.StringIO):\n self.contents = contents.read().encode("utf8")\n elif isinstance(contents, bytes):\n self.contents = contents\n else:\n self.contents = contents\n\n def download_file(self):\n if self.contents is None:\n raise ResourceNotFoundError("File does not exist!")\n return 
FakeADLS2FileDownloader(contents=self.contents)\n\n def delete_file(self, lease=None):\n if self._lease is not None:\n if not self._lease.is_valid(lease):\n raise Exception("Invalid lease!")\n self.fs_client.delete_file(self.name)\n\n\nclass FakeADLS2FileDownloader:\n """Mock of an ADLS2 file downloader for testing."""\n\n def __init__(self, contents):\n self.contents = contents\n\n def readall(self):\n return self.contents\n\n def readinto(self, fileobj):\n fileobj.write(self.contents)\n
", "current_page_name": "_modules/dagster_azure/adls2/fake_adls2_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.fake_adls2_resource"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\n\n\n
[docs]class ADLS2FileHandle(FileHandle):\n """A reference to a file on ADLS2."""\n\n def __init__(self, account: str, file_system: str, key: str):\n self._account = check.str_param(account, "account")\n self._file_system = check.str_param(file_system, "file_system")\n self._key = check.str_param(key, "key")\n\n @property\n def account(self):\n """str: The name of the ADLS2 account."""\n return self._account\n\n @property\n def file_system(self):\n """str: The name of the ADLS2 file system."""\n return self._file_system\n\n @property\n def key(self):\n """str: The ADLS2 key."""\n return self._key\n\n @property\n def path_desc(self):\n """str: The file's ADLS2 URL."""\n return self.adls2_path\n\n @property\n def adls2_path(self):\n """str: The file's ADLS2 URL."""\n return f"adfss://{self.file_system}@{self.account}.dfs.core.windows.net/{self.key}"
\n\n\nclass ADLS2FileManager(FileManager):\n def __init__(self, adls2_client, file_system, prefix):\n self._client = adls2_client\n self._file_system = check.str_param(file_system, "file_system")\n self._prefix = check.str_param(prefix, "prefix")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n file = self._client.get_file_client(\n file_system=file_handle.file_system,\n file_path=file_handle.key,\n )\n download = file.download_file()\n with open(temp_name, "wb") as file_obj:\n download.readinto(file_obj)\n self._local_handle_cache[file_handle.adls2_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", ADLS2FileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if "b" in mode else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.adls2_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.adls2_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None):\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None):\n check_file_like_obj(file_obj)\n adls2_key = self.get_full_key(str(uuid.uuid4()) + (("." 
+ ext) if ext is not None else ""))\n adls2_file = self._client.get_file_client(\n file_system=self._file_system, file_path=adls2_key\n )\n adls2_file.upload_data(file_obj, overwrite=True)\n return ADLS2FileHandle(self._client.account_name, self._file_system, adls2_key)\n\n def get_full_key(self, file_key):\n return f"{self._prefix}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_azure/adls2/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.io_manager

\nimport pickle\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Union\n\nfrom dagster import (\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._config.pythonic_config import ConfigurableIOManager\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom dagster_azure.adls2.resources import ADLS2Resource\nfrom dagster_azure.adls2.utils import ResourceNotFoundError\n\n_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectADLS2IOManager(UPathIOManager):\n    def __init__(\n        self,\n        file_system: Any,\n        adls2_client: Any,\n        blob_client: Any,\n        lease_client_constructor: Any,\n        prefix: str = "dagster",\n    ):\n        self.adls2_client = adls2_client\n        self.file_system_client = self.adls2_client.get_file_system_client(file_system)\n        # We also need a blob client to handle copying as ADLS doesn't have a copy API yet\n        self.blob_client = blob_client\n        self.blob_container_client = self.blob_client.get_container_client(file_system)\n        self.prefix = check.str_param(prefix, "prefix")\n\n        self.lease_client_constructor = lease_client_constructor\n        self.lease_duration = _LEASE_DURATION\n        self.file_system_client.get_file_system_properties()\n        super().__init__(base_path=UPath(self.prefix))\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def 
get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading ADLS2 object from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing ADLS2 object at: {self._uri_for_path(path)}"\n\n    def unlink(self, path: UPath) -> None:\n        file_client = self.file_system_client.get_file_client(str(path))\n        with self._acquire_lease(file_client, is_rm=True) as lease:\n            file_client.delete_file(lease=lease, recursive=True)\n\n    def path_exists(self, path: UPath) -> bool:\n        try:\n            self.file_system_client.get_file_client(str(path)).get_file_properties()\n        except ResourceNotFoundError:\n            return False\n        return True\n\n    def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str:\n        return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format(\n            protocol=protocol,\n            filesystem=self.file_system_client.file_system_name,\n            account=self.file_system_client.account_name,\n            key=path,\n        )\n\n    @contextmanager\n    def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]:\n        lease_client = self.lease_client_constructor(client=client)\n        try:\n            lease_client.acquire(lease_duration=self.lease_duration)\n            yield lease_client.id\n        finally:\n            # cannot release a lease on a file that no longer exists, so need to check\n            if not is_rm:\n                lease_client.release()\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        if context.dagster_type.typing_type == type(None):\n            return None\n        file = self.file_system_client.get_file_client(str(path))\n        stream = file.download_file()\n        return pickle.loads(stream.readall())\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if 
self.path_exists(path):\n            context.log.warning(f"Removing existing ADLS2 key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n        file = self.file_system_client.create_file(str(path))\n        with self._acquire_lease(file) as lease:\n            file.upload_data(pickled_obj, lease=lease, overwrite=True)\n\n\n
[docs]class ADLS2PickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": ADLS2PickleIOManager(\n adls2_file_system="my-cool-fs",\n adls2_prefix="my-cool-prefix"\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n\n adls2: ResourceDependency[ADLS2Resource]\n adls2_file_system: str = Field(description="ADLS Gen2 file system name.")\n adls2_prefix: str = Field(\n default="dagster", description="ADLS Gen2 file system prefix to write to."\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectADLS2IOManager:\n return PickledObjectADLS2IOManager(\n self.adls2_file_system,\n self.adls2.adls2_client,\n self.adls2.blob_client,\n self.adls2.lease_client_constructor,\n self.adls2_prefix,\n )\n\n def load_input(self, context: "InputContext") -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: "OutputContext", obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use GCSPickleIOManager instead.",\n)\nclass ConfigurablePickledObjectADLS2IOManager(ADLS2PickleIOManager):\n """Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=ADLS2PickleIOManager.to_config_schema(),\n required_resource_keys={"adls2"},\n)\ndef adls2_pickle_io_manager(init_context):\n """Persistent IO manager using Azure Data Lake Storage Gen2 for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for ADLS and the backing\n container.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at "<base_dir>/<asset_key>". If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of "/my/base/path", an asset with key\n `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory\n with path "/my/base/path/one/two/".\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return df[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n @job(\n resource_defs={\n "io_manager": adls2_pickle_io_manager.configured(\n {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n ),\n "adls2": adls2_resource,\n },\n )\n def my_job():\n ...\n """\n adls_resource = init_context.resources.adls2\n adls2_client = adls_resource.adls2_client\n blob_client = adls_resource.blob_client\n lease_client = adls_resource.lease_client_constructor\n pickled_io_manager = PickledObjectADLS2IOManager(\n init_context.resource_config["adls2_file_system"],\n adls2_client,\n blob_client,\n lease_client,\n init_context.resource_config.get("adls2_prefix"),\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_azure/adls2/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.adls2.resources

\nfrom typing import Any, Dict, Union\n\nfrom azure.identity import DefaultAzureCredential\nfrom azure.storage.filedatalake import DataLakeLeaseClient\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    Field as DagsterField,\n    Permissive,\n    Selector,\n    StringSource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\nfrom typing_extensions import Literal\n\nfrom dagster_azure.blob.utils import BlobServiceClient, create_blob_client\n\nfrom .file_manager import ADLS2FileManager\nfrom .utils import DataLakeServiceClient, create_adls2_client\n\n\nclass ADLS2SASToken(Config):\n    credential_type: Literal["sas"] = "sas"\n    token: str\n\n\nclass ADLS2Key(Config):\n    credential_type: Literal["key"] = "key"\n    key: str\n\n\nclass ADLS2DefaultAzureCredential(Config):\n    credential_type: Literal["default_azure_credential"] = "default_azure_credential"\n    kwargs: Dict[str, Any]\n\n\nclass ADLS2BaseResource(ConfigurableResource):\n    storage_account: str = Field(description="The storage account name.")\n    credential: Union[ADLS2SASToken, ADLS2Key, ADLS2DefaultAzureCredential] = Field(\n        discriminator="credential_type", description="The credentials with which to authenticate."\n    )\n\n\nDEFAULT_AZURE_CREDENTIAL_CONFIG = DagsterField(\n    Permissive(\n        description="Uses DefaultAzureCredential to authenticate and passed as keyword arguments",\n    )\n)\n\nADLS2_CLIENT_CONFIG = {\n    "storage_account": DagsterField(StringSource, description="The storage account name."),\n    "credential": DagsterField(\n        Selector(\n            {\n                "sas": DagsterField(StringSource, description="SAS token for the account."),\n                "key": DagsterField(StringSource, description="Shared Access Key for the account."),\n               
 "DefaultAzureCredential": DEFAULT_AZURE_CREDENTIAL_CONFIG,\n            }\n        ),\n        description="The credentials with which to authenticate.",\n    ),\n}\n\n\n
[docs]class ADLS2Resource(ADLS2BaseResource):\n """Resource containing clients to access Azure Data Lake Storage Gen2.\n\n Contains a client for both the Data Lake and Blob APIs, to work around the limitations\n of each.\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _raw_credential(self) -> Any:\n if isinstance(self.credential, ADLS2Key):\n return self.credential.key\n elif isinstance(self.credential, ADLS2SASToken):\n return self.credential.token\n else:\n return DefaultAzureCredential(**self.credential.kwargs)\n\n @property\n @cached_method\n def adls2_client(self) -> DataLakeServiceClient:\n return create_adls2_client(self.storage_account, self._raw_credential)\n\n @property\n @cached_method\n def blob_client(self) -> BlobServiceClient:\n return create_blob_client(self.storage_account, self._raw_credential)\n\n @property\n def lease_client_constructor(self) -> Any:\n return DataLakeLeaseClient
\n\n\n# Due to a limitation of the discriminated union type, we can't directly mirror these old\n# config fields in the new resource config. Instead, we'll just use the old config fields\n# to construct the new config and then use that to construct the resource.\n
[docs]@dagster_maintained_resource\n@resource(ADLS2_CLIENT_CONFIG)\ndef adls2_resource(context):\n """Resource that gives ops access to Azure Data Lake Storage Gen2.\n\n The underlying client is a :py:class:`~azure.storage.filedatalake.DataLakeServiceClient`.\n\n Attach this resource definition to a :py:class:`~dagster.JobDefinition` in order to make it\n available to your ops.\n\n Example:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_azure.adls2 import adls2_resource\n\n @op(required_resource_keys={'adls2'})\n def example_adls2_op(context):\n return list(context.resources.adls2.adls2_client.list_file_systems())\n\n @job(resource_defs={"adls2": adls2_resource})\n def my_job():\n example_adls2_op()\n\n Note that your ops must also declare that they require this resource with\n `required_resource_keys`, or it will not be initialized for the execution of their compute\n functions.\n\n You may pass credentials to this resource using either a SAS token, a key or by passing the\n `DefaultAzureCredential` object.\n\n .. code-block:: YAML\n\n resources:\n adls2:\n config:\n storage_account: my_storage_account\n # str: The storage account name.\n credential:\n sas: my_sas_token\n # str: the SAS token for the account.\n key:\n env: AZURE_DATA_LAKE_STORAGE_KEY\n # str: The shared access key for the account.\n DefaultAzureCredential: {}\n # dict: The keyword arguments used for DefaultAzureCredential\n # or leave the object empty for no arguments\n DefaultAzureCredential:\n exclude_environment_credential: true\n\n """\n return _adls2_resource_from_config(context.resource_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n merge_dicts(\n ADLS2_CLIENT_CONFIG,\n {\n "adls2_file_system": DagsterField(\n StringSource, description="ADLS Gen2 file system name"\n ),\n "adls2_prefix": DagsterField(StringSource, is_required=False, default_value="dagster"),\n },\n )\n)\ndef adls2_file_manager(context):\n """FileManager that provides abstract access to ADLS2.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n adls2_client = _adls2_resource_from_config(context.resource_config).adls2_client\n\n return ADLS2FileManager(\n adls2_client=adls2_client,\n file_system=context.resource_config["adls2_file_system"],\n prefix=context.resource_config["adls2_prefix"],\n )
\n\n\ndef _adls2_resource_from_config(config) -> ADLS2Resource:\n """Args:\n config: A configuration containing the fields in ADLS2_CLIENT_CONFIG.\n\n Returns: An adls2 client.\n """\n storage_account = config["storage_account"]\n if "DefaultAzureCredential" in config["credential"]:\n credential = ADLS2DefaultAzureCredential(\n kwargs=config["credential"]["DefaultAzureCredential"]\n )\n elif "sas" in config["credential"]:\n credential = ADLS2SASToken(token=config["credential"]["sas"])\n else:\n credential = ADLS2Key(key=config["credential"]["key"])\n\n return ADLS2Resource(storage_account=storage_account, credential=credential)\n
", "current_page_name": "_modules/dagster_azure/adls2/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.adls2.resources"}}, "blob": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_azure.blob.compute_log_manager

\nimport os\nfrom contextlib import contextmanager\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom azure.identity import DefaultAzureCredential\nfrom dagster import (\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n)\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom typing_extensions import Self\n\nfrom .utils import create_blob_client, generate_blob_sas\n\n\n
[docs]class AzureBlobComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to Azure Blob Storage.\n\n This is also compatible with Azure Data Lake Storage.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_azure.blob.compute_log_manager\n class: AzureBlobComputeLogManager\n config:\n storage_account: my-storage-account\n container: my-container\n credential: sas-token-or-secret-key\n default_azure_credential:\n exclude_environment_credential: true\n prefix: "dagster-test-"\n local_dir: "/tmp/cool"\n upload_interval: 30\n\n Args:\n storage_account (str): The storage account name to which to log.\n container (str): The container (or ADLS2 filesystem) to which to log.\n secret_key (Optional[str]): Secret key for the storage account. SAS tokens are not\n supported because we need a secret key to generate a SAS token for a download URL.\n default_azure_credential (Optional[dict]): Use and configure DefaultAzureCredential.\n Cannot be used with sas token or secret key config.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files blob storage. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when newed up from config.\n """\n\n def __init__(\n self,\n storage_account,\n container,\n secret_key=None,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n upload_interval=None,\n default_azure_credential=None,\n ):\n self._storage_account = check.str_param(storage_account, "storage_account")\n self._container = check.str_param(container, "container")\n self._blob_prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n self._default_azure_credential = check.opt_dict_param(\n default_azure_credential, "default_azure_credential"\n )\n check.opt_str_param(secret_key, "secret_key")\n check.invariant(\n secret_key is not None or default_azure_credential is not None,\n "Missing config: need to provide one of secret_key or default_azure_credential",\n )\n\n if default_azure_credential is None:\n self._blob_client = create_blob_client(storage_account, secret_key)\n else:\n credential = DefaultAzureCredential(**self._default_azure_credential)\n self._blob_client = create_blob_client(storage_account, credential)\n\n self._container_client = self._blob_client.get_container_client(container)\n self._download_urls = {}\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @contextmanager\n def _watch_logs(self, dagster_run, step_key=None):\n # proxy watching to the local compute log manager, interacting with the filesystem\n with self.local_manager._watch_logs(dagster_run, 
step_key): # noqa: SLF001\n yield\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "storage_account": StringSource,\n "container": StringSource,\n "secret_key": Field(StringSource, is_required=False),\n "default_azure_credential": Field(\n Noneable(Permissive(description="keyword arguments for DefaultAzureCredential")),\n is_required=False,\n default_value=None,\n ),\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return AzureBlobComputeLogManager(inst_data=inst_data, **config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _blob_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._blob_prefix, "storage", *namespace, filename]\n return "/".join(paths) # blob path delimiter\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self.local_manager.delete_logs(log_key=log_key, prefix=prefix)\n if log_key:\n prefix_path = "/".join([self._blob_prefix, "storage", *log_key])\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n prefix_path = "/".join([self._blob_prefix, "storage", *prefix, 
""])\n else:\n prefix_path = None\n\n blob_list = {\n b.name for b in list(self._container_client.list_blobs(name_starts_with=prefix_path))\n }\n\n to_remove = None\n if log_key:\n # filter to the known set of keys\n known_keys = [\n self._blob_key(log_key, ComputeIOType.STDOUT),\n self._blob_key(log_key, ComputeIOType.STDERR),\n self._blob_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._blob_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n to_remove = [key for key in known_keys if key in blob_list]\n elif prefix:\n to_remove = list(blob_list)\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n if to_remove:\n self._container_client.delete_blobs(*to_remove)\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n blob_key = self._blob_key(log_key, io_type)\n if blob_key in self._download_urls:\n return self._download_urls[blob_key]\n blob = self._container_client.get_blob_client(blob_key)\n sas = generate_blob_sas(\n self._storage_account,\n self._container,\n blob_key,\n account_key=self._blob_client.credential.account_key,\n )\n url = blob.url + sas\n self._download_urls[blob_key] = url\n return url\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n\n blob_key = self._blob_key(log_key, io_type)\n return f"https://{self._storage_account}.blob.core.windows.net/{self._container}/{blob_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n blob_objects = self._container_client.list_blobs(blob_key)\n exact_matches = [blob for blob in blob_objects if blob.name == blob_key]\n return len(exact_matches) > 0\n\n def 
upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n blob = self._container_client.get_blob_client(blob_key)\n blob.upload_blob(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n blob_key = self._blob_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n blob = self._container_client.get_blob_client(blob_key)\n blob.download_blob().readinto(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)
\n
", "current_page_name": "_modules/dagster_azure/blob/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_azure.blob.compute_log_manager"}}}, "dagster_celery": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery.executor

\nfrom dagster import (\n    Executor,\n    Field,\n    Noneable,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes import pack_value\n\nfrom .config import DEFAULT_CONFIG, dict_wrapper\nfrom .defaults import broker_url, result_backend\n\nCELERY_CONFIG = {\n    "broker": Field(\n        Noneable(StringSource),\n        is_required=False,\n        description=(\n            "The URL of the Celery broker. Default: "\n            "'pyamqp://guest@{os.getenv('DAGSTER_CELERY_BROKER_HOST',"\n            "'localhost')}//'."\n        ),\n    ),\n    "backend": Field(\n        Noneable(StringSource),\n        is_required=False,\n        default_value="rpc://",\n        description="The URL of the Celery results backend. Default: 'rpc://'.",\n    ),\n    "include": Field(\n        [str], is_required=False, description="List of modules every worker should import"\n    ),\n    "config_source": Field(\n        Noneable(Permissive()),\n        is_required=False,\n        description="Additional settings for the Celery app.",\n    ),\n    "retries": get_retries_config(),\n}\n\n\n
[docs]@executor(\n name="celery",\n config_schema=CELERY_CONFIG,\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_executor(init_context):\n """Celery-based executor.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when solid executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery import celery_executor\n\n @job(executor_def=celery_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... 
# argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n """\n return CeleryExecutor(\n broker=init_context.executor_config.get("broker"),\n backend=init_context.executor_config.get("backend"),\n config_source=init_context.executor_config.get("config_source"),\n include=init_context.executor_config.get("include"),\n retries=RetryMode.from_config(init_context.executor_config["retries"]),\n )
\n\n\ndef _submit_task(app, plan_context, step, queue, priority, known_state):\n from .tasks import create_task\n\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True, # Not actually checked by the celery task\n )\n\n task = create_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n executable_dict=plan_context.reconstructable_job.to_dict(),\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_plan",\n )\n\n\nclass CeleryExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self._retries = check.inst_param(retries, "retries", RetryMode)\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from .core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task\n )\n\n @staticmethod\n def for_cli(broker=None, backend=None, include=None, config_source=None):\n return CeleryExecutor(\n retries=RetryMode(RetryMode.DISABLED),\n broker=broker,\n backend=backend,\n include=include,\n config_source=config_source,\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": 
self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n
", "current_page_name": "_modules/dagster_celery/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery.executor"}}, "dagster_celery_docker": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_docker.executor

\nimport os\n\nimport docker.client\nfrom dagster import (\n    DagsterInstance,\n    Executor,\n    Field,\n    Permissive,\n    StringSource,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER, core_celery_execution_loop\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_celery.executor import CELERY_CONFIG\n\nCELERY_DOCKER_CONFIG_KEY = "celery-docker"\n\n\ndef celery_docker_config():\n    additional_config = {\n        "docker": Field(\n            {\n                "image": Field(\n                    StringSource,\n                    is_required=False,\n                    description="The docker image to be used for step execution.",\n                ),\n                "registry": Field(\n                    {\n                        "url": Field(StringSource),\n                        "username": Field(StringSource),\n                        "password": Field(StringSource),\n                    },\n                    is_required=False,\n                    description="Information for using a non local/public docker registry",\n                ),\n                "env_vars": Field(\n                    [str],\n                    is_required=False,\n                    description=(\n                        "The list of environment variables names to forward from the celery worker"\n                        " in to the docker container"\n                    ),\n             
   ),\n                "network": Field(\n                    str,\n                    is_required=False,\n                    description=(\n                        "Name of the network this container will be connected to at creation time"\n                    ),\n                ),\n                "container_kwargs": Field(\n                    Permissive(),\n                    is_required=False,\n                    description="Additional keyword args for the docker container",\n                ),\n            },\n            is_required=True,\n            description="The configuration for interacting with docker in the celery worker.",\n        ),\n    }\n\n    cfg = merge_dicts(CELERY_CONFIG, additional_config)\n    return cfg\n\n\n
[docs]@executor(\n name=CELERY_DOCKER_CONFIG_KEY,\n config_schema=celery_docker_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_docker_executor(init_context):\n """Celery-based executor which launches tasks in docker containers.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute jobs\n with variations on these settings.\n\n To use the `celery_docker_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_celery_docker.executor import celery_docker_executor\n\n @job(executor_def=celery_docker_executor)\n def celery_enabled_job():\n pass\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n docker:\n image: 'my_repo.com/image_name:latest'\n registry:\n url: 'my_repo.com'\n username: 'my_user'\n password: {env: 'DOCKER_PASSWORD'}\n env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n container_kwargs: # keyword args to be passed to the container. 
example:\n volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_docker_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_docker.app` argument.\n """\n exc_cfg = init_context.executor_config\n\n return CeleryDockerExecutor(\n broker=exc_cfg.get("broker"),\n backend=exc_cfg.get("backend"),\n config_source=exc_cfg.get("config_source"),\n include=exc_cfg.get("include"),\n retries=RetryMode.from_config(exc_cfg.get("retries")),\n docker_config=exc_cfg.get("docker"),\n )
\n\n\nclass CeleryDockerExecutor(Executor):\n def __init__(\n self,\n retries,\n docker_config,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n ):\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.docker_config = check.dict_param(docker_config, "docker_config")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_docker\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_docker(app, plan_context, step, queue, priority, known_state):\n execute_step_args = ExecuteStepArgs(\n job_origin=plan_context.reconstructable_job.get_python_origin(),\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n print_serialized_events=True,\n )\n\n task = create_docker_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n docker_config=plan_context.executor.docker_config,\n )\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_docker",\n )\n\n\ndef create_docker_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_docker", **task_kwargs)\n def _execute_step_docker(\n self,\n execute_step_args_packed,\n 
docker_config,\n ):\n """Run step execution in a Docker container."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n ),\n as_type=ExecuteStepArgs,\n )\n\n check.dict_param(docker_config, "docker_config")\n\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)\n\n docker_image = (\n docker_config["image"]\n if docker_config.get("image")\n else dagster_run.job_code_origin.repository_origin.container_image\n )\n\n if not docker_image:\n raise Exception("No docker image specified by either the job or the repository")\n\n client = docker.client.from_env()\n\n if docker_config.get("registry"):\n client.login(\n registry=docker_config["registry"]["url"],\n username=docker_config["registry"]["username"],\n password=docker_config["registry"]["password"],\n )\n\n # Post event for starting execution\n engine_event = instance.report_engine_event(\n f"Executing steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(\n {\n "Step keys": step_keys_str,\n "Image": docker_image,\n "Celery worker": self.request.hostname,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n\n serialized_events = [serialize_value(engine_event)]\n\n docker_env = {}\n if docker_config.get("env_vars"):\n docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}\n\n container_kwargs = check.opt_dict_param(\n docker_config.get("container_kwargs"), "container_kwargs", key_type=str\n )\n\n # set defaults for detach and auto_remove\n container_kwargs["detach"] = container_kwargs.get("detach", False)\n container_kwargs["auto_remove"] = 
container_kwargs.get("auto_remove", True)\n\n # if environment variables are provided via container_kwargs, merge with env_vars\n if container_kwargs.get("environment") is not None:\n e_vars = container_kwargs.get("environment")\n if isinstance(e_vars, dict):\n docker_env.update(e_vars)\n else:\n for v in e_vars:\n key, val = v.split("=")\n docker_env[key] = val\n del container_kwargs["environment"]\n\n try:\n docker_response = client.containers.run(\n docker_image,\n command=execute_step_args.get_command_args(),\n # pass through this worker's environment for things like AWS creds etc.\n environment=docker_env,\n network=docker_config.get("network", None),\n **container_kwargs,\n )\n\n res = docker_response.decode("utf-8")\n except docker.errors.ContainerError as err:\n metadata = {"Job image": docker_image}\n if err.stderr is not None:\n metadata["Docker stderr"] = err.stderr\n\n instance.report_engine_event(\n f"Failed to run steps {step_keys_str} in Docker container {docker_image}",\n dagster_run,\n EngineEventData(metadata),\n CeleryDockerExecutor,\n step_key=execute_step_args.step_keys_to_execute[0],\n )\n raise\n else:\n if res is None:\n raise Exception("No response from execute_step in CeleryDockerExecutor")\n\n events = filter_dagster_events_from_cli_logs(res.split("\\n"))\n serialized_events += [serialize_value(event) for event in events]\n\n return serialized_events\n\n return _execute_step_docker\n
", "current_page_name": "_modules/dagster_celery_docker/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_docker.executor"}}, "dagster_celery_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.executor

\nimport logging\nimport os\nimport sys\nimport time\n\nimport kubernetes\nfrom dagster import (\n    DagsterEvent,\n    DagsterEventType,\n    DagsterInstance,\n    Executor,\n    _check as check,\n    executor,\n    multiple_process_executor_requirements,\n)\nfrom dagster._cli.api import ExecuteStepArgs\nfrom dagster._core.errors import DagsterUnmetExecutorRequirementsError\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.events.utils import filter_dagster_events_from_cli_logs\nfrom dagster._core.execution.plan.objects import StepFailureData, UserFailureData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._serdes import pack_value, serialize_value, unpack_value\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster_celery.config import DEFAULT_CONFIG, dict_wrapper\nfrom dagster_celery.core_execution_loop import DELEGATE_MARKER\nfrom dagster_celery.defaults import broker_url, result_backend\nfrom dagster_k8s import DagsterK8sJobConfig, construct_dagster_k8s_job\nfrom dagster_k8s.client import (\n    DagsterK8sAPIRetryLimitExceeded,\n    DagsterK8sError,\n    DagsterK8sJobStatusException,\n    DagsterK8sTimeoutError,\n    DagsterK8sUnrecoverableAPIError,\n    DagsterKubernetesClient,\n)\nfrom dagster_k8s.job import (\n    UserDefinedDagsterK8sConfig,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\nfrom .launcher import CeleryK8sRunLauncher\n\n\n
[docs]@executor(\n name=CELERY_K8S_CONFIG_KEY,\n config_schema=celery_k8s_executor_config(),\n requirements=multiple_process_executor_requirements(),\n)\ndef celery_k8s_job_executor(init_context):\n """Celery-based executor which launches tasks as Kubernetes Jobs.\n\n The Celery executor exposes config settings for the underlying Celery app under\n the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced\n in Celery version 4.0 and the object constructed from config will be passed to the\n :py:class:`celery.Celery` constructor as its ``config_source`` argument.\n (See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)\n\n The executor also exposes the ``broker``, `backend`, and ``include`` arguments to the\n :py:class:`celery.Celery` constructor.\n\n In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use\n Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently\n modified, but that when op executions are especially fast or slow, or when there are\n different requirements around idempotence or retry, it may make sense to execute dagster jobs\n with variations on these settings.\n\n To use the `celery_k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-celery-k8s/dagster_celery_k8s_tests/example_celery_mode_def.py\n :language: python\n\n Then you can configure the executor as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_image: 'my_repo.com/image_name:latest'\n job_namespace: 'some-namespace'\n broker: 'pyamqp://guest@localhost//' # Optional[str]: The URL of the Celery broker\n backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n config_source: # Dict[str, Any]: Any additional parameters to pass to the\n #... # Celery workers. 
This dict will be passed as the `config_source`\n #... # argument of celery.Celery().\n\n Note that the YAML you provide here must align with the configuration with which the Celery\n workers on which you hope to run were started. If, for example, you point the executor at a\n different broker than the one your workers are listening to, the workers will never be able to\n pick up tasks for execution.\n\n In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\n commands must be invoked with the `-A dagster_celery_k8s.app` argument.\n """\n run_launcher = init_context.instance.run_launcher\n exc_cfg = init_context.executor_config\n\n if not isinstance(run_launcher, CeleryK8sRunLauncher):\n raise DagsterUnmetExecutorRequirementsError(\n "This engine is only compatible with a CeleryK8sRunLauncher; configure the "\n "CeleryK8sRunLauncher on your instance to use it.",\n )\n\n job_config = run_launcher.get_k8s_job_config(\n job_image=exc_cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"), exc_config=exc_cfg\n )\n\n # Set on the instance but overrideable here\n broker = run_launcher.broker or exc_cfg.get("broker")\n backend = run_launcher.backend or exc_cfg.get("backend")\n config_source = run_launcher.config_source or exc_cfg.get("config_source")\n include = run_launcher.include or exc_cfg.get("include")\n retries = run_launcher.retries or RetryMode.from_config(exc_cfg.get("retries"))\n\n return CeleryK8sJobExecutor(\n broker=broker,\n backend=backend,\n config_source=config_source,\n include=include,\n retries=retries,\n job_config=job_config,\n job_namespace=exc_cfg.get("job_namespace", run_launcher.job_namespace),\n load_incluster_config=exc_cfg.get("load_incluster_config"),\n kubeconfig_file=exc_cfg.get("kubeconfig_file"),\n repo_location_name=exc_cfg.get("repo_location_name"),\n job_wait_timeout=exc_cfg.get("job_wait_timeout"),\n )
\n\n\nclass CeleryK8sJobExecutor(Executor):\n def __init__(\n self,\n retries,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n job_config=None,\n job_namespace=None,\n load_incluster_config=False,\n kubeconfig_file=None,\n repo_location_name=None,\n job_wait_timeout=None,\n ):\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n self._retries = check.inst_param(retries, "retries", RetryMode)\n self.broker = check.opt_str_param(broker, "broker", default=broker_url)\n self.backend = check.opt_str_param(backend, "backend", default=result_backend)\n self.include = check.opt_list_param(include, "include", of_type=str)\n self.config_source = dict_wrapper(\n dict(DEFAULT_CONFIG, **check.opt_dict_param(config_source, "config_source"))\n )\n self.job_config = check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = check.bool_param(\n load_incluster_config, "load_incluster_config"\n )\n\n self.kubeconfig_file = check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n self.repo_location_name = check.opt_str_param(repo_location_name, "repo_location_name")\n self.job_wait_timeout = check.float_param(job_wait_timeout, "job_wait_timeout")\n\n @property\n def retries(self):\n return self._retries\n\n def execute(self, plan_context, execution_plan):\n from dagster_celery.core_execution_loop import core_celery_execution_loop\n\n return core_celery_execution_loop(\n plan_context, execution_plan, step_execution_fn=_submit_task_k8s_job\n )\n\n def app_args(self):\n return {\n "broker": self.broker,\n "backend": self.backend,\n "include": self.include,\n "config_source": self.config_source,\n "retries": self.retries,\n }\n\n\ndef _submit_task_k8s_job(app, plan_context, step, queue, 
priority, known_state):\n user_defined_k8s_config = get_user_defined_k8s_config(step.tags)\n\n job_origin = plan_context.reconstructable_job.get_python_origin()\n\n execute_step_args = ExecuteStepArgs(\n job_origin=job_origin,\n run_id=plan_context.dagster_run.run_id,\n step_keys_to_execute=[step.key],\n instance_ref=plan_context.instance.get_ref(),\n retry_mode=plan_context.executor.retries.for_inner_plan(),\n known_state=known_state,\n should_verify_step=True,\n print_serialized_events=True,\n )\n\n job_config = plan_context.executor.job_config\n if not job_config.job_image:\n job_config = job_config.with_image(job_origin.repository_origin.container_image)\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the dagster job")\n\n task = create_k8s_job_task(app)\n task_signature = task.si(\n execute_step_args_packed=pack_value(execute_step_args),\n job_config_dict=job_config.to_dict(),\n job_namespace=plan_context.executor.job_namespace,\n user_defined_k8s_config_dict=user_defined_k8s_config.to_dict(),\n load_incluster_config=plan_context.executor.load_incluster_config,\n job_wait_timeout=plan_context.executor.job_wait_timeout,\n kubeconfig_file=plan_context.executor.kubeconfig_file,\n )\n\n return task_signature.apply_async(\n priority=priority,\n queue=queue,\n routing_key=f"{queue}.execute_step_k8s_job",\n )\n\n\ndef construct_step_failure_event_and_handle(dagster_run, step_key, err, instance):\n step_failure_event = DagsterEvent(\n event_type_value=DagsterEventType.STEP_FAILURE.value,\n job_name=dagster_run.job_name,\n step_key=step_key,\n event_specific_data=StepFailureData(\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n user_failure_data=UserFailureData(label="K8sError"),\n ),\n )\n event_record = EventLogEntry(\n user_message=str(err),\n level=logging.ERROR,\n job_name=dagster_run.job_name,\n run_id=dagster_run.run_id,\n error_info=None,\n step_key=step_key,\n timestamp=time.time(),\n 
dagster_event=step_failure_event,\n )\n instance.handle_new_event(event_record)\n return step_failure_event\n\n\ndef create_k8s_job_task(celery_app, **task_kwargs):\n @celery_app.task(bind=True, name="execute_step_k8s_job", **task_kwargs)\n def _execute_step_k8s_job(\n self,\n execute_step_args_packed,\n job_config_dict,\n job_namespace,\n load_incluster_config,\n job_wait_timeout,\n user_defined_k8s_config_dict=None,\n kubeconfig_file=None,\n ):\n """Run step execution in a K8s job pod."""\n execute_step_args = unpack_value(\n check.dict_param(\n execute_step_args_packed,\n "execute_step_args_packed",\n )\n )\n check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)\n check.invariant(\n len(execute_step_args.step_keys_to_execute) == 1,\n "Celery K8s task executor can only execute 1 step at a time",\n )\n\n # Celery will serialize this as a list\n job_config = DagsterK8sJobConfig.from_dict(job_config_dict)\n check.inst_param(job_config, "job_config", DagsterK8sJobConfig)\n check.str_param(job_namespace, "job_namespace")\n\n check.bool_param(load_incluster_config, "load_incluster_config")\n\n user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(\n user_defined_k8s_config_dict\n )\n check.opt_inst_param(\n user_defined_k8s_config,\n "user_defined_k8s_config",\n UserDefinedDagsterK8sConfig,\n )\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n\n # For when launched via DinD or running the cluster\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n api_client = DagsterKubernetesClient.production_client()\n instance = DagsterInstance.from_ref(execute_step_args.instance_ref)\n dagster_run = instance.get_run_by_id(execute_step_args.run_id)\n\n check.inst(\n dagster_run,\n DagsterRun,\n f"Could not load run {execute_step_args.run_id}",\n )\n step_key = execute_step_args.step_keys_to_execute[0]\n\n celery_worker_name = self.request.hostname\n 
celery_pod_name = os.environ.get("HOSTNAME")\n instance.report_engine_event(\n f"Task for step {step_key} picked up by Celery",\n dagster_run,\n EngineEventData(\n {\n "Celery worker name": celery_worker_name,\n "Celery worker Kubernetes Pod name": celery_pod_name,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n if dagster_run.status != DagsterRunStatus.STARTED:\n instance.report_engine_event(\n "Not scheduling step because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Ensure we stay below k8s name length limits\n k8s_name_key = get_k8s_job_name(execute_step_args.run_id, step_key)\n\n retry_state = execute_step_args.known_state.get_retry_state()\n\n if retry_state.get_attempt_count(step_key):\n attempt_number = retry_state.get_attempt_count(step_key)\n job_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n pod_name = "dagster-step-%s-%d" % (k8s_name_key, attempt_number)\n else:\n job_name = "dagster-step-%s" % (k8s_name_key)\n pod_name = "dagster-step-%s" % (k8s_name_key)\n\n args = execute_step_args.get_command_args()\n\n labels = {\n "dagster/job": dagster_run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": execute_step_args.run_id,\n }\n if dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config,\n args,\n job_name,\n user_defined_k8s_config,\n pod_name,\n component="step_worker",\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": dagster_run.job_name,\n },\n {"name": "DAGSTER_RUN_STEP_KEY", "value": step_key},\n ],\n )\n\n # Running list of events generated from this task execution\n events = []\n\n # Post event for starting execution\n job_name = job.metadata.name\n engine_event = instance.report_engine_event(\n 
f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Job image": job_config.job_image,\n "Image pull policy": job_config.image_pull_policy,\n "Image pull secrets": str(job_config.image_pull_secrets),\n "Service account name": str(job_config.service_account_name),\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n # validated above that step_keys is length 1, and it is not possible to use ETH or\n # execution plan in this function (Celery K8s workers should not access to user code)\n step_key=step_key,\n )\n events.append(engine_event)\n try:\n api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n except kubernetes.client.rest.ApiException as e:\n if e.reason == "Conflict":\n # There is an existing job with the same name so proceed and see if the existing job succeeded\n instance.report_engine_event(\n "Did not create Kubernetes job {} for step {} since job name already "\n "exists, proceeding with existing job.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n },\n marker_end=DELEGATE_MARKER,\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n else:\n instance.report_engine_event(\n "Encountered unexpected error while creating Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n api_client.wait_for_job_success(\n job_name=job_name,\n namespace=job_namespace,\n instance=instance,\n run_id=execute_step_args.run_id,\n wait_timeout=job_wait_timeout,\n )\n except (DagsterK8sError, DagsterK8sTimeoutError) as err:\n step_failure_event = construct_step_failure_event_and_handle(\n dagster_run, step_key, err, 
instance=instance\n )\n events.append(step_failure_event)\n except DagsterK8sJobStatusException:\n instance.report_engine_event(\n "Terminating Kubernetes Job because dagster run status is not STARTED",\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n "Kubernetes Job name": job_name,\n "Kubernetes Job namespace": job_namespace,\n }\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n api_client.delete_job(job_name=job_name, namespace=job_namespace)\n return []\n except (\n DagsterK8sUnrecoverableAPIError,\n DagsterK8sAPIRetryLimitExceeded,\n # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in\n # a retry boundary. We still catch it here just in case we missed one so that we can\n # report it to the event log\n kubernetes.client.rest.ApiException,\n ):\n instance.report_engine_event(\n "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n try:\n pod_names = api_client.get_pod_names_in_job(job_name, namespace=job_namespace)\n except kubernetes.client.rest.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "\n "exiting.".format(job_name, step_key),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n return []\n\n # Post engine event for log retrieval\n engine_event = instance.report_engine_event(\n "Retrieving logs from Kubernetes Job pods",\n dagster_run,\n EngineEventData({"Pod names": "\\n".join(pod_names)}),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n events.append(engine_event)\n\n logs = []\n for pod_name in pod_names:\n try:\n 
raw_logs = api_client.retrieve_pod_logs(pod_name, namespace=job_namespace)\n logs += raw_logs.split("\\n")\n except kubernetes.client.exceptions.ApiException:\n instance.report_engine_event(\n "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "\n "Pod name {} for step {}. Will attempt to continue with other pods.".format(\n job_name, pod_name, step_key\n ),\n dagster_run,\n EngineEventData(\n {\n "Step key": step_key,\n },\n error=serializable_error_info_from_exc_info(sys.exc_info()),\n ),\n CeleryK8sJobExecutor,\n step_key=step_key,\n )\n\n events += filter_dagster_events_from_cli_logs(logs)\n serialized_events = [serialize_value(event) for event in events]\n return serialized_events\n\n return _execute_step_k8s_job\n
", "current_page_name": "_modules/dagster_celery_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_celery_k8s.launcher

\nimport sys\nfrom typing import Optional, cast\n\nimport kubernetes\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom dagster._config import process_config, resolve_to_config_type\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.launcher import LaunchRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom dagster._utils.merger import merge_dicts\nfrom dagster_k8s.client import DagsterKubernetesClient\nfrom dagster_k8s.job import (\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_job_name_from_run_id,\n    get_user_defined_k8s_config,\n)\n\nfrom .config import CELERY_K8S_CONFIG_KEY, celery_k8s_executor_config\n\n\n
[docs]class CeleryK8sRunLauncher(RunLauncher, ConfigurableClass):\n """In contrast to the :py:class:`K8sRunLauncher`, which launches dagster runs as single K8s\n Jobs, this run launcher is intended for use in concert with\n :py:func:`dagster_celery_k8s.celery_k8s_job_executor`.\n\n With this run launcher, execution is delegated to:\n\n 1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\n submits steps to Celery queues for execution;\n 2. The step executions which are submitted to Celery queues are picked up by Celery workers,\n and each step execution spawns a step execution Kubernetes Job. See the implementation\n defined in :py:func:`dagster_celery_k8.executor.create_k8s_job_task`.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: CeleryK8sRunLauncher\n config:\n instance_config_map: "dagster-k8s-instance-config-map"\n dagster_home: "/some/path"\n postgres_password_secret: "dagster-k8s-pg-password"\n broker: "some_celery_broker_url"\n backend: "some_celery_backend_url"\n\n """\n\n def __init__(\n self,\n instance_config_map,\n dagster_home,\n postgres_password_secret,\n load_incluster_config=True,\n kubeconfig_file=None,\n broker=None,\n backend=None,\n include=None,\n config_source=None,\n retries=None,\n inst_data: Optional[ConfigurableClassData] = None,\n k8s_client_batch_api=None,\n env_config_maps=None,\n env_secrets=None,\n volume_mounts=None,\n volumes=None,\n service_account_name=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n labels=None,\n fail_pod_on_run_failure=None,\n job_namespace=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n 
kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self.postgres_password_secret = check.str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self.broker = check.opt_str_param(broker, "broker")\n self.backend = check.opt_str_param(backend, "backend")\n self.include = check.opt_list_param(include, "include")\n self.config_source = check.opt_dict_param(config_source, "config_source")\n\n retries = check.opt_dict_param(retries, "retries") or {"enabled": {}}\n self.retries = RetryMode.from_config(retries)\n\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n\n self._service_account_name = check.opt_str_param(\n service_account_name, "service_account_name"\n )\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._labels = check.opt_dict_param(labels, "labels", key_type=str, value_type=str)\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self.job_namespace = check.opt_str_param(job_namespace, "job_namespace", default="default")\n\n super().__init__()\n\n @classmethod\n def config_type(cls):\n from dagster_celery.executor import CELERY_CONFIG\n\n return 
merge_dicts(DagsterK8sJobConfig.config_type_run_launcher(), CELERY_CONFIG)\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n\n job_name = get_job_name_from_run_id(run.run_id)\n pod_name = job_name\n exc_config = _get_validated_celery_k8s_executor_config(run.run_config)\n\n job_image_from_executor_config = exc_config.get("job_image")\n\n job_origin = cast(JobPythonOrigin, context.job_code_origin)\n repository_origin = job_origin.repository_origin\n\n job_image = repository_origin.container_image\n\n if job_image:\n if job_image_from_executor_config:\n job_image = job_image_from_executor_config\n self._instance.report_engine_event(\n f"You have specified a job_image {job_image_from_executor_config} in your"\n f" executor configuration, but also {job_image} in your user-code"\n f" deployment. Using the job image {job_image_from_executor_config} from"\n " executor configuration as it takes precedence.",\n run,\n cls=self.__class__,\n )\n else:\n if not job_image_from_executor_config:\n raise DagsterInvariantViolationError(\n "You have not specified a job_image in your executor configuration. To resolve"\n " this error, specify the job_image configuration in the executor config"\n " section in your run config. \\nNote: You may also be seeing this error because"\n " you are using the configured API. 
Using configured with the celery-k8s"\n " executor is not supported at this time, and the job_image must be configured"\n " at the top-level executor config without using configured."\n )\n\n job_image = job_image_from_executor_config\n\n job_config = self.get_k8s_job_config(job_image, exc_config)\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_config.job_image},\n )\n\n user_defined_k8s_config = get_user_defined_k8s_config(run.tags)\n\n from dagster._cli.api import ExecuteRunArgs\n\n run_args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config,\n args=run_args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[{"name": "DAGSTER_RUN_JOB_NAME", "value": job_origin.job_name}],\n )\n\n job_namespace = exc_config.get("job_namespace", self.job_namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.batch_api.create_namespaced_job(body=job, namespace=job_namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": job_namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n def get_k8s_job_config(self, job_image, exc_config):\n return DagsterK8sJobConfig(\n 
dagster_home=self.dagster_home,\n instance_config_map=self.instance_config_map,\n postgres_password_secret=self.postgres_password_secret,\n job_image=check.opt_str_param(job_image, "job_image"),\n image_pull_policy=exc_config.get("image_pull_policy", self._image_pull_policy),\n image_pull_secrets=exc_config.get("image_pull_secrets", []) + self._image_pull_secrets,\n service_account_name=exc_config.get("service_account_name", self._service_account_name),\n env_config_maps=exc_config.get("env_config_maps", []) + self._env_config_maps,\n env_secrets=exc_config.get("env_secrets", []) + self._env_secrets,\n volume_mounts=exc_config.get("volume_mounts", []) + self._volume_mounts,\n volumes=exc_config.get("volumes", []) + self._volumes,\n labels=merge_dicts(self._labels, exc_config.get("labels", {})),\n )\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n\n run = self._instance.get_run_by_id(run_id)\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n job_name = get_job_name_from_run_id(run_id)\n\n job_namespace = self.get_namespace_from_run_config(run_id)\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=job_namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Dagster Job was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; delete_job returned {}"\n .format(termination_result)\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message=(\n "Dagster Job was not terminated successfully; encountered error in delete_job"\n ),\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n def get_namespace_from_run_config(self, run_id):\n 
check.str_param(run_id, "run_id")\n\n dagster_run = self._instance.get_run_by_id(run_id)\n run_config = dagster_run.run_config\n executor_config = _get_validated_celery_k8s_executor_config(run_config)\n return executor_config.get("job_namespace", self.job_namespace)\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n job_namespace = _get_validated_celery_k8s_executor_config(run.run_config).get(\n "job_namespace", self.job_namespace\n )\n job_name = get_job_name_from_run_id(run.run_id)\n try:\n status = self._api_client.get_job_status(namespace=job_namespace, job_name=job_name)\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n\n\ndef _get_validated_celery_k8s_executor_config(run_config):\n check.dict_param(run_config, "run_config")\n\n executor_config = run_config.get("execution", {})\n execution_config_schema = resolve_to_config_type(celery_k8s_executor_config())\n\n # In run config on jobs, we don't have an executor key\n if CELERY_K8S_CONFIG_KEY not in executor_config:\n execution_run_config = executor_config.get("config", {})\n else:\n execution_run_config = (run_config["execution"][CELERY_K8S_CONFIG_KEY] or {}).get(\n "config", {}\n )\n\n res = process_config(execution_config_schema, execution_run_config)\n\n check.invariant(\n res.success,\n "Incorrect execution schema provided. Note: You may also be seeing this error "\n "because you are using the configured API. "\n "Using configured with the {config_key} executor is not supported at this time, "\n "and all executor config must be directly in the run config without using configured."\n .format(\n config_key=CELERY_K8S_CONFIG_KEY,\n ),\n )\n\n return res.value\n
", "current_page_name": "_modules/dagster_celery_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_celery_k8s.launcher"}}, "dagster_census": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.ops

\nfrom dagster import Array, Bool, Field, In, Noneable, Nothing, Out, Output, op\n\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import CensusOutput\nfrom .utils import generate_materialization\n\n\n
[docs]@op(\n required_resource_keys={"census"},\n ins={"start_after": In(Nothing)},\n out=Out(\n CensusOutput,\n description=(\n "Parsed json dictionary representing the details of the Census sync after "\n "the sync successfully completes."\n ),\n ),\n config_schema={\n "sync_id": Field(\n int,\n is_required=True,\n description="Id of the parent sync.",\n ),\n "force_full_sync": Field(\n config=Bool,\n default_value=False,\n description=(\n "If this trigger request should be a Full Sync. "\n "Note that some sync configurations such as Append do not support full syncs."\n ),\n ),\n "poll_interval": Field(\n float,\n default_value=DEFAULT_POLL_INTERVAL,\n description="The time (in seconds) to wait between successive polls.",\n ),\n "poll_timeout": Field(\n Noneable(float),\n default_value=None,\n description=(\n "The maximum time to wait before this operation is timed out. By "\n "default, this will never time out."\n ),\n ),\n "yield_materializations": Field(\n config=Bool,\n default_value=True,\n description=(\n "If True, materializations corresponding to the results of the Census sync will "\n "be yielded when the op executes."\n ),\n ),\n "asset_key_prefix": Field(\n config=Array(str),\n default_value=["census"],\n description=(\n "If provided and yield_materializations is True, these components will be used to "\n "prefix the generated asset keys."\n ),\n ),\n },\n tags={"kind": "census"},\n)\ndef census_trigger_sync_op(context):\n """Executes a Census sync for a given ``sync_id`` and polls until that sync completes, raising\n an error if it is unsuccessful.\n\n It outputs a :py:class:`~dagster_census.CensusOutput` which contains the details of the Census\n sync after it successfully completes.\n\n It requires the use of the :py:class:`~dagster_census.census_resource`, which allows it to\n communicate with the Census API.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_census import census_resource, census_sync_op\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n sync_foobar = census_sync_op.configured({"sync_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"census": my_census_resource})\n def my_simple_census_job():\n sync_foobar()\n\n """\n census_output = context.resources.census.trigger_sync_and_poll(\n sync_id=context.op_config["sync_id"],\n force_full_sync=context.op_config["force_full_sync"],\n poll_interval=context.op_config["poll_interval"],\n poll_timeout=context.op_config["poll_timeout"],\n )\n if context.op_config["yield_materializations"]:\n yield generate_materialization(\n census_output, asset_key_prefix=context.op_config["asset_key_prefix"]\n )\n yield Output(census_output)
\n
", "current_page_name": "_modules/dagster_census/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional\n\nimport requests\nfrom dagster import Failure, Field, StringSource, __version__, get_dagster_logger, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom .types import CensusOutput\n\nCENSUS_API_BASE = "app.getcensus.com/api"\nCENSUS_VERSION = "v1"\n\nDEFAULT_POLL_INTERVAL = 10\n\nSYNC_RUN_STATUSES = {"completed", "failed", "queued", "skipped", "working"}\n\n\n
[docs]class CensusResource:\n """This class exposes methods on top of the Census REST API."""\n\n def __init__(\n self,\n api_key: str,\n request_max_retries: int = 3,\n request_retry_delay: float = 0.25,\n log: logging.Logger = get_dagster_logger(),\n ):\n self.api_key = api_key\n\n self._request_max_retries = request_max_retries\n self._request_retry_delay = request_retry_delay\n\n self._log = log\n\n @property\n def _api_key(self):\n if self.api_key.startswith("secret-token:"):\n return self.api_key\n return "secret-token:" + self.api_key\n\n @property\n def api_base_url(self) -> str:\n return f"https://{CENSUS_API_BASE}/{CENSUS_VERSION}"\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Census API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").\n endpoint (str): The Census API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n url = f"{self.api_base_url}/{endpoint}"\n headers = {\n "User-Agent": f"dagster-census/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=HTTPBasicAuth("bearer", self._api_key),\n data=data,\n )\n response.raise_for_status()\n return response.json()\n except RequestException as e:\n self._log.error("Request to Census API failed: %s", e)\n if num_retries == self._request_max_retries:\n break\n num_retries += 1\n time.sleep(self._request_retry_delay)\n\n raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n def get_sync(self, sync_id: int) -> Mapping[str, Any]:\n """Gets details about a given sync from the Census API.\n\n Args:\n sync_id (int): The Census 
Sync ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"syncs/{sync_id}")\n\n def get_source(self, source_id: int) -> Mapping[str, Any]:\n """Gets details about a given source from the Census API.\n\n Args:\n source_id (int): The Census Source ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sources/{source_id}")\n\n def get_destination(self, destination_id: int) -> Mapping[str, Any]:\n """Gets details about a given destination from the Census API.\n\n Args:\n destination_id (int): The Census Destination ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"destinations/{destination_id}")\n\n def get_sync_run(self, sync_run_id: int) -> Mapping[str, Any]:\n """Gets details about a specific sync run from the Census API.\n\n Args:\n sync_run_id (int): The Census Sync Run ID.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n return self.make_request(method="GET", endpoint=f"sync_runs/{sync_run_id}")\n\n def poll_sync_run(\n self,\n sync_run_id: int,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Census sync run, poll until the run is complete.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n log_url = f"https://app.getcensus.com/syncs_runs/{sync_run_id}"\n poll_start = datetime.datetime.now()\n\n while True:\n time.sleep(poll_interval)\n response_dict = self.get_sync_run(sync_run_id)\n if "data" not in response_dict.keys():\n raise ValueError(\n f"Getting status of sync failed, please visit Census Logs at {log_url} to see"\n " more."\n )\n\n sync_status = response_dict["data"]["status"]\n sync_id = response_dict["data"]["sync_id"]\n\n if sync_status not in SYNC_RUN_STATUSES:\n raise ValueError(\n f"Unexpected response status '{sync_status}'; "\n f"must be one of {','.join(sorted(SYNC_RUN_STATUSES))}. "\n "See Management API docs for more information: "\n "https://docs.getcensus.com/basics/developers/api/sync-runs"\n )\n\n if sync_status in {"queued", "working"}:\n self._log.debug(\n f"Sync {sync_id} still running after {datetime.datetime.now() - poll_start}."\n )\n continue\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for sync '{sync_id}' timed out after"\n f" {datetime.datetime.now() - poll_start}."\n )\n\n break\n\n self._log.debug(\n f"Sync {sync_id} has finished running after {datetime.datetime.now() - poll_start}."\n )\n self._log.info(f"View sync details here: {log_url}.")\n\n return response_dict\n\n def trigger_sync(self, sync_id: int, force_full_sync: bool = False) -> Mapping[str, Any]:\n """Trigger an asynchronous run for a specific sync.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n\n Returns:\n Dict[str, Any]: JSON data from the response to this request\n """\n data = {"force_full_sync": force_full_sync}\n return self.make_request(\n method="POST", endpoint=f"syncs/{sync_id}/trigger", data=json.dumps(data)\n )\n\n def trigger_sync_and_poll(\n self,\n sync_id: 
int,\n force_full_sync: bool = False,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> CensusOutput:\n """Trigger a run for a specific sync and poll until it has completed.\n\n Args:\n sync_id (int): The Census Sync Run ID.\n force_full_sync (bool): If the Sync should perform a full sync\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~CensusOutput`:\n Object containing details about the sync run and the sync details\n """\n sync_details = self.get_sync(sync_id=sync_id)\n source_details = self.get_source(\n source_id=sync_details["data"]["source_attributes"]["connection_id"]\n )["data"]\n destination_details = self.get_destination(\n destination_id=sync_details["data"]["destination_attributes"]["connection_id"]\n )["data"]\n\n trigger_sync_resp = self.trigger_sync(sync_id=sync_id, force_full_sync=force_full_sync)\n sync_run_details = self.poll_sync_run(\n sync_run_id=trigger_sync_resp["data"]["sync_run_id"],\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )["data"]\n return CensusOutput(\n sync_run=sync_run_details,\n source=source_details,\n destination=destination_details,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n is_required=True,\n description="Census API Key.",\n ),\n "request_max_retries": Field(\n int,\n default_value=3,\n description=(\n "The maximum number of times requests to the Census API should be retried "\n "before failing."\n ),\n ),\n "request_retry_delay": Field(\n float,\n default_value=0.25,\n description="Time (in seconds) to wait between each request retry.",\n ),\n },\n description="This resource helps manage Census connectors",\n)\ndef census_resource(context) -> CensusResource:\n """This resource allows users to programatically interface with the Census REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_census import census_resource\n\n my_census_resource = census_resource.configured(\n {\n "api_key": {"env": "CENSUS_API_KEY"},\n }\n )\n\n @job(resource_defs={"census":my_census_resource})\n def my_census_job():\n ...\n\n """\n return CensusResource(\n api_key=context.resource_config["api_key"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n )
\n
", "current_page_name": "_modules/dagster_census/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_census.types

\nfrom typing import Any, Mapping, NamedTuple\n\n\n
[docs]class CensusOutput(\n NamedTuple(\n "_CensusOutput",\n [\n ("sync_run", Mapping[str, Any]),\n ("source", Mapping[str, Any]),\n ("destination", Mapping[str, Any]),\n ],\n )\n):\n """Contains recorded information about the state of a Census sync after a sync completes.\n\n Attributes:\n sync_run (Dict[str, Any]):\n The details of the specific sync run.\n source (Dict[str, Any]):\n Information about the source for the Census sync.\n destination (Dict[str, Any]):\n Information about the destination for the Census sync.\n """
\n
", "current_page_name": "_modules/dagster_census/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_census.types"}}, "dagster_dask": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dask.executor

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dask\nimport dask.distributed\nfrom dagster import (\n    Executor,\n    Field,\n    Permissive,\n    Selector,\n    StringSource,\n    _check as check,\n    _seven,\n    multiple_process_executor_requirements,\n)\nfrom dagster._core.definitions.executor_definition import executor\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, execute_plan\nfrom dagster._core.execution.context.system import PlanOrchestrationContext\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.retries import RetryMode\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._utils import iterate_with_context\n\n# Dask resource requirements are specified under this key\nDASK_RESOURCE_REQUIREMENTS_KEY = "dagster-dask/resource_requirements"\n\n\n
[docs]@executor(\n name="dask",\n requirements=multiple_process_executor_requirements(),\n config_schema={\n "cluster": Field(\n Selector(\n {\n "existing": Field(\n {"address": StringSource},\n description="Connect to an existing scheduler.",\n ),\n "local": Field(\n Permissive(), is_required=False, description="Local cluster configuration."\n ),\n "yarn": Field(\n Permissive(), is_required=False, description="YARN cluster configuration."\n ),\n "ssh": Field(\n Permissive(), is_required=False, description="SSH cluster configuration."\n ),\n "pbs": Field(\n Permissive(), is_required=False, description="PBS cluster configuration."\n ),\n "moab": Field(\n Permissive(), is_required=False, description="Moab cluster configuration."\n ),\n "sge": Field(\n Permissive(), is_required=False, description="SGE cluster configuration."\n ),\n "lsf": Field(\n Permissive(), is_required=False, description="LSF cluster configuration."\n ),\n "slurm": Field(\n Permissive(), is_required=False, description="SLURM cluster configuration."\n ),\n "oar": Field(\n Permissive(), is_required=False, description="OAR cluster configuration."\n ),\n "kube": Field(\n Permissive(),\n is_required=False,\n description="Kubernetes cluster configuration.",\n ),\n }\n )\n )\n },\n)\ndef dask_executor(init_context):\n """Dask-based executor.\n\n The 'cluster' can be one of the following:\n ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').\n\n If the Dask executor is used without providing executor-specific config, a local Dask cluster\n will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`\n with :py:class:`dask.distributed.LocalCluster() <dask:distributed.LocalCluster>`).\n\n The Dask executor optionally takes the following config:\n\n .. 
code-block:: none\n\n cluster:\n {\n local?: # takes distributed.LocalCluster parameters\n {\n timeout?: 5, # Timeout duration for initial connection to the scheduler\n n_workers?: 4 # Number of workers to start\n threads_per_worker?: 1 # Number of threads per each worker\n }\n }\n\n To use the `dask_executor`, set it as the `executor_def` when defining a job:\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dask import dask_executor\n\n @job(executor_def=dask_executor)\n def dask_enabled_job():\n pass\n\n """\n ((cluster_type, cluster_configuration),) = init_context.executor_config["cluster"].items()\n return DaskExecutor(cluster_type, cluster_configuration)
\n\n\ndef query_on_dask_worker(\n dependencies: Any,\n recon_job: ReconstructableJob,\n dagster_run: DagsterRun,\n run_config: Optional[Mapping[str, object]],\n step_keys: Optional[Sequence[str]],\n instance_ref: InstanceRef,\n known_state: Optional[KnownExecutionState],\n) -> Sequence[DagsterEvent]:\n """Note that we need to pass "dependencies" to ensure Dask sequences futures during task\n scheduling, even though we do not use this argument within the function.\n """\n with DagsterInstance.from_ref(instance_ref) as instance:\n subset_job = recon_job.get_subset(op_selection=dagster_run.resolved_op_selection)\n\n execution_plan = create_execution_plan(\n subset_job,\n run_config=run_config,\n step_keys_to_execute=step_keys,\n known_state=known_state,\n )\n\n return execute_plan(\n execution_plan, subset_job, instance, dagster_run, run_config=run_config\n )\n\n\ndef get_dask_resource_requirements(tags: Mapping[str, str]):\n check.mapping_param(tags, "tags", key_type=str, value_type=str)\n req_str = tags.get(DASK_RESOURCE_REQUIREMENTS_KEY)\n if req_str is not None:\n return _seven.json.loads(req_str)\n\n return {}\n\n\nclass DaskExecutor(Executor):\n def __init__(self, cluster_type, cluster_configuration):\n self.cluster_type = check.opt_str_param(cluster_type, "cluster_type", default="local")\n self.cluster_configuration = check.opt_dict_param(\n cluster_configuration, "cluster_configuration"\n )\n\n @property\n def retries(self):\n return RetryMode.DISABLED\n\n def execute(self, plan_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):\n check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)\n check.inst_param(execution_plan, "execution_plan", ExecutionPlan)\n check.param_invariant(\n isinstance(plan_context.executor, DaskExecutor),\n "plan_context",\n f"Expected executor to be DaskExecutor got {plan_context.executor}",\n )\n\n check.invariant(\n plan_context.instance.is_persistent,\n "Dask execution requires a persistent 
DagsterInstance",\n )\n\n step_levels = execution_plan.get_steps_to_execute_by_level()\n\n job_name = plan_context.job_name\n\n instance = plan_context.instance\n\n cluster_type = self.cluster_type\n if cluster_type == "existing":\n # address passed directly to Client() below to connect to existing Scheduler\n cluster = self.cluster_configuration["address"]\n elif cluster_type == "local":\n from dask.distributed import LocalCluster\n\n cluster = LocalCluster(**self.build_dict(job_name))\n elif cluster_type == "yarn":\n from dask_yarn import YarnCluster\n\n cluster = YarnCluster(**self.build_dict(job_name))\n elif cluster_type == "ssh":\n from dask.distributed import SSHCluster\n\n cluster = SSHCluster(**self.build_dict(job_name))\n elif cluster_type == "pbs":\n from dask_jobqueue import PBSCluster\n\n cluster = PBSCluster(**self.build_dict(job_name))\n elif cluster_type == "moab":\n from dask_jobqueue import MoabCluster\n\n cluster = MoabCluster(**self.build_dict(job_name))\n elif cluster_type == "sge":\n from dask_jobqueue import SGECluster\n\n cluster = SGECluster(**self.build_dict(job_name))\n elif cluster_type == "lsf":\n from dask_jobqueue import LSFCluster\n\n cluster = LSFCluster(**self.build_dict(job_name))\n elif cluster_type == "slurm":\n from dask_jobqueue import SLURMCluster\n\n cluster = SLURMCluster(**self.build_dict(job_name))\n elif cluster_type == "oar":\n from dask_jobqueue import OARCluster\n\n cluster = OARCluster(**self.build_dict(job_name))\n elif cluster_type == "kube":\n from dask_kubernetes import KubeCluster\n\n cluster = KubeCluster(**self.build_dict(job_name))\n else:\n raise ValueError(\n "Must be providing one of the following ('existing', 'local', 'yarn', 'ssh',"\n f" 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"\n )\n\n with dask.distributed.Client(cluster) as client:\n execution_futures = []\n execution_futures_dict = {}\n\n for step_level in step_levels:\n for step in step_level:\n # We ensure correctness 
in sequencing by letting Dask schedule futures and\n # awaiting dependencies within each step.\n dependencies = []\n for step_input in step.step_inputs:\n for key in step_input.dependency_keys:\n dependencies.append(execution_futures_dict[key])\n\n run_config = plan_context.run_config\n\n dask_task_name = "%s.%s" % (job_name, step.key)\n\n recon_job = plan_context.reconstructable_job\n\n future = client.submit(\n query_on_dask_worker,\n dependencies,\n recon_job,\n plan_context.dagster_run,\n run_config,\n [step.key],\n instance.get_ref(),\n execution_plan.known_state,\n key=dask_task_name,\n resources=get_dask_resource_requirements(step.tags),\n )\n\n execution_futures.append(future)\n execution_futures_dict[step.key] = future\n\n # This tells Dask to awaits the step executions and retrieve their results to the\n # master\n futures = dask.distributed.as_completed(execution_futures, with_results=True)\n\n # Allow interrupts while waiting for the results from Dask\n for future, result in iterate_with_context(raise_execution_interrupts, futures):\n for step_event in result:\n check.inst(step_event, DagsterEvent)\n yield step_event\n\n def build_dict(self, job_name):\n """Returns a dict we can use for kwargs passed to dask client instantiation.\n\n Intended to be used like:\n\n with dask.distributed.Client(**cfg.build_dict()) as client:\n << use client here >>\n\n """\n if self.cluster_type in ["yarn", "pbs", "moab", "sge", "lsf", "slurm", "oar", "kube"]:\n dask_cfg = {"name": job_name}\n else:\n dask_cfg = {}\n\n if self.cluster_configuration:\n for k, v in self.cluster_configuration.items():\n dask_cfg[k] = v\n\n # if address is set, don't add LocalCluster args\n # context: https://github.com/dask/distributed/issues/3313\n if (self.cluster_type == "local") and ("address" not in dask_cfg):\n # We set threads_per_worker because Dagster is not thread-safe. 
Even though\n # environments=True by default, there is a clever piece of machinery\n # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution\n # multithreaded by default when the number of available cores is greater than 4.\n # See: https://github.com/dagster-io/dagster/issues/2181\n # We may want to try to figure out a way to enforce this on remote Dask clusters against\n # which users run Dagster workloads.\n dask_cfg["threads_per_worker"] = 1\n\n return dask_cfg\n
", "current_page_name": "_modules/dagster_dask/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dask.executor"}}, "dagster_databricks": {"databricks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks

\nimport base64\nimport logging\nimport time\nfrom typing import IO, Any, Mapping, Optional, Tuple, Union, cast\n\nimport dagster\nimport dagster._check as check\nimport dagster_pyspark\nimport databricks_api\nimport databricks_cli.sdk\nimport requests.exceptions\nfrom dagster._annotations import deprecated, public\nfrom databricks.sdk import WorkspaceClient\nfrom databricks.sdk.service import compute, jobs\nfrom typing_extensions import Final\n\nimport dagster_databricks\n\nfrom .types import (\n    DatabricksRunState,\n)\nfrom .version import __version__\n\n# wait at most 24 hours by default for run execution\nDEFAULT_RUN_MAX_WAIT_TIME_SEC: Final = 24 * 60 * 60\n\n\n
[docs]class DatabricksError(Exception):\n pass
\n\n\n
[docs]class DatabricksClient:\n """A thin wrapper over the Databricks REST API."""\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n workspace_id: Optional[str] = None,\n ):\n self.host = host\n self.workspace_id = workspace_id\n\n self._workspace_client = WorkspaceClient(\n host=host,\n token=token,\n client_id=oauth_client_id,\n client_secret=oauth_client_secret,\n product="dagster-databricks",\n product_version=__version__,\n )\n\n # TODO: This is the old shim client that we were previously using. Arguably this is\n # confusing for users to use since this is an unofficial wrapper around the documented\n # Databricks REST API. We should consider removing this in the next minor release.\n if token:\n self._client = databricks_api.DatabricksAPI(host=host, token=token)\n self.__setup_user_agent(self._client.client)\n # TODO: This is the old `databricks_cli` client that was previously recommended by Databricks.\n # It is no longer supported and should be removed in favour of `databricks-sdk` in the next\n # minor release.\n self._api_client = databricks_cli.sdk.ApiClient(host=host, token=token)\n self.__setup_user_agent(self._api_client)\n else:\n self._client = None\n self._api_client = None\n\n def __setup_user_agent(\n self,\n client: Union[WorkspaceClient, databricks_api.DatabricksAPI, databricks_cli.sdk.ApiClient],\n ) -> None:\n """Overrides the user agent for the Databricks API client."""\n client.default_headers["user-agent"] = f"dagster-databricks/{__version__}"\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def client(self) -> databricks_api.DatabricksAPI:\n """Retrieve the legacy Databricks API client. 
Note: accessing this property will throw an exception if oauth\n credentials are used to initialize the DatabricksClient, because oauth credentials are not supported by the\n legacy Databricks API client.\n """\n if self._client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-api` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._client\n\n @client.setter\n def client(self, value: Optional[databricks_api.DatabricksAPI]) -> None:\n self._client = value\n\n @deprecated(\n breaking_version="0.21.0", additional_warn_text="Use `workspace_client` property instead."\n )\n @public\n @property\n def api_client(self) -> databricks_cli.sdk.ApiClient:\n """Retrieve a reference to the underlying Databricks API client. For more information,\n see the `Databricks Python API <https://docs.databricks.com/dev-tools/python-api.html>`_.\n Noe: accessing this property will throw an exception if oauth credentials are used to initialize the\n DatabricksClient, because oauth credentials are not supported by the legacy Databricks API client.\n **Examples:**.\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks_cli.jobs.api import JobsApi\n from databricks_cli.runs.api import RunsApi\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n jobs_client = JobsApi(context.resources.databricks_client.api_client)\n runs_client = RunsApi(context.resources.databricks_client.api_client)\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n jobs_client.run_now(...)\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n runs_client.submit_run(...)\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n runs_client.get_run(...)\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n runs_client.cancel_run(...)\n client.jobs.cancel_run(...)\n\n Returns:\n ApiClient: The authenticated Databricks API client.\n """\n if self._api_client is None:\n raise ValueError(\n "Legacy Databricks API client from `databricks-cli` was not initialized because"\n " oauth credentials were used instead of an access token. This legacy Databricks"\n " API client is not supported when using oauth credentials. Use the"\n " `workspace_client` property instead."\n )\n return self._api_client\n\n @public\n @property\n def workspace_client(self) -> WorkspaceClient:\n """Retrieve a reference to the underlying Databricks Workspace client. For more information,\n see the `Databricks SDK for Python <https://docs.databricks.com/dev-tools/sdk-python.html>`_.\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import op\n from databricks.sdk import WorkspaceClient\n\n @op(required_resource_keys={"databricks_client"})\n def op1(context):\n # Initialize the Databricks Jobs API\n client = context.resources.databricks_client.api_client\n\n # Example 1: Run a Databricks job with some parameters.\n client.jobs.run_now(...)\n\n # Example 2: Trigger a one-time run of a Databricks workload.\n client.jobs.submit(...)\n\n # Example 3: Get an existing run.\n client.jobs.get_run(...)\n\n # Example 4: Cancel a run.\n client.jobs.cancel_run(...)\n\n Returns:\n WorkspaceClient: The authenticated Databricks SDK Workspace Client.\n """\n return self._workspace_client\n\n def read_file(self, dbfs_path: str, block_size: int = 1024**2) -> bytes:\n """Read a file from DBFS to a **byte string**."""\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n data = b""\n bytes_read = 0\n dbfs_service = self.workspace_client.dbfs\n\n jdoc = dbfs_service.read(path=dbfs_path, length=block_size)\n data += base64.b64decode(jdoc.data)\n while jdoc.bytes_read == block_size:\n bytes_read += jdoc.bytes_read\n jdoc = dbfs_service.read(path=dbfs_path, offset=bytes_read, length=block_size)\n data += base64.b64decode(jdoc.data)\n\n return data\n\n def put_file(\n self, file_obj: IO, dbfs_path: str, overwrite: bool = False, block_size: int = 1024**2\n ) -> None:\n """Upload an arbitrary large file to DBFS.\n\n This doesn't use the DBFS `Put` API because that endpoint is limited to 1MB.\n """\n if dbfs_path.startswith("dbfs://"):\n dbfs_path = dbfs_path[7:]\n\n dbfs_service = self.workspace_client.dbfs\n\n create_response = dbfs_service.create(path=dbfs_path, overwrite=overwrite)\n handle = create_response.handle\n\n block = file_obj.read(block_size)\n while block:\n data = base64.b64encode(block).decode("utf-8")\n dbfs_service.add_block(data=data, handle=handle)\n block = file_obj.read(block_size)\n\n dbfs_service.close(handle=handle)\n\n def get_run_state(self, 
databricks_run_id: int) -> "DatabricksRunState":\n """Get the state of a run by Databricks run ID.\n\n Return a `DatabricksRunState` object. Note that the `result_state`\n attribute may be `None` if the run hasn't yet terminated.\n """\n run = self.workspace_client.jobs.get_run(databricks_run_id)\n return DatabricksRunState.from_databricks(run.state)\n\n def poll_run_state(\n self,\n logger: logging.Logger,\n start_poll_time: float,\n databricks_run_id: int,\n max_wait_time_sec: float,\n verbose_logs: bool = True,\n ) -> bool:\n run_state = self.get_run_state(databricks_run_id)\n\n if run_state.has_terminated():\n if run_state.is_successful():\n logger.info(f"Run `{databricks_run_id}` completed successfully.")\n return True\n if run_state.is_skipped():\n logger.info(f"Run `{databricks_run_id}` was skipped.")\n return True\n else:\n error_message = (\n f"Run `{databricks_run_id}` failed with result state:"\n f" `{run_state.result_state}`. Message: {run_state.state_message}."\n )\n logger.error(error_message)\n raise DatabricksError(error_message)\n else:\n if verbose_logs:\n logger.debug(f"Run `{databricks_run_id}` in state {run_state}.")\n if time.time() - start_poll_time > max_wait_time_sec:\n raise DatabricksError(\n f"Run `{databricks_run_id}` took more than {max_wait_time_sec}s to complete."\n " Failing the run."\n )\n return False\n\n def wait_for_run_to_complete(\n self,\n logger: logging.Logger,\n databricks_run_id: int,\n poll_interval_sec: float,\n max_wait_time_sec: int,\n verbose_logs: bool = True,\n ) -> None:\n logger.info(f"Waiting for Databricks run `{databricks_run_id}` to complete...")\n\n start_poll_time = time.time()\n while True:\n if self.poll_run_state(\n logger=logger,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=max_wait_time_sec,\n verbose_logs=verbose_logs,\n ):\n return\n\n time.sleep(poll_interval_sec)
\n\n\nclass DatabricksJobRunner:\n """Submits jobs created using Dagster config to Databricks, and monitors their progress.\n\n Attributes:\n host (str): Databricks host, e.g. https://uksouth.azuredatabricks.net.\n token (str): Databricks authentication token.\n poll_interval_sec (float): How often to poll Databricks for run status.\n max_wait_time_sec (int): How long to wait for a run to complete before failing.\n """\n\n def __init__(\n self,\n host: str,\n token: Optional[str] = None,\n oauth_client_id: Optional[str] = None,\n oauth_client_secret: Optional[str] = None,\n poll_interval_sec: float = 5,\n max_wait_time_sec: int = DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n ):\n self.host = check.str_param(host, "host")\n check.invariant(\n token is None or (oauth_client_id is None and oauth_client_secret is None),\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.token = check.opt_str_param(token, "token")\n self.oauth_client_id = check.opt_str_param(oauth_client_id, "oauth_client_id")\n self.oauth_client_secret = check.opt_str_param(oauth_client_secret, "oauth_client_secret")\n self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")\n self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")\n\n self._client: DatabricksClient = DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=oauth_client_id,\n oauth_client_secret=oauth_client_secret,\n )\n\n @property\n def client(self) -> DatabricksClient:\n """Return the underlying `DatabricksClient` object."""\n return self._client\n\n def submit_run(self, run_config: Mapping[str, Any], task: Mapping[str, Any]) -> int:\n """Submit a new run using the 'Runs submit' API."""\n existing_cluster_id = run_config["cluster"].get("existing")\n\n new_cluster = run_config["cluster"].get("new")\n\n # The Databricks API needs different keys to be present in API calls depending\n # on new/existing cluster, so we need to process 
the new_cluster\n # config first.\n if new_cluster:\n new_cluster = new_cluster.copy()\n\n nodes = new_cluster.pop("nodes")\n if "instance_pool_id" in nodes:\n new_cluster["instance_pool_id"] = nodes["instance_pool_id"]\n else:\n node_types = nodes["node_types"]\n new_cluster["node_type_id"] = node_types["node_type_id"]\n if "driver_node_type_id" in node_types:\n new_cluster["driver_node_type_id"] = node_types["driver_node_type_id"]\n\n cluster_size = new_cluster.pop("size")\n if "num_workers" in cluster_size:\n new_cluster["num_workers"] = cluster_size["num_workers"]\n else:\n new_cluster["autoscale"] = cluster_size["autoscale"]\n\n tags = new_cluster.get("custom_tags", {})\n if isinstance(tags, list):\n tags = {x["key"]: x["value"] for x in tags}\n tags["__dagster_version"] = dagster.__version__\n new_cluster["custom_tags"] = tags\n\n check.invariant(\n existing_cluster_id is not None or new_cluster is not None,\n "Invalid value for run_config.cluster",\n )\n\n # We'll always need some libraries, namely dagster/dagster_databricks/dagster_pyspark,\n # since they're imported by our scripts.\n # Add them if they're not already added by users in config.\n libraries = list(run_config.get("libraries", []))\n install_default_libraries = run_config.get("install_default_libraries", True)\n if install_default_libraries:\n python_libraries = {\n x["pypi"]["package"].split("==")[0].replace("_", "-")\n for x in libraries\n if "pypi" in x\n }\n\n for library_name, library in [\n ("dagster", dagster),\n ("dagster-databricks", dagster_databricks),\n ("dagster-pyspark", dagster_pyspark),\n ]:\n if library_name not in python_libraries:\n libraries.append(\n {"pypi": {"package": f"{library_name}=={library.__version__}"}}\n )\n\n # Only one task should be able to be chosen really; make sure of that here.\n check.invariant(\n sum(\n task.get(key) is not None\n for key in [\n "notebook_task",\n "spark_python_task",\n "spark_jar_task",\n "spark_submit_task",\n ]\n )\n == 1,\n "Multiple 
tasks specified in Databricks run",\n )\n\n return self.client.workspace_client.jobs.submit(\n run_name=run_config.get("run_name"),\n tasks=[\n jobs.SubmitTask.from_dict(\n {\n "new_cluster": new_cluster,\n "existing_cluster_id": existing_cluster_id,\n # "libraries": [compute.Library.from_dict(lib) for lib in libraries],\n "libraries": libraries,\n **task,\n "task_key": "dagster-task",\n },\n )\n ],\n ).bind()["run_id"]\n\n def retrieve_logs_for_run_id(\n self, log: logging.Logger, databricks_run_id: int\n ) -> Optional[Tuple[Optional[str], Optional[str]]]:\n """Retrieve the stdout and stderr logs for a run."""\n run = self.client.workspace_client.jobs.get_run(databricks_run_id)\n\n # Run.cluster_instance can be None. In that case, fall back to cluster instance on first\n # task. Currently pyspark step launcher runs jobs with singleton tasks.\n cluster_instance = run.cluster_instance or run.tasks[0].cluster_instance\n cluster_id = check.inst(\n cluster_instance.cluster_id,\n str,\n "cluster_id should be string like `1234-123456-abcdefgh` got:"\n f" `{cluster_instance.cluster_id}`",\n )\n cluster = self.client.workspace_client.clusters.get(cluster_id)\n log_config = cluster.cluster_log_conf\n if log_config is None:\n log.warn(\n f"Logs not configured for cluster {cluster_id} used for run {databricks_run_id}"\n )\n return None\n if cast(Optional[compute.S3StorageInfo], log_config.s3) is not None:\n logs_prefix = log_config.s3.destination\n log.warn("Retrieving S3 logs not yet implemented")\n return None\n elif cast(Optional[compute.DbfsStorageInfo], log_config.dbfs) is not None:\n logs_prefix = log_config.dbfs.destination\n stdout = self.wait_for_dbfs_logs(log, logs_prefix, cluster_id, "stdout")\n stderr = self.wait_for_dbfs_logs(log, logs_prefix, cluster_id, "stderr")\n return stdout, stderr\n\n def wait_for_dbfs_logs(\n self,\n log: logging.Logger,\n prefix: str,\n cluster_id: str,\n filename: str,\n waiter_delay: int = 10,\n waiter_max_attempts: int = 10,\n ) -> 
Optional[str]:\n """Attempt up to `waiter_max_attempts` attempts to get logs from DBFS."""\n path = "/".join([prefix, cluster_id, "driver", filename])\n log.info(f"Retrieving logs from {path}")\n num_attempts = 0\n while num_attempts <= waiter_max_attempts:\n try:\n logs = self.client.read_file(path)\n return logs.decode("utf-8")\n except requests.exceptions.HTTPError:\n num_attempts += 1\n time.sleep(waiter_delay)\n log.warn("Could not retrieve cluster logs!")\n
", "current_page_name": "_modules/dagster_databricks/databricks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks"}, "databricks_pyspark_step_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.databricks_pyspark_step_launcher

\nimport gzip\nimport io\nimport os.path\nimport pickle\nimport sys\nimport tempfile\nimport time\nimport zlib\nfrom typing import Any, Dict, Iterator, Mapping, Optional, Sequence, cast\n\nfrom dagster import (\n    Bool,\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.definitions.step_launcher import StepLauncher, StepRunRef\nfrom dagster._core.errors import raise_execution_interrupts\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.external_step import (\n    PICKLED_EVENTS_FILE_NAME,\n    PICKLED_STEP_RUN_REF_FILE_NAME,\n    step_context_to_step_run_ref,\n)\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._serdes import deserialize_value\nfrom dagster._utils.backoff import backoff\nfrom dagster_pyspark.utils import build_pyspark_zip\nfrom databricks.sdk.core import DatabricksError\nfrom databricks.sdk.service import jobs\n\nfrom dagster_databricks import databricks_step_main\nfrom dagster_databricks.databricks import (\n    DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n    DatabricksJobRunner,\n)\n\nfrom .configs import (\n    define_databricks_env_variables,\n    define_databricks_permissions,\n    define_databricks_secrets_config,\n    define_databricks_storage_config,\n    define_databricks_submit_run_config,\n    define_oauth_credentials,\n)\n\nCODE_ZIP_NAME = "code.zip"\nPICKLED_CONFIG_FILE_NAME = "config.pkl"\nDAGSTER_SYSTEM_ENV_VARS = {\n    "DAGSTER_CLOUD_DEPLOYMENT_NAME",\n    "DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT",\n    "DAGSTER_CLOUD_GIT_SHA",\n    "DAGSTER_CLOUD_GIT_TIMESTAMP",\n    "DAGSTER_CLOUD_GIT_AUTHOR_EMAIL",\n    "DAGSTER_CLOUD_GIT_AUTHOR_NAME",\n    
"DAGSTER_CLOUD_GIT_MESSAGE",\n    "DAGSTER_CLOUD_GIT_BRANCH",\n    "DAGSTER_CLOUD_GIT_REPO",\n    "DAGSTER_CLOUD_PULL_REQUEST_ID",\n    "DAGSTER_CLOUD_PULL_REQUEST_STATUS",\n}\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n {\n "run_config": define_databricks_submit_run_config(),\n "permissions": define_databricks_permissions(),\n "databricks_host": Field(\n StringSource,\n is_required=True,\n description="Databricks host, e.g. uksouth.azuredatabricks.com",\n ),\n "databricks_token": Field(\n Noneable(StringSource),\n default_value=None,\n description="Databricks access token",\n ),\n "oauth_credentials": define_oauth_credentials(),\n "env_variables": define_databricks_env_variables(),\n "secrets_to_env_variables": define_databricks_secrets_config(),\n "storage": define_databricks_storage_config(),\n "local_pipeline_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "local_dagster_job_package_path": Field(\n StringSource,\n is_required=False,\n description=(\n "Absolute path to root python package containing your Dagster code. If you set this"\n " value to a directory lower than the root package, and have user relative imports"\n " in your code (e.g. `from .foo import bar`), it's likely you'll encounter an"\n " import error on the remote step. Before every step run, the launcher will zip up"\n " the code in this local path, upload it to DBFS, and unzip it into the Python path"\n " of the remote Spark process. 
This gives the remote process access to up-to-date"\n " user code."\n ),\n ),\n "staging_prefix": Field(\n StringSource,\n is_required=False,\n default_value="/dagster_staging",\n description="Directory in DBFS to use for uploaded job code. Must be absolute.",\n ),\n "wait_for_logs": Field(\n Bool,\n is_required=False,\n default_value=False,\n description=(\n "If set, and if the specified cluster is configured to export logs, the system will"\n " wait after job completion for the logs to appear in the configured location. Note"\n " that logs are copied every 5 minutes, so enabling this will add several minutes"\n " to the job runtime. NOTE: this integration will export stdout/stderrfrom the"\n " remote Databricks process automatically, so this option is not generally"\n " necessary."\n ),\n ),\n "max_completion_wait_time_seconds": Field(\n IntSource,\n is_required=False,\n default_value=DEFAULT_RUN_MAX_WAIT_TIME_SEC,\n description=(\n "If the Databricks job run takes more than this many seconds, then "\n "consider it failed and terminate the step."\n ),\n ),\n "poll_interval_sec": Field(\n float,\n is_required=False,\n default_value=5.0,\n description=(\n "How frequently Dagster will poll Databricks to determine the state of the job."\n ),\n ),\n "verbose_logs": Field(\n bool,\n default_value=True,\n description=(\n "Determines whether to display debug logs emitted while job is being polled. It can"\n " be helpful for Dagster UI performance to set to False when running long-running"\n " or fan-out Databricks jobs, to avoid forcing the UI to fetch large amounts of"\n " debug logs."\n ),\n ),\n "add_dagster_env_variables": Field(\n bool,\n default_value=True,\n description=(\n "Automatically add Dagster system environment variables. This option is only"\n " applicable when the code being executed is deployed on Dagster Cloud. 
It will be"\n " ignored when the environment variables provided by Dagster Cloud are not present."\n ),\n ),\n }\n)\ndef databricks_pyspark_step_launcher(\n context: InitResourceContext,\n) -> "DatabricksPySparkStepLauncher":\n """Resource for running ops as a Databricks Job.\n\n When this resource is used, the op will be executed in Databricks using the 'Run Submit'\n API. Pipeline code will be zipped up and copied to a directory in DBFS along with the op's\n execution context.\n\n Use the 'run_config' configuration to specify the details of the Databricks cluster used, and\n the 'storage' key to configure persistent storage on that cluster. Storage is accessed by\n setting the credentials in the Spark context, as documented `here for S3`_ and `here for ADLS`_.\n\n .. _`here for S3`: https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context\n .. _`here for ADLS`: https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n """\n return DatabricksPySparkStepLauncher(**context.resource_config)
\n\n\nclass DatabricksPySparkStepLauncher(StepLauncher):\n def __init__(\n self,\n run_config: Mapping[str, Any],\n permissions: Mapping[str, Any],\n databricks_host: str,\n secrets_to_env_variables: Sequence[Mapping[str, Any]],\n staging_prefix: str,\n wait_for_logs: bool,\n max_completion_wait_time_seconds: int,\n databricks_token: Optional[str] = None,\n oauth_credentials: Optional[Mapping[str, str]] = None,\n env_variables: Optional[Mapping[str, str]] = None,\n storage: Optional[Mapping[str, Any]] = None,\n poll_interval_sec: int = 5,\n local_pipeline_package_path: Optional[str] = None,\n local_dagster_job_package_path: Optional[str] = None,\n verbose_logs: bool = True,\n add_dagster_env_variables: bool = True,\n ):\n self.run_config = check.mapping_param(run_config, "run_config")\n self.permissions = check.mapping_param(permissions, "permissions")\n self.databricks_host = check.str_param(databricks_host, "databricks_host")\n\n check.invariant(\n databricks_token is not None or oauth_credentials is not None,\n "Must provide either databricks_token or oauth_credentials",\n )\n check.invariant(\n databricks_token is None or oauth_credentials is None,\n "Must provide either databricks_token or oauth_credentials, but cannot provide both",\n )\n self.databricks_token = check.opt_str_param(databricks_token, "databricks_token")\n oauth_credentials = check.opt_mapping_param(\n oauth_credentials,\n "oauth_credentials",\n key_type=str,\n value_type=str,\n )\n\n self.secrets = check.sequence_param(\n secrets_to_env_variables, "secrets_to_env_variables", dict\n )\n self.env_variables = check.opt_mapping_param(env_variables, "env_variables")\n self.storage = check.opt_mapping_param(storage, "storage")\n check.invariant(\n local_dagster_job_package_path is not None or local_pipeline_package_path is not None,\n "Missing config: need to provide either 'local_dagster_job_package_path' or"\n " 'local_pipeline_package_path' config entry",\n )\n check.invariant(\n 
local_dagster_job_package_path is None or local_pipeline_package_path is None,\n "Error in config: Provided both 'local_dagster_job_package_path' and"\n " 'local_pipeline_package_path' entries. Need to specify one or the other.",\n )\n self.local_dagster_job_package_path = check.str_param(\n local_pipeline_package_path or local_dagster_job_package_path,\n "local_dagster_job_package_path",\n )\n self.staging_prefix = check.str_param(staging_prefix, "staging_prefix")\n check.invariant(staging_prefix.startswith("/"), "staging_prefix must be an absolute path")\n self.wait_for_logs = check.bool_param(wait_for_logs, "wait_for_logs")\n\n self.databricks_runner = DatabricksJobRunner(\n host=databricks_host,\n token=databricks_token,\n oauth_client_id=oauth_credentials.get("client_id"),\n oauth_client_secret=oauth_credentials.get("client_secret"),\n poll_interval_sec=poll_interval_sec,\n max_wait_time_sec=max_completion_wait_time_seconds,\n )\n self.verbose_logs = check.bool_param(verbose_logs, "verbose_logs")\n self.add_dagster_env_variables = check.bool_param(\n add_dagster_env_variables, "add_dagster_env_variables"\n )\n\n def launch_step(self, step_context: StepExecutionContext) -> Iterator[DagsterEvent]:\n step_run_ref = step_context_to_step_run_ref(\n step_context, self.local_dagster_job_package_path\n )\n run_id = step_context.dagster_run.run_id\n log = step_context.log\n\n step_key = step_run_ref.step_key\n self._upload_artifacts(log, step_run_ref, run_id, step_key)\n\n task = self._get_databricks_task(run_id, step_key)\n databricks_run_id = self.databricks_runner.submit_run(self.run_config, task)\n\n if self.permissions:\n self._grant_permissions(log, databricks_run_id)\n\n try:\n # If this is being called within a `capture_interrupts` context, allow interrupts while\n # waiting for the execution to complete, so that we can terminate slow or hanging steps\n with raise_execution_interrupts():\n yield from self.step_events_iterator(step_context, step_key, 
databricks_run_id)\n except:\n # if executon is interrupted before the step is completed, cancel the run\n self.databricks_runner.client.workspace_client.jobs.cancel_run(databricks_run_id)\n raise\n finally:\n self.log_compute_logs(log, run_id, step_key)\n # this is somewhat obsolete\n if self.wait_for_logs:\n self._log_logs_from_cluster(log, databricks_run_id)\n\n def log_compute_logs(self, log: DagsterLogManager, run_id: str, step_key: str) -> None:\n try:\n stdout = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stdout")\n ).decode()\n log.info(f"Captured stdout for step {step_key}:")\n log.info(stdout)\n sys.stdout.write(stdout)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stdout logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n try:\n stderr = self.databricks_runner.client.read_file(\n self._dbfs_path(run_id, step_key, "stderr")\n ).decode()\n log.info(f"Captured stderr for step {step_key}:")\n log.info(stderr)\n sys.stderr.write(stderr)\n except Exception as e:\n log.error(\n f"Encountered exception {e} when attempting to load stderr logs for step"\n f" {step_key}. Check the databricks console for more info."\n )\n\n def step_events_iterator(\n self, step_context: StepExecutionContext, step_key: str, databricks_run_id: int\n ) -> Iterator[DagsterEvent]:\n """The launched Databricks job writes all event records to a specific dbfs file. This iterator\n regularly reads the contents of the file, adds any events that have not yet been seen to\n the instance, and yields any DagsterEvents.\n\n By doing this, we simulate having the remote Databricks process able to directly write to\n the local DagsterInstance. 
Importantly, this means that timestamps (and all other record\n properties) will be sourced from the Databricks process, rather than recording when this\n process happens to log them.\n """\n check.int_param(databricks_run_id, "databricks_run_id")\n processed_events = 0\n start_poll_time = time.time()\n done = False\n step_context.log.info("Waiting for Databricks run %s to complete..." % databricks_run_id)\n while not done:\n with raise_execution_interrupts():\n if self.verbose_logs:\n step_context.log.debug(\n "Waiting %.1f seconds...", self.databricks_runner.poll_interval_sec\n )\n time.sleep(self.databricks_runner.poll_interval_sec)\n try:\n done = self.databricks_runner.client.poll_run_state(\n logger=step_context.log,\n start_poll_time=start_poll_time,\n databricks_run_id=databricks_run_id,\n max_wait_time_sec=self.databricks_runner.max_wait_time_sec,\n verbose_logs=self.verbose_logs,\n )\n finally:\n all_events = self.get_step_events(\n step_context.run_id, step_key, step_context.previous_attempt_count\n )\n # we get all available records on each poll, but we only want to process the\n # ones we haven't seen before\n for event in all_events[processed_events:]:\n # write each event from the DataBricks instance to the local instance\n step_context.instance.handle_new_event(event)\n if event.is_dagster_event:\n yield event.get_dagster_event()\n processed_events = len(all_events)\n\n step_context.log.info(f"Databricks run {databricks_run_id} completed.")\n\n def get_step_events(\n self, run_id: str, step_key: str, retry_number: int\n ) -> Sequence[EventLogEntry]:\n path = self._dbfs_path(run_id, step_key, f"{retry_number}_{PICKLED_EVENTS_FILE_NAME}")\n\n def _get_step_records() -> Sequence[EventLogEntry]:\n serialized_records = self.databricks_runner.client.read_file(path)\n if not serialized_records:\n return []\n return cast(\n Sequence[EventLogEntry],\n deserialize_value(pickle.loads(gzip.decompress(serialized_records))),\n )\n\n try:\n # reading from dbfs 
while it writes can be flaky\n # allow for retry if we get malformed data\n return backoff(\n fn=_get_step_records,\n retry_on=(pickle.UnpicklingError, OSError, zlib.error, EOFError),\n max_retries=4,\n )\n # if you poll before the Databricks process has had a chance to create the file,\n # we expect to get this error\n except DatabricksError as e:\n if e.error_code == "RESOURCE_DOES_NOT_EXIST":\n return []\n raise\n\n def _grant_permissions(\n self, log: DagsterLogManager, databricks_run_id: int, request_retries: int = 3\n ) -> None:\n client = self.databricks_runner.client.workspace_client\n # Retrieve run info\n cluster_id = None\n for i in range(1, request_retries + 1):\n run_info = client.jobs.get_run(databricks_run_id)\n # if a new job cluster is created, the cluster_instance key may not be immediately present in the run response\n try:\n cluster_id = run_info.cluster_instance.cluster_id\n break\n except:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id}. "\n f"Retrying {i} of {request_retries} times."\n )\n time.sleep(5)\n if not cluster_id:\n log.warning(\n f"Failed to retrieve cluster info for databricks_run_id {databricks_run_id} "\n f"{request_retries} times. Skipping permission updates..."\n )\n return\n\n # Update job permissions\n if "job_permissions" in self.permissions:\n job_permissions = self._format_permissions(self.permissions["job_permissions"])\n job_id = run_info.job_id # type: ignore # (??)\n log.debug(f"Updating job permissions with following json: {job_permissions}")\n client.permissions.update("jobs", job_id, access_control_list=job_permissions)\n log.info("Successfully updated cluster permissions")\n\n # Update cluster permissions\n if "cluster_permissions" in self.permissions:\n if "existing" in self.run_config["cluster"]:\n raise ValueError(\n "Attempting to update permissions of an existing cluster. 
"\n "This is dangerous and thus unsupported."\n )\n cluster_permissions = self._format_permissions(self.permissions["cluster_permissions"])\n log.debug(f"Updating cluster permissions with following json: {cluster_permissions}")\n client.permissions.update(\n "clusters", cluster_id, access_control_list=cluster_permissions\n )\n log.info("Successfully updated cluster permissions")\n\n def _format_permissions(\n self, input_permissions: Mapping[str, Sequence[Mapping[str, str]]]\n ) -> Sequence[Mapping[str, str]]:\n access_control_list = []\n for permission, accessors in input_permissions.items():\n access_control_list.extend(\n [\n jobs.JobAccessControlRequest.from_dict(\n {"permission_level": permission, **accessor}\n )\n for accessor in accessors\n ]\n )\n return access_control_list\n\n def _get_databricks_task(self, run_id: str, step_key: str) -> Mapping[str, Any]:\n """Construct the 'task' parameter to be submitted to the Databricks API.\n\n This will create a 'spark_python_task' dict where `python_file` is a path on DBFS\n pointing to the 'databricks_step_main.py' file, and `parameters` is an array with a single\n element, a path on DBFS pointing to the picked `step_run_ref` data.\n\n See https://docs.databricks.com/dev-tools/api/latest/jobs.html#jobssparkpythontask.\n """\n python_file = self._dbfs_path(run_id, step_key, self._main_file_name())\n parameters = [\n self._internal_dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n self._internal_dbfs_path(run_id, step_key, CODE_ZIP_NAME),\n ]\n return {"spark_python_task": {"python_file": python_file, "parameters": parameters}}\n\n def _upload_artifacts(\n self, log: DagsterLogManager, step_run_ref: StepRunRef, run_id: str, step_key: str\n ) -> None:\n """Upload the step run ref and pyspark code to DBFS to run as a job."""\n log.info("Uploading main file to DBFS")\n main_local_path = self._main_file_local_path()\n with 
open(main_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, self._main_file_name()), overwrite=True\n )\n\n log.info("Uploading dagster job to DBFS")\n with tempfile.TemporaryDirectory() as temp_dir:\n # Zip and upload package containing dagster job\n zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)\n build_pyspark_zip(zip_local_path, self.local_dagster_job_package_path)\n with open(zip_local_path, "rb") as infile:\n self.databricks_runner.client.put_file(\n infile, self._dbfs_path(run_id, step_key, CODE_ZIP_NAME), overwrite=True\n )\n\n log.info("Uploading step run ref file to DBFS")\n step_pickle_file = io.BytesIO()\n\n pickle.dump(step_run_ref, step_pickle_file)\n step_pickle_file.seek(0)\n self.databricks_runner.client.put_file(\n step_pickle_file,\n self._dbfs_path(run_id, step_key, PICKLED_STEP_RUN_REF_FILE_NAME),\n overwrite=True,\n )\n\n databricks_config = self.create_remote_config()\n log.info("Uploading Databricks configuration to DBFS")\n databricks_config_file = io.BytesIO()\n pickle.dump(databricks_config, databricks_config_file)\n databricks_config_file.seek(0)\n self.databricks_runner.client.put_file(\n databricks_config_file,\n self._dbfs_path(run_id, step_key, PICKLED_CONFIG_FILE_NAME),\n overwrite=True,\n )\n\n def get_dagster_env_variables(self) -> Dict[str, str]:\n out = {}\n if self.add_dagster_env_variables:\n for var in DAGSTER_SYSTEM_ENV_VARS:\n if os.getenv(var):\n out.update({var: os.getenv(var)})\n return out\n\n def create_remote_config(self) -> "DatabricksConfig":\n env_variables = self.get_dagster_env_variables()\n env_variables.update(self.env_variables)\n databricks_config = DatabricksConfig(\n env_variables=env_variables,\n storage=self.storage,\n secrets=self.secrets,\n )\n return databricks_config\n\n def _log_logs_from_cluster(self, log: DagsterLogManager, run_id: int) -> None:\n logs = self.databricks_runner.retrieve_logs_for_run_id(log, run_id)\n if logs is 
None:\n return\n stdout, stderr = logs\n if stderr:\n log.info(stderr)\n if stdout:\n log.info(stdout)\n\n def _main_file_name(self) -> str:\n return os.path.basename(self._main_file_local_path())\n\n def _main_file_local_path(self) -> str:\n return databricks_step_main.__file__\n\n def _sanitize_step_key(self, step_key: str) -> str:\n # step_keys of dynamic steps contain brackets, which are invalid characters\n return step_key.replace("[", "__").replace("]", "__")\n\n def _dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"dbfs://{path}"\n\n def _internal_dbfs_path(self, run_id: str, step_key: str, filename: str) -> str:\n """Scripts running on Databricks should access DBFS at /dbfs/."""\n path = "/".join(\n [\n self.staging_prefix,\n run_id,\n self._sanitize_step_key(step_key),\n os.path.basename(filename),\n ]\n )\n return f"/dbfs/{path}"\n\n\nclass DatabricksConfig:\n """Represents configuration required by Databricks to run jobs.\n\n Instances of this class will be created when a Databricks step is launched and will contain\n all configuration and secrets required to set up storage and environment variables within\n the Databricks environment. The instance will be serialized and uploaded to Databricks\n by the step launcher, then deserialized as part of the 'main' script when the job is running\n in Databricks.\n\n The `setup` method handles the actual setup prior to op execution on the Databricks side.\n\n This config is separated out from the regular Dagster run config system because the setup\n is done by the 'main' script before entering a Dagster context (i.e. 
using `run_step_from_ref`).\n We use a separate class to avoid coupling the setup to the format of the `step_run_ref` object.\n """\n\n def __init__(\n self,\n env_variables: Mapping[str, str],\n storage: Mapping[str, Any],\n secrets: Sequence[Mapping[str, Any]],\n ):\n """Create a new DatabricksConfig object.\n\n `storage` and `secrets` should be of the same shape as the `storage` and\n `secrets_to_env_variables` config passed to `databricks_pyspark_step_launcher`.\n """\n self.env_variables = env_variables\n self.storage = storage\n self.secrets = secrets\n\n def setup(self, dbutils: Any, sc: Any) -> None:\n """Set up storage and environment variables on Databricks.\n\n The `dbutils` and `sc` arguments must be passed in by the 'main' script, as they\n aren't accessible by any other modules.\n """\n self.setup_storage(dbutils, sc)\n self.setup_environment(dbutils)\n\n def setup_storage(self, dbutils: Any, sc: Any) -> None:\n """Set up storage using either S3 or ADLS2."""\n if "s3" in self.storage:\n self.setup_s3_storage(self.storage["s3"], dbutils, sc)\n elif "adls2" in self.storage:\n self.setup_adls2_storage(self.storage["adls2"], dbutils, sc)\n\n def setup_s3_storage(self, s3_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain AWS credentials from Databricks secrets and export so both Spark and boto can use them."""\n scope = s3_storage["secret_scope"]\n\n access_key = dbutils.secrets.get(scope=scope, key=s3_storage["access_key_key"])\n secret_key = dbutils.secrets.get(scope=scope, key=s3_storage["secret_key_key"])\n\n # Spark APIs will use this.\n # See https://docs.databricks.com/data/data-sources/aws/amazon-s3.html#alternative-1-set-aws-keys-in-the-spark-context.\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access_key) # noqa: SLF001\n sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secret_key) # noqa: SLF001\n\n # Boto will use these.\n os.environ["AWS_ACCESS_KEY_ID"] = access_key\n 
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key\n\n def setup_adls2_storage(self, adls2_storage: Mapping[str, Any], dbutils: Any, sc: Any) -> None:\n """Obtain an Azure Storage Account key from Databricks secrets and export so Spark can use it."""\n storage_account_key = dbutils.secrets.get(\n scope=adls2_storage["secret_scope"], key=adls2_storage["storage_account_key_key"]\n )\n # Spark APIs will use this.\n # See https://docs.microsoft.com/en-gb/azure/databricks/data/data-sources/azure/azure-datalake-gen2#--access-directly-using-the-storage-account-access-key\n # sc is globally defined in the Databricks runtime and points to the Spark context\n sc._jsc.hadoopConfiguration().set( # noqa: SLF001\n "fs.azure.account.key.{}.dfs.core.windows.net".format(\n adls2_storage["storage_account_name"]\n ),\n storage_account_key,\n )\n\n def setup_environment(self, dbutils: Any) -> None:\n """Setup any environment variables required by the run.\n\n Extract any secrets in the run config and export them as environment variables.\n\n This is important for any `StringSource` config since the environment variables\n won't ordinarily be available in the Databricks execution environment.\n """\n for env_k, env_v in self.env_variables.items():\n os.environ[env_k] = env_v\n\n for secret in self.secrets:\n name = secret["name"]\n key = secret["key"]\n scope = secret["scope"]\n print(f"Exporting {name} from Databricks secret {key}, scope {scope}") # noqa: T201\n val = dbutils.secrets.get(scope=scope, key=key)\n os.environ[name] = val\n
", "current_page_name": "_modules/dagster_databricks/databricks_pyspark_step_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.databricks_pyspark_step_launcher"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.ops

\nfrom typing import TYPE_CHECKING, Optional\n\nfrom dagster import (\n    In,\n    Nothing,\n    OpExecutionContext,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom databricks.sdk.service import jobs\nfrom pydantic import Field\n\nDEFAULT_POLL_INTERVAL_SECONDS = 10\n# wait at most 24 hours by default for run execution\nDEFAULT_MAX_WAIT_TIME_SECONDS = 24 * 60 * 60\nfrom dagster import Config\n\nif TYPE_CHECKING:\n    from .databricks import DatabricksClient\n\n\n
[docs]def create_databricks_run_now_op(\n databricks_job_id: int,\n databricks_job_configuration: Optional[dict] = None,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that launches an existing databricks job.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/runnow. The only required field is\n ``job_id``, which is the ID of the job to be executed. Additional fields can be used to specify\n override parameters for the Databricks Job.\n\n Arguments:\n databricks_job_id (int): The ID of the Databricks Job to be executed.\n databricks_job_configuration (dict): Configuration for triggering a new job run of a\n Databricks Job. See https://docs.databricks.com/api-explorer/workspace/jobs/runnow\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_run_now_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to run the Databricks Job.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_run_now_op, DatabricksClientResource\n\n DATABRICKS_JOB_ID = 1234\n\n\n run_now_op = create_databricks_run_now_op(\n databricks_job_id=DATABRICKS_JOB_ID,\n databricks_job_configuration={\n "python_params": [\n "--input",\n "schema.db.input_table",\n "--output",\n "schema.db.output_table",\n ],\n },\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n run_now_op()\n """\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksRunNowOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_run_now_op(context: OpExecutionContext, config: DatabricksRunNowOpConfig):\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.run_now(\n job_id=databricks_job_id,\n **(databricks_job_configuration or {}),\n )\n run_id = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_run_now_op
\n\n\n
[docs]def create_databricks_submit_run_op(\n databricks_job_configuration: dict,\n poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,\n max_wait_time_seconds: float = DEFAULT_MAX_WAIT_TIME_SECONDS,\n name: Optional[str] = None,\n databricks_resource_key: str = "databricks",\n) -> OpDefinition:\n """Creates an op that submits a one-time run of a set of tasks on Databricks.\n\n As config, the op accepts a blob of the form described in Databricks' Job API:\n https://docs.databricks.com/api-explorer/workspace/jobs/submit.\n\n Arguments:\n databricks_job_configuration (dict): Configuration for submitting a one-time run of a set\n of tasks on Databricks. See https://docs.databricks.com/api-explorer/workspace/jobs/submit\n for the full configuration.\n poll_interval_seconds (float): How often to poll the Databricks API to check whether the\n Databricks job has finished running.\n max_wait_time_seconds (float): How long to wait for the Databricks job to finish running\n before raising an error.\n name (Optional[str]): The name of the op. If not provided, the name will be\n _databricks_submit_run_op.\n databricks_resource_key (str): The name of the resource key used by this op. If not\n provided, the resource key will be "databricks".\n\n Returns:\n OpDefinition: An op definition to submit a one-time run of a set of tasks on Databricks.\n\n Example:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_databricks import create_databricks_submit_run_op, DatabricksClientResource\n\n\n submit_run_op = create_databricks_submit_run_op(\n databricks_job_configuration={\n "new_cluster": {\n "spark_version": '2.1.0-db3-scala2.11',\n "num_workers": 2\n },\n "notebook_task": {\n "notebook_path": "/Users/dagster@example.com/PrepareData",\n },\n }\n )\n\n @job(\n resource_defs={\n "databricks": DatabricksClientResource(\n host=EnvVar("DATABRICKS_HOST"),\n token=EnvVar("DATABRICKS_TOKEN")\n )\n }\n )\n def do_stuff():\n submit_run_op()\n """\n check.invariant(\n bool(databricks_job_configuration),\n "Configuration for the one-time Databricks Job is required.",\n )\n\n _poll_interval_seconds = poll_interval_seconds\n _max_wait_time_seconds = max_wait_time_seconds\n\n class DatabricksSubmitRunOpConfig(Config):\n poll_interval_seconds: float = Field(\n default=_poll_interval_seconds,\n description="Check whether the Databricks Job is done at this interval, in seconds.",\n )\n max_wait_time_seconds: int = Field(\n default=_max_wait_time_seconds,\n description=(\n "If the Databricks Job is not complete after this length of time, in seconds,"\n " raise an error."\n ),\n )\n\n @op(\n ins={"start_after": In(Nothing)},\n required_resource_keys={databricks_resource_key},\n tags={"kind": "databricks"},\n name=name,\n )\n def _databricks_submit_run_op(\n context: OpExecutionContext, config: DatabricksSubmitRunOpConfig\n ) -> None:\n databricks: DatabricksClient = getattr(context.resources, databricks_resource_key)\n jobs_service = databricks.workspace_client.jobs\n\n run = jobs_service.submit(\n tasks=[jobs.SubmitTask.from_dict(databricks_job_configuration)],\n )\n run_id: int = run.bind()["run_id"]\n\n get_run_response = jobs_service.get_run(run_id=run_id)\n\n context.log.info(\n f"Launched databricks job run for '{get_run_response.run_name}' (`{run_id}`). URL:"\n f" {get_run_response.run_page_url}. 
Waiting to run to complete."\n )\n\n databricks.wait_for_run_to_complete(\n logger=context.log,\n databricks_run_id=run_id,\n poll_interval_sec=config.poll_interval_seconds,\n max_wait_time_sec=config.max_wait_time_seconds,\n )\n\n return _databricks_submit_run_op
\n
", "current_page_name": "_modules/dagster_databricks/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_databricks.resources

\nfrom typing import Any, Optional\n\nfrom dagster import (\n    Config,\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field, root_validator\n\nfrom .databricks import DatabricksClient\n\n\nclass OauthCredentials(Config):\n    """OAuth credentials for Databricks.\n\n    See https://docs.databricks.com/dev-tools/api/latest/authentication.html#oauth-2-0.\n    """\n\n    client_id: str = Field(description="OAuth client ID")\n    client_secret: str = Field(description="OAuth client secret")\n\n\n
[docs]class DatabricksClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource which provides a Python client for interacting with Databricks within an\n op or asset.\n """\n\n host: str = Field(description="Databricks host, e.g. https://uksouth.azuredatabricks.com")\n token: Optional[str] = Field(default=None, description="Databricks access token")\n oauth_credentials: Optional[OauthCredentials] = Field(\n default=None,\n description=(\n "Databricks OAuth credentials for using a service principal. See"\n " https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0"\n ),\n )\n workspace_id: Optional[str] = Field(\n default=None,\n description=(\n "DEPRECATED: The Databricks workspace ID, as described in"\n " https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids."\n " This is no longer used and will be removed in a 0.21."\n ),\n )\n\n @root_validator()\n def has_token_or_oauth_credentials(cls, values):\n token = values.get("token")\n oauth_credentials = values.get("oauth_credentials")\n if not token and not oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials")\n if token and oauth_credentials:\n raise ValueError("Must provide either token or oauth_credentials, not both")\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatabricksClient:\n if self.oauth_credentials:\n client_id = self.oauth_credentials.client_id\n client_secret = self.oauth_credentials.client_secret\n else:\n client_id = None\n client_secret = None\n\n return DatabricksClient(\n host=self.host,\n token=self.token,\n oauth_client_id=client_id,\n oauth_client_secret=client_secret,\n workspace_id=self.workspace_id,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatabricksClientResource.to_config_schema())\ndef databricks_client(init_context) -> DatabricksClient:\n return DatabricksClientResource.from_resource_context(init_context).get_client()
\n
", "current_page_name": "_modules/dagster_databricks/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_databricks.resources"}}, "dagster_datadog": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datadog.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datadog import DogStatsd, initialize, statsd\nfrom pydantic import Field\n\n\nclass DatadogClient:\n    # Mirroring levels from the dogstatsd library\n    OK, WARNING, CRITICAL, UNKNOWN = (\n        DogStatsd.OK,\n        DogStatsd.WARNING,\n        DogStatsd.CRITICAL,\n        DogStatsd.UNKNOWN,\n    )\n\n    def __init__(self, api_key: str, app_key: str):\n        self.api_key = api_key\n        self.app_key = app_key\n        initialize(api_key=api_key, app_key=app_key)\n\n        # Pull in methods from the dogstatsd library\n        for method in [\n            "event",\n            "gauge",\n            "increment",\n            "decrement",\n            "histogram",\n            "distribution",\n            "set",\n            "service_check",\n            "timed",\n            "timing",\n        ]:\n            setattr(self, method, getattr(statsd, method))\n\n\n
[docs]class DatadogResource(ConfigurableResource):\n """This resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op\n def datadog_op(datadog_client: ResourceParam[DatadogClient]):\n datadog_client.event('Man down!', 'This server needs assistance.')\n datadog_client.gauge('users.online', 1001, tags=["protocol:http"])\n datadog_client.increment('page.views')\n datadog_client.decrement('page.views')\n datadog_client.histogram('album.photo.count', 26, tags=["gender:female"])\n datadog_client.distribution('album.photo.count', 26, tags=["color:blue"])\n datadog_client.set('visitors.uniques', 999, tags=["browser:ie"])\n datadog_client.service_check('svc.check_name', datadog_client.WARNING)\n datadog_client.timing("query.response.time", 1234)\n\n # Use timed decorator\n @datadog_client.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job\n def job_for_datadog_op() -> None:\n datadog_op()\n\n job_for_datadog_op.execute_in_process(\n resources={"datadog_client": DatadogResource(api_key="FOO", app_key="BAR")}\n )\n\n """\n\n api_key: str = Field(\n description=(\n "Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/"\n )\n )\n app_key: str = Field(\n description=(\n "Datadog application key. See"\n " https://docs.datadoghq.com/account_management/api-app-keys/."\n )\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> DatadogClient:\n return DatadogClient(self.api_key, self.app_key)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DatadogResource.to_config_schema(),\n description="This resource is for publishing to DataDog",\n)\ndef datadog_resource(context) -> DatadogClient:\n """This legacy resource is a thin wrapper over the\n `dogstatsd library <https://datadogpy.readthedocs.io/en/latest/>`_.\n\n Prefer using :py:class:`DatadogResource`.\n\n As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\n `DataDog documentation <https://docs.datadoghq.com/developers/dogstatsd/>`_ for how to use this\n resource.\n\n Examples:\n .. code-block:: python\n\n @op(required_resource_keys={'datadog'})\n def datadog_op(context):\n dd = context.resources.datadog\n\n dd.event('Man down!', 'This server needs assistance.')\n dd.gauge('users.online', 1001, tags=["protocol:http"])\n dd.increment('page.views')\n dd.decrement('page.views')\n dd.histogram('album.photo.count', 26, tags=["gender:female"])\n dd.distribution('album.photo.count', 26, tags=["color:blue"])\n dd.set('visitors.uniques', 999, tags=["browser:ie"])\n dd.service_check('svc.check_name', dd.WARNING)\n dd.timing("query.response.time", 1234)\n\n # Use timed decorator\n @dd.timed('run_fn')\n def run_fn():\n pass\n\n run_fn()\n\n @job(resource_defs={'datadog': datadog_resource})\n def dd_job():\n datadog_op()\n\n result = dd_job.execute_in_process(\n run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n )\n\n """\n return DatadogResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_datadog/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datadog.resources"}}, "dagster_datahub": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_datahub.resources

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import InitResourceContext, resource\nfrom dagster._config.pythonic_config import Config, ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom datahub.emitter.kafka_emitter import (\n    DEFAULT_MCE_KAFKA_TOPIC,\n    DEFAULT_MCP_KAFKA_TOPIC,\n    MCE_KEY,\n    MCP_KEY,\n    DatahubKafkaEmitter,\n    KafkaEmitterConfig,\n)\nfrom datahub.emitter.rest_emitter import DatahubRestEmitter\nfrom pydantic import Field\n\n\n
[docs]class DatahubRESTEmitterResource(ConfigurableResource):\n connection: str = Field(description="Datahub GMS Server")\n token: Optional[str] = Field(default=None, description="Personal Access Token")\n connect_timeout_sec: Optional[float] = None\n read_timeout_sec: Optional[float] = None\n retry_status_codes: Optional[List[int]] = None\n retry_methods: Optional[List[str]] = None\n retry_max_times: Optional[int] = None\n extra_headers: Optional[Dict[str, str]] = None\n ca_certificate_path: Optional[str] = None\n server_telemetry_id: Optional[str] = None\n disable_ssl_verification: bool = False\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubRestEmitter:\n return DatahubRestEmitter(\n gms_server=self.connection,\n token=self.token,\n connect_timeout_sec=self.connect_timeout_sec,\n read_timeout_sec=self.read_timeout_sec,\n retry_status_codes=self.retry_status_codes,\n retry_methods=self.retry_methods,\n retry_max_times=self.retry_max_times,\n extra_headers=self.extra_headers,\n ca_certificate_path=self.ca_certificate_path,\n server_telemetry_id=self.server_telemetry_id,\n disable_ssl_verification=self.disable_ssl_verification,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubRESTEmitterResource.to_config_schema())\ndef datahub_rest_emitter(init_context: InitResourceContext) -> DatahubRestEmitter:\n emitter = DatahubRestEmitter(\n gms_server=init_context.resource_config.get("connection"),\n token=init_context.resource_config.get("token"),\n connect_timeout_sec=init_context.resource_config.get("connect_timeout_sec"),\n read_timeout_sec=init_context.resource_config.get("read_timeout_sec"),\n retry_status_codes=init_context.resource_config.get("retry_status_codes"),\n retry_methods=init_context.resource_config.get("retry_methods"),\n retry_max_times=init_context.resource_config.get("retry_max_times"),\n extra_headers=init_context.resource_config.get("extra_headers"),\n ca_certificate_path=init_context.resource_config.get("ca_certificate_path"),\n server_telemetry_id=init_context.resource_config.get("server_telemetry_id"),\n disable_ssl_verification=init_context.resource_config.get("disable_ssl_verification"),\n )\n # Attempt to hit the server to ensure the resource is properly configured\n emitter.test_connection()\n return emitter
\n\n\nclass DatahubConnection(Config):\n bootstrap: str = Field(description="Kafka Boostrap Servers. Comma delimited")\n schema_registry_url: str = Field(description="Schema Registry Location.")\n schema_registry_config: Dict[str, Any] = Field(\n default={}, description="Extra Schema Registry Config."\n )\n\n\n
[docs]class DatahubKafkaEmitterResource(ConfigurableResource):\n connection: DatahubConnection\n topic: Optional[str] = None\n topic_routes: Dict[str, str] = Field(\n default={\n MCE_KEY: DEFAULT_MCE_KAFKA_TOPIC,\n MCP_KEY: DEFAULT_MCP_KAFKA_TOPIC,\n }\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_emitter(self) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(\n KafkaEmitterConfig.parse_obj(self._convert_to_config_dictionary())\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=DatahubKafkaEmitterResource.to_config_schema())\ndef datahub_kafka_emitter(init_context: InitResourceContext) -> DatahubKafkaEmitter:\n return DatahubKafkaEmitter(KafkaEmitterConfig.parse_obj(init_context.resource_config))
\n
", "current_page_name": "_modules/dagster_datahub/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_datahub.resources"}}, "dagster_dbt": {"asset_decorator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_decorator

\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    Nothing,\n    PartitionsDefinition,\n    multi_asset,\n)\n\nfrom .asset_utils import (\n    DAGSTER_DBT_TRANSLATOR_METADATA_KEY,\n    MANIFEST_METADATA_KEY,\n    default_asset_check_fn,\n    default_code_version_fn,\n    get_deps,\n)\nfrom .dagster_dbt_translator import DagsterDbtTranslator, DbtManifestWrapper\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    output_name_fn,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]def dbt_assets(\n *,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),\n) -> Callable[..., AssetsDefinition]:\n """Create a definition for how to compute a set of dbt resources, described by a manifest.json.\n When invoking dbt commands using :py:class:`~dagster_dbt.DbtCliResource`'s\n :py:meth:`~dagster_dbt.DbtCliResource.cli` method, Dagster events are emitted by calling\n ``yield from`` on the event stream returned by :py:meth:`~dagster_dbt.DbtCliInvocation.stream`.\n\n Args:\n manifest (Union[Mapping[str, Any], str, Path]): The contents of a manifest.json file\n or the path to a manifest.json file. A manifest.json contains a representation of a\n dbt project (models, tests, macros, etc). We use this representation to create\n corresponding Dagster assets.\n select (str): A dbt selection string for the models in a project that you want\n to include. Defaults to ``fqn:*``.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. to asset keys and asset metadata.\n\n Examples:\n Running ``dbt build`` for a dbt project:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Running dbt commands with flags:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build", "--full-refresh"], context=context).stream()\n\n Running dbt commands with ``--vars``:\n\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_vars = {"key": "value"}\n\n yield from dbt.cli(["build", "--vars", json.dumps(dbt_vars)], context=context).stream()\n\n Retrieving dbt artifacts after running a dbt command:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_build_invocation = dbt.cli(["build"], context=context)\n\n yield from dbt_build_invocation.stream()\n\n run_results_json = dbt_build_invocation.get_artifact("run_results.json")\n\n Running multiple dbt commands for a dbt project:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n yield from dbt.cli(["test"], context=context).stream()\n\n Customizing the Dagster asset metadata inferred from a dbt project using :py:class:`~dagster_dbt.DagsterDbtTranslator`:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n ...\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n dagster_dbt_translator=CustomDagsterDbtTranslator(),\n )\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n Invoking another Dagster :py:class:`~dagster.ResourceDefinition` alongside dbt:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n from dagster_slack import SlackResource\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, slack: SlackResource):\n yield from dbt.cli(["build"], context=context).stream()\n\n slack_client = slack.get_client()\n slack_client.chat_postMessage(channel="#my-channel", text="dbt build succeeded!")\n\n Defining and accessing Dagster :py:class:`~dagster.Config` alongside dbt:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext, Config\n from dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\n class MyDbtConfig(Config):\n full_refresh: bool\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, config: MyDbtConfig):\n dbt_build_args = ["build"]\n if config.full_refresh:\n dbt_build_args += ["--full-refresh"]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n Defining Dagster :py:class:`~dagster.PartitionDefinition` alongside dbt:\n\n\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster import AssetExecutionContext, DailyPartitionDefinition\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(\n manifest=Path("target", "manifest.json"),\n partitions_def=DailyPartitionsDefinition(start_date="2023-01-01")\n )\n def partitionshop_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n time_window = context.asset_partitions_time_window_for_output(\n list(context.selected_output_names)[0]\n )\n\n dbt_vars = {\n "min_date": time_window.start.isoformat(),\n "max_date": time_window.end.isoformat()\n }\n dbt_build_args = ["build", "--vars", json.dumps(dbt_vars)]\n\n yield from dbt.cli(dbt_build_args, context=context).stream()\n\n """\n check.inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n additional_message=(\n "Ensure that the argument is an instantiated class that subclasses"\n " DagsterDbtTranslator."\n ),\n )\n manifest = validate_manifest(manifest)\n\n unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude or "", manifest_json=manifest\n )\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n deps = get_deps(\n dbt_nodes=node_info_by_dbt_unique_id,\n selected_unique_ids=unique_ids,\n 
asset_resource_types=ASSET_RESOURCE_TYPES,\n )\n (\n non_argument_deps,\n outs,\n internal_asset_deps,\n check_specs,\n ) = get_dbt_multi_asset_args(\n dbt_nodes=node_info_by_dbt_unique_id,\n deps=deps,\n io_manager_key=io_manager_key,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n )\n\n def inner(fn) -> AssetsDefinition:\n asset_definition = multi_asset(\n outs=outs,\n internal_asset_deps=internal_asset_deps,\n deps=non_argument_deps,\n compute_kind="dbt",\n partitions_def=partitions_def,\n can_subset=True,\n op_tags={\n **({"dagster-dbt/select": select} if select else {}),\n **({"dagster-dbt/exclude": exclude} if exclude else {}),\n },\n check_specs=check_specs,\n )(fn)\n\n return asset_definition\n\n return inner
\n\n\ndef get_dbt_multi_asset_args(\n dbt_nodes: Mapping[str, Any],\n deps: Mapping[str, FrozenSet[str]],\n io_manager_key: Optional[str],\n manifest: Mapping[str, Any],\n dagster_dbt_translator: DagsterDbtTranslator,\n) -> Tuple[\n Sequence[AssetKey],\n Dict[str, AssetOut],\n Dict[str, Set[AssetKey]],\n Sequence[AssetCheckSpec],\n]:\n non_argument_deps: Set[AssetKey] = set()\n outs: Dict[str, AssetOut] = {}\n internal_asset_deps: Dict[str, Set[AssetKey]] = {}\n check_specs: Sequence[AssetCheckSpec] = []\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n outs[output_name] = AssetOut(\n key=asset_key,\n dagster_type=Nothing,\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n is_required=False,\n metadata={ # type: ignore\n **dagster_dbt_translator.get_metadata(dbt_resource_props),\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest),\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n group_name=dagster_dbt_translator.get_group_name(dbt_resource_props),\n code_version=default_code_version_fn(dbt_resource_props),\n freshness_policy=dagster_dbt_translator.get_freshness_policy(dbt_resource_props),\n auto_materialize_policy=dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n ),\n )\n\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(asset_key, unique_id, test_resource_props)\n\n if check_spec:\n check_specs.append(check_spec)\n\n # Translate parent unique ids to internal asset deps and non argument dep\n output_internal_deps = 
internal_asset_deps.setdefault(output_name, set())\n for parent_unique_id in parent_unique_ids:\n parent_resource_props = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_resource_props)\n\n # Add this parent as an internal dependency\n output_internal_deps.add(parent_asset_key)\n\n # Mark this parent as an input if it has no dependencies\n if parent_unique_id not in deps:\n non_argument_deps.add(parent_asset_key)\n\n return list(non_argument_deps), outs, internal_asset_deps, check_specs\n
", "current_page_name": "_modules/dagster_dbt/asset_decorator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_decorator"}, "asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_defs

\nimport hashlib\nimport json\nimport os\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetCheckResult,\n    AssetKey,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    In,\n    OpExecutionContext,\n    Out,\n    PartitionsDefinition,\n    PermissiveConfig,\n    _check as check,\n    get_dagster_logger,\n    op,\n)\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions.events import (\n    AssetMaterialization,\n    AssetObservation,\n    CoercibleToAssetKeyPrefix,\n    Output,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput, RawMetadataValue\nfrom dagster._core.errors import DagsterInvalidSubsetError\nfrom dagster._utils.merger import deep_merge_dicts\nfrom dagster._utils.warnings import (\n    deprecation_warning,\n    normalize_renamed_param,\n)\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.core.resources import DbtCliClient\nfrom dagster_dbt.core.resources_v2 import DbtCliResource\nfrom dagster_dbt.core.types import DbtCliOutput\nfrom dagster_dbt.core.utils import build_command_args_from_flags, execute_cli\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\nfrom dagster_dbt.errors import DagsterDbtError\nfrom dagster_dbt.types import DbtOutput\nfrom dagster_dbt.utils import (\n    ASSET_RESOURCE_TYPES,\n    output_name_fn,\n    result_to_events,\n    select_unique_ids_from_manifest,\n)\n\n\ndef _load_manifest_for_project(\n    project_dir: str,\n    profiles_dir: str,\n    
target_dir: str,\n    select: str,\n    exclude: str,\n) -> Tuple[Mapping[str, Any], DbtCliOutput]:\n    # running "dbt ls" regenerates the manifest.json, which includes a superset of the actual\n    # "dbt ls" output\n    cli_output = execute_cli(\n        executable="dbt",\n        command="ls",\n        log=get_dagster_logger(),\n        flags_dict={\n            "project-dir": project_dir,\n            "profiles-dir": profiles_dir,\n            "select": select,\n            "exclude": exclude,\n            "output": "json",\n        },\n        warn_error=False,\n        ignore_handled_error=False,\n        target_path=target_dir,\n        json_log_format=True,\n        capture_logs=True,\n    )\n    manifest_path = os.path.join(target_dir, "manifest.json")\n    with open(manifest_path, "r", encoding="utf8") as f:\n        return json.load(f), cli_output\n\n\ndef _can_stream_events(dbt_resource: Union[DbtCliClient, DbtCliResource]) -> bool:\n    """Check if the installed dbt version supports streaming events."""\n    import dbt.version\n    from packaging import version\n\n    if version.parse(dbt.version.__version__) >= version.parse("1.4.0"):\n        # The json log format is required for streaming events. 
DbtCliResource always uses this format, but\n        # DbtCliClient has an option to disable it.\n        if isinstance(dbt_resource, DbtCliResource):\n            return True\n        else:\n            return dbt_resource._json_log_format  # noqa: SLF001\n    else:\n        return False\n\n\ndef _batch_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: DbtCliClient,\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n) -> Iterator[Union[AssetObservation, AssetMaterialization, Output]]:\n    """Yields events for a dbt cli invocation. Waits until the entire command has completed before\n    emitting outputs.\n    """\n    # clean up any run results from the last run\n    dbt_resource.remove_run_results_json()\n\n    dbt_output: Optional[DbtOutput] = None\n    try:\n        if use_build_command:\n            dbt_output = dbt_resource.build(**kwargs)\n        else:\n            dbt_output = dbt_resource.run(**kwargs)\n    finally:\n        # in the case that the project only partially runs successfully, still attempt to generate\n        # events for the parts that were successful\n        if dbt_output is None:\n            dbt_output = DbtOutput(result=check.not_none(dbt_resource.get_run_results_json()))\n\n        manifest_json = check.not_none(dbt_resource.get_manifest_json())\n\n        dbt_output = check.not_none(dbt_output)\n        for result in dbt_output.result["results"]:\n            extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None\n            if runtime_metadata_fn:\n                node_info = manifest_json["nodes"][result["unique_id"]]\n                extra_metadata = runtime_metadata_fn(context, node_info)\n            yield from result_to_events(\n                result=result,\n                
docs_url=dbt_output.docs_url,\n                node_info_to_asset_key=node_info_to_asset_key,\n                manifest_json=manifest_json,\n                extra_metadata=extra_metadata,\n                generate_asset_outputs=True,\n            )\n\n\ndef _events_for_structured_json_line(\n    json_line: Mapping[str, Any],\n    context: OpExecutionContext,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output]]:\n    """Parses a json line into a Dagster event. Attempts to replicate the behavior of result_to_events\n    as closely as possible.\n    """\n    runtime_node_info = json_line.get("data", {}).get("node_info", {})\n    if not runtime_node_info:\n        return\n\n    node_resource_type = runtime_node_info.get("resource_type")\n    node_status = runtime_node_info.get("node_status")\n    unique_id = runtime_node_info.get("unique_id")\n\n    if not node_resource_type or not unique_id:\n        return\n\n    compiled_node_info = manifest_json["nodes"][unique_id]\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and node_status == "success":\n        metadata = dict(\n            runtime_metadata_fn(context, compiled_node_info) if runtime_metadata_fn else {}\n        )\n        started_at_str = runtime_node_info.get("node_started_at")\n        finished_at_str = runtime_node_info.get("node_finished_at")\n        if started_at_str is None or finished_at_str is None:\n            return\n\n        started_at = dateutil.parser.isoparse(started_at_str)  # type: ignore\n        completed_at = dateutil.parser.isoparse(finished_at_str)  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                "Execution Started At": started_at.isoformat(timespec="seconds"),\n               
 "Execution Completed At": completed_at.isoformat(timespec="seconds"),\n                "Execution Duration": duration.total_seconds(),\n            }\n        )\n        yield Output(\n            value=None,\n            output_name=output_name_fn(compiled_node_info),\n            metadata=metadata,\n        )\n    elif node_resource_type == "test" and runtime_node_info.get("node_finished_at"):\n        upstream_unique_ids = (\n            manifest_json["nodes"][unique_id].get("depends_on", {}).get("nodes", [])\n        )\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            upstream_node_info = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if upstream_node_info is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(upstream_node_info)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": unique_id,\n                    "Test Status": node_status,\n                },\n            )\n\n\ndef _stream_event_iterator(\n    context: OpExecutionContext,\n    dbt_resource: Union[DbtCliResource, DbtCliClient],\n    use_build_command: bool,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    kwargs: Dict[str, Any],\n    manifest_json: Mapping[str, Any],\n) -> Iterator[Union[AssetObservation, Output, AssetCheckResult]]:\n    """Yields events for a dbt cli invocation. 
Emits outputs as soon as the relevant dbt logs are\n    emitted.\n    """\n    if isinstance(dbt_resource, DbtCliClient):\n        for parsed_json_line in dbt_resource.cli_stream_json(\n            command="build" if use_build_command else "run",\n            **kwargs,\n        ):\n            yield from _events_for_structured_json_line(\n                parsed_json_line,\n                context,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                manifest_json,\n            )\n    else:\n        if runtime_metadata_fn is not None:\n            raise DagsterDbtError(\n                "The runtime_metadata_fn argument on the load_assets_from_dbt_manifest and"\n                " load_assets_from_dbt_project functions is not supported when using the"\n                " DbtCliResource resource. Use the @dbt_assets decorator instead if you want"\n                " control over what metadata is yielded at runtime."\n            )\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n                return node_info_to_asset_key(dbt_resource_props)\n\n        cli_output = dbt_resource.cli(\n            args=["build" if use_build_command else "run", *build_command_args_from_flags(kwargs)],\n            manifest=manifest_json,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n        )\n        yield from cli_output.stream()\n\n\nclass DbtOpConfig(PermissiveConfig):\n    """Keyword arguments to pass to the underlying dbt command. Additional arguments not listed in the schema will\n    be passed through as well, e.g. 
{'bool_flag': True, 'string_flag': 'hi'} will result in the flags\n    '--bool_flag --string_flag hi' being passed to the dbt command.\n    """\n\n    select: Optional[str] = None\n    exclude: Optional[str] = None\n    vars: Optional[Dict[str, Any]] = None\n    full_refresh: Optional[bool] = None\n\n\ndef _get_dbt_op(\n    op_name: str,\n    ins: Mapping[str, In],\n    outs: Mapping[str, Out],\n    select: str,\n    exclude: str,\n    use_build_command: bool,\n    fqns_by_output_name: Mapping[str, List[str]],\n    dbt_resource_key: str,\n    node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    manifest_json: Mapping[str, Any],\n):\n    @op(\n        name=op_name,\n        tags={"kind": "dbt"},\n        ins=ins,\n        out=outs,\n        required_resource_keys={dbt_resource_key},\n    )\n    def _dbt_op(context, config: DbtOpConfig):\n        dbt_resource: Union[DbtCliResource, DbtCliClient] = getattr(\n            context.resources, dbt_resource_key\n        )\n        check.inst(\n            dbt_resource,\n            (DbtCliResource, DbtCliClient),\n            "Resource with key 'dbt_resource_key' must be a DbtCliResource or DbtCliClient"\n            f" object, but is a {type(dbt_resource)}",\n        )\n\n        kwargs: Dict[str, Any] = {}\n        # in the case that we're running everything, opt for the cleaner selection string\n        if len(context.selected_output_names) == len(outs):\n            kwargs["select"] = select\n            kwargs["exclude"] = exclude\n        else:\n            # for each output that we want to emit, translate to a dbt select string by converting\n            # the out to its corresponding fqn\n            kwargs["select"] = [\n                ".".join(fqns_by_output_name[output_name])\n           
     for output_name in context.selected_output_names\n            ]\n        # variables to pass into the command\n        if partition_key_to_vars_fn:\n            kwargs["vars"] = partition_key_to_vars_fn(context.partition_key)\n        # merge in any additional kwargs from the config\n        kwargs = deep_merge_dicts(kwargs, context.op_config)\n\n        if _can_stream_events(dbt_resource):\n            yield from _stream_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n                manifest_json=manifest_json,\n            )\n        else:\n            if not isinstance(dbt_resource, DbtCliClient):\n                check.failed(\n                    "Chose batch event iterator, but it only works with DbtCliClient, and"\n                    f" resource has type {type(dbt_resource)}"\n                )\n            yield from _batch_event_iterator(\n                context,\n                dbt_resource,\n                use_build_command,\n                node_info_to_asset_key,\n                runtime_metadata_fn,\n                kwargs,\n            )\n\n    return _dbt_op\n\n\ndef _dbt_nodes_to_assets(\n    dbt_nodes: Mapping[str, Any],\n    select: str,\n    exclude: str,\n    selected_unique_ids: AbstractSet[str],\n    project_id: str,\n    dbt_resource_key: str,\n    manifest_json: Mapping[str, Any],\n    op_name: Optional[str],\n    runtime_metadata_fn: Optional[\n        Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, RawMetadataValue]]\n    ],\n    io_manager_key: Optional[str],\n    use_build_command: bool,\n    partitions_def: Optional[PartitionsDefinition],\n    partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n    dagster_dbt_translator: DagsterDbtTranslator,\n) -> AssetsDefinition:\n    if use_build_command:\n        deps = get_deps(\n        
    dbt_nodes,\n            selected_unique_ids,\n            asset_resource_types=["model", "seed", "snapshot"],\n        )\n    else:\n        deps = get_deps(dbt_nodes, selected_unique_ids, asset_resource_types=["model"])\n\n    (\n        asset_deps,\n        asset_ins,\n        asset_outs,\n        group_names_by_key,\n        freshness_policies_by_key,\n        auto_materialize_policies_by_key,\n        check_specs_by_output_name,\n        fqns_by_output_name,\n        _,\n    ) = get_asset_deps(\n        dbt_nodes=dbt_nodes,\n        deps=deps,\n        io_manager_key=io_manager_key,\n        manifest=manifest_json,\n        dagster_dbt_translator=dagster_dbt_translator,\n    )\n\n    # prevent op name collisions between multiple dbt multi-assets\n    if not op_name:\n        op_name = f"run_dbt_{project_id}"\n        if select != "fqn:*" or exclude:\n            op_name += "_" + hashlib.md5(select.encode() + exclude.encode()).hexdigest()[-5:]\n\n    check_outs_by_output_name: Mapping[str, Out] = {}\n    if check_specs_by_output_name:\n        check_outs_by_output_name = {\n            output_name: Out(dagster_type=None, is_required=False)\n            for output_name in check_specs_by_output_name.keys()\n        }\n\n    dbt_op = _get_dbt_op(\n        op_name=op_name,\n        ins=dict(asset_ins.values()),\n        outs={\n            **dict(asset_outs.values()),\n            **check_outs_by_output_name,\n        },\n        select=select,\n        exclude=exclude,\n        use_build_command=use_build_command,\n        fqns_by_output_name=fqns_by_output_name,\n        dbt_resource_key=dbt_resource_key,\n        node_info_to_asset_key=dagster_dbt_translator.get_asset_key,\n        partition_key_to_vars_fn=partition_key_to_vars_fn,\n        runtime_metadata_fn=runtime_metadata_fn,\n        manifest_json=manifest_json,\n    )\n\n    return AssetsDefinition(\n        keys_by_input_name={\n            input_name: asset_key for asset_key, (input_name, _) in 
asset_ins.items()\n        },\n        keys_by_output_name={\n            output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n        },\n        node_def=dbt_op,\n        can_subset=True,\n        asset_deps=asset_deps,\n        group_names_by_key=group_names_by_key,\n        freshness_policies_by_key=freshness_policies_by_key,\n        auto_materialize_policies_by_key=auto_materialize_policies_by_key,\n        check_specs_by_output_name=check_specs_by_output_name,\n        partitions_def=partitions_def,\n    )\n\n\n
[docs]def load_assets_from_dbt_project(\n project_dir: str,\n profiles_dir: Optional[str] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n io_manager_key: Optional[str] = None,\n target_dir: Optional[str] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n op_name: Optional[str] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models from a dbt project into Dagster assets.\n\n Creates one Dagster asset for each dbt model. 
All assets will be re-materialized using a single\n `dbt run` or `dbt build` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n project_dir (Optional[str]): The directory containing the dbt project to load.\n profiles_dir (Optional[str]): The profiles directory to use for loading the DBT project.\n Defaults to a directory called "config" inside the project_dir.\n target_dir (Optional[str]): The target directory where dbt will place compiled artifacts.\n Defaults to "target" underneath the project_dir.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". 
Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n manifest_json (Optional[Mapping[str, Any]]): [Deprecated] Use the manifest argument instead.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model. Deprecated: instead,\n provide a custom DagsterDbtTranslator that overrides node_info_to_asset_key.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). 
Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. 
This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n project_dir = check.str_param(project_dir, "project_dir")\n profiles_dir = check.opt_str_param(\n profiles_dir, "profiles_dir", os.path.join(project_dir, "config")\n )\n target_dir = check.opt_str_param(target_dir, "target_dir", os.path.join(project_dir, "target"))\n select = check.opt_str_param(select, "select", "fqn:*")\n exclude = check.opt_str_param(exclude, "exclude", "")\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=None,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n manifest, cli_output = _load_manifest_for_project(\n project_dir, profiles_dir, target_dir, select, exclude\n )\n selected_unique_ids: Set[str] = set(\n filter(None, (line.get("unique_id") for line in cli_output.logs))\n )\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n exclude=exclude,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n 
dagster_dbt_translator=dagster_dbt_translator,\n op_name=op_name,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n selected_unique_ids=selected_unique_ids,\n node_info_to_asset_key=node_info_to_asset_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n )
\n\n\n
[docs]@deprecated_param(\n param="manifest_json", breaking_version="0.21", additional_warn_text="Use manifest instead"\n)\n@deprecated_param(\n param="selected_unique_ids",\n breaking_version="0.21",\n additional_warn_text="Use the select parameter instead.",\n)\n@deprecated_param(\n param="dbt_resource_key",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize your resource key."\n ),\n)\n@deprecated_param(\n param="use_build_command",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize the underlying dbt commands."\n ),\n)\n@deprecated_param(\n param="partitions_def",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="partition_key_to_vars_fn",\n breaking_version="0.21",\n additional_warn_text="Use the `@dbt_assets` decorator to define partitioned dbt assets.",\n)\n@deprecated_param(\n param="runtime_metadata_fn",\n breaking_version="0.21",\n additional_warn_text=(\n "Use the `@dbt_assets` decorator if you need to customize runtime metadata."\n ),\n)\ndef load_assets_from_dbt_manifest(\n manifest: Optional[Union[Path, Mapping[str, Any]]] = None,\n *,\n select: Optional[str] = None,\n exclude: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n # All arguments below are deprecated\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n selected_unique_ids: Optional[AbstractSet[str]] = None,\n display_raw_sql: Optional[bool] = None,\n dbt_resource_key: str = "dbt",\n op_name: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n use_build_command: bool = True,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, 
Any]]] = None,\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ] = None,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ] = default_metadata_from_dbt_resource_props,\n) -> Sequence[AssetsDefinition]:\n """Loads a set of dbt models, described in a manifest.json, into Dagster assets.\n\n Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\n `dbt run` command.\n\n When searching for more flexibility in defining the computations that materialize your\n dbt assets, we recommend that you use :py:class:`~dagster_dbt.dbt_assets`.\n\n Args:\n manifest (Optional[Mapping[str, Any]]): The contents of a DBT manifest.json, which contains\n a set of models to load into assets.\n select (Optional[str]): A dbt selection string for the models in a project that you want\n to include. Defaults to `"fqn:*"`.\n exclude (Optional[str]): A dbt selection string for the models in a project that you want\n to exclude. Defaults to "".\n io_manager_key (Optional[str]): The IO manager key that will be set on each of the returned\n assets. When other ops are downstream of the loaded assets, the IOManager specified\n here determines how the inputs to those ops are loaded. Defaults to "io_manager".\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): Allows customizing how to map\n dbt models, seeds, etc. 
to asset keys and asset metadata.\n key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all assets loaded\n from the dbt project. Does not apply to input assets. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=...) instead.\n source_key_prefix (Optional[Union[str, List[str]]]): [Deprecated] A key prefix to apply to all input\n assets for the set of assets loaded from the dbt project. Deprecated: use\n dagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=...) instead.\n op_name (Optional[str]): [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\n Deprecated: use the `@dbt_assets` decorator if you need to customize the op name.\n dbt_resource_key (Optional[str]): [Deprecated] The resource key that the dbt resource will be specified at.\n Defaults to "dbt". Deprecated: use the `@dbt_assets` decorator if you need to customize\n the resource key.\n runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]): [Deprecated]\n A function that will be run after any of the assets are materialized and returns\n metadata entries for the asset, to be displayed in the asset catalog for that run.\n Deprecated: use the @dbt_assets decorator if you need to customize runtime metadata.\n selected_unique_ids (Optional[Set[str]]): [Deprecated] The set of dbt unique_ids that you want to load\n as assets. Deprecated: use the select argument instead.\n node_info_to_asset_key (Mapping[str, Any] -> AssetKey): [Deprecated] A function that takes a dictionary\n of dbt node info and returns the AssetKey that you want to represent that node. By\n default, the asset key will simply be the name of the dbt model.\n use_build_command (bool): Flag indicating if you want to use `dbt build` as the core computation\n for this asset. Defaults to True. 
If set to False, then `dbt run` will be used, and\n seeds and snapshots won't be loaded as assets.\n partitions_def (Optional[PartitionsDefinition]): [Deprecated] Defines the set of partition keys that\n compose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\n dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): [Deprecated] A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. {"run_date": "2022-01-01"}). Deprecated: use the @dbt_assets decorator\n to define partitioned dbt assets.\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): [Deprecated] A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n Deprecated: instead, configure dagster groups on a dbt resource's meta field or assign\n dbt groups.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): [Deprecated] A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`. Deprecated:\n instead, configure auto-materialize policies on a dbt resource's meta field.\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`. 
Deprecated: instead, configure auto-materialize\n policies on a dbt resource's meta field.\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]): [Deprecated]\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n Deprecated: instead, provide a custom DagsterDbtTranslator that overrides\n node_info_to_metadata.\n display_raw_sql (Optional[bool]): [Deprecated] A flag to indicate if the raw sql associated\n with each model should be included in the asset description. For large projects, setting\n this flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\n instead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.\n """\n manifest = normalize_renamed_param(\n manifest,\n "manifest",\n manifest_json,\n "manifest_json",\n )\n manifest = cast(\n Union[Mapping[str, Any], Path], check.inst_param(manifest, "manifest", (Path, dict))\n )\n if isinstance(manifest, Path):\n manifest = cast(Mapping[str, Any], json.loads(manifest.read_bytes()))\n\n _raise_warnings_for_deprecated_args(\n "load_assets_from_dbt_manifest",\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )\n\n return _load_assets_from_dbt_manifest(\n manifest=manifest,\n select=select,\n 
exclude=exclude,\n io_manager_key=io_manager_key,\n dagster_dbt_translator=dagster_dbt_translator,\n key_prefix=key_prefix,\n source_key_prefix=source_key_prefix,\n selected_unique_ids=selected_unique_ids,\n display_raw_sql=display_raw_sql,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n runtime_metadata_fn=runtime_metadata_fn,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n node_info_to_definition_metadata_fn=node_info_to_definition_metadata_fn,\n )
\n\n\ndef _load_assets_from_dbt_manifest(\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n io_manager_key: Optional[str],\n dagster_dbt_translator: Optional[DagsterDbtTranslator],\n key_prefix: Optional[CoercibleToAssetKeyPrefix],\n source_key_prefix: Optional[CoercibleToAssetKeyPrefix],\n selected_unique_ids: Optional[AbstractSet[str]],\n display_raw_sql: Optional[bool],\n dbt_resource_key: str,\n op_name: Optional[str],\n use_build_command: bool,\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n) -> Sequence[AssetsDefinition]:\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n dbt_resource_key = check.str_param(dbt_resource_key, "dbt_resource_key")\n\n dbt_nodes = {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["metrics"],\n **manifest["exposures"],\n }\n\n if selected_unique_ids:\n select = (\n " ".join(".".join(dbt_nodes[uid]["fqn"]) for uid in selected_unique_ids)\n if select is None\n else select\n )\n exclude = "" if exclude is None else exclude\n else:\n select = select if select is not None else "fqn:*"\n exclude = exclude if exclude is not None else ""\n\n selected_unique_ids = select_unique_ids_from_manifest(\n select=select, exclude=exclude, manifest_json=manifest\n 
)\n if len(selected_unique_ids) == 0:\n raise DagsterInvalidSubsetError(f"No dbt models match the selection string '{select}'.")\n\n if dagster_dbt_translator is not None:\n check.invariant(\n node_info_to_asset_key == default_asset_key_fn,\n "Can't specify both dagster_dbt_translator and node_info_to_asset_key",\n )\n check.invariant(\n key_prefix is None,\n "Can't specify both dagster_dbt_translator and key_prefix",\n )\n check.invariant(\n source_key_prefix is None,\n "Can't specify both dagster_dbt_translator and source_key_prefix",\n )\n check.invariant(\n node_info_to_group_fn == default_group_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_group_fn",\n )\n check.invariant(\n display_raw_sql is None,\n "Can't specify both dagster_dbt_translator and display_raw_sql",\n )\n check.invariant(\n node_info_to_definition_metadata_fn is default_metadata_from_dbt_resource_props,\n "Can't specify both dagster_dbt_translator and node_info_to_definition_metadata_fn",\n )\n else:\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props):\n base_key = node_info_to_asset_key(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(source_key_prefix or [])\n else:\n return base_key.with_prefix(key_prefix or [])\n\n @classmethod\n def get_metadata(cls, dbt_resource_props):\n return node_info_to_definition_metadata_fn(dbt_resource_props)\n\n @classmethod\n def get_description(cls, dbt_resource_props):\n return default_description_fn(\n dbt_resource_props,\n display_raw_sql=display_raw_sql if display_raw_sql is not None else True,\n )\n\n @classmethod\n def get_group_name(cls, dbt_resource_props):\n return node_info_to_group_fn(dbt_resource_props)\n\n @classmethod\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n return 
node_info_to_freshness_policy_fn(dbt_resource_props)\n\n @classmethod\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n return node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n dagster_dbt_translator = CustomDagsterDbtTranslator()\n\n dbt_assets_def = _dbt_nodes_to_assets(\n dbt_nodes,\n runtime_metadata_fn=runtime_metadata_fn,\n io_manager_key=io_manager_key,\n select=select,\n exclude=exclude,\n selected_unique_ids=selected_unique_ids,\n dbt_resource_key=dbt_resource_key,\n op_name=op_name,\n project_id=manifest["metadata"]["project_id"][:5],\n use_build_command=use_build_command,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n dagster_dbt_translator=dagster_dbt_translator,\n manifest_json=manifest,\n )\n\n return [dbt_assets_def]\n\n\ndef _raise_warnings_for_deprecated_args(\n public_fn_name: str,\n selected_unique_ids: Optional[AbstractSet[str]],\n dbt_resource_key: Optional[str],\n use_build_command: Optional[bool],\n partitions_def: Optional[PartitionsDefinition],\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]],\n runtime_metadata_fn: Optional[\n Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]\n ],\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ],\n node_info_to_definition_metadata_fn: Callable[\n [Mapping[str, Any]], Mapping[str, MetadataUserInput]\n ],\n):\n if node_info_to_asset_key != default_asset_key_fn:\n deprecation_warning(\n f"The node_info_to_asset_key_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_asset_key.",\n stacklevel=4,\n 
)\n\n if node_info_to_group_fn != default_group_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_group_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure dagster groups on a dbt resource's meta field or assign dbt"\n " groups or provide a custom DagsterDbtTranslator that overrides get_group_name.",\n stacklevel=4,\n )\n\n if node_info_to_auto_materialize_policy_fn != default_auto_materialize_policy_fn:\n deprecation_warning(\n f"The node_info_to_auto_materialize_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster auto-materialize policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_freshness_policy_fn != default_freshness_policy_fn:\n deprecation_warning(\n f"The node_info_to_freshness_policy_fn arg of {public_fn_name}",\n "0.21",\n "Instead, configure Dagster freshness policies on a dbt resource's meta field.",\n stacklevel=4,\n )\n\n if node_info_to_definition_metadata_fn != default_metadata_from_dbt_resource_props:\n deprecation_warning(\n f"The node_info_to_definition_metadata_fn arg of {public_fn_name}",\n "0.21",\n "Instead, provide a custom DagsterDbtTranslator that overrides get_metadata.",\n stacklevel=4,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_defs"}, "asset_utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.asset_utils

\nimport hashlib\nimport textwrap\nfrom typing import (\n    TYPE_CHECKING,\n    AbstractSet,\n    Any,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    cast,\n)\n\nfrom dagster import (\n    AssetCheckSpec,\n    AssetKey,\n    AssetsDefinition,\n    AssetSelection,\n    AutoMaterializePolicy,\n    DagsterInvariantViolationError,\n    FreshnessPolicy,\n    In,\n    MetadataValue,\n    Nothing,\n    Out,\n    RunConfig,\n    ScheduleDefinition,\n    TableColumn,\n    TableSchema,\n    _check as check,\n    define_asset_job,\n)\nfrom dagster._core.definitions.decorators.asset_decorator import (\n    _validate_and_assign_output_names_to_check_specs,\n)\nfrom dagster._utils.merger import merge_dicts\nfrom dagster._utils.warnings import deprecation_warning\n\nfrom .utils import input_name_fn, output_name_fn\n\nif TYPE_CHECKING:\n    from .dagster_dbt_translator import DagsterDbtTranslator, DbtManifestWrapper\n\nMANIFEST_METADATA_KEY = "dagster_dbt/manifest"\nDAGSTER_DBT_TRANSLATOR_METADATA_KEY = "dagster_dbt/dagster_dbt_translator"\n\n\n
[docs]def get_asset_key_for_model(dbt_assets: Sequence[AssetsDefinition], model_name: str) -> AssetKey:\n """Return the corresponding Dagster asset key for a dbt model.\n\n Args:\n dbt_assets (AssetsDefinition): An AssetsDefinition object produced by\n load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets.\n model_name (str): The name of the dbt model.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_model\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n\n @asset(deps={get_asset_key_for_model([all_dbt_assets], "customers")})\n def cleaned_customers():\n ...\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(model_name, "model_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_models = [\n value\n for value in manifest["nodes"].values()\n if value["name"] == model_name and value["resource_type"] == "model"\n ]\n\n if len(matching_models) == 0:\n raise KeyError(f"Could not find a dbt model with name: {model_name}")\n\n return dagster_dbt_translator.get_asset_key(next(iter(matching_models)))
\n\n\n
[docs]def get_asset_keys_by_output_name_for_source(\n dbt_assets: Sequence[AssetsDefinition], source_name: str\n) -> Mapping[str, AssetKey]:\n """Returns the corresponding Dagster asset keys for all tables in a dbt source.\n\n This is a convenience method that makes it easy to define a multi-asset that generates\n all the tables for a given dbt source.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Returns:\n Mapping[str, AssetKey]: A mapping of the table name to corresponding Dagster asset key\n for all tables in the given dbt source.\n\n Examples:\n .. code-block:: python\n\n from dagster import AssetOut, multi_asset\n from dagster_dbt import dbt_assets, get_asset_keys_by_output_name_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @multi_asset(\n outs={\n name: AssetOut(key=asset_key)\n for name, asset_key in get_asset_keys_by_output_name_for_source(\n [all_dbt_assets], "raw_data"\n ).items()\n },\n )\n def upstream_python_asset():\n ...\n\n """\n check.sequence_param(dbt_assets, "dbt_assets", of_type=AssetsDefinition)\n check.str_param(source_name, "source_name")\n\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n\n matching_nodes = [\n value for value in manifest["sources"].values() if value["source_name"] == source_name\n ]\n\n if len(matching_nodes) == 0:\n raise KeyError(f"Could not find a dbt source with name: {source_name}")\n\n return {\n output_name_fn(value): dagster_dbt_translator.get_asset_key(value)\n for value in matching_nodes\n }
\n\n\n
[docs]def get_asset_key_for_source(dbt_assets: Sequence[AssetsDefinition], source_name: str) -> AssetKey:\n """Returns the corresponding Dagster asset key for a dbt source with a singular table.\n\n Args:\n source_name (str): The name of the dbt source.\n\n Raises:\n DagsterInvalidInvocationError: If the source has more than one table.\n\n Returns:\n AssetKey: The corresponding Dagster asset key.\n\n Examples:\n .. code-block:: python\n\n from dagster import asset\n from dagster_dbt import dbt_assets, get_asset_key_for_source\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n @asset(key=get_asset_key_for_source([all_dbt_assets], "my_source"))\n def upstream_python_asset():\n ...\n """\n asset_keys_by_output_name = get_asset_keys_by_output_name_for_source(dbt_assets, source_name)\n\n if len(asset_keys_by_output_name) > 1:\n raise KeyError(\n f"Source {source_name} has more than one table:"\n f" {asset_keys_by_output_name.values()}. Use"\n " `get_asset_keys_by_output_name_for_source` instead to get all tables for a"\n " source."\n )\n\n return next(iter(asset_keys_by_output_name.values()))
\n\n\n
[docs]def build_dbt_asset_selection(\n dbt_assets: Sequence[AssetsDefinition],\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n) -> AssetSelection:\n """Build an asset selection for a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Returns:\n AssetSelection: An asset selection for the selected dbt nodes.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import dbt_assets, build_dbt_asset_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n # Select the dbt assets that have the tag "foo".\n foo_selection = build_dbt_asset_selection([dbt_assets], dbt_select="tag:foo")\n\n # Select the dbt assets that have the tag "foo" and all Dagster assets downstream\n # of them (dbt-related or otherwise)\n foo_and_downstream_selection = foo_selection.downstream()\n\n """\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(dbt_assets)\n from .dbt_manifest_asset_selection import DbtManifestAssetSelection\n\n return DbtManifestAssetSelection(\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n select=dbt_select,\n exclude=dbt_exclude,\n )
\n\n\n
[docs]def build_schedule_from_dbt_selection(\n dbt_assets: Sequence[AssetsDefinition],\n job_name: str,\n cron_schedule: str,\n dbt_select: str = "fqn:*",\n dbt_exclude: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n config: Optional[RunConfig] = None,\n execution_timezone: Optional[str] = None,\n) -> ScheduleDefinition:\n """Build a schedule to materialize a specified set of dbt resources from a dbt selection string.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\n more information.\n\n Args:\n job_name (str): The name of the job to materialize the dbt resources.\n cron_schedule (str): The cron schedule to define the schedule.\n dbt_select (str): A dbt selection string to specify a set of dbt resources.\n dbt_exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n tags (Optional[Mapping[str, str]]): A dictionary of tags (string key-value pairs) to attach\n to the scheduled runs.\n config (Optional[RunConfig]): The config that parameterizes the execution of this schedule.\n execution_timezone (Optional[str]): Timezone in which the schedule should run.\n Supported strings for timezones are the ones provided by the\n `IANA time zone database <https://www.iana.org/time-zones>` - e.g. "America/Los_Angeles".\n\n Returns:\n ScheduleDefinition: A definition to materialize the selected dbt resources on a cron schedule.\n\n Examples:\n .. 
code-block:: python\n\n from dagster_dbt import dbt_assets, build_schedule_from_dbt_selection\n\n @dbt_assets(manifest=...)\n def all_dbt_assets():\n ...\n\n daily_dbt_assets_schedule = build_schedule_from_dbt_selection(\n [all_dbt_assets],\n job_name="all_dbt_assets",\n cron_schedule="0 0 * * *",\n dbt_select="fqn:*",\n )\n """\n return ScheduleDefinition(\n cron_schedule=cron_schedule,\n job=define_asset_job(\n name=job_name,\n selection=build_dbt_asset_selection(\n dbt_assets,\n dbt_select=dbt_select,\n dbt_exclude=dbt_exclude,\n ),\n config=config,\n tags=tags,\n ),\n execution_timezone=execution_timezone,\n )
\n\n\ndef get_manifest_and_translator_from_dbt_assets(\n dbt_assets: Sequence[AssetsDefinition],\n) -> Tuple[Mapping[str, Any], "DagsterDbtTranslator"]:\n check.invariant(len(dbt_assets) == 1, "Exactly one dbt AssetsDefinition is required")\n dbt_assets_def = dbt_assets[0]\n metadata_by_key = dbt_assets_def.metadata_by_key or {}\n first_asset_key = next(iter(dbt_assets_def.keys))\n first_metadata = metadata_by_key.get(first_asset_key, {})\n manifest_wrapper: Optional["DbtManifestWrapper"] = first_metadata.get(MANIFEST_METADATA_KEY)\n if manifest_wrapper is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt manifest metadata on asset {first_asset_key.to_user_string()},"\n " but did not. Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n dagster_dbt_translator = first_metadata.get(DAGSTER_DBT_TRANSLATOR_METADATA_KEY)\n if dagster_dbt_translator is None:\n raise DagsterInvariantViolationError(\n f"Expected to find dbt translator metadata on asset {first_asset_key.to_user_string()},"\n " but did not. 
Did you pass in assets that weren't generated by"\n " load_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets?"\n )\n\n return manifest_wrapper.manifest, dagster_dbt_translator\n\n\n###################\n# DEFAULT FUNCTIONS\n###################\n\n\ndef default_asset_key_fn(dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """Get the asset key for a dbt node.\n\n By default, if the dbt node has a Dagster asset key configured in its metadata, then that is\n parsed and used.\n\n Otherwise:\n dbt sources: a dbt source's key is the union of its source name and its table name\n dbt models: a dbt model's key is the union of its model name and any schema configured on\n the model itself.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n asset_key_config = dagster_metadata.get("asset_key", [])\n if asset_key_config:\n return AssetKey(asset_key_config)\n\n if dbt_resource_props["resource_type"] == "source":\n components = [dbt_resource_props["source_name"], dbt_resource_props["name"]]\n else:\n configured_schema = dbt_resource_props["config"].get("schema")\n if configured_schema is not None:\n components = [configured_schema, dbt_resource_props["name"]]\n else:\n components = [dbt_resource_props["name"]]\n\n return AssetKey(components)\n\n\n
[docs]def default_metadata_from_dbt_resource_props(\n dbt_resource_props: Mapping[str, Any]\n) -> Mapping[str, Any]:\n metadata: Dict[str, Any] = {}\n columns = dbt_resource_props.get("columns", {})\n if len(columns) > 0:\n metadata["table_schema"] = MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(\n name=column_name,\n type=column_info.get("data_type") or "?",\n description=column_info.get("description"),\n )\n for column_name, column_info in columns.items()\n ]\n )\n )\n return metadata
\n\n\n
[docs]def default_group_from_dbt_resource_props(dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """Get the group name for a dbt node.\n\n If a Dagster group is configured in the metadata for the node, use that.\n\n Otherwise, if a dbt group is configured for the node, use that.\n """\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n\n dagster_group = dagster_metadata.get("group")\n if dagster_group:\n return dagster_group\n\n dbt_group = dbt_resource_props.get("config", {}).get("group")\n if dbt_group:\n return dbt_group\n\n return None
\n\n\n
[docs]def group_from_dbt_resource_props_fallback_to_directory(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[str]:\n """Get the group name for a dbt node.\n\n Has the same behavior as the default_group_from_dbt_resource_props, except for that, if no group can be determined\n from config or metadata, falls back to using the subdirectory of the models directory that the\n source file is in.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import group_from_dbt_resource_props_fallback_to_directory\n\n dbt_assets = load_assets_from_dbt_manifest(\n manifest=manifest,\n node_info_to_group_fn=group_from_dbt_resource_props_fallback_to_directory,\n )\n """\n group_name = default_group_from_dbt_resource_props(dbt_resource_props)\n if group_name is not None:\n return group_name\n\n fqn = dbt_resource_props.get("fqn", [])\n # the first component is the package name, and the last component is the model name\n if len(fqn) < 3:\n return None\n return fqn[1]
\n\n\ndef default_freshness_policy_fn(dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n freshness_policy_config = dagster_metadata.get("freshness_policy", {})\n\n freshness_policy = _legacy_freshness_policy_fn(freshness_policy_config)\n if freshness_policy:\n return freshness_policy\n\n legacy_freshness_policy_config = dbt_resource_props["config"].get(\n "dagster_freshness_policy", {}\n )\n legacy_freshness_policy = _legacy_freshness_policy_fn(legacy_freshness_policy_config)\n\n if legacy_freshness_policy:\n deprecation_warning(\n "dagster_freshness_policy",\n "0.21.0",\n "Instead, configure a Dagster freshness policy on a dbt model using"\n " +meta.dagster.freshness_policy.",\n )\n\n return legacy_freshness_policy\n\n\ndef _legacy_freshness_policy_fn(\n freshness_policy_config: Mapping[str, Any]\n) -> Optional[FreshnessPolicy]:\n if freshness_policy_config:\n return FreshnessPolicy(\n maximum_lag_minutes=float(freshness_policy_config["maximum_lag_minutes"]),\n cron_schedule=freshness_policy_config.get("cron_schedule"),\n cron_schedule_timezone=freshness_policy_config.get("cron_schedule_timezone"),\n )\n return None\n\n\ndef default_auto_materialize_policy_fn(\n dbt_resource_props: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n dagster_metadata = dbt_resource_props.get("meta", {}).get("dagster", {})\n auto_materialize_policy_config = dagster_metadata.get("auto_materialize_policy", {})\n\n auto_materialize_policy = _auto_materialize_policy_fn(auto_materialize_policy_config)\n if auto_materialize_policy:\n return auto_materialize_policy\n\n legacy_auto_materialize_policy_config = dbt_resource_props["config"].get(\n "dagster_auto_materialize_policy", {}\n )\n legacy_auto_materialize_policy = _auto_materialize_policy_fn(\n legacy_auto_materialize_policy_config\n )\n\n if legacy_auto_materialize_policy:\n deprecation_warning(\n "dagster_auto_materialize_policy",\n 
"0.21.0",\n "Instead, configure a Dagster auto-materialize policy on a dbt model using"\n " +meta.dagster.auto_materialize_policy.",\n )\n\n return legacy_auto_materialize_policy\n\n\ndef _auto_materialize_policy_fn(\n auto_materialize_policy_config: Mapping[str, Any]\n) -> Optional[AutoMaterializePolicy]:\n if auto_materialize_policy_config.get("type") == "eager":\n return AutoMaterializePolicy.eager()\n elif auto_materialize_policy_config.get("type") == "lazy":\n return AutoMaterializePolicy.lazy()\n return None\n\n\ndef default_description_fn(dbt_resource_props: Mapping[str, Any], display_raw_sql: bool = True):\n code_block = textwrap.indent(\n dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", ""), " "\n )\n description_sections = [\n dbt_resource_props["description"]\n or f"dbt {dbt_resource_props['resource_type']} {dbt_resource_props['name']}",\n ]\n if display_raw_sql:\n description_sections.append(f"#### Raw SQL:\\n```\\n{code_block}\\n```")\n return "\\n\\n".join(filter(None, description_sections))\n\n\ndef is_asset_check_from_dbt_resource_props(dbt_resource_props: Mapping[str, Any]) -> bool:\n return dbt_resource_props["meta"].get("dagster", {}).get("asset_check", False)\n\n\ndef is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id: str, dbt_resource_props: Mapping[str, Any]\n) -> bool:\n attached_node_unique_id = dbt_resource_props.get("attached_node")\n is_generic_test = bool(attached_node_unique_id)\n\n return is_generic_test and attached_node_unique_id == unique_id\n\n\ndef default_asset_check_fn(\n asset_key: AssetKey, unique_id: str, dbt_resource_props: Mapping[str, Any]\n) -> Optional[AssetCheckSpec]:\n is_asset_check = is_asset_check_from_dbt_resource_props(dbt_resource_props)\n is_generic_test_on_attached_node = is_generic_test_on_attached_node_from_dbt_resource_props(\n unique_id, dbt_resource_props\n )\n\n if not all([is_asset_check, is_generic_test_on_attached_node]):\n return None\n\n return 
AssetCheckSpec(\n name=dbt_resource_props["name"],\n asset=asset_key,\n description=dbt_resource_props["description"],\n )\n\n\ndef default_code_version_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n return hashlib.sha1(\n (dbt_resource_props.get("raw_sql") or dbt_resource_props.get("raw_code", "")).encode(\n "utf-8"\n )\n ).hexdigest()\n\n\n###################\n# DEPENDENCIES\n###################\n\n\ndef is_non_asset_node(dbt_resource_props: Mapping[str, Any]):\n # some nodes exist inside the dbt graph but are not assets\n resource_type = dbt_resource_props["resource_type"]\n if resource_type == "metric":\n return True\n if (\n resource_type == "model"\n and dbt_resource_props.get("config", {}).get("materialized") == "ephemeral"\n ):\n return True\n return False\n\n\ndef get_deps(\n dbt_nodes: Mapping[str, Any],\n selected_unique_ids: AbstractSet[str],\n asset_resource_types: List[str],\n) -> Mapping[str, FrozenSet[str]]:\n def _valid_parent_node(dbt_resource_props):\n # sources are valid parents, but not assets\n return dbt_resource_props["resource_type"] in asset_resource_types + ["source"]\n\n asset_deps: Dict[str, Set[str]] = {}\n for unique_id in selected_unique_ids:\n dbt_resource_props = dbt_nodes[unique_id]\n node_resource_type = dbt_resource_props["resource_type"]\n\n # skip non-assets, such as metrics, tests, and ephemeral models\n if is_non_asset_node(dbt_resource_props) or node_resource_type not in asset_resource_types:\n continue\n\n asset_deps[unique_id] = set()\n for parent_unique_id in dbt_resource_props.get("depends_on", {}).get("nodes", []):\n parent_node_info = dbt_nodes[parent_unique_id]\n # for metrics or ephemeral dbt models, BFS to find valid parents\n if is_non_asset_node(parent_node_info):\n visited = set()\n replaced_parent_ids = set()\n # make a copy to avoid mutating the actual dictionary\n queue = list(parent_node_info.get("depends_on", {}).get("nodes", []))\n while queue:\n candidate_parent_id = queue.pop()\n if 
candidate_parent_id in visited:\n continue\n visited.add(candidate_parent_id)\n\n candidate_parent_info = dbt_nodes[candidate_parent_id]\n if is_non_asset_node(candidate_parent_info):\n queue.extend(candidate_parent_info.get("depends_on", {}).get("nodes", []))\n elif _valid_parent_node(candidate_parent_info):\n replaced_parent_ids.add(candidate_parent_id)\n\n asset_deps[unique_id] |= replaced_parent_ids\n # ignore nodes which are not assets / sources\n elif _valid_parent_node(parent_node_info):\n asset_deps[unique_id].add(parent_unique_id)\n\n frozen_asset_deps = {\n unique_id: frozenset(parent_ids) for unique_id, parent_ids in asset_deps.items()\n }\n\n return frozen_asset_deps\n\n\ndef get_asset_deps(\n dbt_nodes,\n deps,\n io_manager_key,\n manifest: Optional[Mapping[str, Any]],\n dagster_dbt_translator: "DagsterDbtTranslator",\n) -> Tuple[\n Dict[AssetKey, Set[AssetKey]],\n Dict[AssetKey, Tuple[str, In]],\n Dict[AssetKey, Tuple[str, Out]],\n Dict[AssetKey, str],\n Dict[AssetKey, FreshnessPolicy],\n Dict[AssetKey, AutoMaterializePolicy],\n Dict[str, AssetCheckSpec],\n Dict[str, List[str]],\n Dict[str, Dict[str, Any]],\n]:\n from .dagster_dbt_translator import DbtManifestWrapper\n\n asset_deps: Dict[AssetKey, Set[AssetKey]] = {}\n asset_ins: Dict[AssetKey, Tuple[str, In]] = {}\n asset_outs: Dict[AssetKey, Tuple[str, Out]] = {}\n\n # These dicts could be refactored as a single dict, mapping from output name to arbitrary\n # metadata that we need to store for reference.\n group_names_by_key: Dict[AssetKey, str] = {}\n freshness_policies_by_key: Dict[AssetKey, FreshnessPolicy] = {}\n auto_materialize_policies_by_key: Dict[AssetKey, AutoMaterializePolicy] = {}\n check_specs: List[AssetCheckSpec] = []\n fqns_by_output_name: Dict[str, List[str]] = {}\n metadata_by_output_name: Dict[str, Dict[str, Any]] = {}\n\n for unique_id, parent_unique_ids in deps.items():\n dbt_resource_props = dbt_nodes[unique_id]\n\n output_name = output_name_fn(dbt_resource_props)\n 
fqns_by_output_name[output_name] = dbt_resource_props["fqn"]\n\n metadata_by_output_name[output_name] = {\n key: dbt_resource_props[key] for key in ["unique_id", "resource_type"]\n }\n\n asset_key = dagster_dbt_translator.get_asset_key(dbt_resource_props)\n\n asset_deps[asset_key] = set()\n\n metadata = merge_dicts(\n dagster_dbt_translator.get_metadata(dbt_resource_props),\n {\n MANIFEST_METADATA_KEY: DbtManifestWrapper(manifest=manifest) if manifest else None,\n DAGSTER_DBT_TRANSLATOR_METADATA_KEY: dagster_dbt_translator,\n },\n )\n asset_outs[asset_key] = (\n output_name,\n Out(\n io_manager_key=io_manager_key,\n description=dagster_dbt_translator.get_description(dbt_resource_props),\n metadata=metadata,\n is_required=False,\n dagster_type=Nothing,\n code_version=default_code_version_fn(dbt_resource_props),\n ),\n )\n\n group_name = dagster_dbt_translator.get_group_name(dbt_resource_props)\n if group_name is not None:\n group_names_by_key[asset_key] = group_name\n\n freshness_policy = dagster_dbt_translator.get_freshness_policy(dbt_resource_props)\n if freshness_policy is not None:\n freshness_policies_by_key[asset_key] = freshness_policy\n\n auto_materialize_policy = dagster_dbt_translator.get_auto_materialize_policy(\n dbt_resource_props\n )\n if auto_materialize_policy is not None:\n auto_materialize_policies_by_key[asset_key] = auto_materialize_policy\n\n test_unique_ids = []\n if manifest:\n test_unique_ids = [\n child_unique_id\n for child_unique_id in manifest["child_map"][unique_id]\n if child_unique_id.startswith("test")\n ]\n\n for test_unique_id in test_unique_ids:\n test_resource_props = manifest["nodes"][test_unique_id]\n check_spec = default_asset_check_fn(asset_key, unique_id, test_resource_props)\n\n if check_spec:\n check_specs.append(check_spec)\n\n for parent_unique_id in parent_unique_ids:\n parent_node_info = dbt_nodes[parent_unique_id]\n parent_asset_key = dagster_dbt_translator.get_asset_key(parent_node_info)\n\n 
asset_deps[asset_key].add(parent_asset_key)\n\n # if this parent is not one of the selected nodes, it's an input\n if parent_unique_id not in deps:\n input_name = input_name_fn(parent_node_info)\n asset_ins[parent_asset_key] = (input_name, In(Nothing))\n\n check_specs_by_output_name = cast(\n Dict[str, AssetCheckSpec],\n _validate_and_assign_output_names_to_check_specs(check_specs, list(asset_outs.keys())),\n )\n\n return (\n asset_deps,\n asset_ins,\n asset_outs,\n group_names_by_key,\n freshness_policies_by_key,\n auto_materialize_policies_by_key,\n check_specs_by_output_name,\n fqns_by_output_name,\n metadata_by_output_name,\n )\n
", "current_page_name": "_modules/dagster_dbt/asset_utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.asset_utils"}, "cloud": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.asset_defs

\nimport json\nimport shlex\nfrom argparse import Namespace\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    FrozenSet,\n    List,\n    Mapping,\n    Optional,\n    Sequence,\n    Set,\n    Tuple,\n    Union,\n    cast,\n)\n\nimport dagster._check as check\nfrom dagster import (\n    AssetExecutionContext,\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    AutoMaterializePolicy,\n    FreshnessPolicy,\n    MetadataValue,\n    PartitionsDefinition,\n    ResourceDefinition,\n    multi_asset,\n    with_resources,\n)\nfrom dagster._annotations import experimental, experimental_param\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_dbt.asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    get_asset_deps,\n    get_deps,\n)\nfrom dagster_dbt.dagster_dbt_translator import DagsterDbtTranslator\n\nfrom ..errors import DagsterDbtCloudJobInvariantViolationError\nfrom ..utils import ASSET_RESOURCE_TYPES, result_to_events\nfrom .resources import DbtCloudClient, DbtCloudClientResource, DbtCloudRunStatus\n\nDAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR = "DBT_DAGSTER_COMPILE_RUN_ID"\n\n\nclass DbtCloudCacheableAssetsDefinition(CacheableAssetsDefinition):\n    def __init__(\n        self,\n        dbt_cloud_resource_def: Union[DbtCloudClientResource, ResourceDefinition],\n        job_id: int,\n        node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey],\n        node_info_to_group_fn: Callable[[Mapping[str, Any]], Optional[str]],\n        node_info_to_freshness_policy_fn: Callable[[Mapping[str, Any]], Optional[FreshnessPolicy]],\n        node_info_to_auto_materialize_policy_fn: 
Callable[\n            [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n        ],\n        partitions_def: Optional[PartitionsDefinition] = None,\n        partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n    ):\n        self._dbt_cloud_resource_def: ResourceDefinition = (\n            dbt_cloud_resource_def.get_resource_definition()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def\n        )\n\n        self._dbt_cloud: DbtCloudClient = (\n            dbt_cloud_resource_def.process_config_and_initialize().get_dbt_client()\n            if isinstance(dbt_cloud_resource_def, DbtCloudClientResource)\n            else dbt_cloud_resource_def(build_init_resource_context())\n        )\n        self._job_id = job_id\n        self._project_id: int\n        self._has_generate_docs: bool\n        self._job_commands: List[str]\n        self._job_materialization_command_step: int\n        self._node_info_to_asset_key = node_info_to_asset_key\n        self._node_info_to_group_fn = node_info_to_group_fn\n        self._node_info_to_freshness_policy_fn = node_info_to_freshness_policy_fn\n        self._node_info_to_auto_materialize_policy_fn = node_info_to_auto_materialize_policy_fn\n        self._partitions_def = partitions_def\n        self._partition_key_to_vars_fn = partition_key_to_vars_fn\n\n        super().__init__(unique_id=f"dbt-cloud-{job_id}")\n\n    def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n        dbt_nodes, dbt_dependencies = self._get_dbt_nodes_and_dependencies()\n        return [self._build_dbt_cloud_assets_cacheable_data(dbt_nodes, dbt_dependencies)]\n\n    def build_definitions(\n        self, data: Sequence[AssetsDefinitionCacheableData]\n    ) -> Sequence[AssetsDefinition]:\n        return with_resources(\n            [\n                self._build_dbt_cloud_assets_from_cacheable_data(assets_definition_metadata)\n               
 for assets_definition_metadata in data\n            ],\n            {"dbt_cloud": self._dbt_cloud_resource_def},\n        )\n\n    @staticmethod\n    def parse_dbt_command(dbt_command: str) -> Namespace:\n        args = shlex.split(dbt_command)[1:]\n        try:\n            from dbt.cli.flags import (\n                Flags,\n                args_to_context,\n            )\n\n            # nasty hack to get dbt to parse the args\n            # dbt >= 1.5.0 requires that profiles-dir is set to an existing directory\n            return Namespace(**vars(Flags(args_to_context(args + ["--profiles-dir", "."]))))\n        except ImportError:\n            # dbt < 1.5.0 compat\n            from dbt.main import parse_args  # type: ignore\n\n            return parse_args(args=args)\n\n    @staticmethod\n    def get_job_materialization_command_step(execute_steps: List[str]) -> int:\n        materialization_command_filter = [\n            DbtCloudCacheableAssetsDefinition.parse_dbt_command(command).which in ["run", "build"]\n            for command in execute_steps\n        ]\n\n        if sum(materialization_command_filter) != 1:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                "The dbt Cloud job must have a single `dbt run` or `dbt build` in its commands. 
"\n                f"Received commands: {execute_steps}."\n            )\n\n        return materialization_command_filter.index(True)\n\n    @staticmethod\n    def get_compile_filters(parsed_args: Namespace) -> List[str]:\n        dbt_compile_options: List[str] = []\n\n        selected_models = parsed_args.select or []\n        if selected_models:\n            dbt_compile_options.append(f"--select {' '.join(selected_models)}")\n\n        excluded_models = parsed_args.exclude or []\n        if excluded_models:\n            dbt_compile_options.append(f"--exclude {' '.join(excluded_models)}")\n\n        selector = getattr(parsed_args, "selector_name", None) or getattr(\n            parsed_args, "selector", None\n        )\n        if selector:\n            dbt_compile_options.append(f"--selector {selector}")\n\n        return dbt_compile_options\n\n    def _get_cached_compile_dbt_cloud_job_run(self, compile_run_id: int) -> Tuple[int, int]:\n        compile_run = self._dbt_cloud.get_run(\n            run_id=compile_run_id, include_related=["trigger", "run_steps"]\n        )\n\n        compile_run_status: str = compile_run["status_humanized"]\n        if compile_run_status != DbtCloudRunStatus.SUCCESS:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The cached dbt Cloud job run `{compile_run_id}` must have a status of"\n                f" `{DbtCloudRunStatus.SUCCESS}`. Received status: `{compile_run_status}. You can"\n                f" view the full status of your dbt Cloud run at {compile_run['href']}. Once it has"\n                " successfully completed, reload your Dagster definitions. 
If your run has failed,"\n                " you must manually refresh the cache using the `dagster-dbt"\n                " cache-compile-references` CLI."\n            )\n\n        compile_run_has_generate_docs = compile_run["trigger"]["generate_docs_override"]\n\n        compile_job_materialization_command_step = len(compile_run["run_steps"])\n        if compile_run_has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_id, compile_job_materialization_command_step\n\n    def _compile_dbt_cloud_job(self, dbt_cloud_job: Mapping[str, Any]) -> Tuple[int, int]:\n        # Retrieve the filters options from the dbt Cloud job's materialization command.\n        #\n        # There are three filters: `--select`, `--exclude`, and `--selector`.\n        materialization_command = self._job_commands[self._job_materialization_command_step]\n        parsed_args = DbtCloudCacheableAssetsDefinition.parse_dbt_command(materialization_command)\n        dbt_compile_options = DbtCloudCacheableAssetsDefinition.get_compile_filters(\n            parsed_args=parsed_args\n        )\n\n        # Add the partition variable as a variable to the dbt Cloud job command.\n        #\n        # If existing variables passed through the dbt Cloud job's command, an error will be\n        # raised. Since these are static variables anyways, they can be moved to the\n        # `dbt_project.yml` without loss of functionality.\n        #\n        # Since we're only doing this to generate the dependency structure, just use an arbitrary\n        # partition key (e.g. 
the last one) to retrieve the partition variable.\n        if parsed_args.vars and parsed_args.vars != "{}":\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{dbt_cloud_job['name']}' ({dbt_cloud_job['id']}) must not have"\n                " variables defined from `--vars` in its `dbt run` or `dbt build` command."\n                " Instead, declare the variables in the `dbt_project.yml` file. Received commands:"\n                f" {self._job_commands}."\n            )\n\n        if self._partitions_def and self._partition_key_to_vars_fn:\n            last_partition_key = self._partitions_def.get_last_partition_key()\n            if last_partition_key is None:\n                check.failed("PartitionsDefinition has no partitions")\n            partition_var = self._partition_key_to_vars_fn(last_partition_key)\n\n            dbt_compile_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n        # We need to retrieve the dependency structure for the assets in the dbt Cloud project.\n        # However, we can't just use the dependency structure from the latest run, because\n        # this historical structure may not be up-to-date with the current state of the project.\n        #\n        # By always doing a compile step, we can always get the latest dependency structure.\n        # This incurs some latency, but at least it doesn't run through the entire materialization\n        # process.\n        dbt_compile_command = f"dbt compile {' '.join(dbt_compile_options)}"\n        compile_run_dbt_output = self._dbt_cloud.run_job_and_poll(\n            job_id=self._job_id,\n            cause="Generating software-defined assets for Dagster.",\n            steps_override=[dbt_compile_command],\n        )\n\n        # Target the compile execution step when retrieving run artifacts, rather than assuming\n        # that the last step is the correct target.\n        #\n        # Here, we ignore the `dbt docs generate` 
step.\n        compile_job_materialization_command_step = len(\n            compile_run_dbt_output.run_details.get("run_steps", [])\n        )\n        if self._has_generate_docs:\n            compile_job_materialization_command_step -= 1\n\n        return compile_run_dbt_output.run_id, compile_job_materialization_command_step\n\n    def _get_dbt_nodes_and_dependencies(\n        self,\n    ) -> Tuple[Mapping[str, Any], Mapping[str, FrozenSet[str]]]:\n        """For a given dbt Cloud job, fetch the latest run's dependency structure of executed nodes."""\n        # Fetch information about the job.\n        job = self._dbt_cloud.get_job(job_id=self._job_id)\n        self._project_id = job["project_id"]\n        self._has_generate_docs = job["generate_docs"]\n\n        # We constraint the kinds of dbt Cloud jobs that we support running.\n        #\n        # A simple constraint is that we only support jobs that run multiple steps,\n        # but it must contain one of either `dbt run` or `dbt build`.\n        #\n        # As a reminder, `dbt deps` is automatically run before the job's configured commands.\n        # And if the settings are enabled, `dbt docs generate` and `dbt source freshness` can\n        # automatically run after the job's configured commands.\n        #\n        # These commands that execute before and after the job's configured commands do not count\n        # towards the single command constraint.\n        self._job_commands = job["execute_steps"]\n        self._job_materialization_command_step = (\n            DbtCloudCacheableAssetsDefinition.get_job_materialization_command_step(\n                execute_steps=self._job_commands\n            )\n        )\n\n        # Determine whether to use a cached compile run. 
This should only be set up if the user is\n        # using a GitHub action along with their dbt project.\n        dbt_cloud_job_env_vars = self._dbt_cloud.get_job_environment_variables(\n            project_id=self._project_id, job_id=self._job_id\n        )\n        compile_run_id = (\n            dbt_cloud_job_env_vars.get(DAGSTER_DBT_COMPILE_RUN_ID_ENV_VAR, {})\n            .get("job", {})\n            .get("value")\n        )\n\n        compile_run_id, compile_job_materialization_command_step = (\n            # If a compile run is cached, then use it.\n            self._get_cached_compile_dbt_cloud_job_run(compile_run_id=int(compile_run_id))\n            if compile_run_id\n            # Otherwise, compile the dbt Cloud project in an ad-hoc manner.\n            else self._compile_dbt_cloud_job(dbt_cloud_job=job)\n        )\n\n        manifest_json = self._dbt_cloud.get_manifest(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n        run_results_json = self._dbt_cloud.get_run_results(\n            run_id=compile_run_id, step=compile_job_materialization_command_step\n        )\n\n        # Filter the manifest to only include the nodes that were executed.\n        dbt_nodes: Dict[str, Any] = {\n            **manifest_json.get("nodes", {}),\n            **manifest_json.get("sources", {}),\n            **manifest_json.get("metrics", {}),\n        }\n        executed_node_ids: Set[str] = set(\n            result["unique_id"] for result in run_results_json["results"]\n        )\n\n        # If there are no executed nodes, then there are no assets to generate.\n        # Inform the user to inspect their dbt Cloud job's command.\n        if not executed_node_ids:\n            raise DagsterDbtCloudJobInvariantViolationError(\n                f"The dbt Cloud job '{job['name']}' ({job['id']}) does not generate any "\n                "software-defined assets. 
Ensure that your dbt project has nodes to execute, "\n                "and that your dbt Cloud job's materialization command has the proper filter "\n                f"options applied. Received commands: {self._job_commands}."\n            )\n\n        # Generate the dependency structure for the executed nodes.\n        dbt_dependencies = get_deps(\n            dbt_nodes=dbt_nodes,\n            selected_unique_ids=executed_node_ids,\n            asset_resource_types=ASSET_RESOURCE_TYPES,\n        )\n\n        return dbt_nodes, dbt_dependencies\n\n    def _build_dbt_cloud_assets_cacheable_data(\n        self, dbt_nodes: Mapping[str, Any], dbt_dependencies: Mapping[str, FrozenSet[str]]\n    ) -> AssetsDefinitionCacheableData:\n        """Given all of the nodes and dependencies for a dbt Cloud job, build the cacheable\n        representation that generate the asset definition for the job.\n        """\n\n        class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n            @classmethod\n            def get_asset_key(cls, dbt_resource_props):\n                return self._node_info_to_asset_key(dbt_resource_props)\n\n            @classmethod\n            def get_description(cls, dbt_resource_props):\n                # We shouldn't display the raw sql. 
Instead, inspect if dbt docs were generated,\n                # and attach metadata to link to the docs.\n                return default_description_fn(dbt_resource_props, display_raw_sql=False)\n\n            @classmethod\n            def get_group_name(cls, dbt_resource_props):\n                return self._node_info_to_group_fn(dbt_resource_props)\n\n            @classmethod\n            def get_freshness_policy(cls, dbt_resource_props):\n                return self._node_info_to_freshness_policy_fn(dbt_resource_props)\n\n            @classmethod\n            def get_auto_materialize_policy(cls, dbt_resource_props):\n                return self._node_info_to_auto_materialize_policy_fn(dbt_resource_props)\n\n        (\n            asset_deps,\n            asset_ins,\n            asset_outs,\n            group_names_by_key,\n            freshness_policies_by_key,\n            auto_materialize_policies_by_key,\n            _,\n            fqns_by_output_name,\n            metadata_by_output_name,\n        ) = get_asset_deps(\n            dbt_nodes=dbt_nodes,\n            deps=dbt_dependencies,\n            # TODO: In the future, allow the IO manager to be specified.\n            io_manager_key=None,\n            dagster_dbt_translator=CustomDagsterDbtTranslator(),\n            manifest=None,\n        )\n\n        return AssetsDefinitionCacheableData(\n            # TODO: In the future, we should allow additional upstream assets to be specified.\n            keys_by_input_name={\n                input_name: asset_key for asset_key, (input_name, _) in asset_ins.items()\n            },\n            keys_by_output_name={\n                output_name: asset_key for asset_key, (output_name, _) in asset_outs.items()\n            },\n            internal_asset_deps={\n                asset_outs[asset_key][0]: asset_deps for asset_key, asset_deps in asset_deps.items()\n            },\n            # We don't rely on a static group name. 
Instead, we map over the dbt metadata to\n            # determine the group name for each asset.\n            group_name=None,\n            metadata_by_output_name={\n                output_name: self._build_dbt_cloud_assets_metadata(dbt_metadata)\n                for output_name, dbt_metadata in metadata_by_output_name.items()\n            },\n            # TODO: In the future, we should allow the key prefix to be specified.\n            key_prefix=None,\n            can_subset=True,\n            extra_metadata={\n                "job_id": self._job_id,\n                "job_commands": self._job_commands,\n                "job_materialization_command_step": self._job_materialization_command_step,\n                "group_names_by_output_name": {\n                    asset_outs[asset_key][0]: group_name\n                    for asset_key, group_name in group_names_by_key.items()\n                },\n                "fqns_by_output_name": fqns_by_output_name,\n            },\n            freshness_policies_by_output_name={\n                asset_outs[asset_key][0]: freshness_policy\n                for asset_key, freshness_policy in freshness_policies_by_key.items()\n            },\n            auto_materialize_policies_by_output_name={\n                asset_outs[asset_key][0]: auto_materialize_policy\n                for asset_key, auto_materialize_policy in auto_materialize_policies_by_key.items()\n            },\n        )\n\n    def _build_dbt_cloud_assets_metadata(self, dbt_metadata: Dict[str, Any]) -> MetadataUserInput:\n        metadata = {\n            "dbt Cloud Job": MetadataValue.url(\n                self._dbt_cloud.build_url_for_job(\n                    project_id=self._project_id,\n                    job_id=self._job_id,\n                )\n            ),\n        }\n\n        if self._has_generate_docs:\n            metadata["dbt Cloud Documentation"] = MetadataValue.url(\n                self._dbt_cloud.build_url_for_cloud_docs(\n                   
 job_id=self._job_id,\n                    resource_type=dbt_metadata["resource_type"],\n                    unique_id=dbt_metadata["unique_id"],\n                )\n            )\n\n        return metadata\n\n    def _build_dbt_cloud_assets_from_cacheable_data(\n        self, assets_definition_cacheable_data: AssetsDefinitionCacheableData\n    ) -> AssetsDefinition:\n        metadata = cast(Mapping[str, Any], assets_definition_cacheable_data.extra_metadata)\n        job_id = cast(int, metadata["job_id"])\n        job_commands = cast(List[str], list(metadata["job_commands"]))\n        job_materialization_command_step = cast(int, metadata["job_materialization_command_step"])\n        group_names_by_output_name = cast(Mapping[str, str], metadata["group_names_by_output_name"])\n        fqns_by_output_name = cast(Mapping[str, List[str]], metadata["fqns_by_output_name"])\n\n        @multi_asset(\n            name=f"dbt_cloud_job_{job_id}",\n            deps=list((assets_definition_cacheable_data.keys_by_input_name or {}).values()),\n            outs={\n                output_name: AssetOut(\n                    key=asset_key,\n                    group_name=group_names_by_output_name.get(output_name),\n                    freshness_policy=(\n                        assets_definition_cacheable_data.freshness_policies_by_output_name or {}\n                    ).get(\n                        output_name,\n                    ),\n                    auto_materialize_policy=(\n                        assets_definition_cacheable_data.auto_materialize_policies_by_output_name\n                        or {}\n                    ).get(\n                        output_name,\n                    ),\n                    metadata=(assets_definition_cacheable_data.metadata_by_output_name or {}).get(\n                        output_name\n                    ),\n                    is_required=False,\n                )\n                for output_name, asset_key in (\n                   
 assets_definition_cacheable_data.keys_by_output_name or {}\n                ).items()\n            },\n            internal_asset_deps={\n                output_name: set(asset_deps)\n                for output_name, asset_deps in (\n                    assets_definition_cacheable_data.internal_asset_deps or {}\n                ).items()\n            },\n            partitions_def=self._partitions_def,\n            can_subset=assets_definition_cacheable_data.can_subset,\n            required_resource_keys={"dbt_cloud"},\n            compute_kind="dbt",\n        )\n        def _assets(context: AssetExecutionContext):\n            dbt_cloud = cast(DbtCloudClient, context.resources.dbt_cloud)\n\n            # Add the partition variable as a variable to the dbt Cloud job command.\n            dbt_options: List[str] = []\n            if context.has_partition_key and self._partition_key_to_vars_fn:\n                partition_var = self._partition_key_to_vars_fn(context.partition_key)\n\n                dbt_options.append(f"--vars '{json.dumps(partition_var)}'")\n\n            # Prepare the materialization step to be overriden with the selection filter\n            materialization_command = job_commands[job_materialization_command_step]\n\n            # Map the selected outputs to dbt models that should be materialized.\n            #\n            # HACK: This selection filter works even if an existing `--select` is specified in the\n            # dbt Cloud job. 
We take advantage of the fact that the last `--select` will be used.\n            #\n            # This is not ideal, as the triggered run for the dbt Cloud job will still have both\n            # `--select` options when displayed in the UI, but parsing the command line argument\n            # to remove the initial select using argparse.\n            if len(context.selected_output_names) != len(\n                assets_definition_cacheable_data.keys_by_output_name or {}\n            ):\n                selected_models = [\n                    ".".join(fqns_by_output_name[output_name])\n                    for output_name in context.selected_output_names\n                ]\n\n                dbt_options.append(f"--select {' '.join(sorted(selected_models))}")\n\n                # If the `--selector` option is used, we need to remove it from the command, since\n                # it disables other selection options from being functional.\n                #\n                # See https://docs.getdbt.com/reference/node-selection/syntax for details.\n                split_materialization_command = shlex.split(materialization_command)\n                if "--selector" in split_materialization_command:\n                    idx = split_materialization_command.index("--selector")\n\n                    materialization_command = " ".join(\n                        split_materialization_command[:idx]\n                        + split_materialization_command[idx + 2 :]\n                    )\n\n            job_commands[job_materialization_command_step] = (\n                f"{materialization_command} {' '.join(dbt_options)}".strip()\n            )\n\n            # Run the dbt Cloud job to rematerialize the assets.\n            dbt_cloud_output = dbt_cloud.run_job_and_poll(\n                job_id=job_id,\n                cause=f"Materializing software-defined assets in Dagster run {context.run_id[:8]}",\n                steps_override=job_commands,\n            )\n\n            # 
Target the materialization step when retrieving run artifacts, rather than assuming\n            # that the last step is the correct target.\n            #\n            # We ignore the commands in front of the materialization command. And again, we ignore\n            # the `dbt docs generate` step.\n            materialization_command_step = len(dbt_cloud_output.run_details.get("run_steps", []))\n            materialization_command_step -= len(job_commands) - job_materialization_command_step - 1\n            if dbt_cloud_output.run_details.get("job", {}).get("generate_docs"):\n                materialization_command_step -= 1\n\n            # TODO: Assume the run completely fails or completely succeeds.\n            # In the future, we can relax this assumption.\n            manifest_json = dbt_cloud.get_manifest(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n            run_results_json = self._dbt_cloud.get_run_results(\n                run_id=dbt_cloud_output.run_id, step=materialization_command_step\n            )\n\n            for result in run_results_json.get("results", []):\n                yield from result_to_events(\n                    result=result,\n                    docs_url=dbt_cloud_output.docs_url,\n                    node_info_to_asset_key=self._node_info_to_asset_key,\n                    manifest_json=manifest_json,\n                    # TODO: In the future, allow arbitrary mappings to Dagster output metadata from\n                    # the dbt metadata.\n                    extra_metadata=None,\n                    generate_asset_outputs=True,\n                )\n\n        return _assets\n\n\n
[docs]@experimental\n@experimental_param(param="partitions_def")\n@experimental_param(param="partition_key_to_vars_fn")\ndef load_assets_from_dbt_cloud_job(\n dbt_cloud: ResourceDefinition,\n job_id: int,\n node_info_to_asset_key: Callable[[Mapping[str, Any]], AssetKey] = default_asset_key_fn,\n node_info_to_group_fn: Callable[\n [Mapping[str, Any]], Optional[str]\n ] = default_group_from_dbt_resource_props,\n node_info_to_freshness_policy_fn: Callable[\n [Mapping[str, Any]], Optional[FreshnessPolicy]\n ] = default_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn: Callable[\n [Mapping[str, Any]], Optional[AutoMaterializePolicy]\n ] = default_auto_materialize_policy_fn,\n partitions_def: Optional[PartitionsDefinition] = None,\n partition_key_to_vars_fn: Optional[Callable[[str], Mapping[str, Any]]] = None,\n) -> CacheableAssetsDefinition:\n """Loads a set of dbt models, managed by a dbt Cloud job, into Dagster assets. In order to\n determine the set of dbt models, the project is compiled to generate the necessary artifacts\n that define the dbt models and their dependencies.\n\n One Dagster asset is created for each dbt model.\n\n Args:\n dbt_cloud (ResourceDefinition): The dbt Cloud resource to use to connect to the dbt Cloud API.\n job_id (int): The ID of the dbt Cloud job to load assets from.\n node_info_to_asset_key: (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\n of dbt metadata and returns the AssetKey that you want to represent a given model or\n source. 
By default: dbt model -> AssetKey([model_name]) and\n dbt source -> AssetKey([source_name, table_name])\n node_info_to_group_fn (Dict[str, Any] -> Optional[str]): A function that takes a\n dictionary of dbt node info and returns the group that this node should be assigned to.\n node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]): A function\n that takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\n should be applied to this node. By default, freshness policies will be created from\n config applied to dbt models, i.e.:\n `dagster_freshness_policy={"maximum_lag_minutes": 60, "cron_schedule": "0 9 * * *"}`\n will result in that model being assigned\n `FreshnessPolicy(maximum_lag_minutes=60, cron_schedule="0 9 * * *")`\n node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]):\n A function that takes a dictionary of dbt node info and optionally returns an AutoMaterializePolicy\n that should be applied to this node. By default, AutoMaterializePolicies will be created from\n config applied to dbt models, i.e.:\n `dagster_auto_materialize_policy={"type": "lazy"}` will result in that model being assigned\n `AutoMaterializePolicy.lazy()`\n node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]):\n A function that takes a dictionary of dbt node info and optionally returns a dictionary\n of metadata to be attached to the corresponding definition. This is added to the default\n metadata assigned to the node, which consists of the node's schema (if present).\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the dbt assets.\n partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]): A function to translate a given\n partition key (e.g. '2022-01-01') to a dictionary of vars to be passed into the dbt\n invocation (e.g. 
{"run_date": "2022-01-01"})\n\n Returns:\n CacheableAssetsDefinition: A definition for the loaded assets.\n\n Examples:\n .. code-block:: python\n\n from dagster import repository\n from dagster_dbt import dbt_cloud_resource, load_assets_from_dbt_cloud_job\n\n DBT_CLOUD_JOB_ID = 1234\n\n dbt_cloud = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_API_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n dbt_cloud_assets = load_assets_from_dbt_cloud_job(\n dbt_cloud=dbt_cloud, job_id=DBT_CLOUD_JOB_ID\n )\n\n\n @repository\n def dbt_cloud_sandbox():\n return [dbt_cloud_assets]\n """\n if partition_key_to_vars_fn:\n check.invariant(\n partitions_def is not None,\n "Cannot supply a `partition_key_to_vars_fn` without a `partitions_def`.",\n )\n\n return DbtCloudCacheableAssetsDefinition(\n dbt_cloud_resource_def=dbt_cloud,\n job_id=job_id,\n node_info_to_asset_key=node_info_to_asset_key,\n node_info_to_group_fn=node_info_to_group_fn,\n node_info_to_freshness_policy_fn=node_info_to_freshness_policy_fn,\n node_info_to_auto_materialize_policy_fn=node_info_to_auto_materialize_policy_fn,\n partitions_def=partitions_def,\n partition_key_to_vars_fn=partition_key_to_vars_fn,\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.ops

\nfrom typing import List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom ..utils import generate_materializations\nfrom .resources import DEFAULT_POLL_INTERVAL\nfrom .types import DbtCloudOutput\n\n\nclass DbtCloudRunOpConfig(Config):\n    job_id: int = Field(\n        description=(\n            "The integer ID of the relevant dbt Cloud job. You can find this value by going to the"\n            " details page of your job in the dbt Cloud UI. It will be the final number in the url,"\n            " e.g.:    "\n            " https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/"\n        )\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes."\n        ),\n    )\n\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n required_resource_keys={"dbt_cloud"},\n ins={"start_after": In(Nothing)},\n out=Out(DbtCloudOutput, description="Parsed output from running the dbt Cloud job."),\n tags={"kind": "dbt_cloud"},\n)\ndef dbt_cloud_run_op(context, config: DbtCloudRunOpConfig):\n """Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\n fails or is otherwise stopped before succeeding, a `dagster.Failure` exception will be raised,\n and this op will fail.\n\n It requires the use of a 'dbt_cloud' resource, which is used to connect to the dbt Cloud API.\n\n **Config Options:**\n\n job_id (int)\n The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\n page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\n ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n poll_interval (float)\n The time (in seconds) that will be waited between successive polls. Defaults to ``10``.\n poll_timeout (float)\n The maximum time (in seconds) that will be waited before this operation is timed out. By\n default, this will never time out.\n yield_materializations (bool)\n If True, materializations corresponding to the results of the dbt operation will be\n yielded when the op executes. Defaults to ``True``.\n asset_key_prefix (List[str])\n If provided and yield_materializations is True, these components will be used to\n prefix the generated asset keys. Defaults to ["dbt"].\n\n **Examples:**\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n )\n run_dbt_nightly_sync = dbt_cloud_run_op.configured(\n {"job_id": 54321}, name="run_dbt_nightly_sync"\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def dbt_cloud():\n run_dbt_nightly_sync()\n\n\n """\n dbt_output = context.resources.dbt_cloud.run_job_and_poll(\n config.job_id, poll_interval=config.poll_interval, poll_timeout=config.poll_timeout\n )\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(\n dbt_output,\n metadata={\n "created_at": dbt_output.run_details["created_at"],\n "started_at": dbt_output.run_details["started_at"],\n "finished_at": dbt_output.run_details["finished_at"],\n "total_duration": dbt_output.run_details["duration"],\n "run_duration": dbt_output.run_details["run_duration"],\n },\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.cloud.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom enum import Enum\nfrom typing import Any, Mapping, Optional, Sequence, cast\nfrom urllib.parse import urlencode, urljoin\n\nimport requests\nfrom dagster import (\n    ConfigurableResource,\n    Failure,\n    IAttachDifferentObjectToOpContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import deep_merge_dicts\nfrom pydantic import Field\nfrom requests.exceptions import RequestException\n\nfrom .types import DbtCloudOutput\n\nDBT_DEFAULT_HOST = "https://cloud.getdbt.com/"\nDBT_API_V2_PATH = "api/v2/accounts/"\nDBT_API_V3_PATH = "api/v3/accounts/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\nclass DbtCloudRunStatus(str, Enum):\n    QUEUED = "Queued"\n    STARTING = "Starting"\n    RUNNING = "Running"\n    SUCCESS = "Success"\n    ERROR = "Error"\n    CANCELLED = "Cancelled"\n\n\n# TODO: This resource should be a wrapper over an existing client for a accessing dbt Cloud,\n# rather than using requests to the API directly.\nclass DbtCloudClient:\n    """This class exposes methods on top of the dbt Cloud REST API v2.\n\n    For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n    response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n    """\n\n    def __init__(\n        self,\n        auth_token: str,\n        account_id: int,\n        disable_schedule_on_trigger: bool = True,\n        request_max_retries: int = 3,\n        request_retry_delay: float = 0.25,\n        dbt_cloud_host: str = DBT_DEFAULT_HOST,\n        log: logging.Logger = get_dagster_logger(),\n        log_requests: bool = False,\n    ):\n        self._auth_token = auth_token\n        self._account_id = account_id\n        
self._disable_schedule_on_trigger = disable_schedule_on_trigger\n\n        self._request_max_retries = request_max_retries\n        self._request_retry_delay = request_retry_delay\n\n        self._dbt_cloud_host = dbt_cloud_host\n        self._log = log\n        self._log_requests = log_requests\n\n    @property\n    def api_v2_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V2_PATH)\n\n    @property\n    def api_v3_base_url(self) -> str:\n        return urljoin(self._dbt_cloud_host, DBT_API_V3_PATH)\n\n    def build_url_for_job(self, project_id: int, job_id: int) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"next/deploy/{self._account_id}/projects/{project_id}/jobs/{job_id}/",\n        )\n\n    def build_url_for_cloud_docs(self, job_id: int, resource_type: str, unique_id: str) -> str:\n        return urljoin(\n            self._dbt_cloud_host,\n            f"/accounts/{self._account_id}/jobs/{job_id}/docs/#!/{resource_type}/{unique_id}",\n        )\n\n    def make_request(\n        self,\n        method: str,\n        endpoint: str,\n        data: Optional[Mapping[str, Any]] = None,\n        params: Optional[Mapping[str, Any]] = None,\n        return_text: bool = False,\n        base_url: Optional[str] = None,\n    ) -> Any:\n        """Creates and sends a request to the desired dbt Cloud API endpoint.\n\n        Args:\n            method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n            endpoint (str): The dbt Cloud API endpoint to send this request to.\n            data (Optional[Mapping[str, Any]]): JSON-formatable data string to be included in the request.\n            params (Optional[Mapping[str, Any]]): Payload to add to query string of the request.\n            return_text (bool): Override default behavior and return unparsed {"text": response.text}\n                blob instead of json.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        headers = {\n            "User-Agent": f"dagster-dbt/{__version__}",\n            "Content-Type": "application/json",\n            "Authorization": f"Bearer {self._auth_token}",\n        }\n        base_url = base_url or self.api_v2_base_url\n        url = urljoin(base_url, endpoint)\n\n        if self._log_requests:\n            self._log.debug(f"Making Request: method={method} url={url} data={data}")\n\n        num_retries = 0\n        while True:\n            try:\n                response = requests.request(\n                    method=method,\n                    url=url,\n                    headers=headers,\n                    data=json.dumps(data),\n                    params=params,\n                )\n                response.raise_for_status()\n                return {"text": response.text} if return_text else response.json()["data"]\n            except RequestException as e:\n                self._log.error("Request to dbt Cloud API failed: %s", e)\n                if num_retries == self._request_max_retries:\n                    break\n                num_retries += 1\n                time.sleep(self._request_retry_delay)\n\n        raise Failure(f"Max retries ({self._request_max_retries}) exceeded with url: {url}.")\n\n    def list_jobs(\n        self, project_id: int, order_by: Optional[str] = "-id"\n    ) -> Sequence[Mapping[str, Any]]:\n        """List all dbt jobs in a dbt Cloud 
project.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n\n        Returns:\n            List[Dict[str, Any]]: Parsed json data from the response to this request\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/jobs",\n            params={"project_id": project_id, "order_by": order_by},\n        )\n\n    def get_job(self, job_id: int) -> Mapping[str, Any]:\n        """Gets details about a given dbt job from the dbt Cloud API.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        return self.make_request("GET", f"{self._account_id}/jobs/{job_id}/")\n\n    def update_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Updates specific properties of a dbt job.\n\n        Documentation on the full set of potential parameters can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be changed.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n\n        Examples:\n        .. code-block:: python\n\n            # disable schedule for job with id=12345\n            my_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})\n        """\n        # API requires you to supply a bunch of values, so we can just use the current state\n        # as the defaults\n        job_data = self.get_job(job_id)\n        return self.make_request(\n            "POST", f"{self._account_id}/jobs/{job_id}/", data=deep_merge_dicts(job_data, kwargs)\n        )\n\n    def run_job(self, job_id: int, **kwargs) -> Mapping[str, Any]:\n        """Initializes a run for a job.\n\n        Overrides for specific properties can be set by passing in values to the kwargs. 
A full list\n        of overridable properties can be found here:\n        https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            kwargs: Passed in as the properties to be overridden.\n\n        Returns:\n            Dict[str, Any]: Parsed json data from the response to this request\n        """\n        self._log.info(f"Initializing run for job with job_id={job_id}")\n        if "cause" not in kwargs:\n            kwargs["cause"] = "Triggered via Dagster"\n        resp = self.make_request("POST", f"{self._account_id}/jobs/{job_id}/run/", data=kwargs)\n\n        has_schedule: bool = resp.get("job", {}).get("triggers", {}).get("schedule", False)\n        if has_schedule and self._disable_schedule_on_trigger:\n            self._log.info("Disabling dbt Cloud job schedule.")\n            self.update_job(job_id, triggers={"schedule": False})\n\n        self._log.info(\n            f"Run initialized with run_id={resp['id']}. View this run in "\n            f"the dbt Cloud UI: {resp['href']}"\n        )\n        return resp\n\n    def get_runs(\n        self,\n        include_related: Optional[Sequence[str]] = None,\n        job_id: Optional[int] = None,\n        order_by: Optional[str] = "-id",\n        offset: int = 0,\n        limit: int = 100,\n    ) -> Sequence[Mapping[str, object]]:\n        """Returns a list of runs from dbt Cloud. This can be optionally filtered to a specific job\n        using the job_definition_id. 
It supports pagination using offset and limit as well and\n        can be configured to load a variety of related information about the runs.\n\n        Args:\n            include_related (Optional[List[str]]): A list of resources to include in the response\n                from dbt Cloud. This is technically a required field according to the API, but it\n                can be passed with an empty list where it will only load the default run\n                information. Valid values are "trigger", "job", "repository", and "environment".\n            job_definition_id (Optional[int]): This method can be optionally filtered to only\n                load runs for a specific job id if it is included here. If omitted it will pull\n                runs for every job.\n            order_by (Optional[str]): An identifier designated by dbt Cloud in which to sort the\n                results before returning them. Useful when combined with offset and limit to load\n                runs for a job. Defaults to "-id" where "-" designates reverse order and "id" is\n                the key to filter on.\n            offset (int): An offset to apply when listing runs. Can be used to paginate results\n                when combined with order_by and limit. Defaults to 0.\n            limit (int): Limits the amount of rows returned by the API. 
Defaults to 100.\n\n        Returns:\n            List[Dict[str, Any]]: A list of dictionaries containing the runs and any included\n                related information.\n        """\n        query_dict = {\n            "include_related": include_related or [],\n            "order_by": order_by,\n            "offset": offset,\n            "limit": limit,\n        }\n        if job_id:\n            query_dict["job_definition_id"] = job_id\n        return self.make_request("GET", f"{self._account_id}/runs/?{urlencode(query_dict)}")\n\n    def get_run(\n        self, run_id: int, include_related: Optional[Sequence[str]] = None\n    ) -> Mapping[str, Any]:\n        """Gets details about a specific job run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            include_related (List[str]): List of related fields to pull with the run. Valid values\n                are "trigger", "job", and "debug_logs".\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        query_params = f"?include_related={','.join(include_related)}" if include_related else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/{query_params}",\n        )\n\n    def get_run_steps(self, run_id: int) -> Sequence[str]:\n        """Gets the steps of an initialized dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            List[str]: List of commands for each step of the run.\n        """\n        run_details = self.get_run(run_id, include_related=["trigger", "job"])\n        steps = run_details["job"]["execute_steps"]\n        steps_override = run_details["trigger"]["steps_override"]\n        return steps_override or steps\n\n    def cancel_run(self, run_id: int) -> Mapping[str, Any]:\n        """Cancels a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        self._log.info(f"Cancelling run with id '{run_id}'")\n        return self.make_request("POST", f"{self._account_id}/runs/{run_id}/cancel/")\n\n    def list_run_artifacts(self, run_id: int, step: Optional[int] = None) -> Sequence[str]:\n        """Lists the paths of the available run artifacts from a completed dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run\n\n        Returns:\n            List[str]: List of the paths of the available run artifacts\n        """\n        query_params = f"?step={step}" if step else ""\n        return cast(\n            list,\n            self.make_request(\n                "GET",\n                f"{self._account_id}/runs/{run_id}/artifacts/{query_params}",\n                data={"step": step} if step else None,\n            ),\n        )\n\n    def get_run_artifact(self, run_id: int, path: str, step: Optional[int] = None) -> str:\n        """The string contents of a run artifact from a dbt Cloud run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            path (str): The path to this run artifact (e.g. 'run/my_new_project/models/example/my_first_dbt_model.sql')\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. 
If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            str: The string contents of the run artifact\n        """\n        query_params = f"?step={step}" if step else ""\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/runs/{run_id}/artifacts/{path}{query_params}",\n            data={"step": step} if step else None,\n            return_text=True,\n        )["text"]\n\n    def get_manifest(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a manifest.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the manifest.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "manifest.json", step=step))\n\n    def get_run_results(self, run_id: int, step: Optional[int] = None) -> Mapping[str, Any]:\n        """The parsed contents of a run_results.json file created by a completed run.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            step (int): The index of the step in the run to query for artifacts. The first step in\n                the run has the index 1. If the step parameter is omitted, then this endpoint will\n                return the artifacts compiled for the last step in the run.\n\n        Returns:\n            Dict[str, Any]: Parsed contents of the run_results.json file\n        """\n        return json.loads(self.get_run_artifact(run_id, "run_results.json", step=step))\n\n    def poll_run(\n        self,\n        run_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        href: Optional[str] = None,\n    ) -> Mapping[str, Any]:\n        """Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if the\n        run does not complete successfully.\n\n        Args:\n            run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to\n                the details page of your run in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forever.\n            href (str): For internal use, generally should not be set manually.\n\n        Returns:\n            Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run details.\n                See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.\n        """\n        status: Optional[str] = None\n\n        if href is None:\n            href = self.get_run(run_id).get("href")\n        assert isinstance(href, str), "Run must have an href"\n\n        poll_start = datetime.datetime.now()\n        try:\n            while True:\n                run_details = self.get_run(run_id)\n                status = run_details["status_humanized"]\n                self._log.info(f"Polled run {run_id}. Status: [{status}]")\n\n                # completed successfully\n                if status == DbtCloudRunStatus.SUCCESS:\n                    return self.get_run(run_id, include_related=["job", "trigger", "run_steps"])\n                elif status in [DbtCloudRunStatus.ERROR, DbtCloudRunStatus.CANCELLED]:\n                    break\n                elif status not in [\n                    DbtCloudRunStatus.QUEUED,\n                    DbtCloudRunStatus.STARTING,\n                    DbtCloudRunStatus.RUNNING,\n                ]:\n                    check.failed(f"Received unexpected status '{status}'. This should never happen")\n\n                if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n                    seconds=poll_timeout\n                ):\n                    self.cancel_run(run_id)\n                    raise Failure(\n                        f"Run {run_id} timed out after "\n                        f"{datetime.datetime.now() - poll_start}. 
Attempted to cancel.",\n                        metadata={"run_page_url": MetadataValue.url(href)},\n                    )\n\n                # Sleep for the configured time interval before polling again.\n                time.sleep(poll_interval)\n        finally:\n            if status not in (\n                DbtCloudRunStatus.SUCCESS,\n                DbtCloudRunStatus.ERROR,\n                DbtCloudRunStatus.CANCELLED,\n            ):\n                self.cancel_run(run_id)\n\n        run_details = self.get_run(run_id, include_related=["trigger"])\n        raise Failure(\n            f"Run {run_id} failed. Status Message: {run_details['status_message']}",\n            metadata={\n                "run_details": MetadataValue.json(run_details),\n                "run_page_url": MetadataValue.url(href),\n            },\n        )\n\n    def run_job_and_poll(\n        self,\n        job_id: int,\n        poll_interval: float = DEFAULT_POLL_INTERVAL,\n        poll_timeout: Optional[float] = None,\n        **kwargs,\n    ) -> DbtCloudOutput:\n        """Runs a dbt Cloud job and polls until it completes. Will raise a `dagster.Failure` exception\n        if the run does not complete successfully.\n\n        Args:\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            poll_interval (float): The time (in seconds) that should be waited between successive\n                polls of the dbt Cloud API.\n            poll_timeout (float): The maximum time (in seconds) that should be waited for this run\n                to complete. If this threshold is exceeded, the run will be cancelled and an\n                exception will be thrown. 
By default, this will poll forver.\n\n        Returns:\n            :py:class:`~DbtCloudOutput`: Class containing details about the specific job run and the\n                parsed run results.\n        """\n        run_details = self.run_job(job_id, **kwargs)\n        run_id = run_details["id"]\n        href = run_details["href"]\n        final_run_details = self.poll_run(\n            run_id, poll_interval=poll_interval, poll_timeout=poll_timeout, href=href\n        )\n        try:\n            run_results = self.get_run_results(run_id)\n        # if you fail to get run_results for this job, just leave it empty\n        except Failure:\n            self._log.info(\n                "run_results.json not available for this run. Defaulting to empty value."\n            )\n            run_results = {}\n        output = DbtCloudOutput(run_details=final_run_details, result=run_results)\n        if output.docs_url:\n            self._log.info(f"Docs for this run can be viewed here: {output.docs_url}")\n        return output\n\n    def get_job_environment_variables(self, project_id: int, job_id: int) -> Mapping[str, Any]:\n        """Get the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n        """\n        return self.make_request(\n            "GET",\n            f"{self._account_id}/projects/{project_id}/environment-variables/job",\n            params={"job_definition_id": job_id},\n            base_url=self.api_v3_base_url,\n        )\n\n    def set_job_environment_variable(\n        self, project_id: int, job_id: int, environment_variable_id: int, name: str, value: str\n    ) -> Mapping[str, Any]:\n        """Set the dbt Cloud environment variables for a specific job.\n\n        Args:\n            project_id (int): The ID of the relevant dbt Cloud project. You can find this value by\n                going to your account settings in the dbt Cloud UI. It will be the final\n                number in the url, e.g.: ``https://cloud.getdbt.com/next/settings/accounts/{account_id}/projects/{project_id}/``\n            job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to\n                the details page of your job in the dbt Cloud UI. 
It will be the final number in the\n                url, e.g.: ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``\n            name (str): The name of the environment variable to set.\n            value (str): The raw value of the environment variable.\n        """\n        return self.make_request(\n            "POST",\n            f"{self._account_id}/projects/{project_id}/environment-variables/{environment_variable_id}",\n            data={\n                "id": environment_variable_id,\n                "account_id": self._account_id,\n                "project_id": project_id,\n                "job_definition_id": job_id,\n                "type": "job",\n                "name": name,\n                "raw_value": value,\n            },\n            base_url=self.api_v3_base_url,\n        )\n\n\nclass DbtCloudResource(DbtCloudClient):\n    pass\n\n\n
[docs]class DbtCloudClientResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """This resource helps interact with dbt Cloud connectors."""\n\n auth_token: str = Field(\n description=(\n "dbt Cloud API Token. User tokens can be found in the [dbt Cloud"\n " UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud"\n " Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for"\n " instructions on creating a Service Account token."\n ),\n )\n account_id: int = Field(\n description=(\n "dbt Cloud Account ID. This value can be found in the url of a variety of views in"\n " the dbt Cloud UI, e.g."\n " https://cloud.getdbt.com/#/accounts/{account_id}/settings/."\n ),\n )\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any job that is triggered using this "\n "resource to automatically disable its schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the dbt Cloud API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n dbt_cloud_host: str = Field(\n default=DBT_DEFAULT_HOST,\n description=(\n "The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/)."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_dbt_client(self) -> DbtCloudClient:\n context = self.get_resource_context()\n assert context.log\n\n return DbtCloudClient(\n auth_token=self.auth_token,\n account_id=self.account_id,\n disable_schedule_on_trigger=self.disable_schedule_on_trigger,\n request_max_retries=self.request_max_retries,\n request_retry_delay=self.request_retry_delay,\n log=context.log,\n dbt_cloud_host=self.dbt_cloud_host,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_dbt_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=DbtCloudClientResource.to_config_schema(),\n description="This resource helps interact with dbt Cloud connectors",\n)\ndef dbt_cloud_resource(context) -> DbtCloudResource:\n """This resource allows users to programatically interface with the dbt Cloud Administrative REST\n API (v2) to launch jobs and monitor their progress. This currently implements only a subset of\n the functionality exposed by the API.\n\n For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\n response JSON schemae, see the `dbt Cloud API Docs <https://docs.getdbt.com/dbt-cloud/api-v2>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_dbt import dbt_cloud_resource\n\n my_dbt_cloud_resource = dbt_cloud_resource.configured(\n {\n "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n }\n )\n\n @job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\n def my_dbt_cloud_job():\n ...\n """\n return DbtCloudResource(\n auth_token=context.resource_config["auth_token"],\n account_id=context.resource_config["account_id"],\n disable_schedule_on_trigger=context.resource_config["disable_schedule_on_trigger"],\n request_max_retries=context.resource_config["request_max_retries"],\n request_retry_delay=context.resource_config["request_retry_delay"],\n log=context.log,\n dbt_cloud_host=context.resource_config["dbt_cloud_host"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/cloud/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.cloud.resources"}}, "core": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources

\nfrom typing import Any, Iterator, Mapping, Optional, Sequence, Set\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._annotations import deprecated, public\nfrom dagster._config.pythonic_config import ConfigurableResource, IAttachDifferentObjectToOpContext\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.merger import merge_dicts\nfrom pydantic import Field\n\nfrom ..dbt_resource import DbtClient\nfrom .types import DbtCliOutput\nfrom .utils import (\n    DEFAULT_DBT_TARGET_PATH,\n    execute_cli,\n    execute_cli_stream,\n    parse_manifest,\n    parse_run_results,\n    remove_run_results,\n)\n\nDEFAULT_DBT_EXECUTABLE = "dbt"\n\n# The set of dbt cli commands that result in the creation of a run_results.json output file\n# https://docs.getdbt.com/reference/artifacts/run-results-json\nDBT_RUN_RESULTS_COMMANDS = ["run", "test", "seed", "snapshot", "docs generate", "build"]\n\n# The following config fields correspond to flags that apply to all dbt CLI commands. For details\n# on dbt CLI flags, see\n# https://github.com/fishtown-analytics/dbt/blob/1f8e29276e910c697588c43f08bc881379fff178/core/dbt/main.py#L260-L329\n\nCOMMON_OPTION_KEYS = {\n    "warn_error",\n    "dbt_executable",\n    "ignore_handled_error",\n    "target_path",\n    "docs_url",\n    "json_log_format",\n    "capture_logs",\n    "debug",\n}\n\n\nclass ConfigurableResourceWithCliFlags(ConfigurableResource):\n    project_dir: str = Field(\n        default=".",\n        description=(\n            "Which directory to look in for the dbt_project.yml file. Default is the current "\n            "working directory and its parents."\n        ),\n    )\n    profiles_dir: Optional[str] = Field(\n        default=None,\n        description=(\n            "Which directory to look in for the profiles.yml file. 
Default = $DBT_PROFILES_DIR or "\n            "$HOME/.dbt"\n        ),\n    )\n    profile: Optional[str] = Field(\n        default=None, description="Which profile to load. Overrides setting in dbt_project.yml."\n    )\n    target: Optional[str] = Field(\n        default=None, description="Which target to load for the given profile."\n    )\n    vars: Optional[Mapping[str, Any]] = Field(\n        default=None,\n        description=(\n            "Supply variables to the project. This argument overrides variables defined in your "\n            "dbt_project.yml file. This argument should be a dictionary, eg. "\n            "{'my_variable': 'my_value'}"\n        ),\n    )\n    bypass_cache: bool = Field(\n        default=False, description="If set, bypass the adapter-level cache of database state"\n    )\n    warn_error: bool = Field(\n        default=False,\n        description=(\n            "If dbt would normally warn, instead raise an exception. Examples include --models "\n            "that selects nothing, deprecations, configurations with no associated models, "\n            "invalid test configurations, and missing sources/refs in tests."\n        ),\n    )\n    dbt_executable: str = Field(\n        default=DEFAULT_DBT_EXECUTABLE,\n        description=f"Path to the dbt executable. Default is {DEFAULT_DBT_EXECUTABLE}",\n    )\n    ignore_handled_error: bool = Field(\n        default=False,\n        description=(\n            "When True, will not raise an exception when the dbt CLI returns error code 1. 
"\n            "Default is False."\n        ),\n    )\n    target_path: str = Field(\n        default=DEFAULT_DBT_TARGET_PATH,\n        description=(\n            "The directory path for target if different from the default `target-path` in "\n            "your dbt project configuration file."\n        ),\n    )\n    docs_url: Optional[str] = Field(\n        default=None, description="The url for where dbt docs are being served for this project."\n    )\n    json_log_format: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--log-format json` flag, allowing "\n            "Dagster to parse the log messages and emit simpler log messages to the event log."\n        ),\n    )\n    capture_logs: bool = Field(\n        default=True,\n        description=(\n            "When True, dbt will invoked with the `--capture-output` flag, allowing "\n            "Dagster to capture the logs and emit them to the event log."\n        ),\n    )\n    debug: bool = Field(\n        default=False,\n        description=(\n            "When True, dbt will invoked with the `--debug` flag, which will print "\n            "additional debug information to the console."\n        ),\n    )\n\n\nclass DbtCliClient(DbtClient):\n    """A resource that allows you to execute dbt cli commands.\n\n    For the most up-to-date documentation on the specific parameters available to you for each\n    command, check out the dbt docs:\n\n    https://docs.getdbt.com/reference/commands/run\n\n    To use this as a dagster resource, we recommend using\n    :func:`dbt_cli_resource <dagster_dbt.dbt_cli_resource>`.\n    """\n\n    def __init__(\n        self,\n        executable: str,\n        default_flags: Mapping[str, Any],\n        warn_error: bool,\n        ignore_handled_error: bool,\n        target_path: str,\n        logger: Optional[Any] = None,\n        docs_url: Optional[str] = None,\n        json_log_format: bool = True,\n        
capture_logs: bool = True,\n        debug: bool = False,\n    ):\n        self._default_flags = default_flags\n        self._executable = executable\n        self._warn_error = warn_error\n        self._ignore_handled_error = ignore_handled_error\n        self._target_path = target_path\n        self._docs_url = docs_url\n        self._json_log_format = json_log_format\n        self._capture_logs = capture_logs\n        self._debug = debug\n        super().__init__(logger)\n\n    @property\n    def default_flags(self) -> Mapping[str, Any]:\n        """A set of params populated from resource config that are passed as flags to each dbt CLI command."""\n        return self._format_params(self._default_flags, replace_underscores=True)\n\n    @property\n    def strict_flags(self) -> Set[str]:\n        """A set of flags that should not be auto-populated from the default flags unless they are\n        arguments to the associated function.\n        """\n        return {"models", "exclude", "select"}\n\n    def _get_flags_dict(self, kwargs) -> Mapping[str, Any]:\n        extra_flags = {} if kwargs is None else kwargs\n\n        # remove default flags that are declared as "strict" and not explicitly passed in\n        default_flags = {\n            k: v\n            for k, v in self.default_flags.items()\n            if not (k in self.strict_flags and k not in extra_flags)\n        }\n\n        return merge_dicts(\n            default_flags, self._format_params(extra_flags, replace_underscores=True)\n        )\n\n    @public\n    def cli(self, command: str, **kwargs) -> DbtCliOutput:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        command = check.str_param(command, "command")\n        return execute_cli(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            target_path=self._target_path,\n            docs_url=self._docs_url,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        )\n\n    def cli_stream_json(self, command: str, **kwargs) -> Iterator[Mapping[str, Any]]:\n        """Executes a dbt CLI command. Params passed in as keyword arguments will be merged with the\n            default flags that were configured on resource initialization (if any) overriding the\n            default values if necessary.\n\n        Args:\n            command (str): The command you wish to run (e.g. 
'run', 'test', 'docs generate', etc.)\n        """\n        check.invariant(self._json_log_format, "Cannot stream JSON if json_log_format is False.")\n        for event in execute_cli_stream(\n            executable=self._executable,\n            command=command,\n            flags_dict=self._get_flags_dict(kwargs),\n            log=self.logger,\n            warn_error=self._warn_error,\n            ignore_handled_error=self._ignore_handled_error,\n            json_log_format=self._json_log_format,\n            capture_logs=self._capture_logs,\n            debug=self._debug,\n        ):\n            if event.parsed_json_line is not None:\n                yield event.parsed_json_line\n\n    @public\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``compile`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n            select (List[str], optional): the models to include in compilation.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("compile", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``run`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n            select (List[str], optional): the models to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("run", models=models, exclude=exclude, select=select, **kwargs)\n\n    @public\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("snapshot", select=select, exclude=exclude, **kwargs)\n\n    @public\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        select: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n            select (List[str], optional): the models to include in testing.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        if data and schema:\n            # do not include these arguments if both are True, as these are deprecated in later\n            # versions of dbt, and for older versions the functionality is the same regardless of\n            # if both are set or neither are set.\n            return self.cli("test", models=models, exclude=exclude, select=select, **kwargs)\n        return self.cli(\n            "test",\n            models=models,\n            exclude=exclude,\n            data=data,\n            schema=schema,\n            select=select,\n            **kwargs,\n        )\n\n    @public\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. 
Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("seed", show=show, select=select, exclude=exclude, **kwargs)\n\n    @public\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtCliOutput:\n        """Run the ``ls`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("ls", select=select, models=models, exclude=exclude, **kwargs)\n\n    @public\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``build`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("build", select=select, **kwargs)\n\n    @public\n    def freshness(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtCliOutput:\n        """Run the ``source snapshot-freshness`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the sources to include in the run.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("source snapshot-freshness", select=select, **kwargs)\n\n    @public\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtCliOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli("docs generate", compile=compile_project, **kwargs)\n\n    @public\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtCliOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtCliOutput: An instance of :class:`DbtCliOutput<dagster_dbt.DbtCliOutput>` containing\n                parsed log output as well as the contents of run_results.json (if applicable).\n        """\n        return self.cli(f"run-operation {macro}", args=args, **kwargs)\n\n    @public\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_run_results(project_dir, target_path)\n\n    @public\n    def remove_run_results_json(self, **kwargs):\n        """Remove the run_results.json file from previous runs (if it exists)."""\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        remove_run_results(project_dir, target_path)\n\n    @public\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n        project_dir = kwargs.get("project_dir", self.default_flags["project-dir"])\n        target_path = kwargs.get("target_path", self._target_path)\n        return parse_manifest(project_dir, 
target_path)\n\n\nclass DbtCliClientResource(ConfigurableResourceWithCliFlags, IAttachDifferentObjectToOpContext):\n    """Resource which issues dbt CLI commands against a configured dbt project."""\n\n    class Config:\n        extra = "allow"\n\n    @classmethod\n    def _is_dagster_maintained(cls) -> bool:\n        return True\n\n    def get_dbt_client(self) -> DbtCliClient:\n        context = self.get_resource_context()\n        default_flags = {\n            k: v\n            for k, v in self._get_non_none_public_field_values().items()\n            if k not in COMMON_OPTION_KEYS\n        }\n\n        return DbtCliClient(\n            executable=self.dbt_executable,\n            default_flags=default_flags,\n            warn_error=self.warn_error,\n            ignore_handled_error=self.ignore_handled_error,\n            target_path=self.target_path,\n            docs_url=self.docs_url,\n            logger=context.log,\n            json_log_format=self.json_log_format,\n            capture_logs=self.capture_logs,\n            debug=self.debug,\n        )\n\n    def get_object_to_set_on_execution_context(self) -> Any:\n        return self.get_dbt_client()\n\n\n
[docs]@deprecated(breaking_version="0.21", additional_warn_text="Use DbtCliResource instead.")\n@dagster_maintained_resource\n@resource(config_schema=DbtCliClientResource.to_config_schema())\ndef dbt_cli_resource(context) -> DbtCliClient:\n """This resource issues dbt CLI commands against a configured dbt project. It is deprecated\n in favor of :py:class:`~dagster_dbt.DbtCliResource`.\n """\n # all config options that are intended to be used as flags for dbt commands\n\n default_flags = {\n k: v for k, v in context.resource_config.items() if k not in COMMON_OPTION_KEYS\n }\n return DbtCliClient(\n executable=context.resource_config["dbt_executable"],\n default_flags=default_flags,\n warn_error=context.resource_config["warn_error"],\n ignore_handled_error=context.resource_config["ignore_handled_error"],\n target_path=context.resource_config["target_path"],\n logger=context.log,\n docs_url=context.resource_config.get("docs_url"),\n capture_logs=context.resource_config["capture_logs"],\n json_log_format=context.resource_config["json_log_format"],\n debug=context.resource_config["debug"],\n )
\n
", "current_page_name": "_modules/dagster_dbt/core/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources"}, "resources_v2": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.resources_v2

\nimport atexit\nimport contextlib\nimport os\nimport shutil\nimport subprocess\nimport sys\nimport uuid\nfrom contextlib import suppress\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import (\n    Any,\n    Dict,\n    Iterator,\n    List,\n    Mapping,\n    Optional,\n    Union,\n)\n\nimport dateutil.parser\nimport orjson\nfrom dagster import (\n    AssetCheckResult,\n    AssetCheckSeverity,\n    AssetObservation,\n    AssetsDefinition,\n    ConfigurableResource,\n    Output,\n    get_dagster_logger,\n)\nfrom dagster._annotations import public\nfrom dagster._core.errors import DagsterInvalidPropertyError\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dbt.contracts.results import NodeStatus, TestStatus\nfrom dbt.node_types import NodeType\nfrom packaging import version\nfrom pydantic import Field, root_validator, validator\nfrom typing_extensions import Literal\n\nfrom ..asset_utils import (\n    get_manifest_and_translator_from_dbt_assets,\n    is_asset_check_from_dbt_resource_props,\n    output_name_fn,\n)\nfrom ..dagster_dbt_translator import DagsterDbtTranslator\nfrom ..dbt_manifest import DbtManifestParam, validate_manifest\nfrom ..errors import DagsterDbtCliRuntimeError\nfrom ..utils import ASSET_RESOURCE_TYPES, get_dbt_resource_props_by_dbt_unique_id_from_manifest\n\nlogger = get_dagster_logger()\n\n\nDBT_PROJECT_YML_NAME = "dbt_project.yml"\nDBT_PROFILES_YML_NAME = "profiles.yml"\nPARTIAL_PARSE_FILE_NAME = "partial_parse.msgpack"\n\n\ndef _get_dbt_target_path() -> Path:\n    return Path(os.getenv("DBT_TARGET_PATH", "target"))\n\n\n
[docs]@dataclass\nclass DbtCliEventMessage:\n """The representation of a dbt CLI event.\n\n Args:\n raw_event (Dict[str, Any]): The raw event dictionary.\n See https://docs.getdbt.com/reference/events-logging#structured-logging for more\n information.\n """\n\n raw_event: Dict[str, Any]\n\n @classmethod\n def from_log(cls, log: str) -> "DbtCliEventMessage":\n """Parse an event according to https://docs.getdbt.com/reference/events-logging#structured-logging.\n\n We assume that the log format is json.\n """\n raw_event: Dict[str, Any] = orjson.loads(log)\n\n return cls(raw_event=raw_event)\n\n def __str__(self) -> str:\n return self.raw_event["info"]["msg"]\n\n
[docs] @public\n def to_default_asset_events(\n self,\n manifest: DbtManifestParam,\n dagster_dbt_translator: DagsterDbtTranslator = DagsterDbtTranslator(),\n ) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:\n """Convert a dbt CLI event to a set of corresponding Dagster events.\n\n Args:\n manifest (Union[Mapping[str, Any], str, Path]): The dbt manifest blob.\n dagster_dbt_translator (DagsterDbtTranslator): Optionally, a custom translator for\n linking dbt nodes to Dagster assets.\n\n Returns:\n Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.\n - Output for refables (e.g. models, seeds, snapshots.)\n - AssetObservation for dbt test results that are not enabled as asset checks.\n - AssetCheckResult for dbt test results that are enabled as asset checks.\n """\n if self.raw_event["info"]["level"] == "debug":\n return\n\n event_node_info: Dict[str, Any] = self.raw_event["data"].get("node_info")\n if not event_node_info:\n return\n\n manifest = validate_manifest(manifest)\n\n if not manifest:\n logger.info(\n "No dbt manifest was provided. 
Dagster events for dbt tests will not be created."\n )\n\n invocation_id: str = self.raw_event["info"]["invocation_id"]\n unique_id: str = event_node_info["unique_id"]\n node_resource_type: str = event_node_info["resource_type"]\n node_status: str = event_node_info["node_status"]\n\n is_node_successful = node_status == NodeStatus.Success\n is_node_finished = bool(event_node_info.get("node_finished_at"))\n if node_resource_type in NodeType.refable() and is_node_successful:\n started_at = dateutil.parser.isoparse(event_node_info["node_started_at"])\n finished_at = dateutil.parser.isoparse(event_node_info["node_finished_at"])\n duration_seconds = (finished_at - started_at).total_seconds()\n\n yield Output(\n value=None,\n output_name=output_name_fn(event_node_info),\n metadata={\n "unique_id": unique_id,\n "invocation_id": invocation_id,\n "Execution Duration": duration_seconds,\n },\n )\n elif manifest and node_resource_type == NodeType.Test and is_node_finished:\n upstream_unique_ids: List[str] = manifest["parent_map"][unique_id]\n test_resource_props = manifest["nodes"][unique_id]\n metadata = {\n "unique_id": unique_id,\n "invocation_id": invocation_id,\n "status": node_status,\n }\n\n is_asset_check = is_asset_check_from_dbt_resource_props(test_resource_props)\n attached_node_unique_id = test_resource_props.get("attached_node")\n is_generic_test = bool(attached_node_unique_id)\n\n if is_asset_check and is_generic_test:\n is_test_successful = node_status == TestStatus.Pass\n severity = AssetCheckSeverity(test_resource_props["config"]["severity"].upper())\n\n attached_node_resource_props: Dict[str, Any] = manifest["nodes"].get(\n attached_node_unique_id\n ) or manifest["sources"].get(attached_node_unique_id)\n attached_node_asset_key = dagster_dbt_translator.get_asset_key(\n attached_node_resource_props\n )\n\n yield AssetCheckResult(\n passed=is_test_successful,\n asset_key=attached_node_asset_key,\n check_name=event_node_info["node_name"],\n metadata=metadata,\n 
severity=severity,\n )\n else:\n for upstream_unique_id in upstream_unique_ids:\n upstream_resource_props: Dict[str, Any] = manifest["nodes"].get(\n upstream_unique_id\n ) or manifest["sources"].get(upstream_unique_id)\n upstream_asset_key = dagster_dbt_translator.get_asset_key(\n upstream_resource_props\n )\n\n yield AssetObservation(\n asset_key=upstream_asset_key,\n metadata=metadata,\n )
\n\n\n
[docs]@dataclass\nclass DbtCliInvocation:\n """The representation of an invoked dbt command.\n\n Args:\n process (subprocess.Popen): The process running the dbt command.\n manifest (Mapping[str, Any]): The dbt manifest blob.\n project_dir (Path): The path to the dbt project.\n target_path (Path): The path to the dbt target folder.\n raise_on_error (bool): Whether to raise an exception if the dbt command fails.\n """\n\n process: subprocess.Popen\n manifest: Mapping[str, Any]\n dagster_dbt_translator: DagsterDbtTranslator\n project_dir: Path\n target_path: Path\n raise_on_error: bool\n\n @classmethod\n def run(\n cls,\n args: List[str],\n env: Dict[str, str],\n manifest: Mapping[str, Any],\n dagster_dbt_translator: DagsterDbtTranslator,\n project_dir: Path,\n target_path: Path,\n raise_on_error: bool,\n ) -> "DbtCliInvocation":\n # Attempt to take advantage of partial parsing. If there is a `partial_parse.msgpack` in\n # in the target folder, then copy it to the dynamic target path.\n #\n # This effectively allows us to skip the parsing of the manifest, which can be expensive.\n # See https://docs.getdbt.com/reference/programmatic-invocations#reusing-objects for more\n # details.\n current_target_path = _get_dbt_target_path()\n partial_parse_file_path = (\n current_target_path.joinpath(PARTIAL_PARSE_FILE_NAME)\n if current_target_path.is_absolute()\n else project_dir.joinpath(current_target_path, PARTIAL_PARSE_FILE_NAME)\n )\n partial_parse_destination_target_path = target_path.joinpath(PARTIAL_PARSE_FILE_NAME)\n\n if partial_parse_file_path.exists():\n logger.info(\n f"Copying `{partial_parse_file_path}` to `{partial_parse_destination_target_path}`"\n " to take advantage of partial parsing."\n )\n\n partial_parse_destination_target_path.parent.mkdir(parents=True, exist_ok=True)\n shutil.copy(partial_parse_file_path, partial_parse_destination_target_path)\n\n # Create a subprocess that runs the dbt CLI command.\n logger.info(f"Running dbt command: `{' 
'.join(args)}`.")\n process = subprocess.Popen(\n args=args,\n stdout=subprocess.PIPE,\n stderr=subprocess.STDOUT,\n env=env,\n cwd=project_dir,\n )\n\n # Add handler to terminate child process if running.\n # See https://stackoverflow.com/a/18258391 for more details.\n def cleanup_dbt_subprocess(process: subprocess.Popen) -> None:\n if process.returncode is None:\n logger.info(\n "The main process is being terminated, but the dbt command has not yet"\n " completed. Terminating the execution of dbt command."\n )\n process.terminate()\n process.wait()\n\n atexit.register(cleanup_dbt_subprocess, process)\n\n return cls(\n process=process,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n project_dir=project_dir,\n target_path=target_path,\n raise_on_error=raise_on_error,\n )\n\n
[docs] @public\n def wait(self) -> "DbtCliInvocation":\n """Wait for the dbt CLI process to complete.\n\n Returns:\n DbtCliInvocation: The current representation of the dbt CLI invocation.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"]).wait()\n """\n list(self.stream_raw_events())\n\n return self
\n\n
[docs] @public\n def is_successful(self) -> bool:\n """Return whether the dbt CLI process completed successfully.\n\n Returns:\n bool: True, if the dbt CLI process returns with a zero exit code, and False otherwise.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"], raise_on_error=False)\n\n if dbt_cli_invocation.is_successful():\n ...\n """\n return self.process.wait() == 0
\n\n
[docs] @public\n def stream(self) -> Iterator[Union[Output, AssetObservation, AssetCheckResult]]:\n """Stream the events from the dbt CLI process and convert them to Dagster events.\n\n Returns:\n Iterator[Union[Output, AssetObservation, AssetCheckResult]]: A set of corresponding Dagster events.\n - Output for refables (e.g. models, seeds, snapshots.)\n - AssetObservation for dbt test results that are not enabled as asset checks.\n - AssetCheckResult for dbt test results that are enabled as asset checks.\n\n Examples:\n .. code-block:: python\n\n from pathlib import Path\n from dagster_dbt import DbtCliResource, dbt_assets\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n """\n for event in self.stream_raw_events():\n yield from event.to_default_asset_events(\n manifest=self.manifest, dagster_dbt_translator=self.dagster_dbt_translator\n )
\n\n
[docs] @public\n def stream_raw_events(self) -> Iterator[DbtCliEventMessage]:\n """Stream the events from the dbt CLI process.\n\n Returns:\n Iterator[DbtCliEventMessage]: An iterator of events from the dbt CLI process.\n """\n with self.process.stdout or contextlib.nullcontext():\n for raw_line in self.process.stdout or []:\n log: str = raw_line.decode().strip()\n try:\n event = DbtCliEventMessage.from_log(log=log)\n\n # Re-emit the logs from dbt CLI process into stdout.\n sys.stdout.write(str(event) + "\\n")\n sys.stdout.flush()\n\n yield event\n except:\n # If we can't parse the log, then just emit it as a raw log.\n sys.stdout.write(log + "\\n")\n sys.stdout.flush()\n\n # Ensure that the dbt CLI process has completed.\n self._raise_on_error()
\n\n
[docs] @public\n def get_artifact(\n self,\n artifact: Union[\n Literal["manifest.json"],\n Literal["catalog.json"],\n Literal["run_results.json"],\n Literal["sources.json"],\n ],\n ) -> Dict[str, Any]:\n """Retrieve a dbt artifact from the target path.\n\n See https://docs.getdbt.com/reference/artifacts/dbt-artifacts for more information.\n\n Args:\n artifact (Union[Literal["manifest.json"], Literal["catalog.json"], Literal["run_results.json"], Literal["sources.json"]]): The name of the artifact to retrieve.\n\n Returns:\n Dict[str, Any]: The artifact as a dictionary.\n\n Examples:\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n dbt_cli_invocation = dbt.cli(["run"]).wait()\n\n # Retrieve the run_results.json artifact.\n run_results = dbt_cli_invocation.get_artifact("run_results.json")\n """\n artifact_path = self.target_path.joinpath(artifact)\n\n return orjson.loads(artifact_path.read_bytes())
\n\n def _raise_on_error(self) -> None:\n """Ensure that the dbt CLI process has completed. If the process has not successfully\n completed, then optionally raise an error.\n """\n if not self.is_successful() and self.raise_on_error:\n raise DagsterDbtCliRuntimeError(\n description=(\n f"The dbt CLI process failed with exit code {self.process.returncode}. Check"\n " the Dagster compute logs for the full information about the error, or view"\n f" the dbt debug log file: {self.target_path.joinpath('dbt.log')}."\n )\n )
\n\n\n
[docs]class DbtCliResource(ConfigurableResource):\n """A resource used to execute dbt CLI commands.\n\n Attributes:\n project_dir (str): The path to the dbt project directory. This directory should contain a\n `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more\n information.\n global_config_flags (List[str]): A list of global flags configuration to pass to the dbt CLI\n invocation. See https://docs.getdbt.com/reference/global-configs for a full list of\n configuration.\n profiles_dir (Optional[str]): The path to the directory containing your dbt `profiles.yml`.\n By default, the current working directory is used, which is the dbt project directory.\n See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n profile (Optional[str]): The profile from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n target (Optional[str]): The target from your dbt `profiles.yml` to use for execution. See\n https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\n information.\n\n Examples:\n Creating a dbt resource with only a reference to ``project_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\n Creating a dbt resource with a custom ``profiles_dir``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n )\n\n Creating a dbt resource with a custom ``profile`` and ``target``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n profiles_dir="/path/to/dbt/project/profiles",\n profile="jaffle_shop",\n target="dev",\n )\n\n Creating a dbt resource with global configs, e.g. 
disabling colored logs with ``--no-use-color``:\n\n .. code-block:: python\n\n from dagster_dbt import DbtCliResource\n\n dbt = DbtCliResource(\n project_dir="/path/to/dbt/project",\n global_config_flags=["--no-use-color"],\n )\n """\n\n project_dir: str = Field(\n ...,\n description=(\n "The path to your dbt project directory. This directory should contain a"\n " `dbt_project.yml`. See https://docs.getdbt.com/reference/dbt_project.yml for more"\n " information."\n ),\n )\n global_config_flags: List[str] = Field(\n default=[],\n description=(\n "A list of global flags configuration to pass to the dbt CLI invocation. See"\n " https://docs.getdbt.com/reference/global-configs for a full list of configuration."\n ),\n )\n profiles_dir: Optional[str] = Field(\n default=None,\n description=(\n "The path to the directory containing your dbt `profiles.yml`. By default, the current"\n " working directory is used, which is the dbt project directory."\n " See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for "\n " more information."\n ),\n )\n profile: Optional[str] = Field(\n default=None,\n description=(\n "The profile from your dbt `profiles.yml` to use for execution. See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n target: Optional[str] = Field(\n default=None,\n description=(\n "The target from your dbt `profiles.yml` to use for execution. 
See"\n " https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more"\n " information."\n ),\n )\n\n @classmethod\n def _validate_absolute_path_exists(cls, path: Union[str, Path]) -> Path:\n absolute_path = Path(path).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{path}' ('{absolute_path}') does not exist")\n\n return resolved_path\n\n @classmethod\n def _validate_path_contains_file(cls, path: Path, file_name: str, error_message: str):\n if not path.joinpath(file_name).exists():\n raise ValueError(error_message)\n\n @validator("project_dir", "profiles_dir", pre=True)\n def convert_path_to_str(cls, v: Any) -> Any:\n """Validate that the path is converted to a string."""\n if isinstance(v, Path):\n resolved_path = cls._validate_absolute_path_exists(v)\n\n absolute_path = Path(v).absolute()\n try:\n resolved_path = absolute_path.resolve(strict=True)\n except FileNotFoundError:\n raise ValueError(f"The absolute path of '{v}' ('{absolute_path}') does not exist")\n return os.fspath(resolved_path)\n\n return v\n\n @validator("project_dir")\n def validate_project_dir(cls, project_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(project_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROJECT_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROJECT_YML_NAME} file. Please"\n " specify a valid path to a dbt project."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @validator("profiles_dir")\n def validate_profiles_dir(cls, profiles_dir: str) -> str:\n resolved_project_dir = cls._validate_absolute_path_exists(profiles_dir)\n\n cls._validate_path_contains_file(\n path=resolved_project_dir,\n file_name=DBT_PROFILES_YML_NAME,\n error_message=(\n f"{resolved_project_dir} does not contain a {DBT_PROFILES_YML_NAME} file. 
Please"\n " specify a valid path to a dbt profile directory."\n ),\n )\n\n return os.fspath(resolved_project_dir)\n\n @root_validator(pre=True)\n def validate_dbt_version(cls, values: Dict[str, Any]) -> Dict[str, Any]:\n """Validate that the dbt version is supported."""\n from dbt.version import __version__ as dbt_version\n\n if version.parse(dbt_version) < version.parse("1.4.0"):\n raise ValueError(\n "To use `dagster_dbt.DbtCliResource`, you must use `dbt-core>=1.4.0`. Currently,"\n f" you are using `dbt-core=={dbt_version}`. Please install a compatible dbt-core"\n " version."\n )\n\n return values\n\n def _get_unique_target_path(self, *, context: Optional[OpExecutionContext]) -> Path:\n """Get a unique target path for the dbt CLI invocation.\n\n Args:\n context (Optional[OpExecutionContext]): The execution context.\n\n Returns:\n str: A unique target path for the dbt CLI invocation.\n """\n unique_id = str(uuid.uuid4())[:7]\n path = unique_id\n if context:\n path = f"{context.op.name}-{context.run_id[:7]}-{unique_id}"\n\n current_target_path = _get_dbt_target_path()\n\n return current_target_path.joinpath(path)\n\n
[docs] @public\n def cli(\n self,\n args: List[str],\n *,\n raise_on_error: bool = True,\n manifest: Optional[DbtManifestParam] = None,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n context: Optional[OpExecutionContext] = None,\n ) -> DbtCliInvocation:\n """Create a subprocess to execute a dbt CLI command.\n\n Args:\n args (List[str]): The dbt CLI command to execute.\n raise_on_error (bool): Whether to raise an exception if the dbt CLI command fails.\n manifest (Optional[Union[Mapping[str, Any], str, Path]]): The dbt manifest blob. If an\n execution context from within `@dbt_assets` is provided to the context argument,\n then the manifest provided to `@dbt_assets` will be used.\n dagster_dbt_translator (Optional[DagsterDbtTranslator]): The translator to link dbt\n nodes to Dagster assets. If an execution context from within `@dbt_assets` is\n provided to the context argument, then the dagster_dbt_translator provided to\n `@dbt_assets` will be used.\n context (Optional[OpExecutionContext]): The execution context from within `@dbt_assets`.\n\n Returns:\n DbtCliInvocation: A invocation instance that can be used to retrieve the output of the\n dbt CLI command.\n\n Examples:\n Streaming Dagster events for dbt asset materializations and observations:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n yield from dbt.cli(["run"], context=context).stream()\n\n Retrieving a dbt artifact after streaming the Dagster events:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context)\n\n yield from dbt_run_invocation.stream()\n\n # Retrieve the `run_results.json` dbt artifact as a dictionary:\n run_results_json = dbt_run_invocation.get_artifact("run_results.json")\n\n # Retrieve the `run_results.json` dbt artifact as a file path:\n run_results_path = dbt_run_invocation.target_path.joinpath("run_results.json")\n\n Customizing the asset materialization metadata when streaming the Dagster events:\n\n .. code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_cli_invocation = dbt.cli(["run"], context=context)\n\n for dbt_event in dbt_cli_invocation.stream_raw_events():\n for dagster_event in dbt_event.to_default_asset_events(manifest=dbt_cli_invocation.manifest):\n if isinstance(dagster_event, Output):\n context.add_output_metadata(\n metadata={\n "my_custom_metadata": "my_custom_metadata_value",\n },\n output_name=dagster_event.output_name,\n )\n\n yield dagster_event\n\n Suppressing exceptions from a dbt CLI command when a non-zero exit code is returned:\n\n .. 
code-block:: python\n\n from pathlib import Path\n\n from dagster import AssetExecutionContext\n from dagster_dbt import DbtCliResource, dbt_assets\n\n\n @dbt_assets(manifest=Path("target", "manifest.json"))\n def my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n dbt_run_invocation = dbt.cli(["run"], context=context, raise_on_error=False)\n\n if dbt_run_invocation.is_successful():\n yield from dbt_run_invocation.stream()\n else:\n ...\n\n Invoking a dbt CLI command in a custom asset or op:\n\n .. code-block:: python\n\n import json\n\n from dagster import asset, op\n from dagster_dbt import DbtCliResource\n\n\n @asset\n def my_dbt_asset(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n\n\n @op\n def my_dbt_op(dbt: DbtCliResource):\n dbt_macro_args = {"key": "value"}\n dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n """\n target_path = self._get_unique_target_path(context=context)\n env = {\n **os.environ.copy(),\n # Run dbt with unbuffered output.\n "PYTHONUNBUFFERED": "1",\n # Disable anonymous usage statistics for performance.\n "DBT_SEND_ANONYMOUS_USAGE_STATS": "false",\n # The DBT_LOG_FORMAT environment variable must be set to `json`. 
We use this\n # environment variable to ensure that the dbt CLI outputs structured logs.\n "DBT_LOG_FORMAT": "json",\n # The DBT_TARGET_PATH environment variable is set to a unique value for each dbt\n # invocation so that artifact paths are separated.\n # See https://discourse.getdbt.com/t/multiple-run-results-json-and-manifest-json-files/7555\n # for more information.\n "DBT_TARGET_PATH": os.fspath(target_path),\n # The DBT_LOG_PATH environment variable is set to the same value as DBT_TARGET_PATH\n # so that logs for each dbt invocation has separate log files.\n "DBT_LOG_PATH": os.fspath(target_path),\n # The DBT_PROFILES_DIR environment variable is set to the path containing the dbt\n # profiles.yml file.\n # See https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles#advanced-customizing-a-profile-directory\n # for more information.\n **({"DBT_PROFILES_DIR": self.profiles_dir} if self.profiles_dir else {}),\n }\n\n assets_def: Optional[AssetsDefinition] = None\n with suppress(DagsterInvalidPropertyError):\n assets_def = context.assets_def if context else None\n\n selection_args: List[str] = []\n dagster_dbt_translator = dagster_dbt_translator or DagsterDbtTranslator()\n if context and assets_def is not None:\n manifest, dagster_dbt_translator = get_manifest_and_translator_from_dbt_assets(\n [assets_def]\n )\n selection_args = get_subset_selection_for_context(\n context=context,\n manifest=manifest,\n select=context.op.tags.get("dagster-dbt/select"),\n exclude=context.op.tags.get("dagster-dbt/exclude"),\n )\n else:\n manifest = validate_manifest(manifest) if manifest else {}\n\n # TODO: verify that args does not have any selection flags if the context and manifest\n # are passed to this function.\n profile_args: List[str] = []\n if self.profile:\n profile_args = ["--profile", self.profile]\n\n if self.target:\n profile_args += ["--target", self.target]\n\n args = ["dbt"] + self.global_config_flags + args + profile_args + selection_args\n 
project_dir = Path(self.project_dir)\n\n if not target_path.is_absolute():\n target_path = project_dir.joinpath(target_path)\n\n return DbtCliInvocation.run(\n args=args,\n env=env,\n manifest=manifest,\n dagster_dbt_translator=dagster_dbt_translator,\n project_dir=project_dir,\n target_path=target_path,\n raise_on_error=raise_on_error,\n )
\n\n\ndef get_subset_selection_for_context(\n context: OpExecutionContext,\n manifest: Mapping[str, Any],\n select: Optional[str],\n exclude: Optional[str],\n) -> List[str]:\n """Generate a dbt selection string to materialize the selected resources in a subsetted execution context.\n\n See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work.\n\n Args:\n context (OpExecutionContext): The execution context for the current execution step.\n select (Optional[str]): A dbt selection string to select resources to materialize.\n exclude (Optional[str]): A dbt selection string to exclude resources from materializing.\n\n Returns:\n List[str]: dbt CLI arguments to materialize the selected resources in a\n subsetted execution context.\n\n If the current execution context is not performing a subsetted execution,\n return CLI arguments composed of the inputed selection and exclusion arguments.\n """\n default_dbt_selection = []\n if select:\n default_dbt_selection += ["--select", select]\n if exclude:\n default_dbt_selection += ["--exclude", exclude]\n\n dbt_resource_props_by_output_name = get_dbt_resource_props_by_output_name(manifest)\n\n # TODO: this should be a property on the context if this is a permanent indicator for\n # determining whether the current execution context is performing a subsetted execution.\n is_subsetted_execution = len(context.selected_output_names) != len(\n context.assets_def.node_keys_by_output_name\n )\n if not is_subsetted_execution:\n logger.info(\n "A dbt subsetted execution is not being performed. 
Using the default dbt selection"\n f" arguments `{default_dbt_selection}`."\n )\n return default_dbt_selection\n\n selected_dbt_resources = []\n for output_name in context.selected_output_names:\n dbt_resource_props = dbt_resource_props_by_output_name[output_name]\n\n # Explicitly select a dbt resource by its fully qualified name (FQN).\n # https://docs.getdbt.com/reference/node-selection/methods#the-file-or-fqn-method\n fqn_selector = f"fqn:{'.'.join(dbt_resource_props['fqn'])}"\n\n selected_dbt_resources.append(fqn_selector)\n\n # Take the union of all the selected resources.\n # https://docs.getdbt.com/reference/node-selection/set-operators#unions\n union_selected_dbt_resources = ["--select"] + [" ".join(selected_dbt_resources)]\n\n logger.info(\n "A dbt subsetted execution is being performed. Overriding default dbt selection"\n f" arguments `{default_dbt_selection}` with arguments: `{union_selected_dbt_resources}`"\n )\n\n return union_selected_dbt_resources\n\n\ndef get_dbt_resource_props_by_output_name(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n node_info_by_dbt_unique_id = get_dbt_resource_props_by_dbt_unique_id_from_manifest(manifest)\n\n return {\n output_name_fn(node): node\n for node in node_info_by_dbt_unique_id.values()\n if node["resource_type"] in ASSET_RESOURCE_TYPES\n }\n
", "current_page_name": "_modules/dagster_dbt/core/resources_v2", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.resources_v2"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.core.types

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._check as check\n\nfrom ..types import DbtOutput\n\n\n
[docs]class DbtCliOutput(DbtOutput):\n """The results of executing a dbt command, along with additional metadata about the dbt CLI\n process that was run.\n\n This class is deprecated, because it's only produced by methods of the DbtCliClientResource class,\n which is deprecated in favor of DbtCliResource.\n\n Note that users should not construct instances of this class directly. This class is intended\n to be constructed from the JSON output of dbt commands.\n\n Attributes:\n command (str): The full shell command that was executed.\n return_code (int): The return code of the dbt CLI process.\n raw_output (str): The raw output (``stdout``) of the dbt CLI process.\n logs (List[Dict[str, Any]]): List of parsed JSON logs produced by the dbt command.\n result (Optional[Dict[str, Any]]): Dictionary containing dbt-reported result information\n contained in run_results.json. Some dbt commands do not produce results, and will\n therefore have result = None.\n docs_url (Optional[str]): Hostname where dbt docs are being served for this project.\n """\n\n def __init__(\n self,\n command: str,\n return_code: int,\n raw_output: str,\n logs: Sequence[Mapping[str, Any]],\n result: Mapping[str, Any],\n docs_url: Optional[str] = None,\n ):\n self._command = check.str_param(command, "command")\n self._return_code = check.int_param(return_code, "return_code")\n self._raw_output = check.str_param(raw_output, "raw_output")\n self._logs = check.sequence_param(logs, "logs", of_type=dict)\n self._docs_url = check.opt_str_param(docs_url, "docs_url")\n super().__init__(result)\n\n @property\n def command(self) -> str:\n return self._command\n\n @property\n def return_code(self) -> int:\n return self._return_code\n\n @property\n def raw_output(self) -> str:\n return self._raw_output\n\n @property\n def logs(self) -> Sequence[Mapping[str, Any]]:\n return self._logs\n\n @property\n def docs_url(self) -> Optional[str]:\n return self._docs_url
\n
", "current_page_name": "_modules/dagster_dbt/core/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.core.types"}}, "dagster_dbt_translator": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dagster_dbt_translator

\nfrom dataclasses import dataclass\nfrom typing import Any, Mapping, Optional\n\nfrom dagster import AssetKey, AutoMaterializePolicy, FreshnessPolicy\nfrom dagster._annotations import public\nfrom dagster._core.definitions.events import (\n    CoercibleToAssetKeyPrefix,\n    check_opt_coercible_to_asset_key_prefix_param,\n)\n\nfrom .asset_utils import (\n    default_asset_key_fn,\n    default_auto_materialize_policy_fn,\n    default_description_fn,\n    default_freshness_policy_fn,\n    default_group_from_dbt_resource_props,\n    default_metadata_from_dbt_resource_props,\n)\n\n\n
[docs]class DagsterDbtTranslator:\n """Holds a set of methods that derive Dagster asset definition metadata given a representation\n of a dbt resource (models, tests, sources, etc).\n\n This class is exposed so that methods can be overriden to customize how Dagster asset metadata\n is derived.\n """\n\n
[docs] @classmethod\n @public\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster asset key that represents that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom asset key for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n AssetKey: The Dagster asset key for the dbt resource.\n\n Examples:\n Adding a prefix to the default asset key generated for each dbt resource:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n return super().get_asset_key(dbt_resource_props).with_prefix("prefix")\n\n Adding a prefix to the default asset key generated for each dbt resource, but only for dbt sources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster import AssetKey\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n asset_key = super().get_asset_key(dbt_resource_props)\n\n if dbt_resource_props["resource_type"] == "source":\n asset_key = asset_key.with_prefix("my_prefix")\n\n return asset_key\n """\n return default_asset_key_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster description for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom description for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n str: The description for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n return "custom description"\n """\n return default_description_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster metadata for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom metadata for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Mapping[str, Any]: A dictionary representing the Dagster metadata for the dbt resource.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n return {"custom": "metadata"}\n """\n return default_metadata_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster group name for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom group name for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[str]: A Dagster group name.\n\n Examples:\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n return "custom_group_prefix" + dbt_resource_props.get("config", {}).get("group")\n """\n return default_group_from_dbt_resource_props(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_freshness_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[FreshnessPolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.FreshnessPolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom freshness policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[FreshnessPolicy]: A Dagster freshness policy.\n\n Examples:\n Set a custom freshness policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n return FreshnessPolicy(maximum_lag_minutes=60)\n\n Set a custom freshness policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n freshness_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n freshness_policy = FreshnessPolicy(maximum_lag_minutes=60)\n\n return freshness_policy\n """\n return default_freshness_policy_fn(dbt_resource_props)
\n\n
[docs] @classmethod\n @public\n def get_auto_materialize_policy(\n cls, dbt_resource_props: Mapping[str, Any]\n ) -> Optional[AutoMaterializePolicy]:\n """A function that takes a dictionary representing properties of a dbt resource, and\n returns the Dagster :py:class:`dagster.AutoMaterializePolicy` for that resource.\n\n Note that a dbt resource is unrelated to Dagster's resource concept, and simply represents\n a model, seed, snapshot or source in a given dbt project. You can learn more about dbt\n resources and the properties available in this dictionary here:\n https://docs.getdbt.com/reference/artifacts/manifest-json#resource-details\n\n This method can be overridden to provide a custom auto-materialize policy for a dbt resource.\n\n Args:\n dbt_resource_props (Mapping[str, Any]): A dictionary representing the dbt resource.\n\n Returns:\n Optional[AutoMaterializePolicy]: A Dagster auto-materialize policy.\n\n Examples:\n Set a custom auto-materialize policy for all dbt resources:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n return AutoMaterializePolicy.eager()\n\n Set a custom auto-materialize policy for dbt resources with a specific tag:\n\n .. code-block:: python\n\n from typing import Any, Mapping\n\n from dagster_dbt import DagsterDbtTranslator\n\n\n class CustomDagsterDbtTranslator(DagsterDbtTranslator):\n @classmethod\n def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n auto_materialize_policy = None\n if "my_custom_tag" in dbt_resource_props.get("tags", []):\n auto_materialize_policy = AutoMaterializePolicy.eager()\n\n return auto_materialize_policy\n\n """\n return default_auto_materialize_policy_fn(dbt_resource_props)
\n\n\nclass KeyPrefixDagsterDbtTranslator(DagsterDbtTranslator):\n """A DagsterDbtTranslator that applies prefixes to the asset keys generated from dbt resources.\n\n Attributes:\n asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt models,\n seeds, snapshots, etc. This will *not* apply to dbt sources.\n source_asset_key_prefix (Optional[Union[str, Sequence[str]]]): A prefix to apply to all dbt\n sources.\n """\n\n def __init__(\n self,\n asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n source_asset_key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ):\n self._asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(asset_key_prefix, "asset_key_prefix")\n or []\n )\n self._source_asset_key_prefix = (\n check_opt_coercible_to_asset_key_prefix_param(\n source_asset_key_prefix, "source_asset_key_prefix"\n )\n or []\n )\n\n @public\n def get_asset_key(self, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n base_key = default_asset_key_fn(dbt_resource_props)\n if dbt_resource_props["resource_type"] == "source":\n return base_key.with_prefix(self._source_asset_key_prefix)\n else:\n return base_key.with_prefix(self._asset_key_prefix)\n\n\n@dataclass\nclass DbtManifestWrapper:\n manifest: Mapping[str, Any]\n
", "current_page_name": "_modules/dagster_dbt/dagster_dbt_translator", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dagster_dbt_translator"}, "dbt_manifest_asset_selection": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_manifest_asset_selection

\nfrom typing import AbstractSet, Optional\n\nfrom dagster import (\n    AssetKey,\n    AssetSelection,\n    _check as check,\n)\nfrom dagster._core.definitions.asset_graph import AssetGraph\n\nfrom .asset_utils import is_non_asset_node\nfrom .dagster_dbt_translator import DagsterDbtTranslator\nfrom .dbt_manifest import DbtManifestParam, validate_manifest\nfrom .utils import (\n    ASSET_RESOURCE_TYPES,\n    get_dbt_resource_props_by_dbt_unique_id_from_manifest,\n    select_unique_ids_from_manifest,\n)\n\n\n
[docs]class DbtManifestAssetSelection(AssetSelection):\n """Defines a selection of assets from a dbt manifest wrapper and a dbt selection string.\n\n Args:\n manifest (Mapping[str, Any]): The dbt manifest blob.\n select (str): A dbt selection string to specify a set of dbt resources.\n exclude (Optional[str]): A dbt selection string to exclude a set of dbt resources.\n\n Examples:\n .. code-block:: python\n\n import json\n from pathlib import Path\n\n from dagster_dbt import DbtManifestAssetSelection\n\n manifest = json.loads(Path("path/to/manifest.json").read_text())\n\n # select the dbt assets that have the tag "foo".\n my_selection = DbtManifestAssetSelection(manifest=manifest, select="tag:foo")\n """\n\n def __init__(\n self,\n manifest: DbtManifestParam,\n select: str = "fqn:*",\n *,\n dagster_dbt_translator: Optional[DagsterDbtTranslator] = None,\n exclude: Optional[str] = None,\n ) -> None:\n self.manifest = validate_manifest(manifest)\n self.select = check.str_param(select, "select")\n self.exclude = check.opt_str_param(exclude, "exclude", default="")\n self.dagster_dbt_translator = check.opt_inst_param(\n dagster_dbt_translator,\n "dagster_dbt_translator",\n DagsterDbtTranslator,\n DagsterDbtTranslator(),\n )\n\n def resolve_inner(self, asset_graph: AssetGraph) -> AbstractSet[AssetKey]:\n dbt_nodes = get_dbt_resource_props_by_dbt_unique_id_from_manifest(self.manifest)\n\n keys = set()\n for unique_id in select_unique_ids_from_manifest(\n select=self.select,\n exclude=self.exclude,\n manifest_json=self.manifest,\n ):\n dbt_resource_props = dbt_nodes[unique_id]\n is_dbt_asset = dbt_resource_props["resource_type"] in ASSET_RESOURCE_TYPES\n if is_dbt_asset and not is_non_asset_node(dbt_resource_props):\n asset_key = self.dagster_dbt_translator.get_asset_key(dbt_resource_props)\n keys.add(asset_key)\n\n return keys
\n
", "current_page_name": "_modules/dagster_dbt/dbt_manifest_asset_selection", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_manifest_asset_selection"}, "dbt_resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.dbt_resource

\nimport logging\nfrom abc import abstractmethod\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import get_dagster_logger\n\nfrom .types import DbtOutput\n\n\nclass DbtClient:\n    """Base class for a client allowing users to interface with dbt."""\n\n    def __init__(\n        self,\n        logger: Optional[logging.Logger] = None,\n    ):\n        """Constructor.\n\n        Args:\n            logger (Optional[Any]): A property for injecting a logger dependency.\n                Default is ``None``.\n        """\n        self._logger = logger or get_dagster_logger()\n\n    def _format_params(\n        self, flags: Mapping[str, Any], replace_underscores: bool = False\n    ) -> Mapping[str, Any]:\n        """Reformats arguments that are easier to express as a list into the format that dbt expects,\n        and deletes and keys with no value.\n        """\n        # remove any keys with a value of None\n        if replace_underscores:\n            flags = {k.replace("_", "-"): v for k, v in flags.items() if v is not None}\n        else:\n            flags = {k: v for k, v in flags.items() if v is not None}\n\n        for param in ["select", "exclude", "models"]:\n            if param in flags:\n                if isinstance(flags[param], list):\n                    # if it's a list, format as space-separated\n                    flags[param] = " ".join(set(flags[param]))\n\n        return flags\n\n    @property\n    def logger(self) -> logging.Logger:\n        """logging.Logger: A property for injecting a logger dependency."""\n        return self._logger\n\n    @abstractmethod\n    def compile(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``compile`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in compilation.\n            exclude (List[str]), optional): the models to exclude from compilation.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``run`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in the run.\n            exclude (List[str]), optional): the models to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def snapshot(\n        self,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``snapshot`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def test(\n        self,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        data: bool = True,\n        schema: bool = True,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``test`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            models (List[str], optional): the models to include in testing.\n            exclude (List[str], optional): the models to exclude from testing.\n            data (bool, optional): If ``True`` (default), then run data tests.\n            schema (bool, optional): If ``True`` (default), then run schema tests.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def seed(\n        self,\n        show: bool = False,\n        select: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``seed`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            show (bool, optional): If ``True``, then show a sample of the seeded data in the\n                response. Defaults to ``False``.\n            select (List[str], optional): the snapshots to include in the run.\n            exclude (List[str], optional): the snapshots to exclude from the run.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def ls(\n        self,\n        select: Optional[Sequence[str]] = None,\n        models: Optional[Sequence[str]] = None,\n        exclude: Optional[Sequence[str]] = None,\n        **kwargs,\n    ) -> DbtOutput:\n        """Run the ``ls`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the resources to include in the output.\n            models (List[str], optional): the models to include in the output.\n            exclude (List[str], optional): the resources to exclude from the output.\n\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def build(self, select: Optional[Sequence[str]] = None, **kwargs) -> DbtOutput:\n        """Run the ``build`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            select (List[str], optional): the models/resources to include in the run.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n        raise NotImplementedError()\n\n    @abstractmethod\n    def generate_docs(self, compile_project: bool = False, **kwargs) -> DbtOutput:\n        """Run the ``docs generate`` command on a dbt project. kwargs are passed in as additional parameters.\n\n        Args:\n            compile_project (bool, optional): If true, compile the project before generating a catalog.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def run_operation(\n        self, macro: str, args: Optional[Mapping[str, Any]] = None, **kwargs\n    ) -> DbtOutput:\n        """Run the ``run-operation`` command on a dbt project. 
kwargs are passed in as additional parameters.\n\n        Args:\n            macro (str): the dbt macro to invoke.\n            args (Dict[str, Any], optional): the keyword arguments to be supplied to the macro.\n\n        Returns:\n            DbtOutput: object containing parsed output from dbt\n        """\n\n    @abstractmethod\n    def get_run_results_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the run_results.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the run_results json file\n                for this dbt project.\n        """\n\n    @abstractmethod\n    def get_manifest_json(self, **kwargs) -> Optional[Mapping[str, Any]]:\n        """Get a parsed version of the manifest.json file for the relevant dbt project.\n\n        Returns:\n            Dict[str, Any]: dictionary containing the parsed contents of the manifest json file\n                for this dbt project.\n        """\n\n\n
[docs]class DbtResource(DbtClient):\n pass
\n
", "current_page_name": "_modules/dagster_dbt/dbt_resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.dbt_resource"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.errors

\nimport warnings\nfrom abc import ABC\nfrom typing import Any, Mapping, Optional, Sequence\n\nfrom dagster import (\n    DagsterInvariantViolationError,\n    Failure,\n    MetadataValue,\n    _check as check,\n)\n\n\n
[docs]class DagsterDbtError(Failure, ABC):\n """The base exception of the ``dagster-dbt`` library."""
\n\n\n
[docs]class DagsterDbtCliUnexpectedOutputError(DagsterDbtError):\n """Represents an error when parsing the output of a dbt CLI command."""\n\n invalid_line_nos: Sequence[int]\n\n def __init__(self, invalid_line_nos: Sequence[int]):\n check.sequence_param(invalid_line_nos, "invalid_line_nos", int)\n line_nos_str = ", ".join(map(str, invalid_line_nos))\n description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"\n metadata = {\n "Invalid CLI Output Line Numbers": MetadataValue.json({"line_nos": invalid_line_nos})\n }\n super().__init__(description, metadata=metadata)\n self.invalid_line_nos = invalid_line_nos
\n\n\n
[docs]class DagsterDbtCliRuntimeError(DagsterDbtError, ABC):\n """Represents an error while executing a dbt CLI command."""\n\n def __init__(\n self,\n description: str,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n if logs is not None:\n warnings.warn(\n "`logs` is a deprecated argument to DagsterDbtCliRuntimeError and will be discarded"\n )\n if raw_output is not None:\n warnings.warn(\n "`raw_output` is a deprecated argument to DagsterDbtCliRuntimeError and will be"\n " discarded"\n )\n metadata = {"Parsed CLI Messages": "\\n".join(messages or [])}\n super().__init__(description, metadata=metadata)
\n\n\n
[docs]class DagsterDbtCliHandledRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a model error reported by the dbt CLI at runtime (return code 1)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__("Handled error in the dbt CLI (return code 1)", logs, raw_output, messages)
\n\n\n
[docs]class DagsterDbtCliFatalRuntimeError(DagsterDbtCliRuntimeError):\n """Represents a fatal error in the dbt CLI (return code 2)."""\n\n def __init__(\n self,\n logs: Optional[Sequence[Mapping[str, Any]]] = None,\n raw_output: Optional[str] = None,\n messages: Optional[Sequence[str]] = None,\n ):\n super().__init__(\n "Fatal error in the dbt CLI (return code 2): " + " ".join(messages or []),\n logs,\n raw_output,\n messages,\n )
\n\n\n
[docs]class DagsterDbtCliOutputsNotFoundError(DagsterDbtError):\n """Represents a problem in finding the ``target/run_results.json`` artifact when executing a dbt\n CLI command.\n\n For more details on ``target/run_results.json``, see\n https://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.\n """\n\n def __init__(self, path: str):\n super().__init__(f"Expected to find file at path {path}")
\n\n\nclass DagsterDbtCloudJobInvariantViolationError(DagsterDbtError, DagsterInvariantViolationError):\n """Represents an error when a dbt Cloud job is not supported by the ``dagster-dbt`` library."""\n
", "current_page_name": "_modules/dagster_dbt/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.errors"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import Config, In, Nothing, Out, Output, op\nfrom pydantic import Field\n\nfrom .types import DbtOutput\nfrom .utils import generate_events, generate_materializations\n\n_DEFAULT_OP_PROPS: Dict[str, Any] = dict(\n    required_resource_keys={"dbt"},\n    ins={"start_after": In(Nothing)},\n    out=Out(DbtOutput, description="Parsed output from running the dbt command."),\n    tags={"kind": "dbt"},\n)\n\n\ndef _get_doc(op_name: str, dbt_command: str) -> str:\n    return f"""\nThis op executes a ``dbt {dbt_command}`` command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the :py:class:`~dagster_dbt.dbt_cli_resource`).\n\nExamples:\n\n.. code-block:: python\n\n    from dagster import job\n    from dagster_dbt import {op_name}, dbt_cli_resource\n\n    @job(resource_defs={{"dbt":dbt_cli_resource}})\n    def my_dbt_cli_job():\n        {op_name}()\n    """\n\n\n# NOTE: mypy fails to properly track the type of `_DEFAULT_OP_PROPS` items when they are\n# double-splatted, so we type-ignore the below op declarations.\n\n\nclass DbtBuildOpConfig(Config):\n    yield_asset_events: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations and asset observations corresponding to the results of "\n            "the dbt operation will be yielded when the op executes. 
Default: True"\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n@op(**_DEFAULT_OP_PROPS)\ndef dbt_build_op(context, config: DbtBuildOpConfig) -> Any:\n    dbt_output = context.resources.dbt.build()\n    if config.yield_asset_events and "results" in dbt_output.result:\n        yield from generate_events(\n            dbt_output,\n            node_info_to_asset_key=lambda info: config.asset_key_prefix\n            + info["unique_id"].split("."),\n            manifest_json=context.resources.dbt.get_manifest_json(),\n        )\n    yield Output(dbt_output)\n\n\nclass DbtRunOpConfig(Config):\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the dbt operation will "\n            "be yielded when the op executes. Default: True"\n        ),\n    )\n    asset_key_prefix: Optional[List[str]] = Field(\n        default=["dbt"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_run_op(context, config: DbtRunOpConfig):\n dbt_output = context.resources.dbt.run()\n if config.yield_materializations and "results" in dbt_output.result:\n yield from generate_materializations(dbt_output, asset_key_prefix=config.asset_key_prefix)\n yield Output(dbt_output)
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_compile_op(context):\n return context.resources.dbt.compile()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_ls_op(context):\n return context.resources.dbt.ls()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_test_op(context):\n return context.resources.dbt.test()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_snapshot_op(context):\n return context.resources.dbt.snapshot()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_seed_op(context):\n return context.resources.dbt.seed()
\n\n\n
[docs]@op(**_DEFAULT_OP_PROPS)\ndef dbt_docs_generate_op(context):\n return context.resources.dbt.generate_docs()
\n\n\nfor dbt_op, cmd in [\n (dbt_build_op, "build"),\n (dbt_run_op, "run"),\n (dbt_compile_op, "compile"),\n (dbt_ls_op, "ls"),\n (dbt_test_op, "test"),\n (dbt_snapshot_op, "snapshot"),\n (dbt_seed_op, "seed"),\n (dbt_docs_generate_op, "docs generate"),\n]:\n dbt_op.__doc__ = _get_doc(dbt_op.name, cmd)\n
", "current_page_name": "_modules/dagster_dbt/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.ops"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.types

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\n\n\n
[docs]class DbtOutput:\n """Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, `result`, which\n represents the dbt-formatted result of the command that was run (if any).\n\n Used internally, should not be instantiated directly by the user.\n """\n\n def __init__(self, result: Mapping[str, Any]):\n self._result = check.mapping_param(result, "result", key_type=str)\n\n @property\n def result(self) -> Mapping[str, Any]:\n return self._result\n\n @property\n def docs_url(self) -> Optional[str]:\n return None
\n
", "current_page_name": "_modules/dagster_dbt/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_dbt.utils

\nfrom pathlib import Path\nfrom typing import (\n    AbstractSet,\n    Any,\n    Callable,\n    Dict,\n    Iterator,\n    Mapping,\n    Optional,\n    Sequence,\n    Union,\n    cast,\n)\n\nimport dateutil\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    AssetObservation,\n    MetadataValue,\n    Output,\n    _check as check,\n)\nfrom dagster._core.definitions.metadata import RawMetadataValue\n\nfrom .types import DbtOutput\n\n# dbt resource types that may be considered assets\nASSET_RESOURCE_TYPES = ["model", "seed", "snapshot"]\n\n\ndef default_node_info_to_asset_key(node_info: Mapping[str, Any]) -> AssetKey:\n    return AssetKey(node_info["unique_id"].split("."))\n\n\ndef _resource_type(unique_id: str) -> str:\n    # returns the type of the node (e.g. model, test, snapshot)\n    return unique_id.split(".")[0]\n\n\ndef input_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # * can be present when sources are sharded tables\n    return dbt_resource_props["unique_id"].replace(".", "_").replace("*", "_star")\n\n\ndef output_name_fn(dbt_resource_props: Mapping[str, Any]) -> str:\n    # hyphens are valid in dbt model names, but not in output names\n    return dbt_resource_props["unique_id"].split(".")[-1].replace("-", "_")\n\n\ndef _node_result_to_metadata(node_result: Mapping[str, Any]) -> Mapping[str, RawMetadataValue]:\n    return {\n        "Materialization Strategy": node_result["config"]["materialized"],\n        "Database": node_result["database"],\n        "Schema": node_result["schema"],\n        "Alias": node_result["alias"],\n        "Description": node_result["description"],\n    }\n\n\ndef _timing_to_metadata(timings: Sequence[Mapping[str, Any]]) -> Mapping[str, RawMetadataValue]:\n    metadata: Dict[str, RawMetadataValue] = {}\n    for timing in timings:\n        if timing["name"] == "execute":\n            desc = "Execution"\n        elif timing["name"] == "compile":\n            desc = "Compilation"\n        else:\n     
       continue\n\n        # dateutil does not properly expose its modules to static checkers\n        started_at = dateutil.parser.isoparse(timing["started_at"])  # type: ignore\n        completed_at = dateutil.parser.isoparse(timing["completed_at"])  # type: ignore\n        duration = completed_at - started_at\n        metadata.update(\n            {\n                f"{desc} Started At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Completed At": started_at.isoformat(timespec="seconds"),\n                f"{desc} Duration": duration.total_seconds(),\n            }\n        )\n    return metadata\n\n\ndef result_to_events(\n    result: Mapping[str, Any],\n    docs_url: Optional[str] = None,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n    extra_metadata: Optional[Mapping[str, RawMetadataValue]] = None,\n    generate_asset_outputs: bool = False,\n) -> Iterator[Union[AssetMaterialization, AssetObservation, Output]]:\n    """This is a hacky solution that attempts to consolidate parsing many of the potential formats\n    that dbt can provide its results in. 
This is known to work for CLI Outputs for dbt versions 0.18+,\n    as well as RPC responses for a similar time period, but as the RPC response schema is not documented\n    nor enforced, this can become out of date easily.\n    """\n    node_info_to_asset_key = check.opt_callable_param(\n        node_info_to_asset_key, "node_info_to_asset_key", default=default_node_info_to_asset_key\n    )\n\n    # status comes from set of fields rather than "status"\n    if "fail" in result:\n        status = (\n            "fail"\n            if result.get("fail")\n            else "skip" if result.get("skip") else "error" if result.get("error") else "success"\n        )\n    else:\n        status = result["status"]\n\n    # all versions represent timing the same way\n    metadata = {"Status": status, "Execution Time (seconds)": result["execution_time"]}\n    metadata.update(_timing_to_metadata(result["timing"]))\n\n    # working with a response that contains the node block (RPC and CLI 0.18.x)\n    if "node" in result:\n        unique_id = result["node"]["unique_id"]\n        metadata.update(_node_result_to_metadata(result["node"]))\n    else:\n        unique_id = result["unique_id"]\n\n    if docs_url:\n        metadata["docs_url"] = MetadataValue.url(f"{docs_url}#!/model/{unique_id}")\n\n    if extra_metadata:\n        metadata.update(extra_metadata)\n\n    # if you have a manifest available, get the full node info, otherwise just populate unique_id\n    dbt_resource_props = (\n        manifest_json["nodes"][unique_id] if manifest_json else {"unique_id": unique_id}\n    )\n\n    node_resource_type = _resource_type(unique_id)\n\n    if node_resource_type in ASSET_RESOURCE_TYPES and status == "success":\n        if generate_asset_outputs:\n            yield Output(\n                value=None,\n                output_name=output_name_fn(dbt_resource_props),\n                metadata=metadata,\n            )\n        else:\n            yield AssetMaterialization(\n                
asset_key=node_info_to_asset_key(dbt_resource_props),\n                description=f"dbt node: {unique_id}",\n                metadata=metadata,\n            )\n    # can only associate tests with assets if we have manifest_json available\n    elif node_resource_type == "test" and manifest_json and status != "skipped":\n        upstream_unique_ids = manifest_json["nodes"][unique_id]["depends_on"]["nodes"]\n        # tests can apply to multiple asset keys\n        for upstream_id in upstream_unique_ids:\n            # the upstream id can reference a node or a source\n            dbt_resource_props = manifest_json["nodes"].get(upstream_id) or manifest_json[\n                "sources"\n            ].get(upstream_id)\n            if dbt_resource_props is None:\n                continue\n            upstream_asset_key = node_info_to_asset_key(dbt_resource_props)\n            yield AssetObservation(\n                asset_key=upstream_asset_key,\n                metadata={\n                    "Test ID": result["unique_id"],\n                    "Test Status": status,\n                    "Test Message": result.get("message") or "",\n                },\n            )\n\n\ndef generate_events(\n    dbt_output: DbtOutput,\n    node_info_to_asset_key: Optional[Callable[[Mapping[str, Any]], AssetKey]] = None,\n    manifest_json: Optional[Mapping[str, Any]] = None,\n) -> Iterator[Union[AssetMaterialization, AssetObservation]]:\n    """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n    a dbt command, and :py:class:`dagster.AssetObservation` events for each test run.\n\n    Information parsed from a :py:class:`~DbtOutput` object.\n    """\n    for result in dbt_output.result["results"]:\n        for event in result_to_events(\n            result,\n            docs_url=dbt_output.docs_url,\n            node_info_to_asset_key=node_info_to_asset_key,\n            manifest_json=manifest_json,\n        ):\n            yield 
check.inst(\n                cast(Union[AssetMaterialization, AssetObservation], event),\n                (AssetMaterialization, AssetObservation),\n            )\n\n\n
[docs]def generate_materializations(\n dbt_output: DbtOutput,\n asset_key_prefix: Optional[Sequence[str]] = None,\n) -> Iterator[AssetMaterialization]:\n """This function yields :py:class:`dagster.AssetMaterialization` events for each model updated by\n a dbt command.\n\n Information parsed from a :py:class:`~DbtOutput` object.\n\n Examples:\n .. code-block:: python\n\n from dagster import op, Output\n from dagster_dbt.utils import generate_materializations\n from dagster_dbt import dbt_cli_resource\n\n @op(required_resource_keys={"dbt"})\n def my_custom_dbt_run(context):\n dbt_output = context.resources.dbt.run()\n for materialization in generate_materializations(dbt_output):\n # you can modify the materialization object to add extra metadata, if desired\n yield materialization\n yield Output(my_dbt_output)\n\n @job(resource_defs={{"dbt":dbt_cli_resource}})\n def my_dbt_cli_job():\n my_custom_dbt_run()\n """\n asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n for event in generate_events(\n dbt_output,\n node_info_to_asset_key=lambda info: AssetKey(\n asset_key_prefix + info["unique_id"].split(".")\n ),\n ):\n yield check.inst(cast(AssetMaterialization, event), AssetMaterialization)
\n\n\ndef select_unique_ids_from_manifest(\n select: str,\n exclude: str,\n state_path: Optional[str] = None,\n manifest_json_path: Optional[str] = None,\n manifest_json: Optional[Mapping[str, Any]] = None,\n manifest_parsed: Optional[Any] = None,\n) -> AbstractSet[str]:\n """Method to apply a selection string to an existing manifest.json file."""\n import dbt.graph.cli as graph_cli\n import dbt.graph.selector as graph_selector\n from dbt.contracts.graph.manifest import Manifest, WritableManifest\n from dbt.contracts.state import PreviousState\n from dbt.graph.selector_spec import IndirectSelection, SelectionSpec\n from networkx import DiGraph\n\n if state_path is not None:\n previous_state = PreviousState(\n path=Path(state_path), # type: ignore # (unused path, slated for deletion)\n current_path=( # type: ignore # (unused path, slated for deletion)\n Path("/tmp/null") if manifest_json_path is None else Path(manifest_json_path)\n ),\n )\n else:\n previous_state = None\n\n if manifest_json_path is not None:\n manifest = WritableManifest.read_and_check_versions(manifest_json_path)\n child_map = manifest.child_map\n elif manifest_json is not None:\n\n class _DictShim(dict):\n """Shim to enable hydrating a dictionary into a dot-accessible object."""\n\n def __getattr__(self, item):\n ret = super().get(item)\n # allow recursive access e.g. 
foo.bar.baz\n return _DictShim(ret) if isinstance(ret, dict) else ret\n\n manifest = Manifest(\n # dbt expects dataclasses that can be accessed with dot notation, not bare dictionaries\n nodes={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["nodes"].items() # type: ignore\n },\n sources={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["sources"].items() # type: ignore\n },\n metrics={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["metrics"].items() # type: ignore\n },\n exposures={\n unique_id: _DictShim(info) for unique_id, info in manifest_json["exposures"].items() # type: ignore\n },\n )\n child_map = manifest_json["child_map"]\n elif manifest_parsed is not None:\n manifest = manifest_parsed\n child_map = manifest.child_map\n else:\n check.failed("Must provide either a manifest_json_path, manifest_json, or manifest_parsed.")\n graph = graph_selector.Graph(DiGraph(incoming_graph_data=child_map))\n\n # create a parsed selection from the select string\n try:\n from dbt.flags import GLOBAL_FLAGS\n except ImportError:\n # dbt < 1.5.0 compat\n import dbt.flags as GLOBAL_FLAGS\n setattr(GLOBAL_FLAGS, "INDIRECT_SELECTION", IndirectSelection.Eager)\n setattr(GLOBAL_FLAGS, "WARN_ERROR", True)\n parsed_spec: SelectionSpec = graph_cli.parse_union([select], True)\n\n if exclude:\n parsed_spec = graph_cli.SelectionDifference(\n components=[parsed_spec, graph_cli.parse_union([exclude], True)]\n )\n\n # execute this selection against the graph\n selector = graph_selector.NodeSelector(graph, manifest, previous_state=previous_state)\n selected, _ = selector.select_nodes(parsed_spec)\n return selected\n\n\ndef get_dbt_resource_props_by_dbt_unique_id_from_manifest(\n manifest: Mapping[str, Any]\n) -> Mapping[str, Mapping[str, Any]]:\n """A mapping of a dbt node's unique id to the node's dictionary representation in the manifest."""\n return {\n **manifest["nodes"],\n **manifest["sources"],\n **manifest["exposures"],\n 
**manifest["metrics"],\n }\n
", "current_page_name": "_modules/dagster_dbt/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_dbt.utils"}}, "dagster_docker": {"docker_executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_executor

\nfrom typing import Iterator, Optional, cast\n\nimport dagster._check as check\nimport docker\nimport docker.errors\nfrom dagster import Field, IntSource, executor\nfrom dagster._annotations import experimental\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import StepDelegatingExecutor\nfrom dagster._core.executor.step_delegating.step_handler.base import (\n    CheckStepHealthResult,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._core.origin import JobPythonOrigin\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteStepArgs\nfrom dagster._serdes.utils import hash_str\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\n\n
[docs]@executor(\n name="docker",\n config_schema=merge_dicts(\n DOCKER_CONFIG_SCHEMA,\n {\n "retries": get_retries_config(),\n "max_concurrent": Field(\n IntSource,\n is_required=False,\n description=(\n "Limit on the number of containers that will run concurrently within the scope "\n "of a Dagster run. Note that this limit is per run, not global."\n ),\n ),\n "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n },\n ),\n requirements=multiple_process_executor_requirements(),\n)\n@experimental\ndef docker_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Docker containers.\n\n To use the `docker_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_executor.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n registry: ...\n network: ...\n networks: ...\n container_kwargs: ...\n\n If you're using the DockerRunLauncher, configuration set on the containers created by the run\n launcher will also be set on the containers that are created for each step.\n """\n config = init_context.executor_config\n image = check.opt_str_elem(config, "image")\n registry = check.opt_dict_elem(config, "registry", key_type=str)\n env_vars = check.opt_list_elem(config, "env_vars", of_type=str)\n network = check.opt_str_elem(config, "network")\n networks = check.opt_list_elem(config, "networks", of_type=str)\n container_kwargs = check.opt_dict_elem(config, "container_kwargs", key_type=str)\n retries = check.dict_elem(config, "retries", key_type=str)\n max_concurrent = check.opt_int_elem(config, "max_concurrent")\n tag_concurrency_limits = check.opt_list_elem(config, "tag_concurrency_limits")\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network 
and not networks:\n networks = [network]\n\n container_context = DockerContainerContext(\n registry=registry,\n env_vars=env_vars or [],\n networks=networks or [],\n container_kwargs=container_kwargs,\n )\n\n return StepDelegatingExecutor(\n DockerStepHandler(image, container_context),\n retries=check.not_none(RetryMode.from_config(retries)),\n max_concurrent=max_concurrent,\n tag_concurrency_limits=tag_concurrency_limits,\n )
\n\n\nclass DockerStepHandler(StepHandler):\n def __init__(\n self,\n image: Optional[str],\n container_context: DockerContainerContext,\n ):\n super().__init__()\n\n self._image = check.opt_str_param(image, "image")\n self._container_context = check.inst_param(\n container_context, "container_context", DockerContainerContext\n )\n\n def _get_image(self, step_handler_context: StepHandlerContext):\n from . import DockerRunLauncher\n\n image = cast(\n JobPythonOrigin, step_handler_context.dagster_run.job_code_origin\n ).repository_origin.container_image\n if not image:\n image = self._image\n\n run_launcher = step_handler_context.instance.run_launcher\n\n if not image and isinstance(run_launcher, DockerRunLauncher):\n image = run_launcher.image\n\n if not image:\n raise Exception("No docker image specified by the executor config or repository")\n\n return image\n\n def _get_docker_container_context(self, step_handler_context: StepHandlerContext):\n # This doesn't vary per step: would be good to have a hook where it can be set once\n # for the whole StepHandler but we need access to the DagsterRun for that\n\n from .docker_run_launcher import DockerRunLauncher\n\n run_launcher = step_handler_context.instance.run_launcher\n run_target = DockerContainerContext.create_for_run(\n step_handler_context.dagster_run,\n run_launcher if isinstance(run_launcher, DockerRunLauncher) else None,\n )\n\n merged_container_context = run_target.merge(self._container_context)\n\n validate_docker_config(\n network=None,\n networks=merged_container_context.networks,\n container_kwargs=merged_container_context.container_kwargs,\n )\n\n return merged_container_context\n\n @property\n def name(self) -> str:\n return "DockerStepHandler"\n\n def _get_client(self, docker_container_context: DockerContainerContext):\n client = docker.client.from_env()\n if docker_container_context.registry:\n client.login(\n registry=docker_container_context.registry["url"],\n 
username=docker_container_context.registry["username"],\n password=docker_container_context.registry["password"],\n )\n return client\n\n def _get_container_name(self, execute_step_args: ExecuteStepArgs):\n run_id = execute_step_args.run_id\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n step_name = f"dagster-step-{hash_str(run_id + step_key)}"\n\n if execute_step_args.known_state:\n retry_state = execute_step_args.known_state.get_retry_state()\n retry_number = retry_state.get_attempt_count(step_key)\n if retry_number:\n step_name = f"{step_name}-{retry_number}"\n\n return step_name\n\n def _create_step_container(\n self,\n client,\n container_context,\n step_image,\n step_handler_context: StepHandlerContext,\n ):\n execute_step_args = step_handler_context.execute_step_args\n step_keys_to_execute = check.not_none(execute_step_args.step_keys_to_execute)\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n env_vars["DAGSTER_RUN_JOB_NAME"] = step_handler_context.dagster_run.job_name\n env_vars["DAGSTER_RUN_STEP_KEY"] = step_key\n return client.containers.create(\n step_image,\n name=self._get_container_name(execute_step_args),\n detach=True,\n network=container_context.networks[0] if len(container_context.networks) else None,\n command=execute_step_args.get_command_args(),\n environment=env_vars,\n **container_context.container_kwargs,\n )\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n step_image = self._get_image(step_handler_context)\n 
validate_docker_image(step_image)\n\n try:\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n except docker.errors.ImageNotFound:\n client.images.pull(step_image)\n step_container = self._create_step_container(\n client, container_context, step_image, step_handler_context\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(step_container)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message="Launching step in Docker container.",\n metadata={\n "Docker container id": step_container.id,\n },\n )\n step_container.start()\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n container_context = self._get_docker_container_context(step_handler_context)\n\n client = self._get_client(container_context)\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n container = client.containers.get(container_name)\n\n if container.status == "running":\n return CheckStepHealthResult.healthy()\n\n try:\n container_info = container.wait(timeout=0.1)\n except Exception as e:\n raise Exception(\n f"Container status is {container.status}. Raised exception attempting to get its"\n " return code."\n ) from e\n\n ret_code = container_info.get("StatusCode")\n if ret_code == 0:\n return CheckStepHealthResult.healthy()\n\n return CheckStepHealthResult.unhealthy(\n reason=f"Container status is {container.status}. 
Return code is {ret_code}."\n )\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n container_context = self._get_docker_container_context(step_handler_context)\n\n step_keys_to_execute = check.not_none(\n step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert (\n len(step_keys_to_execute) == 1\n ), "Terminating multiple steps is not currently supported"\n step_key = step_keys_to_execute[0]\n\n container_name = self._get_container_name(step_handler_context.execute_step_args)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Stopping Docker container {container_name} for step.",\n event_specific_data=EngineEventData(),\n )\n\n client = self._get_client(container_context)\n\n container = client.containers.get(container_name)\n\n container.stop()\n
", "current_page_name": "_modules/dagster_docker/docker_executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_executor"}, "docker_run_launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.docker_run_launcher

\nfrom typing import Any, Mapping, Optional\n\nimport dagster._check as check\nimport docker\nfrom dagster._core.launcher.base import (\n    CheckRunHealthResult,\n    LaunchRunContext,\n    ResumeRunContext,\n    RunLauncher,\n    WorkerStatus,\n)\nfrom dagster._core.storage.dagster_run import DagsterRun\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._core.utils import parse_env_var\nfrom dagster._grpc.types import ExecuteRunArgs, ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass\nfrom dagster._serdes.config_class import ConfigurableClassData\nfrom typing_extensions import Self\n\nfrom dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_config, validate_docker_image\n\nfrom .container_context import DockerContainerContext\n\nDOCKER_CONTAINER_ID_TAG = "docker/container_id"\n\n\n
[docs]class DockerRunLauncher(RunLauncher, ConfigurableClass):\n """Launches runs in a Docker container."""\n\n def __init__(\n self,\n inst_data: Optional[ConfigurableClassData] = None,\n image=None,\n registry=None,\n env_vars=None,\n network=None,\n networks=None,\n container_kwargs=None,\n ):\n self._inst_data = inst_data\n self.image = image\n self.registry = registry\n self.env_vars = env_vars\n\n validate_docker_config(network, networks, container_kwargs)\n\n if network:\n self.networks = [network]\n elif networks:\n self.networks = networks\n else:\n self.networks = []\n\n self.container_kwargs = check.opt_dict_param(\n container_kwargs, "container_kwargs", key_type=str\n )\n\n super().__init__()\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return DOCKER_CONFIG_SCHEMA\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return DockerRunLauncher(inst_data=inst_data, **config_value)\n\n def get_container_context(self, dagster_run: DagsterRun) -> DockerContainerContext:\n return DockerContainerContext.create_for_run(dagster_run, self)\n\n def _get_client(self, container_context: DockerContainerContext):\n client = docker.client.from_env()\n if container_context.registry:\n client.login(\n registry=container_context.registry["url"],\n username=container_context.registry["username"],\n password=container_context.registry["password"],\n )\n return client\n\n def _get_docker_image(self, job_code_origin):\n docker_image = job_code_origin.repository_origin.container_image\n\n if not docker_image:\n docker_image = self.image\n\n if not docker_image:\n raise Exception("No docker image specified by the instance config or repository")\n\n validate_docker_image(docker_image)\n return docker_image\n\n def _launch_container_with_command(self, run, docker_image, command):\n container_context = self.get_container_context(run)\n docker_env = 
dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n docker_env["DAGSTER_RUN_JOB_NAME"] = run.job_name\n\n client = self._get_client(container_context)\n\n try:\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n except docker.errors.ImageNotFound:\n client.images.pull(docker_image)\n container = client.containers.create(\n image=docker_image,\n command=command,\n detach=True,\n environment=docker_env,\n network=container_context.networks[0] if len(container_context.networks) else None,\n **container_context.container_kwargs,\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n self._instance.report_engine_event(\n message=f"Launching run in a new container {container.id} with image {docker_image}",\n dagster_run=run,\n cls=self.__class__,\n )\n\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_CONTAINER_ID_TAG: container.id, DOCKER_IMAGE_TAG: docker_image},\n )\n\n container.start()\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ExecuteRunArgs(\n job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_code_origin = check.not_none(context.job_code_origin)\n docker_image = self._get_docker_image(job_code_origin)\n\n command = ResumeRunArgs(\n 
job_origin=job_code_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n ).get_command_args()\n\n self._launch_container_with_command(run, docker_image, command)\n\n def _get_container(self, run):\n if not run or run.is_finished:\n return None\n\n container_id = run.tags.get(DOCKER_CONTAINER_ID_TAG)\n\n if not container_id:\n return None\n\n container_context = self.get_container_context(run)\n\n try:\n return self._get_client(container_context).containers.get(container_id)\n except Exception:\n return None\n\n def terminate(self, run_id):\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container = self._get_container(run)\n\n if not container:\n self._instance.report_engine_event(\n message="Unable to get docker container to send termination request to.",\n dagster_run=run,\n cls=self.__class__,\n )\n return False\n\n container.stop()\n\n return True\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n def check_run_worker_health(self, run: DagsterRun):\n container = self._get_container(run)\n if container is None:\n return CheckRunHealthResult(WorkerStatus.NOT_FOUND)\n if container.status == "running":\n return CheckRunHealthResult(WorkerStatus.RUNNING)\n return CheckRunHealthResult(\n WorkerStatus.FAILED, msg=f"Container status is {container.status}"\n )
\n
", "current_page_name": "_modules/dagster_docker/docker_run_launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.docker_run_launcher"}, "ops": {"docker_container_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_docker.ops.docker_container_op

\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport docker\nfrom dagster import Field, In, Nothing, OpExecutionContext, StringSource, op\nfrom dagster._annotations import experimental\nfrom dagster._core.utils import parse_env_var\nfrom dagster._serdes.utils import hash_str\n\nfrom ..container_context import DockerContainerContext\nfrom ..docker_run_launcher import DockerRunLauncher\nfrom ..utils import DOCKER_CONFIG_SCHEMA, validate_docker_image\n\nDOCKER_CONTAINER_OP_CONFIG = {\n    **DOCKER_CONFIG_SCHEMA,\n    "image": Field(\n        StringSource,\n        is_required=True,\n        description="The image in which to run the Docker container.",\n    ),\n    "entrypoint": Field(\n        [str],\n        is_required=False,\n        description="The ENTRYPOINT for the Docker container",\n    ),\n    "command": Field(\n        [str],\n        is_required=False,\n        description="The command to run in the container within the launched Docker container.",\n    ),\n}\n\n\ndef _get_client(docker_container_context: DockerContainerContext):\n    client = docker.client.from_env()\n    if docker_container_context.registry:\n        client.login(\n            registry=docker_container_context.registry["url"],\n            username=docker_container_context.registry["username"],\n            password=docker_container_context.registry["password"],\n        )\n    return client\n\n\ndef _get_container_name(run_id, op_name, retry_number):\n    container_name = hash_str(run_id + op_name)\n\n    if retry_number > 0:\n        container_name = f"{container_name}-{retry_number}"\n\n    return container_name\n\n\ndef _create_container(\n    op_context: OpExecutionContext,\n    client,\n    container_context: DockerContainerContext,\n    image: str,\n    entrypoint: Optional[Sequence[str]],\n    command: Optional[Sequence[str]],\n):\n    env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])\n    return client.containers.create(\n        
image,\n        name=_get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),\n        detach=True,\n        network=container_context.networks[0] if len(container_context.networks) else None,\n        entrypoint=entrypoint,\n        command=command,\n        environment=env_vars,\n        **container_context.container_kwargs,\n    )\n\n\n
[docs]@experimental\ndef execute_docker_container(\n context: OpExecutionContext,\n image: str,\n entrypoint: Optional[Sequence[str]] = None,\n command: Optional[Sequence[str]] = None,\n networks: Optional[Sequence[str]] = None,\n registry: Optional[Mapping[str, str]] = None,\n env_vars: Optional[Sequence[str]] = None,\n container_kwargs: Optional[Mapping[str, Any]] = None,\n):\n """This function is a utility for executing a Docker container from within a Dagster op.\n\n Args:\n image (str): The image to use for the launched Docker container.\n entrypoint (Optional[Sequence[str]]): The ENTRYPOINT to run in the launched Docker\n container. Default: None.\n command (Optional[Sequence[str]]): The CMD to run in the launched Docker container.\n Default: None.\n networks (Optional[Sequence[str]]): Names of the Docker networks to which to connect the\n launched container. Default: None.\n registry: (Optional[Mapping[str, str]]): Information for using a non local/public Docker\n registry. Can have "url", "username", or "password" keys.\n env_vars (Optional[Sequence[str]]): List of environemnt variables to include in the launched\n container. ach can be of the form KEY=VALUE or just KEY (in which case the value will be\n pulled from the calling environment.\n container_kwargs (Optional[Dict[str[Any]]]): key-value pairs that can be passed into\n containers.create in the Docker Python API. 
See\n https://docker-py.readthedocs.io/en/stable/containers.html for the full list\n of available options.\n """\n run_container_context = DockerContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, DockerRunLauncher)\n else None\n ),\n )\n\n validate_docker_image(image)\n\n op_container_context = DockerContainerContext(\n registry=registry, env_vars=env_vars, networks=networks, container_kwargs=container_kwargs\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n client = _get_client(container_context)\n\n try:\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n except docker.errors.ImageNotFound:\n client.images.pull(image)\n container = _create_container(\n context, client, container_context, image, entrypoint, command\n )\n\n if len(container_context.networks) > 1:\n for network_name in container_context.networks[1:]:\n network = client.networks.get(network_name)\n network.connect(container)\n\n container.start()\n\n for line in container.logs(stdout=True, stderr=True, stream=True, follow=True):\n print(line) # noqa: T201\n\n exit_status = container.wait()["StatusCode"]\n\n if exit_status != 0:\n raise Exception(f"Docker container returned exit code {exit_status}")
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=DOCKER_CONTAINER_OP_CONFIG)\n@experimental\ndef docker_container_op(context):\n """An op that runs a Docker container using the docker Python API.\n\n Contrast with the `docker_executor`, which runs each Dagster op in a Dagster job in its\n own Docker container.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in docker.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-docker/dagster_docker_tests/test_example_docker_container_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_docker_container` function\n inside your own op.\n """\n execute_docker_container(context, **context.op_config)
\n
", "current_page_name": "_modules/dagster_docker/ops/docker_container_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_docker.ops.docker_container_op"}}}, "dagster_duckdb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Optional, Sequence, Type, cast\n\nimport duckdb\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\nDUCKDB_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_duckdb_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n DuckDB tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import build_duckdb_io_manager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n duckdb_io_manager = build_duckdb_io_manager([DuckDBPandasTypeHandler()])\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key. For ops, the schema can be\n specified by including a "schema" entry in output metadata. If none of these is provided, the schema will\n default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=DuckDBIOManager.to_config_schema())\n def duckdb_io_manager(init_context):\n """IO Manager for storing outputs in a DuckDB database.\n\n Assets will be stored in the schema and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the schema specified by output metadata (defaults to public) in a\n table of the name of the output.\n """\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=DuckDbClient(),\n io_manager_name="DuckDBIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return duckdb_io_manager
\n\n\n
[docs]class DuckDBIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to DuckDB.\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If none\n of these is provided, the schema will default to "public".\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame):\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n database: str = Field(description="Path to the DuckDB database.")\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=DuckDbClient(),\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n io_manager_name="DuckDBIOManager",\n )
\n\n\nclass DuckDbClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except duckdb.CatalogException:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.execute(f"create schema if not exists {table_slice.schema};")\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"SELECT {col_str} FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.schema}.{table_slice.table}"""\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={"database": context.resource_config["database"], "read_only": False},\n max_retries=10,\n )\n\n yield conn\n\n conn.close()\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = f"DELETE FROM {table_slice.schema}.{table_slice.table} WHERE\\n"\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"DELETE FROM {table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if isinstance(partition_dimension.partitions, TimeWindow)\n else 
_static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(DUCKDB_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(DUCKDB_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_duckdb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.io_manager"}, "resource": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb.resource

\nfrom contextlib import contextmanager\n\nimport duckdb\nfrom dagster import ConfigurableResource\nfrom dagster._utils.backoff import backoff\nfrom pydantic import Field\n\n\n
[docs]class DuckDBResource(ConfigurableResource):\n """Resource for interacting with a DuckDB database.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_duckdb import DuckDBResource\n\n @asset\n def my_table(duckdb: DuckDBResource):\n with duckdb.get_connection() as conn:\n conn.execute("SELECT * from MY_SCHEMA.MY_TABLE")\n\n defs = Definitions(\n assets=[my_table],\n resources={"duckdb": DuckDBResource(database="path/to/db.duckdb")}\n )\n\n """\n\n database: str = Field(\n description=(\n "Path to the DuckDB database. Setting database=':memory:' will use an in-memory"\n " database "\n )\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_connection(self):\n conn = backoff(\n fn=duckdb.connect,\n retry_on=(RuntimeError, duckdb.IOException),\n kwargs={"database": self.database, "read_only": False},\n max_retries=10,\n )\n\n yield conn\n\n conn.close()
\n
", "current_page_name": "_modules/dagster_duckdb/resource", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb.resource"}}, "dagster_duckdb_pandas": {"duckdb_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pandas.duckdb_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\n\n\n
[docs]class DuckDBPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Stores and loads Pandas DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in duckdb."""\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n return connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nduckdb_pandas_io_manager = build_duckdb_io_manager(\n [DuckDBPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nduckdb_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the duckdb_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pandas import duckdb_pandas_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pandas_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPandasIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\n using the DuckDBPandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pandas import DuckDBPandasIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPandasIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pandas/duckdb_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pandas.duckdb_pandas_type_handler"}}, "dagster_duckdb_polars": {"duckdb_polars_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_polars.duckdb_polars_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport polars as pl\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import DuckDbClient, DuckDBIOManager, build_duckdb_io_manager\n\n\n
[docs]class DuckDBPolarsTypeHandler(DbTypeHandler[pl.DataFrame]):\n """Stores and loads Polars DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_polars import DuckDBPolarsTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pl.DataFrame, connection\n ):\n """Stores the polars DataFrame in duckdb."""\n obj_arrow = obj.to_arrow() # noqa: F841 # need obj_arrow symbol to exist for duckdb query\n connection.execute(f"create schema if not exists {table_slice.schema};")\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " obj_arrow;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from obj_arrow"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype))\n for name, dtype in zip(obj.columns, obj.dtypes)\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pl.DataFrame:\n """Loads the input as a Polars DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pl.DataFrame()\n select_statement = connection.execute(\n DuckDbClient.get_select_statement(table_slice=table_slice)\n )\n duckdb_to_arrow = select_statement.arrow()\n return pl.DataFrame(duckdb_to_arrow)\n\n @property\n def supported_types(self):\n return [pl.DataFrame]
\n\n\nduckdb_polars_io_manager = build_duckdb_io_manager(\n [DuckDBPolarsTypeHandler()], default_load_type=pl.DataFrame\n)\nduckdb_polars_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes polars dataframes to DuckDB. When\nusing the duckdb_polars_io_manager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_polars import duckdb_polars_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_polars_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPolarsIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\n using the DuckDBPolarsIOManager, any inputs and outputs without type annotations will be loaded\n as Polars DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_polars import DuckDBPolarsIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pl.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPolarsIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pl.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPolarsTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pl.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_polars/duckdb_polars_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_polars.duckdb_polars_type_handler"}}, "dagster_duckdb_pyspark": {"duckdb_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_duckdb_pyspark.duckdb_pyspark_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pyarrow as pa\nimport pyspark\nimport pyspark.sql\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_duckdb.io_manager import (\n    DuckDbClient,\n    DuckDBIOManager,\n    build_duckdb_io_manager,\n)\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef pyspark_df_to_arrow_table(df: pyspark.sql.DataFrame) -> pa.Table:\n    """Converts a PySpark DataFrame to a PyArrow Table."""\n    # `_collect_as_arrow` API call sourced from:\n    #   https://stackoverflow.com/questions/73203318/how-to-transform-spark-dataframe-to-polars-dataframe\n    return pa.Table.from_batches(df._collect_as_arrow())  # noqa: SLF001\n\n\n
[docs]class DuckDBPySparkTypeHandler(DbTypeHandler[pyspark.sql.DataFrame]):\n """Stores PySpark DataFrames in DuckDB.\n\n To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.\n\n Example:\n .. code-block:: python\n\n from dagster_duckdb import DuckDBIOManager\n from dagster_duckdb_pyspark import DuckDBPySparkTypeHandler\n\n class MyDuckDBIOManager(DuckDBIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in duckdb\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n )\n """\n\n def handle_output(\n self,\n context: OutputContext,\n table_slice: TableSlice,\n obj: pyspark.sql.DataFrame,\n connection,\n ):\n """Stores the given object at the provided filepath."""\n pa_df = pyspark_df_to_arrow_table(obj) # noqa: F841\n connection.execute(\n f"create table if not exists {table_slice.schema}.{table_slice.table} as select * from"\n " pa_df;"\n )\n if not connection.fetchall():\n # table was not created, therefore already exists. 
Insert the data\n connection.execute(\n f"insert into {table_slice.schema}.{table_slice.table} select * from pa_df;"\n )\n\n context.add_output_metadata(\n {\n "row_count": obj.count(),\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) for name, dtype in obj.dtypes\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pyspark.sql.DataFrame:\n """Loads the return of the query as the correct type."""\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n pd_df = connection.execute(DuckDbClient.get_select_statement(table_slice)).fetchdf()\n return spark.createDataFrame(pd_df)\n\n @property\n def supported_types(self):\n return [pyspark.sql.DataFrame]
\n\n\nduckdb_pyspark_io_manager = build_duckdb_io_manager(\n [DuckDBPySparkTypeHandler()], default_load_type=pyspark.sql.DataFrame\n)\nduckdb_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_duckdb_pyspark import duckdb_pyspark_io_manager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n @repository\n def my_repo():\n return with_resources(\n [my_table],\n {"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class DuckDBPySparkIOManager(DuckDBIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\n using the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_duckdb_pyspark import DuckDBPySparkIOManager\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in DuckDB\n )\n def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pyspark.sql.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [DuckDBPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pyspark.sql.DataFrame
\n
", "current_page_name": "_modules/dagster_duckdb_pyspark/duckdb_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_duckdb_pyspark.duckdb_pyspark_type_handler"}}, "dagster_embedded_elt": {"sling": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.asset_defs

\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom dagster import (\n    AssetExecutionContext,\n    AssetsDefinition,\n    AssetSpec,\n    MaterializeResult,\n    multi_asset,\n)\nfrom dagster._annotations import experimental\n\nfrom dagster_embedded_elt.sling.resources import SlingMode, SlingResource\n\n\n
[docs]@experimental\ndef build_sling_asset(\n asset_spec: AssetSpec,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[Union[str, List[str]]] = None,\n update_key: Optional[Union[str, List[str]]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n sling_resource_key: str = "sling",\n) -> AssetsDefinition:\n """Asset Factory for using Sling to sync data from a source stream to a target object.\n\n Args:\n asset_spec (AssetSpec): The AssetSpec to use to materialize this asset.\n source_stream (str): The source stream to sync from. This can be a table, a query, or a path.\n target_object (str): The target object to sync to. This can be a table, or a path.\n mode (SlingMode, optional): The sync mode to use when syncing. Defaults to SlingMode.FULL_REFRESH.\n primary_key (Optional[Union[str, List[str]]], optional): The optional primary key to use when syncing.\n update_key (Optional[Union[str, List[str]]], optional): The optional update key to use when syncing.\n source_options (Optional[Dict[str, Any]], optional): Any optional Sling source options to use when syncing.\n target_options (Optional[Dict[str, Any]], optional): Any optional target options to use when syncing.\n sling_resource_key (str, optional): The resource key for the SlingResource. Defaults to "sling".\n\n Examples:\n Creating a Sling asset that syncs from a file to a table:\n\n .. code-block:: python\n\n asset_spec = AssetSpec(key=["main", "dest_tbl"])\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="file:///tmp/test.csv",\n target_object="main.dest_table",\n mode=SlingMode.INCREMENTAL,\n primary_key="id"\n )\n\n Creating a Sling asset that syncs from a table to a file with a full refresh:\n\n .. 
code-block:: python\n\n asset_spec = AssetSpec(key="test.csv")\n asset_def = build_sling_asset(\n asset_spec=asset_spec,\n source_stream="main.dest_table",\n target_object="file:///tmp/test.csv",\n mode=SlingMode.FULL_REFRESH\n )\n\n\n """\n if primary_key is not None and not isinstance(primary_key, list):\n primary_key = [primary_key]\n\n if update_key is not None and not isinstance(update_key, list):\n update_key = [update_key]\n\n @multi_asset(\n compute_kind="sling", specs=[asset_spec], required_resource_keys={sling_resource_key}\n )\n def sync(context: AssetExecutionContext) -> MaterializeResult:\n sling: SlingResource = getattr(context.resources, sling_resource_key)\n last_row_count_observed = None\n for stdout_line in sling.sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n ):\n match = re.search(r"(\\d+) rows", stdout_line)\n if match:\n last_row_count_observed = int(match.group(1))\n context.log.info(stdout_line)\n\n return MaterializeResult(\n metadata=(\n {} if last_row_count_observed is None else {"row_count": last_row_count_observed}\n )\n )\n\n return sync
\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.asset_defs"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_embedded_elt.sling.resources

\nimport contextlib\nimport json\nimport re\nfrom enum import Enum\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Any, Dict, Generator, List, Optional\n\nfrom dagster import ConfigurableResource, PermissiveConfig, get_dagster_logger\nfrom dagster._annotations import experimental\nfrom dagster._utils.env import environ\nfrom pydantic import Field\nfrom sling import Sling  # type: ignore\n\nlogger = get_dagster_logger()\n\n\nclass SlingMode(str, Enum):\n    """The mode to use when syncing.\n\n    See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n    """\n\n    INCREMENTAL = "incremental"\n    TRUNCATE = "truncate"\n    FULL_REFRESH = "full-refresh"\n    SNAPSHOT = "snapshot"\n\n\n
[docs]class SlingSourceConnection(PermissiveConfig):\n """A Sling Source Connection defines the source connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n Creating a Sling Source for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingSourceConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n source = SlingSourceConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema")\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n """\n\n type: str = Field(description="Type of the source connection. Use 'file' for local storage.")\n connection_string: Optional[str] = Field(\n description="The connection string for the source database."\n )
\n\n\n
[docs]class SlingTargetConnection(PermissiveConfig):\n """A Sling Target Connection defines the target connection used by :py:class:`~dagster_elt.sling.SlingResource`.\n\n Examples:\n Creating a Sling Target for a file, such as CSV or JSON:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="file")\n\n Create a Sling Source for a Postgres database, using a connection string:\n\n .. code-block:: python\n\n source = SlingTargetConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema"\n source = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n\n Create a Sling Source for a Postgres database, using keyword arguments, as described here:\n https://docs.slingdata.io/connections/database-connections/postgres\n\n .. code-block::python\n\n source = SlingTargetConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n\n\n """\n\n type: str = Field(\n description="Type of the destination connection. Use 'file' for local storage."\n )\n connection_string: Optional[str] = Field(\n description="The connection string for the target database."\n )
\n\n\n
[docs]@experimental\nclass SlingResource(ConfigurableResource):\n """Resource for interacting with the Sling package.\n\n Examples:\n .. code-block:: python\n\n from dagster_etl.sling import SlingResource\n sling_resource = SlingResource(\n source_connection=SlingSourceConnection(\n type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING")\n ),\n target_connection=SlingTargetConnection(\n type="snowflake",\n host="host",\n user="user",\n database="database",\n password="password",\n role="role",\n ),\n )\n\n """\n\n source_connection: SlingSourceConnection\n target_connection: SlingTargetConnection\n\n @contextlib.contextmanager\n def _setup_config(self) -> Generator[None, None, None]:\n """Uses environment variables to set the Sling source and target connections."""\n sling_source = self.source_connection.dict()\n sling_target = self.target_connection.dict()\n if self.source_connection.connection_string:\n sling_source["url"] = self.source_connection.connection_string\n if self.target_connection.connection_string:\n sling_target["url"] = self.target_connection.connection_string\n with environ(\n {\n "SLING_SOURCE": json.dumps(sling_source),\n "SLING_TARGET": json.dumps(sling_target),\n }\n ):\n yield\n\n @staticmethod\n def _exec_sling_cmd(cmd, stdin=None, stdout=PIPE, stderr=STDOUT) -> Generator[str, None, None]:\n ansi_escape = re.compile(r"\\x1B(?:[@-Z\\\\-_]|\\[[0-?]*[ -/]*[@-~])")\n with Popen(cmd, shell=True, stdin=stdin, stdout=stdout, stderr=stderr) as proc:\n assert proc.stdout\n\n for line in proc.stdout:\n fmt_line = str(line, "utf-8")\n clean_line = ansi_escape.sub("", fmt_line).replace("INF", "")\n yield clean_line\n\n proc.wait()\n if proc.returncode != 0:\n raise Exception("Sling command failed with error code %s", proc.returncode)\n\n def _sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode = SlingMode.FULL_REFRESH,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n 
source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Runs a Sling sync from the given source table to the given destination table. Generates\n output lines from the Sling CLI.\n """\n if self.source_connection.type == "file" and not source_stream.startswith("file://"):\n source_stream = "file://" + source_stream\n\n if self.target_connection.type == "file" and not target_object.startswith("file://"):\n target_object = "file://" + target_object\n\n with self._setup_config():\n config = {\n "source": {\n "conn": "SLING_SOURCE",\n "stream": source_stream,\n "primary_key": primary_key,\n "update_key": update_key,\n "options": source_options,\n },\n "target": {\n "conn": "SLING_TARGET",\n "object": target_object,\n "options": target_options,\n },\n }\n config["source"] = {k: v for k, v in config["source"].items() if v is not None}\n config["target"] = {k: v for k, v in config["target"].items() if v is not None}\n\n sling_cli = Sling(**config)\n logger.info("Starting Sling sync with mode: %s", mode)\n cmd = sling_cli._prep_cmd() # noqa: SLF001\n\n yield from self._exec_sling_cmd(cmd)\n\n def sync(\n self,\n source_stream: str,\n target_object: str,\n mode: SlingMode,\n primary_key: Optional[List[str]] = None,\n update_key: Optional[List[str]] = None,\n source_options: Optional[Dict[str, Any]] = None,\n target_options: Optional[Dict[str, Any]] = None,\n ) -> Generator[str, None, None]:\n """Initiate a Sling Sync between a source stream and a target object.\n\n Args:\n source_stream (str): The source stream to read from. For database sources, the source stream can be either\n a table name, a SQL statement or a path to a SQL file e.g. `TABLE1` or `SCHEMA1.TABLE2` or\n `SELECT * FROM TABLE`. For file sources, the source stream is a path or an url to a file.\n For file targets, the target object is a path or a url to a file, e.g. 
file:///tmp/file.csv or\n s3://my_bucket/my_folder/file.csv\n target_object (str): The target object to write into. For database targets, the target object is a table\n name, e.g. TABLE1, SCHEMA1.TABLE2. For file targets, the target object is a path or an url to a file.\n mode (SlingMode): The Sling mode to use when syncing, i.e. incremental, full-refresh\n See the Sling docs for more information: https://docs.slingdata.io/sling-cli/running-tasks#modes.\n primary_key (str): For incremental syncs, a primary key is used during merge statements to update\n existing rows.\n update_key (str): For incremental syncs, an update key is used to stream records after max(update_key)\n source_options (Dict[str, Any]): Other source options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#source-options-src-options-flag-source.options-key\n for details\n target_options (Dict[str, Any[): Other target options to pass to Sling,\n see https://docs.slingdata.io/sling-cli/running-tasks#target-options-tgt-options-flag-target.options-key\n for details\n\n Examples:\n Sync from a source file to a sqlite database:\n\n .. 
code-block:: python\n\n sqllite_path = "/path/to/sqlite.db"\n csv_path = "/path/to/file.csv"\n\n @asset\n def run_sync(context, sling: SlingResource):\n res = sling.sync(\n source_stream=csv_path,\n target_object="events",\n mode=SlingMode.FULL_REFRESH,\n )\n for stdout in res:\n context.log.debug(stdout)\n counts = sqlite3.connect(sqllitepath).execute("SELECT count(1) FROM events").fetchone()\n assert counts[0] == 3\n\n source = SlingSourceConnection(\n type="file",\n )\n target = SlingTargetConnection(type="sqlite", instance=sqllitepath)\n\n materialize(\n [run_sync],\n resources={\n "sling": SlingResource(\n source_connection=source,\n target_connection=target,\n mode=SlingMode.TRUNCATE,\n )\n },\n )\n\n """\n yield from self._sync(\n source_stream=source_stream,\n target_object=target_object,\n mode=mode,\n primary_key=primary_key,\n update_key=update_key,\n source_options=source_options,\n target_options=target_options,\n )
\n
", "current_page_name": "_modules/dagster_embedded_elt/sling/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_embedded_elt.sling.resources"}}}, "dagster_fivetran": {"asset_defs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.asset_defs

\nimport hashlib\nimport inspect\nimport re\nfrom functools import partial\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    List,\n    Mapping,\n    NamedTuple,\n    Optional,\n    Sequence,\n    Set,\n    Union,\n    cast,\n)\n\nfrom dagster import (\n    AssetKey,\n    AssetOut,\n    AssetsDefinition,\n    Nothing,\n    OpExecutionContext,\n    Output,\n    _check as check,\n    multi_asset,\n)\nfrom dagster._core.definitions.cacheable_assets import (\n    AssetsDefinitionCacheableData,\n    CacheableAssetsDefinition,\n)\nfrom dagster._core.definitions.events import CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.metadata import MetadataUserInput\nfrom dagster._core.definitions.resource_definition import ResourceDefinition\nfrom dagster._core.errors import DagsterStepOutputNotFoundError\nfrom dagster._core.execution.context.init import build_init_resource_context\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.utils import (\n    generate_materializations,\n    get_fivetran_connector_url,\n    metadata_for_table,\n)\n\n\ndef _build_fivetran_assets(\n    connector_id: str,\n    destination_tables: Sequence[str],\n    poll_interval: float = DEFAULT_POLL_INTERVAL,\n    poll_timeout: Optional[float] = None,\n    io_manager_key: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n    table_to_asset_key_map: Optional[Mapping[str, AssetKey]] = None,\n    resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n    group_name: Optional[str] = None,\n    infer_missing_tables: bool = False,\n    op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n    asset_key_prefix = check.opt_sequence_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n    tracked_asset_keys = {\n        table: AssetKey([*asset_key_prefix, *table.split(".")]) for table in 
destination_tables\n    }\n    user_facing_asset_keys = table_to_asset_key_map or tracked_asset_keys\n\n    _metadata_by_table_name = check.opt_mapping_param(\n        metadata_by_table_name, "metadata_by_table_name", key_type=str\n    )\n\n    @multi_asset(\n        name=f"fivetran_sync_{connector_id}",\n        outs={\n            "_".join(key.path): AssetOut(\n                io_manager_key=io_manager_key,\n                key=user_facing_asset_keys[table],\n                metadata=_metadata_by_table_name.get(table),\n                dagster_type=Nothing,\n            )\n            for table, key in tracked_asset_keys.items()\n        },\n        compute_kind="fivetran",\n        resource_defs=resource_defs,\n        group_name=group_name,\n        op_tags=op_tags,\n    )\n    def _assets(context: OpExecutionContext, fivetran: FivetranResource) -> Any:\n        fivetran_output = fivetran.sync_and_poll(\n            connector_id=connector_id,\n            poll_interval=poll_interval,\n            poll_timeout=poll_timeout,\n        )\n\n        materialized_asset_keys = set()\n        for materialization in generate_materializations(\n            fivetran_output, asset_key_prefix=asset_key_prefix\n        ):\n            # scan through all tables actually created, if it was expected then emit an Output.\n            # otherwise, emit a runtime AssetMaterialization\n            if materialization.asset_key in tracked_asset_keys.values():\n                yield Output(\n                    value=None,\n                    output_name="_".join(materialization.asset_key.path),\n                    metadata=materialization.metadata,\n                )\n                materialized_asset_keys.add(materialization.asset_key)\n\n            else:\n                yield materialization\n\n        unmaterialized_asset_keys = set(tracked_asset_keys.values()) - materialized_asset_keys\n        if infer_missing_tables:\n            for asset_key in 
unmaterialized_asset_keys:\n                yield Output(\n                    value=None,\n                    output_name="_".join(asset_key.path),\n                )\n\n        else:\n            if unmaterialized_asset_keys:\n                asset_key = next(iter(unmaterialized_asset_keys))\n                output_name = "_".join(asset_key.path)\n                raise DagsterStepOutputNotFoundError(\n                    f"Core compute for {context.op_def.name} did not return an output for"\n                    f' non-optional output "{output_name}".',\n                    step_key=context.get_step_execution_context().step.key,\n                    output_name=output_name,\n                )\n\n    return [_assets]\n\n\n
[docs]def build_fivetran_assets(\n connector_id: str,\n destination_tables: Sequence[str],\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n io_manager_key: Optional[str] = None,\n asset_key_prefix: Optional[Sequence[str]] = None,\n metadata_by_table_name: Optional[Mapping[str, MetadataUserInput]] = None,\n group_name: Optional[str] = None,\n infer_missing_tables: bool = False,\n op_tags: Optional[Mapping[str, Any]] = None,\n) -> Sequence[AssetsDefinition]:\n """Build a set of assets for a given Fivetran connector.\n\n Returns an AssetsDefinition which connects the specified ``asset_keys`` to the computation that\n will update them. Internally, executes a Fivetran sync for a given ``connector_id``, and\n polls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\n :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to communicate with the\n Fivetran API.\n\n Args:\n connector_id (str): The Fivetran Connector ID that this op will sync. You can retrieve this\n value from the "Setup" tab of a given connector in the Fivetran UI.\n destination_tables (List[str]): `schema_name.table_name` for each table that you want to be\n represented in the Dagster asset graph for this connection.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. 
By default, this will never time out.\n io_manager_key (Optional[str]): The io_manager to be used to handle each of these assets.\n asset_key_prefix (Optional[List[str]]): A prefix for the asset keys inside this asset.\n If left blank, assets will have a key of `AssetKey([schema_name, table_name])`.\n metadata_by_table_name (Optional[Mapping[str, MetadataUserInput]]): A mapping from destination\n table name to user-supplied metadata that should be associated with the asset for that table.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. This\n group name will be applied to all assets produced by this multi_asset.\n infer_missing_tables (bool): If True, will create asset materializations for tables specified\n in destination_tables even if they are not present in the Fivetran sync output. This is useful\n in cases where Fivetran does not sync any data for a table and therefore does not include it\n in the sync output API response.\n op_tags (Optional[Dict[str, Any]]):\n A dictionary of tags for the op that computes the asset. Frameworks may expect and\n require certain metadata to be attached to a op. Values that are not strings will be\n json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.\n\n **Examples:**\n\n Basic example:\n\n .. code-block:: python\n\n from dagster import AssetKey, repository, with_resources\n\n from dagster_fivetran import fivetran_resource\n from dagster_fivetran.assets import build_fivetran_assets\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n Attaching metadata:\n\n .. 
code-block:: python\n\n fivetran_assets = build_fivetran_assets(\n connector_id="foobar",\n table_names=["schema1.table1", "schema2.table2"],\n metadata_by_table_name={\n "schema1.table1": {\n "description": "This is a table that contains foo and bar",\n },\n "schema2.table2": {\n "description": "This is a table that contains baz and quux",\n },\n },\n )\n """\n return _build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=destination_tables,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n io_manager_key=io_manager_key,\n asset_key_prefix=asset_key_prefix,\n metadata_by_table_name=metadata_by_table_name,\n group_name=group_name,\n infer_missing_tables=infer_missing_tables,\n op_tags=op_tags,\n )
\n\n\nclass FivetranConnectionMetadata(\n NamedTuple(\n "_FivetranConnectionMetadata",\n [\n ("name", str),\n ("connector_id", str),\n ("connector_url", str),\n ("schemas", Mapping[str, Any]),\n ],\n )\n):\n def build_asset_defn_metadata(\n self,\n key_prefix: Sequence[str],\n group_name: Optional[str],\n table_to_asset_key_fn: Callable[[str], AssetKey],\n io_manager_key: Optional[str] = None,\n ) -> AssetsDefinitionCacheableData:\n schema_table_meta: Dict[str, MetadataUserInput] = {}\n if "schemas" in self.schemas:\n schemas_inner = cast(Dict[str, Any], self.schemas["schemas"])\n for schema in schemas_inner.values():\n if schema["enabled"]:\n schema_name = schema["name_in_destination"]\n schema_tables = cast(Dict[str, Dict[str, Any]], schema["tables"])\n for table in schema_tables.values():\n if table["enabled"]:\n table_name = table["name_in_destination"]\n schema_table_meta[f"{schema_name}.{table_name}"] = metadata_for_table(\n table, self.connector_url\n )\n else:\n schema_table_meta[self.name] = {}\n\n outputs = {\n table: AssetKey([*key_prefix, *list(table_to_asset_key_fn(table).path)])\n for table in schema_table_meta.keys()\n }\n\n internal_deps: Dict[str, Set[AssetKey]] = {}\n\n return AssetsDefinitionCacheableData(\n keys_by_input_name={},\n keys_by_output_name=outputs,\n internal_asset_deps=internal_deps,\n group_name=group_name,\n key_prefix=key_prefix,\n can_subset=False,\n metadata_by_output_name=schema_table_meta,\n extra_metadata={\n "connector_id": self.connector_id,\n "io_manager_key": io_manager_key,\n },\n )\n\n\ndef _build_fivetran_assets_from_metadata(\n assets_defn_meta: AssetsDefinitionCacheableData,\n resource_defs: Mapping[str, ResourceDefinition],\n poll_interval: float,\n poll_timeout: Optional[float] = None,\n) -> AssetsDefinition:\n metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)\n connector_id = cast(str, metadata["connector_id"])\n io_manager_key = cast(Optional[str], metadata["io_manager_key"])\n\n return 
_build_fivetran_assets(\n connector_id=connector_id,\n destination_tables=list(\n assets_defn_meta.keys_by_output_name.keys()\n if assets_defn_meta.keys_by_output_name\n else []\n ),\n asset_key_prefix=list(assets_defn_meta.key_prefix or []),\n metadata_by_table_name=cast(\n Dict[str, MetadataUserInput], assets_defn_meta.metadata_by_output_name\n ),\n io_manager_key=io_manager_key,\n table_to_asset_key_map=assets_defn_meta.keys_by_output_name,\n resource_defs=resource_defs,\n group_name=assets_defn_meta.group_name,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )[0]\n\n\nclass FivetranInstanceCacheableAssetsDefinition(CacheableAssetsDefinition):\n def __init__(\n self,\n fivetran_resource_def: Union[FivetranResource, ResourceDefinition],\n key_prefix: Sequence[str],\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]],\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]],\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]],\n connector_to_asset_key_fn: Optional[Callable[[FivetranConnectionMetadata, str], AssetKey]],\n poll_interval: float,\n poll_timeout: Optional[float],\n ):\n self._fivetran_resource_def = fivetran_resource_def\n self._fivetran_instance: FivetranResource = (\n fivetran_resource_def.process_config_and_initialize()\n if isinstance(fivetran_resource_def, FivetranResource)\n else fivetran_resource_def(build_init_resource_context())\n )\n\n self._key_prefix = key_prefix\n self._connector_to_group_fn = connector_to_group_fn\n self._connection_filter = connector_filter\n self._connector_to_io_manager_key_fn = connector_to_io_manager_key_fn\n self._connector_to_asset_key_fn: Callable[[FivetranConnectionMetadata, str], AssetKey] = (\n connector_to_asset_key_fn or (lambda _, table: AssetKey(path=table.split(".")))\n )\n self._poll_interval = poll_interval\n self._poll_timeout = poll_timeout\n\n contents = hashlib.sha1()\n contents.update(",".join(key_prefix).encode("utf-8"))\n if 
connector_filter:\n contents.update(inspect.getsource(connector_filter).encode("utf-8"))\n\n super().__init__(unique_id=f"fivetran-{contents.hexdigest()}")\n\n def _get_connectors(self) -> Sequence[FivetranConnectionMetadata]:\n output_connectors: List[FivetranConnectionMetadata] = []\n\n groups = self._fivetran_instance.make_request("GET", "groups")["items"]\n\n for group in groups:\n group_id = group["id"]\n\n connectors = self._fivetran_instance.make_request(\n "GET", f"groups/{group_id}/connectors"\n )["items"]\n for connector in connectors:\n connector_id = connector["id"]\n\n connector_name = connector["schema"]\n\n setup_state = connector.get("status", {}).get("setup_state")\n if setup_state and setup_state in ("incomplete", "broken"):\n continue\n\n connector_url = get_fivetran_connector_url(connector)\n\n schemas = self._fivetran_instance.make_request(\n "GET", f"connectors/{connector_id}/schemas"\n )\n\n output_connectors.append(\n FivetranConnectionMetadata(\n name=connector_name,\n connector_id=connector_id,\n connector_url=connector_url,\n schemas=schemas,\n )\n )\n\n return output_connectors\n\n def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:\n asset_defn_data: List[AssetsDefinitionCacheableData] = []\n for connector in self._get_connectors():\n if not self._connection_filter or self._connection_filter(connector):\n table_to_asset_key = partial(self._connector_to_asset_key_fn, connector)\n asset_defn_data.append(\n connector.build_asset_defn_metadata(\n key_prefix=self._key_prefix,\n group_name=(\n self._connector_to_group_fn(connector.name)\n if self._connector_to_group_fn\n else None\n ),\n io_manager_key=(\n self._connector_to_io_manager_key_fn(connector.name)\n if self._connector_to_io_manager_key_fn\n else None\n ),\n table_to_asset_key_fn=table_to_asset_key,\n )\n )\n\n return asset_defn_data\n\n def build_definitions(\n self, data: Sequence[AssetsDefinitionCacheableData]\n ) -> Sequence[AssetsDefinition]:\n return 
[\n _build_fivetran_assets_from_metadata(\n meta,\n {"fivetran": self._fivetran_instance.get_resource_definition()},\n poll_interval=self._poll_interval,\n poll_timeout=self._poll_timeout,\n )\n for meta in data\n ]\n\n\ndef _clean_name(name: str) -> str:\n """Cleans an input to be a valid Dagster asset name."""\n return re.sub(r"[^a-z0-9]+", "_", name.lower())\n\n\n
[docs]def load_assets_from_fivetran_instance(\n fivetran: Union[FivetranResource, ResourceDefinition],\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n connector_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,\n io_manager_key: Optional[str] = None,\n connector_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,\n connector_filter: Optional[Callable[[FivetranConnectionMetadata], bool]] = None,\n connector_to_asset_key_fn: Optional[\n Callable[[FivetranConnectionMetadata, str], AssetKey]\n ] = None,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n) -> CacheableAssetsDefinition:\n """Loads Fivetran connector assets from a configured FivetranResource instance. This fetches information\n about defined connectors at initialization time, and will error on workspace load if the Fivetran\n instance is not reachable.\n\n Args:\n fivetran (ResourceDefinition): A FivetranResource configured with the appropriate connection\n details.\n key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.\n connector_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset\n group name for a given Fivetran connector name. If None, no groups will be created. Defaults\n to a basic sanitization function.\n io_manager_key (Optional[str]): The IO manager key to use for all assets. Defaults to "io_manager".\n Use this if all assets should be loaded from the same source, otherwise use connector_to_io_manager_key_fn.\n connector_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an\n IO manager key for a given Fivetran connector name. When other ops are downstream of the loaded assets,\n the IOManager specified determines how the inputs to those ops are loaded. 
Defaults to "io_manager".\n connector_filter (Optional[Callable[[FivetranConnectorMetadata], bool]]): Optional function which takes\n in connector metadata and returns False if the connector should be excluded from the output assets.\n connector_to_asset_key_fn (Optional[Callable[[FivetranConnectorMetadata, str], AssetKey]]): Optional function\n which takes in connector metadata and a table name and returns an AssetKey for that table. Defaults to\n a function that generates an AssetKey matching the table name, split by ".".\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (Optional[float]): The maximum time that will waited before this operation is\n timed out. By default, this will never time out.\n\n **Examples:**\n\n Loading all Fivetran connectors as assets:\n\n .. code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(fivetran_instance)\n\n Filtering the set of loaded connectors:\n\n .. 
code-block:: python\n\n from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\n fivetran_instance = fivetran_resource.configured(\n {\n "api_key": "some_key",\n "api_secret": "some_secret",\n }\n )\n fivetran_assets = load_assets_from_fivetran_instance(\n fivetran_instance,\n connector_filter=lambda meta: "snowflake" in meta.name,\n )\n """\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)\n\n check.invariant(\n not io_manager_key or not connector_to_io_manager_key_fn,\n "Cannot specify both io_manager_key and connector_to_io_manager_key_fn",\n )\n if not connector_to_io_manager_key_fn:\n connector_to_io_manager_key_fn = lambda _: io_manager_key\n\n return FivetranInstanceCacheableAssetsDefinition(\n fivetran_resource_def=fivetran,\n key_prefix=key_prefix,\n connector_to_group_fn=connector_to_group_fn,\n connector_to_io_manager_key_fn=connector_to_io_manager_key_fn,\n connector_filter=connector_filter,\n connector_to_asset_key_fn=connector_to_asset_key_fn,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )
\n
", "current_page_name": "_modules/dagster_fivetran/asset_defs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.asset_defs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.ops

\nfrom typing import Any, Dict, List, Optional\n\nfrom dagster import (\n    AssetKey,\n    Config,\n    In,\n    Nothing,\n    Out,\n    Output,\n    op,\n)\nfrom pydantic import Field\n\nfrom dagster_fivetran.resources import DEFAULT_POLL_INTERVAL, FivetranResource\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import generate_materializations\n\n\nclass SyncConfig(Config):\n    connector_id: str = Field(\n        description=(\n            "The Fivetran Connector ID that this op will sync. You can retrieve this "\n            'value from the "Setup" tab of a given connector in the Fivetran UI.'\n        ),\n    )\n    poll_interval: float = Field(\n        default=DEFAULT_POLL_INTERVAL,\n        description="The time (in seconds) that will be waited between successive polls.",\n    )\n    poll_timeout: Optional[float] = Field(\n        default=None,\n        description=(\n            "The maximum time that will waited before this operation is timed out. By "\n            "default, this will never time out."\n        ),\n    )\n    yield_materializations: bool = Field(\n        default=True,\n        description=(\n            "If True, materializations corresponding to the results of the Fivetran sync will "\n            "be yielded when the op executes."\n        ),\n    )\n    asset_key_prefix: List[str] = Field(\n        default=["fivetran"],\n        description=(\n            "If provided and yield_materializations is True, these components will be used to "\n            "prefix the generated asset keys."\n        ),\n    )\n\n\n
[docs]@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " sync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_sync_op(config: SyncConfig, fivetran: FivetranResource) -> Any:\n """Executes a Fivetran sync for a given ``connector_id``, and polls until that sync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the sync successfully completes, as well as details\n about which tables the sync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_sync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.sync_and_poll(\n connector_id=config.connector_id,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n yield from generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n )\n yield Output(fivetran_output)
\n\n\nclass FivetranResyncConfig(SyncConfig):\n resync_parameters: Optional[Dict[str, Any]] = Field(\n None,\n description=(\n "Optional resync parameters to send in the payload to the Fivetran API. You can"\n " find an example resync payload here:"\n " https://fivetran.com/docs/rest-api/connectors#request_7"\n ),\n )\n\n\n@op(\n ins={"start_after": In(Nothing)},\n out=Out(\n FivetranOutput,\n description=(\n "Parsed json dictionary representing the details of the Fivetran connector after the"\n " resync successfully completes. See the [Fivetran API"\n " Docs](https://fivetran.com/docs/rest-api/connectors#retrieveconnectordetails) to see"\n " detailed information on this response."\n ),\n ),\n tags={"kind": "fivetran"},\n)\ndef fivetran_resync_op(\n config: FivetranResyncConfig,\n fivetran: FivetranResource,\n) -> Any:\n """Executes a Fivetran historical resync for a given ``connector_id``, and polls until that resync\n completes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\n the details of the Fivetran connector after the resync successfully completes, as well as details\n about which tables the resync updates.\n\n It requires the use of the :py:class:`~dagster_fivetran.fivetran_resource`, which allows it to\n communicate with the Fivetran API.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource, fivetran_resync_op\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n sync_foobar = fivetran_resync_op.configured(\n {\n "connector_id": "foobar",\n "resync_parameters": {\n "schema_a": ["table_a", "table_b"],\n "schema_b": ["table_c"]\n }\n },\n name="sync_foobar"\n )\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_simple_fivetran_job():\n sync_foobar()\n\n @job(resource_defs={"fivetran": my_fivetran_resource})\n def my_composed_fivetran_job():\n final_foobar_state = sync_foobar(start_after=some_op())\n other_op(final_foobar_state)\n """\n fivetran_output = fivetran.resync_and_poll(\n connector_id=config.connector_id,\n resync_parameters=config.resync_parameters,\n poll_interval=config.poll_interval,\n poll_timeout=config.poll_timeout,\n )\n if config.yield_materializations:\n asset_key_filter = (\n [\n AssetKey(config.asset_key_prefix + [schema, table])\n for schema, tables in config.resync_parameters.items()\n for table in tables\n ]\n if config.resync_parameters is not None\n else None\n )\n for mat in generate_materializations(\n fivetran_output, asset_key_prefix=config.asset_key_prefix\n ):\n if asset_key_filter is None or mat.asset_key in asset_key_filter:\n yield mat\n\n yield Output(fivetran_output)\n
", "current_page_name": "_modules/dagster_fivetran/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_fivetran.resources

\nimport datetime\nimport json\nimport logging\nimport time\nfrom typing import Any, Mapping, Optional, Sequence, Tuple\nfrom urllib.parse import urljoin\n\nimport requests\nfrom dagster import (\n    Failure,\n    InitResourceContext,\n    MetadataValue,\n    __version__,\n    _check as check,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._config.pythonic_config import ConfigurableResource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.cached_method import cached_method\nfrom dateutil import parser\nfrom pydantic import Field\nfrom requests.auth import HTTPBasicAuth\nfrom requests.exceptions import RequestException\n\nfrom dagster_fivetran.types import FivetranOutput\nfrom dagster_fivetran.utils import get_fivetran_connector_url, get_fivetran_logs_url\n\nFIVETRAN_API_BASE = "https://api.fivetran.com"\nFIVETRAN_API_VERSION_PATH = "v1/"\nFIVETRAN_CONNECTOR_PATH = "connectors/"\n\n# default polling interval (in seconds)\nDEFAULT_POLL_INTERVAL = 10\n\n\n
[docs]class FivetranResource(ConfigurableResource):\n """This class exposes methods on top of the Fivetran REST API."""\n\n api_key: str = Field(description="The Fivetran API key to use for this resource.")\n api_secret: str = Field(description="The Fivetran API secret to use for this resource.")\n disable_schedule_on_trigger: bool = Field(\n default=True,\n description=(\n "Specifies if you would like any connector that is sync'd using this "\n "resource to be automatically taken off its Fivetran schedule."\n ),\n )\n request_max_retries: int = Field(\n default=3,\n description=(\n "The maximum number of times requests to the Fivetran API should be retried "\n "before failing."\n ),\n )\n request_retry_delay: float = Field(\n default=0.25,\n description="Time (in seconds) to wait between each request retry.",\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n def _auth(self) -> HTTPBasicAuth:\n return HTTPBasicAuth(self.api_key, self.api_secret)\n\n @property\n @cached_method\n def _log(self) -> logging.Logger:\n return get_dagster_logger()\n\n @property\n def api_base_url(self) -> str:\n return urljoin(FIVETRAN_API_BASE, FIVETRAN_API_VERSION_PATH)\n\n @property\n def api_connector_url(self) -> str:\n return urljoin(self.api_base_url, FIVETRAN_CONNECTOR_PATH)\n\n def make_connector_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n return self.make_request(method, urljoin(FIVETRAN_CONNECTOR_PATH, endpoint), data)\n\n def make_request(\n self, method: str, endpoint: str, data: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Creates and sends a request to the desired Fivetran Connector API endpoint.\n\n Args:\n method (str): The http method to use for this request (e.g. 
"POST", "GET", "PATCH").\n endpoint (str): The Fivetran API endpoint to send this request to.\n data (Optional[str]): JSON-formatted data string to be included in the request.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n url = urljoin(self.api_base_url, endpoint)\n headers = {\n "User-Agent": f"dagster-fivetran/{__version__}",\n "Content-Type": "application/json;version=2",\n }\n\n num_retries = 0\n while True:\n try:\n response = requests.request(\n method=method,\n url=url,\n headers=headers,\n auth=self._auth,\n data=data,\n )\n response.raise_for_status()\n resp_dict = response.json()\n return resp_dict["data"] if "data" in resp_dict else resp_dict\n except RequestException as e:\n self._log.error("Request to Fivetran API failed: %s", e)\n if num_retries == self.request_max_retries:\n break\n num_retries += 1\n time.sleep(self.request_retry_delay)\n\n raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")\n\n def get_connector_details(self, connector_id: str) -> Mapping[str, Any]:\n """Gets details about a given connector from the Fivetran Connector API.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data from the response to this request\n """\n return self.make_connector_request(method="GET", endpoint=connector_id)\n\n def _assert_syncable_connector(self, connector_id: str):\n """Confirms that a given connector is eligible to sync. Will raise a Failure in the event that\n the connector is either paused or not fully setup.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n """\n connector_details = self.get_connector_details(connector_id)\n if connector_details["paused"]:\n raise Failure(f"Connector '{connector_id}' cannot be synced as it is currently paused.")\n if connector_details["status"]["setup_state"] != "connected":\n raise Failure(f"Connector '{connector_id}' cannot be synced as it has not been setup")\n\n def get_connector_sync_status(self, connector_id: str) -> Tuple[datetime.datetime, bool, str]:\n """Gets details about the status of the most recent Fivetran sync operation for a given\n connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Tuple[datetime.datetime, bool, str]:\n Tuple representing the timestamp of the last completeded sync, if it succeeded, and\n the currently reported sync status.\n """\n connector_details = self.get_connector_details(connector_id)\n\n min_time_str = "0001-01-01 00:00:00+00"\n succeeded_at = parser.parse(connector_details["succeeded_at"] or min_time_str)\n failed_at = parser.parse(connector_details["failed_at"] or min_time_str)\n\n return (\n max(succeeded_at, failed_at),\n succeeded_at > failed_at,\n connector_details["status"]["sync_state"],\n )\n\n def update_connector(\n self, connector_id: str, properties: Optional[Mapping[str, Any]] = None\n ) -> Mapping[str, Any]:\n """Updates properties of a Fivetran Connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n properties (Dict[str, Any]): The properties to be updated. 
For a comprehensive list of\n properties, see the [Fivetran docs](https://fivetran.com/docs/rest-api/connectors#modifyaconnector).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n return self.make_connector_request(\n method="PATCH", endpoint=connector_id, data=json.dumps(properties)\n )\n\n def update_schedule_type(\n self, connector_id: str, schedule_type: Optional[str] = None\n ) -> Mapping[str, Any]:\n """Updates the schedule type property of the connector to either "auto" or "manual".\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n schedule_type (Optional[str]): Either "auto" (to turn the schedule on) or "manual" (to\n turn it off).\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n if schedule_type not in ["auto", "manual"]:\n check.failed(f"schedule_type must be either 'auto' or 'manual': got '{schedule_type}'")\n return self.update_connector(connector_id, properties={"schedule_type": schedule_type})\n\n def get_connector_schema_config(self, connector_id: str) -> Mapping[str, Any]:\n return self.make_connector_request("GET", endpoint=f"{connector_id}/schemas")\n\n def start_sync(self, connector_id: str) -> Mapping[str, Any]:\n """Initiates a sync of a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the sync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(method="POST", endpoint=f"{connector_id}/force")\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this sync in the Fivetran UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def start_resync(\n self, connector_id: str, resync_parameters: Optional[Mapping[str, Sequence[str]]] = None\n ) -> Mapping[str, Any]:\n """Initiates a historical sync of all data for multiple schema tables within a Fivetran connector.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Optional[Dict[str, List[str]]]): Optional resync parameters to send to the Fivetran API.\n An example payload can be found here: https://fivetran.com/docs/rest-api/connectors#request_7\n\n Returns:\n Dict[str, Any]: Parsed json data representing the connector details API response after\n the resync is started.\n """\n if self.disable_schedule_on_trigger:\n self._log.info("Disabling Fivetran sync schedule.")\n self.update_schedule_type(connector_id, "manual")\n self._assert_syncable_connector(connector_id)\n self.make_connector_request(\n method="POST",\n endpoint=(\n f"{connector_id}/schemas/tables/resync"\n if resync_parameters is not None\n else f"{connector_id}/resync"\n ),\n data=json.dumps(resync_parameters) if resync_parameters is not None else None,\n )\n connector_details = self.get_connector_details(connector_id)\n self._log.info(\n f"Sync initialized for connector_id={connector_id}. View this resync in the Fivetran"\n " UI: "\n + get_fivetran_connector_url(connector_details)\n )\n return connector_details\n\n def poll_sync(\n self,\n connector_id: str,\n initial_last_sync_completion: datetime.datetime,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> Mapping[str, Any]:\n """Given a Fivetran connector and the timestamp at which the previous sync completed, poll\n until the next sync completes.\n\n The previous sync completion time is necessary because the only way to tell when a sync\n completes is when this value changes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n initial_last_sync_completion (datetime.datetime): The timestamp of the last completed sync\n (successful or otherwise) for this connector, prior to running this method.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n Dict[str, Any]: Parsed json data representing the API response.\n """\n poll_start = datetime.datetime.now()\n while True:\n (\n curr_last_sync_completion,\n curr_last_sync_succeeded,\n curr_sync_state,\n ) = self.get_connector_sync_status(connector_id)\n self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")\n\n if curr_last_sync_completion > initial_last_sync_completion:\n break\n\n if poll_timeout and datetime.datetime.now() > poll_start + datetime.timedelta(\n seconds=poll_timeout\n ):\n raise Failure(\n f"Sync for connector '{connector_id}' timed out after "\n f"{datetime.datetime.now() - poll_start}."\n )\n\n # Sleep for the configured time interval before polling again.\n time.sleep(poll_interval)\n\n connector_details = self.get_connector_details(connector_id)\n if not curr_last_sync_succeeded:\n raise Failure(\n f"Sync for connector '{connector_id}' failed!",\n metadata={\n "connector_details": MetadataValue.json(connector_details),\n "log_url": MetadataValue.url(get_fivetran_logs_url(connector_details)),\n },\n )\n return connector_details\n\n def sync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n ) -> FivetranOutput:\n """Initializes a sync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. 
You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_sync(connector_id)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)\n\n def resync_and_poll(\n self,\n connector_id: str,\n poll_interval: float = DEFAULT_POLL_INTERVAL,\n poll_timeout: Optional[float] = None,\n resync_parameters: Optional[Mapping[str, Sequence[str]]] = None,\n ) -> FivetranOutput:\n """Initializes a historical resync operation for the given connector, and polls until it completes.\n\n Args:\n connector_id (str): The Fivetran Connector ID. You can retrieve this value from the\n "Setup" tab of a given connector in the Fivetran UI.\n resync_parameters (Dict[str, List[str]]): The payload to send to the Fivetran API.\n This should be a dictionary with schema names as the keys and a list of tables\n to resync as the values.\n poll_interval (float): The time (in seconds) that will be waited between successive polls.\n poll_timeout (float): The maximum time that will waited before this operation is timed\n out. 
By default, this will never time out.\n\n Returns:\n :py:class:`~FivetranOutput`:\n Object containing details about the connector and the tables it updates\n """\n schema_config = self.get_connector_schema_config(connector_id)\n init_last_sync_timestamp, _, _ = self.get_connector_sync_status(connector_id)\n self.start_resync(connector_id, resync_parameters)\n final_details = self.poll_sync(\n connector_id,\n init_last_sync_timestamp,\n poll_interval=poll_interval,\n poll_timeout=poll_timeout,\n )\n return FivetranOutput(connector_details=final_details, schema_config=schema_config)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=FivetranResource.to_config_schema())\ndef fivetran_resource(context: InitResourceContext) -> FivetranResource:\n """This resource allows users to programatically interface with the Fivetran REST API to launch\n syncs and monitor their progress. This currently implements only a subset of the functionality\n exposed by the API.\n\n For a complete set of documentation on the Fivetran REST API, including expected response JSON\n schemae, see the `Fivetran API Docs <https://fivetran.com/docs/rest-api/connectors>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Examples:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_fivetran import fivetran_resource\n\n my_fivetran_resource = fivetran_resource.configured(\n {\n "api_key": {"env": "FIVETRAN_API_KEY"},\n "api_secret": {"env": "FIVETRAN_API_SECRET"},\n }\n )\n\n @job(resource_defs={"fivetran":my_fivetran_resource})\n def my_fivetran_job():\n ...\n\n """\n return FivetranResource.from_resource_context(context)
\n
", "current_page_name": "_modules/dagster_fivetran/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_fivetran.resources"}}, "dagster_gcp": {"bigquery": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Generator, Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._annotations import experimental\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n    TimeWindow,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom google.api_core.exceptions import NotFound\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\nBIGQUERY_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]@experimental\ndef build_bigquery_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of BigQuery tables and an in-memory type - e.g. a Pandas DataFrame.\n If only one DbTypeHandler is provided, it will be used as the default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import build_bigquery_io_manager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n bigquery_io_manager = build_bigquery_io_manager([BigQueryPandasTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a `schema` entry in output metadata. 
If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster willstore this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. 
You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=BigQueryIOManager.to_config_schema())\n def bigquery_io_manager(init_context):\n """I/O Manager for storing outputs in a BigQuery database.\n\n Assets will be stored in the dataset and table name specified by their AssetKey.\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n Op outputs will be stored in the dataset specified by output metadata (defaults to public) in a\n table of the name of the output.\n\n Note that the BigQuery config is mapped to the DB IO manager table hierarchy as follows:\n BigQuery DB IO\n * project -> database\n * dataset -> schema\n * table -> table\n """\n mgr = DbIOManager(\n type_handlers=type_handlers,\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=init_context.resource_config["project"],\n schema=init_context.resource_config.get("dataset"),\n default_load_type=default_load_type,\n )\n if init_context.resource_config.get("gcp_credentials"):\n with setup_gcp_creds(init_context.resource_config.get("gcp_credentials")):\n yield mgr\n else:\n yield mgr\n\n return bigquery_io_manager
\n\n\n
[docs]class BigQueryIOManager(ConfigurableIOManagerFactory):\n """Base class for an I/O manager definition that reads inputs from and writes outputs to BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the ``dataset`` configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset ``my_table`` had the key prefix ``["gcp", "bigquery", "my_dataset"]``, the dataset ``my_dataset`` will be\n used. For ops, the dataset can be specified by including a ``schema`` entry in output metadata. If ``schema`` is\n not provided via config or on the asset/op, ``public`` will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n :py:class:`~dagster.In` or :py:class:`~dagster.AssetIn`.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the ``gcp_credentials`` configuration.\n Dagster will store this key in a temporary file and set ``GOOGLE_APPLICATION_CREDENTIALS`` to point to the file.\n After the run completes, the file will be deleted, and ``GOOGLE_APPLICATION_CREDENTIALS`` will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded with this shell command: ``cat $GOOGLE_APPLICATION_CREDENTIALS | base64``\n """\n\n project: str = Field(description="The GCP project to use.")\n dataset: Optional[str] = Field(\n default=None,\n description=(\n "Name of the BigQuery dataset to use. If not provided, the last prefix before"\n " the asset name will be used."\n ),\n )\n location: Optional[str] = Field(\n default=None,\n description=(\n "The GCP location. Note: When using PySpark DataFrames, the default"\n " location of the project will be used. A custom location can be specified in"\n " your SparkSession configuration."\n ),\n )\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. 
You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n temporary_gcs_bucket: Optional[str] = Field(\n default=None,\n description=(\n "When using PySpark DataFrames, optionally specify a temporary GCS bucket to"\n " store data. If not provided, data will be directly written to BigQuery."\n ),\n )\n timeout: Optional[float] = Field(\n default=None,\n description=(\n "When using Pandas DataFrames, optionally specify a timeout for the BigQuery"\n " queries (loading and reading from tables)."\n ),\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]: ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return None\n\n def create_io_manager(self, context) -> Generator:\n mgr = DbIOManager(\n db_client=BigQueryClient(),\n io_manager_name="BigQueryIOManager",\n database=self.project,\n schema=self.dataset,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield mgr\n else:\n yield mgr
\n\n\nclass BigQueryClient(DbClient):\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.query(_get_cleanup_statement(table_slice)).result()\n except NotFound:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"""\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n connection.query(f"CREATE SCHEMA IF NOT EXISTS {table_slice.schema}").result()\n\n @staticmethod\n @contextmanager\n def connect(context, _):\n conn = bigquery.Client(\n project=context.resource_config.get("project"),\n location=context.resource_config.get("location"),\n )\n\n yield conn\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM `{table_slice.database}.{table_slice.schema}.{table_slice.table}` WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"TRUNCATE TABLE `{table_slice.database}.{table_slice.schema}.{table_slice.table}`"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if 
isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(BIGQUERY_DATETIME_FORMAT)\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_gcp/bigquery/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.io_manager"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.ops

\nimport hashlib\n\nfrom dagster import (\n    In,\n    List,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster_pandas import DataFrame\nfrom google.cloud.bigquery.encryption_configuration import EncryptionConfiguration\nfrom google.cloud.bigquery.job import LoadJobConfig, QueryJobConfig\nfrom google.cloud.bigquery.table import TimePartitioning\n\nfrom .configs import (\n    define_bigquery_create_dataset_config,\n    define_bigquery_delete_dataset_config,\n    define_bigquery_load_config,\n    define_bigquery_query_config,\n)\nfrom .types import BigQueryLoadSource\n\n_START = "start"\n\n\ndef _preprocess_config(cfg):\n    destination_encryption_configuration = cfg.get("destination_encryption_configuration")\n    time_partitioning = cfg.get("time_partitioning")\n\n    if destination_encryption_configuration is not None:\n        cfg["destination_encryption_configuration"] = EncryptionConfiguration(\n            kms_key_name=destination_encryption_configuration\n        )\n\n    if time_partitioning is not None:\n        cfg["time_partitioning"] = TimePartitioning(**time_partitioning)\n\n    return cfg\n\n\n
[docs]def bq_op_for_queries(sql_queries):\n """Executes BigQuery SQL queries.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n sql_queries = check.list_param(sql_queries, "sql queries", of_type=str)\n m = hashlib.sha1()\n for query in sql_queries:\n m.update(query.encode("utf-8"))\n hash_str = m.hexdigest()[:10]\n name = f"bq_op_{hash_str}"\n\n @op(\n name=name,\n ins={_START: In(Nothing)},\n out=Out(List[DataFrame]),\n config_schema=define_bigquery_query_config(),\n required_resource_keys={"bigquery"},\n tags={"kind": "sql", "sql": "\\n".join(sql_queries)},\n )\n def _bq_fn(context):\n query_job_config = _preprocess_config(context.op_config.get("query_job_config", {}))\n\n # Retrieve results as pandas DataFrames\n results = []\n for sql_query in sql_queries:\n # We need to construct a new QueryJobConfig for each query.\n # See: https://bit.ly/2VjD6sl\n cfg = QueryJobConfig(**query_job_config) if query_job_config else None\n context.log.info(\n "executing query %s with config: %s"\n % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")\n )\n results.append(\n context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()\n )\n\n return results\n\n return _bq_fn
\n\n\nBIGQUERY_LOAD_CONFIG = define_bigquery_load_config()\n\n\n
[docs]@op(\n ins={"paths": In(List[str])},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_gcs_paths_to_bq(context, paths):\n return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)
\n\n\n
[docs]@op(\n ins={"df": In(DataFrame)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_df_to_bq(context, df):\n return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
\n\n\n
[docs]@op(\n ins={"path": In(str)},\n out=Out(Nothing),\n config_schema=BIGQUERY_LOAD_CONFIG,\n required_resource_keys={"bigquery"},\n)\ndef import_file_to_bq(context, path):\n return _execute_load_in_source(context, path, BigQueryLoadSource.File)
\n\n\ndef _execute_load_in_source(context, source, source_name):\n destination = context.op_config.get("destination")\n load_job_config = _preprocess_config(context.op_config.get("load_job_config", {}))\n cfg = LoadJobConfig(**load_job_config) if load_job_config else None\n\n context.log.info(\n "executing BQ load with config: %s for source %s"\n % (cfg.to_api_repr() if cfg else "(no config provided)", source)\n )\n\n if source_name == BigQueryLoadSource.DataFrame:\n context.resources.bigquery.load_table_from_dataframe(\n source, destination, job_config=cfg\n ).result()\n\n # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local\n elif source_name == BigQueryLoadSource.File:\n with open(source, "rb") as file_obj:\n context.resources.bigquery.load_table_from_file(\n file_obj, destination, job_config=cfg\n ).result()\n\n # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage\n elif source_name == BigQueryLoadSource.GCS:\n context.resources.bigquery.load_table_from_uri(source, destination, job_config=cfg).result()\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_create_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_create_dataset(context):\n """BigQuery Create Dataset.\n\n This op encapsulates creating a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, exists_ok) = [context.op_config.get(k) for k in ("dataset", "exists_ok")]\n context.log.info("executing BQ create_dataset for dataset %s" % (dataset))\n context.resources.bigquery.create_dataset(dataset, exists_ok)
\n\n\n
[docs]@op(\n ins={_START: In(Nothing)},\n config_schema=define_bigquery_delete_dataset_config(),\n required_resource_keys={"bigquery"},\n)\ndef bq_delete_dataset(context):\n """BigQuery Delete Dataset.\n\n This op encapsulates deleting a BigQuery dataset.\n\n Expects a BQ client to be provisioned in resources as context.resources.bigquery.\n """\n (dataset, delete_contents, not_found_ok) = [\n context.op_config.get(k) for k in ("dataset", "delete_contents", "not_found_ok")\n ]\n\n context.log.info("executing BQ delete_dataset for dataset %s" % dataset)\n\n context.resources.bigquery.delete_dataset(\n dataset, delete_contents=delete_contents, not_found_ok=not_found_ok\n )
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.resources

\nfrom contextlib import contextmanager\nfrom typing import Any, Iterator, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import bigquery\nfrom pydantic import Field\n\nfrom .utils import setup_gcp_creds\n\n\n
[docs]class BigQueryResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google BigQuery.\n\n Examples:\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "bigquery": BigQueryResource(project="my-project")\n }\n )\n """\n\n project: Optional[str] = Field(\n default=None,\n description=(\n "Project ID for the project which the client acts on behalf of. Will be passed when"\n " creating a dataset / job. If not passed, falls back to the default inferred from the"\n " environment."\n ),\n )\n\n location: Optional[str] = Field(\n default=None,\n description="Default location for jobs / datasets / tables.",\n )\n\n gcp_credentials: Optional[str] = Field(\n default=None,\n description=(\n "GCP authentication credentials. If provided, a temporary file will be created"\n " with the credentials and ``GOOGLE_APPLICATION_CREDENTIALS`` will be set to the"\n " temporary file. To avoid issues with newlines in the keys, you must base64"\n " encode the key. You can retrieve the base64 encoded key with this shell"\n " command: ``cat $GOOGLE_AUTH_CREDENTIALS | base64``"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @contextmanager\n def get_client(self) -> Iterator[bigquery.Client]:\n """Context manager to create a BigQuery Client.\n\n Examples:\n .. 
code-block:: python\n\n from dagster import asset\n from dagster_gcp import BigQueryResource\n\n @asset\n def my_table(bigquery: BigQueryResource):\n with bigquery.get_client() as client:\n client.query("SELECT * FROM my_dataset.my_table")\n """\n if self.gcp_credentials:\n with setup_gcp_creds(self.gcp_credentials):\n yield bigquery.Client(project=self.project, location=self.location)\n\n else:\n yield bigquery.Client(project=self.project, location=self.location)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n with self.get_client() as client:\n yield client
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=BigQueryResource.to_config_schema(),\n description="Dagster resource for connecting to BigQuery",\n)\ndef bigquery_resource(context):\n bq_resource = BigQueryResource.from_resource_context(context)\n with bq_resource.get_client() as client:\n yield client
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.bigquery.types

\nimport re\nfrom enum import Enum as PyEnum\n\nfrom dagster import Enum, EnumValue\nfrom dagster._config import ConfigScalar, ConfigScalarKind, PostProcessingError\nfrom google.cloud.bigquery.job import (\n    CreateDisposition,\n    Encoding,\n    QueryPriority,\n    SchemaUpdateOption,\n    SourceFormat,\n    WriteDisposition,\n)\n\n\nclass BigQueryLoadSource(PyEnum):\n    DataFrame = "DATA_FRAME"\n    GCS = "GCS"\n    File = "FILE"\n\n\nBQCreateDisposition = Enum(\n    name="BQCreateDisposition",\n    enum_values=[\n        EnumValue(CreateDisposition.CREATE_IF_NEEDED),\n        EnumValue(CreateDisposition.CREATE_NEVER),\n    ],\n)\n\nBQPriority = Enum(\n    name="BQPriority",\n    enum_values=[EnumValue(QueryPriority.BATCH), EnumValue(QueryPriority.INTERACTIVE)],\n)\n\nBQSchemaUpdateOption = Enum(\n    name="BQSchemaUpdateOption",\n    enum_values=[\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_ADDITION,\n            description="Allow adding a nullable field to the schema.",\n        ),\n        EnumValue(\n            SchemaUpdateOption.ALLOW_FIELD_RELAXATION,\n            description="Allow relaxing a required field in the original schema to nullable.",\n        ),\n    ],\n)\n\nBQWriteDisposition = Enum(\n    name="BQWriteDisposition",\n    enum_values=[\n        EnumValue(WriteDisposition.WRITE_APPEND),\n        EnumValue(WriteDisposition.WRITE_EMPTY),\n        EnumValue(WriteDisposition.WRITE_TRUNCATE),\n    ],\n)\n\nBQEncoding = Enum(\n    name="BQEncoding", enum_values=[EnumValue(Encoding.ISO_8859_1), EnumValue(Encoding.UTF_8)]\n)\n\nBQSourceFormat = Enum(\n    name="BQSourceFormat",\n    enum_values=[\n        EnumValue(SourceFormat.AVRO),\n        EnumValue(SourceFormat.CSV),\n        EnumValue(SourceFormat.DATASTORE_BACKUP),\n        EnumValue(SourceFormat.NEWLINE_DELIMITED_JSON),\n        EnumValue(SourceFormat.ORC),\n        EnumValue(SourceFormat.PARQUET),\n    ],\n)\n\n\n# Project names are permitted to have alphanumeric, 
dashes and underscores, up to 1024 characters.\nRE_PROJECT = r"[\\w\\d\\-\\_]{1,1024}"\n\n# Datasets and tables are permitted to have alphanumeric or underscores, no dashes allowed, up to\n# 1024 characters\nRE_DS_TABLE = r"[\\w\\d\\_]{1,1024}"\n\n# BigQuery supports writes directly to date partitions with the syntax foo.bar$20190101\nRE_PARTITION_SUFFIX = r"(\\$\\d{8})?"\n\n\ndef _is_valid_dataset(config_value):\n    """Datasets must be of form "project.dataset" or "dataset"."""\n    return re.match(\n        # regex matches: project.dataset -- OR -- dataset\n        r"^" + RE_PROJECT + r"\\." + RE_DS_TABLE + r"$|^" + RE_DS_TABLE + r"$",\n        config_value,\n    )\n\n\ndef _is_valid_table(config_value):\n    """Tables must be of form "project.dataset.table" or "dataset.table" with optional\n    date-partition suffix.\n    """\n    return re.match(\n        r"^"\n        + RE_PROJECT  #          project\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  #               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$|^"  #              -- OR --\n        + RE_DS_TABLE  #         dataset\n        + r"\\."  
#               .\n        + RE_DS_TABLE  #         table\n        + RE_PARTITION_SUFFIX  # date partition suffix\n        + r"$",\n        config_value,\n    )\n\n\nclass _Dataset(ConfigScalar):\n    def __init__(self):\n        super(_Dataset, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_dataset(value):\n            raise PostProcessingError('Datasets must be of the form "project.dataset" or "dataset"')\n        return value\n\n\nclass _Table(ConfigScalar):\n    def __init__(self):\n        super(_Table, self).__init__(\n            key=type(self).__name__,\n            given_name=type(self).__name__,\n            scalar_kind=ConfigScalarKind.STRING,\n        )\n\n    def post_process(self, value):\n        if not _is_valid_table(value):\n            raise PostProcessingError(\n                'Tables must be of the form "project.dataset.table" or "dataset.table" '\n                "with optional date-partition suffix"\n            )\n\n        return value\n\n\n# https://github.com/dagster-io/dagster/issues/1971\nTable = _Table()\nDataset = _Dataset()\n\n\n
[docs]class BigQueryError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_gcp/bigquery/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.bigquery.types"}}, "dataproc": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.ops

\nfrom typing import Any, Dict\n\nfrom dagster import (\n    Bool,\n    Config,\n    Field as DagsterField,\n    Int,\n    op,\n)\nfrom dagster._seven import json\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_submit_job_config\nfrom .resources import TWENTY_MINUTES, DataprocResource\n\n# maintain the old config schema because of the nested job_config schema\nDATAPROC_CONFIG_SCHEMA = {\n    "job_timeout_in_seconds": DagsterField(\n        Int,\n        description="""Optional. Maximum time in seconds to wait for the job being\n                    completed. Default is set to 1200 seconds (20 minutes).\n                    """,\n        is_required=False,\n        default_value=TWENTY_MINUTES,\n    ),\n    "job_config": define_dataproc_submit_job_config(),\n    "job_scoped_cluster": DagsterField(\n        Bool,\n        description="whether to create a cluster or use an existing cluster",\n        is_required=False,\n        default_value=True,\n    ),\n}\n\n\nclass DataprocOpConfig(Config):\n    job_timeout_in_seconds: int = Field(\n        default=TWENTY_MINUTES,\n        description=(\n            "Maximum time in seconds to wait for the job being completed. Default is set to 1200"\n            " seconds (20 minutes)."\n        ),\n    )\n    job_scoped_cluster: bool = Field(\n        default=True,\n        description="Whether to create a cluster or use an existing cluster. Defaults to True.",\n    )\n    project_id: str = Field(\n        description=(\n            "Required. Project ID for the project which the client acts on behalf of. 
Will be"\n            " passed when creating a dataset/job."\n        )\n    )\n    region: str = Field(description="The GCP region.")\n    job_config: Dict[str, Any] = Field(\n        description="Python dictionary containing configuration for the Dataproc Job."\n    )\n\n\ndef _dataproc_compute(context):\n    job_config = context.op_config["job_config"]\n    job_timeout = context.op_config["job_timeout_in_seconds"]\n\n    context.log.info(\n        "submitting job with config: %s and timeout of: %d seconds"\n        % (str(json.dumps(job_config)), job_timeout)\n    )\n\n    if context.op_config["job_scoped_cluster"]:\n        # Cluster context manager, creates and then deletes cluster\n        with context.resources.dataproc.cluster_context_manager() as cluster:\n            # Submit the job specified by this solid to the cluster defined by the associated resource\n            result = cluster.submit_job(job_config)\n\n            job_id = result["reference"]["jobId"]\n            context.log.info(f"Submitted job ID {job_id}")\n            cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n    else:\n        # Submit to an existing cluster\n        # Submit the job specified by this solid to the cluster defined by the associated resource\n        result = context.resources.dataproc.submit_job(job_config)\n\n        job_id = result["reference"]["jobId"]\n        context.log.info(f"Submitted job ID {job_id}")\n        context.resources.dataproc.wait_for_job(job_id, wait_timeout=job_timeout)\n\n\n@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_solid(context):\n    return _dataproc_compute(context)\n\n\n
[docs]@op(required_resource_keys={"dataproc"}, config_schema=DATAPROC_CONFIG_SCHEMA)\ndef dataproc_op(context):\n return _dataproc_compute(context)
\n\n\n@op\ndef configurable_dataproc_op(context, dataproc: DataprocResource, config: DataprocOpConfig):\n job_config = {"projectId": config.project_id, "region": config.region, "job": config.job_config}\n job_timeout = config.job_timeout_in_seconds\n\n context.log.info(\n "submitting job with config: %s and timeout of: %d seconds"\n % (str(json.dumps(job_config)), job_timeout)\n )\n\n dataproc_client = dataproc.get_client()\n\n if config.job_scoped_cluster:\n # Cluster context manager, creates and then deletes cluster\n with dataproc_client.cluster_context_manager() as cluster:\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = cluster.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n cluster.wait_for_job(job_id, wait_timeout=job_timeout)\n\n else:\n # Submit to an existing cluster\n # Submit the job specified by this solid to the cluster defined by the associated resource\n result = dataproc_client.submit_job(job_config)\n\n job_id = result["reference"]["jobId"]\n context.log.info(f"Submitted job ID {job_id}")\n dataproc_client.wait_for_job(job_id, wait_timeout=job_timeout)\n
", "current_page_name": "_modules/dagster_gcp/dataproc/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.dataproc.resources

\nimport json\nimport time\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Mapping, Optional\n\nimport dagster._check as check\nimport yaml\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom googleapiclient.discovery import build\nfrom oauth2client.client import GoogleCredentials\nfrom pydantic import Field\n\nfrom .configs import define_dataproc_create_cluster_config\nfrom .types import DataprocError\n\nTWENTY_MINUTES = 20 * 60\nDEFAULT_ITER_TIME_SEC = 5\n\n\nclass DataprocClient:\n    """Builds a client to the dataproc API."""\n\n    def __init__(self, config):\n        # Use Application Default Credentials to check the\n        # GOOGLE_APPLICATION_CREDENTIALS environment variable\n        # for the location of the service account key file.\n        credentials = GoogleCredentials.get_application_default()\n\n        # See https://github.com/googleapis/google-api-python-client/issues/299 for the\n        # cache_discovery=False configuration below\n        self.dataproc = build("dataproc", "v1", credentials=credentials, cache_discovery=False)\n\n        self.config = config\n\n        (self.project_id, self.region, self.cluster_name, self.cluster_config) = (\n            self.config.get(k) for k in ("projectId", "region", "clusterName", "cluster_config")\n        )\n\n    @property\n    def dataproc_clusters(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .clusters()\n        )\n\n    @property\n    def dataproc_jobs(self):\n        return (\n            # Google APIs dynamically genned, so pylint pukes\n            self.dataproc.projects()\n            .regions()\n            .jobs()\n        )\n\n    def create_cluster(self):\n        (\n            self.dataproc_clusters.create(\n                
projectId=self.project_id,\n                region=self.region,\n                body={\n                    "projectId": self.project_id,\n                    "clusterName": self.cluster_name,\n                    "config": self.cluster_config,\n                },\n            ).execute()\n        )\n\n        def iter_fn():\n            # TODO: Add logging\n            # See: https://bit.ly/2UW5JaN\n            cluster = self.get_cluster()\n            return cluster["status"]["state"] in {"RUNNING", "UPDATING"}\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn)\n        if not done:\n            cluster = self.get_cluster()\n            raise DataprocError(\n                "Could not provision cluster -- status: %s" % str(cluster["status"])\n            )\n\n    def get_cluster(self):\n        return self.dataproc_clusters.get(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def delete_cluster(self):\n        return self.dataproc_clusters.delete(\n            projectId=self.project_id, region=self.region, clusterName=self.cluster_name\n        ).execute()\n\n    def submit_job(self, job_details):\n        return self.dataproc_jobs.submit(\n            projectId=self.project_id, region=self.region, body=job_details\n        ).execute()\n\n    def get_job(self, job_id):\n        return self.dataproc_jobs.get(\n            projectId=self.project_id, region=self.region, jobId=job_id\n        ).execute()\n\n    def wait_for_job(self, job_id, wait_timeout=TWENTY_MINUTES):\n        """This method polls job status every 5 seconds."""\n\n        # TODO: Add logging here print('Waiting for job ID {} to finish...'.format(job_id))\n        def iter_fn():\n            # See: https://bit.ly/2Lg2tHr\n            result = self.get_job(job_id)\n\n            # Handle exceptions\n            if result["status"]["state"] in {"CANCELLED", "ERROR"}:\n                raise DataprocError("Job 
error: %s" % str(result["status"]))\n\n            if result["status"]["state"] == "DONE":\n                return True\n\n            return False\n\n        done = DataprocClient._iter_and_sleep_until_ready(iter_fn, max_wait_time_sec=wait_timeout)\n        if not done:\n            job = self.get_job(job_id)\n            raise DataprocError("Job run timed out: %s" % str(job["status"]))\n\n    @staticmethod\n    def _iter_and_sleep_until_ready(\n        callable_fn, max_wait_time_sec=TWENTY_MINUTES, iter_time=DEFAULT_ITER_TIME_SEC\n    ):\n        """Iterates and sleeps until callable_fn returns true."""\n        # Wait for cluster ready state\n        ready, curr_iter = False, 0\n        max_iter = max_wait_time_sec / iter_time\n        while not ready and curr_iter < max_iter:\n            ready = callable_fn()\n            time.sleep(iter_time)\n            curr_iter += 1\n\n        # Will return false if ran up to max_iter without success\n        return ready\n\n    @contextmanager\n    def cluster_context_manager(self):\n        """Context manager allowing execution with a dataproc cluster.\n\n        Example:\n        .. code-block::\n            with context.resources.dataproc.cluster as cluster:\n                # do stuff...\n        """\n        self.create_cluster()\n        try:\n            yield self\n        finally:\n            self.delete_cluster()\n\n\n
[docs]class DataprocResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for connecting to a Dataproc cluster.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(dataproc: DataprocResource):\n with dataproc.get_client() as client:\n # client is a dagster_gcp.DataprocClient\n ...\n """\n\n project_id: str = Field(\n description=(\n "Required. Project ID for the project which the client acts on behalf of. Will be"\n " passed when creating a dataset/job."\n )\n )\n region: str = Field(description="The GCP region.")\n cluster_name: str = Field(\n description=(\n "Required. The cluster name. Cluster names within a project must be unique. Names of"\n " deleted clusters can be reused."\n )\n )\n cluster_config_yaml_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a YAML file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_json_path: Optional[str] = Field(\n default=None,\n description=(\n "Full path to a JSON file containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n cluster_config_dict: Optional[Dict[str, Any]] = Field(\n default=None,\n description=(\n "Python dictionary containing cluster configuration. See"\n " https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for"\n " configuration options. 
Only one of cluster_config_yaml_path,"\n " cluster_config_json_path, or cluster_config_dict may be provided."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _read_yaml_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return yaml.safe_load(f)\n\n def _read_json_config(self, path: str) -> Mapping[str, Any]:\n with open(path, "r", encoding="utf8") as f:\n return json.load(f)\n\n def _get_cluster_config(self) -> Optional[Mapping[str, Any]]:\n methods = 0\n methods += 1 if self.cluster_config_dict is not None else 0\n methods += 1 if self.cluster_config_json_path is not None else 0\n methods += 1 if self.cluster_config_yaml_path is not None else 0\n\n # ensure that at most 1 method is provided\n check.invariant(\n methods <= 1,\n "Dataproc Resource: Incorrect config: Cannot provide cluster config multiple ways."\n " Choose one of cluster_config_dict, cluster_config_json_path, or"\n " cluster_config_yaml_path",\n )\n\n cluster_config = None\n if self.cluster_config_json_path:\n cluster_config = self._read_json_config(self.cluster_config_json_path)\n elif self.cluster_config_yaml_path:\n cluster_config = self._read_yaml_config(self.cluster_config_yaml_path)\n elif self.cluster_config_dict:\n cluster_config = self.cluster_config_dict\n\n return cluster_config\n\n def get_client(self) -> DataprocClient:\n cluster_config = self._get_cluster_config()\n\n client_config_dict = {\n "projectId": self.project_id,\n "region": self.region,\n "clusterName": self.cluster_name,\n "cluster_config": cluster_config,\n }\n\n return DataprocClient(config=client_config_dict)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=define_dataproc_create_cluster_config(),\n description="Manage a Dataproc cluster resource",\n)\ndef dataproc_resource(context):\n return DataprocClient(context.resource_config)
\n
", "current_page_name": "_modules/dagster_gcp/dataproc/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.dataproc.resources"}}, "gcs": {"compute_log_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.compute_log_manager

\nimport datetime\nimport json\nimport os\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport dagster._seven as seven\nfrom dagster import (\n    Field,\n    StringSource,\n    _check as check,\n)\nfrom dagster._config.config_type import Noneable\nfrom dagster._core.storage.cloud_storage_compute_log_manager import (\n    CloudStorageComputeLogManager,\n    PollingComputeLogSubscriptionManager,\n)\nfrom dagster._core.storage.compute_log_manager import ComputeIOType\nfrom dagster._core.storage.local_compute_log_manager import (\n    IO_TYPE_EXTENSION,\n    LocalComputeLogManager,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils import ensure_dir, ensure_file\nfrom google.cloud import storage\nfrom typing_extensions import Self\n\n\n
[docs]class GCSComputeLogManager(CloudStorageComputeLogManager, ConfigurableClass):\n """Logs op compute function stdout and stderr to GCS.\n\n Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml``\n such as the following:\n\n .. code-block:: YAML\n\n compute_logs:\n module: dagster_gcp.gcs.compute_log_manager\n class: GCSComputeLogManager\n config:\n bucket: "mycorp-dagster-compute-logs"\n local_dir: "/tmp/cool"\n prefix: "dagster-test-"\n upload_interval: 30\n\n There are more configuration examples in the instance documentation guide: https://docs.dagster.io/deployment/dagster-instance#compute-log-storage\n\n Args:\n bucket (str): The name of the GCS bucket to which to log.\n local_dir (Optional[str]): Path to the local directory in which to stage logs. Default:\n ``dagster._seven.get_system_temp_directory()``.\n prefix (Optional[str]): Prefix for the log file keys.\n json_credentials_envvar (Optional[str]): Environment variable that contains the JSON with a private key\n and other credentials information. If this is set, ``GOOGLE_APPLICATION_CREDENTIALS`` will be ignored.\n Can be used when the private key cannot be used as a file.\n upload_interval: (Optional[int]): Interval in seconds to upload partial log files to GCS. 
By default, will only upload when the capture is complete.\n inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute\n log manager when instantiated from config.\n """\n\n def __init__(\n self,\n bucket,\n local_dir=None,\n inst_data: Optional[ConfigurableClassData] = None,\n prefix="dagster",\n json_credentials_envvar=None,\n upload_interval=None,\n ):\n self._bucket_name = check.str_param(bucket, "bucket")\n self._prefix = self._clean_prefix(check.str_param(prefix, "prefix"))\n\n if json_credentials_envvar:\n json_info_str = os.environ.get(json_credentials_envvar)\n credentials_info = json.loads(json_info_str) # type: ignore # (possible none)\n self._bucket = (\n storage.Client()\n .from_service_account_info(credentials_info)\n .bucket(self._bucket_name)\n )\n else:\n self._bucket = storage.Client().bucket(self._bucket_name)\n\n # Check if the bucket exists\n check.invariant(self._bucket.exists())\n\n # proxy calls to local compute log manager (for subscriptions, etc)\n if not local_dir:\n local_dir = seven.get_system_temp_directory()\n\n self._upload_interval = check.opt_int_param(upload_interval, "upload_interval")\n self._local_manager = LocalComputeLogManager(local_dir)\n self._subscription_manager = PollingComputeLogSubscriptionManager(self)\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n\n @property\n def inst_data(self):\n return self._inst_data\n\n @classmethod\n def config_type(cls):\n return {\n "bucket": StringSource,\n "local_dir": Field(StringSource, is_required=False),\n "prefix": Field(StringSource, is_required=False, default_value="dagster"),\n "json_credentials_envvar": Field(StringSource, is_required=False),\n "upload_interval": Field(Noneable(int), is_required=False, default_value=None),\n }\n\n @classmethod\n def from_config_value(\n cls, inst_data: ConfigurableClassData, config_value: Mapping[str, Any]\n ) -> Self:\n return GCSComputeLogManager(inst_data=inst_data, 
**config_value)\n\n @property\n def local_manager(self) -> LocalComputeLogManager:\n return self._local_manager\n\n @property\n def upload_interval(self) -> Optional[int]:\n return self._upload_interval if self._upload_interval else None\n\n def _clean_prefix(self, prefix):\n parts = prefix.split("/")\n return "/".join([part for part in parts if part])\n\n def _gcs_key(self, log_key, io_type, partial=False):\n check.inst_param(io_type, "io_type", ComputeIOType)\n extension = IO_TYPE_EXTENSION[io_type]\n [*namespace, filebase] = log_key\n filename = f"{filebase}.{extension}"\n if partial:\n filename = f"{filename}.partial"\n paths = [self._prefix, "storage", *namespace, filename]\n return "/".join(paths)\n\n def delete_logs(\n self, log_key: Optional[Sequence[str]] = None, prefix: Optional[Sequence[str]] = None\n ):\n self._local_manager.delete_logs(log_key, prefix)\n if log_key:\n gcs_keys_to_remove = [\n self._gcs_key(log_key, ComputeIOType.STDOUT),\n self._gcs_key(log_key, ComputeIOType.STDERR),\n self._gcs_key(log_key, ComputeIOType.STDOUT, partial=True),\n self._gcs_key(log_key, ComputeIOType.STDERR, partial=True),\n ]\n # if the blob doesn't exist, do nothing instead of raising a not found exception\n self._bucket.delete_blobs(gcs_keys_to_remove, on_error=lambda _: None)\n elif prefix:\n # add the trailing '/' to make sure that ['a'] does not match ['apple']\n delete_prefix = "/".join([self._prefix, "storage", *prefix, ""])\n to_delete = self._bucket.list_blobs(prefix=delete_prefix)\n self._bucket.delete_blobs(list(to_delete))\n else:\n check.failed("Must pass in either `log_key` or `prefix` argument to delete_logs")\n\n def download_url_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return None\n\n gcs_key = self._gcs_key(log_key, io_type)\n try:\n return self._bucket.blob(gcs_key).generate_signed_url(\n expiration=datetime.timedelta(minutes=60)\n )\n except:\n # fallback to the local download url 
if the current credentials are insufficient to create\n # signed urls\n return self.local_manager.get_captured_log_download_url(log_key, io_type)\n\n def display_path_for_type(self, log_key: Sequence[str], io_type: ComputeIOType):\n if not self.is_capture_complete(log_key):\n return self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n gcs_key = self._gcs_key(log_key, io_type)\n return f"gs://{self._bucket_name}/{gcs_key}"\n\n def cloud_storage_has_logs(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial: bool = False\n ) -> bool:\n gcs_key = self._gcs_key(log_key, io_type, partial)\n return self._bucket.blob(gcs_key).exists()\n\n def upload_to_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(log_key, IO_TYPE_EXTENSION[io_type])\n ensure_file(path)\n\n if partial and os.stat(path).st_size == 0:\n return\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "rb") as data:\n self._bucket.blob(gcs_key).upload_from_file(data)\n\n def download_from_cloud_storage(\n self, log_key: Sequence[str], io_type: ComputeIOType, partial=False\n ):\n path = self.local_manager.get_captured_local_path(\n log_key, IO_TYPE_EXTENSION[io_type], partial=partial\n )\n ensure_dir(os.path.dirname(path))\n\n gcs_key = self._gcs_key(log_key, io_type, partial=partial)\n with open(path, "wb") as fileobj:\n self._bucket.blob(gcs_key).download_to_file(fileobj)\n\n def on_subscribe(self, subscription):\n self._subscription_manager.add_subscription(subscription)\n\n def on_unsubscribe(self, subscription):\n self._subscription_manager.remove_subscription(subscription)\n\n def dispose(self):\n self._subscription_manager.dispose()\n self._local_manager.dispose()
\n
", "current_page_name": "_modules/dagster_gcp/gcs/compute_log_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.compute_log_manager"}, "file_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.file_manager

\nimport io\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import Optional\n\nimport dagster._check as check\nfrom dagster._core.storage.file_manager import (\n    FileHandle,\n    FileManager,\n    TempfileManager,\n    check_file_like_obj,\n)\nfrom google.cloud import storage\n\n\n
[docs]class GCSFileHandle(FileHandle):\n """A reference to a file on GCS."""\n\n def __init__(self, gcs_bucket: str, gcs_key: str):\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_key = check.str_param(gcs_key, "gcs_key")\n\n @property\n def gcs_bucket(self) -> str:\n """str: The name of the GCS bucket."""\n return self._gcs_bucket\n\n @property\n def gcs_key(self) -> str:\n """str: The GCS key."""\n return self._gcs_key\n\n @property\n def path_desc(self) -> str:\n """str: The file's GCS URL."""\n return self.gcs_path\n\n @property\n def gcs_path(self) -> str:\n """str: The file's GCS URL."""\n return f"gs://{self.gcs_bucket}/{self.gcs_key}"
\n\n\nclass GCSFileManager(FileManager):\n def __init__(self, client, gcs_bucket, gcs_base_key):\n self._client = check.inst_param(client, "client", storage.client.Client)\n self._gcs_bucket = check.str_param(gcs_bucket, "gcs_bucket")\n self._gcs_base_key = check.str_param(gcs_base_key, "gcs_base_key")\n self._local_handle_cache = {}\n self._temp_file_manager = TempfileManager()\n\n def copy_handle_to_local_temp(self, file_handle):\n self._download_if_not_cached(file_handle)\n return self._get_local_path(file_handle)\n\n def _download_if_not_cached(self, file_handle):\n if not self._file_handle_cached(file_handle):\n # instigate download\n temp_file_obj = self._temp_file_manager.tempfile()\n temp_name = temp_file_obj.name\n bucket_obj = self._client.bucket(file_handle.gcs_bucket)\n bucket_obj.blob(file_handle.gcs_key).download_to_file(temp_file_obj)\n self._local_handle_cache[file_handle.gcs_path] = temp_name\n\n return file_handle\n\n @contextmanager\n def read(self, file_handle, mode="rb"):\n check.inst_param(file_handle, "file_handle", GCSFileHandle)\n check.str_param(mode, "mode")\n check.param_invariant(mode in {"r", "rb"}, "mode")\n\n self._download_if_not_cached(file_handle)\n\n encoding = None if mode == "rb" else "utf-8"\n with open(self._get_local_path(file_handle), mode, encoding=encoding) as file_obj:\n yield file_obj\n\n def _file_handle_cached(self, file_handle):\n return file_handle.gcs_path in self._local_handle_cache\n\n def _get_local_path(self, file_handle):\n return self._local_handle_cache[file_handle.gcs_path]\n\n def read_data(self, file_handle):\n with self.read(file_handle, mode="rb") as file_obj:\n return file_obj.read()\n\n def write_data(self, data, ext=None, key: Optional[str] = None):\n key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check.inst_param(data, "data", bytes)\n return self.write(io.BytesIO(data), mode="wb", key=key, ext=ext)\n\n def write(self, file_obj, mode="wb", ext=None, key: Optional[str] = None):\n 
key = check.opt_str_param(key, "key", default=str(uuid.uuid4()))\n check_file_like_obj(file_obj)\n gcs_key = self.get_full_key(key + (("." + ext) if ext is not None else ""))\n bucket_obj = self._client.bucket(self._gcs_bucket)\n bucket_obj.blob(gcs_key).upload_from_file(file_obj)\n return GCSFileHandle(self._gcs_bucket, gcs_key)\n\n def get_full_key(self, file_key):\n return f"{self._gcs_base_key}/{file_key}"\n\n def delete_local_temp(self):\n self._temp_file_manager.close()\n
", "current_page_name": "_modules/dagster_gcp/gcs/file_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.file_manager"}, "io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.io_manager

\nimport pickle\nfrom typing import Any, Optional, Union\n\nfrom dagster import (\n    ConfigurableIOManager,\n    InputContext,\n    OutputContext,\n    ResourceDependency,\n    _check as check,\n    io_manager,\n)\nfrom dagster._annotations import deprecated\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom dagster._core.storage.upath_io_manager import UPathIOManager\nfrom dagster._utils import PICKLE_PROTOCOL\nfrom dagster._utils.backoff import backoff\nfrom dagster._utils.cached_method import cached_method\nfrom google.api_core.exceptions import Forbidden, ServiceUnavailable, TooManyRequests\nfrom google.cloud import storage\nfrom pydantic import Field\nfrom upath import UPath\n\nfrom .resources import GCSResource\n\nDEFAULT_LEASE_DURATION = 60  # One minute\n\n\nclass PickledObjectGCSIOManager(UPathIOManager):\n    def __init__(self, bucket: str, client: Optional[Any] = None, prefix: str = "dagster"):\n        self.bucket = check.str_param(bucket, "bucket")\n        self.client = client or storage.Client()\n        self.bucket_obj = self.client.bucket(bucket)\n        check.invariant(self.bucket_obj.exists())\n        self.prefix = check.str_param(prefix, "prefix")\n        super().__init__(base_path=UPath(self.prefix))\n\n    def unlink(self, path: UPath) -> None:\n        key = str(path)\n        if self.bucket_obj.blob(key).exists():\n            self.bucket_obj.blob(key).delete()\n\n    def path_exists(self, path: UPath) -> bool:\n        key = str(path)\n        blobs = self.client.list_blobs(self.bucket, prefix=key)\n        return len(list(blobs)) > 0\n\n    def get_op_output_relative_path(self, context: Union[InputContext, OutputContext]) -> UPath:\n        parts = context.get_identifier()\n        run_id = parts[0]\n        output_parts = parts[1:]\n        return UPath("storage", run_id, "files", *output_parts)\n\n    def get_loading_input_log_message(self, path: UPath) -> str:\n        return f"Loading GCS object 
from: {self._uri_for_path(path)}"\n\n    def get_writing_output_log_message(self, path: UPath) -> str:\n        return f"Writing GCS object at: {self._uri_for_path(path)}"\n\n    def _uri_for_path(self, path: UPath) -> str:\n        return f"gs://{self.bucket}/{path}"\n\n    def make_directory(self, path: UPath) -> None:\n        # It is not necessary to create directories in GCP\n        return None\n\n    def load_from_path(self, context: InputContext, path: UPath) -> Any:\n        bytes_obj = self.bucket_obj.blob(str(path)).download_as_bytes()\n        return pickle.loads(bytes_obj)\n\n    def dump_to_path(self, context: OutputContext, obj: Any, path: UPath) -> None:\n        if self.path_exists(path):\n            context.log.warning(f"Removing existing GCS key: {path}")\n            self.unlink(path)\n\n        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)\n\n        backoff(\n            self.bucket_obj.blob(str(path)).upload_from_string,\n            args=[pickled_obj],\n            retry_on=(TooManyRequests, Forbidden, ServiceUnavailable),\n        )\n\n\n
[docs]class GCSPickleIOManager(ConfigurableIOManager):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import asset, Definitions\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": GCSPickleIOManager(\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n "gcs": GCSResource(project="my-cool-project")\n }\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n @job(\n resource_defs={\n "io_manager": GCSPickleIOManager(\n gcs=GCSResource(project="my-cool-project")\n gcs_bucket="my-cool-bucket",\n gcs_prefix="my-cool-prefix"\n ),\n }\n )\n def my_job():\n ...\n """\n\n gcs: ResourceDependency[GCSResource]\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _internal_io_manager(self) -> PickledObjectGCSIOManager:\n return PickledObjectGCSIOManager(\n bucket=self.gcs_bucket, client=self.gcs.get_client(), prefix=self.gcs_prefix\n )\n\n def load_input(self, context: InputContext) -> Any:\n return self._internal_io_manager.load_input(context)\n\n def handle_output(self, context: OutputContext, obj: Any) -> None:\n self._internal_io_manager.handle_output(context, obj)
\n\n\n
[docs]@deprecated(\n breaking_version="2.0",\n additional_warn_text="Please use GCSPickleIOManager instead.",\n)\nclass ConfigurablePickledObjectGCSIOManager(GCSPickleIOManager):\n """Renamed to GCSPickleIOManager. See GCSPickleIOManager for documentation."""\n\n pass
\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n config_schema=GCSPickleIOManager.to_config_schema(),\n required_resource_keys={"gcs"},\n)\ndef gcs_pickle_io_manager(init_context):\n """Persistent IO manager using GCS for storage.\n\n Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\n as each execution node has network connectivity and credentials for GCS and the backing bucket.\n\n Assigns each op output to a unique filepath containing run ID, step key, and output name.\n Assigns each asset to a single filesystem path, at ``<base_dir>/<asset_key>``. If the asset key\n has multiple components, the final component is used as the name of the file, and the preceding\n components as parent directories under the base_dir.\n\n Subsequent materializations of an asset will overwrite previous materializations of that asset.\n With a base directory of ``/my/base/path``, an asset with key\n ``AssetKey(["one", "two", "three"])`` would be stored in a file called ``three`` in a directory\n with path ``/my/base/path/one/two/``.\n\n Example usage:\n\n 1. Attach this IO manager to a set of assets.\n\n .. code-block:: python\n\n from dagster import Definitions, asset\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @asset\n def asset1():\n # create df ...\n return df\n\n @asset\n def asset2(asset1):\n return asset1[:5]\n\n defs = Definitions(\n assets=[asset1, asset2],\n resources={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n\n\n 2. Attach this IO manager to your job to make it available to your ops.\n\n .. 
code-block:: python\n\n from dagster import job\n from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n @job(\n resource_defs={\n "io_manager": gcs_pickle_io_manager.configured(\n {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n ),\n "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n },\n )\n def my_job():\n ...\n """\n client = init_context.resources.gcs\n pickled_io_manager = PickledObjectGCSIOManager(\n bucket=init_context.resource_config["gcs_bucket"],\n client=client,\n prefix=init_context.resource_config["gcs_prefix"],\n )\n return pickled_io_manager
\n
", "current_page_name": "_modules/dagster_gcp/gcs/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.io_manager"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp.gcs.resources

\nfrom typing import Any, Optional\n\nfrom dagster import ConfigurableResource, IAttachDifferentObjectToOpContext, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom google.cloud import storage\nfrom pydantic import Field\n\nfrom .file_manager import GCSFileManager\n\n\n
[docs]class GCSResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """Resource for interacting with Google Cloud Storage.\n\n Example:\n .. code-block::\n\n @asset\n def my_asset(gcs: GCSResource):\n with gcs.get_client() as client:\n # client is a google.cloud.storage.Client\n ...\n """\n\n project: Optional[str] = Field(default=None, description="Project name")\n\n def get_client(self) -> storage.Client:\n """Creates a GCS Client.\n\n Returns: google.cloud.storage.Client\n """\n return _gcs_client_from_config(project=self.project)\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GCSResource.to_config_schema(),\n description="This resource provides a GCS client",\n)\ndef gcs_resource(init_context) -> storage.Client:\n return GCSResource.from_resource_context(init_context).get_client()
\n\n\n
[docs]class GCSFileManagerResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """FileManager that provides abstract access to GCS."""\n\n project: Optional[str] = Field(default=None, description="Project name")\n gcs_bucket: str = Field(description="GCS bucket to store files")\n gcs_prefix: str = Field(default="dagster", description="Prefix to add to all file paths")\n\n def get_client(self) -> GCSFileManager:\n """Creates a :py:class:`~dagster_gcp.GCSFileManager` object that implements the\n :py:class:`~dagster._core.storage.file_manager.FileManager` API .\n\n Returns: GCSFileManager\n """\n gcs_client = _gcs_client_from_config(project=self.project)\n return GCSFileManager(\n client=gcs_client,\n gcs_bucket=self.gcs_bucket,\n gcs_base_key=self.gcs_prefix,\n )\n\n def get_object_to_set_on_execution_context(self) -> Any:\n return self.get_client()
\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=GCSFileManagerResource.to_config_schema())\ndef gcs_file_manager(context):\n """FileManager that provides abstract access to GCS.\n\n Implements the :py:class:`~dagster._core.storage.file_manager.FileManager` API.\n """\n return GCSFileManagerResource.from_resource_context(context).get_client()
\n\n\ndef _gcs_client_from_config(project: Optional[str]) -> storage.Client:\n """Creates a GCS Client.\n\n Args:\n project: The GCP project\n\n Returns: A GCS client.\n """\n return storage.client.Client(project=project)\n
", "current_page_name": "_modules/dagster_gcp/gcs/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp.gcs.resources"}}}, "dagster_gcp_pandas": {"bigquery": {"bigquery_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler

\nfrom typing import Optional, Sequence, Type\n\nimport pandas as pd\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp.bigquery.io_manager import (\n    BigQueryClient,\n    BigQueryIOManager,\n    build_bigquery_io_manager,\n)\n\n\n
[docs]class BigQueryPandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load Pandas DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPandasTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ):\n """Stores the pandas DataFrame in BigQuery."""\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n\n job = connection.load_table_from_dataframe(\n dataframe=with_uppercase_cols,\n destination=f"{table_slice.schema}.{table_slice.table}",\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n )\n job.result()\n\n context.add_output_metadata(\n {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=name, type=str(dtype)) # type: ignore # (bad stubs)\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n )\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n """Loads the input as a Pandas DataFrame."""\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = 
connection.query(\n query=BigQueryClient.get_select_statement(table_slice),\n project=table_slice.database,\n location=context.resource_config.get("location") if context.resource_config else None,\n timeout=context.resource_config.get("timeout") if context.resource_config else None,\n ).to_dataframe()\n\n result.columns = map(str.lower, result.columns)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nbigquery_pandas_io_manager = build_bigquery_io_manager(\n [BigQueryPandasTypeHandler()], default_load_type=pd.DataFrame\n)\nbigquery_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pandas import bigquery_pandas_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pandas_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPandasIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pandas import BigQueryPandasIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPandasIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pandas/bigquery/bigquery_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pandas.bigquery.bigquery_pandas_type_handler"}}}, "dagster_gcp_pyspark": {"bigquery": {"bigquery_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler

\nfrom typing import Any, Mapping, Optional, Sequence, Type\n\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_gcp import BigQueryIOManager, build_bigquery_io_manager\nfrom dagster_gcp.bigquery.io_manager import BigQueryClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\n\ndef _get_bigquery_write_options(\n    config: Optional[Mapping[str, Any]], table_slice: TableSlice\n) -> Mapping[str, str]:\n    conf = {\n        "table": f"{table_slice.database}.{table_slice.schema}.{table_slice.table}",\n    }\n    if config and config.get("temporary_gcs_bucket") is not None:\n        conf["temporaryGcsBucket"] = config["temporary_gcs_bucket"]\n    else:\n        conf["writeMethod"] = "direct"\n    return conf\n\n\ndef _get_bigquery_read_options(table_slice: TableSlice) -> Mapping[str, str]:\n    conf = {"viewsEnabled": "true", "materializationDataset": table_slice.schema}\n    return conf\n\n\n
[docs]class BigQueryPySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the BigQuery I/O Manager that can store and load PySpark DataFrames as BigQuery tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp import BigQueryIOManager\n from dagster_bigquery_pandas import BigQueryPySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MyBigQueryIOManager(BigQueryIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_dataset"] # my_dataset will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_bigquery_write_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format("bigquery").options(**options).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_bigquery_read_options(table_slice)\n spark = SparkSession.builder.getOrCreate() # type: ignore\n\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = (\n spark.read.format("bigquery")\n .options(**options)\n .load(BigQueryClient.get_select_statement(table_slice))\n )\n\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return 
[DataFrame]
\n\n\nbigquery_pyspark_io_manager = build_bigquery_io_manager(\n [BigQueryPySparkTypeHandler()], default_load_type=DataFrame\n)\nbigquery_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_gcp_pyspark import bigquery_pyspark_io_manager\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": bigquery_pyspark_io_manager.configured({\n "project" : {"env": "GCP_PROJECT"}\n })\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n"""\n\n\n
[docs]class BigQueryPySparkIOManager(BigQueryIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_gcp_pyspark import BigQueryPySparkIOManager\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_dataset"] # will be used as the dataset in BigQuery\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": BigQueryPySparkIOManager(project=EnvVar("GCP_PROJECT"))\n }\n )\n\n You can tell Dagster in which dataset to create tables by setting the "dataset" configuration value.\n If you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\n on the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\n if the asset "my_table" had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset "my_dataset" will be\n used. For ops, the dataset can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the dataset.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_dataset"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_dataset.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n If you cannot upload a file to your Dagster deployment, or otherwise cannot\n `authenticate with GCP <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_\n via a standard method, you can provide a service account key as the "gcp_credentials" configuration.\n Dagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\n After the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\n unset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\n the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [BigQueryPySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_gcp_pyspark/bigquery/bigquery_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_gcp_pyspark.bigquery.bigquery_pyspark_type_handler"}}}, "dagster_ge": {"factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ge.factory

\nimport datetime\nfrom typing import Any, Dict\n\nimport great_expectations as ge\nfrom dagster import (\n    ConfigurableResource,\n    ExpectationResult,\n    IAttachDifferentObjectToOpContext,\n    In,\n    MetadataValue,\n    OpExecutionContext,\n    Out,\n    Output,\n    _check as check,\n    op,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster_pandas import DataFrame\nfrom great_expectations.render.renderer import ValidationResultsPageRenderer\nfrom great_expectations.render.view import DefaultMarkdownPageView\nfrom pydantic import Field\n\ntry:\n    # ge < v0.13.0\n    from great_expectations.core import convert_to_json_serializable\nexcept ImportError:\n    # ge >= v0.13.0\n    from great_expectations.core.util import convert_to_json_serializable\n\n\nclass GEContextResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n    ge_root_dir: str = Field(\n        default=None,\n        description="The root directory for your Great Expectations project.",\n    )\n\n    def get_data_context(self):\n        if self.ge_root_dir is None:\n            return ge.data_context.DataContext()\n        return ge.data_context.DataContext(context_root_dir=self.ge_root_dir)\n\n    def get_object_to_set_on_execution_context(self):\n        return self.get_data_context()\n\n\n@dagster_maintained_resource\n@resource(config_schema=GEContextResource.to_config_schema())\ndef ge_data_context(context):\n    return GEContextResource.from_resource_context(context).get_data_context()\n\n\n
[docs]def ge_validation_op_factory(\n name,\n datasource_name,\n suite_name,\n validation_operator_name=None,\n input_dagster_type=DataFrame,\n batch_kwargs=None,\n):\n """Generates ops for interacting with GE.\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n validation_operator_name (Optional[str]): what validation operator to run -- defaults to\n None, which generates an ephemeral validator. If you want to save data docs, use\n 'action_list_operator'.\n See https://legacy.docs.greatexpectations.io/en/0.12.1/reference/core_concepts/validation_operators_and_actions.html#\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n batch_kwargs (Optional[dict]): overrides the `batch_kwargs` parameter when calling the\n `ge_data_context`'s `get_batch` method. 
Defaults to `{"dataset": dataset}`, where\n `dataset` is the input to the generated op.\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata\n and an output with all the metadata (for user processing)\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(suite_name, "suite_name")\n check.opt_str_param(validation_operator_name, "validation_operator_name")\n batch_kwargs = check.opt_dict_param(batch_kwargs, "batch_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an expectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n if validation_operator_name is not None:\n validation_operator = validation_operator_name\n else:\n data_context.add_validation_operator(\n "ephemeral_validation",\n {"class_name": "ActionListValidationOperator", "action_list": []},\n )\n validation_operator = "ephemeral_validation"\n suite = data_context.get_expectation_suite(suite_name)\n final_batch_kwargs = batch_kwargs or {"dataset": dataset}\n if "datasource" in final_batch_kwargs:\n context.log.warning(\n "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "\n "parameter of the op factory instead."\n )\n final_batch_kwargs["datasource"] = datasource_name\n batch = data_context.get_batch(final_batch_kwargs, suite)\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = data_context.run_validation_operator(\n validation_operator, 
assets_to_validate=[batch], run_id=run_id\n )\n res = convert_to_json_serializable(results.list_validation_results())[0]\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = (\n validation_results_page_renderer.render_validation_operator_result(results)\n )\n md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=res["success"],\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(res)\n\n return _ge_validation_fn
\n\n\ndef ge_validation_op_factory_v3(\n name,\n datasource_name,\n data_connector_name,\n data_asset_name,\n suite_name,\n batch_identifiers: dict,\n input_dagster_type=DataFrame,\n runtime_method_type="batch_data",\n extra_kwargs=None,\n):\n """Generates ops for interacting with GE (v3 API).\n\n Args:\n name (str): the name of the op\n datasource_name (str): the name of your DataSource, see your great_expectations.yml\n data_connector_name (str): the name of the data connector for this datasource. This should\n point to a RuntimeDataConnector. For information on how to set this up, see:\n https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/how_to_create_a_batch_of_data_from_an_in_memory_spark_or_pandas_dataframe\n data_asset_name (str): the name of the data asset that this op will be validating.\n suite_name (str): the name of your expectation suite, see your great_expectations.yml\n batch_identifier_fn (dict): A dicitonary of batch identifiers to uniquely identify this\n batch of data. To learn more about batch identifiers, see:\n https://docs.greatexpectations.io/docs/reference/datasources#batches.\n input_dagster_type (DagsterType): the Dagster type used to type check the input to the op.\n Defaults to `dagster_pandas.DataFrame`.\n runtime_method_type (str): how GE should interperet the op input. One of ("batch_data",\n "path", "query"). Defaults to "batch_data", which will interperet the input as an\n in-memory object.\n extra_kwargs (Optional[dict]): adds extra kwargs to the invocation of `ge_data_context`'s\n `get_validator` method. 
If not set, input will be:\n {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": data_asset_name,\n "runtime_parameters": {\n "<runtime_method_type>": <op input>\n },\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n }\n\n Returns:\n An op that takes in a set of data and yields both an expectation with relevant metadata and\n an output with all the metadata (for user processing)\n\n """\n check.str_param(datasource_name, "datasource_name")\n check.str_param(data_connector_name, "data_connector_name")\n check.str_param(suite_name, "suite_name")\n\n _extra_kwargs: Dict[Any, Any] = check.opt_dict_param(extra_kwargs, "extra_kwargs")\n\n @op(\n name=name,\n ins={"dataset": In(input_dagster_type)},\n out=Out(\n dict,\n description="""\n This op yields an ExpectationResult with a structured dict of metadata from\n the GE suite, as well as the full result in case a user wants to process it differently.\n The structured dict contains both summary stats from the suite as well as expectation by\n expectation results/details.\n """,\n ),\n required_resource_keys={"ge_data_context"},\n tags={"kind": "ge"},\n )\n def _ge_validation_fn(context: OpExecutionContext, dataset):\n data_context = context.resources.ge_data_context\n\n validator_kwargs = {\n "datasource_name": datasource_name,\n "data_connector_name": data_connector_name,\n "data_asset_name": datasource_name or data_asset_name,\n "runtime_parameters": {runtime_method_type: dataset},\n "batch_identifiers": batch_identifiers,\n "expectation_suite_name": suite_name,\n **_extra_kwargs,\n }\n validator = data_context.get_validator(**validator_kwargs)\n\n run_id = {\n "run_name": datasource_name + " run",\n "run_time": datetime.datetime.utcnow(),\n }\n results = validator.validate(run_id=run_id)\n\n validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)\n rendered_document_content_list = 
validation_results_page_renderer.render(\n validation_results=results\n )\n md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))\n\n yield ExpectationResult(\n success=bool(results["success"]),\n metadata={"Expectation Results": MetadataValue.md(md_str)},\n )\n yield Output(results.to_json_dict())\n\n return _ge_validation_fn\n
", "current_page_name": "_modules/dagster_ge/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ge.factory"}}, "dagster_github": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_github.resources

\nimport time\nfrom datetime import datetime\nfrom typing import Optional\n\nimport jwt\nimport requests\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\n\ndef to_seconds(dt):\n    return (dt - datetime(1970, 1, 1)).total_seconds()\n\n\nclass GithubClient:\n    def __init__(\n        self, client, app_id, app_private_rsa_key, default_installation_id, hostname=None\n    ) -> None:\n        self.client = client\n        self.app_private_rsa_key = app_private_rsa_key\n        self.app_id = app_id\n        self.default_installation_id = default_installation_id\n        self.installation_tokens = {}\n        self.app_token = {}\n        self.hostname = hostname\n\n    def __set_app_token(self):\n        # from https://developer.github.com/apps/building-github-apps/authenticating-with-github-apps/\n        # needing to self-sign a JWT\n        now = int(time.time())\n        # JWT expiration time (10 minute maximum)\n        expires = now + (10 * 60)\n        encoded_token = jwt.encode(\n            {\n                # issued at time\n                "iat": now,\n                # JWT expiration time\n                "exp": expires,\n                # GitHub App's identifier\n                "iss": self.app_id,\n            },\n            self.app_private_rsa_key,\n            algorithm="RS256",\n        )\n        self.app_token = {\n            "value": encoded_token,\n            "expires": expires,\n        }\n\n    def __check_app_token(self):\n        if ("expires" not in self.app_token) or (\n            self.app_token["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_app_token()\n\n    def get_installations(self, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] 
= "application/vnd.github.machine-man-preview+json"\n        request = self.client.get(\n            (\n                "https://api.github.com/app/installations"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/v3/app/installations"\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def __set_installation_token(self, installation_id, headers=None):\n        if headers is None:\n            headers = {}\n        self.__check_app_token()\n        headers["Authorization"] = "Bearer {}".format(self.app_token["value"])\n        headers["Accept"] = "application/vnd.github.machine-man-preview+json"\n        request = requests.post(\n            (\n                f"https://api.github.com/app/installations/{installation_id}/access_tokens"\n                if self.hostname is None\n                else "https://{}/api/v3/app/installations/{}/access_tokens".format(\n                    self.hostname, installation_id\n                )\n            ),\n            headers=headers,\n        )\n        request.raise_for_status()\n        auth = request.json()\n        self.installation_tokens[installation_id] = {\n            "value": auth["token"],\n            "expires": to_seconds(datetime.strptime(auth["expires_at"], "%Y-%m-%dT%H:%M:%SZ")),\n        }\n\n    def __check_installation_tokens(self, installation_id):\n        if (installation_id not in self.installation_tokens) or (\n            self.installation_tokens[installation_id]["expires"] < (int(time.time()) + 60)\n        ):\n            self.__set_installation_token(installation_id)\n\n    def execute(self, query, variables, headers=None, installation_id=None):\n        if headers is None:\n            headers = {}\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        self.__check_installation_tokens(installation_id)\n        
headers["Authorization"] = "token {}".format(\n            self.installation_tokens[installation_id]["value"]\n        )\n        request = requests.post(\n            (\n                "https://api.github.com/graphql"\n                if self.hostname is None\n                else f"https://{self.hostname}/api/graphql"\n            ),\n            json={"query": query, "variables": variables},\n            headers=headers,\n        )\n        request.raise_for_status()\n        return request.json()\n\n    def create_issue(self, repo_name, repo_owner, title, body, installation_id=None):\n        if installation_id is None:\n            installation_id = self.default_installation_id\n        res = self.execute(\n            query="""\n            query get_repo_id($repo_name: String!, $repo_owner: String!) {\n                repository(name: $repo_name, owner: $repo_owner) {\n                    id\n                }\n            }\n            """,\n            variables={"repo_name": repo_name, "repo_owner": repo_owner},\n            installation_id=installation_id,\n        )\n\n        return self.execute(\n            query="""\n                mutation CreateIssue($id: ID!, $title: String!, $body: String!) {\n                createIssue(input: {\n                    repositoryId: $id,\n                    title: $title,\n                    body: $body\n                }) {\n                    clientMutationId,\n                    issue {\n                        body\n                        title\n                        url\n                    }\n                }\n                }\n            """,\n            variables={\n                "id": res["data"]["repository"]["id"],\n                "title": title,\n                "body": body,\n            },\n            installation_id=installation_id,\n        )\n\n\n
[docs]class GithubResource(ConfigurableResource):\n github_app_id: int = Field(\n description="Github Application ID, for more info see https://developer.github.com/apps/",\n )\n github_app_private_rsa_key: str = Field(\n description=(\n "Github Application Private RSA key text, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_installation_id: Optional[int] = Field(\n default=None,\n description=(\n "Github Application Installation ID, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n github_hostname: Optional[str] = Field(\n default=None,\n description=(\n "Github hostname. Defaults to `api.github.com`, for more info see"\n " https://developer.github.com/apps/"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> GithubClient:\n return GithubClient(\n client=requests.Session(),\n app_id=self.github_app_id,\n app_private_rsa_key=self.github_app_private_rsa_key,\n default_installation_id=self.github_installation_id,\n hostname=self.github_hostname,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=GithubResource.to_config_schema(),\n description="This resource is for connecting to Github",\n)\ndef github_resource(context) -> GithubClient:\n return GithubResource(**context.resource_config).get_client()
\n
", "current_page_name": "_modules/dagster_github/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_github.resources"}}, "dagster_graphql": {"client": {"client": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.client

\nfrom itertools import chain\nfrom typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nimport requests.exceptions\nfrom dagster import DagsterRunStatus\nfrom dagster._annotations import experimental, public\nfrom dagster._core.definitions.run_config import RunConfig, convert_config_input\nfrom dagster._core.definitions.utils import validate_tags\nfrom gql import Client, gql\nfrom gql.transport import Transport\nfrom gql.transport.requests import RequestsHTTPTransport\n\nfrom .client_queries import (\n    CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY,\n    CLIENT_SUBMIT_PIPELINE_RUN_MUTATION,\n    GET_PIPELINE_RUN_STATUS_QUERY,\n    RELOAD_REPOSITORY_LOCATION_MUTATION,\n    SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n    TERMINATE_RUN_JOB_MUTATION,\n)\nfrom .utils import (\n    DagsterGraphQLClientError,\n    InvalidOutputErrorInfo,\n    JobInfo,\n    ReloadRepositoryLocationInfo,\n    ReloadRepositoryLocationStatus,\n    ShutdownRepositoryLocationInfo,\n    ShutdownRepositoryLocationStatus,\n)\n\n\n
[docs]@experimental\nclass DagsterGraphQLClient:\n """Official Dagster Python Client for GraphQL.\n\n Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server\n\n As of now, all operations on this client are synchronous.\n\n Intended usage:\n\n .. code-block:: python\n\n client = DagsterGraphQLClient("localhost", port_number=3000)\n status = client.get_run_status(**SOME_RUN_ID**)\n\n Args:\n hostname (str): Hostname for the Dagster GraphQL API, like `localhost` or\n `dagster.YOUR_ORG_HERE`.\n port_number (Optional[int]): Port number to connect to on the host.\n Defaults to None.\n transport (Optional[Transport], optional): A custom transport to use to connect to the\n GraphQL API with (e.g. for custom auth). Defaults to None.\n use_https (bool, optional): Whether to use https in the URL connection string for the\n GraphQL API. Defaults to False.\n timeout (int): Number of seconds before requests should time out. Defaults to 60.\n headers (Optional[Dict[str, str]]): Additional headers to include in the request. 
To use\n this client in Dagster Cloud, set the "Dagster-Cloud-Api-Token" header to a user token\n generated in the Dagster Cloud UI.\n\n Raises:\n :py:class:`~requests.exceptions.ConnectionError`: if the client cannot connect to the host.\n """\n\n def __init__(\n self,\n hostname: str,\n port_number: Optional[int] = None,\n transport: Optional[Transport] = None,\n use_https: bool = False,\n timeout: int = 300,\n headers: Optional[Dict[str, str]] = None,\n ):\n self._hostname = check.str_param(hostname, "hostname")\n self._port_number = check.opt_int_param(port_number, "port_number")\n self._use_https = check.bool_param(use_https, "use_https")\n\n self._url = (\n ("https://" if self._use_https else "http://")\n + (f"{self._hostname}:{self._port_number}" if self._port_number else self._hostname)\n + "/graphql"\n )\n\n self._transport = check.opt_inst_param(\n transport,\n "transport",\n Transport,\n default=RequestsHTTPTransport(\n url=self._url, use_json=True, timeout=timeout, headers=headers\n ),\n )\n try:\n self._client = Client(transport=self._transport, fetch_schema_from_transport=True)\n except requests.exceptions.ConnectionError as exc:\n raise DagsterGraphQLClientError(\n f"Error when connecting to url {self._url}. 
"\n + f"Did you specify hostname: {self._hostname} "\n + (f"and port_number: {self._port_number} " if self._port_number else "")\n + "correctly?"\n ) from exc\n\n def _execute(self, query: str, variables: Optional[Dict[str, Any]] = None):\n try:\n return self._client.execute(gql(query), variable_values=variables)\n except Exception as exc: # catch generic Exception from the gql client\n raise DagsterGraphQLClientError(\n f"Exception occured during execution of query \\n{query}\\n with variables"\n f" \\n{variables}\\n"\n ) from exc\n\n def _get_repo_locations_and_names_with_pipeline(self, job_name: str) -> List[JobInfo]:\n res_data = self._execute(CLIENT_GET_REPO_LOCATIONS_NAMES_AND_PIPELINES_QUERY)\n query_res = res_data["repositoriesOrError"]\n repo_connection_status = query_res["__typename"]\n if repo_connection_status == "RepositoryConnection":\n valid_nodes: Iterable[JobInfo] = chain(*map(JobInfo.from_node, query_res["nodes"]))\n return [info for info in valid_nodes if info.job_name == job_name]\n else:\n raise DagsterGraphQLClientError(repo_connection_status, query_res["message"])\n\n def _core_submit_execution(\n self,\n pipeline_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Union[RunConfig, Mapping[str, Any]]] = None,\n mode: str = "default",\n preset: Optional[str] = None,\n tags: Optional[Mapping[str, str]] = None,\n op_selection: Optional[Sequence[str]] = None,\n is_using_job_op_graph_apis: Optional[bool] = False,\n ):\n check.opt_str_param(repository_location_name, "repository_location_name")\n check.opt_str_param(repository_name, "repository_name")\n check.str_param(pipeline_name, "pipeline_name")\n check.opt_str_param(mode, "mode")\n check.opt_str_param(preset, "preset")\n run_config = check.opt_mapping_param(convert_config_input(run_config), "run_config")\n\n # The following invariant will never fail when a job is executed\n check.invariant(\n (mode is not None and 
run_config is not None) or preset is not None,\n "Either a mode and run_config or a preset must be specified in order to "\n f"submit the pipeline {pipeline_name} for execution",\n )\n tags = validate_tags(tags)\n\n pipeline_or_job = "Job" if is_using_job_op_graph_apis else "Pipeline"\n\n if not repository_location_name or not repository_name:\n job_info_lst = self._get_repo_locations_and_names_with_pipeline(pipeline_name)\n if len(job_info_lst) == 0:\n raise DagsterGraphQLClientError(\n f"{pipeline_or_job}NotFoundError",\n f"No {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the name"\n f" `{pipeline_name}` exist",\n )\n elif len(job_info_lst) == 1:\n job_info = job_info_lst[0]\n repository_location_name = job_info.repository_location_name\n repository_name = job_info.repository_name\n else:\n raise DagsterGraphQLClientError(\n "Must specify repository_location_name and repository_name since there are"\n f" multiple {'jobs' if is_using_job_op_graph_apis else 'pipelines'} with the"\n f" name {pipeline_name}.\\n\\tchoose one of: {job_info_lst}"\n )\n\n variables: Dict[str, Any] = {\n "executionParams": {\n "selector": {\n "repositoryLocationName": repository_location_name,\n "repositoryName": repository_name,\n "pipelineName": pipeline_name,\n "solidSelection": op_selection,\n }\n }\n }\n if preset is not None:\n variables["executionParams"]["preset"] = preset\n if mode is not None and run_config is not None:\n variables["executionParams"] = {\n **variables["executionParams"],\n "runConfigData": run_config,\n "mode": mode,\n "executionMetadata": (\n {"tags": [{"key": k, "value": v} for k, v in tags.items()]} if tags else {}\n ),\n }\n\n res_data: Dict[str, Any] = self._execute(CLIENT_SUBMIT_PIPELINE_RUN_MUTATION, variables)\n query_result = res_data["launchPipelineExecution"]\n query_result_type = query_result["__typename"]\n if (\n query_result_type == "LaunchRunSuccess"\n or query_result_type == "LaunchPipelineRunSuccess"\n ):\n return 
query_result["run"]["runId"]\n elif query_result_type == "InvalidStepError":\n raise DagsterGraphQLClientError(query_result_type, query_result["invalidStepKey"])\n elif query_result_type == "InvalidOutputError":\n error_info = InvalidOutputErrorInfo(\n step_key=query_result["stepKey"],\n invalid_output_name=query_result["invalidOutputName"],\n )\n raise DagsterGraphQLClientError(query_result_type, body=error_info)\n elif (\n query_result_type == "RunConfigValidationInvalid"\n or query_result_type == "PipelineConfigValidationInvalid"\n ):\n raise DagsterGraphQLClientError(query_result_type, query_result["errors"])\n else:\n # query_result_type is a ConflictingExecutionParamsError, a PresetNotFoundError\n # a PipelineNotFoundError, a RunConflict, or a PythonError\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])\n\n
[docs] @public\n def submit_job_execution(\n self,\n job_name: str,\n repository_location_name: Optional[str] = None,\n repository_name: Optional[str] = None,\n run_config: Optional[Dict[str, Any]] = None,\n tags: Optional[Dict[str, Any]] = None,\n op_selection: Optional[Sequence[str]] = None,\n ) -> str:\n """Submits a job with attached configuration for execution.\n\n Args:\n job_name (str): The job's name\n repository_location_name (Optional[str]): The name of the repository location where\n the job is located. If omitted, the client will try to infer the repository location\n from the available options on the Dagster deployment. Defaults to None.\n repository_name (Optional[str]): The name of the repository where the job is located.\n If omitted, the client will try to infer the repository from the available options\n on the Dagster deployment. Defaults to None.\n run_config (Optional[Dict[str, Any]]): This is the run config to execute the job with.\n Note that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\n an arbitrary object for run config. However, it must conform to the constraints of the config\n schema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\n JobConfigValidationInvalid. 
Defaults to None.\n tags (Optional[Dict[str, Any]]): A set of tags to add to the job execution.\n\n Raises:\n DagsterGraphQLClientError("InvalidStepError", invalid_step_key): the job has an invalid step\n DagsterGraphQLClientError("InvalidOutputError", body=error_object): some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.\n DagsterGraphQLClientError("RunConflict", message): a `DagsterRunConflict` occured during execution.\n This indicates that a conflicting job run already exists in run storage.\n DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key): the run_config is not in the expected format\n for the job\n DagsterGraphQLClientError("JobNotFoundError", message): the requested job does not exist\n DagsterGraphQLClientError("PythonError", message): an internal framework error occurred\n\n Returns:\n str: run id of the submitted pipeline run\n """\n return self._core_submit_execution(\n pipeline_name=job_name,\n repository_location_name=repository_location_name,\n repository_name=repository_name,\n run_config=run_config,\n mode="default",\n preset=None,\n tags=tags,\n op_selection=op_selection,\n is_using_job_op_graph_apis=True,\n )
\n\n
[docs] @public\n def get_run_status(self, run_id: str) -> DagsterRunStatus:\n """Get the status of a given Pipeline Run.\n\n Args:\n run_id (str): run id of the requested pipeline run.\n\n Raises:\n DagsterGraphQLClientError("PipelineNotFoundError", message): if the requested run id is not found\n DagsterGraphQLClientError("PythonError", message): on internal framework errors\n\n Returns:\n DagsterRunStatus: returns a status Enum describing the state of the requested pipeline run\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n GET_PIPELINE_RUN_STATUS_QUERY, {"runId": run_id}\n )\n query_result: Dict[str, Any] = res_data["pipelineRunOrError"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "PipelineRun" or query_result_type == "Run":\n return DagsterRunStatus(query_result["status"])\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n\n
[docs] @public\n def reload_repository_location(\n self, repository_location_name: str\n ) -> ReloadRepositoryLocationInfo:\n """Reloads a Dagster Repository Location, which reloads all repositories in that repository location.\n\n This is useful in a variety of contexts, including refreshing the Dagster UI without restarting\n the server.\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ReloadRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n RELOAD_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["reloadRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "WorkspaceLocationEntry":\n location_or_error_type = query_result["locationOrLoadError"]["__typename"]\n if location_or_error_type == "RepositoryLocation":\n return ReloadRepositoryLocationInfo(status=ReloadRepositoryLocationStatus.SUCCESS)\n else:\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type="PythonError",\n message=query_result["locationOrLoadError"]["message"],\n )\n else:\n # query_result_type is either ReloadNotSupported or RepositoryLocationNotFound\n return ReloadRepositoryLocationInfo(\n status=ReloadRepositoryLocationStatus.FAILURE,\n failure_type=query_result_type,\n message=query_result["message"],\n )
\n\n
[docs] @public\n def shutdown_repository_location(\n self, repository_location_name: str\n ) -> ShutdownRepositoryLocationInfo:\n """Shuts down the server that is serving metadata for the provided repository location.\n\n This is primarily useful when you want the server to be restarted by the compute environment\n in which it is running (for example, in Kubernetes, the pod in which the server is running\n will automatically restart when the server is shut down, and the repository metadata will\n be reloaded)\n\n Args:\n repository_location_name (str): The name of the repository location\n\n Returns:\n ShutdownRepositoryLocationInfo: Object with information about the result of the reload request\n """\n check.str_param(repository_location_name, "repository_location_name")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n SHUTDOWN_REPOSITORY_LOCATION_MUTATION,\n {"repositoryLocationName": repository_location_name},\n )\n\n query_result: Dict[str, Any] = res_data["shutdownRepositoryLocation"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "ShutdownRepositoryLocationSuccess":\n return ShutdownRepositoryLocationInfo(status=ShutdownRepositoryLocationStatus.SUCCESS)\n elif (\n query_result_type == "RepositoryLocationNotFound" or query_result_type == "PythonError"\n ):\n return ShutdownRepositoryLocationInfo(\n status=ShutdownRepositoryLocationStatus.FAILURE,\n message=query_result["message"],\n )\n else:\n raise Exception(f"Unexpected query result type {query_result_type}")
\n\n def terminate_run(self, run_id: str):\n """Terminates a pipeline run. This method it is useful when you would like to stop a pipeline run\n based on a external event.\n\n Args:\n run_id (str): The run id of the pipeline run to terminate\n """\n check.str_param(run_id, "run_id")\n\n res_data: Dict[str, Dict[str, Any]] = self._execute(\n TERMINATE_RUN_JOB_MUTATION, {"runId": run_id}\n )\n\n query_result: Dict[str, Any] = res_data["terminateRun"]\n query_result_type: str = query_result["__typename"]\n if query_result_type == "TerminateRunSuccess":\n return\n\n elif query_result_type == "RunNotFoundError":\n raise DagsterGraphQLClientError("RunNotFoundError", f"Run Id {run_id} not found")\n else:\n raise DagsterGraphQLClientError(query_result_type, query_result["message"])
\n
", "current_page_name": "_modules/dagster_graphql/client/client", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.client"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_graphql.client.utils

\nfrom enum import Enum\nfrom typing import Any, Dict, List, NamedTuple, Optional\n\n\n
[docs]class DagsterGraphQLClientError(Exception):\n def __init__(self, *args, body=None):\n super().__init__(*args)\n self.body = body
\n\n\n
[docs]class ReloadRepositoryLocationStatus(Enum):\n """This enum describes the status of a GraphQL mutation to reload a Dagster repository location.\n\n Args:\n Enum (str): can be either `ReloadRepositoryLocationStatus.SUCCESS`\n or `ReloadRepositoryLocationStatus.FAILURE`.\n """\n\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"
\n\n\nclass ShutdownRepositoryLocationStatus(Enum):\n SUCCESS = "SUCCESS"\n FAILURE = "FAILURE"\n\n\n
[docs]class ReloadRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of reloading\n a Dagster repository location with a GraphQL mutation.\n\n Args:\n status (ReloadRepositoryLocationStatus): The status of the reload repository location mutation\n failure_type: (Optional[str], optional): the failure type if `status == ReloadRepositoryLocationStatus.FAILURE`.\n Can be one of `ReloadNotSupported`, `RepositoryLocationNotFound`, or `RepositoryLocationLoadFailure`. Defaults to None.\n message (Optional[str], optional): the failure message/reason if\n `status == ReloadRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ReloadRepositoryLocationStatus\n failure_type: Optional[str] = None\n message: Optional[str] = None
\n\n\nclass ShutdownRepositoryLocationInfo(NamedTuple):\n """This class gives information about the result of shutting down the server for\n a Dagster repository location using a GraphQL mutation.\n\n Args:\n status (ShutdownRepositoryLocationStatus) Whether the shutdown succeeded or failed.\n message (Optional[str], optional): the failure message/reason if\n `status == ShutdownRepositoryLocationStatus.FAILURE`. Defaults to None.\n """\n\n status: ShutdownRepositoryLocationStatus\n message: Optional[str] = None\n\n\nclass JobInfo(NamedTuple):\n repository_location_name: str\n repository_name: str\n job_name: str\n\n @staticmethod\n def from_node(node: Dict[str, Any]) -> List["JobInfo"]:\n repo_name = node["name"]\n repo_location_name = node["location"]["name"]\n return [\n JobInfo(\n repository_location_name=repo_location_name,\n repository_name=repo_name,\n job_name=job["name"],\n )\n for job in node["pipelines"]\n ]\n\n\n
[docs]class InvalidOutputErrorInfo(NamedTuple):\n """This class gives information about an InvalidOutputError from submitting a pipeline for execution\n from GraphQL.\n\n Args:\n step_key (str): key of the step that failed\n invalid_output_name (str): the name of the invalid output from the given step\n """\n\n step_key: str\n invalid_output_name: str
\n
", "current_page_name": "_modules/dagster_graphql/client/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_graphql.client.utils"}}}, "dagster_k8s": {"executor": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.executor

\nfrom typing import Iterator, List, Optional, cast\n\nimport kubernetes.config\nfrom dagster import (\n    Field,\n    IntSource,\n    Noneable,\n    StringSource,\n    _check as check,\n    executor,\n)\nfrom dagster._core.definitions.executor_definition import multiple_process_executor_requirements\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.events import DagsterEvent, EngineEventData\nfrom dagster._core.execution.retries import RetryMode, get_retries_config\nfrom dagster._core.execution.tags import get_tag_concurrency_limits_config\nfrom dagster._core.executor.base import Executor\nfrom dagster._core.executor.init import InitExecutorContext\nfrom dagster._core.executor.step_delegating import (\n    CheckStepHealthResult,\n    StepDelegatingExecutor,\n    StepHandler,\n    StepHandlerContext,\n)\nfrom dagster._utils.merger import merge_dicts\n\nfrom dagster_k8s.launcher import K8sRunLauncher\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import (\n    USER_DEFINED_K8S_CONFIG_SCHEMA,\n    DagsterK8sJobConfig,\n    construct_dagster_k8s_job,\n    get_k8s_job_name,\n    get_user_defined_k8s_config,\n)\n\n_K8S_EXECUTOR_CONFIG_SCHEMA = merge_dicts(\n    DagsterK8sJobConfig.config_type_job(),\n    {\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            description="""Whether or not the executor is running within a k8s cluster already. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.\n            If ``True``, we assume the executor is running within the target cluster and load config\n            using ``kubernetes.config.load_incluster_config``. 
Otherwise, we will use the k8s config\n            specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n            back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            description="""Path to a kubeconfig file to use, if not using default kubeconfig. If\n            the job is using the `K8sRunLauncher`, the default value of this parameter will be\n            the same as the corresponding value on the run launcher.""",\n        ),\n        "job_namespace": Field(StringSource, is_required=False),\n        "retries": get_retries_config(),\n        "max_concurrent": Field(\n            IntSource,\n            is_required=False,\n            description=(\n                "Limit on the number of pods that will run concurrently within the scope "\n                "of a Dagster run. Note that this limit is per run, not global."\n            ),\n        ),\n        "tag_concurrency_limits": get_tag_concurrency_limits_config(),\n        "step_k8s_config": Field(\n            USER_DEFINED_K8S_CONFIG_SCHEMA,\n            is_required=False,\n            description="Raw Kubernetes configuration for each step launched by the executor.",\n        ),\n    },\n)\n\n\n
[docs]@executor(\n name="k8s",\n config_schema=_K8S_EXECUTOR_CONFIG_SCHEMA,\n requirements=multiple_process_executor_requirements(),\n)\ndef k8s_job_executor(init_context: InitExecutorContext) -> Executor:\n """Executor which launches steps as Kubernetes Jobs.\n\n To use the `k8s_job_executor`, set it as the `executor_def` when defining a job:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_executor_mode_def.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n Then you can configure the executor with run config as follows:\n\n .. code-block:: YAML\n\n execution:\n config:\n job_namespace: 'some-namespace'\n image_pull_policy: ...\n image_pull_secrets: ...\n service_account_name: ...\n env_config_maps: ...\n env_secrets: ...\n env_vars: ...\n job_image: ... # leave out if using userDeployments\n max_concurrent: ...\n\n `max_concurrent` limits the number of pods that will execute concurrently for one run. By default\n there is no limit- it will maximally parallel as allowed by the DAG. Note that this is not a\n global limit.\n\n Configuration set on the Kubernetes Jobs and Pods created by the `K8sRunLauncher` will also be\n set on Kubernetes Jobs and Pods created by the `k8s_job_executor`.\n\n Configuration set using `tags` on a `@job` will only apply to the `run` level. 
For configuration\n to apply at each `step` it must be set using `tags` for each `@op`.\n """\n run_launcher = (\n init_context.instance.run_launcher\n if isinstance(init_context.instance.run_launcher, K8sRunLauncher)\n else None\n )\n\n exc_cfg = init_context.executor_config\n\n k8s_container_context = K8sContainerContext(\n image_pull_policy=exc_cfg.get("image_pull_policy"), # type: ignore\n image_pull_secrets=exc_cfg.get("image_pull_secrets"), # type: ignore\n service_account_name=exc_cfg.get("service_account_name"), # type: ignore\n env_config_maps=exc_cfg.get("env_config_maps"), # type: ignore\n env_secrets=exc_cfg.get("env_secrets"), # type: ignore\n env_vars=exc_cfg.get("env_vars"), # type: ignore\n volume_mounts=exc_cfg.get("volume_mounts"), # type: ignore\n volumes=exc_cfg.get("volumes"), # type: ignore\n labels=exc_cfg.get("labels"), # type: ignore\n namespace=exc_cfg.get("job_namespace"), # type: ignore\n resources=exc_cfg.get("resources"), # type: ignore\n scheduler_name=exc_cfg.get("scheduler_name"), # type: ignore\n # step_k8s_config feeds into the run_k8s_config field because it is merged\n # with any configuration for the run that was set on the run launcher or code location\n run_k8s_config=exc_cfg.get("step_k8s_config"), # type: ignore\n )\n\n if "load_incluster_config" in exc_cfg:\n load_incluster_config = cast(bool, exc_cfg["load_incluster_config"])\n else:\n load_incluster_config = run_launcher.load_incluster_config if run_launcher else True\n\n if "kubeconfig_file" in exc_cfg:\n kubeconfig_file = cast(Optional[str], exc_cfg["kubeconfig_file"])\n else:\n kubeconfig_file = run_launcher.kubeconfig_file if run_launcher else None\n\n return StepDelegatingExecutor(\n K8sStepHandler(\n image=exc_cfg.get("job_image"), # type: ignore\n container_context=k8s_container_context,\n load_incluster_config=load_incluster_config,\n kubeconfig_file=kubeconfig_file,\n ),\n retries=RetryMode.from_config(exc_cfg["retries"]), # type: ignore\n 
max_concurrent=check.opt_int_elem(exc_cfg, "max_concurrent"),\n tag_concurrency_limits=check.opt_list_elem(exc_cfg, "tag_concurrency_limits"),\n should_verify_step=True,\n )
\n\n\nclass K8sStepHandler(StepHandler):\n @property\n def name(self):\n return "K8sStepHandler"\n\n def __init__(\n self,\n image: Optional[str],\n container_context: K8sContainerContext,\n load_incluster_config: bool,\n kubeconfig_file: Optional[str],\n k8s_client_batch_api=None,\n ):\n super().__init__()\n\n self._executor_image = check.opt_str_param(image, "image")\n self._executor_container_context = check.inst_param(\n container_context, "container_context", K8sContainerContext\n )\n\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n def _get_step_key(self, step_handler_context: StepHandlerContext) -> str:\n step_keys_to_execute = cast(\n List[str], step_handler_context.execute_step_args.step_keys_to_execute\n )\n assert len(step_keys_to_execute) == 1, "Launching multiple steps is not currently supported"\n return step_keys_to_execute[0]\n\n def _get_container_context(\n self, step_handler_context: StepHandlerContext\n ) -> K8sContainerContext:\n step_key = self._get_step_key(step_handler_context)\n\n context = K8sContainerContext.create_for_run(\n step_handler_context.dagster_run,\n cast(K8sRunLauncher, step_handler_context.instance.run_launcher),\n include_run_tags=False, # For now don't include job-level dagster-k8s/config tags in step pods\n )\n context = context.merge(self._executor_container_context)\n\n user_defined_k8s_config = get_user_defined_k8s_config(\n step_handler_context.step_tags[step_key]\n )\n return context.merge(K8sContainerContext(run_k8s_config=user_defined_k8s_config.to_dict()))\n\n def _get_k8s_step_job_name(self, step_handler_context: StepHandlerContext):\n step_key 
= self._get_step_key(step_handler_context)\n\n name_key = get_k8s_job_name(\n step_handler_context.execute_step_args.run_id,\n step_key,\n )\n\n if step_handler_context.execute_step_args.known_state:\n retry_state = step_handler_context.execute_step_args.known_state.get_retry_state()\n if retry_state.get_attempt_count(step_key):\n return "dagster-step-%s-%d" % (name_key, retry_state.get_attempt_count(step_key))\n\n return "dagster-step-%s" % (name_key)\n\n def launch_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n pod_name = job_name\n\n container_context = self._get_container_context(step_handler_context)\n\n job_config = container_context.get_k8s_job_config(\n self._executor_image, step_handler_context.instance.run_launcher\n )\n\n args = step_handler_context.execute_step_args.get_command_args(\n skip_serialized_namedtuple=True\n )\n\n if not job_config.job_image:\n job_config = job_config.with_image(\n step_handler_context.execute_step_args.job_origin.repository_origin.container_image\n )\n\n if not job_config.job_image:\n raise Exception("No image included in either executor config or the job")\n\n run = step_handler_context.dagster_run\n labels = {\n "dagster/job": run.job_name,\n "dagster/op": step_key,\n "dagster/run-id": step_handler_context.execute_step_args.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="step_worker",\n user_defined_k8s_config=container_context.get_run_user_defined_k8s_config(),\n labels=labels,\n env_vars=[\n *step_handler_context.execute_step_args.get_command_env(),\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": run.job_name,\n },\n 
{"name": "DAGSTER_RUN_STEP_KEY", "value": step_key},\n *container_context.env,\n ],\n )\n\n yield DagsterEvent.step_worker_starting(\n step_handler_context.get_step_context(step_key),\n message=f'Executing step "{step_key}" in Kubernetes job {job_name}.',\n metadata={\n "Kubernetes Job name": MetadataValue.text(job_name),\n },\n )\n\n namespace = check.not_none(container_context.namespace)\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n\n def check_step_health(self, step_handler_context: StepHandlerContext) -> CheckStepHealthResult:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n\n container_context = self._get_container_context(step_handler_context)\n\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n if status.failed:\n return CheckStepHealthResult.unhealthy(\n reason=f"Discovered failed Kubernetes job {job_name} for step {step_key}.",\n )\n\n return CheckStepHealthResult.healthy()\n\n def terminate_step(self, step_handler_context: StepHandlerContext) -> Iterator[DagsterEvent]:\n step_key = self._get_step_key(step_handler_context)\n\n job_name = self._get_k8s_step_job_name(step_handler_context)\n container_context = self._get_container_context(step_handler_context)\n\n yield DagsterEvent.engine_event(\n step_handler_context.get_step_context(step_key),\n message=f"Deleting Kubernetes job {job_name} for step",\n event_specific_data=EngineEventData(),\n )\n\n self._api_client.delete_job(job_name=job_name, namespace=container_context.namespace)\n
", "current_page_name": "_modules/dagster_k8s/executor", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.executor"}, "launcher": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.launcher

\nimport logging\nimport sys\nfrom typing import Any, Mapping, Optional, Sequence\n\nimport kubernetes\nfrom dagster import (\n    _check as check,\n)\nfrom dagster._cli.api import ExecuteRunArgs\nfrom dagster._core.events import EngineEventData\nfrom dagster._core.launcher import LaunchRunContext, ResumeRunContext, RunLauncher\nfrom dagster._core.launcher.base import CheckRunHealthResult, WorkerStatus\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.storage.tags import DOCKER_IMAGE_TAG\nfrom dagster._grpc.types import ResumeRunArgs\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom dagster._utils.error import serializable_error_info_from_exc_info\n\nfrom .client import DagsterKubernetesClient\nfrom .container_context import K8sContainerContext\nfrom .job import DagsterK8sJobConfig, construct_dagster_k8s_job, get_job_name_from_run_id\n\n\n
[docs]class K8sRunLauncher(RunLauncher, ConfigurableClass):\n """RunLauncher that starts a Kubernetes Job for each Dagster job run.\n\n Encapsulates each run in a separate, isolated invocation of ``dagster-graphql``.\n\n You can configure a Dagster instance to use this RunLauncher by adding a section to your\n ``dagster.yaml`` like the following:\n\n .. code-block:: yaml\n\n run_launcher:\n module: dagster_k8s.launcher\n class: K8sRunLauncher\n config:\n service_account_name: your_service_account\n job_image: my_project/dagster_image:latest\n instance_config_map: dagster-instance\n postgres_password_secret: dagster-postgresql-secret\n\n """\n\n def __init__(\n self,\n service_account_name,\n instance_config_map,\n postgres_password_secret=None,\n dagster_home=None,\n job_image=None,\n image_pull_policy=None,\n image_pull_secrets=None,\n load_incluster_config=True,\n kubeconfig_file=None,\n inst_data: Optional[ConfigurableClassData] = None,\n job_namespace="default",\n env_config_maps=None,\n env_secrets=None,\n env_vars=None,\n k8s_client_batch_api=None,\n volume_mounts=None,\n volumes=None,\n labels=None,\n fail_pod_on_run_failure=None,\n resources=None,\n scheduler_name=None,\n security_context=None,\n run_k8s_config=None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.job_namespace = check.str_param(job_namespace, "job_namespace")\n\n self.load_incluster_config = load_incluster_config\n self.kubeconfig_file = kubeconfig_file\n if load_incluster_config:\n check.invariant(\n kubeconfig_file is None,\n "`kubeconfig_file` is set but `load_incluster_config` is True.",\n )\n kubernetes.config.load_incluster_config()\n else:\n check.opt_str_param(kubeconfig_file, "kubeconfig_file")\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n self._api_client = DagsterKubernetesClient.production_client(\n batch_api_override=k8s_client_batch_api\n )\n\n self._job_config = None\n self._job_image = 
check.opt_str_param(job_image, "job_image")\n self.dagster_home = check.str_param(dagster_home, "dagster_home")\n self._image_pull_policy = check.opt_str_param(\n image_pull_policy, "image_pull_policy", "IfNotPresent"\n )\n self._image_pull_secrets = check.opt_list_param(\n image_pull_secrets, "image_pull_secrets", of_type=dict\n )\n self._service_account_name = check.str_param(service_account_name, "service_account_name")\n self.instance_config_map = check.str_param(instance_config_map, "instance_config_map")\n self.postgres_password_secret = check.opt_str_param(\n postgres_password_secret, "postgres_password_secret"\n )\n self._env_config_maps = check.opt_list_param(\n env_config_maps, "env_config_maps", of_type=str\n )\n self._env_secrets = check.opt_list_param(env_secrets, "env_secrets", of_type=str)\n self._env_vars = check.opt_list_param(env_vars, "env_vars", of_type=str)\n self._volume_mounts = check.opt_list_param(volume_mounts, "volume_mounts")\n self._volumes = check.opt_list_param(volumes, "volumes")\n self._labels: Mapping[str, str] = check.opt_mapping_param(\n labels, "labels", key_type=str, value_type=str\n )\n self._fail_pod_on_run_failure = check.opt_bool_param(\n fail_pod_on_run_failure, "fail_pod_on_run_failure"\n )\n self._resources: Mapping[str, Any] = check.opt_mapping_param(resources, "resources")\n self._scheduler_name = check.opt_str_param(scheduler_name, "scheduler_name")\n self._security_context = check.opt_dict_param(security_context, "security_context")\n self._run_k8s_config = check.opt_dict_param(run_k8s_config, "run_k8s_config")\n super().__init__()\n\n @property\n def job_image(self):\n return self._job_image\n\n @property\n def image_pull_policy(self) -> str:\n return self._image_pull_policy\n\n @property\n def image_pull_secrets(self) -> Sequence[Mapping]:\n return self._image_pull_secrets\n\n @property\n def service_account_name(self) -> str:\n return self._service_account_name\n\n @property\n def env_config_maps(self) -> 
Sequence[str]:\n return self._env_config_maps\n\n @property\n def env_secrets(self) -> Sequence[str]:\n return self._env_secrets\n\n @property\n def volume_mounts(self) -> Sequence:\n return self._volume_mounts\n\n @property\n def volumes(self) -> Sequence:\n return self._volumes\n\n @property\n def resources(self) -> Mapping:\n return self._resources\n\n @property\n def scheduler_name(self) -> Optional[str]:\n return self._scheduler_name\n\n @property\n def security_context(self) -> Mapping[str, Any]:\n return self._security_context\n\n @property\n def env_vars(self) -> Sequence[str]:\n return self._env_vars\n\n @property\n def labels(self) -> Mapping[str, str]:\n return self._labels\n\n @property\n def run_k8s_config(self) -> Mapping[str, str]:\n return self._run_k8s_config\n\n @property\n def fail_pod_on_run_failure(self) -> Optional[bool]:\n return self._fail_pod_on_run_failure\n\n @classmethod\n def config_type(cls):\n """Include all arguments required for DagsterK8sJobConfig along with additional arguments\n needed for the RunLauncher itself.\n """\n return DagsterK8sJobConfig.config_type_run_launcher()\n\n @classmethod\n def from_config_value(cls, inst_data, config_value):\n return cls(inst_data=inst_data, **config_value)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n def get_container_context_for_run(self, dagster_run: DagsterRun) -> K8sContainerContext:\n return K8sContainerContext.create_for_run(dagster_run, self, include_run_tags=True)\n\n def _launch_k8s_job_with_args(\n self, job_name: str, args: Optional[Sequence[str]], run: DagsterRun\n ) -> None:\n container_context = self.get_container_context_for_run(run)\n\n pod_name = job_name\n\n job_origin = check.not_none(run.job_code_origin)\n user_defined_k8s_config = container_context.get_run_user_defined_k8s_config()\n repository_origin = job_origin.repository_origin\n\n job_config = container_context.get_k8s_job_config(\n 
job_image=repository_origin.container_image, run_launcher=self\n )\n job_image = job_config.job_image\n if job_image: # expected to be set\n self._instance.add_run_tags(\n run.run_id,\n {DOCKER_IMAGE_TAG: job_image},\n )\n\n labels = {\n "dagster/job": job_origin.job_name,\n "dagster/run-id": run.run_id,\n }\n if run.external_job_origin:\n labels["dagster/code-location"] = (\n run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=job_config,\n args=args,\n job_name=job_name,\n pod_name=pod_name,\n component="run_worker",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n env_vars=[\n {\n "name": "DAGSTER_RUN_JOB_NAME",\n "value": job_origin.job_name,\n },\n *container_context.env,\n ],\n )\n\n namespace = check.not_none(container_context.namespace)\n\n self._instance.report_engine_event(\n "Creating Kubernetes run worker job",\n run,\n EngineEventData(\n {\n "Kubernetes Job name": job_name,\n "Kubernetes Namespace": namespace,\n "Run ID": run.run_id,\n }\n ),\n cls=self.__class__,\n )\n\n self._api_client.create_namespaced_job_with_retries(body=job, namespace=namespace)\n self._instance.report_engine_event(\n "Kubernetes run worker job created",\n run,\n cls=self.__class__,\n )\n\n def launch_run(self, context: LaunchRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(run.run_id)\n job_origin = check.not_none(run.job_code_origin)\n\n args = ExecuteRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n @property\n def supports_resume_run(self):\n return True\n\n def resume_run(self, context: ResumeRunContext) -> None:\n run = context.dagster_run\n job_name = get_job_name_from_run_id(\n run.run_id, resume_attempt_number=context.resume_attempt_number\n 
)\n job_origin = check.not_none(run.job_code_origin)\n\n args = ResumeRunArgs(\n job_origin=job_origin,\n run_id=run.run_id,\n instance_ref=self._instance.get_ref(),\n set_exit_code_on_failure=self._fail_pod_on_run_failure,\n ).get_command_args()\n\n self._launch_k8s_job_with_args(job_name, args, run)\n\n def terminate(self, run_id):\n check.str_param(run_id, "run_id")\n run = self._instance.get_run_by_id(run_id)\n\n if not run:\n return False\n\n self._instance.report_run_canceling(run)\n\n container_context = self.get_container_context_for_run(run)\n\n job_name = get_job_name_from_run_id(\n run_id, resume_attempt_number=self._instance.count_resume_run_attempts(run.run_id)\n )\n\n try:\n termination_result = self._api_client.delete_job(\n job_name=job_name, namespace=container_context.namespace\n )\n if termination_result:\n self._instance.report_engine_event(\n message="Run was terminated successfully.",\n dagster_run=run,\n cls=self.__class__,\n )\n else:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; delete_job returned {}".format(\n termination_result\n ),\n dagster_run=run,\n cls=self.__class__,\n )\n return termination_result\n except Exception:\n self._instance.report_engine_event(\n message="Run was not terminated successfully; encountered error in delete_job",\n dagster_run=run,\n engine_event_data=EngineEventData.engine_error(\n serializable_error_info_from_exc_info(sys.exc_info())\n ),\n cls=self.__class__,\n )\n\n @property\n def supports_check_run_worker_health(self):\n return True\n\n @property\n def supports_run_worker_crash_recovery(self):\n return True\n\n def get_run_worker_debug_info(self, run: DagsterRun) -> Optional[str]:\n container_context = self.get_container_context_for_run(run)\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, 
resume_attempt_number=resume_attempt_number)\n namespace = container_context.namespace\n user_defined_k8s_config = container_context.get_run_user_defined_k8s_config()\n container_name = user_defined_k8s_config.container_config.get("name", "dagster")\n pod_names = self._api_client.get_pod_names_in_job(job_name, namespace=namespace)\n full_msg = ""\n try:\n pod_debug_info = [\n self._api_client.get_pod_debug_info(\n pod_name, namespace, container_name=container_name\n )\n for pod_name in pod_names\n ]\n full_msg = "\\n".join(pod_debug_info)\n except Exception:\n logging.exception(\n f"Error trying to get debug information for failed k8s job {job_name}"\n )\n if pod_names:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe pod"\n f" {pod_names[0]}`, `kubectl logs {pod_names[0]}`, or `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n else:\n full_msg = (\n full_msg\n + "\\nFor more information about the failure, try running `kubectl describe job"\n f" {job_name}` in your cluster."\n )\n\n return full_msg\n\n def check_run_worker_health(self, run: DagsterRun):\n container_context = self.get_container_context_for_run(run)\n\n if self.supports_run_worker_crash_recovery:\n resume_attempt_number = self._instance.count_resume_run_attempts(run.run_id)\n else:\n resume_attempt_number = None\n\n job_name = get_job_name_from_run_id(run.run_id, resume_attempt_number=resume_attempt_number)\n try:\n status = self._api_client.get_job_status(\n namespace=container_context.namespace,\n job_name=job_name,\n )\n except Exception:\n return CheckRunHealthResult(\n WorkerStatus.UNKNOWN, str(serializable_error_info_from_exc_info(sys.exc_info()))\n )\n\n inactive_job_with_finished_pods = bool(\n (not status.active) and (status.failed or status.succeeded)\n )\n\n # If the run is in a non-terminal (and non-STARTING) state but the k8s job is not active,\n # something went wrong\n if (\n run.status in (DagsterRunStatus.STARTED, 
DagsterRunStatus.CANCELING)\n and inactive_job_with_finished_pods\n ):\n return CheckRunHealthResult(\n WorkerStatus.FAILED, "Run has not completed but K8s job has no active pods"\n )\n\n if status.failed:\n return CheckRunHealthResult(WorkerStatus.FAILED, "K8s job failed")\n if status.succeeded:\n return CheckRunHealthResult(WorkerStatus.SUCCESS)\n return CheckRunHealthResult(WorkerStatus.RUNNING)
\n
", "current_page_name": "_modules/dagster_k8s/launcher", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.launcher"}, "ops": {"k8s_job_op": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_k8s.ops.k8s_job_op

\nimport time\nfrom typing import Any, Dict, List, Optional\n\nimport kubernetes.config\nimport kubernetes.watch\nfrom dagster import Field, In, Noneable, Nothing, OpExecutionContext, Permissive, StringSource, op\nfrom dagster._annotations import experimental\nfrom dagster._utils.merger import merge_dicts\n\nfrom ..client import DEFAULT_JOB_POD_COUNT, DagsterKubernetesClient\nfrom ..container_context import K8sContainerContext\nfrom ..job import DagsterK8sJobConfig, construct_dagster_k8s_job, get_k8s_job_name\nfrom ..launcher import K8sRunLauncher\n\nK8S_JOB_OP_CONFIG = merge_dicts(\n    DagsterK8sJobConfig.config_type_container(),\n    {\n        "image": Field(\n            StringSource,\n            is_required=True,\n            description="The image in which to launch the k8s job.",\n        ),\n        "command": Field(\n            [str],\n            is_required=False,\n            description="The command to run in the container within the launched k8s job.",\n        ),\n        "args": Field(\n            [str],\n            is_required=False,\n            description="The args for the command for the container.",\n        ),\n        "namespace": Field(StringSource, is_required=False),\n        "load_incluster_config": Field(\n            bool,\n            is_required=False,\n            default_value=True,\n            description="""Set this value if you are running the launcher\n            within a k8s cluster. If ``True``, we assume the launcher is running within the target\n            cluster and load config using ``kubernetes.config.load_incluster_config``. 
Otherwise,\n            we will use the k8s config specified in ``kubeconfig_file`` (using\n            ``kubernetes.config.load_kube_config``) or fall back to the default kubeconfig.""",\n        ),\n        "kubeconfig_file": Field(\n            Noneable(str),\n            is_required=False,\n            default_value=None,\n            description=(\n                "The kubeconfig file from which to load config. Defaults to using the default"\n                " kubeconfig."\n            ),\n        ),\n        "timeout": Field(\n            int,\n            is_required=False,\n            description="How long to wait for the job to succeed before raising an exception",\n        ),\n        "container_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's main container"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_template_spec_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "pod_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s pod's pod spec"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_metadata": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                
"Raw k8s config for the k8s job's metadata"\n                " (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n        "job_spec_config": Field(\n            Permissive(),\n            is_required=False,\n            description=(\n                "Raw k8s config for the k8s job's job spec"\n                " (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch)."\n                " Keys can either snake_case or camelCase."\n            ),\n        ),\n    },\n)\n\n\n
[docs]@experimental\ndef execute_k8s_job(\n context: OpExecutionContext,\n image: str,\n command: Optional[List[str]] = None,\n args: Optional[List[str]] = None,\n namespace: Optional[str] = None,\n image_pull_policy: Optional[str] = None,\n image_pull_secrets: Optional[List[Dict[str, str]]] = None,\n service_account_name: Optional[str] = None,\n env_config_maps: Optional[List[str]] = None,\n env_secrets: Optional[List[str]] = None,\n env_vars: Optional[List[str]] = None,\n volume_mounts: Optional[List[Dict[str, Any]]] = None,\n volumes: Optional[List[Dict[str, Any]]] = None,\n labels: Optional[Dict[str, str]] = None,\n resources: Optional[Dict[str, Any]] = None,\n scheduler_name: Optional[str] = None,\n load_incluster_config: bool = True,\n kubeconfig_file: Optional[str] = None,\n timeout: Optional[int] = None,\n container_config: Optional[Dict[str, Any]] = None,\n pod_template_spec_metadata: Optional[Dict[str, Any]] = None,\n pod_spec_config: Optional[Dict[str, Any]] = None,\n job_metadata: Optional[Dict[str, Any]] = None,\n job_spec_config: Optional[Dict[str, Any]] = None,\n k8s_job_name: Optional[str] = None,\n):\n """This function is a utility for executing a Kubernetes job from within a Dagster op.\n\n Args:\n image (str): The image in which to launch the k8s job.\n command (Optional[List[str]]): The command to run in the container within the launched\n k8s job. Default: None.\n args (Optional[List[str]]): The args for the command for the container. Default: None.\n namespace (Optional[str]): Override the kubernetes namespace in which to run the k8s job.\n Default: None.\n image_pull_policy (Optional[str]): Allows the image pull policy to be overridden, e.g. to\n facilitate local testing with `kind <https://kind.sigs.k8s.io/>`_. Default:\n ``"Always"``. 
See:\n https://kubernetes.io/docs/concepts/containers/images/#updating-images.\n image_pull_secrets (Optional[List[Dict[str, str]]]): Optionally, a list of dicts, each of\n which corresponds to a Kubernetes ``LocalObjectReference`` (e.g.,\n ``{'name': 'myRegistryName'}``). This allows you to specify the ```imagePullSecrets`` on\n a pod basis. Typically, these will be provided through the service account, when needed,\n and you will not need to pass this argument. See:\n https://kubernetes.io/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod\n and https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#podspec-v1-core\n service_account_name (Optional[str]): The name of the Kubernetes service account under which\n to run the Job. Defaults to "default" env_config_maps (Optional[List[str]]): A list of custom ConfigMapEnvSource names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container\n env_secrets (Optional[List[str]]): A list of custom Secret names from which to\n draw environment variables (using ``envFrom``) for the Job. Default: ``[]``. See:\n https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n env_vars (Optional[List[str]]): A list of environment variables to inject into the Job.\n Default: ``[]``. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables\n volume_mounts (Optional[List[Permissive]]): A list of volume mounts to include in the job's\n container. Default: ``[]``. 
See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core\n volumes (Optional[List[Permissive]]): A list of volumes to include in the Job's Pod. Default: ``[]``. See:\n https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core\n labels (Optional[Dict[str, str]]): Additional labels that should be included in the Job's Pod. See:\n https://kubernetes.io/docs/concepts/overview/working-with-objects/labels\n resources (Optional[Dict[str, Any]]) Compute resource requirements for the container. See:\n https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/\n scheduler_name (Optional[str]): Use a custom Kubernetes scheduler for launched Pods. See:\n https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/\n load_incluster_config (bool): Whether the op is running within a k8s cluster. If ``True``,\n we assume the launcher is running within the target cluster and load config using\n ``kubernetes.config.load_incluster_config``. Otherwise, we will use the k8s config\n specified in ``kubeconfig_file`` (using ``kubernetes.config.load_kube_config``) or fall\n back to the default kubeconfig. Default: True,\n kubeconfig_file (Optional[str]): The kubeconfig file from which to load config. Defaults to\n using the default kubeconfig. Default: None.\n timeout (Optional[int]): Raise an exception if the op takes longer than this timeout in\n seconds to execute. Default: None.\n container_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's main container\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core).\n Keys can either snake_case or camelCase.Default: None.\n pod_template_spec_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's\n metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. 
Default: None.\n pod_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s pod's pod spec\n (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec).\n Keys can either snake_case or camelCase. Default: None.\n job_metadata (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's metadata\n (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta).\n Keys can either snake_case or camelCase. Default: None.\n job_spec_config (Optional[Dict[str, Any]]): Raw k8s config for the k8s job's job spec\n (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch).\n Keys can either snake_case or camelCase.Default: None.\n k8s_job_name (Optional[str]): Overrides the name of the the k8s job. If not set, will be set\n to a unique name based on the current run ID and the name of the calling op. If set,\n make sure that the passed in name is a valid Kubernetes job name that does not\n already exist in the cluster.\n """\n run_container_context = K8sContainerContext.create_for_run(\n context.dagster_run,\n (\n context.instance.run_launcher\n if isinstance(context.instance.run_launcher, K8sRunLauncher)\n else None\n ),\n include_run_tags=False,\n )\n\n container_config = container_config.copy() if container_config else {}\n if command:\n container_config["command"] = command\n\n op_container_context = K8sContainerContext(\n image_pull_policy=image_pull_policy,\n image_pull_secrets=image_pull_secrets,\n service_account_name=service_account_name,\n env_config_maps=env_config_maps,\n env_secrets=env_secrets,\n env_vars=env_vars,\n volume_mounts=volume_mounts,\n volumes=volumes,\n labels=labels,\n namespace=namespace,\n resources=resources,\n scheduler_name=scheduler_name,\n run_k8s_config={\n "container_config": container_config,\n "pod_template_spec_metadata": pod_template_spec_metadata,\n "pod_spec_config": pod_spec_config,\n "job_metadata": job_metadata,\n 
"job_spec_config": job_spec_config,\n },\n )\n\n container_context = run_container_context.merge(op_container_context)\n\n namespace = container_context.namespace\n\n user_defined_k8s_config = container_context.get_run_user_defined_k8s_config()\n\n k8s_job_config = DagsterK8sJobConfig(\n job_image=image,\n dagster_home=None,\n image_pull_policy=container_context.image_pull_policy,\n image_pull_secrets=container_context.image_pull_secrets,\n service_account_name=container_context.service_account_name,\n instance_config_map=None,\n postgres_password_secret=None,\n env_config_maps=container_context.env_config_maps,\n env_secrets=container_context.env_secrets,\n env_vars=container_context.env_vars,\n volume_mounts=container_context.volume_mounts,\n volumes=container_context.volumes,\n labels=container_context.labels,\n resources=container_context.resources,\n )\n\n job_name = k8s_job_name or get_k8s_job_name(\n context.run_id, context.get_step_execution_context().step.key\n )\n\n retry_number = context.retry_number\n if retry_number > 0:\n job_name = f"{job_name}-{retry_number}"\n\n labels = {\n "dagster/job": context.dagster_run.job_name,\n "dagster/op": context.op.name,\n "dagster/run-id": context.dagster_run.run_id,\n }\n if context.dagster_run.external_job_origin:\n labels["dagster/code-location"] = (\n context.dagster_run.external_job_origin.external_repository_origin.code_location_origin.location_name\n )\n\n job = construct_dagster_k8s_job(\n job_config=k8s_job_config,\n args=args,\n job_name=job_name,\n pod_name=job_name,\n component="k8s_job_op",\n user_defined_k8s_config=user_defined_k8s_config,\n labels=labels,\n )\n\n if load_incluster_config:\n kubernetes.config.load_incluster_config()\n else:\n kubernetes.config.load_kube_config(kubeconfig_file)\n\n # changing this to be able to be passed in will allow for unit testing\n api_client = DagsterKubernetesClient.production_client()\n\n context.log.info(f"Creating Kubernetes job {job_name} in namespace 
{namespace}...")\n\n start_time = time.time()\n\n api_client.batch_api.create_namespaced_job(namespace, job)\n\n context.log.info("Waiting for Kubernetes job to finish...")\n\n timeout = timeout or 0\n\n api_client.wait_for_job(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n restart_policy = user_defined_k8s_config.pod_spec_config.get("restart_policy", "Never")\n\n if restart_policy == "Never":\n container_name = container_config.get("name", "dagster")\n\n pods = api_client.wait_for_job_to_have_pods(\n job_name,\n namespace,\n wait_timeout=timeout,\n start_time=start_time,\n )\n\n pod_names = [p.metadata.name for p in pods]\n\n if not pod_names:\n raise Exception("No pod names in job after it started")\n\n pod_to_watch = pod_names[0]\n watch = kubernetes.watch.Watch() # consider moving in to api_client\n\n api_client.wait_for_pod(\n pod_to_watch, namespace, wait_timeout=timeout, start_time=start_time\n )\n\n log_stream = watch.stream(\n api_client.core_api.read_namespaced_pod_log,\n name=pod_to_watch,\n namespace=namespace,\n container=container_name,\n )\n\n while True:\n if timeout and time.time() - start_time > timeout:\n watch.stop()\n raise Exception("Timed out waiting for pod to finish")\n\n try:\n log_entry = next(log_stream)\n print(log_entry) # noqa: T201\n except StopIteration:\n break\n else:\n context.log.info("Pod logs are disabled, because restart_policy is not Never")\n\n if job_spec_config and job_spec_config.get("parallelism"):\n num_pods_to_wait_for = job_spec_config["parallelism"]\n else:\n num_pods_to_wait_for = DEFAULT_JOB_POD_COUNT\n api_client.wait_for_running_job_to_succeed(\n job_name=job_name,\n namespace=namespace,\n wait_timeout=timeout,\n start_time=start_time,\n num_pods_to_wait_for=num_pods_to_wait_for,\n )
\n\n\n
[docs]@op(ins={"start_after": In(Nothing)}, config_schema=K8S_JOB_OP_CONFIG)\n@experimental\ndef k8s_job_op(context):\n """An op that runs a Kubernetes job using the k8s API.\n\n Contrast with the `k8s_job_executor`, which runs each Dagster op in a Dagster job in its\n own k8s job.\n\n This op may be useful when:\n - You need to orchestrate a command that isn't a Dagster op (or isn't written in Python)\n - You want to run the rest of a Dagster job using a specific executor, and only a single\n op in k8s.\n\n For example:\n\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-k8s/dagster_k8s_tests/unit_tests/test_example_k8s_job_op.py\n :start-after: start_marker\n :end-before: end_marker\n :language: python\n\n You can create your own op with the same implementation by calling the `execute_k8s_job` function\n inside your own op.\n\n The service account that is used to run this job should have the following RBAC permissions:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/kubernetes/k8s_job_op_rbac.yaml\n :language: YAML\n """\n execute_k8s_job(context, **context.op_config)
\n
", "current_page_name": "_modules/dagster_k8s/ops/k8s_job_op", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_k8s.ops.k8s_job_op"}}}, "dagster_mlflow": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.hooks

\nfrom dagster._core.definitions.decorators.hook_decorator import event_list_hook\nfrom dagster._core.definitions.events import HookExecutionResult\nfrom mlflow.entities.run_status import RunStatus\n\n\ndef _create_mlflow_run_hook(name):\n    @event_list_hook(name=name, required_resource_keys={"mlflow"})\n    def _hook(context, event_list):\n        for event in event_list:\n            if event.is_step_success:\n                _cleanup_on_success(context)\n            elif event.is_step_failure:\n                mlf = context.resources.mlflow\n                mlf.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n        return HookExecutionResult(hook_name=name, is_skipped=False)\n\n    return _hook\n\n\ndef _cleanup_on_success(context):\n    """Checks if the current solid in the context is the last solid in the job\n    and ends the mlflow run with a successful status when this is the case.\n    """\n    last_solid_name = context._step_execution_context.job_def.nodes_in_topological_order[  # noqa: SLF001  # fmt: skip\n        -1\n    ].name\n\n    if context.op.name == last_solid_name:\n        context.resources.mlflow.end_run()\n\n\nend_mlflow_on_run_finished = _create_mlflow_run_hook("end_mlflow_on_run_finished")\n
", "current_page_name": "_modules/dagster_mlflow/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mlflow.resources

\n"""This module contains the mlflow resource provided by the MlFlow\nclass. This resource provides an easy way to configure mlflow for logging various\nthings from dagster runs.\n"""\nimport atexit\nimport sys\nfrom itertools import islice\nfrom os import environ\nfrom typing import Any, Optional\n\nimport mlflow\nfrom dagster import Field, Noneable, Permissive, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom mlflow.entities.run_status import RunStatus\n\nCONFIG_SCHEMA = {\n    "experiment_name": Field(StringSource, is_required=True, description="MlFlow experiment name."),\n    "mlflow_tracking_uri": Field(\n        Noneable(StringSource),\n        default_value=None,\n        is_required=False,\n        description="MlFlow tracking server uri.",\n    ),\n    "parent_run_id": Field(\n        Noneable(str),\n        default_value=None,\n        is_required=False,\n        description="Mlflow run ID of parent run if this is a nested run.",\n    ),\n    "env": Field(Permissive(), description="Environment variables for mlflow setup."),\n    "env_to_tag": Field(\n        Noneable(list),\n        default_value=None,\n        is_required=False,\n        description="List of environment variables to log as tags in mlflow.",\n    ),\n    "extra_tags": Field(Permissive(), description="Any extra key-value tags to log to mlflow."),\n}\n\n\nclass MlflowMeta(type):\n    """Mlflow Metaclass to create methods that "inherit" all of Mlflow's\n    methods. 
If the class has a method defined it is excluded from the\n    attribute setting from mlflow.\n    """\n\n    def __new__(cls, name, bases, attrs):\n        class_cls = super(MlflowMeta, cls).__new__(cls, name, bases, attrs)\n        for attr in (attr for attr in dir(mlflow) if attr not in dir(class_cls)):\n            mlflow_attribute = getattr(mlflow, attr)\n            if callable(mlflow_attribute):\n                setattr(class_cls, attr, staticmethod(mlflow_attribute))\n            else:\n                setattr(class_cls, attr, mlflow_attribute)\n        return class_cls\n\n\nclass MlFlow(metaclass=MlflowMeta):\n    """Class for setting up an mlflow resource for dagster runs.\n    This takes care of all the configuration required to use mlflow tracking and the complexities of\n    mlflow tracking dagster parallel runs.\n    """\n\n    def __init__(self, context):\n        # Context associated attributes\n        self.log = context.log\n        self.run_name = context.dagster_run.job_name\n        self.dagster_run_id = context.run_id\n\n        # resource config attributes\n        resource_config = context.resource_config\n        self.tracking_uri = resource_config.get("mlflow_tracking_uri")\n        if self.tracking_uri:\n            mlflow.set_tracking_uri(self.tracking_uri)\n        self.parent_run_id = resource_config.get("parent_run_id")\n        self.experiment_name = resource_config["experiment_name"]\n        self.env_tags_to_log = resource_config.get("env_to_tag") or []\n        self.extra_tags = resource_config.get("extra_tags")\n\n        # Update env variables if any are given\n        self.env_vars = resource_config.get("env", {})\n        if self.env_vars:\n            environ.update(self.env_vars)\n\n        # If the experiment exists then the set won't do anything\n        mlflow.set_experiment(self.experiment_name)\n        self.experiment = mlflow.get_experiment_by_name(self.experiment_name)\n\n        # Get the client object\n        
self.tracking_client = mlflow.tracking.MlflowClient()\n\n        # Set up the active run and tags\n        self._setup()\n\n    def _setup(self):\n        """Sets the active run and tags. If an Mlflow run_id exists then the\n        active run is set to it. This way a single Dagster run outputs data\n        to the same Mlflow run, even when multiprocess executors are used.\n        """\n        # Get the run id\n        run_id = self._get_current_run_id()\n        self._set_active_run(run_id=run_id)\n        self._set_all_tags()\n\n        # hack needed to stop mlflow from marking run as finished when\n        # a process exits in parallel runs\n        atexit.unregister(mlflow.end_run)\n\n    def _get_current_run_id(\n        self, experiment: Optional[Any] = None, dagster_run_id: Optional[str] = None\n    ):\n        """Gets the run id of a specific dagster run and experiment id.\n        If it doesn't exist then it returns a None.\n\n        Args:\n            experiment (optional): Mlflow experiment.\n            When none is passed it fetches the experiment object set in\n            the constructor.  Defaults to None.\n            dagster_run_id (optional): The Dagster run id.\n            When none is passed it fetches the dagster_run_id object set in\n            the constructor.  
Defaults to None.\n\n        Returns:\n            run_id (str or None): run_id if it is found else None\n        """\n        experiment = experiment or self.experiment\n        dagster_run_id = dagster_run_id or self.dagster_run_id\n        if experiment:\n            # Check if a run with this dagster run id has already been started\n            # in mlflow, will get an empty dataframe if not\n            current_run_df = mlflow.search_runs(\n                experiment_ids=[experiment.experiment_id],\n                filter_string=f"tags.dagster_run_id='{dagster_run_id}'",\n            )\n            if not current_run_df.empty:\n                return current_run_df.run_id.values[0]\n\n    def _set_active_run(self, run_id=None):\n        """This method sets the active run to be that of the specified\n        run_id. If None is passed then a new run is started. The new run also\n        takes care of nested runs.\n\n        Args:\n            run_id (str, optional): Mlflow run_id. Defaults to None.\n        """\n        nested_run = False\n        if self.parent_run_id is not None:\n            self._start_run(run_id=self.parent_run_id, run_name=self.run_name)\n            nested_run = True\n        self._start_run(run_id=run_id, run_name=self.run_name, nested=nested_run)\n\n    def _start_run(self, **kwargs):\n        """Catches the Mlflow exception if a run is already active."""\n        try:\n            run = mlflow.start_run(**kwargs)\n            self.log.info(\n                f"Starting a new mlflow run with id {run.info.run_id} "\n                f"in experiment {self.experiment_name}"\n            )\n        except Exception as ex:\n            run = mlflow.active_run()\n            if "is already active" not in str(ex):\n                raise (ex)\n            self.log.info(f"Run with id {run.info.run_id} is already active.")\n\n    def _set_all_tags(self):\n        """Method collects dagster_run_id plus all env variables/tags that have been\n         
   specified by the user in the config_schema and logs them as tags in mlflow.\n\n        Returns:\n            tags [dict]: Dictionary of all the tags\n        """\n        tags = {tag: environ.get(tag) for tag in self.env_tags_to_log}\n        tags["dagster_run_id"] = self.dagster_run_id\n        if self.extra_tags:\n            tags.update(self.extra_tags)\n\n        mlflow.set_tags(tags)\n\n    def cleanup_on_error(self):\n        """Method ends mlflow run with correct exit status for failed runs. Note that\n        this method does not work when a job running in the webserver fails, it seems\n        that in this case a different process runs the job and when it fails\n        the stack trace is therefore not available. For this case we can use the\n        cleanup_on_failure hook defined below.\n        """\n        any_error = sys.exc_info()\n\n        if any_error[1]:\n            if isinstance(any_error[1], KeyboardInterrupt):\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))\n            else:\n                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))\n\n    @staticmethod\n    def log_params(params: dict):\n        """Overload of the mlflow.log_params. If len(params) >100 then\n        params is sent to mlflow in chunks.\n\n        Args:\n            params (dict): Parameters to be logged\n        """\n        for param_chunk in MlFlow.chunks(params, 100):\n            mlflow.log_params(param_chunk)\n\n    @staticmethod\n    def chunks(params: dict, size: int = 100):\n        """Method that chunks a dictionary into batches of size.\n\n        Args:\n            params (dict): Dictionary set to be batched\n            size (int, optional): Number of batches. Defaults to 100.\n\n        Yields:\n            (dict): Batch of dictionary\n        """\n        it = iter(params)\n        for _ in range(0, len(params), size):\n            yield {k: params[k] for k in islice(it, size)}\n\n\n
[docs]@dagster_maintained_resource\n@resource(config_schema=CONFIG_SCHEMA)\ndef mlflow_tracking(context):\n """This resource initializes an MLflow run that's used for all steps within a Dagster run.\n\n This resource provides access to all of mlflow's methods as well as the mlflow tracking client's\n methods.\n\n Usage:\n\n 1. Add the mlflow resource to any ops in which you want to invoke mlflow tracking APIs.\n 2. Add the `end_mlflow_on_run_finished` hook to your job to end the MLflow run\n when the Dagster run is finished.\n\n Examples:\n .. code-block:: python\n\n from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n @op(required_resource_keys={"mlflow"})\n def mlflow_op(context):\n mlflow.log_params(some_params)\n mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n @end_mlflow_on_run_finished\n @job(resource_defs={"mlflow": mlflow_tracking})\n def mlf_example():\n mlflow_op()\n\n # example using an mlflow instance with s3 storage\n mlf_example.execute_in_process(run_config={\n "resources": {\n "mlflow": {\n "config": {\n "experiment_name": my_experiment,\n "mlflow_tracking_uri": "http://localhost:5000",\n\n # if want to run a nested run, provide parent_run_id\n "parent_run_id": an_existing_mlflow_run_id,\n\n # env variables to pass to mlflow\n "env": {\n "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n "AWS_ACCESS_KEY_ID": my_aws_key_id,\n "AWS_SECRET_ACCESS_KEY": my_secret,\n },\n\n # env variables you want to log as mlflow tags\n "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n # key-value tags to add to your experiment\n "extra_tags": {"super": "experiment"},\n }\n }\n }\n })\n """\n mlf = MlFlow(context)\n yield mlf\n mlf.cleanup_on_error()
\n
", "current_page_name": "_modules/dagster_mlflow/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mlflow.resources"}}, "dagster_msteams": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_failure(\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_failure(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in Dagster UI</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef teams_on_success(\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given MS Teams webhook URL.\n\n Args:\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the\n HookContext outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this\n to allow messages to include deeplinks to the specific run that triggered\n the hook.\n\n Examples:\n .. code-block:: python\n\n @teams_on_success(webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op.name} failed!"\n\n @op\n def a_op(context):\n pass\n\n @job(...)\n def my_job():\n a_op.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"msteams"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"<a href='{webserver_base_url}/runs/{context.run_id}'>View in webserver</a>"\n card = Card()\n card.add_attachment(text_message=text)\n context.resources.msteams.post_message(payload=card.payload)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_msteams/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\n\nfrom dagster_msteams.client import TeamsClient\n\n\n
[docs]class MSTeamsResource(ConfigurableResource):\n """This resource is for connecting to Microsoft Teams.\n\n Provides a `dagster_msteams.TeamsClient` which can be used to\n interface with the MS Teams API.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster op,\n asset, schedule, or sensor:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job, Definitions, EnvVar\n from dagster_msteams import Card, MSTeamsResource\n\n\n @op\n def teams_op(msteams: MSTeamsResource):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n msteams.get_client().post_message(payload=card.payload)\n\n\n @job\n def teams_job():\n teams_op()\n\n defs = Definitions(\n jobs=[teams_job],\n resources={\n "msteams": MSTeamsResource(\n hook_url=EnvVar("TEAMS_WEBHOOK_URL")\n )\n }\n )\n """\n\n hook_url: str = Field(\n default=None,\n description=(\n "To send messages to MS Teams channel, an incoming webhook has to be created. The"\n " incoming webhook url must be given as a part of the resource config to the"\n " MSTeamsResource in Dagster. For more information on how to create an incoming"\n " webhook, see"\n " https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook"\n ),\n )\n http_proxy: str = Field(default=None, description="HTTP proxy URL")\n https_proxy: str = Field(default=None, description="HTTPS proxy URL")\n timeout: float = Field(default=60, description="Timeout for requests to MS Teams")\n verify: bool = Field(\n default=True, description="Whether to verify SSL certificates, defaults to True"\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> TeamsClient:\n return TeamsClient(\n hook_url=self.hook_url,\n http_proxy=self.http_proxy,\n https_proxy=self.https_proxy,\n timeout=self.timeout,\n verify=self.verify,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=MSTeamsResource.to_config_schema(),\n description="This resource is for connecting to MS Teams",\n)\ndef msteams_resource(context) -> TeamsClient:\n """This resource is for connecting to Microsoft Teams.\n\n The resource object is a `dagster_msteams.TeamsClient`.\n\n By configuring this resource, you can post messages to MS Teams from any Dagster solid:\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import op, job\n from dagster_msteams import Card, msteams_resource\n\n\n @op(required_resource_keys={"msteams"})\n def teams_op(context):\n card = Card()\n card.add_attachment(text_message="Hello There !!")\n context.resources.msteams.post_message(payload=card.payload)\n\n\n @job(resource_defs={"msteams": msteams_resource})\n def teams_job():\n teams_op()\n\n\n teams_job.execute_in_process(\n {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}}\n )\n """\n return MSTeamsResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_msteams/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_msteams.sensors

\nfrom typing import TYPE_CHECKING, Callable, Optional, Sequence, Union\n\nfrom dagster import DefaultSensorStatus\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\n\nfrom dagster_msteams.card import Card\nfrom dagster_msteams.client import TeamsClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import JobSelector, RepositorySelector\n\n\ndef _default_failure_message(context: RunFailureSensorContext) -> str:\n    return "\\n".join(\n        [\n            f"Job {context.dagster_run.job_name} failed!",\n            f"Run ID: {context.dagster_run.run_id}",\n            f"Error: {context.failure_event.message}",\n        ]\n    )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef make_teams_on_run_failure_sensor(\n hook_url: str,\n message_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message,\n http_proxy: Optional[str] = None,\n https_proxy: Optional[str] = None,\n timeout: Optional[float] = 60,\n verify: Optional[bool] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on run failures that will message the given MS Teams webhook URL.\n\n Args:\n hook_url (str): MS Teams incoming webhook URL.\n message_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n http_proxy : (Optional[str]): Proxy for requests using http protocol.\n https_proxy : (Optional[str]): Proxy for requests using https protocol.\n timeout: (Optional[float]): Connection timeout in seconds. Defaults to 60.\n verify: (Optional[bool]): Whether to verify the servers TLS certificate.\n name: (Optional[str]): The name of the sensor. Defaults to "teams_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector]]]):\n Jobs in the current repository that will be monitored by this sensor. Defaults to None,\n which means the alert will be sent when any job in the repository matches the requested\n run_status. To monitor jobs in external repositories, use RepositorySelector and JobSelector.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the failed run.\n\n Examples:\n .. code-block:: python\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n )\n\n @repository\n def my_repo():\n return [my_job + teams_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return "Job {job_name} failed! 
Error: {error}".format(\n job_name=context.dagster_run.job_name,\n error=context.failure_event.message,\n )\n\n teams_on_run_failure = make_teams_on_run_failure_sensor(\n hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n message_fn=my_message_fn,\n webserver_base_url="http://localhost:3000",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n teams_client = TeamsClient(\n hook_url=hook_url,\n http_proxy=http_proxy,\n https_proxy=https_proxy,\n timeout=timeout,\n verify=verify,\n )\n\n @run_failure_sensor(\n name=name,\n default_status=default_status,\n monitored_jobs=monitored_jobs,\n monitor_all_repositories=monitor_all_repositories,\n )\n def teams_on_run_failure(context: RunFailureSensorContext):\n text = message_fn(context)\n if webserver_base_url:\n text += "<a href='{base_url}/runs/{run_id}'>View in Dagit</a>".format(\n base_url=webserver_base_url,\n run_id=context.dagster_run.run_id,\n )\n card = Card()\n card.add_attachment(text_message=text)\n teams_client.post_message(payload=card.payload)\n\n return teams_on_run_failure
\n
", "current_page_name": "_modules/dagster_msteams/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_msteams.sensors"}}, "dagster_mysql": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.event_log.event_log

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.exc as db_exc\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlPollingEventWatcher,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """MySQL-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_event_log\n :end-before: end_marker_event_log\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = check.str_param(mysql_url, "mysql_url")\n self._disposed = False\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n self._secondary_index_cache = {}\n\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "event_logs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n # mark all secondary indexes to be used\n self.reindex_events()\n self.reindex_assets()\n\n self._mysql_version = self.get_server_version()\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n 
SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLEventLogStorage":\n return MySQLEventLogStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(conn_string: str) -> "MySQLEventLogStorage":\n MySQLEventLogStorage.wipe_storage(conn_string)\n return MySQLEventLogStorage(conn_string)\n\n def get_server_version(self) -> Optional[str]:\n with self.index_connection() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n # last_materialization_timestamp is updated upon observation, materialization, materialization_planned\n # See SqlEventLogStorage.store_asset_event method for more details\n\n values = self._get_asset_entry_values(\n 
event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n if values:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable)\n .values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n **values,\n )\n .on_duplicate_key_update(\n **values,\n )\n )\n else:\n try:\n conn.execute(\n db_dialects.mysql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(), # type: ignore # (possible none)\n )\n )\n except db_exc.IntegrityError:\n pass\n\n def _connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "event log")\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n with self._connect() as conn:\n return table_name in db.inspect(conn).get_table_names()\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n MySQLEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n super(MySQLEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(self, run_id: str, cursor: Optional[str], callback: EventHandlerFn) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( 
# type: ignore # (possible none)\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n @property\n def event_watcher(self) -> SqlPollingEventWatcher:\n return self._event_watcher\n\n def __del__(self) -> None:\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.run_storage.run_storage

\nfrom typing import ContextManager, Mapping, Optional, cast\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BUCKET_VERSION = "8.0.0"\nMINIMUM_MYSQL_INTERSECT_VERSION = "8.0.31"\n\n\n
[docs]class MySQLRunStorage(SqlRunStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_runs\n :end-before: end_marker_runs\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if "runs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n RunStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, 
statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLRunStorage":\n return MySQLRunStorage(inst_data=inst_data, mysql_url=mysql_url_from_config(config_value))\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLRunStorage":\n MySQLRunStorage.wipe_storage(mysql_url)\n return MySQLRunStorage(mysql_url)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "run")\n\n def upgrade(self) -> None:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def has_built_index(self, migration_name: str) -> None:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n MySQLRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n 
super(MySQLRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n @property\n def supports_intersect(self) -> bool:\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version( # type: ignore\n MINIMUM_MYSQL_INTERSECT_VERSION\n )\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n conn.execute(\n db_dialects.mysql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_duplicate_key_update(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n db_values = [{"key": k, "value": v} for k, v in pairs.items()]\n\n with self.connect() as conn:\n insert_stmt = db_dialects.mysql.insert(KeyValueStoreTable).values(db_values)\n conn.execute(\n insert_stmt.on_duplicate_key_update(\n value=insert_stmt.inserted.value,\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_mysql.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional, cast\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import MySqlStorageConfig, mysql_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_mysql_connection,\n    mysql_alembic_config,\n    mysql_isolation_level,\n    mysql_url_from_config,\n    parse_mysql_version,\n    retry_mysql_connection_fn,\n    retry_mysql_creation_fn,\n)\n\nMINIMUM_MYSQL_BATCH_VERSION = "8.0.0"\n\n\n
[docs]class MySQLScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """MySQL-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-mysql-legacy.yaml\n :caption: dagster.yaml\n :start-after: start_marker_schedules\n :end-before: end_marker_schedules\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(self, mysql_url: str, inst_data: Optional[ConfigurableClassData] = None):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.mysql_url = mysql_url\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n poolclass=db_pool.NullPool,\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n table_names = retry_mysql_connection_fn(db.inspect(self._engine).get_table_names)\n if "jobs" not in table_names:\n retry_mysql_creation_fn(self._init_db)\n\n self._mysql_version = self.get_server_version()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(mysql_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When 
running in dagster-webserver, hold an open connection\n # https://github.com/dagster-io/dagster/issues/3719\n self._engine = create_engine(\n self.mysql_url,\n isolation_level=mysql_isolation_level(),\n pool_size=1,\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return mysql_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: MySqlStorageConfig\n ) -> "MySQLScheduleStorage":\n return MySQLScheduleStorage(\n inst_data=inst_data, mysql_url=mysql_url_from_config(config_value)\n )\n\n @staticmethod\n def wipe_storage(mysql_url: str) -> None:\n engine = create_engine(\n mysql_url, isolation_level=mysql_isolation_level(), poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n @staticmethod\n def create_clean_storage(mysql_url: str) -> "MySQLScheduleStorage":\n MySQLScheduleStorage.wipe_storage(mysql_url)\n return MySQLScheduleStorage(mysql_url)\n\n def connect(self) -> ContextManager[Connection]:\n return create_mysql_connection(self._engine, __file__, "schedule")\n\n @property\n def supports_batch_queries(self) -> bool:\n if not self._mysql_version:\n return False\n\n return parse_mysql_version(self._mysql_version) >= parse_mysql_version(\n MINIMUM_MYSQL_BATCH_VERSION\n )\n\n def get_server_version(self) -> Optional[str]:\n with self.connect() as conn:\n row = conn.execute(db.text("select version()")).fetchone()\n\n if not row:\n return None\n\n return cast(str, row[0])\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n alembic_config = mysql_alembic_config(__file__)\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.mysql.insert(InstigatorsTable)\n 
.values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_duplicate_key_update(\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n update_timestamp=pendulum.now("UTC"),\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = mysql_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_mysql/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_mysql.schedule_storage.schedule_storage"}}}, "dagster_pagerduty": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pagerduty.resources

\nfrom typing import Dict, Optional, cast\n\nimport pypd\nfrom dagster import ConfigurableResource, resource\nfrom dagster._config.pythonic_config import infer_schema_from_config_class\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils.warnings import suppress_dagster_warnings\nfrom pydantic import Field as PyField\n\n\n
[docs]class PagerDutyService(ConfigurableResource):\n """This resource is for posting events to PagerDuty."""\n\n """Integrates with PagerDuty via the pypd library.\n\n See:\n https://v2.developer.pagerduty.com/docs/events-api-v2\n https://v2.developer.pagerduty.com/docs/send-an-event-events-api-v2\n https://support.pagerduty.com/docs/services-and-integrations#section-events-api-v2\n https://github.com/PagerDuty/pagerduty-api-python-client\n\n for documentation and more information.\n """\n\n routing_key: str = PyField(\n ...,\n description=(\n "The routing key provisions access to your PagerDuty service. You"\n "will need to include the integration key for your new integration, as a"\n "routing_key in the event payload."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def EventV2_create(\n self,\n summary: str,\n source: str,\n severity: str,\n event_action: str = "trigger",\n dedup_key: Optional[str] = None,\n timestamp: Optional[str] = None,\n component: Optional[str] = None,\n group: Optional[str] = None,\n event_class: Optional[str] = None,\n custom_details: Optional[object] = None,\n ) -> object:\n """Events API v2 enables you to add PagerDuty's advanced event and incident management\n functionality to any system that can make an outbound HTTP connection.\n\n Args:\n summary (str):\n A high-level, text summary message of the event. Will be used to construct an\n alert's description. Example:\n\n "PING OK - Packet loss = 0%, RTA = 1.41 ms" "Host\n 'acme-andromeda-sv1-c40 :: 179.21.24.50' is DOWN"\n\n source (str):\n Specific human-readable unique identifier, such as a hostname, for the system having\n the problem. Examples:\n\n "prod05.theseus.acme-widgets.com"\n "171.26.23.22"\n "aws:elasticache:us-east-1:852511987:cluster/api-stats-prod-003"\n "9c09acd49a25"\n\n severity (str):\n How impacted the affected system is. Displayed to users in lists and influences the\n priority of any created incidents. 
Must be one of {info, warning, error, critical}\n\n Keyword Args:\n event_action (str):\n There are three types of events that PagerDuty recognizes, and are used to represent\n different types of activity in your monitored systems. (default: 'trigger')\n\n * trigger: When PagerDuty receives a trigger event, it will either open a new alert,\n or add a new trigger log entry to an existing alert, depending on the\n provided dedup_key. Your monitoring tools should send PagerDuty a trigger\n when a new problem has been detected. You may send additional triggers\n when a previously detected problem has occurred again.\n\n * acknowledge: acknowledge events cause the referenced incident to enter the\n acknowledged state. While an incident is acknowledged, it won't\n generate any additional notifications, even if it receives new\n trigger events. Your monitoring tools should send PagerDuty an\n acknowledge event when they know someone is presently working on the\n problem.\n\n * resolve: resolve events cause the referenced incident to enter the resolved state.\n Once an incident is resolved, it won't generate any additional\n notifications. New trigger events with the same dedup_key as a resolved\n incident won't re-open the incident. Instead, a new incident will be\n created. Your monitoring tools should send PagerDuty a resolve event when\n the problem that caused the initial trigger event has been fixed.\n\n dedup_key (str):\n Deduplication key for correlating triggers and resolves. The maximum permitted\n length of this property is 255 characters.\n\n timestamp (str):\n Timestamp (ISO 8601). When the upstream system detected / created the event. This is\n useful if a system batches or holds events before sending them to PagerDuty. This\n will be auto-generated by PagerDuty if not provided. Example:\n\n 2015-07-17T08:42:58.315+0000\n\n component (str):\n The part or component of the affected system that is broken. 
Examples:\n\n "keepalive"\n "webping"\n "mysql"\n "wqueue"\n\n group (str):\n A cluster or grouping of sources. For example, sources "prod-datapipe-02" and\n "prod-datapipe-03" might both be part of "prod-datapipe". Examples:\n\n "prod-datapipe"\n "www"\n "web_stack"\n\n event_class (str):\n The class/type of the event. Examples:\n\n "High CPU"\n "Latency"\n "500 Error"\n\n custom_details (Dict[str, str]):\n Additional details about the event and affected system. Example:\n\n {"ping time": "1500ms", "load avg": 0.75 }\n """\n data = {\n "routing_key": self.routing_key,\n "event_action": event_action,\n "payload": {"summary": summary, "source": source, "severity": severity},\n }\n\n if dedup_key is not None:\n data["dedup_key"] = dedup_key\n\n payload: Dict[str, object] = cast(Dict[str, object], data["payload"])\n\n if timestamp is not None:\n payload["timestamp"] = timestamp\n\n if component is not None:\n payload["component"] = component\n\n if group is not None:\n payload["group"] = group\n\n if event_class is not None:\n payload["class"] = event_class\n\n if custom_details is not None:\n payload["custom_details"] = custom_details\n\n return pypd.EventV2.create(data=data)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=infer_schema_from_config_class(PagerDutyService),\n description="""This resource is for posting events to PagerDuty.""",\n)\n@suppress_dagster_warnings\ndef pagerduty_resource(context) -> PagerDutyService:\n """A resource for posting events (alerts) to PagerDuty.\n\n Example:\n .. code-block:: python\n\n @op\n def pagerduty_op(pagerduty: PagerDutyService):\n pagerduty.EventV2_create(\n summary='alert from dagster'\n source='localhost',\n severity='error',\n event_action='trigger',\n )\n\n @job(resource_defs={ 'pagerduty': pagerduty_resource })\n def pagerduty_test():\n pagerduty_op()\n\n pagerduty_test.execute_in_process(\n run_config={\n "resources": {\n 'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n }\n }\n )\n """\n return PagerDutyService(**context.resource_config)
\n
", "current_page_name": "_modules/dagster_pagerduty/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pagerduty.resources"}}, "dagster_pandas": {"constraints": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.constraints

\nimport sys\nfrom collections import defaultdict\nfrom datetime import datetime\nfrom functools import wraps\n\nimport pandas as pd\nfrom dagster import (\n    DagsterType,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._annotations import experimental\nfrom pandas import DataFrame\nfrom typing_extensions import Final\n\nCONSTRAINT_METADATA_KEY: Final = "constraint_metadata"\n\n\nclass ConstraintViolationException(Exception):\n    """Indicates that a constraint has been violated."""\n\n\nclass ConstraintWithMetadataException(Exception):\n    """This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a\n    failed typecheck or an exception.\n\n    Args:\n        constraint_name (str):  the name of the violated constraint\n        constraint_description (Optional[str]): the description of the violated constraint\n        expectation (Optional[Union[dict,list, str, set]]): what result was expected -- typically a jsonlike, though it can be a string\n        offending (Optional[Union[dict,list, str, set]]):  which pieces of the dataframe violated the expectation, typically list or string\n        actual (Optional[Union[dict,list, str, set]]): what those pieces of the dataframe actually were -- typically a jsonlike\n    """\n\n    def __init__(\n        self,\n        constraint_name,\n        constraint_description="",\n        expectation=None,\n        offending=None,\n        actual=None,\n    ):\n        self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))\n        self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))\n        self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))\n        super(ConstraintWithMetadataException, self).__init__(\n            "Violated {} - {}, {} was/were expected, but 
we received {} which was/were {}".format(\n                constraint_name,\n                constraint_description,\n                expectation,\n                offending,\n                actual,\n            )\n        )\n\n    def normalize_metadata_json_value(self, val):\n        if isinstance(val, set):\n            return list(val)\n        else:\n            return val\n\n    def convert_to_metadata(self):\n        return {\n            CONSTRAINT_METADATA_KEY: {\n                "constraint_name": self.constraint_name,\n                "constraint_description": self.constraint_description,\n                "expected": self.normalize_metadata_json_value(self.expectation),\n                "offending": self.normalize_metadata_json_value(self.offending),\n                "actual": self.normalize_metadata_json_value(self.actual),\n            },\n        }\n\n    def return_as_typecheck(self):\n        return TypeCheck(\n            success=False, description=self.args[0], metadata=self.convert_to_metadata()\n        )\n\n\nclass DataFrameConstraintViolationException(ConstraintViolationException):\n    """Indicates a dataframe level constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description):\n        super(DataFrameConstraintViolationException, self).__init__(\n            f"Violated {constraint_name} - {constraint_description}"\n        )\n\n\nclass DataFrameWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, actual):\n        super(DataFrameWithMetadataException, self).__init__(\n            constraint_name, constraint_description, expectation, "a malformed dataframe", actual\n        )\n\n\nclass ColumnConstraintViolationException(ConstraintViolationException):\n    """Indicates that a column constraint has been violated."""\n\n    def __init__(self, constraint_name, constraint_description, column_name, offending_rows=None):\n        
self.constraint_name = constraint_name\n        self.constraint_description = constraint_description\n        self.column_name = column_name\n        self.offending_rows = offending_rows\n        super(ColumnConstraintViolationException, self).__init__(self.construct_message())\n\n    def construct_message(self):\n        base_message = (\n            'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'\n            .format(\n                constraint_name=self.constraint_name,\n                constraint_description=self.constraint_description,\n                column_name=self.column_name,\n            )\n        )\n        if self.offending_rows is not None:\n            base_message += "The offending (index, row values) are the following: {}".format(\n                self.offending_rows\n            )\n        return base_message\n\n\nclass ColumnWithMetadataException(ConstraintWithMetadataException):\n    def __init__(self, constraint_name, constraint_description, expectation, offending, actual):\n        super(ColumnWithMetadataException, self).__init__(\n            "the column constraint " + constraint_name,\n            constraint_description,\n            expectation,\n            offending,\n            actual,\n        )\n\n\nclass Constraint:\n    """Base constraint object that all constraints inherit from.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        self.name = self.__class__.__name__\n        self.markdown_description = check.str_param(markdown_description, "markdown_description")\n        self.error_description = check.str_param(error_description, 
"error_description")\n\n\n@experimental\nclass ConstraintWithMetadata:\n    """This class defines a base constraint over pandas DFs with organized metadata.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n                    the validation function to run over inputted data\n                    This function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    # TODO:  validation_fn returning metadata is sorta broken.  
maybe have it yield typecheck events and grab metadata?\n\n    def __init__(\n        self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None\n    ):\n        if name is None:\n            self.name = self.__class__.__name__\n        else:\n            self.name = name\n        self.description = description\n        # should return a tuple of (bool, and either an empty dict or a dict of extra params)\n        self.validation_fn = validation_fn\n        self.resulting_exception = resulting_exception\n        self.raise_or_typecheck = raise_or_typecheck\n\n    def validate(self, data, *args, **kwargs):\n        res = self.validation_fn(data, *args, **kwargs)\n        if not res[0]:\n            exc = self.resulting_exception(\n                constraint_name=self.name, constraint_description=self.description, **res[1]\n            )\n\n            if self.raise_or_typecheck:\n                raise exc\n            else:\n                return exc.return_as_typecheck()\n\n        else:\n            if res[0]:\n                return TypeCheck(success=True)\n\n    # TODO:  composition of validations\n    def as_dagster_type(self, *args, **kwargs):\n        if self.raise_or_typecheck:\n            raise Exception(\n                "Dagster types can only be constructed from constraints that return typechecks"\n            )\n        return DagsterType(\n            name=self.name,\n            description=f"A Pandas DataFrame with the following validation: {self.description}",\n            type_check_fn=lambda x: self.validate(x, *args),\n            **kwargs,\n        )\n\n\nclass MultiConstraintWithMetadata(ConstraintWithMetadata):\n    """Use this class if you have multiple constraints to check over the entire dataframe.\n\n    Args:\n        description (str): description of the constraint\n        validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n                    a list of the 
validation functions to run over inputted data\n                    Each function should return a tuple of a boolean for success or failure, and a dict containing\n                    metadata about the test -- this metadata will be passed to the resulting exception if validation\n                    fails.\n        resulting_exception (ConstraintWithMetadataException):  what response a failed typecheck should induce\n        raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n                    (if set to False) when validation fails\n        name (Optional[str]): what to call the constraint, defaults to the class name.\n    """\n\n    def __init__(\n        self,\n        description,\n        validation_fn_arr,\n        resulting_exception,\n        raise_or_typecheck=True,\n        name=None,\n    ):\n        validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")\n\n        def validation_fn(data, *args, **kwargs):\n            results = [f(data, *args, **kwargs) for f in validation_fn_arr]\n            truthparam = all(item[0] for item in results)\n            metadict = defaultdict(dict)\n            for i, dicta in enumerate(item[1] for item in results):\n                if len(dicta.keys()) > 0:\n                    for key in dicta:\n                        metadict[key][validation_fn_arr[i].__name__] = dicta[key]\n            return (truthparam, metadict)\n\n        super(MultiConstraintWithMetadata, self).__init__(\n            description,\n            validation_fn,\n            resulting_exception,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass StrictColumnsWithMetadata(ConstraintWithMetadata):\n    def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):\n        self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n        self.column_list = 
check.list_param(column_list, "strict_column_list", of_type=str)\n\n        def validation_fcn(inframe):\n            if list(inframe.columns) == column_list:\n                return (True, {})\n            else:\n                if self.enforce_ordering:\n                    resdict = {"expectation": self.column_list, "actual": list(inframe.columns)}\n                    return (False, resdict)\n                else:\n                    if set(inframe.columns) == set(column_list):\n                        return (True, {})\n                    else:\n                        extra = [x for x in inframe.columns if x not in set(column_list)]\n                        missing = [x for x in set(column_list) if x not in inframe.columns]\n                        resdict = {\n                            "expectation": self.column_list,\n                            "actual": {"extra_columns": extra, "missing_columns": missing},\n                        }\n                        return (False, resdict)\n\n        basestr = f"ensuring that the right columns, {self.column_list} were present"\n        if enforce_ordering:\n            basestr += " in the right order"\n        super(StrictColumnsWithMetadata, self).__init__(\n            basestr,\n            validation_fcn,\n            DataFrameWithMetadataException,\n            raise_or_typecheck=raise_or_typecheck,\n            name=name,\n        )\n\n\nclass DataFrameConstraint(Constraint):\n    """Base constraint object that represent Dataframe shape constraints.\n\n    Args:\n        error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n        markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n    """\n\n    def __init__(self, error_description=None, markdown_description=None):\n        super(DataFrameConstraint, self).__init__(\n            
error_description=error_description, markdown_description=markdown_description\n        )\n\n    def validate(self, dataframe):\n        raise NotImplementedError()\n\n\n
[docs]class StrictColumnsConstraint(DataFrameConstraint):\n """A dataframe constraint that validates column existence and ordering.\n\n Args:\n strict_column_list (List[str]): The exact list of columns that your dataframe must have.\n enforce_ordering (Optional[bool]): If true, will enforce that the ordering of column names must match.\n Default is False.\n """\n\n def __init__(self, strict_column_list, enforce_ordering=False):\n self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")\n self.strict_column_list = check.list_param(\n strict_column_list, "strict_column_list", of_type=str\n )\n description = f"No columns outside of {self.strict_column_list} allowed. "\n if enforce_ordering:\n description += "Columns must be in that order."\n super(StrictColumnsConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n columns_received = list(dataframe.columns)\n if self.enforce_ordering:\n if self.strict_column_list != columns_received:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected the following ordering of columns {expected}. Received:"\n " {received}".format(\n expected=self.strict_column_list, received=columns_received\n )\n ),\n )\n for column in columns_received:\n if column not in self.strict_column_list:\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description="Expected {}. Recevied {}.".format(\n self.strict_column_list, columns_received\n ),\n )
\n\n\n
[docs]class RowCountConstraint(DataFrameConstraint):\n """A dataframe constraint that validates the expected count of rows.\n\n Args:\n num_allowed_rows (int): The number of allowed rows in your dataframe.\n error_tolerance (Optional[int]): The acceptable threshold if you are not completely certain. Defaults to 0.\n """\n\n def __init__(self, num_allowed_rows, error_tolerance=0):\n self.num_allowed_rows = check.int_param(num_allowed_rows, "num_allowed_rows")\n self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))\n if self.error_tolerance > self.num_allowed_rows:\n raise ValueError("Tolerance can't be greater than the number of rows you expect.")\n description = f"Dataframe must have {self.num_allowed_rows} +- {self.error_tolerance} rows."\n super(RowCountConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe):\n check.inst_param(dataframe, "dataframe", DataFrame)\n\n if not (\n self.num_allowed_rows - self.error_tolerance\n <= len(dataframe)\n <= self.num_allowed_rows + self.error_tolerance\n ):\n raise DataFrameConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n "Expected {expected} +- {tolerance} rows. Got {received}".format(\n expected=self.num_allowed_rows,\n tolerance=self.error_tolerance,\n received=len(dataframe),\n )\n ),\n )
\n\n\ndef apply_ignore_missing_data_to_mask(mask, column):\n return mask & ~column.isnull()\n\n\nclass ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):\n """Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n\n offending_columns = set()\n offending_values = {}\n for column in columns:\n # TODO: grab extra metadata\n res = self.validation_fn(relevant_data[column])\n if not res[0]:\n offending_columns.add(column)\n if res[1].get("actual") is not None:\n offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]\n else:\n offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]\n if len(offending_columns) == 0 and not self.raise_or_typecheck:\n return TypeCheck(success=True)\n elif len(offending_columns) > 0:\n metadict = {\n "expectation": self.description.replace("Confirms", ""),\n "actual": offending_values,\n "offending": offending_columns,\n }\n exc = self.resulting_exception(\n 
constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass ColumnConstraintWithMetadata(ConstraintWithMetadata):\n """This class is useful for constructing single constraints that you want to apply to multiple\n columns of your dataframe.\n\n The main difference from the base class in terms of construction is that now, your validation_fns should operate on\n individual values.\n\n Args:\n description (str): description of the constraint\n validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:\n the validation function to run over inputted data\n This function should return a tuple of a boolean for success or failure, and a dict containing\n metadata about the test -- this metadata will be passed to the resulting exception if validation\n fails.\n resulting_exception (ConstraintWithMetadataException): what response a failed typecheck should induce\n raise_or_typecheck (Optional[bool]): whether to raise an exception (if set to True) or emit a failed typecheck event\n (if set to False) when validation fails\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def validate(self, data, *columns, **kwargs):\n if len(columns) == 0:\n columns = data.columns\n\n columns = [column for column in columns if column in data.columns]\n relevant_data = data[list(columns)]\n offending = {}\n offending_values = {}\n # TODO: grab metadata from here\n inverse_validation = lambda x: not self.validation_fn(x)[0]\n for column in columns:\n results = relevant_data[relevant_data[column].apply(inverse_validation)]\n if len(results.index.tolist()) > 0:\n offending[column] = ["row " + str(i) for i in (results.index.tolist())]\n offending_values[column] = results[column].tolist()\n if len(offending) == 0:\n if not self.raise_or_typecheck:\n return TypeCheck(success=True)\n else:\n metadict = {\n 
"expectation": self.validation_fn.__doc__,\n "actual": offending_values,\n "offending": offending,\n }\n exc = self.resulting_exception(\n constraint_name=self.name, constraint_description=self.description, **metadict\n )\n\n if self.raise_or_typecheck:\n raise exc\n else:\n return exc.return_as_typecheck()\n\n\nclass MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):\n """This class is useful for constructing more complicated relationships between columns\n and expectations -- i.e. you want some validations on column A, others on column B, etc.\n This lets you package up the metadata neatly, and also allows for cases like 'fail if any one of\n these constraints fails but still run all of them'.\n\n Args:\n description (str): description of the overall set of validations\n fn_and_columns_dict (Dict[str, List[Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is 'a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. 
Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n type_for_internal=ColumnConstraintWithMetadata,\n name=None,\n ):\n # TODO: support multiple descriptions\n self.column_to_fn_dict = check.dict_param(\n fn_and_columns_dict, "fn_and_columns_dict", key_type=str\n )\n\n def validation_fn(data, *args, **kwargs):\n metadict = defaultdict(dict)\n truthparam = True\n for column, fn_arr in self.column_to_fn_dict.items():\n if column not in data.columns:\n continue\n for fn in fn_arr:\n # TODO: do this more effectively\n new_validator = type_for_internal(\n fn.__doc__, fn, ColumnWithMetadataException, raise_or_typecheck=False\n )\n result = new_validator.validate(\n DataFrame(data[column]), column, *args, **kwargs\n )\n result_val = result.success\n if result_val:\n continue\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n truthparam = truthparam and result_val\n for key in result_dict.keys():\n if "constraint" not in key:\n if key == "expected":\n new_key = "expectation"\n result_dict[key] = result_dict[key].replace("returns", "").strip()\n if column not in metadict[new_key] or new_key not in metadict:\n metadict[new_key][column] = dict()\n metadict[new_key][column][fn.__name__] = result_dict[key]\n else:\n if column not in metadict[key] or key not in metadict:\n metadict[key][column] = dict()\n if isinstance(result_dict[key], dict):\n metadict[key][column][fn.__name__] = result_dict[key][column]\n else:\n metadict[key][column][fn.__name__] = "a violation"\n return truthparam, metadict\n\n super(MultiColumnConstraintWithMetadata, self).__init__(\n description,\n validation_fn,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n name=name,\n )\n\n def validate(self, data, *args, **kwargs):\n return ConstraintWithMetadata.validate(self, data, *args, 
**kwargs)\n\n\nclass MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):\n """This class is similar to multicolumn, but takes in functions that operate on the whole column at once\n rather than ones that operate on each value --\n consider this similar to the difference between apply-map and apply aggregate.\n\n Args:\n description (str): description of the overall set of validations (TODO: support multiple descriptions)\n fn_and_columns_dict (Dict[str, List[Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n while this is a relatively complex type,\n what it amounts to is a dict mapping columns to the functions to\n run on them'\n resulting_exception (type): the response to generate if validation fails. Subclass of\n ConstraintWithMetadataException\n raise_or_typecheck (Optional[bool]): whether to raise an exception (true) or a failed typecheck (false)\n type_for_internal (Optional[type]): what type to use for internal validators. Subclass of\n ConstraintWithMetadata\n name (Optional[str]): what to call the constraint, defaults to the class name.\n """\n\n def __init__(\n self,\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=True,\n name=None,\n ):\n super(MultiAggregateConstraintWithMetadata, self).__init__(\n description,\n fn_and_columns_dict,\n resulting_exception,\n raise_or_typecheck=raise_or_typecheck,\n type_for_internal=ColumnAggregateConstraintWithMetadata,\n name=name,\n )\n\n\ndef non_null_validation(x):\n """Validates that a particular value in a column is not null.\n\n Usage:\n pass this as a column validator to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Generally, you should prefer to use nonnull as a decorator/wrapper rather than using this\n directly.\n """\n return not pd.isnull(x), {}\n\n\ndef all_unique_validator(column, ignore_missing_vals=False):\n 
"""Validates that all values in an iterable are unique.\n\n Returns duplicated values as metadata.\n\n Usage:\n As a validation function for a\n :py:class:'~dagster_pandas.constraints.ColumnAggregateConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiAggregateConstraintWithMetadata'\n Example:\n .. code-block:: python\n aggregate_validator = MultiAggregateConstraintWithMetadata(\n "confirms all values are unique",\n {'bar': [all_unique_validator]},\n ConstraintWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_aggregate_validator=aggregate_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 3], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'bar': {'all_unique_validator': 'a violation'}}\n metadata['actual'] == {'bar': {'all_unique_validator': [10.0]}}\n """\n column = pd.Series(column)\n duplicated = column.duplicated()\n if ignore_missing_vals:\n duplicated = apply_ignore_missing_data_to_mask(duplicated, column)\n return not duplicated.any(), {"actual": column[duplicated]}\n\n\ndef nonnull(func):\n """Decorator for column validation functions to make them error on nulls.\n\n Usage:\n pass decorated functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Args:\n func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):\n the column validator you want to error on nulls.\n """\n\n @wraps(func)\n def nvalidator(val):\n origval = func(val)\n nval = non_null_validation(val)\n return origval[0] and nval[0], {}\n\n nvalidator.__doc__ += " and ensures no values are null"\n\n return nvalidator\n\n\ndef column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):\n 
"""Factory for validators testing if column values are within a range.\n\n Args:\n minim(Optional[Comparable]): the low end of the range\n maxim(Optional[Comparable]): the high end of the range\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n Examples:\n .. code-block:: python\n in_range_validator = column_range_validation_factory(1, 3, ignore_missing_vals=True)\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [in_range_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'in_range_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'in_range_validation_fn': [7]}}\n\n """\n if minim is None:\n if isinstance(maxim, datetime):\n minim = datetime.min\n else:\n minim = -1 * (sys.maxsize - 1)\n if maxim is None:\n if isinstance(minim, datetime):\n maxim = datetime.max\n else:\n maxim = sys.maxsize\n\n def in_range_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}\n\n in_range_validation_fn.__doc__ = f"checks whether values are between {minim} and {maxim}"\n if ignore_missing_vals:\n in_range_validation_fn.__doc__ += ", ignoring nulls"\n\n return in_range_validation_fn\n\n\ndef categorical_column_validator_factory(categories, 
ignore_missing_vals=False):\n """Factory for validators testing if all values are in some set.\n\n Args:\n categories(Union[Sequence, set]): the set of allowed values\n ignore_missing_vals(Optional[bool]): whether to ignore nulls.\n\n Returns: a validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Example:\n .. code-block:: python\n categorical_validation_fn = categorical_column_validator_factory([1, 2])\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [categorical_validation_fn]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 2, 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 2']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}\n\n """\n categories = set(categories)\n\n def categorical_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return (x in categories), {}\n\n categorical_validation_fn.__doc__ = (\n f"checks whether values are within this set of values: {categories}"\n )\n if ignore_missing_vals:\n categorical_validation_fn.__doc__ += ", ignoring nulls"\n\n return categorical_validation_fn\n\n\ndef dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):\n """Factory for testing if the dtype of a val falls within some allowed set.\n\n Args:\n datatypes(Union[set[type], type]): which datatype/datatypes are allowed\n ignore_missing_vals(Optional[bool]): whether to ignore nulls\n\n Returns: a 
validation function for this constraint\n\n Usage:\n pass returned functions as column validators to\n :py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'\n or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'\n\n Examples:\n .. code-block:: python\n dtype_is_num_validator = dtype_in_set_validation_factory((int, float, int64, float64))\n column_validator = MultiColumnConstraintWithMetadata(\n "confirms values are numbers in a range",\n {'foo': [dtype_is_num_validator]},\n ColumnWithMetadataException,\n raise_or_typecheck=False,\n )\n ntype = create_structured_dataframe_type(\n "NumericType",\n columns_validator=column_validator\n )\n @op(out={'basic_dataframe': Out(dagster_type=ntype)})\n def create_dataframe(_):\n yield Output(\n DataFrame({'foo': [1, 'a', 7], 'bar': [9, 10, 10]}),\n output_name='basic_dataframe',\n )\n #will fail with\n metadata['offending'] == {'foo': {'categorical_validation_fn': ['row 1']}}\n metadata['actual'] == {'foo': {'categorical_validation_fn': ['a']}}\n\n """\n\n def dtype_in_set_validation_fn(x):\n if ignore_missing_vals and pd.isnull(x):\n return True, {}\n return isinstance(x, datatypes), {}\n\n dtype_in_set_validation_fn.__doc__ = f"checks whether values are this type/types: {datatypes}"\n if ignore_missing_vals:\n dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"\n\n return dtype_in_set_validation_fn\n\n\nclass ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):\n def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):\n self.name = self.__class__.__name__\n\n description = f"Confirms values are between {minim} and {maxim}"\n super(ColumnRangeConstraintWithMetadata, self).__init__(\n description=description,\n validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),\n resulting_exception=ColumnWithMetadataException,\n raise_or_typecheck=raise_or_typecheck,\n )\n self.columns = columns\n\n def validate(self, data, *args, 
**kwargs):\n if self.columns is None:\n self.columns = list(data.columns)\n self.columns.extend(args)\n return super(ColumnRangeConstraintWithMetadata, self).validate(\n data, *self.columns, **kwargs\n )\n\n\nclass ColumnConstraint(Constraint):\n """Base constraint object that represent dataframe column shape constraints.\n\n Args:\n error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.\n markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.\n """\n\n def __init__(self, error_description=None, markdown_description=None):\n super(ColumnConstraint, self).__init__(\n error_description=error_description, markdown_description=markdown_description\n )\n\n def validate(self, dataframe, column_name):\n pass\n\n @staticmethod\n def get_offending_row_pairs(dataframe, column_name):\n return zip(dataframe.index.tolist(), dataframe[column_name].tolist())\n\n\nclass ColumnDTypeFnConstraint(ColumnConstraint):\n """A column constraint that applies a pandas dtype validation function to a columns dtype.\n\n Args:\n type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and\n returns if those dtypes match the types it expects. 
See pandas.core.dtypes.common for examples.\n """\n\n def __init__(self, type_fn):\n self.type_fn = check.callable_param(type_fn, "type_fn")\n description = f'Dtype must satisfy "{self.type_fn.__name__}"'\n super(ColumnDTypeFnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n column_dtype = dataframe[column_name].dtype\n if not self.type_fn(column_dtype):\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=f'{self.error_description}, but was "{column_dtype}"',\n column_name=column_name,\n )\n\n\nclass ColumnDTypeInSetConstraint(ColumnConstraint):\n """A column constraint that validates the pandas column dtypes based on the expected set of dtypes.\n\n Args:\n expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.\n """\n\n def __init__(self, expected_dtype_set):\n self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")\n description = f"Column dtype must be in the following set {self.expected_dtype_set}."\n super(ColumnDTypeInSetConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n received_dtypes = dataframe[column_name].dtype\n if str(received_dtypes) not in self.expected_dtype_set:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=(\n f"{self.error_description}. 
DTypes received: {received_dtypes}"\n ),\n column_name=column_name,\n )\n\n\nclass NonNullableColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are not null."""\n\n def __init__(self):\n description = "No Null values allowed."\n super(NonNullableColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n rows_with_null_columns = dataframe[dataframe[column_name].isna()]\n if not rows_with_null_columns.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=self.get_offending_row_pairs(rows_with_null_columns, column_name),\n )\n\n\nclass UniqueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are unique.\n\n Args:\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, ignore_missing_vals):\n description = "Column must be unique."\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(UniqueColumnConstraint, self).__init__(\n error_description=description, markdown_description=description\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name].duplicated()\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_duplicated_values = dataframe[invalid]\n if not rows_with_duplicated_values.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_duplicated_values,\n )\n\n\nclass CategoricalColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are a valid category.\n\n 
Args:\n categories (Set[str]): Set of categories that values in your pandas column must match.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, categories, ignore_missing_vals):\n self.categories = list(check.set_param(categories, "categories", of_type=str))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(CategoricalColumnConstraint, self).__init__(\n error_description=f"Expected Categories are {self.categories}",\n markdown_description=f"Category examples are {self.categories[:5]}...",\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].isin(self.categories)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n rows_with_unexpected_buckets = dataframe[invalid]\n if not rows_with_unexpected_buckets.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=rows_with_unexpected_buckets,\n )\n\n\nclass MinValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are greater than the provided\n lower bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, min_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MinValueColumnConstraint, self).__init__(\n markdown_description=f"values > {self.min_value}",\n error_description=f"Column must have values > {self.min_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = 
dataframe[column_name] < self.min_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass MaxValueColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are less than the provided\n upper bound [inclusive].\n\n Args:\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.\n """\n\n def __init__(self, max_value, ignore_missing_vals):\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(MaxValueColumnConstraint, self).__init__(\n markdown_description=f"values < {self.max_value}",\n error_description=f"Column must have values < {self.max_value}",\n )\n\n def validate(self, dataframe, column_name):\n invalid = dataframe[column_name] > self.max_value\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n\n\nclass InRangeColumnConstraint(ColumnConstraint):\n """A column constraint that ensures all values in a pandas column are between the lower and upper\n bound [inclusive].\n\n Args:\n min_value (Union[int, float, datetime.datetime]): The lower bound.\n max_value (Union[int, float, datetime.datetime]): The upper bound.\n 
ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non\n missing values.\n """\n\n def __init__(self, min_value, max_value, ignore_missing_vals):\n self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))\n self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))\n self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n super(InRangeColumnConstraint, self).__init__(\n markdown_description=f"{self.min_value} < values < {self.max_value}",\n error_description="Column must have values between {} and {} inclusive.".format(\n self.min_value, self.max_value\n ),\n )\n\n def validate(self, dataframe, column_name):\n invalid = ~dataframe[column_name].between(self.min_value, self.max_value)\n if self.ignore_missing_vals:\n invalid = apply_ignore_missing_data_to_mask(invalid, dataframe[column_name])\n out_of_bounds_rows = dataframe[invalid]\n if not out_of_bounds_rows.empty:\n raise ColumnConstraintViolationException(\n constraint_name=self.name,\n constraint_description=self.error_description,\n column_name=column_name,\n offending_rows=out_of_bounds_rows,\n )\n
", "current_page_name": "_modules/dagster_pandas/constraints", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.constraints"}, "data_frame": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.data_frame

\nimport pandas as pd\nfrom dagster import (\n    DagsterInvariantViolationError,\n    DagsterType,\n    Field,\n    MetadataValue,\n    StringSource,\n    TableColumn,\n    TableSchema,\n    TableSchemaMetadataValue,\n    TypeCheck,\n    _check as check,\n    dagster_type_loader,\n)\nfrom dagster._annotations import experimental\nfrom dagster._config import Selector\nfrom dagster._core.definitions.metadata import normalize_metadata\nfrom dagster._utils import dict_without_keys\n\nfrom dagster_pandas.constraints import (\n    CONSTRAINT_METADATA_KEY,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    ConstraintViolationException,\n)\nfrom dagster_pandas.validation import PandasColumn, validate_constraints\n\nCONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}\n\n\n@dagster_type_loader(\n    Selector(\n        {\n            "csv": {\n                "path": StringSource,\n                "sep": Field(StringSource, is_required=False, default_value=","),\n            },\n            "parquet": {"path": StringSource},\n            "table": {"path": StringSource},\n            "pickle": {"path": StringSource},\n        },\n    )\n)\ndef dataframe_loader(_context, config):\n    file_type, file_options = next(iter(config.items()))\n\n    if file_type == "csv":\n        path = file_options["path"]\n        return pd.read_csv(path, **dict_without_keys(file_options, "path"))\n    elif file_type == "parquet":\n        return pd.read_parquet(file_options["path"])\n    elif file_type == "table":\n        return pd.read_csv(file_options["path"], sep="\\t")\n    elif file_type == "pickle":\n        return pd.read_pickle(file_options["path"])\n    else:\n        raise DagsterInvariantViolationError(f"Unsupported file_type {file_type}")\n\n\ndef df_type_check(_, value):\n    if not isinstance(value, pd.DataFrame):\n        return TypeCheck(success=False)\n    return TypeCheck(\n        success=True,\n        metadata={\n            
"row_count": str(len(value)),\n            # string cast columns since they may be things like datetime\n            "metadata": {"columns": list(map(str, value.columns))},\n        },\n    )\n\n\nDataFrame = DagsterType(\n    name="PandasDataFrame",\n    description="""Two-dimensional size-mutable, potentially heterogeneous\n    tabular data structure with labeled axes (rows and columns).\n    See http://pandas.pydata.org/""",\n    loader=dataframe_loader,\n    type_check_fn=df_type_check,\n    typing_type=pd.DataFrame,\n)\n\n\ndef _construct_constraint_list(constraints):\n    def add_bullet(constraint_list, constraint_description):\n        return constraint_list + f"+ {constraint_description}\\n"\n\n    constraint_list = ""\n    for constraint in constraints:\n        if constraint.__class__ not in CONSTRAINT_BLACKLIST:\n            constraint_list = add_bullet(constraint_list, constraint.markdown_description)\n    return constraint_list\n\n\ndef _build_column_header(column_name, constraints):\n    header = f"**{column_name}**"\n    for constraint in constraints:\n        if isinstance(constraint, ColumnDTypeInSetConstraint):\n            dtypes_tuple = tuple(constraint.expected_dtype_set)\n            return header + f": `{dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]}`"\n        elif isinstance(constraint, ColumnDTypeFnConstraint):\n            return header + f": Validator `{constraint.type_fn.__name__}`"\n    return header\n\n\ndef create_dagster_pandas_dataframe_description(description, columns):\n    title = "\\n".join([description, "### Columns", ""])\n    buildme = title\n    for column in columns:\n        buildme += "{}\\n{}\\n".format(\n            _build_column_header(column.name, column.constraints),\n            _construct_constraint_list(column.constraints),\n        )\n    return buildme\n\n\ndef create_table_schema_metadata_from_dataframe(\n    pandas_df: pd.DataFrame,\n) -> TableSchemaMetadataValue:\n    """This function takes a 
pandas DataFrame and returns its metadata as a Dagster TableSchema.\n\n    Args:\n        pandas_df (pandas.DataFrame): A pandas DataFrame for which to create metadata.\n\n    Returns:\n        TableSchemaMetadataValue: returns an object with the TableSchema for the DataFrame.\n    """\n    check.inst(pandas_df, pd.DataFrame, "Input must be a pandas DataFrame object")\n    return MetadataValue.table_schema(\n        TableSchema(\n            columns=[\n                TableColumn(name=str(name), type=str(dtype))\n                for name, dtype in pandas_df.dtypes.items()\n            ]\n        )\n    )\n\n\n
[docs]def create_dagster_pandas_dataframe_type(\n name,\n description=None,\n columns=None,\n metadata_fn=None,\n dataframe_constraints=None,\n loader=None,\n):\n """Constructs a custom pandas dataframe dagster type.\n\n Args:\n name (str): Name of the dagster pandas type.\n description (Optional[str]): A markdown-formatted string, displayed in tooling.\n columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects\n which express dataframe column schemas and constraints.\n metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]])\n A callable which takes your dataframe and returns a dict with string label keys and\n MetadataValue values.\n dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from\n :py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n """\n # We allow for the plugging in of a dagster_type_loader so that users can load their custom\n # dataframes via configuration their own way if the default configs don't suffice. This is\n # purely optional.\n check.str_param(name, "name")\n metadata_fn = check.opt_callable_param(metadata_fn, "metadata_fn")\n description = create_dagster_pandas_dataframe_description(\n check.opt_str_param(description, "description", default=""),\n check.opt_list_param(columns, "columns", of_type=PandasColumn),\n )\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n\n try:\n validate_constraints(\n value,\n pandas_columns=columns,\n dataframe_constraints=dataframe_constraints,\n )\n except ConstraintViolationException as e:\n return TypeCheck(success=False, description=str(e))\n\n return TypeCheck(\n success=True,\n metadata=_execute_summary_stats(name, value, metadata_fn) if metadata_fn else None,\n )\n\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n typing_type=pd.DataFrame,\n )
\n\n\n@experimental\ndef create_structured_dataframe_type(\n name,\n description=None,\n columns_validator=None,\n columns_aggregate_validator=None,\n dataframe_validator=None,\n loader=None,\n):\n """Args:\n name (str): the name of the new type\n description (Optional[str]): the description of the new type\n columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):\n what column-level row by row validation you want to have applied.\n Leave empty for no column-level row by row validation.\n columns_aggregate_validator (Optional[Union[ColumnAggregateConstraintWithMetadata,\n MultiAggregateConstraintWithMetadata]]):\n what column-level aggregate validation you want to have applied,\n Leave empty for no column-level aggregate validation.\n dataframe_validator (Optional[Union[ConstraintWithMetadata, MultiConstraintWithMetadata]]):\n what dataframe-wide validation you want to have applied.\n Leave empty for no dataframe-wide validation.\n loader (Optional[DagsterTypeLoader]): An instance of a class that\n inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default\n to using `dataframe_loader`.\n\n Returns:\n a DagsterType with the corresponding name and packaged validation.\n\n """\n\n def _dagster_type_check(_, value):\n if not isinstance(value, pd.DataFrame):\n return TypeCheck(\n success=False,\n description=(\n f"Must be a pandas.DataFrame. Got value of type. 
{type(value).__name__}"\n ),\n )\n individual_result_dict = {}\n\n if dataframe_validator is not None:\n individual_result_dict["dataframe"] = dataframe_validator.validate(value)\n if columns_validator is not None:\n individual_result_dict["columns"] = columns_validator.validate(value)\n\n if columns_aggregate_validator is not None:\n individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(\n value\n )\n\n typechecks_succeeded = True\n metadata = {}\n overall_description = "Failed Constraints: {}"\n constraint_clauses = []\n for key, result in individual_result_dict.items():\n result_val = result.success\n if result_val:\n continue\n typechecks_succeeded = typechecks_succeeded and result_val\n result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data\n metadata[f"{key}-constraint-metadata"] = MetadataValue.json(result_dict)\n constraint_clauses.append(f"{key} failing constraints, {result.description}")\n # returns aggregates, then column, then dataframe\n return TypeCheck(\n success=typechecks_succeeded,\n description=overall_description.format(constraint_clauses),\n metadata=metadata,\n )\n\n description = check.opt_str_param(description, "description", default="")\n return DagsterType(\n name=name,\n type_check_fn=_dagster_type_check,\n loader=loader if loader else dataframe_loader,\n description=description,\n )\n\n\ndef _execute_summary_stats(type_name, value, metadata_fn):\n if not metadata_fn:\n return []\n\n user_metadata = metadata_fn(value)\n try:\n return normalize_metadata(user_metadata)\n except:\n raise DagsterInvariantViolationError(\n "The return value of the user-defined summary_statistics function for pandas "\n f"data frame type {type_name} returned {value}. This function must return "\n "Dict[str, RawMetadataValue]."\n )\n
", "current_page_name": "_modules/dagster_pandas/data_frame", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.data_frame"}, "validation": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pandas.validation

\nfrom dagster import (\n    DagsterInvariantViolationError,\n    _check as check,\n)\nfrom pandas import DataFrame, Timestamp\nfrom pandas.core.dtypes.common import (\n    is_bool_dtype,\n    is_float_dtype,\n    is_integer_dtype,\n    is_numeric_dtype,\n    is_string_dtype,\n)\n\nfrom dagster_pandas.constraints import (\n    CategoricalColumnConstraint,\n    ColumnDTypeFnConstraint,\n    ColumnDTypeInSetConstraint,\n    Constraint,\n    ConstraintViolationException,\n    DataFrameConstraint,\n    InRangeColumnConstraint,\n    NonNullableColumnConstraint,\n    UniqueColumnConstraint,\n)\n\nPANDAS_NUMERIC_TYPES = {"int64", "float"}\n\n\ndef _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):\n    non_nullable = check.bool_param(non_nullable, "exists")\n    unique = check.bool_param(unique, "unique")\n    ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")\n    if non_nullable and ignore_missing_vals:\n        raise DagsterInvariantViolationError(\n            "PandasColumn cannot have a non-null constraint while also ignore missing values"\n        )\n    constraints = []\n    if non_nullable:\n        constraints.append(NonNullableColumnConstraint())\n    if unique:\n        constraints.append(UniqueColumnConstraint(ignore_missing_vals=ignore_missing_vals))\n    return constraints\n\n\n
[docs]class PandasColumn:\n """The main API for expressing column level schemas and constraints for your custom dataframe\n types.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If th column exists, the validate function will validate the column. Defaults to True.\n constraints (Optional[List[Constraint]]): List of constraint objects that indicate the\n validation rules for the pandas column.\n """\n\n def __init__(self, name, constraints=None, is_required=None):\n self.name = check.str_param(name, "name")\n self.is_required = check.opt_bool_param(is_required, "is_required", default=True)\n self.constraints = check.opt_list_param(constraints, "constraints", of_type=Constraint)\n\n def validate(self, dataframe):\n if self.name not in dataframe.columns:\n # Ignore validation if column is missing from dataframe and is not required\n if self.is_required:\n raise ConstraintViolationException(\n f"Required column {self.name} not in dataframe with columns {dataframe.columns}"\n )\n else:\n for constraint in self.constraints:\n constraint.validate(dataframe, self.name)\n\n @staticmethod\n def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):\n """Simple constructor for PandasColumns that expresses existence constraints.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. 
Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=_construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def boolean_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_bool_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def numeric_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_numeric_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def integer_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_integer_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def float_column(\n name,\n min_value=-float("inf"),\n max_value=float("inf"),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column. Defaults to -float('inf')\n max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column. Defaults to float('inf')\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeFnConstraint(is_float_dtype),\n InRangeColumnConstraint(\n check.numeric_param(min_value, "min_value"),\n check.numeric_param(max_value, "max_value"),\n ignore_missing_vals=ignore_missing_vals,\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def datetime_column(\n name,\n min_datetime=Timestamp.min,\n max_datetime=Timestamp.max,\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n tz=None,\n ):\n """Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n min_datetime (Optional[Union[int,float]]): The lower bound for values you expect in this column.\n Defaults to pandas.Timestamp.min.\n max_datetime (Optional[Union[int,float]]): The upper bound for values you expect in this column.\n Defaults to pandas.Timestamp.max.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n tz (Optional[str]): Required timezone for values eg: tz='UTC', tz='Europe/Dublin', tz='US/Eastern'.\n Defaults to None, meaning naive datetime values.\n """\n if tz is None:\n datetime_constraint = ColumnDTypeInSetConstraint({"datetime64[ns]"})\n else:\n datetime_constraint = ColumnDTypeInSetConstraint({f"datetime64[ns, {tz}]"})\n # One day more/less than absolute min/max to prevent OutOfBoundsDatetime errors when converting min/max to be tz aware\n if min_datetime.tz_localize(None) == Timestamp.min:\n min_datetime = Timestamp("1677-09-22 00:12:43.145225Z")\n if max_datetime.tz_localize(None) == Timestamp.max:\n max_datetime = Timestamp("2262-04-10 23:47:16.854775807Z")\n # Convert bounds to same tz\n if Timestamp(min_datetime).tz is None:\n min_datetime = Timestamp(min_datetime).tz_localize(tz)\n if Timestamp(max_datetime).tz is None:\n max_datetime = Timestamp(max_datetime).tz_localize(tz)\n\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n datetime_constraint,\n InRangeColumnConstraint(\n min_datetime, max_datetime, ignore_missing_vals=ignore_missing_vals\n ),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def string_column(\n name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None\n ):\n """Simple constructor for PandasColumns that expresses constraints on string dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in the column\n ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. 
If true, the constraint will\n only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. Default to True.\n """\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[ColumnDTypeFnConstraint(is_string_dtype)]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )\n\n @staticmethod\n def categorical_column(\n name,\n categories,\n of_types=frozenset({"category", "object"}),\n non_nullable=False,\n unique=False,\n ignore_missing_vals=False,\n is_required=None,\n ):\n """Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.\n\n Args:\n name (str): Name of the column. This must match up with the column name in the dataframe you\n expect to receive.\n categories (List[Any]): The valid set of buckets that all values in the column must match.\n of_types (Optional[Union[str, Set[str]]]): The expected dtype[s] that your categories and values must\n abide by.\n non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values in\n the column ought to be non null values.\n unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the column values.\n ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true, the\n constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable cannot both be True.\n is_required (Optional[bool]): Flag indicating the optional/required presence of the column.\n If the column exists the validate function will validate the column. 
Default to True.\n """\n of_types = {of_types} if isinstance(of_types, str) else of_types\n return PandasColumn(\n name=check.str_param(name, "name"),\n constraints=[\n ColumnDTypeInSetConstraint(of_types),\n CategoricalColumnConstraint(categories, ignore_missing_vals=ignore_missing_vals),\n ]\n + _construct_keyword_constraints(\n non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals\n ),\n is_required=is_required,\n )
\n\n\ndef validate_constraints(dataframe, pandas_columns=None, dataframe_constraints=None):\n dataframe = check.inst_param(dataframe, "dataframe", DataFrame)\n pandas_columns = check.opt_list_param(\n pandas_columns, "column_constraints", of_type=PandasColumn\n )\n dataframe_constraints = check.opt_list_param(\n dataframe_constraints, "dataframe_constraints", of_type=DataFrameConstraint\n )\n\n if pandas_columns:\n for column in pandas_columns:\n column.validate(dataframe)\n\n if dataframe_constraints:\n for dataframe_constraint in dataframe_constraints:\n dataframe_constraint.validate(dataframe)\n
", "current_page_name": "_modules/dagster_pandas/validation", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pandas.validation"}}, "dagster_postgres": {"event_log": {"event_log": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.event_log.event_log

\nfrom typing import Any, ContextManager, Mapping, Optional, Sequence\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.event_api import EventHandlerFn\nfrom dagster._core.events import ASSET_CHECK_EVENTS, ASSET_EVENTS\nfrom dagster._core.events.log import EventLogEntry\nfrom dagster._core.storage.config import pg_config\nfrom dagster._core.storage.event_log import (\n    AssetKeyTable,\n    DynamicPartitionsTable,\n    SqlEventLogStorage,\n    SqlEventLogStorageMetadata,\n    SqlEventLogStorageTable,\n)\nfrom dagster._core.storage.event_log.base import EventLogCursor\nfrom dagster._core.storage.event_log.migration import ASSET_KEY_INDEX_COLS\nfrom dagster._core.storage.event_log.polling_event_watcher import SqlPollingEventWatcher\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._core.storage.sqlalchemy_compat import db_select\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, deserialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\nCHANNEL_NAME = "run_events"\n\n\n
[docs]class PostgresEventLogStorage(SqlEventLogStorage, ConfigurableClass):\n """Postgres-backed event log storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your event log storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 12-21\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = check.str_param(postgres_url, "postgres_url")\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n self._disposed = False\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n self._event_watcher = SqlPollingEventWatcher(self)\n\n 
self._secondary_index_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "event_logs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.reindex_events()\n self.reindex_assets()\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self._connect() as conn:\n with conn.begin():\n SqlEventLogStorageMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: Mapping[str, Any]\n ) -> "PostgresEventLogStorage":\n return PostgresEventLogStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n conn_string: str, 
should_autocreate_tables: bool = True\n ) -> "PostgresEventLogStorage":\n engine = create_engine(\n conn_string, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n SqlEventLogStorageMetadata.drop_all(engine)\n finally:\n engine.dispose()\n\n return PostgresEventLogStorage(conn_string, should_autocreate_tables)\n\n def store_event(self, event: EventLogEntry) -> None:\n """Store an event corresponding to a run.\n\n Args:\n event (EventLogEntry): The event to store.\n """\n check.inst_param(event, "event", EventLogEntry)\n insert_event_statement = self.prepare_insert_event(event) # from SqlEventLogStorage.py\n with self._connect() as conn:\n result = conn.execute(\n insert_event_statement.returning(\n SqlEventLogStorageTable.c.run_id, SqlEventLogStorageTable.c.id\n )\n )\n res = result.fetchone()\n result.close()\n\n # LISTEN/NOTIFY no longer used for pg event watch - preserved here to support version skew\n conn.execute(\n db.text(f"""NOTIFY {CHANNEL_NAME}, :notify_id; """),\n {"notify_id": res[0] + "_" + str(res[1])}, # type: ignore\n )\n event_id = int(res[1]) # type: ignore\n\n if (\n event.is_dagster_event\n and event.dagster_event_type in ASSET_EVENTS\n and event.dagster_event.asset_key # type: ignore\n ):\n self.store_asset_event(event, event_id)\n\n if event_id is None:\n raise DagsterInvariantViolationError(\n "Cannot store asset event tags for null event id."\n )\n\n self.store_asset_event_tags(event, event_id)\n\n if event.is_dagster_event and event.dagster_event_type in ASSET_CHECK_EVENTS:\n self.store_asset_check_event(event, event_id)\n\n def store_asset_event(self, event: EventLogEntry, event_id: int) -> None:\n check.inst_param(event, "event", EventLogEntry)\n if not (event.dagster_event and event.dagster_event.asset_key):\n return\n\n # We switched to storing the entire event record of the last materialization instead of just\n # the AssetMaterialization object, so that we have access to metadata like timestamp,\n # job, run_id, 
etc.\n #\n # This should make certain asset queries way more performant, without having to do extra\n # queries against the event log.\n #\n # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`\n # to `last_materialization_event`, for clarity. For now, we should do some back-compat.\n #\n # https://github.com/dagster-io/dagster/issues/3945\n\n # The AssetKeyTable contains a `last_materialization_timestamp` column that is exclusively\n # used to determine if an asset exists (last materialization timestamp > wipe timestamp).\n # This column is used nowhere else, and as of AssetObservation/AssetMaterializationPlanned\n # event creation, we want to extend this functionality to ensure that assets with any event\n # (observation, materialization, or materialization planned) yielded with timestamp\n # > wipe timestamp display in the Dagster UI.\n\n # As of the following PRs, we update last_materialization_timestamp to store the timestamp\n # of the latest asset observation, materialization, or materialization_planned that has occurred.\n # https://github.com/dagster-io/dagster/pull/6885\n # https://github.com/dagster-io/dagster/pull/7319\n\n # The AssetKeyTable also contains a `last_run_id` column that is updated upon asset\n # materialization. This column was not being used until the below PR. 
This new change\n # writes to the column upon `ASSET_MATERIALIZATION_PLANNED` events to fetch the last\n # run id for a set of assets in one roundtrip call to event log storage.\n # https://github.com/dagster-io/dagster/pull/7319\n\n values = self._get_asset_entry_values(\n event, event_id, self.has_secondary_index(ASSET_KEY_INDEX_COLS)\n )\n with self.index_connection() as conn:\n query = db_dialects.postgresql.insert(AssetKeyTable).values(\n asset_key=event.dagster_event.asset_key.to_string(),\n **values,\n )\n if values:\n query = query.on_conflict_do_update(\n index_elements=[AssetKeyTable.c.asset_key],\n set_=dict(**values),\n )\n else:\n query = query.on_conflict_do_nothing()\n conn.execute(query)\n\n def add_dynamic_partitions(\n self, partitions_def_name: str, partition_keys: Sequence[str]\n ) -> None:\n if not partition_keys:\n return\n\n # Overload base implementation to push upsert logic down into the db layer\n self._check_partitions_table()\n with self.index_connection() as conn:\n conn.execute(\n db_dialects.postgresql.insert(DynamicPartitionsTable)\n .values(\n [\n dict(partitions_def_name=partitions_def_name, partition=partition_key)\n for partition_key in partition_keys\n ]\n )\n .on_conflict_do_nothing(),\n )\n\n def _connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def run_connection(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return self._connect()\n\n def index_connection(self) -> ContextManager[Connection]:\n return self._connect()\n\n def has_table(self, table_name: str) -> bool:\n return bool(self._engine.dialect.has_table(self._engine.connect(), table_name))\n\n def has_secondary_index(self, name: str) -> bool:\n if name not in self._secondary_index_cache:\n self._secondary_index_cache[name] = super(\n PostgresEventLogStorage, self\n ).has_secondary_index(name)\n return self._secondary_index_cache[name]\n\n def enable_secondary_index(self, name: str) -> None:\n 
super(PostgresEventLogStorage, self).enable_secondary_index(name)\n if name in self._secondary_index_cache:\n del self._secondary_index_cache[name]\n\n def watch(\n self,\n run_id: str,\n cursor: Optional[str],\n callback: EventHandlerFn,\n ) -> None:\n if cursor and EventLogCursor.parse(cursor).is_offset_cursor():\n check.failed("Cannot call `watch` with an offset cursor")\n\n self._event_watcher.watch_run(run_id, cursor, callback)\n\n def _gen_event_log_entry_from_cursor(self, cursor) -> EventLogEntry:\n with self._engine.connect() as conn:\n cursor_res = conn.execute(\n db_select([SqlEventLogStorageTable.c.event]).where(\n SqlEventLogStorageTable.c.id == cursor\n ),\n )\n return deserialize_value(cursor_res.scalar(), EventLogEntry) # type: ignore\n\n def end_watch(self, run_id: str, handler: EventHandlerFn) -> None:\n self._event_watcher.unwatch_run(run_id, handler)\n\n def __del__(self) -> None:\n # Keep the inherent limitations of __del__ in Python in mind!\n self.dispose()\n\n def dispose(self) -> None:\n if not self._disposed:\n self._disposed = True\n self._event_watcher.close()\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self._connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/event_log/event_log", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.event_log.event_log"}}, "run_storage": {"run_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.run_storage.run_storage

\nimport zlib\nfrom typing import ContextManager, Mapping, Optional\n\nimport dagster._check as check\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.runs import (\n    DaemonHeartbeatsTable,\n    InstanceInfo,\n    RunStorageSqlMetadata,\n    SqlRunStorage,\n)\nfrom dagster._core.storage.runs.schema import KeyValueStoreTable, SnapshotsTable\nfrom dagster._core.storage.runs.sql_run_storage import SnapshotType\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._daemon.types import DaemonHeartbeat\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom dagster._utils import utc_datetime_from_timestamp\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresRunStorage(SqlRunStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your run storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 1-10\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n poolclass=db_pool.NullPool,\n )\n\n self._index_migration_cache = {}\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because 
alembic config may be shared with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n if "runs" not in table_names:\n retry_pg_creation_fn(self._init_db)\n self.migrate()\n self.optimize()\n elif "instance_info" not in table_names:\n InstanceInfo.create(self._engine)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n RunStorageSqlMetadata.create_all(conn)\n # This revision may be shared by any other dagster storage classes using the same DB\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold 1 open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ):\n return PostgresRunStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresRunStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n 
RunStorageSqlMetadata.drop_all(engine)\n finally:\n engine.dispose()\n return PostgresRunStorage(postgres_url, should_autocreate_tables)\n\n def connect(self) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n with self.connect() as conn:\n run_alembic_upgrade(pg_alembic_config(__file__), conn)\n\n def has_built_index(self, migration_name: str) -> bool:\n if migration_name not in self._index_migration_cache:\n self._index_migration_cache[migration_name] = super(\n PostgresRunStorage, self\n ).has_built_index(migration_name)\n return self._index_migration_cache[migration_name]\n\n def mark_index_built(self, migration_name: str) -> None:\n super(PostgresRunStorage, self).mark_index_built(migration_name)\n if migration_name in self._index_migration_cache:\n del self._index_migration_cache[migration_name]\n\n def add_daemon_heartbeat(self, daemon_heartbeat: DaemonHeartbeat) -> None:\n with self.connect() as conn:\n # insert or update if already present, using postgres specific on_conflict\n conn.execute(\n db_dialects.postgresql.insert(DaemonHeartbeatsTable)\n .values(\n timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n daemon_type=daemon_heartbeat.daemon_type,\n daemon_id=daemon_heartbeat.daemon_id,\n body=serialize_value(daemon_heartbeat),\n )\n .on_conflict_do_update(\n index_elements=[DaemonHeartbeatsTable.c.daemon_type],\n set_={\n "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),\n "daemon_id": daemon_heartbeat.daemon_id,\n "body": serialize_value(daemon_heartbeat),\n },\n )\n .returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n DaemonHeartbeatsTable.c.daemon_type,\n )\n )\n\n def set_cursor_values(self, pairs: Mapping[str, str]) -> None:\n check.mapping_param(pairs, "pairs", key_type=str, value_type=str)\n\n # pg speciic on_conflict_do_update\n insert_stmt = 
db_dialects.postgresql.insert(KeyValueStoreTable).values(\n [{"key": k, "value": v} for k, v in pairs.items()]\n )\n upsert_stmt = insert_stmt.on_conflict_do_update(\n index_elements=[\n KeyValueStoreTable.c.key,\n ],\n set_={"value": insert_stmt.excluded.value},\n ).returning(\n # required because sqlalchemy might by default return the declared primary key,\n # which might not exist\n KeyValueStoreTable.c.key\n )\n\n with self.connect() as conn:\n conn.execute(upsert_stmt)\n\n def _add_snapshot(self, snapshot_id: str, snapshot_obj, snapshot_type: SnapshotType) -> str:\n with self.connect() as conn:\n snapshot_insert = (\n db_dialects.postgresql.insert(SnapshotsTable)\n .values(\n snapshot_id=snapshot_id,\n snapshot_body=zlib.compress(serialize_value(snapshot_obj).encode("utf-8")),\n snapshot_type=snapshot_type.value,\n )\n .on_conflict_do_nothing()\n )\n conn.execute(snapshot_insert)\n return snapshot_id\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/run_storage/run_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.run_storage.run_storage"}}, "schedule_storage": {"schedule_storage": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_postgres.schedule_storage.schedule_storage

\nfrom typing import ContextManager, Optional\n\nimport dagster._check as check\nimport pendulum\nimport sqlalchemy as db\nimport sqlalchemy.dialects as db_dialects\nimport sqlalchemy.pool as db_pool\nfrom dagster._config.config_schema import UserConfigSchema\nfrom dagster._core.scheduler.instigation import InstigatorState\nfrom dagster._core.storage.config import PostgresStorageConfig, pg_config\nfrom dagster._core.storage.schedules import ScheduleStorageSqlMetadata, SqlScheduleStorage\nfrom dagster._core.storage.schedules.schema import InstigatorsTable\nfrom dagster._core.storage.sql import (\n    AlembicVersion,\n    check_alembic_revision,\n    create_engine,\n    run_alembic_upgrade,\n    stamp_alembic_rev,\n)\nfrom dagster._serdes import ConfigurableClass, ConfigurableClassData, serialize_value\nfrom sqlalchemy.engine import Connection\n\nfrom ..utils import (\n    create_pg_connection,\n    pg_alembic_config,\n    pg_statement_timeout,\n    pg_url_from_config,\n    retry_pg_connection_fn,\n    retry_pg_creation_fn,\n)\n\n\n
[docs]class PostgresScheduleStorage(SqlScheduleStorage, ConfigurableClass):\n """Postgres-backed run storage.\n\n Users should not directly instantiate this class; it is instantiated by internal machinery when\n ``dagster-webserver`` and ``dagster-graphql`` load, based on the values in the ``dagster.yaml`` file in\n ``$DAGSTER_HOME``. Configuration of this class should be done by setting values in that file.\n\n To use Postgres for all of the components of your instance storage, you can add the following\n block to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg.yaml\n :caption: dagster.yaml\n :lines: 1-8\n :language: YAML\n\n If you are configuring the different storage components separately and are specifically\n configuring your schedule storage to use Postgres, you can add a block such as the following\n to your ``dagster.yaml``:\n\n .. literalinclude:: ../../../../../../examples/docs_snippets/docs_snippets/deploying/dagster-pg-legacy.yaml\n :caption: dagster.yaml\n :lines: 23-32\n :language: YAML\n\n Note that the fields in this config are :py:class:`~dagster.StringSource` and\n :py:class:`~dagster.IntSource` and can be configured from environment variables.\n """\n\n def __init__(\n self,\n postgres_url: str,\n should_autocreate_tables: bool = True,\n inst_data: Optional[ConfigurableClassData] = None,\n ):\n self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)\n self.postgres_url = postgres_url\n self.should_autocreate_tables = check.bool_param(\n should_autocreate_tables, "should_autocreate_tables"\n )\n\n # Default to not holding any connections open to prevent accumulating connections per DagsterInstance\n self._engine = create_engine(\n self.postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n\n # Stamp and create tables if the main table does not exist (we can't check alembic\n # revision because alembic config may be shared 
with other storage classes)\n if self.should_autocreate_tables:\n table_names = retry_pg_connection_fn(lambda: db.inspect(self._engine).get_table_names())\n missing_main_table = "schedules" not in table_names and "jobs" not in table_names\n if missing_main_table:\n retry_pg_creation_fn(self._init_db)\n\n super().__init__()\n\n def _init_db(self) -> None:\n with self.connect() as conn:\n with conn.begin():\n ScheduleStorageSqlMetadata.create_all(conn)\n stamp_alembic_rev(pg_alembic_config(__file__), conn)\n\n # mark all the data migrations as applied\n self.migrate()\n self.optimize()\n\n def optimize_for_webserver(self, statement_timeout: int, pool_recycle: int) -> None:\n # When running in dagster-webserver, hold an open connection and set statement_timeout\n existing_options = self._engine.url.query.get("options")\n timeout_option = pg_statement_timeout(statement_timeout)\n if existing_options:\n options = f"{timeout_option} {existing_options}"\n else:\n options = timeout_option\n self._engine = create_engine(\n self.postgres_url,\n isolation_level="AUTOCOMMIT",\n pool_size=1,\n connect_args={"options": options},\n pool_recycle=pool_recycle,\n )\n\n @property\n def inst_data(self) -> Optional[ConfigurableClassData]:\n return self._inst_data\n\n @classmethod\n def config_type(cls) -> UserConfigSchema:\n return pg_config()\n\n @classmethod\n def from_config_value(\n cls, inst_data: Optional[ConfigurableClassData], config_value: PostgresStorageConfig\n ) -> "PostgresScheduleStorage":\n return PostgresScheduleStorage(\n inst_data=inst_data,\n postgres_url=pg_url_from_config(config_value),\n should_autocreate_tables=config_value.get("should_autocreate_tables", True),\n )\n\n @staticmethod\n def create_clean_storage(\n postgres_url: str, should_autocreate_tables: bool = True\n ) -> "PostgresScheduleStorage":\n engine = create_engine(\n postgres_url, isolation_level="AUTOCOMMIT", poolclass=db_pool.NullPool\n )\n try:\n ScheduleStorageSqlMetadata.drop_all(engine)\n 
finally:\n engine.dispose()\n return PostgresScheduleStorage(postgres_url, should_autocreate_tables)\n\n def connect(self, run_id: Optional[str] = None) -> ContextManager[Connection]:\n return create_pg_connection(self._engine)\n\n def upgrade(self) -> None:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n run_alembic_upgrade(alembic_config, conn)\n\n def _add_or_update_instigators_table(self, conn: Connection, state: InstigatorState) -> None:\n selector_id = state.selector_id\n conn.execute(\n db_dialects.postgresql.insert(InstigatorsTable)\n .values(\n selector_id=selector_id,\n repository_selector_id=state.repository_selector_id,\n status=state.status.value,\n instigator_type=state.instigator_type.value,\n instigator_body=serialize_value(state),\n )\n .on_conflict_do_update(\n index_elements=[InstigatorsTable.c.selector_id],\n set_={\n "status": state.status.value,\n "instigator_type": state.instigator_type.value,\n "instigator_body": serialize_value(state),\n "update_timestamp": pendulum.now("UTC"),\n },\n )\n )\n\n def alembic_version(self) -> AlembicVersion:\n alembic_config = pg_alembic_config(__file__)\n with self.connect() as conn:\n return check_alembic_revision(alembic_config, conn)
\n
", "current_page_name": "_modules/dagster_postgres/schedule_storage/schedule_storage", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_postgres.schedule_storage.schedule_storage"}}}, "dagster_prometheus": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_prometheus.resources

\nimport prometheus_client\nfrom dagster import (\n    ConfigurableResource,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom prometheus_client.exposition import default_handler\nfrom pydantic import Field, PrivateAttr\n\n\n
[docs]class PrometheusClient:\n """Integrates with Prometheus via the prometheus_client library."""
\n\n\n
[docs]class PrometheusResource(ConfigurableResource):\n """This resource is used to send metrics to a Prometheus Pushgateway.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster_prometheus import PrometheusResource\n from dagster import Definitions, job, op\n\n @op\n def example_prometheus_op(prometheus: PrometheusResource):\n prometheus.push_to_gateway(job="my_job")\n\n @job\n def my_job():\n example_prometheus_op()\n\n defs = Definitions(\n jobs=[my_job],\n resources={"prometheus": PrometheusResource(gateway="http://pushgateway.local")},\n )\n\n """\n\n gateway: str = Field(\n description=(\n "The url for your push gateway. Either of the"\n " form 'http://pushgateway.local', or 'pushgateway.local'."\n " Scheme defaults to 'http' if none is provided"\n )\n )\n timeout: int = Field(\n default=30,\n description="is how long delete will attempt to connect before giving up. Defaults to 30s.",\n )\n _registry: prometheus_client.CollectorRegistry = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._registry = prometheus_client.CollectorRegistry()\n\n @property\n def registry(self) -> prometheus_client.CollectorRegistry:\n return self._registry\n\n def push_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """Push metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n If not None, the argument must be a function which accepts\n the following arguments:\n url, method, timeout, headers, and content\n May be used to implement additional functionality not\n supported by the built-in 
default handler (such as SSL\n client certicates, and HTTP authentication mechanisms).\n 'url' is the URL for the request, the 'gateway' argument\n described earlier will form the basis of this URL.\n 'method' is the HTTP method which should be used when\n carrying out the request.\n 'timeout' requests not successfully completed after this\n many seconds should be aborted. If timeout is None, then\n the handler should not set a timeout.\n 'headers' is a list of ("header-name","header-value") tuples\n which must be passed to the pushgateway in the form of HTTP\n request headers.\n The function should raise an exception (e.g. IOError) on\n failure.\n 'content' is the data which should be used to form the HTTP\n Message Body.\n This overwrites all metrics with the same job and grouping_key.\n This uses the PUT HTTP method.\n """\n prometheus_client.push_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def pushadd_to_gateway(self, job, grouping_key=None, handler=default_handler) -> None:\n """PushAdd metrics to the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `registry` is an instance of CollectorRegistry\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This replaces metrics with the same name, job and grouping_key.\n This uses the POST HTTP method.\n """\n prometheus_client.pushadd_to_gateway(\n gateway=self.gateway,\n job=job,\n registry=self._registry,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )\n\n def delete_from_gateway(self, job, grouping_key=None, 
handler=default_handler) -> None:\n """Delete metrics from the given pushgateway.\n `job` is the job label to be attached to all pushed metrics\n `grouping_key` please see the pushgateway documentation for details.\n Defaults to None\n `handler` is an optional function which can be provided to perform\n requests to the 'gateway'.\n Defaults to None, in which case an http or https request\n will be carried out by a default handler.\n See the 'prometheus_client.push_to_gateway' documentation\n for implementation requirements.\n This deletes metrics with the given job and grouping_key.\n This uses the DELETE HTTP method.\n """\n prometheus_client.delete_from_gateway(\n gateway=self.gateway,\n job=job,\n grouping_key=grouping_key,\n timeout=self.timeout,\n handler=handler,\n )
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=PrometheusResource.to_config_schema(),\n description="""This resource is for sending metrics to a Prometheus Pushgateway.""",\n)\ndef prometheus_resource(context):\n return PrometheusResource(\n gateway=context.resource_config["gateway"], timeout=context.resource_config["timeout"]\n )
\n
", "current_page_name": "_modules/dagster_prometheus/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_prometheus.resources"}}, "dagster_pyspark": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_pyspark.resources

\nfrom typing import Any, Dict\n\nimport dagster._check as check\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom dagster_spark.configs_spark import spark_config\nfrom dagster_spark.utils import flatten_dict\nfrom pydantic import PrivateAttr\nfrom pyspark.sql import SparkSession\n\n\ndef spark_session_from_config(spark_conf=None):\n    spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n    builder = SparkSession.builder\n    flat = flatten_dict(spark_conf)\n    for key, value in flat:\n        builder = builder.config(key, value)\n\n    return builder.getOrCreate()\n\n\n
[docs]class PySparkResource(ConfigurableResource):\n """This resource provides access to a PySpark Session for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(pyspark: PySparkResource)\n spark_session = pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n\n @job(\n resource_defs={\n "pyspark": PySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def setup_for_execution(self, context: InitResourceContext) -> None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n return self.spark_session.sparkContext
\n\n\n
[docs]@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef pyspark_resource(init_context) -> PySparkResource:\n """This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.\n\n Example:\n .. code-block:: python\n\n @op(required_resource_keys={"pyspark"})\n def my_op(context):\n spark_session = context.resources.pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return PySparkResource.from_resource_context(context_updated_config)
\n\n\nclass LazyPySparkResource(ConfigurableResource):\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. code-block:: python\n\n @op\n def my_op(lazy_pyspark: LazyPySparkResource)\n spark_session = lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n @job(\n resource_defs={\n "lazy_pyspark": LazyPySparkResource(\n spark_config={\n "spark.executor.memory": "2g"\n }\n )\n }\n )\n def my_spark_job():\n my_op()\n """\n\n spark_config: Dict[str, Any]\n _spark_session = PrivateAttr(default=None)\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def _init_session(self) -> None:\n if self._spark_session is None:\n self._spark_session = spark_session_from_config(self.spark_config)\n\n @property\n def spark_session(self) -> Any:\n self._init_session()\n return self._spark_session\n\n @property\n def spark_context(self) -> Any:\n self._init_session()\n return self._spark_session.sparkContext\n\n\n@dagster_maintained_resource\n@resource({"spark_conf": spark_config()})\ndef lazy_pyspark_resource(init_context: InitResourceContext) -> LazyPySparkResource:\n """This resource provides access to a lazily-created PySpark SparkSession for executing PySpark\n code within Dagster, avoiding the creation of a SparkSession object until the .spark_session attribute\n of the resource is accessed. This is helpful for avoiding the creation (and startup penalty) of a SparkSession\n until it is actually needed / accessed by an op or IOManager.\n\n Example:\n .. 
code-block:: python\n\n @op(required_resource_keys={"lazy_pyspark"})\n def my_op(context):\n spark_session = context.resources.lazy_pyspark.spark_session\n dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n my_pyspark_resource = lazy_pyspark_resource.configured(\n {"spark_conf": {"spark.executor.memory": "2g"}}\n )\n\n @job(resource_defs={"lazy_pyspark": my_pyspark_resource})\n def my_spark_job():\n my_op()\n """\n context_updated_config = init_context.replace_config(\n {"spark_config": init_context.resource_config["spark_conf"]}\n )\n return LazyPySparkResource.from_resource_context(context_updated_config)\n
", "current_page_name": "_modules/dagster_pyspark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_pyspark.resources"}}, "dagster_shell": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.ops

\nimport os\nfrom enum import Enum\nfrom typing import AbstractSet, Any, Dict, Mapping, Optional\n\nfrom dagster import (\n    Config,\n    Failure,\n    In,\n    Nothing,\n    OpExecutionContext,\n    Out,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom pydantic import Field\n\nfrom .utils import execute, execute_script_file\n\n\nclass OutputType(Enum):\n    STREAM = "STREAM"\n    """Stream script stdout/stderr."""\n\n    BUFFER = "BUFFER"\n    """Buffer shell script stdout/stderr, then log upon completion."""\n\n    NONE = "NONE"\n    """No logging."""\n\n\nclass ShellOpConfig(Config):\n    env: Optional[Dict[str, str]] = Field(\n        default=None,\n        description="An optional dict of environment variables to pass to the subprocess.",\n    )\n    output_logging: OutputType = Field(\n        OutputType.BUFFER.value,\n    )\n    cwd: Optional[str] = Field(\n        default=None, description="Working directory in which to execute shell script"\n    )\n\n    def to_execute_params(self) -> Dict[str, Any]:\n        return {\n            "env": {**os.environ, **(self.env or {})},\n            "output_logging": self.output_logging.value,\n            "cwd": self.cwd,\n        }\n\n\n
[docs]@op(\n name="shell_op",\n description=(\n "This op executes a shell command it receives as input.\\n\\n"\n "This op is suitable for uses where the command to execute is generated dynamically by "\n "upstream ops. If you know the command to execute at job construction time, "\n "consider `shell_command_op` instead."\n ),\n ins={"shell_command": In(str)},\n out=Out(str),\n)\ndef shell_op(context: OpExecutionContext, shell_command: str, config: ShellOpConfig) -> str:\n """This op executes a shell command it receives as input.\n This op is suitable for uses where the command to execute is generated dynamically by\n upstream ops. If you know the command to execute at job construction time,\n consider ``shell_command_op`` instead.\n\n Args:\n shell_command: The shell command to be executed\n config (ShellOpConfig): A ShellOpConfig object specifying configuration options\n\n Examples:\n .. code-block:: python\n\n @op\n def create_shell_command():\n return "echo hello world!"\n\n @graph\n def echo_graph():\n shell_op(create_shell_command())\n """\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output
\n\n\n
[docs]def create_shell_command_op(\n shell_command: str,\n name: str,\n description: Optional[str] = None,\n required_resource_keys: Optional[AbstractSet[str]] = None,\n tags: Optional[Mapping[str, str]] = None,\n) -> OpDefinition:\n """This function is a factory that constructs ops to execute a shell command.\n\n Note that you can only use ``shell_command_op`` if you know the command you'd like to execute\n at job construction time. If you'd like to construct shell commands dynamically during\n job execution and pass them between ops, you should use ``shell_op`` instead.\n\n The resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_command_op("echo hello world!", name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_command (str): The shell command that the constructed op will execute.\n name (str): The name of the constructed op.\n description (Optional[str]): Human-readable description of this op.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by this op.\n Setting this ensures that resource spin up for the required resources will occur before\n the shell command is executed.\n tags (Optional[Dict[str, Any]]): Arbitrary metadata for the op. Frameworks may\n expect and require certain metadata to be attached to a op. Users should generally\n not set metadata directly. 
Values that are not strings will be json encoded and must meet\n the criteria that `json.loads(json.dumps(value)) == value`.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n\n @op(\n name=name,\n description=description,\n ins={"start": In(Nothing)},\n out=Out(str),\n required_resource_keys=required_resource_keys,\n tags=tags,\n )\n def _shell_fn(context, config: ShellOpConfig):\n output, return_code = execute(\n shell_command=shell_command, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_fn
\n\n\n
[docs]def create_shell_script_op(\n shell_script_path,\n name="create_shell_script_op",\n ins: Optional[Mapping[str, In]] = None,\n **kwargs: Any,\n) -> OpDefinition:\n """This function is a factory which constructs an op that will execute a shell command read\n from a script file.\n\n Any kwargs passed to this function will be passed along to the underlying :func:`@op\n <dagster.op>` decorator. However, note that overriding ``config`` or ``output_defs`` is not\n supported.\n\n You might consider using :func:`@graph <dagster.graph>` to wrap this op\n in the cases where you'd like to configure the shell op with different config fields.\n\n If no ``ins`` are passed then the resulting op can take a single ``start`` argument that is a\n `Nothing dependency <https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies>`__\n to allow you to run ops before the shell op.\n\n\n Examples:\n .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_op.py\n :language: python\n\n .. code-block:: python\n\n @op\n def run_before_shell_op():\n do_some_work()\n\n @graph\n def my_graph():\n my_echo_op = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="echo_op")\n my_echo_op(start=run_before_shell_op())\n\n\n Args:\n shell_script_path (str): The script file to execute.\n name (Optional[str]): The name of this op. Defaults to "create_shell_script_op".\n ins (Optional[Mapping[str, In]]): Ins for the op. 
Defaults to\n a single Nothing input.\n\n Raises:\n Failure: Raised when the shell command returns a non-zero exit code.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n check.str_param(shell_script_path, "shell_script_path")\n name = check.str_param(name, "name")\n check.opt_mapping_param(ins, "ins", value_type=In)\n\n if "config" in kwargs:\n raise TypeError("Overriding config for shell op is not supported.")\n\n @op(\n name=name,\n description=kwargs.pop("description", "An op to invoke a shell command."),\n ins=ins or {"start": In(Nothing)},\n out=Out(str),\n **kwargs,\n )\n def _shell_script_fn(context, config: ShellOpConfig):\n output, return_code = execute_script_file(\n shell_script_path=shell_script_path, log=context.log, **config.to_execute_params()\n )\n\n if return_code:\n raise Failure(description=f"Shell command execution failed with output: {output}")\n\n return output\n\n return _shell_script_fn
\n
", "current_page_name": "_modules/dagster_shell/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.ops"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_shell.utils

\n#\n# NOTE: This file is based on the bash operator from Apache Airflow, which can be found here:\n# https://github.com/apache/airflow/blob/master/airflow/operators/bash.py\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# "License"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n#   http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\nimport os\nimport signal\nfrom logging import Logger\nfrom subprocess import PIPE, STDOUT, Popen\nfrom typing import Mapping, Optional, Tuple\n\nimport dagster._check as check\nfrom dagster._utils import safe_tempfile_path\nfrom typing_extensions import Final\n\nOUTPUT_LOGGING_OPTIONS: Final = ["STREAM", "BUFFER", "NONE"]\n\n\ndef execute_script_file(\n    shell_script_path: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """Execute a shell script file specified by the argument ``shell_script_path``. The script will be\n    invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. 
literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_utility.py\n           :language: python\n\n    Args:\n        shell_script_path (str): The shell script to execute.\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Raises:\n        Exception: When an invalid output_logging is selected. Unreachable from op-based\n            invocation since the config system will check output_logging against the config\n            enum.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_script_path, "shell_script_path")\n    check.str_param(output_logging, "output_logging")\n    check.opt_str_param(cwd, "cwd", default=os.path.dirname(shell_script_path))\n    env = check.opt_nullable_dict_param(env, "env", key_type=str, value_type=str)\n\n    if output_logging not in OUTPUT_LOGGING_OPTIONS:\n        raise Exception("Unrecognized output_logging %s" % output_logging)\n\n    def pre_exec():\n        # Restore default signal disposition and invoke setsid\n        for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):\n            if hasattr(signal, sig):\n                signal.signal(getattr(signal, sig), signal.SIG_DFL)\n        os.setsid()\n\n    with open(shell_script_path, "rb") as f:\n        shell_command = f.read().decode("utf-8")\n\n    log.info(f"Running command:\\n{shell_command}")\n\n    sub_process = None\n  
  try:\n        stdout_pipe = PIPE\n        stderr_pipe = STDOUT\n        if output_logging == "NONE":\n            stdout_pipe = stderr_pipe = None\n\n        sub_process = Popen(\n            ["bash", shell_script_path],\n            stdout=stdout_pipe,\n            stderr=stderr_pipe,\n            cwd=cwd,\n            env=env,\n            preexec_fn=pre_exec,  # noqa: PLW1509\n            encoding="UTF-8",\n        )\n\n        log.info(f"Command pid: {sub_process.pid}")\n\n        output = ""\n        if output_logging == "STREAM":\n            assert sub_process.stdout is not None, "Setting stdout=PIPE should always set stdout."\n            # Stream back logs as they are emitted\n            lines = []\n            for line in sub_process.stdout:\n                log.info(line.rstrip())\n                lines.append(line)\n            output = "".join(lines)\n        elif output_logging == "BUFFER":\n            # Collect and buffer all logs, then emit\n            output, _ = sub_process.communicate()\n            log.info(output)\n\n        sub_process.wait()\n        log.info(f"Command exited with return code {sub_process.returncode}")\n\n        return output, sub_process.returncode\n    finally:\n        # Always terminate subprocess, including in cases where the run is terminated\n        if sub_process:\n            sub_process.terminate()\n\n\ndef execute(\n    shell_command: str,\n    output_logging: str,\n    log: Logger,\n    cwd: Optional[str] = None,\n    env: Optional[Mapping[str, str]] = None,\n) -> Tuple[str, int]:\n    """This function is a utility for executing shell commands from within a Dagster op (or from Python in general).\n    It can be used to execute shell commands on either op input data, or any data generated within a generic python op.\n\n    Internally, it executes a shell script specified by the argument ``shell_command``. 
The script will be written\n    to a temporary file first and invoked via ``subprocess.Popen(['bash', shell_script_path], ...)``.\n\n    In the Popen invocation, ``stdout=PIPE, stderr=STDOUT`` is used, and the combined stdout/stderr\n    output is retrieved.\n\n    Examples:\n        .. literalinclude:: ../../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_utility.py\n           :language: python\n\n    Args:\n        shell_command (str): The shell command to execute\n        output_logging (str): The logging mode to use. Supports STREAM, BUFFER, and NONE.\n        log (Union[logging.Logger, DagsterLogManager]): Any logger which responds to .info()\n        cwd (str, optional): Working directory for the shell command to use. Defaults to the\n            temporary path where we store the shell command in a script file.\n        env (Dict[str, str], optional): Environment dictionary to pass to ``subprocess.Popen``.\n            Unused by default.\n\n    Returns:\n        Tuple[str, int]: A tuple where the first element is the combined stdout/stderr output of running the shell\n        command and the second element is the return code.\n    """\n    check.str_param(shell_command, "shell_command")\n    # other args checked in execute_file\n\n    with safe_tempfile_path() as tmp_file_path:\n        tmp_path = os.path.dirname(tmp_file_path)\n        log.info("Using temporary directory: %s" % tmp_path)\n\n        with open(tmp_file_path, "wb") as tmp_file:\n            tmp_file.write(shell_command.encode("utf-8"))\n            tmp_file.flush()\n            script_location = os.path.abspath(tmp_file.name)\n            log.info(f"Temporary script location: {script_location}")\n            return execute_script_file(\n                shell_script_path=tmp_file.name,\n                output_logging=output_logging,\n                log=log,\n                cwd=(cwd or tmp_path),\n                env=env,\n            )\n
", "current_page_name": "_modules/dagster_shell/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_shell.utils"}}, "dagster_slack": {"hooks": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.hooks

\nfrom typing import Callable, Optional\n\nfrom dagster._annotations import deprecated_param\nfrom dagster._core.definitions import failure_hook, success_hook\nfrom dagster._core.execution.context.hook import HookContext\nfrom dagster._utils.warnings import normalize_renamed_param\n\n\ndef _default_status_message(context: HookContext, status: str) -> str:\n    return f"Op {context.op.name} on job {context.job_name} {status}!\\nRun ID: {context.run_id}"\n\n\ndef _default_failure_message(context: HookContext) -> str:\n    return _default_status_message(context, status="failed")\n\n\ndef _default_success_message(context: HookContext) -> str:\n    return _default_status_message(context, status="succeeded")\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_failure(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_failure_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step failure events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_failure("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} failed!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @failure_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\ndef slack_on_success(\n channel: str,\n message_fn: Callable[[HookContext], str] = _default_success_message,\n dagit_base_url: Optional[str] = None,\n webserver_base_url: Optional[str] = None,\n):\n """Create a hook on step success events that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n message_fn (Optional(Callable[[HookContext], str])): Function which takes in the HookContext\n outputs the message you want to send.\n dagit_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. Specify this to allow\n messages to include deeplinks to the specific run that triggered the hook.\n\n Examples:\n .. code-block:: python\n\n @slack_on_success("#foo", webserver_base_url="http://localhost:3000")\n @job(...)\n def my_job():\n pass\n\n .. code-block:: python\n\n def my_message_fn(context: HookContext) -> str:\n return f"Op {context.op} worked!"\n\n @op\n def an_op(context):\n pass\n\n @job(...)\n def my_job():\n an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n\n @success_hook(required_resource_keys={"slack"})\n def _hook(context: HookContext):\n text = message_fn(context)\n if webserver_base_url:\n text += f"\\n<{webserver_base_url}/runs/{context.run_id}|View in Dagster UI>"\n\n context.resources.slack.chat_postMessage(channel=channel, text=text)\n\n return _hook
\n
", "current_page_name": "_modules/dagster_slack/hooks", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.hooks"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom pydantic import Field\nfrom slack_sdk.web.client import WebClient\n\n\n
[docs]class SlackResource(ConfigurableResource):\n """This resource is for connecting to Slack.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import EnvVar, job, op\n from dagster_slack import SlackResource\n\n\n @op\n def slack_op(slack: SlackResource):\n slack.get_client().chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job\n def slack_job():\n slack_op()\n\n defs = Definitions(\n jobs=[slack_job],\n resources={\n "slack": SlackResource(token=EnvVar("MY_SLACK_TOKEN")),\n },\n )\n """\n\n token: str = Field(\n description=(\n "To configure access to the Slack API, you'll need an access"\n " token provisioned with access to your Slack workspace."\n " Tokens are typically either user tokens or bot tokens. For programmatic posting"\n " to Slack from this resource, you probably want to provision and use a bot token."\n " More in the Slack API documentation here: https://api.slack.com/docs/token-types"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def get_client(self) -> WebClient:\n """Returns a ``slack_sdk.WebClient`` for interacting with the Slack API."""\n return WebClient(self.token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SlackResource.to_config_schema(),\n)\ndef slack_resource(context) -> WebClient:\n """This resource is for connecting to Slack.\n\n The resource object is a `slack_sdk.WebClient`.\n\n By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.\n\n Examples:\n .. code-block:: python\n\n import os\n\n from dagster import job, op\n from dagster_slack import slack_resource\n\n\n @op(required_resource_keys={'slack'})\n def slack_op(context):\n context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n @job(resource_defs={'slack': slack_resource})\n def slack_job():\n slack_op()\n\n slack_job.execute_in_process(\n run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n )\n """\n return SlackResource.from_resource_context(context).get_client()
\n
", "current_page_name": "_modules/dagster_slack/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.resources"}, "sensors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_slack.sensors

\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    Callable,\n    Dict,\n    List,\n    Optional,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n)\n\nfrom dagster import (\n    AssetSelection,\n    DefaultSensorStatus,\n    FreshnessPolicySensorContext,\n    freshness_policy_sensor,\n)\nfrom dagster._annotations import deprecated_param, experimental\nfrom dagster._core.definitions import GraphDefinition, JobDefinition\nfrom dagster._core.definitions.run_status_sensor_definition import (\n    RunFailureSensorContext,\n    run_failure_sensor,\n)\nfrom dagster._core.definitions.unresolved_asset_job_definition import UnresolvedAssetJobDefinition\nfrom dagster._utils.warnings import normalize_renamed_param\nfrom slack_sdk.web.client import WebClient\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.selector import (\n        CodeLocationSelector,\n        JobSelector,\n        RepositorySelector,\n    )\n\nT = TypeVar("T", RunFailureSensorContext, FreshnessPolicySensorContext)\n\n\ndef _build_slack_blocks_and_text(\n    context: T,\n    text_fn: Callable[[T], str],\n    blocks_fn: Optional[Callable[[T], List[Dict[Any, Any]]]],\n    webserver_base_url: Optional[str],\n) -> Tuple[List[Dict[str, Any]], str]:\n    main_body_text = text_fn(context)\n    blocks: List[Dict[Any, Any]] = []\n    if blocks_fn:\n        blocks.extend(blocks_fn(context))\n    else:\n        if isinstance(context, RunFailureSensorContext):\n            text = (\n                f'*Job "{context.dagster_run.job_name}" failed.'\n                f' `{context.dagster_run.run_id.split("-")[0]}`*'\n            )\n        else:\n            text = (\n                f'*Asset "{context.asset_key.to_user_string()}" is now'\n                f' {"on time" if context.minutes_overdue == 0 else f"{context.minutes_overdue:.2f} minutes late.*"}'\n            )\n\n        blocks.extend(\n            [\n                {\n                    "type": "section",\n                    "text": {\n     
                   "type": "mrkdwn",\n                        "text": text,\n                    },\n                },\n                {\n                    "type": "section",\n                    "text": {"type": "mrkdwn", "text": main_body_text},\n                },\n            ]\n        )\n\n    if webserver_base_url:\n        if isinstance(context, RunFailureSensorContext):\n            url = f"{webserver_base_url}/runs/{context.dagster_run.run_id}"\n        else:\n            url = f"{webserver_base_url}/assets/{'/'.join(context.asset_key.path)}"\n        blocks.append(\n            {\n                "type": "actions",\n                "elements": [\n                    {\n                        "type": "button",\n                        "text": {"type": "plain_text", "text": "View in Dagster UI"},\n                        "url": url,\n                    }\n                ],\n            }\n        )\n    return blocks, main_body_text\n\n\ndef _default_failure_message_text_fn(context: RunFailureSensorContext) -> str:\n    return f"Error: ```{context.failure_event.message}```"\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@deprecated_param(\n param="job_selection",\n breaking_version="2.0",\n additional_warn_text="Use `monitored_jobs` instead.",\n)\ndef make_slack_on_run_failure_sensor(\n channel: str,\n slack_token: str,\n text_fn: Callable[[RunFailureSensorContext], str] = _default_failure_message_text_fn,\n blocks_fn: Optional[Callable[[RunFailureSensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n minimum_interval_seconds: Optional[int] = None,\n monitored_jobs: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n job_selection: Optional[\n Sequence[\n Union[\n JobDefinition,\n GraphDefinition,\n UnresolvedAssetJobDefinition,\n "RepositorySelector",\n "JobSelector",\n "CodeLocationSelector",\n ]\n ]\n ] = None,\n monitor_all_repositories: bool = False,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor on job failures that will message the given Slack channel.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``RunFailureSensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains error message, job name, and run ID.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. 
If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]): Function which takes in\n the ``RunFailureSensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_run_failure".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the failed job run.\n minimum_interval_seconds: (Optional[int]): The minimum number of seconds that will elapse\n between sensor evaluations.\n monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): The jobs in the\n current repository that will be monitored by this failure sensor. Defaults to None, which\n means the alert will be sent when any job in the repository fails. To monitor jobs in external repositories, use RepositorySelector and JobSelector\n job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]): (deprecated in favor of monitored_jobs)\n The jobs in the current repository that will be monitored by this failure sensor. Defaults to None, which means the alert will\n be sent when any job in the repository fails.\n monitor_all_repositories (bool): If set to True, the sensor will monitor all runs in the\n Dagster instance. If set to True, an error will be raised if you also specify\n monitored_jobs or job_selection. Defaults to False.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your webserver instance. 
Specify this to allow\n messages to include deeplinks to the failed job run.\n\n Examples:\n .. code-block:: python\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN")\n )\n\n @repository\n def my_repo():\n return [my_job + slack_on_run_failure]\n\n .. code-block:: python\n\n def my_message_fn(context: RunFailureSensorContext) -> str:\n return (\n f"Job {context.dagster_run.job_name} failed!"\n f"Error: {context.failure_event.message}"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n jobs = monitored_jobs if monitored_jobs else job_selection\n\n @run_failure_sensor(\n name=name,\n minimum_interval_seconds=minimum_interval_seconds,\n monitored_jobs=jobs,\n monitor_all_repositories=monitor_all_repositories,\n default_status=default_status,\n )\n def slack_on_run_failure(context: RunFailureSensorContext):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_run_failure
\n\n\ndef _default_freshness_message_text_fn(context: FreshnessPolicySensorContext) -> str:\n return (\n f"Asset `{context.asset_key.to_user_string()}` is now {context.minutes_overdue:.2f} minutes"\n " late."\n )\n\n\n
[docs]@deprecated_param(\n param="dagit_base_url",\n breaking_version="2.0",\n additional_warn_text="Use `webserver_base_url` instead.",\n)\n@experimental\ndef make_slack_on_freshness_policy_status_change_sensor(\n channel: str,\n slack_token: str,\n asset_selection: AssetSelection,\n warn_after_minutes_overdue: float = 0,\n notify_when_back_on_time: bool = False,\n text_fn: Callable[[FreshnessPolicySensorContext], str] = _default_freshness_message_text_fn,\n blocks_fn: Optional[Callable[[FreshnessPolicySensorContext], List[Dict[Any, Any]]]] = None,\n name: Optional[str] = None,\n dagit_base_url: Optional[str] = None,\n default_status: DefaultSensorStatus = DefaultSensorStatus.STOPPED,\n webserver_base_url: Optional[str] = None,\n):\n """Create a sensor that will message the given Slack channel whenever an asset in the provided\n AssetSelection becomes out of date. Messages are only fired when the state changes, meaning\n only a single slack message will be sent (when the asset begins to be out of date). If\n `notify_when_back_on_time` is set to `True`, a second slack message will be sent once the asset\n is on time again.\n\n Args:\n channel (str): The channel to send the message to (e.g. "#my_channel")\n slack_token (str): The slack token.\n Tokens are typically either user tokens or bot tokens. 
More in the Slack API\n documentation here: https://api.slack.com/docs/token-types\n asset_selection (AssetSelection): The selection of assets which this sensor will monitor.\n Alerts will only be fired for assets that have a FreshnessPolicy defined.\n warn_after_minutes_overdue (float): How many minutes past the specified FreshnessPolicy this\n sensor will wait before firing an alert (by default, an alert will be fired as soon as\n the policy is violated).\n notify_when_back_on_time (bool): If a success message should be sent when the asset becomes on\n time again.\n text_fn (Optional(Callable[[RunFailureSensorContext], str])): Function which\n takes in the ``FreshnessPolicySensorContext`` and outputs the message you want to send.\n Defaults to a text message that contains the relevant asset key, and the number of\n minutes past its defined freshness policy it currently is.\n The usage of the `text_fn` changes depending on whether you're using `blocks_fn`. If you\n are using `blocks_fn`, this is used as a fallback string to display in notifications. If\n you aren't, this is the main body text of the message. It can be formatted as plain text,\n or with markdown.\n See more details in https://api.slack.com/methods/chat.postMessage#text_usage\n blocks_fn (Callable[[FreshnessPolicySensorContext], List[Dict]]): Function which takes in\n the ``FreshnessPolicySensorContext`` and outputs the message blocks you want to send.\n See information about Blocks in https://api.slack.com/reference/block-kit/blocks\n name: (Optional[str]): The name of the sensor. Defaults to "slack_on_freshness_policy".\n dagit_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n default_status (DefaultSensorStatus): Whether the sensor starts as running or not. 
The default\n status can be overridden from Dagit or via the GraphQL API.\n webserver_base_url: (Optional[str]): The base url of your Dagit instance. Specify this to allow\n messages to include deeplinks to the relevant asset page.\n\n Examples:\n .. code-block:: python\n\n slack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n "#my_channel",\n os.getenv("MY_SLACK_TOKEN"),\n )\n\n .. code-block:: python\n\n def my_message_fn(context: FreshnessPolicySensorContext) -> str:\n if context.minutes_overdue == 0:\n return f"Asset {context.asset_key} is currently on time :)"\n return (\n f"Asset {context.asset_key} is currently {context.minutes_overdue} minutes late!!"\n )\n\n slack_on_run_failure = make_slack_on_run_failure_sensor(\n channel="#my_channel",\n slack_token=os.getenv("MY_SLACK_TOKEN"),\n text_fn=my_message_fn,\n webserver_base_url="http://mycoolsite.com",\n )\n\n\n """\n webserver_base_url = normalize_renamed_param(\n webserver_base_url, "webserver_base_url", dagit_base_url, "dagit_base_url"\n )\n slack_client = WebClient(token=slack_token)\n\n @freshness_policy_sensor(\n name=name, asset_selection=asset_selection, default_status=default_status\n )\n def slack_on_freshness_policy(context: FreshnessPolicySensorContext):\n if context.minutes_overdue is None or context.previous_minutes_overdue is None:\n return\n\n if (\n context.minutes_overdue > warn_after_minutes_overdue\n and context.previous_minutes_overdue <= warn_after_minutes_overdue\n ) or (\n notify_when_back_on_time\n and context.minutes_overdue == 0\n and context.previous_minutes_overdue != 0\n ):\n blocks, main_body_text = _build_slack_blocks_and_text(\n context=context,\n text_fn=text_fn,\n blocks_fn=blocks_fn,\n webserver_base_url=webserver_base_url,\n )\n\n slack_client.chat_postMessage(channel=channel, blocks=blocks, text=main_body_text)\n\n return slack_on_freshness_policy
\n
", "current_page_name": "_modules/dagster_slack/sensors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_slack.sensors"}}, "dagster_snowflake": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.ops

\nfrom dagster import (\n    Nothing,\n    _check as check,\n    op,\n)\nfrom dagster._core.definitions.input import In\n\n\ndef _core_create_snowflake_command(dagster_decorator, decorator_name, sql, parameters=None):\n    check.str_param(sql, "sql")\n    check.opt_dict_param(parameters, "parameters")\n\n    @dagster_decorator(\n        name=f"snowflake_{decorator_name}",\n        ins={"start": In(Nothing)},\n        required_resource_keys={"snowflake"},\n        tags={"kind": "sql", "sql": sql},\n    )\n    def snowflake_fn(context):\n        context.resources.snowflake.execute_query(sql=sql, parameters=parameters)\n\n    return snowflake_fn\n\n\ndef snowflake_solid_for_query(sql, parameters=None):\n    """This function is a solid factory that constructs solids to execute a snowflake query.\n\n    Note that you can only use `snowflake_solid_for_query` if you know the query you'd like to\n    execute at job construction time. If you'd like to execute queries dynamically during\n    job execution, you should manually execute those queries in your custom solid using the\n    snowflake resource.\n\n    Args:\n        sql (str): The sql query that will execute against the provided snowflake resource.\n        parameters (dict): The parameters for the sql query.\n\n    Returns:\n        SolidDefinition: Returns the constructed solid definition.\n    """\n    return _core_create_snowflake_command(op, "solid", sql, parameters)\n\n\n
[docs]def snowflake_op_for_query(sql, parameters=None):\n """This function is an op factory that constructs an op to execute a snowflake query.\n\n Note that you can only use `snowflake_op_for_query` if you know the query you'd like to\n execute at graph construction time. If you'd like to execute queries dynamically during\n job execution, you should manually execute those queries in your custom op using the\n snowflake resource.\n\n Args:\n sql (str): The sql query that will execute against the provided snowflake resource.\n parameters (dict): The parameters for the sql query.\n\n Returns:\n OpDefinition: Returns the constructed op definition.\n """\n return _core_create_snowflake_command(op, "op", sql, parameters)
\n
", "current_page_name": "_modules/dagster_snowflake/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.resources

\nimport base64\nimport sys\nimport warnings\nfrom contextlib import closing, contextmanager\nfrom typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Union\n\nimport dagster._check as check\nfrom cryptography.hazmat.backends import default_backend\nfrom cryptography.hazmat.primitives import serialization\nfrom dagster import (\n    ConfigurableResource,\n    IAttachDifferentObjectToOpContext,\n    get_dagster_logger,\n    resource,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.storage.event_log.sql_event_log import SqlDbConnection\nfrom dagster._utils.cached_method import cached_method\nfrom pydantic import Field, root_validator, validator\n\ntry:\n    import snowflake.connector\nexcept ImportError:\n    msg = (\n        "Could not import snowflake.connector. This could mean you have an incompatible version "\n        "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; "\n        "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is "\n        "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall "\n        "dagster-snowflake to fix this error."\n    )\n    warnings.warn(msg)\n    raise\n\n\n
[docs]class SnowflakeResource(ConfigurableResource, IAttachDifferentObjectToOpContext):\n """A resource for connecting to the Snowflake data warehouse.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n object. If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import SnowflakeResource\n\n @op\n def get_one(snowflake_resource: SnowflakeResource):\n with snowflake_resource.get_connection() as conn:\n # conn is a snowflake.connector.Connection object\n conn.cursor().execute("SELECT 1")\n\n @job\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n resources={\n 'snowflake_resource': SnowflakeResource(\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n user=EnvVar("SNOWFLAKE_USER"),\n password=EnvVar("SNOWFLAKE_PASSWORD")\n database="MY_DATABASE",\n schema="MY_SCHEMA",\n warehouse="MY_WAREHOUSE"\n )\n }\n )\n """\n\n account: Optional[str] = Field(\n default=None,\n description=(\n "Your Snowflake account name. 
For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n\n user: str = Field(description="User login name.")\n\n password: Optional[str] = Field(default=None, description="User password.")\n\n database: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default database to use. After login, you can use ``USE DATABASE`` "\n " to change the database."\n ),\n )\n\n schema_: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default schema to use. After login, you can use ``USE SCHEMA`` to "\n "change the schema."\n ),\n alias="schema",\n ) # schema is a reserved word for pydantic\n\n role: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default role to use. After login, you can use ``USE ROLE`` to change "\n " the role."\n ),\n )\n\n warehouse: Optional[str] = Field(\n default=None,\n description=(\n "Name of the default warehouse to use. After login, you can use ``USE WAREHOUSE`` "\n "to change the role."\n ),\n )\n\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set private_key_path and private_key_password. To avoid issues with"\n " newlines in the keys, you can base64 encode the key. You can retrieve the base64"\n " encoded key with this shell command: ``cat rsa_key.p8 | base64``"\n ),\n )\n\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key password to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both ``private_key`` and ``private_key_path`` if the private key is"\n " encrypted. 
For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key path to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Alternately, set the raw private key as ``private_key``."\n ),\n )\n\n autocommit: Optional[bool] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True "\n "or False to enable or disable autocommit mode in the session, respectively."\n ),\n )\n\n client_prefetch_threads: Optional[int] = Field(\n default=None,\n description=(\n "Number of threads used to download the results sets (4 by default). "\n "Increasing the value improves fetch performance but requires more memory."\n ),\n )\n\n client_session_keep_alive: Optional[bool] = Field(\n default=None,\n description=(\n "False by default. Set this to True to keep the session active indefinitely, "\n "even if there is no activity from the user. Make certain to call the close method to "\n "terminate the thread properly or the process may hang."\n ),\n )\n\n login_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for login. By default, 60 seconds. The login request gives "\n 'up after the timeout length if the HTTP response is "success".'\n ),\n )\n\n network_timeout: Optional[int] = Field(\n default=None,\n description=(\n "Timeout in seconds for all other operations. By default, none/infinite. A general"\n " request gives up after the timeout length if the HTTP response is not 'success'."\n ),\n )\n\n ocsp_response_cache_filename: Optional[str] = Field(\n default=None,\n description=(\n "URI for the OCSP response cache file. 
By default, the OCSP response cache "\n "file is created in the cache directory."\n ),\n )\n\n validate_default_parameters: Optional[bool] = Field(\n default=None,\n description=(\n "If True, raise an exception if the warehouse, database, or schema doesn't exist."\n " Defaults to False."\n ),\n )\n\n paramstyle: Optional[str] = Field(\n default=None,\n description=(\n "pyformat by default for client side binding. Specify qmark or numeric to "\n "change bind variable formats for server side binding."\n ),\n )\n\n timezone: Optional[str] = Field(\n default=None,\n description=(\n "None by default, which honors the Snowflake parameter TIMEZONE. Set to a "\n "valid time zone (e.g. America/Los_Angeles) to set the session time zone."\n ),\n )\n\n connector: Optional[str] = Field(\n default=None,\n description=(\n "Indicate alternative database connection engine. Permissible option is "\n "'sqlalchemy' otherwise defaults to use the Snowflake Connector for Python."\n ),\n is_required=False,\n )\n\n cache_column_metadata: Optional[str] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a"\n " flag ``cache_column_metadata=True`` such that all of column metadata for all tables"\n ' are "cached"'\n ),\n )\n\n numpy: Optional[bool] = Field(\n default=None,\n description=(\n "Optional parameter when connector is set to sqlalchemy. 
To enable fetching "\n "NumPy data types, add numpy=True to the connection parameters."\n ),\n )\n\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @validator("paramstyle")\n def validate_paramstyle(cls, v: Optional[str]) -> Optional[str]:\n valid_config = ["pyformat", "qmark", "numeric"]\n if v is not None and v not in valid_config:\n raise ValueError(\n "Snowflake Resource: 'paramstyle' configuration value must be one of:"\n f" {','.join(valid_config)}."\n )\n return v\n\n @validator("connector")\n def validate_connector(cls, v: Optional[str]) -> Optional[str]:\n if v is not None and v != "sqlalchemy":\n raise ValueError(\n "Snowflake Resource: 'connector' configuration value must be None or sqlalchemy."\n )\n return v\n\n @root_validator\n def validate_authentication(cls, values):\n auths_set = 0\n auths_set += 1 if values.get("password") is not None else 0\n auths_set += 1 if values.get("private_key") is not None else 0\n auths_set += 1 if values.get("private_key_path") is not None else 0\n\n # if authenticator is set, there can be 0 or 1 additional auth method;\n # otherwise, ensure at least 1 method is provided\n check.invariant(\n auths_set > 0 or values.get("authenticator") is not None,\n "Missing config: Password, private key, or authenticator authentication required"\n " for Snowflake resource.",\n )\n\n # ensure that only 1 non-authenticator method is provided\n check.invariant(\n auths_set <= 1,\n "Incorrect config: Cannot provide both password and private key authentication to"\n " Snowflake Resource.",\n )\n\n return values\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @property\n @cached_method\n def _connection_args(self) -> Mapping[str, Any]:\n conn_args = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "autocommit",\n 
"client_prefetch_threads",\n "client_session_keep_alive",\n "login_timeout",\n "network_timeout",\n "ocsp_response_cache_filename",\n "validate_default_parameters",\n "paramstyle",\n "timezone",\n "authenticator",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n if (\n self._resolved_config_dict.get("private_key", None) is not None\n or self._resolved_config_dict.get("private_key_path", None) is not None\n ):\n conn_args["private_key"] = self._snowflake_private_key(self._resolved_config_dict)\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_connection_args(self) -> Mapping[str, Any]:\n conn_args: Dict[str, Any] = {\n k: self._resolved_config_dict.get(k)\n for k in (\n "account",\n "user",\n "password",\n "database",\n "schema",\n "role",\n "warehouse",\n "cache_column_metadata",\n "numpy",\n )\n if self._resolved_config_dict.get(k) is not None\n }\n\n return conn_args\n\n @property\n @cached_method\n def _sqlalchemy_engine_args(self) -> Mapping[str, Any]:\n config = self._resolved_config_dict\n sqlalchemy_engine_args = {}\n if (\n config.get("private_key", None) is not None\n or config.get("private_key_path", None) is not None\n ):\n # sqlalchemy passes private key args separately, so store them in a new dict\n sqlalchemy_engine_args["private_key"] = self._snowflake_private_key(config)\n if config.get("authenticator", None) is not None:\n sqlalchemy_engine_args["authenticator"] = config["authenticator"]\n\n return sqlalchemy_engine_args\n\n def _snowflake_private_key(self, config) -> bytes:\n # If the user has defined a path to a private key, we will use that.\n if config.get("private_key_path", None) is not None:\n # read the file from the path.\n with open(config.get("private_key_path"), "rb") as key:\n private_key = key.read()\n else:\n private_key = config.get("private_key", None)\n\n kwargs = {}\n if config.get("private_key_password", None) is not None:\n kwargs["password"] = config["private_key_password"].encode()\n else:\n 
kwargs["password"] = None\n\n try:\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except TypeError:\n try:\n private_key = base64.b64decode(private_key)\n p_key = serialization.load_pem_private_key(\n private_key, backend=default_backend(), **kwargs\n )\n except ValueError:\n raise ValueError(\n "Unable to load private key. You may need to base64 encode your private key."\n " You can retrieve the base64 encoded key with this shell command: cat"\n " rsa_key.p8 | base64"\n )\n\n pkb = p_key.private_bytes(\n encoding=serialization.Encoding.DER,\n format=serialization.PrivateFormat.PKCS8,\n encryption_algorithm=serialization.NoEncryption(),\n )\n\n return pkb\n\n @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If connector configuration is not set, SnowflakeResource.get_connection() will return a\n `snowflake.connector.Connection <https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-api#object-connection>`__\n If connector="sqlalchemy" configuration is set, then SnowflakeResource.get_connection() will\n return a `SQLAlchemy Connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Connection>`__\n or a `SQLAlchemy raw connection <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine.raw_connection>`__\n if raw_conn=True.\n\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. 
code-block:: python\n\n @op\n def get_query_status(snowflake: SnowflakeResource, query_id):\n with snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n if self.connector == "sqlalchemy":\n from snowflake.sqlalchemy import URL\n from sqlalchemy import create_engine\n\n engine = create_engine(\n URL(**self._sqlalchemy_connection_args), connect_args=self._sqlalchemy_engine_args\n )\n conn = engine.raw_connection() if raw_conn else engine.connect()\n\n yield conn\n conn.close()\n engine.dispose()\n else:\n conn = snowflake.connector.connect(**self._connection_args)\n\n yield conn\n if not self.autocommit:\n conn.commit()\n conn.close()\n\n def get_object_to_set_on_execution_context(self) -> Any:\n # Directly create a SnowflakeConnection here for backcompat since the SnowflakeConnection\n # has methods this resource does not have\n return SnowflakeConnection(\n config=self._resolved_config_dict,\n log=get_dagster_logger(),\n snowflake_connection_resource=self,\n )
\n\n\n
[docs]class SnowflakeConnection:\n """A connection to Snowflake that can execute queries. In general this class should not be\n directly instantiated, but rather used as a resource in an op or asset via the\n :py:func:`snowflake_resource`.\n\n Note that the SnowflakeConnection is only used by the snowflake_resource. The Pythonic SnowflakeResource does\n not use this SnowflakeConnection class.\n """\n\n def __init__(\n self, config: Mapping[str, str], log, snowflake_connection_resource: SnowflakeResource\n ):\n self.snowflake_connection_resource = snowflake_connection_resource\n self.log = log\n\n
[docs] @public\n @contextmanager\n def get_connection(\n self, raw_conn: bool = True\n ) -> Iterator[Union[SqlDbConnection, snowflake.connector.SnowflakeConnection]]:\n """Gets a connection to Snowflake as a context manager.\n\n If using the execute_query, execute_queries, or load_table_from_local_parquet methods,\n you do not need to create a connection using this context manager.\n\n Args:\n raw_conn (bool): If using the sqlalchemy connector, you can set raw_conn to True to create a raw\n connection. Defaults to True.\n\n Examples:\n .. code-block:: python\n\n @op(\n required_resource_keys={"snowflake"}\n )\n def get_query_status(query_id):\n with context.resources.snowflake.get_connection() as conn:\n # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n return conn.get_query_status(query_id)\n\n """\n with self.snowflake_connection_resource.get_connection(raw_conn=raw_conn) as conn:\n yield conn
\n\n
[docs] @public\n def execute_query(\n self,\n sql: str,\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ):\n """Execute a query in Snowflake.\n\n Args:\n sql (str): the query to be executed\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to the query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the result of the query. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as a Pandas DataFrame.\n use_pandas_result (bool): If True, will return the result of the query as a Pandas DataFrame.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The result of the query if fetch_results or use_pandas_result is True, otherwise returns None\n\n Examples:\n .. code-block:: python\n\n @op\n def drop_database(snowflake: SnowflakeResource):\n snowflake.execute_query(\n "DROP DATABASE IF EXISTS MY_DATABASE"\n )\n """\n check.str_param(sql, "sql")\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n if sys.version_info[0] < 3:\n sql = sql.encode("utf-8")\n\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n return cursor.fetch_pandas_all()\n if fetch_results:\n return cursor.fetchall()
\n\n
[docs] @public\n def execute_queries(\n self,\n sql_queries: Sequence[str],\n parameters: Optional[Union[Sequence[Any], Mapping[Any, Any]]] = None,\n fetch_results: bool = False,\n use_pandas_result: bool = False,\n ) -> Optional[Sequence[Any]]:\n """Execute multiple queries in Snowflake.\n\n Args:\n sql_queries (str): List of queries to be executed in series\n parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]): Parameters to be passed to every query. See the\n `Snowflake documentation <https://docs.snowflake.com/en/user-guide/python-connector-example.html#binding-data>`__\n for more information.\n fetch_results (bool): If True, will return the results of the queries as a list. Defaults to False. If True\n and use_pandas_result is also True, results will be returned as Pandas DataFrames.\n use_pandas_result (bool): If True, will return the results of the queries as a list of a Pandas DataFrames.\n Defaults to False. If fetch_results is False and use_pandas_result is True, an error will be\n raised.\n\n Returns:\n The results of the queries as a list if fetch_results or use_pandas_result is True,\n otherwise returns None\n\n Examples:\n .. 
code-block:: python\n\n @op\n def create_fresh_database(snowflake: SnowflakeResource):\n queries = ["DROP DATABASE IF EXISTS MY_DATABASE", "CREATE DATABASE MY_DATABASE"]\n snowflake.execute_queries(\n sql_queries=queries\n )\n\n """\n check.sequence_param(sql_queries, "sql_queries", of_type=str)\n check.opt_inst_param(parameters, "parameters", (list, dict))\n check.bool_param(fetch_results, "fetch_results")\n if not fetch_results and use_pandas_result:\n check.failed("If use_pandas_result is True, fetch_results must also be True.")\n\n results: List[Any] = []\n with self.get_connection() as conn:\n with closing(conn.cursor()) as cursor:\n for raw_sql in sql_queries:\n sql = raw_sql.encode("utf-8") if sys.version_info[0] < 3 else raw_sql\n self.log.info("Executing query: " + sql)\n parameters = dict(parameters) if isinstance(parameters, Mapping) else parameters\n cursor.execute(sql, parameters)\n if use_pandas_result:\n results = results.append(cursor.fetch_pandas_all()) # type: ignore\n elif fetch_results:\n results.append(cursor.fetchall())\n\n return results if len(results) > 0 else None
\n\n
[docs] @public\n def load_table_from_local_parquet(self, src: str, table: str):\n """Stores the content of a parquet file to a Snowflake table.\n\n Args:\n src (str): the name of the file to store in Snowflake\n table (str): the name of the table to store the data. If the table does not exist, it will\n be created. Otherwise the contents of the table will be replaced with the data in src\n\n Examples:\n .. code-block:: python\n\n import pandas as pd\n import pyarrow as pa\n import pyarrow.parquet as pq\n\n @op\n def write_parquet_file(snowflake: SnowflakeResource):\n df = pd.DataFrame({"one": [1, 2, 3], "ten": [11, 12, 13]})\n table = pa.Table.from_pandas(df)\n pq.write_table(table, "example.parquet')\n snowflake.load_table_from_local_parquet(\n src="example.parquet",\n table="MY_TABLE"\n )\n\n """\n check.str_param(src, "src")\n check.str_param(table, "table")\n\n sql_queries = [\n f"CREATE OR REPLACE TABLE {table} ( data VARIANT DEFAULT NULL);",\n "CREATE OR REPLACE FILE FORMAT parquet_format TYPE = 'parquet';",\n f"PUT {src} @%{table};",\n f"COPY INTO {table} FROM @%{table} FILE_FORMAT = (FORMAT_NAME = 'parquet_format');",\n ]\n\n self.execute_queries(sql_queries)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=SnowflakeResource.to_config_schema(),\n description="This resource is for connecting to the Snowflake data warehouse",\n)\ndef snowflake_resource(context) -> SnowflakeConnection:\n """A resource for connecting to the Snowflake data warehouse. The returned resource object is an\n instance of :py:class:`SnowflakeConnection`.\n\n A simple example of loading data into Snowflake and subsequently querying that data is shown below:\n\n Examples:\n .. code-block:: python\n\n from dagster import job, op\n from dagster_snowflake import snowflake_resource\n\n @op(required_resource_keys={'snowflake'})\n def get_one(context):\n context.resources.snowflake.execute_query('SELECT 1')\n\n @job(resource_defs={'snowflake': snowflake_resource})\n def my_snowflake_job():\n get_one()\n\n my_snowflake_job.execute_in_process(\n run_config={\n 'resources': {\n 'snowflake': {\n 'config': {\n 'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n 'user': {'env': 'SNOWFLAKE_USER'},\n 'password': {'env': 'SNOWFLAKE_PASSWORD'},\n 'database': {'env': 'SNOWFLAKE_DATABASE'},\n 'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n 'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n }\n }\n }\n }\n )\n """\n snowflake_resource = SnowflakeResource.from_resource_context(context)\n return SnowflakeConnection(\n config=context, log=context.log, snowflake_connection_resource=snowflake_resource\n )
\n
", "current_page_name": "_modules/dagster_snowflake/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.resources"}, "snowflake_io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake.snowflake_io_manager

\nfrom abc import abstractmethod\nfrom contextlib import contextmanager\nfrom typing import Optional, Sequence, Type, cast\n\nfrom dagster import IOManagerDefinition, OutputContext, io_manager\nfrom dagster._config.pythonic_config import (\n    ConfigurableIOManagerFactory,\n)\nfrom dagster._core.definitions.time_window_partitions import TimeWindow\nfrom dagster._core.storage.db_io_manager import (\n    DbClient,\n    DbIOManager,\n    DbTypeHandler,\n    TablePartitionDimension,\n    TableSlice,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom pydantic import Field\nfrom sqlalchemy.exc import ProgrammingError\n\nfrom .resources import SnowflakeResource\n\nSNOWFLAKE_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"\n\n\n
[docs]def build_snowflake_io_manager(\n type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None\n) -> IOManagerDefinition:\n """Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Args:\n type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between\n slices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame. If only\n one DbTypeHandler is provided, it will be used as teh default_load_type.\n default_load_type (Type): When an input has no type annotation, load it as this type.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import build_snowflake_io_manager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n snowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()])\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. 
code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @dagster_maintained_io_manager\n @io_manager(config_schema=SnowflakeIOManager.to_config_schema())\n def snowflake_io_manager(init_context):\n return DbIOManager(\n type_handlers=type_handlers,\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=init_context.resource_config["database"],\n schema=init_context.resource_config.get("schema"),\n default_load_type=default_load_type,\n )\n\n return snowflake_io_manager
\n\n\n
[docs]class SnowflakeIOManager(ConfigurableIOManagerFactory):\n """Base class for an IO manager definition that reads inputs from and writes outputs to Snowflake.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the IO Manager. For assets, the schema will be determined from the asset key,\n as shown in the above example. The final prefix before the asset name will be used as the schema. For example,\n if the asset ``my_table`` had the key prefix ``["snowflake", "my_schema"]``, the schema ``my_schema`` will be\n used. For ops, the schema can be specified by including a ``schema`` entry in output metadata. If ``schema`` is not provided\n via config or on the asset/op, ``public`` will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata ``columns`` to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n database: str = Field(description="Name of the database to use.")\n account: str = Field(\n description=(\n "Your Snowflake account name. For more details, see the `Snowflake documentation."\n " <https://docs.snowflake.com/developer-guide/python-connector/python-connector-api>`__"\n ),\n )\n user: str = Field(description="User login name.")\n schema_: Optional[str] = Field(\n default=None, alias="schema", description="Name of the schema to use."\n ) # schema is a reserved word for pydantic\n password: Optional[str] = Field(default=None, description="User password.")\n warehouse: Optional[str] = Field(default=None, description="Name of the warehouse to use.")\n role: Optional[str] = Field(default=None, description="Name of the role to use.")\n private_key: Optional[str] = Field(\n default=None,\n description=(\n "Raw private key to use. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details. To"\n " avoid issues with newlines in the keys, you can base64 encode the key. You can"\n " retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64"\n ),\n )\n private_key_path: Optional[str] = Field(\n default=None,\n description=(\n "Path to the private key. See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n ),\n )\n private_key_password: Optional[str] = Field(\n default=None,\n description=(\n "The password of the private key. 
See the `Snowflake documentation"\n " <https://docs.snowflake.com/en/user-guide/key-pair-auth.html>`__ for details."\n " Required for both private_key and private_key_path if the private key is encrypted."\n " For unencrypted keys, this config can be omitted or set to None."\n ),\n )\n store_timestamps_as_strings: bool = Field(\n default=False,\n description=(\n "If using Pandas DataFrames, whether to convert time data to strings. If True, time"\n " data will be converted to strings when storing the DataFrame and converted back to"\n " time data when loading the DataFrame. If False, time data without a timezone will be"\n " set to UTC timezone to avoid a Snowflake bug. Defaults to False."\n ),\n )\n authenticator: Optional[str] = Field(\n default=None,\n description="Optional parameter to specify the authentication mechanism to use.",\n )\n\n @staticmethod\n @abstractmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n """type_handlers should return a list of the TypeHandlers that the I/O manager can use.\n\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n """\n ...\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n """If an asset or op is not annotated with an return type, default_load_type will be used to\n determine which TypeHandler to use to store and load the output.\n\n If left unimplemented, default_load_type will return None. In that case, if there is only\n one TypeHandler, the I/O manager will default to loading unannotated outputs with that\n TypeHandler.\n\n .. 
code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n import pandas as pd\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame\n """\n return None\n\n def create_io_manager(self, context) -> DbIOManager:\n return DbIOManager(\n db_client=SnowflakeDbClient(),\n io_manager_name="SnowflakeIOManager",\n database=self.database,\n schema=self.schema_,\n type_handlers=self.type_handlers(),\n default_load_type=self.default_load_type(),\n )
\n\n\nclass SnowflakeDbClient(DbClient):\n @staticmethod\n @contextmanager\n def connect(context, table_slice):\n no_schema_config = (\n {k: v for k, v in context.resource_config.items() if k != "schema"}\n if context.resource_config\n else {}\n )\n with SnowflakeResource(\n schema=table_slice.schema, connector="sqlalchemy", **no_schema_config\n ).get_connection(raw_conn=False) as conn:\n yield conn\n\n @staticmethod\n def ensure_schema_exists(context: OutputContext, table_slice: TableSlice, connection) -> None:\n schemas = connection.execute(\n f"show schemas like '{table_slice.schema}' in database {table_slice.database}"\n ).fetchall()\n if len(schemas) == 0:\n connection.execute(f"create schema {table_slice.schema};")\n\n @staticmethod\n def delete_table_slice(context: OutputContext, table_slice: TableSlice, connection) -> None:\n try:\n connection.execute(_get_cleanup_statement(table_slice))\n except ProgrammingError:\n # table doesn't exist yet, so ignore the error\n pass\n\n @staticmethod\n def get_select_statement(table_slice: TableSlice) -> str:\n col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"SELECT {col_str} FROM"\n f" {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n return f"""SELECT {col_str} FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"""\n\n\ndef _get_cleanup_statement(table_slice: TableSlice) -> str:\n """Returns a SQL statement that deletes data in the given table to make way for the output data\n being written.\n """\n if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:\n query = (\n f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table} WHERE\\n"\n )\n return query + _partition_where_clause(table_slice.partition_dimensions)\n else:\n 
return f"DELETE FROM {table_slice.database}.{table_slice.schema}.{table_slice.table}"\n\n\ndef _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:\n return " AND\\n".join(\n (\n _time_window_where_clause(partition_dimension)\n if isinstance(partition_dimension.partitions, TimeWindow)\n else _static_where_clause(partition_dimension)\n )\n for partition_dimension in partition_dimensions\n )\n\n\ndef _time_window_where_clause(table_partition: TablePartitionDimension) -> str:\n partition = cast(TimeWindow, table_partition.partitions)\n start_dt, end_dt = partition\n start_dt_str = start_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n end_dt_str = end_dt.strftime(SNOWFLAKE_DATETIME_FORMAT)\n # Snowflake BETWEEN is inclusive; start <= partition expr <= end. We don't want to remove the next partition so we instead\n # write this as start <= partition expr < end.\n return f"""{table_partition.partition_expr} >= '{start_dt_str}' AND {table_partition.partition_expr} < '{end_dt_str}'"""\n\n\ndef _static_where_clause(table_partition: TablePartitionDimension) -> str:\n partitions = ", ".join(f"'{partition}'" for partition in table_partition.partitions)\n return f"""{table_partition.partition_expr} in ({partitions})"""\n
", "current_page_name": "_modules/dagster_snowflake/snowflake_io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake.snowflake_io_manager"}}, "dagster_snowflake_pandas": {"snowflake_pandas_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pandas.snowflake_pandas_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport pandas as pd\nimport pandas.core.dtypes.common as pd_core_dtypes_common\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.errors import DagsterInvariantViolationError\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient, SnowflakeIOManager\nfrom snowflake.connector.pandas_tools import pd_writer\n\n\ndef _table_exists(table_slice: TableSlice, connection):\n    tables = connection.execute(\n        f"SHOW TABLES LIKE '{table_slice.table}' IN SCHEMA"\n        f" {table_slice.database}.{table_slice.schema}"\n    ).fetchall()\n    return len(tables) > 0\n\n\ndef _get_table_column_types(table_slice: TableSlice, connection) -> Optional[Mapping[str, str]]:\n    if _table_exists(table_slice, connection):\n        schema_list = connection.execute(f"DESCRIBE TABLE {table_slice.table}").fetchall()\n        return {item[0]: item[1] for item in schema_list}\n\n\ndef _convert_timestamp_to_string(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    """Converts columns of data of type pd.Timestamp to string so that it can be stored in\n    snowflake.\n    """\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" not in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    "Snowflake I/O manager: Snowflake I/O manager configured to convert time data"\n                    f" in DataFrame column {column_name} to strings, but the corresponding"\n                    f" {column_name.upper()} column in table {table_name} is not of type 
VARCHAR,"\n                    f" it is of type {column_types[column_name]}. Please set"\n                    " store_timestamps_as_strings=False in the Snowflake I/O manager configuration"\n                    " to store time data as TIMESTAMP types."\n                )\n        return s.dt.strftime("%Y-%m-%d %H:%M:%S.%f %z")\n    else:\n        return s\n\n\ndef _convert_string_to_timestamp(s: pd.Series) -> pd.Series:\n    """Converts columns of strings in Timestamp format to pd.Timestamp to undo the conversion in\n    _convert_timestamp_to_string.\n\n    This will not convert non-timestamp strings into timestamps (pd.to_datetime will raise an\n    exception if the string cannot be converted)\n    """\n    if isinstance(s[0], str):\n        try:\n            return pd.to_datetime(s.values)  # type: ignore  # (bad stubs)\n        except ValueError:\n            return s\n    else:\n        return s\n\n\ndef _add_missing_timezone(\n    s: pd.Series, column_types: Optional[Mapping[str, str]], table_name: str\n) -> pd.Series:\n    column_name = str(s.name)\n    if pd_core_dtypes_common.is_datetime_or_timedelta_dtype(s):  # type: ignore  # (bad stubs)\n        if column_types:\n            if "VARCHAR" in column_types[column_name]:\n                raise DagsterInvariantViolationError(\n                    f"Snowflake I/O manager: The Snowflake column {column_name.upper()} in table"\n                    f" {table_name} is of type {column_types[column_name]} and should be of type"\n                    f" TIMESTAMP to store the time data in dataframe column {column_name}. Please"\n                    " migrate this column to be of time TIMESTAMP_NTZ(9) to store time data."\n                )\n        return s.dt.tz_localize("UTC")\n    return s\n\n\n
[docs]class SnowflakePandasTypeHandler(DbTypeHandler[pd.DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load Pandas DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: pd.DataFrame, connection\n ) -> Mapping[str, RawMetadataValue]:\n from snowflake import connector\n\n connector.paramstyle = "pyformat"\n with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")\n column_types = _get_table_column_types(table_slice, connection)\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _convert_timestamp_to_string(x, column_types, table_slice.table),\n axis="index",\n )\n else:\n with_uppercase_cols = with_uppercase_cols.apply(\n lambda x: _add_missing_timezone(x, column_types, table_slice.table), axis="index"\n )\n with_uppercase_cols.to_sql(\n table_slice.table,\n con=connection.engine,\n if_exists="append",\n index=False,\n method=pd_writer,\n )\n\n return {\n "row_count": obj.shape[0],\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n 
columns=[\n TableColumn(name=str(name), type=str(dtype))\n for name, dtype in obj.dtypes.items()\n ]\n )\n ),\n }\n\n def load_input(\n self, context: InputContext, table_slice: TableSlice, connection\n ) -> pd.DataFrame:\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return pd.DataFrame()\n result = pd.read_sql(\n sql=SnowflakeDbClient.get_select_statement(table_slice), con=connection\n )\n if context.resource_config and context.resource_config.get(\n "store_timestamps_as_strings", False\n ):\n result = result.apply(_convert_string_to_timestamp, axis="index")\n result.columns = map(str.lower, result.columns) # type: ignore # (bad stubs)\n return result\n\n @property\n def supported_types(self):\n return [pd.DataFrame]
\n\n\nsnowflake_pandas_io_manager = build_snowflake_io_manager(\n [SnowflakePandasTypeHandler()], default_load_type=pd.DataFrame\n)\nsnowflake_pandas_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the snowflake_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.\n\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pandas import snowflake_pandas_io_manager\n from dagster import asset, Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pandas_io_manager.configured({\n "database": "my_database",\n "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n ...\n })\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePandasIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\n using the SnowflakePandasIOManager, any inputs and outputs without type annotations will be loaded\n as Pandas DataFrames.\n\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pandas import SnowflakePandasIOManager\n from dagster import asset, Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePandasIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n }\n )\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> pd.DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return pd.DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pandas/snowflake_pandas_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pandas.snowflake_pandas_type_handler"}}, "dagster_snowflake_pyspark": {"snowflake_pyspark_type_handler": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_snowflake_pyspark.snowflake_pyspark_type_handler

\nfrom typing import Mapping, Optional, Sequence, Type\n\nimport dagster._check as check\nfrom dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema\nfrom dagster._core.definitions.metadata import RawMetadataValue\nfrom dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice\nfrom dagster_snowflake import SnowflakeIOManager, build_snowflake_io_manager\nfrom dagster_snowflake.snowflake_io_manager import SnowflakeDbClient\nfrom pyspark.sql import DataFrame, SparkSession\nfrom pyspark.sql.types import StructType\n\nSNOWFLAKE_CONNECTOR = "net.snowflake.spark.snowflake"\n\n\ndef _get_snowflake_options(config, table_slice: TableSlice) -> Mapping[str, str]:\n    check.invariant(\n        config.get("warehouse", None) is not None,\n        "Missing config: Warehouse is required when using PySpark with the Snowflake I/O manager.",\n    )\n\n    conf = {\n        "sfURL": f"{config['account']}.snowflakecomputing.com",\n        "sfUser": config["user"],\n        "sfPassword": config["password"],\n        "sfDatabase": config["database"],\n        "sfSchema": table_slice.schema,\n        "sfWarehouse": config["warehouse"],\n    }\n\n    return conf\n\n\n
[docs]class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):\n """Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake import SnowflakeIOManager\n from dagster_snowflake_pandas import SnowflakePandasTypeHandler\n from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\n from dagster import Definitions, EnvVar\n\n class MySnowflakeIOManager(SnowflakeIOManager):\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> pd.DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)\n }\n )\n\n """\n\n def handle_output(\n self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _\n ) -> Mapping[str, RawMetadataValue]:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])\n\n with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).option(\n "dbtable", table_slice.table\n ).mode("append").save()\n\n return {\n "dataframe_columns": MetadataValue.table_schema(\n TableSchema(\n columns=[\n TableColumn(name=field.name, type=field.dataType.typeName())\n for field in obj.schema.fields\n ]\n )\n ),\n }\n\n def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame:\n options = _get_snowflake_options(context.resource_config, table_slice)\n\n spark = SparkSession.builder.getOrCreate() # type: ignore\n if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:\n return spark.createDataFrame([], StructType([]))\n\n df = 
(\n spark.read.format(SNOWFLAKE_CONNECTOR)\n .options(**options)\n .option("query", SnowflakeDbClient.get_select_statement(table_slice))\n .load()\n )\n return df.toDF(*[c.lower() for c in df.columns])\n\n @property\n def supported_types(self):\n return [DataFrame]
\n\n\nsnowflake_pyspark_io_manager = build_snowflake_io_manager(\n [SnowflakePySparkTypeHandler()], default_load_type=DataFrame\n)\nsnowflake_pyspark_io_manager.__doc__ = """\nAn I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.\n\nReturns:\n IOManagerDefinition\n\nExamples:\n\n .. code-block:: python\n\n from dagster_snowflake_pyspark import snowflake_pyspark_io_manager\n from pyspark.sql import DataFrame\n from dagster import Definitions\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": snowflake_pyspark_io_manager.configured({\n "database": "my_database",\n "warehouse": "my_warehouse", # required for snowflake_pyspark_io_manager\n "account" : {"env": "SNOWFLAKE_ACCOUNT"},\n "password": {"env": "SNOWFLAKE_PASSWORD"},\n ...\n })\n }\n )\n\n Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n"""\n\n\n
[docs]class SnowflakePySparkIOManager(SnowflakeIOManager):\n """An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\n using the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded\n as PySpark DataFrames.\n\n Returns:\n IOManagerDefinition\n\n Examples:\n .. code-block:: python\n\n from dagster_snowflake_pyspark import SnowflakePySparkIOManager\n from pyspark.sql import DataFrame\n from dagster import Definitions, EnvVar\n\n @asset(\n key_prefix=["my_schema"] # will be used as the schema in snowflake\n )\n def my_table() -> DataFrame: # the name of the asset will be the table name\n ...\n\n defs = Definitions(\n assets=[my_table],\n resources={\n "io_manager": SnowflakePySparkIOManager(\n database="my_database",\n warehouse="my_warehouse", # required for SnowflakePySparkIOManager\n account=EnvVar("SNOWFLAKE_ACCOUNT"),\n password=EnvVar("SNOWFLAKE_PASSWORD"),\n ...\n )\n }\n )\n\n Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager\n\n If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\n the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\n For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided\n via config or on the asset/op, "public" will be used for the schema.\n\n .. code-block:: python\n\n @op(\n out={"my_table": Out(metadata={"schema": "my_schema"})}\n )\n def make_my_table() -> DataFrame:\n # the returned value will be stored at my_schema.my_table\n ...\n\n To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the\n In or AssetIn.\n\n .. 
code-block:: python\n\n @asset(\n ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n )\n def my_table_a(my_table: DataFrame) -> DataFrame:\n # my_table will just contain the data from column "a"\n ...\n\n """\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n @staticmethod\n def type_handlers() -> Sequence[DbTypeHandler]:\n return [SnowflakePySparkTypeHandler()]\n\n @staticmethod\n def default_load_type() -> Optional[Type]:\n return DataFrame
\n
", "current_page_name": "_modules/dagster_snowflake_pyspark/snowflake_pyspark_type_handler", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_snowflake_pyspark.snowflake_pyspark_type_handler"}}, "dagster_spark": {"configs": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.configs

\n"""Spark Configuration.\n\nIn this file we define the key configuration parameters for submitting Spark jobs. Spark can be run\nin a variety of deployment contexts. See the Spark documentation at\nhttps://spark.apache.org/docs/latest/submitting-applications.html for a more in-depth summary of\nSpark deployment contexts and configuration.\n"""\nfrom dagster import Field, StringSource\n\nfrom .configs_spark import spark_config\nfrom .types import SparkDeployMode\n\n\n
[docs]def define_spark_config():\n """Spark configuration.\n\n See the Spark documentation for reference:\n https://spark.apache.org/docs/latest/submitting-applications.html\n """\n master_url = Field(\n StringSource,\n description="The master URL for the cluster (e.g. spark://23.195.26.187:7077)",\n is_required=True,\n )\n\n deploy_mode = Field(\n SparkDeployMode,\n description="""Whether to deploy your driver on the worker nodes (cluster) or locally as an\n external client (client) (default: client). A common deployment strategy is to submit your\n application from a gateway machine that is physically co-located with your worker machines\n (e.g. Master node in a standalone EC2 cluster). In this setup, client mode is appropriate.\n In client mode, the driver is launched directly within the spark-submit process which acts\n as a client to the cluster. The input and output of the application is attached to the\n console. Thus, this mode is especially suitable for applications that involve the REPL (e.g.\n Spark shell).""",\n is_required=False,\n )\n\n application_jar = Field(\n StringSource,\n description="""Path to a bundled jar including your application and all\n dependencies. The URL must be globally visible inside of your cluster, for\n instance, an hdfs:// path or a file:// path that is present on all nodes.\n """,\n is_required=True,\n )\n\n application_arguments = Field(\n StringSource,\n description="Arguments passed to the main method of your main class, if any",\n is_required=False,\n )\n\n spark_home = Field(\n StringSource,\n description=(\n "The path to your spark installation. Defaults to $SPARK_HOME at runtime if not"\n " provided."\n ),\n is_required=False,\n )\n\n return {\n "master_url": master_url,\n "deploy_mode": deploy_mode,\n "application_jar": application_jar,\n "spark_conf": spark_config(),\n "spark_home": spark_home,\n "application_arguments": application_arguments,\n }
\n
", "current_page_name": "_modules/dagster_spark/configs", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.configs"}, "ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.ops

\nfrom dagster import (\n    In,\n    Nothing,\n    Out,\n    _check as check,\n    op,\n)\n\nfrom .configs import define_spark_config\n\n\n
[docs]def create_spark_op(\n name, main_class, description=None, required_resource_keys=frozenset(["spark"])\n):\n check.str_param(name, "name")\n check.str_param(main_class, "main_class")\n check.opt_str_param(description, "description", "A parameterized Spark job.")\n check.set_param(required_resource_keys, "required_resource_keys")\n\n @op(\n name=name,\n description=description,\n config_schema=define_spark_config(),\n ins={"start": In(Nothing)},\n out=Out(Nothing),\n tags={"kind": "spark", "main_class": main_class},\n required_resource_keys=required_resource_keys,\n )\n def spark_op(context):\n context.resources.spark.run_spark_job(context.op_config, main_class)\n\n return spark_op
\n
", "current_page_name": "_modules/dagster_spark/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.ops"}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.resources

\nimport os\nimport subprocess\n\nimport dagster._check as check\nfrom dagster import resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.log_manager import DagsterLogManager\n\nfrom .types import SparkOpError\nfrom .utils import construct_spark_shell_command\n\n\nclass SparkResource:\n    def __init__(self, logger):\n        self.logger = check.inst_param(logger, "logger", DagsterLogManager)\n\n    def run_spark_job(self, config, main_class):\n        check.dict_param(config, "config")\n        check.str_param(main_class, "main_class")\n\n        # Extract parameters from config\n        (\n            master_url,\n            deploy_mode,\n            application_jar,\n            spark_conf,\n            application_arguments,\n            spark_home,\n        ) = [\n            config.get(k)\n            for k in (\n                "master_url",\n                "deploy_mode",\n                "application_jar",\n                "spark_conf",\n                "application_arguments",\n                "spark_home",\n            )\n        ]\n\n        if not os.path.exists(application_jar):\n            raise SparkOpError(\n                f"Application jar {application_jar} does not exist. A valid jar must be "\n                "built before running this op."\n            )\n\n        spark_shell_cmd = construct_spark_shell_command(\n            application_jar=application_jar,\n            main_class=main_class,\n            master_url=master_url,\n            spark_conf=spark_conf,\n            deploy_mode=deploy_mode,\n            application_arguments=application_arguments,\n            spark_home=spark_home,\n        )\n        self.logger.info("Running spark-submit: " + " ".join(spark_shell_cmd))\n\n        retcode = subprocess.call(" ".join(spark_shell_cmd), shell=True)\n\n        if retcode != 0:\n            raise SparkOpError("Spark job failed. Please consult your logs.")\n\n\n
[docs]@dagster_maintained_resource\n@resource\ndef spark_resource(context):\n return SparkResource(context.log)
\n
", "current_page_name": "_modules/dagster_spark/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.types

\nfrom dagster import Enum, EnumValue\n\nSparkDeployModeCluster = EnumValue("cluster")\nSparkDeployModeClient = EnumValue("client")\nSparkDeployMode = Enum(\n    name="SparkDeployMode", enum_values=[SparkDeployModeCluster, SparkDeployModeClient]\n)\n\n\n
[docs]class SparkOpError(Exception):\n pass
\n
", "current_page_name": "_modules/dagster_spark/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.types"}, "utils": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_spark.utils

\nimport itertools\nimport os\n\nimport dagster._check as check\n\nfrom .types import SparkOpError\n\n\ndef flatten_dict(d):\n    def _flatten_dict(d, result, key_path=None):\n        """Iterates an arbitrarily nested dictionary and yield dot-notation key:value tuples.\n\n        {'foo': {'bar': 3, 'baz': 1}, {'other': {'key': 1}} =>\n            [('foo.bar', 3), ('foo.baz', 1), ('other.key', 1)]\n\n        """\n        for k, v in d.items():\n            new_key_path = (key_path or []) + [k]\n            if isinstance(v, dict):\n                _flatten_dict(v, result, new_key_path)\n            else:\n                result.append((".".join(new_key_path), v))\n\n    result = []\n    if d is not None:\n        _flatten_dict(d, result)\n    return result\n\n\ndef parse_spark_config(spark_conf):\n    """Convert spark conf dict to list of CLI arguments.\n\n    For each key-value pair in spark conf, we need to pass to CLI in format:\n\n    --conf "key=value"\n    """\n    spark_conf_list = flatten_dict(spark_conf)\n    return format_for_cli(spark_conf_list)\n\n\ndef format_for_cli(spark_conf_list):\n    return list(\n        itertools.chain.from_iterable([("--conf", "{}={}".format(*c)) for c in spark_conf_list])\n    )\n\n\n
[docs]def construct_spark_shell_command(\n application_jar,\n main_class,\n master_url=None,\n spark_conf=None,\n deploy_mode=None,\n application_arguments=None,\n spark_home=None,\n):\n """Constructs the spark-submit command for a Spark job."""\n check.opt_str_param(master_url, "master_url")\n check.str_param(application_jar, "application_jar")\n spark_conf = check.opt_dict_param(spark_conf, "spark_conf")\n check.opt_str_param(deploy_mode, "deploy_mode")\n check.opt_str_param(application_arguments, "application_arguments")\n check.opt_str_param(spark_home, "spark_home")\n\n spark_home = spark_home if spark_home else os.environ.get("SPARK_HOME")\n if spark_home is None:\n raise SparkOpError(\n "No spark home set. You must either pass spark_home in config or "\n "set $SPARK_HOME in your environment (got None)."\n )\n\n master_url = ["--master", master_url] if master_url else []\n deploy_mode = ["--deploy-mode", deploy_mode] if deploy_mode else []\n\n spark_shell_cmd = (\n [f"{spark_home}/bin/spark-submit", "--class", main_class]\n + master_url\n + deploy_mode\n + parse_spark_config(spark_conf)\n + [application_jar]\n + [application_arguments]\n )\n return spark_shell_cmd
\n
", "current_page_name": "_modules/dagster_spark/utils", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_spark.utils"}}, "dagster_ssh": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_ssh.resources

\nimport getpass\nimport os\nfrom io import StringIO\n\nimport paramiko\nfrom dagster import (\n    BoolSource,\n    Field,\n    IntSource,\n    StringSource,\n    _check as check,\n    resource,\n)\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._utils import mkdir_p\nfrom dagster._utils.merger import merge_dicts\nfrom paramiko.config import SSH_PORT\nfrom sshtunnel import SSHTunnelForwarder\n\n\ndef key_from_str(key_str):\n    """Creates a paramiko SSH key from a string."""\n    check.str_param(key_str, "key_str")\n\n    # py2 StringIO doesn't support with\n    key_file = StringIO(key_str)\n    result = paramiko.RSAKey.from_private_key(key_file)\n    key_file.close()\n    return result\n\n\nclass SSHResource:\n    """Resource for ssh remote execution using Paramiko.\n\n    ref: https://github.com/paramiko/paramiko\n    """\n\n    def __init__(\n        self,\n        remote_host,\n        remote_port,\n        username=None,\n        password=None,\n        key_file=None,\n        key_string=None,\n        timeout=10,\n        keepalive_interval=30,\n        compress=True,\n        no_host_key_check=True,\n        allow_host_key_change=False,\n        logger=None,\n    ):\n        self.remote_host = check.str_param(remote_host, "remote_host")\n        self.remote_port = check.opt_int_param(remote_port, "remote_port")\n        self.username = check.opt_str_param(username, "username")\n        self.password = check.opt_str_param(password, "password")\n        self.key_file = check.opt_str_param(key_file, "key_file")\n        self.timeout = check.opt_int_param(timeout, "timeout")\n        self.keepalive_interval = check.opt_int_param(keepalive_interval, "keepalive_interval")\n        self.compress = check.opt_bool_param(compress, "compress")\n        self.no_host_key_check = check.opt_bool_param(no_host_key_check, "no_host_key_check")\n        self.log = logger\n\n        self.host_proxy = None\n\n        # Create 
RSAKey object from private key string\n        self.key_obj = key_from_str(key_string) if key_string is not None else None\n\n        # Auto detecting username values from system\n        if not self.username:\n            logger.debug(\n                "username to ssh to host: %s is not specified. Using system's default provided by"\n                " getpass.getuser()"\n                % self.remote_host\n            )\n            self.username = getpass.getuser()\n\n        user_ssh_config_filename = os.path.expanduser("~/.ssh/config")\n        if os.path.isfile(user_ssh_config_filename):\n            ssh_conf = paramiko.SSHConfig()\n            ssh_conf.parse(open(user_ssh_config_filename, encoding="utf8"))\n            host_info = ssh_conf.lookup(self.remote_host)\n            if host_info and host_info.get("proxycommand"):\n                self.host_proxy = paramiko.ProxyCommand(host_info.get("proxycommand"))\n\n            if not (self.password or self.key_file):\n                if host_info and host_info.get("identityfile"):\n                    self.key_file = host_info.get("identityfile")[0]\n\n    def get_connection(self):\n        """Opens a SSH connection to the remote host.\n\n        :rtype: paramiko.client.SSHClient\n        """\n        client = paramiko.SSHClient()\n        client.load_system_host_keys()\n        if self.no_host_key_check:\n            self.log.warning(\n                "No Host Key Verification. 
This won't protect against Man-In-The-Middle attacks"\n            )\n            # Default is RejectPolicy\n            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n\n        if self.password and self.password.strip():\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                password=self.password,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n                look_for_keys=False,\n            )\n        else:\n            client.connect(\n                hostname=self.remote_host,\n                username=self.username,\n                key_filename=self.key_file,\n                pkey=self.key_obj,\n                timeout=self.timeout,\n                compress=self.compress,\n                port=self.remote_port,\n                sock=self.host_proxy,\n            )\n\n        if self.keepalive_interval:\n            client.get_transport().set_keepalive(self.keepalive_interval)\n\n        return client\n\n    def get_tunnel(self, remote_port, remote_host="localhost", local_port=None):\n        check.int_param(remote_port, "remote_port")\n        check.str_param(remote_host, "remote_host")\n        check.opt_int_param(local_port, "local_port")\n\n        if local_port is not None:\n            local_bind_address = ("localhost", local_port)\n        else:\n            local_bind_address = ("localhost",)\n\n        # Will prefer key string if specified, otherwise use the key file\n        pkey = self.key_obj if self.key_obj else self.key_file\n\n        if self.password and self.password.strip():\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                
ssh_password=self.password,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                logger=self.log,\n            )\n        else:\n            client = SSHTunnelForwarder(\n                self.remote_host,\n                ssh_port=self.remote_port,\n                ssh_username=self.username,\n                ssh_pkey=pkey,\n                ssh_proxy=self.host_proxy,\n                local_bind_address=local_bind_address,\n                remote_bind_address=(remote_host, remote_port),\n                host_pkey_directories=[],\n                logger=self.log,\n            )\n\n        return client\n\n    def sftp_get(self, remote_filepath, local_filepath):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            local_folder = os.path.dirname(local_filepath)\n\n            # Create intermediate directories if they don't exist\n            mkdir_p(local_folder)\n\n            self.log.info(f"Starting to transfer from {remote_filepath} to {local_filepath}")\n\n            sftp_client.get(remote_filepath, local_filepath)\n\n        conn.close()\n        return local_filepath\n\n    def sftp_put(self, remote_filepath, local_filepath, confirm=True):\n        check.str_param(remote_filepath, "remote_filepath")\n        check.str_param(local_filepath, "local_filepath")\n        conn = self.get_connection()\n        with conn.open_sftp() as sftp_client:\n            self.log.info(f"Starting to transfer file from {local_filepath} to {remote_filepath}")\n\n            sftp_client.put(local_filepath, remote_filepath, confirm=confirm)\n\n        conn.close()\n        return local_filepath\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "remote_host": Field(\n StringSource, description="remote host to connect to", is_required=True\n ),\n "remote_port": Field(\n IntSource,\n description="port of remote host to connect (Default is paramiko SSH_PORT)",\n is_required=False,\n default_value=SSH_PORT,\n ),\n "username": Field(\n StringSource, description="username to connect to the remote_host", is_required=False\n ),\n "password": Field(\n StringSource,\n description="password of the username to connect to the remote_host",\n is_required=False,\n ),\n "key_file": Field(\n StringSource,\n description="key file to use to connect to the remote_host.",\n is_required=False,\n ),\n "key_string": Field(\n StringSource,\n description="key string to use to connect to remote_host",\n is_required=False,\n ),\n "timeout": Field(\n IntSource,\n description="timeout for the attempt to connect to the remote_host.",\n is_required=False,\n default_value=10,\n ),\n "keepalive_interval": Field(\n IntSource,\n description="send a keepalive packet to remote host every keepalive_interval seconds",\n is_required=False,\n default_value=30,\n ),\n "compress": Field(BoolSource, is_required=False, default_value=True),\n "no_host_key_check": Field(BoolSource, is_required=False, default_value=True),\n "allow_host_key_change": Field(\n BoolSource, description="[Deprecated]", is_required=False, default_value=False\n ),\n }\n)\ndef ssh_resource(init_context):\n args = init_context.resource_config\n args = merge_dicts(init_context.resource_config, {"logger": init_context.log})\n return SSHResource(**args)
\n
", "current_page_name": "_modules/dagster_ssh/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_ssh.resources"}}, "dagster_twilio": {"resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_twilio.resources

\nfrom dagster import ConfigurableResource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom dagster._core.execution.context.init import InitResourceContext\nfrom pydantic import Field\nfrom twilio.rest import Client\n\n\n
[docs]class TwilioResource(ConfigurableResource):\n """This resource is for connecting to Twilio."""\n\n account_sid: str = Field(\n description=(\n "Twilio Account SID, created with yout Twilio account. This can be found on your Twilio"\n " dashboard, see"\n " https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n auth_token: str = Field(\n description=(\n "Twilio Authentication Token, created with yout Twilio account. This can be found on"\n " your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python"\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_client(self) -> Client:\n return Client(self.account_sid, self.auth_token)
\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema=TwilioResource.to_config_schema(),\n description="This resource is for connecting to Twilio",\n)\ndef twilio_resource(context: InitResourceContext) -> Client:\n return TwilioResource.from_resource_context(context).create_client()
\n
", "current_page_name": "_modules/dagster_twilio/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_twilio.resources"}}, "dagster_wandb": {"io_manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.io_manager

\nimport datetime\nimport os\nimport pickle\nimport platform\nimport shutil\nimport sys\nimport time\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import List, Optional\n\nfrom dagster import (\n    Field,\n    InitResourceContext,\n    InputContext,\n    Int,\n    IOManager,\n    MetadataValue,\n    OutputContext,\n    String,\n    io_manager,\n)\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager\nfrom wandb import Artifact\nfrom wandb.data_types import WBValue\n\nfrom .resources import WANDB_CLOUD_HOST\nfrom .utils.errors import (\n    WandbArtifactsIOManagerError,\n    raise_on_empty_configuration,\n    raise_on_unknown_partition_keys,\n    raise_on_unknown_read_configuration_keys,\n    raise_on_unknown_write_configuration_keys,\n)\nfrom .utils.pickling import (\n    ACCEPTED_SERIALIZATION_MODULES,\n    pickle_artifact_content,\n    unpickle_artifact_content,\n)\nfrom .version import __version__\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\n\nclass Config(TypedDict):\n    dagster_run_id: str\n    wandb_host: str\n    wandb_entity: str\n    wandb_project: str\n    wandb_run_name: Optional[str]\n    wandb_run_id: Optional[str]\n    wandb_run_tags: Optional[List[str]]\n    base_dir: str\n    cache_duration_in_minutes: Optional[int]\n\n\nclass ArtifactsIOManager(IOManager):\n    """IO Manager to handle Artifacts in Weights & Biases (W&B) .\n\n    It handles 3 different inputs:\n    - Pickable objects (the serialization module is configurable)\n    - W&B Objects (Audio, Table, Image, etc)\n    - W&B Artifacts\n    """\n\n    def __init__(self, wandb_client, config: Config):\n        self.wandb = wandb_client\n\n        dagster_run_id = config["dagster_run_id"]\n        self.dagster_run_id = dagster_run_id\n        self.wandb_host = config["wandb_host"]\n        self.wandb_entity = config["wandb_entity"]\n        self.wandb_project = 
config["wandb_project"]\n        self.wandb_run_id = config.get("wandb_run_id") or dagster_run_id\n        self.wandb_run_name = config.get("wandb_run_name") or f"dagster-run-{dagster_run_id[0:8]}"\n        # augments the run tags\n        wandb_run_tags = config["wandb_run_tags"] or []\n        if "dagster_wandb" not in wandb_run_tags:\n            wandb_run_tags = [*wandb_run_tags, "dagster_wandb"]\n        self.wandb_run_tags = wandb_run_tags\n\n        self.base_dir = config["base_dir"]\n        cache_duration_in_minutes = config["cache_duration_in_minutes"]\n        default_cache_expiration_in_minutes = 60 * 24 * 30  # 60 minutes * 24 hours * 30 days\n        self.cache_duration_in_minutes = (\n            cache_duration_in_minutes\n            if cache_duration_in_minutes is not None\n            else default_cache_expiration_in_minutes\n        )\n\n    def _get_local_storage_path(self):\n        path = self.base_dir\n        if os.path.basename(path) != "storage":\n            path = os.path.join(path, "storage")\n        path = os.path.join(path, "wandb_artifacts_manager")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_artifacts_path(self, name, version):\n        local_storage_path = self._get_local_storage_path()\n        path = os.path.join(local_storage_path, "artifacts", f"{name}.{version}")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _get_wandb_logs_path(self):\n        local_storage_path = self._get_local_storage_path()\n        # Adding a random uuid to avoid collisions in multi-process context\n        path = os.path.join(local_storage_path, "runs", self.dagster_run_id, str(uuid.uuid4()))\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def _clean_local_storage_path(self):\n        local_storage_path = self._get_local_storage_path()\n        cache_duration_in_minutes = self.cache_duration_in_minutes\n        current_timestamp = int(time.time())\n        
expiration_timestamp = current_timestamp - (\n            cache_duration_in_minutes * 60  # convert to seconds\n        )\n\n        for root, dirs, files in os.walk(local_storage_path, topdown=False):\n            for name in files:\n                current_file_path = os.path.join(root, name)\n                most_recent_access = os.lstat(current_file_path).st_atime\n                if most_recent_access <= expiration_timestamp or cache_duration_in_minutes == 0:\n                    os.remove(current_file_path)\n            for name in dirs:\n                current_dir_path = os.path.join(root, name)\n                if not os.path.islink(current_dir_path):\n                    if len(os.listdir(current_dir_path)) == 0 or cache_duration_in_minutes == 0:\n                        shutil.rmtree(current_dir_path)\n\n    @contextmanager\n    def wandb_run(self):\n        self.wandb.init(\n            id=self.wandb_run_id,\n            name=self.wandb_run_name,\n            project=self.wandb_project,\n            entity=self.wandb_entity,\n            dir=self._get_wandb_logs_path(),\n            tags=self.wandb_run_tags,\n            anonymous="never",\n            resume="allow",\n        )\n        try:\n            yield self.wandb.run\n        finally:\n            self.wandb.finish()\n            self._clean_local_storage_path()\n\n    def _upload_artifact(self, context: OutputContext, obj):\n        if not context.has_partition_key and context.has_asset_partitions:\n            raise WandbArtifactsIOManagerError(\n                "Sorry, but the Weights & Biases (W&B) IO Manager can't handle processing several"\n                " partitions at the same time within a single run. Please process each partition"\n                " separately. 
If you think this might be an error, don't hesitate to reach out to"\n                " Weights & Biases Support."\n            )\n\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_write_configuration_keys(parameters)\n\n            serialization_module = parameters.get("serialization_module", {})\n            serialization_module_name = serialization_module.get("name", "pickle")\n\n            if serialization_module_name not in ACCEPTED_SERIALIZATION_MODULES:\n                raise WandbArtifactsIOManagerError(\n                    f"Oops! It looks like the value you provided, '{serialization_module_name}',"\n                    " isn't recognized as a valid serialization module. Here are the ones we do"\n                    f" support: {ACCEPTED_SERIALIZATION_MODULES}."\n                )\n\n            serialization_module_parameters = serialization_module.get("parameters", {})\n            serialization_module_parameters_with_protocol = {\n                "protocol": (\n                    pickle.HIGHEST_PROTOCOL\n                ),  # we use the highest available protocol if we don't pass one\n                **serialization_module_parameters,\n            }\n\n            artifact_type = parameters.get("type", "artifact")\n            artifact_description = parameters.get("description")\n            artifact_metadata = {\n                "source_integration": "dagster_wandb",\n                "source_integration_version": __version__,\n                "source_dagster_run_id": self.dagster_run_id,\n                "source_created_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),\n                "source_python_version": platform.python_version(),\n            }\n            if isinstance(obj, Artifact):\n                if parameters.get("name") is not None:\n       
             raise WandbArtifactsIOManagerError(\n                        "You've provided a 'name' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'name' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if parameters.get("type") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've provided a 'type' property in the 'wandb_artifact_configuration'"\n                        " settings. However, this 'type' property should only be used when the"\n                        " output isn't already an Artifact object."\n                    )\n\n                if obj.name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The Weights & Biases (W&B) Artifact you provided is missing a name."\n                        " Please, assign a name to your Artifact."\n                    )\n\n                if context.has_asset_key and obj.name != context.get_asset_identifier()[0]:\n                    asset_identifier = context.get_asset_identifier()[0]\n                    context.log.warning(\n                        f"Please note, the name '{obj.name}' of your Artifact is overwritten by the"\n                        f" name derived from the AssetKey '{asset_identifier}'. 
For consistency and"\n                        " to avoid confusion, we advise sharing a constant for both your asset's"\n                        " name and the artifact's name."\n                    )\n                    obj._name = asset_identifier  # noqa: SLF001\n\n                if context.has_partition_key:\n                    artifact_name = f"{obj.name}.{context.partition_key}"\n                    # The Artifact provided is produced in a partitioned execution we add the\n                    # partition as a suffix to the Artifact name\n                    obj._name = artifact_name  # noqa: SLF001\n\n                if len(serialization_module) != 0:  # not an empty dict\n                    context.log.warning(\n                        "You've included a 'serialization_module' in the"\n                        " 'wandb_artifact_configuration' settings. However, this doesn't have any"\n                        " impact when the output is already an Artifact object."\n                    )\n\n                # The obj is already an Artifact we augment its metadata\n                artifact = obj\n\n                artifact.metadata = {**artifact.metadata, **artifact_metadata}\n\n                if artifact.description is not None and artifact_description is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a 'description' in the 'wandb_artifact_configuration'"\n                        " settings for an existing Artifact that already has a description. 
Please,"\n                        " either set the description using 'wandb_artifact_argument' or when"\n                        " creating your Artifact."\n                    )\n                if artifact_description is not None:\n                    artifact.description = artifact_description\n            else:\n                if context.has_asset_key:\n                    if parameters.get("name") is not None:\n                        raise WandbArtifactsIOManagerError(\n                            "You've included a 'name' property in the"\n                            " 'wandb_artifact_configuration' settings. But, a 'name' is only needed"\n                            " when there's no 'AssetKey'. When an Artifact is created from an"\n                            " @asset, it uses the asset name. When it's created from an @op with an"\n                            " 'asset_key' for the output, that value is used. Please remove the"\n                            " 'name' property."\n                        )\n                    artifact_name = context.get_asset_identifier()[0]  # name of asset\n                else:\n                    name_parameter = parameters.get("name")\n                    if name_parameter is None:\n                        raise WandbArtifactsIOManagerError(\n                            "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                            " settings. For Artifacts created from an @op, a 'name' property is"\n                            " needed. 
You could also use an @asset as an alternative."\n                        )\n                    assert name_parameter is not None\n                    artifact_name = name_parameter\n\n                if context.has_partition_key:\n                    artifact_name = f"{artifact_name}.{context.partition_key}"\n\n                # We replace the | character with - because it is not allowed in artifact names\n                # The | character is used in multi-dimensional partition keys\n                artifact_name = str(artifact_name).replace("|", "-")\n\n                # Creates an artifact to hold the obj\n                artifact = self.wandb.Artifact(\n                    name=artifact_name,\n                    type=artifact_type,\n                    description=artifact_description,\n                    metadata=artifact_metadata,\n                )\n                if isinstance(obj, WBValue):\n                    if len(serialization_module) != 0:  # not an empty dict\n                        context.log.warning(\n                            "You've included a 'serialization_module' in the"\n                            " 'wandb_artifact_configuration' settings. 
However, this doesn't have"\n                            " any impact when the output is already an W&B object like e.g Table or"\n                            " Image."\n                        )\n                    # Adds the WBValue object using the class name as the name for the file\n                    artifact.add(obj, obj.__class__.__name__)\n                elif obj is not None:\n                    # The output is not a native wandb Object, we serialize it\n                    pickle_artifact_content(\n                        context,\n                        serialization_module_name,\n                        serialization_module_parameters_with_protocol,\n                        artifact,\n                        obj,\n                    )\n\n            # Add any files: https://docs.wandb.ai/ref/python/artifact#add_file\n            add_files = parameters.get("add_files")\n            if add_files is not None and len(add_files) > 0:\n                for add_file in add_files:\n                    artifact.add_file(**add_file)\n\n            # Add any dirs: https://docs.wandb.ai/ref/python/artifact#add_dir\n            add_dirs = parameters.get("add_dirs")\n            if add_dirs is not None and len(add_dirs) > 0:\n                for add_dir in add_dirs:\n                    artifact.add_dir(**add_dir)\n\n            # Add any reference: https://docs.wandb.ai/ref/python/artifact#add_reference\n            add_references = parameters.get("add_references")\n            if add_references is not None and len(add_references) > 0:\n                for add_reference in add_references:\n                    artifact.add_reference(**add_reference)\n\n            # Augments the aliases\n            aliases = parameters.get("aliases", [])\n            aliases.append(f"dagster-run-{self.dagster_run_id[0:8]}")\n            if "latest" not in aliases:\n                aliases.append("latest")\n\n            # Logs the artifact\n            
self.wandb.log_artifact(artifact, aliases=aliases)\n            artifact.wait()\n\n            # Adds useful metadata to the output or Asset\n            artifacts_base_url = (\n                "https://wandb.ai"\n                if self.wandb_host == WANDB_CLOUD_HOST\n                else self.wandb_host.rstrip("/")\n            )\n            assert artifact.id is not None\n            output_metadata = {\n                "dagster_run_id": MetadataValue.dagster_run(self.dagster_run_id),\n                "wandb_artifact_id": MetadataValue.text(artifact.id),\n                "wandb_artifact_type": MetadataValue.text(artifact.type),\n                "wandb_artifact_version": MetadataValue.text(artifact.version),\n                "wandb_artifact_size": MetadataValue.int(artifact.size),\n                "wandb_artifact_url": MetadataValue.url(\n                    f"{artifacts_base_url}/{run.entity}/{run.project}/artifacts/{artifact.type}/{'/'.join(artifact.name.rsplit(':', 1))}"\n                ),\n                "wandb_entity": MetadataValue.text(run.entity),\n                "wandb_project": MetadataValue.text(run.project),\n                "wandb_run_id": MetadataValue.text(run.id),\n                "wandb_run_name": MetadataValue.text(run.name),\n                "wandb_run_path": MetadataValue.text(run.path),\n                "wandb_run_url": MetadataValue.url(run.url),\n            }\n            context.add_output_metadata(output_metadata)\n\n    def _download_artifact(self, context: InputContext):\n        with self.wandb_run() as run:\n            parameters = {}\n            if context.metadata is not None:\n                parameters = context.metadata.get("wandb_artifact_configuration", {})\n\n            raise_on_unknown_read_configuration_keys(parameters)\n\n            partitions_configuration = parameters.get("partitions", {})\n\n            if not context.has_asset_partitions and len(partitions_configuration) > 0:\n                raise 
WandbArtifactsIOManagerError(\n                    "You've included a 'partitions' value in the 'wandb_artifact_configuration'"\n                    " settings but it's not within a partitioned execution. Please only use"\n                    " 'partitions' within a partitioned context."\n                )\n\n            if context.has_asset_partitions:\n                # Note: this is currently impossible to unit test with current Dagster APIs but was\n                # tested thoroughly manually\n                name = parameters.get("get")\n                path = parameters.get("get_path")\n                if name is not None or path is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "You've given a value for 'get' and/or 'get_path' in the"\n                        " 'wandb_artifact_configuration' settings during a partitioned execution."\n                        " Please use the 'partitions' property to set 'get' or 'get_path' for each"\n                        " individual partition. 
To set a default value for all partitions, use '*'."\n                    )\n\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    artifact_name = context.asset_key[0][0]  # name of asset\n\n                partitions = [\n                    (key, f"{artifact_name}.{ str(key).replace('|', '-')}")\n                    for key in context.asset_partition_keys\n                ]\n\n                output = {}\n\n                for key, artifact_name in partitions:\n                    context.log.info(f"Handling partition with key '{key}'")\n                    partition_configuration = partitions_configuration.get(\n                        key, partitions_configuration.get("*")\n                    )\n\n                    raise_on_empty_configuration(key, partition_configuration)\n                    raise_on_unknown_partition_keys(key, partition_configuration)\n\n                    partition_version = None\n                    partition_alias = None\n                    if partition_configuration and partition_configuration is not None:\n                        partition_version = partition_configuration.get("version")\n                        partition_alias = partition_configuration.get("alias")\n                        if partition_version is not None and partition_alias is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'version' and 'alias' for the partition with"\n                                " key '{key}'. You should only use one of these properties at a"\n                                " time. If you choose not to use any, the latest version will be"\n                                " used by default. 
If this partition is configured with the '*'"\n                                " key, please correct the wildcard configuration."\n                            )\n                    partition_identifier = partition_version or partition_alias or "latest"\n\n                    artifact_uri = (\n                        f"{run.entity}/{run.project}/{artifact_name}:{partition_identifier}"\n                    )\n                    try:\n                        api = self.wandb.Api()\n                        api.artifact(artifact_uri)\n                    except Exception as exception:\n                        raise WandbArtifactsIOManagerError(\n                            "The artifact you're attempting to download might not exist, or you"\n                            " might have forgotten to include the 'name' property in the"\n                            " 'wandb_artifact_configuration' settings."\n                        ) from exception\n\n                    artifact = run.use_artifact(artifact_uri)\n\n                    artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n                    if partition_configuration and partition_configuration is not None:\n                        partition_name = partition_configuration.get("get")\n                        partition_path = partition_configuration.get("get_path")\n                        if partition_name is not None and partition_path is not None:\n                            raise WandbArtifactsIOManagerError(\n                                "You've provided both 'get' and 'get_path' in the"\n                                " 'wandb_artifact_configuration' settings for the partition with"\n                                " key '{key}'. Only one of these properties should be used. If you"\n                                " choose not to use any, the whole Artifact will be returned. 
If"\n                                " this partition is configured with the '*' key, please correct the"\n                                " wildcard configuration."\n                            )\n\n                        if partition_name is not None:\n                            wandb_object = artifact.get(partition_name)\n                            if wandb_object is not None:\n                                output[key] = wandb_object\n                                continue\n\n                        if partition_path is not None:\n                            path = artifact.get_path(partition_path)\n                            download_path = path.download(root=artifacts_path)\n                            if download_path is not None:\n                                output[key] = download_path\n                                continue\n\n                    artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n                    unpickled_content = unpickle_artifact_content(artifact_dir)\n                    if unpickled_content is not None:\n                        output[key] = unpickled_content\n                        continue\n\n                    artifact.verify(root=artifacts_path)\n                    output[key] = artifact\n\n                if len(output) == 1:\n                    # If there's only one partition, return the value directly\n                    return next(iter(output.values()))\n\n                return output\n\n            elif context.has_asset_key:\n                # Input is an asset\n                if parameters.get("name") is not None:\n                    raise WandbArtifactsIOManagerError(\n                        "A conflict has been detected in the provided configuration settings. 
The"\n                        " 'name' parameter appears to be specified twice - once in the"\n                        " 'wandb_artifact_configuration' metadata dictionary, and again as an"\n                        " AssetKey. Kindly avoid setting the name directly, since the AssetKey will"\n                        " be used for this purpose."\n                    )\n                artifact_name = context.get_asset_identifier()[0]  # name of asset\n            else:\n                artifact_name = parameters.get("name")\n                if artifact_name is None:\n                    raise WandbArtifactsIOManagerError(\n                        "The 'name' property is missing in the 'wandb_artifact_configuration'"\n                        " settings. For Artifacts used in an @op, a 'name' property is required."\n                        " You could use an @asset as an alternative."\n                    )\n\n            if context.has_partition_key:\n                artifact_name = f"{artifact_name}.{context.partition_key}"\n\n            artifact_alias = parameters.get("alias")\n            artifact_version = parameters.get("version")\n\n            if artifact_alias is not None and artifact_version is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'version' and 'alias' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the latest version will be applied"\n                    " automatically."\n                )\n\n            artifact_identifier = artifact_alias or artifact_version or "latest"\n            artifact_uri = f"{run.entity}/{run.project}/{artifact_name}:{artifact_identifier}"\n\n            # This try/except block is a workaround for a bug in the W&B SDK, this should be removed\n            # once the bug is fixed.\n            try:\n                artifact = run.use_artifact(artifact_uri)\n            except Exception:\n                api = self.wandb.Api()\n                artifact = api.artifact(artifact_uri)\n\n            name = parameters.get("get")\n            path = parameters.get("get_path")\n            if name is not None and path is not None:\n                raise WandbArtifactsIOManagerError(\n                    "You've provided both 'get' and 'get_path' in the"\n                    " 'wandb_artifact_configuration' settings. 
Only one should be used at a time."\n                    " If you decide not to use any, the entire Artifact will be returned."\n                )\n\n            if name is not None:\n                return artifact.get(name)\n\n            artifacts_path = self._get_artifacts_path(artifact_name, artifact.version)\n            if path is not None:\n                path = artifact.get_path(path)\n                return path.download(root=artifacts_path)\n\n            artifact_dir = artifact.download(root=artifacts_path, recursive=True)\n\n            unpickled_content = unpickle_artifact_content(artifact_dir)\n            if unpickled_content is not None:\n                return unpickled_content\n\n            artifact.verify(root=artifacts_path)\n            return artifact\n\n    def handle_output(self, context: OutputContext, obj) -> None:\n        if obj is None:\n            context.log.warning(\n                "The output value given to the Weights & Biases (W&B) IO Manager is empty. If this"\n                " was intended, you can disregard this warning."\n            )\n        else:\n            try:\n                self._upload_artifact(context, obj)\n            except WandbArtifactsIOManagerError as exception:\n                raise exception\n            except Exception as exception:\n                raise WandbArtifactsIOManagerError() from exception\n\n    def load_input(self, context: InputContext):\n        try:\n            return self._download_artifact(context)\n        except WandbArtifactsIOManagerError as exception:\n            raise exception\n        except Exception as exception:\n            raise WandbArtifactsIOManagerError() from exception\n\n\n
[docs]@dagster_maintained_io_manager\n@io_manager(\n required_resource_keys={"wandb_resource", "wandb_config"},\n description="IO manager to read and write W&B Artifacts",\n config_schema={\n "run_name": Field(\n String,\n is_required=False,\n description=(\n "Short display name for this run, which is how you'll identify this run in the UI."\n " By default, it`s set to a string with the following format dagster-run-[8 first"\n " characters of the Dagster Run ID] e.g. dagster-run-7e4df022."\n ),\n ),\n "run_id": Field(\n String,\n is_required=False,\n description=(\n "Unique ID for this run, used for resuming. It must be unique in the project, and"\n " if you delete a run you can't reuse the ID. Use the name field for a short"\n " descriptive name, or config for saving hyperparameters to compare across runs."\n r" The ID cannot contain the following special characters: /\\#?%:.. You need to set"\n " the Run ID when you are doing experiment tracking inside Dagster to allow the IO"\n " Manager to resume the run. By default it`s set to the Dagster Run ID e.g "\n " 7e4df022-1bf2-44b5-a383-bb852df4077e."\n ),\n ),\n "run_tags": Field(\n [String],\n is_required=False,\n description=(\n "A list of strings, which will populate the list of tags on this run in the UI."\n " Tags are useful for organizing runs together, or applying temporary labels like"\n " 'baseline' or 'production'. It's easy to add and remove tags in the UI, or filter"\n " down to just runs with a specific tag. Any W&B Run used by the integration will"\n " have the dagster_wandb tag."\n ),\n ),\n "base_dir": Field(\n String,\n is_required=False,\n description=(\n "Base directory used for local storage and caching. W&B Artifacts and W&B Run logs"\n " will be written and read from that directory. 
By default, it`s using the"\n " DAGSTER_HOME directory."\n ),\n ),\n "cache_duration_in_minutes": Field(\n Int,\n is_required=False,\n description=(\n "Defines the amount of time W&B Artifacts and W&B Run logs should be kept in the"\n " local storage. Only files and directories that were not opened for that amount of"\n " time are removed from the cache. Cache purging happens at the end of an IO"\n " Manager execution. You can set it to 0, if you want to disable caching"\n " completely. Caching improves speed when an Artifact is reused between jobs"\n " running on the same machine. It defaults to 30 days."\n ),\n ),\n },\n)\ndef wandb_artifacts_io_manager(context: InitResourceContext):\n """Dagster IO Manager to create and consume W&B Artifacts.\n\n It allows any Dagster @op or @asset to create and consume W&B Artifacts natively.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n **Example:**\n\n .. code-block:: python\n\n @repository\n def my_repository():\n return [\n *with_resources(\n load_assets_from_current_module(),\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n "wandb_artifacts_manager": wandb_artifacts_io_manager.configured(\n {"cache_duration_in_minutes": 60} # only cache files for one hour\n ),\n },\n resource_config_by_key={\n "wandb_config": {\n "config": {\n "entity": "my_entity",\n "project": "my_project"\n }\n }\n },\n ),\n ]\n\n\n @asset(\n name="my_artifact",\n metadata={\n "wandb_artifact_configuration": {\n "type": "dataset",\n }\n },\n io_manager_key="wandb_artifacts_manager",\n )\n def create_dataset():\n return [1, 2, 3]\n\n """\n wandb_client = context.resources.wandb_resource["sdk"]\n wandb_host = context.resources.wandb_resource["host"]\n wandb_entity = context.resources.wandb_config["entity"]\n wandb_project = 
context.resources.wandb_config["project"]\n\n wandb_run_name = None\n wandb_run_id = None\n wandb_run_tags = None\n base_dir = (\n context.instance.storage_directory() if context.instance else os.environ["DAGSTER_HOME"]\n )\n cache_duration_in_minutes = None\n if context.resource_config is not None:\n wandb_run_name = context.resource_config.get("run_name")\n wandb_run_id = context.resource_config.get("run_id")\n wandb_run_tags = context.resource_config.get("run_tags")\n base_dir = context.resource_config.get("base_dir", base_dir)\n cache_duration_in_minutes = context.resource_config.get("cache_duration_in_minutes")\n\n if "PYTEST_CURRENT_TEST" in os.environ:\n dagster_run_id = "unit-testing"\n else:\n dagster_run_id = context.run_id\n\n assert dagster_run_id is not None\n\n config: Config = {\n "dagster_run_id": dagster_run_id,\n "wandb_host": wandb_host,\n "wandb_entity": wandb_entity,\n "wandb_project": wandb_project,\n "wandb_run_name": wandb_run_name,\n "wandb_run_id": wandb_run_id,\n "wandb_run_tags": wandb_run_tags,\n "base_dir": base_dir,\n "cache_duration_in_minutes": cache_duration_in_minutes,\n }\n return ArtifactsIOManager(wandb_client, config)
\n
", "current_page_name": "_modules/dagster_wandb/io_manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.io_manager"}, "launch": {"ops": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.launch.ops

\nfrom dagster import OpExecutionContext, op\nfrom wandb.sdk.launch import launch\nfrom wandb.sdk.launch.launch_add import launch_add\n\nfrom .configs import launch_agent_config, launch_config\n\n\ndef raise_on_invalid_config(context: OpExecutionContext):\n    entity = context.resources.wandb_config["entity"]\n    if entity == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'entity' property of the"\n            " 'wandb_config'."\n        )\n\n    project = context.resources.wandb_config["project"]\n    if project == "":\n        raise RuntimeError(\n            "(dagster_wandb) An empty string was provided for the 'project' property of the"\n            " 'wandb_config'."\n        )\n\n\n
[docs]@op(\n required_resource_keys={"wandb_resource", "wandb_config"},\n config_schema=launch_agent_config(),\n)\ndef run_launch_agent(context: OpExecutionContext):\n """It starts a Launch Agent and runs it as a long running process until stopped manually.\n\n Agents are processes that poll launch queues and execute the jobs (or dispatch them to external\n services to be executed) in order.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n run_launch_agent:\n config:\n max_jobs: -1\n queues:\n - my_dagster_queue\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_agent\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_agent_example():\n run_launch_agent()\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch agent configuration: {config}")\n context.log.info("Running Launch agent...")\n launch.create_and_run_agent(api=context.resources.wandb_resource["api"], config=config)
\n\n\n
[docs]@op(\n required_resource_keys={\n "wandb_resource",\n "wandb_config",\n },\n config_schema=launch_config(),\n)\ndef run_launch_job(context: OpExecutionContext):\n """Executes a Launch job.\n\n A Launch job is assigned to a queue in order to be executed. You can create a queue or use the\n default one. Make sure you have an active agent listening to that queue. You can run an agent\n inside your Dagster instance but can also consider using a deployable agent in Kubernetes.\n\n **Example:**\n\n .. code-block:: YAML\n\n # config.yaml\n\n resources:\n wandb_config:\n config:\n entity: my_entity\n project: my_project\n ops:\n my_launched_job:\n config:\n entry_point:\n - python\n - train.py\n queue: my_dagster_queue\n uri: https://github.com/wandb/example-dagster-integration-with-launch\n\n .. code-block:: python\n\n from dagster_wandb.launch.ops import run_launch_job\n from dagster_wandb.resources import wandb_resource\n\n from dagster import job, make_values_resource\n\n\n @job(\n resource_defs={\n "wandb_config": make_values_resource(\n entity=str,\n project=str,\n ),\n "wandb_resource": wandb_resource.configured(\n {"api_key": {"env": "WANDB_API_KEY"}}\n ),\n },\n )\n def run_launch_job_example():\n run_launch_job.alias("my_launched_job")() # we rename the job with an alias\n\n """\n raise_on_invalid_config(context)\n config = {\n "entity": context.resources.wandb_config["entity"],\n "project": context.resources.wandb_config["project"],\n **context.op_config,\n }\n context.log.info(f"Launch job configuration: {config}")\n\n queue = context.op_config.get("queue")\n if queue is None:\n context.log.info("No queue provided, running Launch job locally")\n launch.run(api=context.resources.wandb_resource["api"], config=config)\n else:\n synchronous = config.get("synchronous", True)\n config.pop("synchronous", None)\n queued_run = launch_add(**config)\n if synchronous is True:\n context.log.info(\n f"Synchronous Launch job added to queue with name={queue}. 
Waiting for"\n " completion..."\n )\n queued_run.wait_until_finished()\n else:\n context.log.info(f"Asynchronous Launch job added to queue with name={queue}")
\n
", "current_page_name": "_modules/dagster_wandb/launch/ops", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.launch.ops"}}, "resources": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.resources

\nfrom typing import Any, Dict\n\nimport wandb\nfrom dagster import Field, InitResourceContext, String, StringSource, resource\nfrom dagster._core.definitions.resource_definition import dagster_maintained_resource\nfrom wandb.sdk.internal.internal_api import Api\n\nWANDB_CLOUD_HOST: str = "https://api.wandb.ai"\n\n\n
[docs]@dagster_maintained_resource\n@resource(\n config_schema={\n "api_key": Field(\n StringSource,\n description="W&B API key necessary to communicate with the W&B API.",\n is_required=True,\n ),\n "host": Field(\n String,\n description=(\n "API host server you wish to use. Only required if you are using W&B Server."\n ),\n is_required=False,\n default_value=WANDB_CLOUD_HOST,\n ),\n },\n description="Resource for interacting with Weights & Biases",\n)\ndef wandb_resource(context: InitResourceContext) -> Dict[str, Any]:\n """Dagster resource used to communicate with the W&B API. It's useful when you want to use the\n wandb client within your ops and assets. It's a required resources if you are using the W&B IO\n Manager.\n\n It automatically authenticates using the provided API key.\n\n For a complete set of documentation, see `Dagster integration <https://docs.wandb.ai/guides/integrations/dagster>`_.\n\n To configure this resource, we recommend using the `configured\n <https://docs.dagster.io/concepts/configuration/configured>`_ method.\n\n **Example:**\n\n .. code-block:: python\n\n from dagster import job\n from dagster_wandb import wandb_resource\n\n my_wandb_resource = wandb_resource.configured({"api_key": {"env": "WANDB_API_KEY"}})\n\n @job(resource_defs={"wandb_resource": my_wandb_resource})\n def my_wandb_job():\n ...\n\n """\n api_key = context.resource_config["api_key"]\n host = context.resource_config["host"]\n wandb.login(\n key=api_key,\n host=host,\n anonymous="never",\n )\n client_settings = wandb.Settings(\n api_key=api_key,\n base_url=host,\n anonymous="never",\n launch=True,\n )\n api = Api(default_settings=client_settings, load_settings=False)\n return {"sdk": wandb, "api": api, "host": host}
\n
", "current_page_name": "_modules/dagster_wandb/resources", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.resources"}, "types": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.types

\nimport sys\n\nif sys.version_info >= (3, 8):\n    from typing import TypedDict\nelse:\n    from typing_extensions import TypedDict\n\nfrom typing import Any, Dict, List\n\n\n
[docs]class SerializationModule(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration of the serialization module. Useful for type checking."""\n\n name: str\n parameters: Dict[str, Any]
\n\n\n
[docs]class WandbArtifactConfiguration(TypedDict, total=False):\n """W&B Artifacts IO Manager configuration. Useful for type checking."""\n\n name: str\n type: str\n description: str\n aliases: List[str]\n add_dirs: List[Dict[str, Any]]\n add_files: List[Dict[str, Any]]\n add_references: List[Dict[str, Any]]\n serialization_module: SerializationModule\n partitions: Dict[str, Dict[str, Any]]
\n
", "current_page_name": "_modules/dagster_wandb/types", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.types"}, "utils": {"errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagster_wandb.utils.errors

\n
[docs]class WandbArtifactsIOManagerError(Exception):\n """Represents an execution error of the W&B Artifacts IO Manager."""\n\n def __init__(self, message="A W&B Artifacts IO Manager error occurred."):\n self.message = message\n super().__init__(self.message)
\n\n\nSUPPORTED_READ_CONFIG_KEYS = [\n "alias",\n "get_path",\n "get",\n "name",\n "partitions",\n "version",\n]\nSUPPORTED_WRITE_CONFIG_KEYS = [\n "add_dirs",\n "add_files",\n "add_references",\n "aliases",\n "description",\n "name",\n "partitions",\n "serialization_module",\n "type",\n]\nSUPPORTED_PARTITION_CONFIG_KEYS = ["get", "get_path", "version", "alias"]\n\n\ndef raise_on_empty_configuration(partition_key, dictionary):\n if dictionary is not None and len(dictionary) == 0:\n raise WandbArtifactsIOManagerError(\n f"The configuration is empty for the partition identified by the key '{partition_key}'."\n " This happened within the 'wandb_artifact_configuration' metadata dictionary."\n )\n\n\ndef raise_on_unknown_keys(supported_config_keys, dictionary, is_read_config):\n if dictionary is None:\n return\n\n unsupported_keys = [key for key in dictionary.keys() if key not in supported_config_keys]\n if len(unsupported_keys) > 0:\n if is_read_config:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " reading an Artifact."\n )\n else:\n raise WandbArtifactsIOManagerError(\n f"The configuration keys '{unsupported_keys}' you are trying to use are not"\n " supported within the 'wandb_artifact_configuration' metadata dictionary when"\n " writing an Artifact."\n )\n\n\ndef raise_on_unknown_write_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_WRITE_CONFIG_KEYS, dictionary, False)\n\n\ndef raise_on_unknown_read_configuration_keys(dictionary):\n raise_on_unknown_keys(SUPPORTED_READ_CONFIG_KEYS, dictionary, True)\n\n\ndef raise_on_unknown_partition_keys(partition_key, dictionary):\n if dictionary is None:\n return\n\n unsupported_keys = [\n key for key in dictionary.keys() if key not in SUPPORTED_PARTITION_CONFIG_KEYS\n ]\n if len(unsupported_keys) > 0:\n raise WandbArtifactsIOManagerError(\n f"The 
configuration keys '{unsupported_keys}' you are trying to use are not supported"\n f" for the partition identified by the key '{partition_key}'. This happened within the"\n " 'wandb_artifact_configuration' metadata dictionary."\n )\n
", "current_page_name": "_modules/dagster_wandb/utils/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagster_wandb.utils.errors"}}}, "dagstermill": {"asset_factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.asset_factory

\nimport pickle\nimport tempfile\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Set, Type, Union, cast\n\nimport dagster._check as check\nfrom dagster import (\n    AssetIn,\n    AssetKey,\n    AssetsDefinition,\n    Failure,\n    Output,\n    PartitionsDefinition,\n    ResourceDefinition,\n    RetryPolicy,\n    RetryRequested,\n    SourceAsset,\n    asset,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.inheritance_utils import safe_is_subclass\nfrom dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\n\nfrom dagstermill.factory import _clean_path_for_windows, execute_notebook\n\n\ndef _make_dagstermill_asset_compute_fn(\n    name: str,\n    notebook_path: str,\n    save_notebook_on_failure: bool,\n) -> Callable:\n    def _t_fn(context: OpExecutionContext, **inputs) -> Iterable:\n        check.param_invariant(\n            isinstance(context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                context.get_step_execution_context(),\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            with open(executed_notebook_path, "rb") as fd:\n                yield Output(fd.read())\n\n            # deferred import for perf\n            import scrapbook\n\n            output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n            for key, value in output_nb.scraps.items():\n                if 
key.startswith("event-"):\n                    with open(value.data, "rb") as fd:\n                        event = pickle.loads(fd.read())\n                        if isinstance(event, (Failure, RetryRequested)):\n                            raise event\n                        else:\n                            yield event\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_asset(\n name: str,\n notebook_path: str,\n key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,\n ins: Optional[Mapping[str, AssetIn]] = None,\n deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,\n metadata: Optional[Mapping[str, Any]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n description: Optional[str] = None,\n partitions_def: Optional[PartitionsDefinition] = None,\n op_tags: Optional[Mapping[str, Any]] = None,\n group_name: Optional[str] = None,\n io_manager_key: Optional[str] = None,\n retry_policy: Optional[RetryPolicy] = None,\n save_notebook_on_failure: bool = False,\n non_argument_deps: Optional[Union[Set[AssetKey], Set[str]]] = None,\n) -> AssetsDefinition:\n """Creates a Dagster asset for a Jupyter notebook.\n\n Arguments:\n name (str): The name for the asset\n notebook_path (str): Path to the backing notebook\n key_prefix (Optional[Union[str, Sequence[str]]]): If provided, the asset's key is the\n concatenation of the key_prefix and the asset's name, which defaults to the name of\n the decorated function. Each item in key_prefix must be a valid name in dagster (ie only\n contains letters, numbers, and _) and may not contain python reserved keywords.\n ins (Optional[Mapping[str, AssetIn]]): A dictionary that maps input names to information\n about the input.\n deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]): The assets\n that are upstream dependencies, but do not pass an input value to the notebook.\n config_schema (Optional[ConfigSchema): The configuration schema for the asset's underlying\n op. If set, Dagster will check that config provided for the op matches this schema and fail\n if it does not. 
If not set, Dagster will accept any config provided for the op.\n metadata (Optional[Dict[str, Any]]): A dict of metadata entries for the asset.\n required_resource_keys (Optional[Set[str]]): Set of resource handles required by the notebook.\n description (Optional[str]): Description of the asset to display in the Dagster UI.\n partitions_def (Optional[PartitionsDefinition]): Defines the set of partition keys that\n compose the asset.\n op_tags (Optional[Dict[str, Any]]): A dictionary of tags for the op that computes the asset.\n Frameworks may expect and require certain metadata to be attached to a op. Values that\n are not strings will be json encoded and must meet the criteria that\n `json.loads(json.dumps(value)) == value`.\n group_name (Optional[str]): A string name used to organize multiple assets into groups. If not provided,\n the name "default" is used.\n resource_defs (Optional[Mapping[str, ResourceDefinition]]):\n (Experimental) A mapping of resource keys to resource definitions. These resources\n will be initialized during execution, and can be accessed from the\n context within the notebook.\n io_manager_key (Optional[str]): A string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n retry_policy (Optional[RetryPolicy]): The retry policy for the op that computes the asset.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]): Deprecated, use deps instead. Set of asset keys that are\n upstream dependencies, but do not pass an input to the asset.\n\n Examples:\n .. 
code-block:: python\n\n from dagstermill import define_dagstermill_asset\n from dagster import asset, AssetIn, AssetKey\n from sklearn import datasets\n import pandas as pd\n import numpy as np\n\n @asset\n def iris_dataset():\n sk_iris = datasets.load_iris()\n return pd.DataFrame(\n data=np.c_[sk_iris["data"], sk_iris["target"]],\n columns=sk_iris["feature_names"] + ["target"],\n )\n\n iris_kmeans_notebook = define_dagstermill_asset(\n name="iris_kmeans_notebook",\n notebook_path="/path/to/iris_kmeans.ipynb",\n ins={\n "iris": AssetIn(key=AssetKey("iris_dataset"))\n }\n )\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=AssetIn)\n\n if isinstance(key_prefix, str):\n key_prefix = [key_prefix]\n\n key_prefix = check.opt_list_param(key_prefix, "key_prefix", of_type=str)\n\n default_description = f"This asset is backed by the notebook at {notebook_path}"\n description = check.opt_str_param(description, "description", default=default_description)\n\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n\n user_tags = validate_tags(op_tags)\n if op_tags is not None:\n check.invariant(\n "notebook_path" not in op_tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in op_tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], 
config_schema))\n\n return asset(\n name=name,\n key_prefix=key_prefix,\n ins=ins,\n deps=deps,\n metadata=metadata,\n description=description,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n resource_defs=resource_defs,\n partitions_def=partitions_def,\n op_tags={**user_tags, **default_tags},\n group_name=group_name,\n output_required=False,\n io_manager_key=io_mgr_key,\n retry_policy=retry_policy,\n non_argument_deps=non_argument_deps,\n )(\n _make_dagstermill_asset_compute_fn(\n name=name,\n notebook_path=notebook_path,\n save_notebook_on_failure=save_notebook_on_failure,\n )\n )
\n
", "current_page_name": "_modules/dagstermill/asset_factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.asset_factory"}, "context": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.context

\nfrom typing import AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    DagsterRun,\n    JobDefinition,\n    OpDefinition,\n    _check as check,\n)\nfrom dagster._annotations import public\nfrom dagster._core.definitions.dependency import Node, NodeHandle\nfrom dagster._core.execution.context.compute import AbstractComputeExecutionContext\nfrom dagster._core.execution.context.system import PlanExecutionContext, StepExecutionContext\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.system_config.objects import ResolvedRunConfig\n\n\n
[docs]class DagstermillExecutionContext(AbstractComputeExecutionContext):\n """Dagstermill-specific execution context.\n\n Do not initialize directly: use :func:`dagstermill.get_context`.\n """\n\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._job_context = check.inst_param(job_context, "job_context", PlanExecutionContext)\n self._job_def = check.inst_param(job_def, "job_def", JobDefinition)\n self._resource_keys_to_init = check.set_param(\n resource_keys_to_init, "resource_keys_to_init", of_type=str\n )\n self.op_name = check.str_param(op_name, "op_name")\n self.node_handle = check.inst_param(node_handle, "node_handle", NodeHandle)\n self._op_config = op_config\n\n def has_tag(self, key: str) -> bool:\n """Check if a logging tag is defined on the context.\n\n Args:\n key (str): The key to check.\n\n Returns:\n bool\n """\n check.str_param(key, "key")\n return self._job_context.has_tag(key)\n\n def get_tag(self, key: str) -> Optional[str]:\n """Get a logging tag defined on the context.\n\n Args:\n key (str): The key to get.\n\n Returns:\n str\n """\n check.str_param(key, "key")\n return self._job_context.get_tag(key)\n\n @public\n @property\n def run_id(self) -> str:\n """str: The run_id for the context."""\n return self._job_context.run_id\n\n @public\n @property\n def run_config(self) -> Mapping[str, Any]:\n """dict: The run_config for the context."""\n return self._job_context.run_config\n\n @property\n def resolved_run_config(self) -> ResolvedRunConfig:\n """:class:`dagster.ResolvedRunConfig`: The resolved_run_config for the context."""\n return self._job_context.resolved_run_config\n\n @public\n @property\n def logging_tags(self) -> Mapping[str, str]:\n """dict: The logging tags for the context."""\n return self._job_context.logging_tags\n\n @public\n @property\n def job_name(self) -> str:\n """str: 
The name of the executing job."""\n return self._job_context.job_name\n\n @public\n @property\n def job_def(self) -> JobDefinition:\n """:class:`dagster.JobDefinition`: The job definition for the context.\n\n This will be a dagstermill-specific shim.\n """\n return self._job_def\n\n @property\n def resources(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n resources.\n """\n return self._job_context.scoped_resources_builder.build(\n required_resource_keys=self._resource_keys_to_init,\n )\n\n @public\n @property\n def run(self) -> DagsterRun:\n """:class:`dagster.DagsterRun`: The job run for the context."""\n return cast(DagsterRun, self._job_context.dagster_run)\n\n @property\n def log(self) -> DagsterLogManager:\n """:class:`dagster.DagsterLogManager`: The log manager for the context.\n\n Call, e.g., ``log.info()`` to log messages through the Dagster machinery.\n """\n return self._job_context.log\n\n @public\n @property\n def op_def(self) -> OpDefinition:\n """:class:`dagster.OpDefinition`: The op definition for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return cast(OpDefinition, self._job_def.node_def_named(self.op_name))\n\n @property\n def node(self) -> Node:\n """:class:`dagster.Node`: The node for the context.\n\n In interactive contexts, this may be a dagstermill-specific shim, depending whether an\n op definition was passed to ``dagstermill.get_context``.\n """\n return self.job_def.get_node(self.node_handle)\n\n @public\n @property\n def op_config(self) -> Any:\n """collections.namedtuple: A dynamically-created type whose properties allow access to\n op-specific config.\n """\n if self._op_config:\n return self._op_config\n\n op_config = self.resolved_run_config.ops.get(self.op_name)\n return op_config.config if op_config else None
\n\n\nclass DagstermillRuntimeExecutionContext(DagstermillExecutionContext):\n def __init__(\n self,\n job_context: PlanExecutionContext,\n job_def: JobDefinition,\n resource_keys_to_init: AbstractSet[str],\n op_name: str,\n step_context: StepExecutionContext,\n node_handle: NodeHandle,\n op_config: Any = None,\n ):\n self._step_context = check.inst_param(step_context, "step_context", StepExecutionContext)\n super().__init__(\n job_context,\n job_def,\n resource_keys_to_init,\n op_name,\n node_handle,\n op_config,\n )\n\n @property\n def step_context(self) -> StepExecutionContext:\n return self._step_context\n
", "current_page_name": "_modules/dagstermill/context", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.context"}, "errors": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.errors

\nfrom dagster._core.errors import DagsterError\n\n\n
[docs]class DagstermillError(DagsterError):\n """Base class for errors raised by dagstermill."""
\n
", "current_page_name": "_modules/dagstermill/errors", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.errors"}, "factory": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.factory

\nimport copy\nimport os\nimport pickle\nimport sys\nimport tempfile\nimport uuid\nfrom typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Set, Type, Union, cast\n\nimport nbformat\nimport papermill\nfrom dagster import (\n    In,\n    OpDefinition,\n    Out,\n    Output,\n    _check as check,\n    _seven,\n)\nfrom dagster._config.pythonic_config import Config, infer_schema_from_config_class\nfrom dagster._config.pythonic_config.inheritance_utils import safe_is_subclass\nfrom dagster._core.definitions.events import AssetMaterialization, Failure, RetryRequested\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.utils import validate_tags\nfrom dagster._core.execution.context.compute import OpExecutionContext\nfrom dagster._core.execution.context.input import build_input_context\nfrom dagster._core.execution.context.system import StepExecutionContext\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._serdes import pack_value\nfrom dagster._seven import get_system_temp_directory\nfrom dagster._utils import mkdir_p, safe_tempfile_path\nfrom dagster._utils.error import serializable_error_info_from_exc_info\nfrom papermill.engines import papermill_engines\nfrom papermill.iorw import load_notebook_node, write_ipynb\n\nfrom .compat import ExecutionError\nfrom .engine import DagstermillEngine\nfrom .errors import DagstermillError\nfrom .translator import DagsterTranslator\n\n\ndef _clean_path_for_windows(notebook_path: str) -> str:\n    """In windows, the notebook can't render in the Dagster UI unless the C: prefix is removed.\n    os.path.splitdrive will split the path into (drive, tail), so just return the tail.\n    """\n    return os.path.splitdrive(notebook_path)[1]\n\n\n# https://github.com/nteract/papermill/blob/17d4bbb3960c30c263bca835e48baf34322a3530/papermill/parameterize.py\ndef 
_find_first_tagged_cell_index(nb, tag):\n    parameters_indices = []\n    for idx, cell in enumerate(nb.cells):\n        if tag in cell.metadata.tags:\n            parameters_indices.append(idx)\n    if not parameters_indices:\n        return -1\n    return parameters_indices[0]\n\n\n# This is based on papermill.parameterize.parameterize_notebook\n# Typically, papermill injects the injected-parameters cell *below* the parameters cell\n# but we want to *replace* the parameters cell, which is what this function does.\ndef replace_parameters(context, nb, parameters):\n    """Assigned parameters into the appropriate place in the input notebook.\n\n    Args:\n        nb (NotebookNode): Executable notebook object\n        parameters (dict): Arbitrary keyword arguments to pass to the notebook parameters.\n    """\n    check.dict_param(parameters, "parameters")\n\n    # Copy the nb object to avoid polluting the input\n    nb = copy.deepcopy(nb)\n\n    # papermill method chooses translator based on kernel_name and language, but we just call the\n    # DagsterTranslator to generate parameter content based on the kernel_name\n    param_content = DagsterTranslator.codify(parameters)\n\n    newcell = nbformat.v4.new_code_cell(source=param_content)\n    newcell.metadata["tags"] = ["injected-parameters"]\n\n    param_cell_index = _find_first_tagged_cell_index(nb, "parameters")\n    injected_cell_index = _find_first_tagged_cell_index(nb, "injected-parameters")\n    if injected_cell_index >= 0:\n        # Replace the injected cell with a new version\n        before = nb.cells[:injected_cell_index]\n        after = nb.cells[injected_cell_index + 1 :]\n        check.int_value_param(param_cell_index, -1, "param_cell_index")\n        # We should have blown away the parameters cell if there is an injected-parameters cell\n    elif param_cell_index >= 0:\n        # Replace the parameter cell with the injected-parameters cell\n        before = nb.cells[:param_cell_index]\n        after = 
nb.cells[param_cell_index + 1 :]\n    else:\n        # Inject to the top of the notebook, presumably first cell includes dagstermill import\n        context.log.debug(\n            "Executing notebook with no tagged parameters cell: injecting boilerplate in first "\n            "cell."\n        )\n        before = []\n        after = nb.cells\n\n    nb.cells = before + [newcell] + after\n    nb.metadata.papermill["parameters"] = _seven.json.dumps(parameters)\n\n    return nb\n\n\ndef get_papermill_parameters(\n    step_context: StepExecutionContext,\n    inputs: Mapping[str, object],\n    output_log_path: str,\n    compute_descriptor: str,\n) -> Mapping[str, object]:\n    check.param_invariant(\n        isinstance(step_context.run_config, dict),\n        "step_context",\n        "StepExecutionContext must have valid run_config",\n    )\n\n    run_id = step_context.run_id\n    temp_dir = get_system_temp_directory()\n    marshal_dir = os.path.normpath(os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))\n    mkdir_p(marshal_dir)\n\n    if not isinstance(step_context.job, ReconstructableJob):\n        if compute_descriptor == "asset":\n            raise DagstermillError(\n                "Can't execute a dagstermill asset that is not reconstructable. "\n                "Use the reconstructable() function if executing from python"\n            )\n        else:\n            raise DagstermillError(\n                "Can't execute a dagstermill op from a job that is not reconstructable. 
"\n                "Use the reconstructable() function if executing from python"\n            )\n\n    dm_executable_dict = step_context.job.to_dict()\n\n    dm_context_dict = {\n        "output_log_path": output_log_path,\n        "marshal_dir": marshal_dir,\n        "run_config": step_context.run_config,\n    }\n\n    dm_node_handle_kwargs = step_context.node_handle._asdict()\n    dm_step_key = step_context.step.key\n\n    parameters = {}\n\n    parameters["__dm_context"] = dm_context_dict\n    parameters["__dm_executable_dict"] = dm_executable_dict\n    parameters["__dm_pipeline_run_dict"] = pack_value(step_context.dagster_run)\n    parameters["__dm_node_handle_kwargs"] = dm_node_handle_kwargs\n    parameters["__dm_instance_ref_dict"] = pack_value(step_context.instance.get_ref())\n    parameters["__dm_step_key"] = dm_step_key\n    parameters["__dm_input_names"] = list(inputs.keys())\n\n    return parameters\n\n\ndef execute_notebook(\n    step_context: StepExecutionContext,\n    name: str,\n    save_notebook_on_failure: bool,\n    notebook_path: str,\n    output_notebook_dir: str,\n    inputs: Mapping[str, object],\n) -> str:\n    with safe_tempfile_path() as output_log_path:\n        prefix = str(uuid.uuid4())\n        parameterized_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-inter.ipynb")\n\n        executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")\n\n        # Scaffold the registration here\n        nb = load_notebook_node(notebook_path)\n        compute_descriptor = "op"\n        nb_no_parameters = replace_parameters(\n            step_context,\n            nb,\n            get_papermill_parameters(\n                step_context,\n                inputs,\n                output_log_path,\n                compute_descriptor,\n            ),\n        )\n        write_ipynb(nb_no_parameters, parameterized_notebook_path)\n\n        try:\n            papermill_engines.register("dagstermill", DagstermillEngine)\n    
        papermill.execute_notebook(\n                input_path=parameterized_notebook_path,\n                output_path=executed_notebook_path,\n                engine_name="dagstermill",\n                log_output=True,\n            )\n\n        except Exception as ex:\n            step_context.log.warn(\n                "Error when attempting to materialize executed notebook: {exc}".format(\n                    exc=str(serializable_error_info_from_exc_info(sys.exc_info()))\n                )\n            )\n\n            if isinstance(ex, ExecutionError):\n                exception_name = ex.ename  # type: ignore\n                if exception_name in ["RetryRequested", "Failure"]:\n                    step_context.log.warn(\n                        f"Encountered raised {exception_name} in notebook. Use"\n                        " dagstermill.yield_event with RetryRequested or Failure to trigger"\n                        " their behavior."\n                    )\n\n            if save_notebook_on_failure:\n                storage_dir = step_context.instance.storage_directory()\n                storage_path = os.path.join(storage_dir, f"{prefix}-out.ipynb")\n                with open(storage_path, "wb") as dest_file_obj:\n                    with open(executed_notebook_path, "rb") as obj:\n                        dest_file_obj.write(obj.read())\n\n                step_context.log.info(f"Failed notebook written to {storage_path}")\n\n            raise\n\n    step_context.log.debug(f"Notebook execution complete for {name} at {executed_notebook_path}.")\n\n    return executed_notebook_path\n\n\ndef _handle_events_from_notebook(\n    step_context: StepExecutionContext, executed_notebook_path: str\n) -> Iterable:\n    # deferred import for perf\n    import scrapbook\n\n    output_nb = scrapbook.read_notebook(executed_notebook_path)\n\n    for output_name in step_context.op_def.output_dict.keys():\n        data_dict = output_nb.scraps.data_dict\n        if output_name 
in data_dict:\n            # read outputs that were passed out of process via io manager from `yield_result`\n            step_output_handle = StepOutputHandle(\n                step_key=step_context.step.key,\n                output_name=output_name,\n            )\n            output_context = step_context.get_output_context(step_output_handle)\n            io_manager = step_context.get_io_manager(step_output_handle)\n            value = io_manager.load_input(\n                build_input_context(\n                    upstream_output=output_context, dagster_type=output_context.dagster_type\n                )\n            )\n\n            yield Output(value, output_name)\n\n    for key, value in output_nb.scraps.items():\n        if key.startswith("event-"):\n            with open(value.data, "rb") as fd:\n                event = pickle.loads(fd.read())\n                if isinstance(event, (Failure, RetryRequested)):\n                    raise event\n                else:\n                    yield event\n\n\ndef _make_dagstermill_compute_fn(\n    dagster_factory_name: str,\n    name: str,\n    notebook_path: str,\n    output_notebook_name: Optional[str] = None,\n    asset_key_prefix: Optional[Sequence[str]] = None,\n    output_notebook: Optional[str] = None,\n    save_notebook_on_failure: bool = False,\n) -> Callable:\n    def _t_fn(op_context: OpExecutionContext, inputs: Mapping[str, object]) -> Iterable:\n        check.param_invariant(\n            isinstance(op_context.run_config, dict),\n            "context",\n            "StepExecutionContext must have valid run_config",\n        )\n\n        step_context = op_context.get_step_execution_context()\n\n        with tempfile.TemporaryDirectory() as output_notebook_dir:\n            executed_notebook_path = execute_notebook(\n                step_context,\n                name=name,\n                inputs=inputs,\n                save_notebook_on_failure=save_notebook_on_failure,\n                
notebook_path=notebook_path,\n                output_notebook_dir=output_notebook_dir,\n            )\n\n            if output_notebook_name is not None:\n                # yield output notebook binary stream as an op output\n                with open(executed_notebook_path, "rb") as fd:\n                    yield Output(fd.read(), output_notebook_name)\n\n            else:\n                # backcompat\n                executed_notebook_file_handle = None\n                try:\n                    # use binary mode when when moving the file since certain file_managers such as S3\n                    # may try to hash the contents\n                    with open(executed_notebook_path, "rb") as fd:\n                        executed_notebook_file_handle = op_context.resources.file_manager.write(\n                            fd, mode="wb", ext="ipynb"\n                        )\n                        executed_notebook_materialization_path = (\n                            executed_notebook_file_handle.path_desc\n                        )\n\n                    yield AssetMaterialization(\n                        asset_key=[*(asset_key_prefix or []), f"{name}_output_notebook"],\n                        description="Location of output notebook in file manager",\n                        metadata={\n                            "path": MetadataValue.path(executed_notebook_materialization_path),\n                        },\n                    )\n\n                except Exception:\n                    # if file manager writing errors, e.g. 
file manager is not provided, we throw a warning\n                    # and fall back to the previously stored temp executed notebook.\n                    op_context.log.warning(\n                        "Error when attempting to materialize executed notebook using file"\n                        " manager:"\n                        f" {serializable_error_info_from_exc_info(sys.exc_info())}\\nNow"\n                        " falling back to local: notebook execution was temporarily materialized"\n                        f" at {executed_notebook_path}\\nIf you have supplied a file manager and"\n                        " expect to use it for materializing the notebook, please include"\n                        ' "file_manager" in the `required_resource_keys` argument to'\n                        f" `{dagster_factory_name}`"\n                    )\n\n                if output_notebook is not None:\n                    yield Output(executed_notebook_file_handle, output_notebook)\n\n            yield from _handle_events_from_notebook(step_context, executed_notebook_path)\n\n    return _t_fn\n\n\n
[docs]def define_dagstermill_op(\n name: str,\n notebook_path: str,\n ins: Optional[Mapping[str, In]] = None,\n outs: Optional[Mapping[str, Out]] = None,\n config_schema: Optional[Union[Any, Mapping[str, Any]]] = None,\n required_resource_keys: Optional[Set[str]] = None,\n output_notebook_name: Optional[str] = None,\n asset_key_prefix: Optional[Union[Sequence[str], str]] = None,\n description: Optional[str] = None,\n tags: Optional[Mapping[str, Any]] = None,\n io_manager_key: Optional[str] = None,\n save_notebook_on_failure: bool = False,\n) -> OpDefinition:\n """Wrap a Jupyter notebook in a op.\n\n Arguments:\n name (str): The name of the op.\n notebook_path (str): Path to the backing notebook.\n ins (Optional[Mapping[str, In]]): The op's inputs.\n outs (Optional[Mapping[str, Out]]): The op's outputs. Your notebook should\n call :py:func:`~dagstermill.yield_result` to yield each of these outputs.\n required_resource_keys (Optional[Set[str]]): The string names of any required resources.\n output_notebook_name: (Optional[str]): If set, will be used as the name of an injected output\n of type of :py:class:`~dagster.BufferedIOBase` that is the file object of the executed\n notebook (in addition to the :py:class:`~dagster.AssetMaterialization` that is always\n created). 
It allows the downstream ops to access the executed notebook via a file\n object.\n asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the\n asset keys for materialized notebooks.\n description (Optional[str]): If set, description used for op.\n tags (Optional[Dict[str, str]]): If set, additional tags used to annotate op.\n Dagster uses the tag keys `notebook_path` and `kind`, which cannot be\n overwritten by the user.\n io_manager_key (Optional[str]): If using output_notebook_name, you can additionally provide\n a string key for the IO manager used to store the output notebook.\n If not provided, the default key output_notebook_io_manager will be used.\n save_notebook_on_failure (bool): If True and the notebook fails during execution, the failed notebook will be\n written to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\n Defaults to False.\n\n Returns:\n :py:class:`~dagster.OpDefinition`\n """\n check.str_param(name, "name")\n check.str_param(notebook_path, "notebook_path")\n check.bool_param(save_notebook_on_failure, "save_notebook_on_failure")\n\n required_resource_keys = set(\n check.opt_set_param(required_resource_keys, "required_resource_keys", of_type=str)\n )\n outs = check.opt_mapping_param(outs, "outs", key_type=str, value_type=Out)\n ins = check.opt_mapping_param(ins, "ins", key_type=str, value_type=In)\n\n if output_notebook_name is not None:\n io_mgr_key = check.opt_str_param(\n io_manager_key, "io_manager_key", default="output_notebook_io_manager"\n )\n required_resource_keys.add(io_mgr_key)\n outs = {\n **outs,\n cast(str, output_notebook_name): Out(io_manager_key=io_mgr_key),\n }\n\n if isinstance(asset_key_prefix, str):\n asset_key_prefix = [asset_key_prefix]\n\n asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)\n\n default_description = f"This op is backed by the notebook at {notebook_path}"\n description = 
check.opt_str_param(description, "description", default=default_description)\n\n user_tags = validate_tags(tags)\n if tags is not None:\n check.invariant(\n "notebook_path" not in tags,\n "user-defined op tags contains the `notebook_path` key, but the `notebook_path` key"\n " is reserved for use by Dagster",\n )\n check.invariant(\n "kind" not in tags,\n "user-defined op tags contains the `kind` key, but the `kind` key is reserved for"\n " use by Dagster",\n )\n default_tags = {"notebook_path": _clean_path_for_windows(notebook_path), "kind": "ipynb"}\n\n if safe_is_subclass(config_schema, Config):\n config_schema = infer_schema_from_config_class(cast(Type[Config], config_schema))\n\n return OpDefinition(\n name=name,\n compute_fn=_make_dagstermill_compute_fn(\n "define_dagstermill_op",\n name,\n notebook_path,\n output_notebook_name,\n asset_key_prefix=asset_key_prefix,\n save_notebook_on_failure=save_notebook_on_failure,\n ),\n ins=ins,\n outs=outs,\n config_schema=config_schema,\n required_resource_keys=required_resource_keys,\n description=description,\n tags={**user_tags, **default_tags},\n )
\n
", "current_page_name": "_modules/dagstermill/factory", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.factory"}, "io_managers": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.io_managers

\nimport os\nfrom pathlib import Path\nfrom typing import Any, List, Optional, Sequence\n\nimport dagster._check as check\nfrom dagster import (\n    AssetKey,\n    AssetMaterialization,\n    ConfigurableIOManagerFactory,\n    InitResourceContext,\n    IOManager,\n)\nfrom dagster._core.definitions.metadata import MetadataValue\nfrom dagster._core.execution.context.input import InputContext\nfrom dagster._core.execution.context.output import OutputContext\nfrom dagster._core.storage.io_manager import dagster_maintained_io_manager, io_manager\nfrom dagster._utils import mkdir_p\nfrom pydantic import Field\n\nfrom dagstermill.factory import _clean_path_for_windows\n\n\nclass OutputNotebookIOManager(IOManager):\n    def __init__(self, asset_key_prefix: Optional[Sequence[str]] = None):\n        self.asset_key_prefix = asset_key_prefix if asset_key_prefix else []\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        raise NotImplementedError\n\n    def load_input(self, context: InputContext) -> Any:\n        raise NotImplementedError\n\n\nclass LocalOutputNotebookIOManager(OutputNotebookIOManager):\n    def __init__(self, base_dir: str, asset_key_prefix: Optional[Sequence[str]] = None):\n        super(LocalOutputNotebookIOManager, self).__init__(asset_key_prefix=asset_key_prefix)\n        self.base_dir = base_dir\n        self.write_mode = "wb"\n        self.read_mode = "rb"\n\n    def _get_path(self, context: OutputContext) -> str:\n        """Automatically construct filepath."""\n        if context.has_asset_key:\n            keys = context.get_asset_identifier()\n        else:\n            keys = context.get_run_scoped_output_identifier()\n        return str(Path(self.base_dir, *keys).with_suffix(".ipynb"))\n\n    def handle_output(self, context: OutputContext, obj: bytes):\n        """obj: bytes."""\n        check.inst_param(context, "context", OutputContext)\n\n        # the output notebook itself is stored at output_file_path\n        
output_notebook_path = self._get_path(context)\n        mkdir_p(os.path.dirname(output_notebook_path))\n        with open(output_notebook_path, self.write_mode) as dest_file_obj:\n            dest_file_obj.write(obj)\n\n        metadata = {\n            "Executed notebook": MetadataValue.notebook(\n                _clean_path_for_windows(output_notebook_path)\n            )\n        }\n\n        if context.has_asset_key:\n            context.add_output_metadata(metadata)\n        else:\n            context.log_event(\n                AssetMaterialization(\n                    asset_key=AssetKey(\n                        [*self.asset_key_prefix, f"{context.step_key}_output_notebook"]\n                    ),\n                    metadata=metadata,\n                )\n            )\n\n    def load_input(self, context: InputContext) -> bytes:\n        check.inst_param(context, "context", InputContext)\n        # pass output notebook to downstream ops as File Object\n        output_context = check.not_none(context.upstream_output)\n        with open(self._get_path(output_context), self.read_mode) as file_obj:\n            return file_obj.read()\n\n\n
[docs]class ConfigurableLocalOutputNotebookIOManager(ConfigurableIOManagerFactory):\n """Built-in IO Manager for handling output notebook."""\n\n base_dir: Optional[str] = Field(\n default=None,\n description=(\n "Base directory to use for output notebooks. Defaults to the Dagster instance storage"\n " directory if not provided."\n ),\n )\n asset_key_prefix: List[str] = Field(\n default=[],\n description=(\n "Asset key prefix to apply to assets materialized for output notebooks. Defaults to no"\n " prefix."\n ),\n )\n\n @classmethod\n def _is_dagster_maintained(cls) -> bool:\n return True\n\n def create_io_manager(self, context: InitResourceContext) -> "LocalOutputNotebookIOManager":\n return LocalOutputNotebookIOManager(\n base_dir=self.base_dir or check.not_none(context.instance).storage_directory(),\n asset_key_prefix=self.asset_key_prefix,\n )
\n\n\n@dagster_maintained_io_manager\n@io_manager(config_schema=ConfigurableLocalOutputNotebookIOManager.to_config_schema())\ndef local_output_notebook_io_manager(init_context) -> LocalOutputNotebookIOManager:\n """Built-in IO Manager that handles output notebooks."""\n return ConfigurableLocalOutputNotebookIOManager.from_resource_context(init_context)\n
", "current_page_name": "_modules/dagstermill/io_managers", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.io_managers"}, "manager": {"alabaster_version": "0.7.13", "body": "

Source code for dagstermill.manager

\nimport os\nimport pickle\nimport uuid\nfrom typing import TYPE_CHECKING, AbstractSet, Any, Mapping, Optional, cast\n\nfrom dagster import (\n    AssetMaterialization,\n    AssetObservation,\n    ExpectationResult,\n    Failure,\n    LoggerDefinition,\n    ResourceDefinition,\n    StepExecutionContext,\n    TypeCheck,\n    _check as check,\n)\nfrom dagster._core.definitions.dependency import NodeHandle\nfrom dagster._core.definitions.events import RetryRequested\nfrom dagster._core.definitions.graph_definition import GraphDefinition\nfrom dagster._core.definitions.job_base import InMemoryJob\nfrom dagster._core.definitions.job_definition import JobDefinition\nfrom dagster._core.definitions.op_definition import OpDefinition\nfrom dagster._core.definitions.reconstruct import ReconstructableJob\nfrom dagster._core.definitions.resource_definition import ScopedResourcesBuilder\nfrom dagster._core.events import DagsterEvent\nfrom dagster._core.execution.api import create_execution_plan, scoped_job_context\nfrom dagster._core.execution.plan.outputs import StepOutputHandle\nfrom dagster._core.execution.plan.plan import ExecutionPlan\nfrom dagster._core.execution.plan.state import KnownExecutionState\nfrom dagster._core.execution.plan.step import ExecutionStep\nfrom dagster._core.execution.resources_init import (\n    get_required_resource_keys_to_init,\n    resource_initialization_event_generator,\n)\nfrom dagster._core.instance import DagsterInstance\nfrom dagster._core.instance.ref import InstanceRef\nfrom dagster._core.log_manager import DagsterLogManager\nfrom dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatus\nfrom dagster._core.system_config.objects import ResolvedRunConfig, ResourceConfig\nfrom dagster._core.utils import make_new_run_id\nfrom dagster._loggers import colored_console_logger\nfrom dagster._serdes import unpack_value\nfrom dagster._utils import EventGenerationManager\n\nfrom .context import DagstermillExecutionContext, 
DagstermillRuntimeExecutionContext\nfrom .errors import DagstermillError\nfrom .serialize import PICKLE_PROTOCOL\n\nif TYPE_CHECKING:\n    from dagster._core.definitions.node_definition import NodeDefinition\n\n\nclass DagstermillResourceEventGenerationManager(EventGenerationManager):\n    """Utility class to explicitly manage setup/teardown of resource events. Overrides the default\n    `generate_teardown_events` method so that teardown is deferred until explicitly called by the\n    dagstermill Manager.\n    """\n\n    def generate_teardown_events(self):\n        return iter(())\n\n    def teardown(self):\n        return [\n            teardown_event\n            for teardown_event in super(\n                DagstermillResourceEventGenerationManager, self\n            ).generate_teardown_events()\n        ]\n\n\nclass Manager:\n    def __init__(self):\n        self.job = None\n        self.op_def: Optional[NodeDefinition] = None\n        self.in_job: bool = False\n        self.marshal_dir: Optional[str] = None\n        self.context = None\n        self.resource_manager = None\n\n    def _setup_resources(\n        self,\n        resource_defs: Mapping[str, ResourceDefinition],\n        resource_configs: Mapping[str, ResourceConfig],\n        log_manager: DagsterLogManager,\n        execution_plan: Optional[ExecutionPlan],\n        dagster_run: Optional[DagsterRun],\n        resource_keys_to_init: Optional[AbstractSet[str]],\n        instance: Optional[DagsterInstance],\n        emit_persistent_events: Optional[bool],\n    ):\n        """Drop-in replacement for\n        `dagster._core.execution.resources_init.resource_initialization_manager`.  
It uses a\n        `DagstermillResourceEventGenerationManager` and explicitly calls `teardown` on it.\n        """\n        generator = resource_initialization_event_generator(\n            resource_defs=resource_defs,\n            resource_configs=resource_configs,\n            log_manager=log_manager,\n            execution_plan=execution_plan,\n            dagster_run=dagster_run,\n            resource_keys_to_init=resource_keys_to_init,\n            instance=instance,\n            emit_persistent_events=emit_persistent_events,\n        )\n        self.resource_manager = DagstermillResourceEventGenerationManager(\n            generator, ScopedResourcesBuilder\n        )\n        return self.resource_manager\n\n    def reconstitute_job_context(\n        self,\n        executable_dict: Mapping[str, Any],\n        job_run_dict: Mapping[str, Any],\n        node_handle_kwargs: Mapping[str, Any],\n        instance_ref_dict: Mapping[str, Any],\n        step_key: str,\n        output_log_path: Optional[str] = None,\n        marshal_dir: Optional[str] = None,\n        run_config: Optional[Mapping[str, Any]] = None,\n    ):\n        """Reconstitutes a context for dagstermill-managed execution.\n\n        You'll see this function called to reconstruct a job context within the ``injected\n        parameters`` cell of a dagstermill output notebook. Users should not call this function\n        interactively except when debugging output notebooks.\n\n        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a\n        context for interactive exploration and development. 
This call will be replaced by one to\n        :func:`dagstermill.reconstitute_job_context` when the notebook is executed by\n        dagstermill.\n        """\n        check.opt_str_param(output_log_path, "output_log_path")\n        check.opt_str_param(marshal_dir, "marshal_dir")\n        run_config = check.opt_mapping_param(run_config, "run_config", key_type=str)\n        check.mapping_param(job_run_dict, "job_run_dict")\n        check.mapping_param(executable_dict, "executable_dict")\n        check.mapping_param(node_handle_kwargs, "node_handle_kwargs")\n        check.mapping_param(instance_ref_dict, "instance_ref_dict")\n        check.str_param(step_key, "step_key")\n\n        job = ReconstructableJob.from_dict(executable_dict)\n        job_def = job.get_definition()\n\n        try:\n            instance_ref = unpack_value(instance_ref_dict, InstanceRef)\n            instance = DagsterInstance.from_ref(instance_ref)\n        except Exception as err:\n            raise DagstermillError(\n                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"\n            ) from err\n\n        dagster_run = unpack_value(job_run_dict, DagsterRun)\n\n        node_handle = NodeHandle.from_dict(node_handle_kwargs)\n        op = job_def.get_node(node_handle)\n        op_def = op.definition\n\n        self.marshal_dir = marshal_dir\n        self.in_job = True\n        self.op_def = op_def\n        self.job = job\n\n        ResolvedRunConfig.build(job_def, run_config)\n\n        execution_plan = create_execution_plan(\n            self.job,\n            run_config,\n            step_keys_to_execute=dagster_run.step_keys_to_execute,\n        )\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            instance,\n            scoped_resources_builder_cm=self._setup_resources,\n            # Set this flag even though we're not in test for clearer error reporting\n   
         raise_on_error=True,\n        ) as job_context:\n            known_state = None\n            if dagster_run.parent_run_id:\n                known_state = KnownExecutionState.build_for_reexecution(\n                    instance=instance,\n                    parent_run=check.not_none(instance.get_run_by_id(dagster_run.parent_run_id)),\n                )\n            self.context = DagstermillRuntimeExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=run_config.get("ops", {}).get(op.name, {}).get("config"),\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op.name,\n                node_handle=node_handle,\n                step_context=cast(\n                    StepExecutionContext,\n                    job_context.for_step(\n                        cast(ExecutionStep, execution_plan.get_step_by_key(step_key)),\n                        known_state=known_state,\n                    ),\n                ),\n            )\n\n        return self.context\n\n    def get_context(\n        self,\n        op_config: Any = None,\n        resource_defs: Optional[Mapping[str, ResourceDefinition]] = None,\n        logger_defs: Optional[Mapping[str, LoggerDefinition]] = None,\n        run_config: Optional[dict] = None,\n    ) -> DagstermillExecutionContext:\n        """Get a dagstermill execution context for interactive exploration and development.\n\n        Args:\n            op_config (Optional[Any]): If specified, this value will be made available on the\n                context as its ``op_config`` property.\n            resource_defs (Optional[Mapping[str, ResourceDefinition]]): Specifies resources to provide to context.\n            logger_defs (Optional[Mapping[str, LoggerDefinition]]): Specifies loggers to provide to context.\n            
run_config(Optional[dict]): The config dict with which to construct\n                the context.\n\n        Returns:\n            :py:class:`~dagstermill.DagstermillExecutionContext`\n        """\n        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)\n\n        # If we are running non-interactively, and there is already a context reconstituted, return\n        # that context rather than overwriting it.\n        if self.context is not None and isinstance(\n            self.context, DagstermillRuntimeExecutionContext\n        ):\n            return self.context\n\n        if not logger_defs:\n            logger_defs = {"dagstermill": colored_console_logger}\n            run_config["loggers"] = {"dagstermill": {}}\n        logger_defs = check.opt_mapping_param(logger_defs, "logger_defs")\n        resource_defs = check.opt_mapping_param(resource_defs, "resource_defs")\n\n        op_def = OpDefinition(\n            name="this_op",\n            compute_fn=lambda *args, **kwargs: None,\n            description="Ephemeral op constructed by dagstermill.get_context()",\n            required_resource_keys=set(resource_defs.keys()),\n        )\n\n        job_def = JobDefinition(\n            graph_def=GraphDefinition(name="ephemeral_dagstermill_pipeline", node_defs=[op_def]),\n            logger_defs=logger_defs,\n            resource_defs=resource_defs,\n        )\n\n        run_id = make_new_run_id()\n\n        # construct stubbed DagsterRun for notebook exploration...\n        # The actual dagster run during job execution will be serialized and reconstituted\n        # in the `reconstitute_job_context` call\n        dagster_run = DagsterRun(\n            job_name=job_def.name,\n            run_id=run_id,\n            run_config=run_config,\n            step_keys_to_execute=None,\n            status=DagsterRunStatus.NOT_STARTED,\n            tags=None,\n        )\n\n        self.in_job = False\n        self.op_def = op_def\n        self.job = 
job_def\n\n        job = InMemoryJob(job_def)\n        execution_plan = create_execution_plan(job, run_config)\n\n        with scoped_job_context(\n            execution_plan,\n            job,\n            run_config,\n            dagster_run,\n            DagsterInstance.ephemeral(),\n            scoped_resources_builder_cm=self._setup_resources,\n        ) as job_context:\n            self.context = DagstermillExecutionContext(\n                job_context=job_context,\n                job_def=job_def,\n                op_config=op_config,\n                resource_keys_to_init=get_required_resource_keys_to_init(\n                    execution_plan,\n                    job_def,\n                ),\n                op_name=op_def.name,\n                node_handle=NodeHandle(op_def.name, parent=None),\n            )\n\n        return self.context\n\n    def yield_result(self, value, output_name="result"):\n        """Yield a result directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            value (Any): The value to yield.\n            output_name (Optional[str]): The name of the result to yield (default: ``'result'``).\n        """\n        if not self.in_job:\n            return value\n\n        # deferred import for perf\n        import scrapbook\n\n        if not self.op_def.has_output(output_name):\n            raise DagstermillError(\n                f"Op {self.op_def.name} does not have output named {output_name}.Expected one of"\n                f" {[str(output_def.name) for output_def in self.op_def.output_defs]}"\n            )\n\n        # pass output value cross process boundary using io manager\n        step_context = self.context._step_context  # noqa: SLF001\n        # Note: yield_result currently does not support DynamicOutput\n\n        # dagstermill assets do not support yielding additional results within the notebook:\n        if 
len(step_context.job_def.asset_layer.asset_keys) > 0:\n            raise DagstermillError(\n                "dagstermill assets do not currently support dagstermill.yield_result"\n            )\n\n        step_output_handle = StepOutputHandle(\n            step_key=step_context.step.key, output_name=output_name\n        )\n        output_context = step_context.get_output_context(step_output_handle)\n        io_manager = step_context.get_io_manager(step_output_handle)\n\n        # Note that we assume io manager is symmetric, i.e handle_input(handle_output(X)) == X\n        io_manager.handle_output(output_context, value)\n\n        # record that the output has been yielded\n        scrapbook.glue(output_name, "")\n\n    def yield_event(self, dagster_event):\n        """Yield a dagster event directly from notebook code.\n\n        When called interactively or in development, returns its input.\n\n        Args:\n            dagster_event (Union[:class:`dagster.AssetMaterialization`, :class:`dagster.ExpectationResult`, :class:`dagster.TypeCheck`, :class:`dagster.Failure`, :class:`dagster.RetryRequested`]):\n                An event to yield back to Dagster.\n        """\n        valid_types = (\n            AssetMaterialization,\n            AssetObservation,\n            ExpectationResult,\n            TypeCheck,\n            Failure,\n            RetryRequested,\n        )\n        if not isinstance(dagster_event, valid_types):\n            raise DagstermillError(\n                f"Received invalid type {dagster_event} in yield_event. 
Expected a Dagster event"\n                f" type, one of {valid_types}."\n            )\n\n        if not self.in_job:\n            return dagster_event\n\n        # deferred import for perf\n        import scrapbook\n\n        event_id = f"event-{uuid.uuid4()}"\n        out_file_path = os.path.join(self.marshal_dir, event_id)\n        with open(out_file_path, "wb") as fd:\n            fd.write(pickle.dumps(dagster_event, PICKLE_PROTOCOL))\n\n        scrapbook.glue(event_id, out_file_path)\n\n    def teardown_resources(self):\n        if self.resource_manager is not None:\n            self.resource_manager.teardown()\n\n    def load_input_parameter(self, input_name: str):\n        # load input from source\n        dm_context = check.not_none(self.context)\n        if not isinstance(dm_context, DagstermillRuntimeExecutionContext):\n            check.failed("Expected DagstermillRuntimeExecutionContext")\n        step_context = dm_context.step_context\n        step_input = step_context.step.step_input_named(input_name)\n        input_def = step_context.op_def.input_def_named(input_name)\n        for event_or_input_value in step_input.source.load_input_object(step_context, input_def):\n            if isinstance(event_or_input_value, DagsterEvent):\n                continue\n            else:\n                return event_or_input_value\n\n\nMANAGER_FOR_NOTEBOOK_INSTANCE = Manager()\n
", "current_page_name": "_modules/dagstermill/manager", "customsidebar": null, "favicon_url": null, "logo_url": null, "parents": [{"link": "../../", "title": "Module code"}], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "title": "dagstermill.manager"}}} \ No newline at end of file diff --git a/docs/content/api/searchindex.json b/docs/content/api/searchindex.json index d2d4bc2b7b92c..6766972aa34dc 100644 --- a/docs/content/api/searchindex.json +++ b/docs/content/api/searchindex.json @@ -1 +1 @@ -{"docnames": ["index", "sections/api/apidocs/asset-checks", "sections/api/apidocs/assets", "sections/api/apidocs/cli", "sections/api/apidocs/config", "sections/api/apidocs/definitions", "sections/api/apidocs/dynamic", "sections/api/apidocs/errors", "sections/api/apidocs/execution", "sections/api/apidocs/graphs", "sections/api/apidocs/hooks", "sections/api/apidocs/internals", "sections/api/apidocs/io-managers", "sections/api/apidocs/jobs", "sections/api/apidocs/libraries/dagster-airbyte", "sections/api/apidocs/libraries/dagster-airflow", "sections/api/apidocs/libraries/dagster-aws", "sections/api/apidocs/libraries/dagster-azure", "sections/api/apidocs/libraries/dagster-celery", "sections/api/apidocs/libraries/dagster-celery-docker", "sections/api/apidocs/libraries/dagster-celery-k8s", "sections/api/apidocs/libraries/dagster-census", "sections/api/apidocs/libraries/dagster-dask", "sections/api/apidocs/libraries/dagster-databricks", "sections/api/apidocs/libraries/dagster-datadog", "sections/api/apidocs/libraries/dagster-datahub", "sections/api/apidocs/libraries/dagster-dbt", "sections/api/apidocs/libraries/dagster-docker", "sections/api/apidocs/libraries/dagster-duckdb", "sections/api/apidocs/libraries/dagster-duckdb-pandas", "sections/api/apidocs/libraries/dagster-duckdb-polars", "sections/api/apidocs/libraries/dagster-duckdb-pyspark", "sections/api/apidocs/libraries/dagster-embedded-elt", 
"sections/api/apidocs/libraries/dagster-fivetran", "sections/api/apidocs/libraries/dagster-gcp", "sections/api/apidocs/libraries/dagster-gcp-pandas", "sections/api/apidocs/libraries/dagster-gcp-pyspark", "sections/api/apidocs/libraries/dagster-ge", "sections/api/apidocs/libraries/dagster-github", "sections/api/apidocs/libraries/dagster-graphql", "sections/api/apidocs/libraries/dagster-k8s", "sections/api/apidocs/libraries/dagster-mlflow", "sections/api/apidocs/libraries/dagster-msteams", "sections/api/apidocs/libraries/dagster-mysql", "sections/api/apidocs/libraries/dagster-pagerduty", "sections/api/apidocs/libraries/dagster-pandas", "sections/api/apidocs/libraries/dagster-pandera", "sections/api/apidocs/libraries/dagster-papertrail", "sections/api/apidocs/libraries/dagster-postgres", "sections/api/apidocs/libraries/dagster-prometheus", "sections/api/apidocs/libraries/dagster-pyspark", "sections/api/apidocs/libraries/dagster-shell", "sections/api/apidocs/libraries/dagster-slack", "sections/api/apidocs/libraries/dagster-snowflake", "sections/api/apidocs/libraries/dagster-snowflake-pandas", "sections/api/apidocs/libraries/dagster-snowflake-pyspark", "sections/api/apidocs/libraries/dagster-spark", "sections/api/apidocs/libraries/dagster-ssh", "sections/api/apidocs/libraries/dagster-twilio", "sections/api/apidocs/libraries/dagster-wandb", "sections/api/apidocs/libraries/dagstermill", "sections/api/apidocs/loggers", "sections/api/apidocs/memoization", "sections/api/apidocs/ops", "sections/api/apidocs/partitions", "sections/api/apidocs/repositories", "sections/api/apidocs/resources", "sections/api/apidocs/schedules-sensors", "sections/api/apidocs/types", "sections/api/apidocs/utilities"], "envversion": {"sphinx": 56, "sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, 
"sphinx.domains.std": 2, "sphinx.ext.viewcode": 1}, "filenames": ["index.rst", "sections/api/apidocs/asset-checks.rst", "sections/api/apidocs/assets.rst", "sections/api/apidocs/cli.rst", "sections/api/apidocs/config.rst", "sections/api/apidocs/definitions.rst", "sections/api/apidocs/dynamic.rst", "sections/api/apidocs/errors.rst", "sections/api/apidocs/execution.rst", "sections/api/apidocs/graphs.rst", "sections/api/apidocs/hooks.rst", "sections/api/apidocs/internals.rst", "sections/api/apidocs/io-managers.rst", "sections/api/apidocs/jobs.rst", "sections/api/apidocs/libraries/dagster-airbyte.rst", "sections/api/apidocs/libraries/dagster-airflow.rst", "sections/api/apidocs/libraries/dagster-aws.rst", "sections/api/apidocs/libraries/dagster-azure.rst", "sections/api/apidocs/libraries/dagster-celery.rst", "sections/api/apidocs/libraries/dagster-celery-docker.rst", "sections/api/apidocs/libraries/dagster-celery-k8s.rst", "sections/api/apidocs/libraries/dagster-census.rst", "sections/api/apidocs/libraries/dagster-dask.rst", "sections/api/apidocs/libraries/dagster-databricks.rst", "sections/api/apidocs/libraries/dagster-datadog.rst", "sections/api/apidocs/libraries/dagster-datahub.rst", "sections/api/apidocs/libraries/dagster-dbt.rst", "sections/api/apidocs/libraries/dagster-docker.rst", "sections/api/apidocs/libraries/dagster-duckdb.rst", "sections/api/apidocs/libraries/dagster-duckdb-pandas.rst", "sections/api/apidocs/libraries/dagster-duckdb-polars.rst", "sections/api/apidocs/libraries/dagster-duckdb-pyspark.rst", "sections/api/apidocs/libraries/dagster-embedded-elt.rst", "sections/api/apidocs/libraries/dagster-fivetran.rst", "sections/api/apidocs/libraries/dagster-gcp.rst", "sections/api/apidocs/libraries/dagster-gcp-pandas.rst", "sections/api/apidocs/libraries/dagster-gcp-pyspark.rst", "sections/api/apidocs/libraries/dagster-ge.rst", "sections/api/apidocs/libraries/dagster-github.rst", "sections/api/apidocs/libraries/dagster-graphql.rst", 
"sections/api/apidocs/libraries/dagster-k8s.rst", "sections/api/apidocs/libraries/dagster-mlflow.rst", "sections/api/apidocs/libraries/dagster-msteams.rst", "sections/api/apidocs/libraries/dagster-mysql.rst", "sections/api/apidocs/libraries/dagster-pagerduty.rst", "sections/api/apidocs/libraries/dagster-pandas.rst", "sections/api/apidocs/libraries/dagster-pandera.rst", "sections/api/apidocs/libraries/dagster-papertrail.rst", "sections/api/apidocs/libraries/dagster-postgres.rst", "sections/api/apidocs/libraries/dagster-prometheus.rst", "sections/api/apidocs/libraries/dagster-pyspark.rst", "sections/api/apidocs/libraries/dagster-shell.rst", "sections/api/apidocs/libraries/dagster-slack.rst", "sections/api/apidocs/libraries/dagster-snowflake.rst", "sections/api/apidocs/libraries/dagster-snowflake-pandas.rst", "sections/api/apidocs/libraries/dagster-snowflake-pyspark.rst", "sections/api/apidocs/libraries/dagster-spark.rst", "sections/api/apidocs/libraries/dagster-ssh.rst", "sections/api/apidocs/libraries/dagster-twilio.rst", "sections/api/apidocs/libraries/dagster-wandb.rst", "sections/api/apidocs/libraries/dagstermill.rst", "sections/api/apidocs/loggers.rst", "sections/api/apidocs/memoization.rst", "sections/api/apidocs/ops.rst", "sections/api/apidocs/partitions.rst", "sections/api/apidocs/repositories.rst", "sections/api/apidocs/resources.rst", "sections/api/apidocs/schedules-sensors.rst", "sections/api/apidocs/types.rst", "sections/api/apidocs/utilities.rst"], "objects": {"dagster": [[67, 0, 1, "", "AddDynamicPartitionsRequest"], [64, 0, 1, "", "AllPartitionMapping"], [4, 0, 1, "", "Array"], [1, 0, 1, "", "AssetCheckKey"], [1, 0, 1, "", "AssetCheckResult"], [1, 0, 1, "", "AssetCheckSeverity"], [1, 0, 1, "", "AssetCheckSpec"], [2, 0, 1, "", "AssetDep"], [8, 0, 1, "", "AssetExecutionContext"], [2, 0, 1, "", "AssetIn"], [63, 0, 1, "", "AssetKey"], [63, 0, 1, "", "AssetMaterialization"], [2, 0, 1, "", "AssetOut"], [2, 0, 1, "", "AssetSelection"], [67, 0, 1, "", 
"AssetSensorDefinition"], [2, 0, 1, "", "AssetSpec"], [2, 0, 1, "", "AssetValueLoader"], [2, 0, 1, "", "AssetsDefinition"], [2, 0, 1, "", "AutoMaterializePolicy"], [2, 0, 1, "", "AutoMaterializeRule"], [64, 0, 1, "", "BackfillPolicy"], [63, 0, 1, "", "Backoff"], [4, 2, 1, "", "BoolSource"], [4, 0, 1, "", "Config"], [4, 0, 1, "", "ConfigMapping"], [4, 0, 1, "", "ConfigSchema"], [12, 0, 1, "", "ConfigurableIOManager"], [12, 0, 1, "", "ConfigurableIOManagerFactory"], [66, 0, 1, "", "ConfigurableResource"], [63, 0, 1, "", "DagsterAssetMetadataValue"], [7, 4, 1, "", "DagsterConfigMappingFunctionError"], [7, 4, 1, "", "DagsterError"], [8, 0, 1, "", "DagsterEvent"], [7, 4, 1, "", "DagsterEventLogInvalidForRun"], [8, 0, 1, "", "DagsterEventType"], [7, 4, 1, "", "DagsterExecutionStepExecutionError"], [7, 4, 1, "", "DagsterExecutionStepNotFoundError"], [11, 0, 1, "", "DagsterInstance"], [7, 4, 1, "", "DagsterInvalidConfigDefinitionError"], [7, 4, 1, "", "DagsterInvalidConfigError"], [7, 4, 1, "", "DagsterInvalidDefinitionError"], [7, 4, 1, "", "DagsterInvalidSubsetError"], [7, 4, 1, "", "DagsterInvariantViolationError"], [61, 0, 1, "", "DagsterLogManager"], [7, 4, 1, "", "DagsterResourceFunctionError"], [11, 0, 1, "", "DagsterRun"], [63, 0, 1, "", "DagsterRunMetadataValue"], [7, 4, 1, "", "DagsterRunNotFoundError"], [11, 0, 1, "", "DagsterRunStatus"], [7, 4, 1, "", "DagsterStepOutputNotFoundError"], [7, 4, 1, "", "DagsterSubprocessError"], [68, 0, 1, "", "DagsterType"], [7, 4, 1, "", "DagsterTypeCheckDidNotPass"], [7, 4, 1, "", "DagsterTypeCheckError"], [68, 0, 1, "", "DagsterTypeLoader"], [68, 0, 1, "", "DagsterTypeLoaderContext"], [7, 4, 1, "", "DagsterUnknownResourceError"], [7, 4, 1, "", "DagsterUnmetExecutorRequirementsError"], [7, 4, 1, "", "DagsterUserCodeExecutionError"], [64, 0, 1, "", "DailyPartitionsDefinition"], [5, 0, 1, "", "Definitions"], [67, 0, 1, "", "DeleteDynamicPartitionsRequest"], [9, 0, 1, "", "DependencyDefinition"], [6, 0, 1, "", "DynamicOut"], [6, 
0, 1, "", "DynamicOutput"], [64, 0, 1, "", "DynamicPartitionsDefinition"], [4, 0, 1, "", "Enum"], [4, 0, 1, "", "EnumValue"], [11, 0, 1, "", "EventLogEntry"], [11, 0, 1, "", "EventLogRecord"], [11, 0, 1, "", "EventRecordsFilter"], [8, 0, 1, "", "ExecuteInProcessResult"], [11, 0, 1, "", "Executor"], [11, 0, 1, "", "ExecutorDefinition"], [63, 0, 1, "", "ExpectationResult"], [69, 0, 1, "", "ExperimentalWarning"], [63, 0, 1, "", "Failure"], [4, 0, 1, "", "Field"], [11, 0, 1, "", "FileHandle"], [12, 5, 1, "", "FilesystemIOManager"], [63, 0, 1, "", "FloatMetadataValue"], [2, 0, 1, "", "FreshnessPolicy"], [67, 0, 1, "", "FreshnessPolicySensorDefinition"], [9, 0, 1, "", "GraphDefinition"], [9, 0, 1, "", "GraphIn"], [9, 0, 1, "", "GraphOut"], [10, 0, 1, "", "HookContext"], [10, 0, 1, "", "HookDefinition"], [64, 0, 1, "", "HourlyPartitionsDefinition"], [12, 0, 1, "", "IOManager"], [12, 0, 1, "", "IOManagerDefinition"], [64, 0, 1, "", "IdentityPartitionMapping"], [63, 0, 1, "", "In"], [12, 5, 1, "", "InMemoryIOManager"], [11, 0, 1, "", "InitExecutorContext"], [61, 0, 1, "", "InitLoggerContext"], [66, 0, 1, "", "InitResourceContext"], [12, 0, 1, "", "InputContext"], [12, 0, 1, "", "InputManager"], [12, 0, 1, "", "InputManagerDefinition"], [9, 0, 1, "", "InputMapping"], [63, 0, 1, "", "IntMetadataValue"], [4, 2, 1, "", "IntSource"], [63, 0, 1, "", "Jitter"], [13, 0, 1, "", "JobDefinition"], [8, 0, 1, "", "JobExecutionResult"], [67, 0, 1, "", "JobSelector"], [63, 0, 1, "", "JsonMetadataValue"], [64, 0, 1, "", "LastPartitionMapping"], [11, 0, 1, "", "LocalFileHandle"], [61, 0, 1, "", "LoggerDefinition"], [62, 2, 1, "", "MEMOIZED_RUN_TAG"], [4, 0, 1, "", "Map"], [63, 0, 1, "", "MarkdownMetadataValue"], [2, 0, 1, "", "MaterializeResult"], [62, 0, 1, "", "MemoizableIOManager"], [63, 0, 1, "", "MetadataEntry"], [63, 0, 1, "", "MetadataValue"], [64, 0, 1, "", "MonthlyPartitionsDefinition"], [67, 0, 1, "", "MultiAssetSensorDefinition"], [67, 0, 1, "", 
"MultiAssetSensorEvaluationContext"], [9, 0, 1, "", "MultiDependencyDefinition"], [64, 0, 1, "", "MultiPartitionKey"], [64, 0, 1, "", "MultiPartitionMapping"], [64, 0, 1, "", "MultiPartitionsDefinition"], [64, 0, 1, "", "MultiToSingleDimensionPartitionMapping"], [9, 0, 1, "", "NodeInvocation"], [4, 0, 1, "", "Noneable"], [63, 0, 1, "", "NotebookMetadataValue"], [68, 2, 1, "", "Nothing"], [63, 0, 1, "", "OpDefinition"], [8, 0, 1, "", "OpExecutionContext"], [62, 0, 1, "", "OpVersionContext"], [63, 0, 1, "", "Out"], [63, 0, 1, "", "Output"], [12, 0, 1, "", "OutputContext"], [9, 0, 1, "", "OutputMapping"], [64, 0, 1, "", "PartitionKeyRange"], [64, 0, 1, "", "PartitionMapping"], [64, 0, 1, "", "PartitionedConfig"], [64, 0, 1, "", "PartitionsDefinition"], [63, 0, 1, "", "PathMetadataValue"], [4, 0, 1, "", "Permissive"], [4, 0, 1, "", "PermissiveConfig"], [63, 0, 1, "", "PythonArtifactMetadataValue"], [68, 6, 1, "", "PythonObjectDagsterType"], [8, 0, 1, "", "ReexecutionOptions"], [65, 0, 1, "", "RepositoryData"], [65, 0, 1, "", "RepositoryDefinition"], [67, 0, 1, "", "RepositorySelector"], [66, 0, 1, "", "ResourceDefinition"], [62, 0, 1, "", "ResourceVersionContext"], [63, 0, 1, "", "RetryPolicy"], [63, 0, 1, "", "RetryRequested"], [4, 0, 1, "", "RunConfig"], [67, 0, 1, "", "RunFailureSensorContext"], [67, 0, 1, "", "RunRequest"], [11, 0, 1, "", "RunShardedEventsCursor"], [67, 0, 1, "", "RunStatusSensorContext"], [67, 0, 1, "", "RunStatusSensorDefinition"], [11, 0, 1, "", "RunsFilter"], [4, 0, 1, "", "ScalarUnion"], [67, 0, 1, "", "ScheduleDefinition"], [67, 0, 1, "", "ScheduleEvaluationContext"], [4, 0, 1, "", "Selector"], [67, 0, 1, "", "SensorDefinition"], [67, 0, 1, "", "SensorResult"], [4, 0, 1, "", "Shape"], [67, 0, 1, "", "SkipReason"], [2, 0, 1, "", "SourceAsset"], [62, 0, 1, "", "SourceHashVersionStrategy"], [64, 0, 1, "", "SpecificPartitionsPartitionMapping"], [64, 0, 1, "", "StaticPartitionMapping"], [64, 0, 1, "", "StaticPartitionsDefinition"], [11, 0, 1, "", 
"StepExecutionContext"], [11, 0, 1, "", "StepLauncher"], [11, 0, 1, "", "StepRunRef"], [4, 2, 1, "", "StringSource"], [63, 0, 1, "", "TableColumn"], [63, 0, 1, "", "TableColumnConstraints"], [63, 0, 1, "", "TableConstraints"], [63, 0, 1, "", "TableMetadataValue"], [63, 0, 1, "", "TableRecord"], [63, 0, 1, "", "TableSchema"], [63, 0, 1, "", "TableSchemaMetadataValue"], [63, 0, 1, "", "TextMetadataValue"], [64, 0, 1, "", "TimeWindow"], [64, 0, 1, "", "TimeWindowPartitionMapping"], [64, 0, 1, "", "TimeWindowPartitionsDefinition"], [63, 0, 1, "", "TypeCheck"], [8, 0, 1, "", "TypeCheckContext"], [12, 0, 1, "", "UPathIOManager"], [63, 0, 1, "", "UrlMetadataValue"], [62, 0, 1, "", "VersionStrategy"], [64, 0, 1, "", "WeeklyPartitionsDefinition"], [2, 6, 1, "", "asset"], [1, 6, 1, "", "asset_check"], [67, 6, 1, "", "asset_sensor"], [8, 6, 1, "", "build_asset_context"], [67, 6, 1, "", "build_freshness_policy_sensor_context"], [10, 6, 1, "", "build_hook_context"], [61, 6, 1, "", "build_init_logger_context"], [66, 6, 1, "", "build_init_resource_context"], [12, 6, 1, "", "build_input_context"], [67, 6, 1, "", "build_multi_asset_sensor_context"], [8, 6, 1, "", "build_op_context"], [12, 6, 1, "", "build_output_context"], [13, 6, 1, "", "build_reconstructable_job"], [66, 6, 1, "", "build_resources"], [67, 6, 1, "", "build_run_status_sensor_context"], [67, 6, 1, "", "build_schedule_context"], [67, 6, 1, "", "build_schedule_from_partitioned_job"], [67, 6, 1, "", "build_sensor_context"], [68, 6, 1, "", "check_dagster_type"], [69, 6, 1, "", "config_from_files"], [69, 6, 1, "", "config_from_pkg_resources"], [69, 6, 1, "", "config_from_yaml_strings"], [4, 6, 1, "", "configured"], [5, 6, 1, "", "create_repository_using_definitions_args"], [68, 6, 1, "", "dagster_type_loader"], [64, 6, 1, "", "daily_partitioned_config"], [2, 6, 1, "", "define_asset_job"], [64, 6, 1, "", "dynamic_partitioned_config"], [8, 6, 1, "", "execute_job"], [11, 6, 1, "", "executor"], [10, 6, 1, "", "failure_hook"], 
[69, 6, 1, "", "file_relative_path"], [67, 6, 1, "", "freshness_policy_sensor"], [12, 5, 1, "", "fs_io_manager"], [69, 6, 1, "", "get_dagster_logger"], [9, 6, 1, "", "graph"], [2, 6, 1, "", "graph_asset"], [2, 6, 1, "", "graph_multi_asset"], [64, 6, 1, "", "hourly_partitioned_config"], [8, 5, 1, "", "in_process_executor"], [12, 6, 1, "", "input_manager"], [8, 6, 1, "", "instance_for_test"], [12, 6, 1, "", "io_manager"], [13, 6, 1, "", "job"], [2, 6, 1, "", "load_assets_from_current_module"], [2, 6, 1, "", "load_assets_from_modules"], [2, 6, 1, "", "load_assets_from_package_module"], [2, 6, 1, "", "load_assets_from_package_name"], [11, 5, 1, "", "local_file_manager"], [61, 6, 1, "", "logger"], [69, 6, 1, "", "make_email_on_run_failure_sensor"], [68, 6, 1, "", "make_python_type_usable_as_dagster_type"], [66, 6, 1, "", "make_values_resource"], [8, 6, 1, "", "materialize"], [8, 6, 1, "", "materialize_to_memory"], [12, 5, 1, "", "mem_io_manager"], [64, 6, 1, "", "monthly_partitioned_config"], [2, 6, 1, "", "multi_asset"], [67, 6, 1, "", "multi_asset_sensor"], [8, 5, 1, "", "multi_or_in_process_executor"], [8, 5, 1, "", "multiprocess_executor"], [63, 6, 1, "", "op"], [8, 0, 1, "", "reconstructable"], [65, 5, 1, "", "repository"], [66, 6, 1, "", "resource"], [67, 6, 1, "", "run_failure_sensor"], [67, 6, 1, "", "run_status_sensor"], [67, 6, 1, "", "schedule"], [67, 6, 1, "", "sensor"], [64, 6, 1, "", "static_partitioned_config"], [10, 6, 1, "", "success_hook"], [68, 6, 1, "", "usable_as_dagster_type"], [8, 6, 1, "", "validate_run_config"], [64, 6, 1, "", "weekly_partitioned_config"], [66, 6, 1, "", "with_resources"]], "dagster-api-grpc": [[3, 8, 1, "cmdoption-dagster-api-grpc-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-api-grpc-container-context", "--container-context"], [3, 8, 1, "cmdoption-dagster-api-grpc-container-image", "--container-image"], [3, 8, 1, "cmdoption-dagster-api-grpc-empty-working-directory", "--empty-working-directory"], [3, 8, 1, 
"cmdoption-dagster-api-grpc-fixed-server-id", "--fixed-server-id"], [3, 8, 1, "cmdoption-dagster-api-grpc-heartbeat", "--heartbeat"], [3, 8, 1, "cmdoption-dagster-api-grpc-heartbeat-timeout", "--heartbeat-timeout"], [3, 8, 1, "cmdoption-dagster-api-grpc-h", "--host"], [3, 8, 1, "cmdoption-dagster-api-grpc-inject-env-vars-from-instance", "--inject-env-vars-from-instance"], [3, 8, 1, "cmdoption-dagster-api-grpc-instance-ref", "--instance-ref"], [3, 8, 1, "cmdoption-dagster-api-grpc-lazy-load-user-code", "--lazy-load-user-code"], [3, 8, 1, "cmdoption-dagster-api-grpc-location-name", "--location-name"], [3, 8, 1, "cmdoption-dagster-api-grpc-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-api-grpc-n", "--max-workers"], [3, 8, 1, "cmdoption-dagster-api-grpc-n", "--max_workers"], [3, 8, 1, "cmdoption-dagster-api-grpc-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-api-grpc-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-api-grpc-p", "--port"], [3, 8, 1, "cmdoption-dagster-api-grpc-f", "--python-file"], [3, 8, 1, "cmdoption-dagster-api-grpc-s", "--socket"], [3, 8, 1, "cmdoption-dagster-api-grpc-use-python-environment-entry-point", "--use-python-environment-entry-point"], [3, 8, 1, "cmdoption-dagster-api-grpc-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-api-grpc-a", "-a"], [3, 8, 1, "cmdoption-dagster-api-grpc-d", "-d"], [3, 8, 1, "cmdoption-dagster-api-grpc-f", "-f"], [3, 8, 1, "cmdoption-dagster-api-grpc-h", "-h"], [3, 8, 1, "cmdoption-dagster-api-grpc-m", "-m"], [3, 8, 1, "cmdoption-dagster-api-grpc-n", "-n"], [3, 8, 1, "cmdoption-dagster-api-grpc-p", "-p"], [3, 8, 1, "cmdoption-dagster-api-grpc-s", "-s"]], "dagster-celery-worker-list": [[18, 8, 1, "cmdoption-dagster-celery-worker-list-y", "--config-yaml"], [18, 8, 1, "cmdoption-dagster-celery-worker-list-y", "-y"]], "dagster-celery-worker-start": [[18, 8, 1, "cmdoption-dagster-celery-worker-start-A", "--app"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-d", "--background"], 
[18, 8, 1, "cmdoption-dagster-celery-worker-start-y", "--config-yaml"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-i", "--includes"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-l", "--loglevel"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-n", "--name"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-q", "--queue"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-A", "-A"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-d", "-d"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-i", "-i"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-l", "-l"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-n", "-n"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-q", "-q"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-y", "-y"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-arg-ADDITIONAL_ARGS", "ADDITIONAL_ARGS"]], "dagster-celery-worker-terminate": [[18, 8, 1, "cmdoption-dagster-celery-worker-terminate-a", "--all"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-y", "--config-yaml"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-a", "-a"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-y", "-y"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-arg-NAME", "NAME"]], "dagster-daemon-run": [[3, 8, 1, "cmdoption-dagster-daemon-run-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-daemon-run-code-server-log-level", "--code-server-log-level"], [3, 8, 1, "cmdoption-dagster-daemon-run-empty-workspace", "--empty-workspace"], [3, 8, 1, "cmdoption-dagster-daemon-run-grpc-host", "--grpc-host"], [3, 8, 1, "cmdoption-dagster-daemon-run-grpc-port", "--grpc-port"], [3, 8, 1, "cmdoption-dagster-daemon-run-grpc-socket", "--grpc-socket"], [3, 8, 1, "cmdoption-dagster-daemon-run-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-daemon-run-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-daemon-run-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-daemon-run-f", "--python-file"], [3, 8, 1, 
"cmdoption-dagster-daemon-run-use-ssl", "--use-ssl"], [3, 8, 1, "cmdoption-dagster-daemon-run-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-daemon-run-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-daemon-run-a", "-a"], [3, 8, 1, "cmdoption-dagster-daemon-run-d", "-d"], [3, 8, 1, "cmdoption-dagster-daemon-run-f", "-f"], [3, 8, 1, "cmdoption-dagster-daemon-run-m", "-m"], [3, 8, 1, "cmdoption-dagster-daemon-run-w", "-w"]], "dagster-dbt-project-scaffold": [[26, 8, 1, "cmdoption-dagster-dbt-project-scaffold-dbt-project-dir", "--dbt-project-dir"], [26, 8, 1, "cmdoption-dagster-dbt-project-scaffold-project-name", "--project-name"]], "dagster-dev": [[3, 8, 1, "cmdoption-dagster-dev-code-server-log-level", "--code-server-log-level"], [3, 8, 1, "cmdoption-dagster-dev-h", "--dagit-host"], [3, 8, 1, "cmdoption-dagster-dev-p", "--dagit-port"], [3, 8, 1, "cmdoption-dagster-dev-h", "--host"], [3, 8, 1, "cmdoption-dagster-dev-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-dev-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-dev-p", "--port"], [3, 8, 1, "cmdoption-dagster-dev-f", "--python-file"], [3, 8, 1, "cmdoption-dagster-dev-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-dev-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-dev-d", "-d"], [3, 8, 1, "cmdoption-dagster-dev-f", "-f"], [3, 8, 1, "cmdoption-dagster-dev-h", "-h"], [3, 8, 1, "cmdoption-dagster-dev-m", "-m"], [3, 8, 1, "cmdoption-dagster-dev-p", "-p"], [3, 8, 1, "cmdoption-dagster-dev-w", "-w"]], "dagster-graphql": [[3, 8, 1, "cmdoption-dagster-graphql-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-graphql-empty-workspace", "--empty-workspace"], [3, 8, 1, "cmdoption-dagster-graphql-ephemeral-instance", "--ephemeral-instance"], [3, 8, 1, "cmdoption-dagster-graphql-f", "--file"], [3, 8, 1, "cmdoption-dagster-graphql-grpc-host", "--grpc-host"], [3, 8, 1, "cmdoption-dagster-graphql-grpc-port", "--grpc-port"], [3, 8, 1, "cmdoption-dagster-graphql-grpc-socket", "--grpc-socket"], [3, 8, 
1, "cmdoption-dagster-graphql-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-graphql-o", "--output"], [3, 8, 1, "cmdoption-dagster-graphql-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-graphql-p", "--predefined"], [3, 8, 1, "cmdoption-dagster-graphql-0", "--python-file"], [3, 8, 1, "cmdoption-dagster-graphql-r", "--remote"], [3, 8, 1, "cmdoption-dagster-graphql-t", "--text"], [3, 8, 1, "cmdoption-dagster-graphql-use-ssl", "--use-ssl"], [3, 8, 1, "cmdoption-dagster-graphql-v", "--variables"], [3, 8, 1, "cmdoption-dagster-graphql-version", "--version"], [3, 8, 1, "cmdoption-dagster-graphql-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-graphql-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-graphql-a", "-a"], [3, 8, 1, "cmdoption-dagster-graphql-d", "-d"], [3, 8, 1, "cmdoption-dagster-graphql-0", "-f"], [3, 8, 1, "cmdoption-dagster-graphql-m", "-m"], [3, 8, 1, "cmdoption-dagster-graphql-o", "-o"], [3, 8, 1, "cmdoption-dagster-graphql-p", "-p"], [3, 8, 1, "cmdoption-dagster-graphql-r", "-r"], [3, 8, 1, "cmdoption-dagster-graphql-t", "-t"], [3, 8, 1, "cmdoption-dagster-graphql-v", "-v"], [3, 8, 1, "cmdoption-dagster-graphql-w", "-w"]], "dagster-webserver": [[3, 8, 1, "cmdoption-dagster-webserver-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-webserver-code-server-log-level", "--code-server-log-level"], [3, 8, 1, "cmdoption-dagster-webserver-dagster-log-level", "--dagster-log-level"], [3, 8, 1, "cmdoption-dagster-webserver-db-pool-recycle", "--db-pool-recycle"], [3, 8, 1, "cmdoption-dagster-webserver-db-statement-timeout", "--db-statement-timeout"], [3, 8, 1, "cmdoption-dagster-webserver-empty-workspace", "--empty-workspace"], [3, 8, 1, "cmdoption-dagster-webserver-grpc-host", "--grpc-host"], [3, 8, 1, "cmdoption-dagster-webserver-grpc-port", "--grpc-port"], [3, 8, 1, "cmdoption-dagster-webserver-grpc-socket", "--grpc-socket"], [3, 8, 1, "cmdoption-dagster-webserver-h", "--host"], [3, 8, 1, 
"cmdoption-dagster-webserver-uvicorn-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-webserver-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-webserver-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-webserver-l", "--path-prefix"], [3, 8, 1, "cmdoption-dagster-webserver-p", "--port"], [3, 8, 1, "cmdoption-dagster-webserver-f", "--python-file"], [3, 8, 1, "cmdoption-dagster-webserver-read-only", "--read-only"], [3, 8, 1, "cmdoption-dagster-webserver-suppress-warnings", "--suppress-warnings"], [3, 8, 1, "cmdoption-dagster-webserver-use-ssl", "--use-ssl"], [3, 8, 1, "cmdoption-dagster-webserver-uvicorn-log-level", "--uvicorn-log-level"], [3, 8, 1, "cmdoption-dagster-webserver-version", "--version"], [3, 8, 1, "cmdoption-dagster-webserver-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-webserver-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-webserver-a", "-a"], [3, 8, 1, "cmdoption-dagster-webserver-d", "-d"], [3, 8, 1, "cmdoption-dagster-webserver-f", "-f"], [3, 8, 1, "cmdoption-dagster-webserver-h", "-h"], [3, 8, 1, "cmdoption-dagster-webserver-l", "-l"], [3, 8, 1, "cmdoption-dagster-webserver-m", "-m"], [3, 8, 1, "cmdoption-dagster-webserver-p", "-p"], [3, 8, 1, "cmdoption-dagster-webserver-w", "-w"]], "dagster.Array": [[4, 1, 1, "", "description"]], "dagster.AssetCheckResult": [[1, 2, 1, "", "asset_key"], [1, 2, 1, "", "check_name"], [1, 2, 1, "", "metadata"], [1, 2, 1, "", "severity"], [1, 2, 1, "", "success"]], "dagster.AssetDep": [[2, 2, 1, "", "asset"], [2, 2, 1, "", "partition_mapping"]], "dagster.AssetIn": [[2, 2, 1, "", "dagster_type"], [2, 2, 1, "", "key"], [2, 2, 1, "", "key_prefix"], [2, 2, 1, "", "metadata"], [2, 2, 1, "", "partition_mapping"]], "dagster.AssetMaterialization": [[63, 3, 1, "", "file"]], "dagster.AssetOut": [[2, 2, 1, "", "auto_materialize_policy"], [2, 2, 1, "", "backfill_policy"], [2, 2, 1, "", "code_version"], [2, 2, 1, "", "dagster_type"], [2, 2, 1, "", "description"], [2, 2, 1, "", 
"freshness_policy"], [2, 2, 1, "", "group_name"], [2, 2, 1, "", "io_manager_key"], [2, 2, 1, "", "is_required"], [2, 2, 1, "", "key"], [2, 2, 1, "", "key_prefix"], [2, 2, 1, "", "metadata"]], "dagster.AssetSelection": [[2, 3, 1, "", "all"], [2, 3, 1, "", "all_asset_checks"], [2, 3, 1, "", "assets"], [2, 3, 1, "", "checks"], [2, 3, 1, "", "checks_for_assets"], [2, 3, 1, "", "downstream"], [2, 3, 1, "", "groups"], [2, 3, 1, "", "key_prefixes"], [2, 3, 1, "", "keys"], [2, 3, 1, "", "required_multi_asset_neighbors"], [2, 3, 1, "", "roots"], [2, 3, 1, "", "sinks"], [2, 3, 1, "", "sources"], [2, 3, 1, "", "upstream"], [2, 3, 1, "", "upstream_source_assets"], [2, 3, 1, "", "without_checks"]], "dagster.AssetSensorDefinition": [[67, 1, 1, "", "asset_key"]], "dagster.AssetSpec": [[2, 2, 1, "", "auto_materialize_policy"], [2, 2, 1, "", "backfill_policy"], [2, 2, 1, "", "code_version"], [2, 2, 1, "", "deps"], [2, 2, 1, "", "description"], [2, 2, 1, "", "freshness_policy"], [2, 2, 1, "", "group_name"], [2, 2, 1, "", "key"], [2, 2, 1, "", "metadata"], [2, 2, 1, "", "skippable"]], "dagster.AssetValueLoader": [[2, 3, 1, "", "load_asset_value"]], "dagster.AssetsDefinition": [[2, 1, 1, "", "asset_deps"], [2, 1, 1, "", "can_subset"], [2, 1, 1, "", "check_specs"], [2, 1, 1, "", "dependency_keys"], [2, 1, 1, "", "descriptions_by_key"], [2, 3, 1, "", "from_graph"], [2, 3, 1, "", "from_op"], [2, 3, 1, "", "get_partition_mapping"], [2, 1, 1, "", "group_names_by_key"], [2, 1, 1, "", "key"], [2, 1, 1, "", "keys"], [2, 1, 1, "", "node_def"], [2, 1, 1, "", "op"], [2, 1, 1, "", "partitions_def"], [2, 1, 1, "", "required_resource_keys"], [2, 1, 1, "", "resource_defs"], [2, 3, 1, "", "to_source_asset"], [2, 3, 1, "", "to_source_assets"]], "dagster.AutoMaterializePolicy": [[2, 3, 1, "", "eager"], [2, 3, 1, "", "lazy"], [2, 3, 1, "", "with_rules"], [2, 3, 1, "", "without_rules"]], "dagster.AutoMaterializeRule": [[2, 3, 1, "", "materialize_on_missing"], [2, 3, 1, "", 
"materialize_on_parent_updated"], [2, 3, 1, "", "materialize_on_required_for_freshness"], [2, 2, 1, "", "require_update_for_all_parent_partitions"], [2, 3, 1, "", "skip_on_not_all_parents_updated"], [2, 3, 1, "", "skip_on_parent_missing"], [2, 3, 1, "", "skip_on_parent_outdated"]], "dagster.BackfillPolicy": [[64, 3, 1, "", "multi_run"], [64, 3, 1, "", "single_run"]], "dagster.DagsterAssetMetadataValue": [[63, 1, 1, "", "value"]], "dagster.DagsterError": [[7, 1, 1, "", "is_user_code_error"]], "dagster.DagsterEvent": [[8, 1, 1, "", "asset_key"], [8, 2, 1, "", "event_specific_data"], [8, 1, 1, "", "event_type"], [8, 2, 1, "", "event_type_value"], [8, 1, 1, "", "is_asset_materialization_planned"], [8, 1, 1, "", "is_asset_observation"], [8, 1, 1, "", "is_engine_event"], [8, 1, 1, "", "is_expectation_result"], [8, 1, 1, "", "is_failure"], [8, 1, 1, "", "is_handled_output"], [8, 1, 1, "", "is_hook_event"], [8, 1, 1, "", "is_loaded_input"], [8, 1, 1, "", "is_resource_init_failure"], [8, 1, 1, "", "is_step_event"], [8, 1, 1, "", "is_step_failure"], [8, 1, 1, "", "is_step_materialization"], [8, 1, 1, "", "is_step_restarted"], [8, 1, 1, "", "is_step_skipped"], [8, 1, 1, "", "is_step_start"], [8, 1, 1, "", "is_step_success"], [8, 1, 1, "", "is_step_up_for_retry"], [8, 1, 1, "", "is_successful_output"], [8, 2, 1, "", "job_name"], [8, 2, 1, "", "logging_tags"], [8, 2, 1, "", "message"], [8, 2, 1, "", "node_handle"], [8, 1, 1, "", "partition"], [8, 2, 1, "", "pid"], [8, 2, 1, "", "step_key"], [8, 2, 1, "", "step_kind_value"]], "dagster.DagsterInstance": [[11, 3, 1, "", "add_dynamic_partitions"], [11, 3, 1, "", "delete_dynamic_partition"], [11, 3, 1, "", "delete_run"], [11, 3, 1, "", "ephemeral"], [11, 3, 1, "", "get"], [11, 3, 1, "", "get_asset_keys"], [11, 3, 1, "", "get_asset_records"], [11, 3, 1, "", "get_dynamic_partitions"], [11, 3, 1, "", "get_event_records"], [11, 3, 1, "", "get_latest_materialization_code_versions"], [11, 3, 1, "", "get_latest_materialization_event"], 
[11, 3, 1, "", "get_run_by_id"], [11, 3, 1, "", "get_run_record_by_id"], [11, 3, 1, "", "get_run_records"], [11, 3, 1, "", "get_status_by_partition"], [11, 3, 1, "", "has_asset_key"], [11, 3, 1, "", "has_dynamic_partition"], [11, 3, 1, "", "local_temp"], [11, 3, 1, "", "wipe_assets"]], "dagster.DagsterRun": [[11, 1, 1, "", "is_failure"], [11, 1, 1, "", "is_failure_or_canceled"], [11, 1, 1, "", "is_finished"], [11, 1, 1, "", "is_resume_retry"], [11, 1, 1, "", "is_success"]], "dagster.DagsterRunMetadataValue": [[63, 1, 1, "", "value"]], "dagster.DagsterType": [[68, 1, 1, "", "description"], [68, 1, 1, "", "display_name"], [68, 1, 1, "", "has_unique_name"], [68, 1, 1, "", "loader"], [68, 1, 1, "", "required_resource_keys"], [68, 3, 1, "", "type_check"], [68, 1, 1, "", "typing_type"], [68, 1, 1, "", "unique_name"]], "dagster.DagsterTypeLoaderContext": [[68, 1, 1, "", "job_def"], [68, 1, 1, "", "op_def"], [68, 1, 1, "", "resources"]], "dagster.DagsterUserCodeExecutionError": [[7, 1, 1, "", "is_user_code_error"]], "dagster.Definitions": [[5, 3, 1, "", "get_asset_value_loader"], [5, 3, 1, "", "get_job_def"], [5, 3, 1, "", "get_schedule_def"], [5, 3, 1, "", "get_sensor_def"], [5, 3, 1, "", "load_asset_value"]], "dagster.DependencyDefinition": [[9, 3, 1, "", "is_fan_in"]], "dagster.DynamicOutput": [[6, 1, 1, "", "mapping_key"], [6, 1, 1, "", "output_name"], [6, 1, 1, "", "value"]], "dagster.DynamicPartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"]], "dagster.EventLogEntry": [[11, 1, 1, "", "dagster_event_type"], [11, 3, 1, "", "get_dagster_event"], [11, 1, 1, "", "is_dagster_event"], [11, 1, 1, "", "message"]], "dagster.ExecuteInProcessResult": [[8, 1, 1, "", "all_events"], [8, 3, 1, "", "asset_value"], [8, 1, 1, "", "dagster_run"], [8, 1, 1, "", "job_def"], [8, 3, 1, "", "output_for_node"], [8, 3, 1, "", "output_value"], [8, 1, 1, "", "run_id"]], "dagster.Executor": [[11, 3, 1, "", "execute"], [11, 1, 1, "", "retries"]], "dagster.ExecutorDefinition": [[11, 3, 1, 
"", "configured"], [11, 1, 1, "", "description"], [11, 1, 1, "", "executor_creation_fn"], [11, 1, 1, "", "name"]], "dagster.Field": [[4, 1, 1, "", "default_provided"], [4, 1, 1, "", "default_value"], [4, 1, 1, "", "description"], [4, 1, 1, "", "is_required"]], "dagster.FileHandle": [[11, 1, 1, "", "path_desc"]], "dagster.GraphDefinition": [[9, 3, 1, "", "alias"], [9, 1, 1, "", "config_mapping"], [9, 3, 1, "", "execute_in_process"], [9, 1, 1, "", "input_mappings"], [9, 1, 1, "", "name"], [9, 1, 1, "", "output_mappings"], [9, 3, 1, "", "tag"], [9, 1, 1, "", "tags"], [9, 3, 1, "", "to_job"], [9, 3, 1, "", "with_hooks"], [9, 3, 1, "", "with_retry_policy"]], "dagster.HookContext": [[10, 1, 1, "", "hook_def"], [10, 1, 1, "", "instance"], [10, 1, 1, "", "job_name"], [10, 1, 1, "", "log"], [10, 1, 1, "", "op_config"], [10, 1, 1, "", "op_exception"], [10, 1, 1, "", "op_output_values"], [10, 1, 1, "", "required_resource_keys"], [10, 1, 1, "", "resources"], [10, 1, 1, "", "run_id"], [10, 1, 1, "", "step_key"]], "dagster.IOManager": [[12, 3, 1, "", "handle_output"], [12, 3, 1, "", "load_input"]], "dagster.IOManagerDefinition": [[12, 3, 1, "", "hardcoded_io_manager"]], "dagster.InitExecutorContext": [[11, 2, 1, "", "executor_config"], [11, 2, 1, "", "executor_def"], [11, 2, 1, "", "instance"], [11, 2, 1, "", "job"]], "dagster.InitLoggerContext": [[61, 1, 1, "", "logger_config"], [61, 1, 1, "", "logger_def"], [61, 1, 1, "", "run_id"]], "dagster.InitResourceContext": [[66, 1, 1, "", "instance"], [66, 1, 1, "", "log"], [66, 1, 1, "", "log_manager"], [66, 1, 1, "", "resource_config"], [66, 1, 1, "", "resource_def"], [66, 1, 1, "", "resources"], [66, 1, 1, "", "run_id"]], "dagster.InputContext": [[12, 1, 1, "", "asset_key"], [12, 1, 1, "", "asset_partition_key"], [12, 1, 1, "", "asset_partition_key_range"], [12, 1, 1, "", "asset_partition_keys"], [12, 1, 1, "", "asset_partitions_def"], [12, 1, 1, "", "asset_partitions_time_window"], [12, 1, 1, "", "config"], [12, 1, 1, "", 
"dagster_type"], [12, 3, 1, "", "get_asset_identifier"], [12, 3, 1, "", "get_identifier"], [12, 1, 1, "", "has_asset_key"], [12, 1, 1, "", "has_asset_partitions"], [12, 1, 1, "", "has_input_name"], [12, 1, 1, "", "has_partition_key"], [12, 1, 1, "", "log"], [12, 1, 1, "", "metadata"], [12, 1, 1, "", "name"], [12, 1, 1, "", "op_def"], [12, 1, 1, "", "partition_key"], [12, 1, 1, "", "resource_config"], [12, 1, 1, "", "resources"], [12, 1, 1, "", "upstream_output"]], "dagster.JobDefinition": [[13, 1, 1, "", "config_mapping"], [13, 3, 1, "", "execute_in_process"], [13, 1, 1, "", "executor_def"], [13, 1, 1, "", "has_specified_executor"], [13, 1, 1, "", "has_specified_loggers"], [13, 1, 1, "", "loggers"], [13, 1, 1, "", "partitioned_config"], [13, 1, 1, "", "partitions_def"], [13, 1, 1, "", "resource_defs"], [13, 3, 1, "", "run_request_for_partition"], [13, 3, 1, "", "with_hooks"], [13, 3, 1, "", "with_top_level_resources"]], "dagster.JobExecutionResult": [[8, 1, 1, "", "all_events"], [8, 1, 1, "", "dagster_run"], [8, 1, 1, "", "job_def"], [8, 3, 1, "", "output_for_node"], [8, 3, 1, "", "output_value"], [8, 1, 1, "", "run_id"]], "dagster.JsonMetadataValue": [[63, 1, 1, "", "value"]], "dagster.LocalFileHandle": [[11, 1, 1, "", "path"], [11, 1, 1, "", "path_desc"]], "dagster.LoggerDefinition": [[61, 1, 1, "", "config_schema"], [61, 1, 1, "", "description"], [61, 1, 1, "", "logger_fn"]], "dagster.Map": [[4, 1, 1, "", "key_label_name"]], "dagster.MarkdownMetadataValue": [[63, 1, 1, "", "value"]], "dagster.MaterializeResult": [[2, 2, 1, "", "asset_key"], [2, 2, 1, "", "metadata"]], "dagster.MemoizableIOManager": [[62, 3, 1, "", "has_output"]], "dagster.MetadataValue": [[63, 3, 1, "", "asset"], [63, 3, 1, "", "bool"], [63, 3, 1, "", "dagster_run"], [63, 3, 1, "", "float"], [63, 3, 1, "", "int"], [63, 3, 1, "", "json"], [63, 3, 1, "", "md"], [63, 3, 1, "", "notebook"], [63, 3, 1, "", "null"], [63, 3, 1, "", "path"], [63, 3, 1, "", "python_artifact"], [63, 3, 1, "", "table"], 
[63, 3, 1, "", "table_schema"], [63, 3, 1, "", "text"], [63, 3, 1, "", "url"], [63, 1, 1, "", "value"]], "dagster.MultiAssetSensorEvaluationContext": [[67, 3, 1, "", "advance_all_cursors"], [67, 3, 1, "", "advance_cursor"], [67, 3, 1, "", "all_partitions_materialized"], [67, 1, 1, "", "asset_keys"], [67, 1, 1, "", "assets_defs_by_key"], [67, 2, 1, "", "cursor"], [67, 2, 1, "", "definitions"], [67, 3, 1, "", "get_cursor_partition"], [67, 3, 1, "", "get_downstream_partition_keys"], [67, 3, 1, "", "get_trailing_unconsumed_events"], [67, 2, 1, "", "instance"], [67, 2, 1, "", "instance_ref"], [67, 2, 1, "", "last_completion_time"], [67, 2, 1, "", "last_run_key"], [67, 3, 1, "", "latest_materialization_records_by_key"], [67, 3, 1, "", "latest_materialization_records_by_partition"], [67, 3, 1, "", "latest_materialization_records_by_partition_and_asset"], [67, 3, 1, "", "materialization_records_for_key"], [67, 2, 1, "", "monitored_assets"], [67, 2, 1, "", "repository_def"], [67, 2, 1, "", "repository_name"]], "dagster.MultiDependencyDefinition": [[9, 3, 1, "", "get_dependencies_and_mappings"], [9, 3, 1, "", "get_node_dependencies"], [9, 3, 1, "", "is_fan_in"]], "dagster.MultiPartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"], [64, 2, 1, "", "partitions_defs"]], "dagster.NotebookMetadataValue": [[63, 1, 1, "", "value"]], "dagster.OpDefinition": [[63, 3, 1, "", "alias"], [63, 1, 1, "", "config_schema"], [63, 1, 1, "", "ins"], [63, 1, 1, "", "name"], [63, 1, 1, "", "outs"], [63, 1, 1, "", "required_resource_keys"], [63, 1, 1, "", "retry_policy"], [63, 3, 1, "", "tag"], [63, 1, 1, "", "tags"], [63, 1, 1, "", "version"], [63, 3, 1, "", "with_hooks"], [63, 3, 1, "", "with_retry_policy"]], "dagster.OpExecutionContext": [[8, 3, 1, "", "add_output_metadata"], [8, 1, 1, "", "asset_checks_def"], [8, 1, 1, "", "asset_key"], [8, 3, 1, "", "asset_key_for_input"], [8, 3, 1, "", "asset_key_for_output"], [8, 3, 1, "", "asset_partition_key_for_input"], [8, 3, 1, "", 
"asset_partition_key_for_output"], [8, 1, 1, "", "asset_partition_key_range"], [8, 3, 1, "", "asset_partition_key_range_for_input"], [8, 3, 1, "", "asset_partition_key_range_for_output"], [8, 3, 1, "", "asset_partition_keys_for_input"], [8, 3, 1, "", "asset_partition_keys_for_output"], [8, 3, 1, "", "asset_partitions_def_for_input"], [8, 3, 1, "", "asset_partitions_def_for_output"], [8, 3, 1, "", "asset_partitions_time_window_for_input"], [8, 3, 1, "", "asset_partitions_time_window_for_output"], [8, 1, 1, "", "assets_def"], [8, 3, 1, "", "get_asset_provenance"], [8, 3, 1, "", "get_mapping_key"], [8, 3, 1, "", "get_tag"], [8, 1, 1, "", "has_asset_checks_def"], [8, 1, 1, "", "has_assets_def"], [8, 1, 1, "", "has_partition_key"], [8, 3, 1, "", "has_tag"], [8, 1, 1, "", "instance"], [8, 1, 1, "", "job_def"], [8, 1, 1, "", "job_name"], [8, 1, 1, "", "log"], [8, 3, 1, "", "log_event"], [8, 1, 1, "", "op_config"], [8, 1, 1, "", "op_def"], [8, 3, 1, "", "output_for_asset_key"], [8, 1, 1, "", "partition_key"], [8, 1, 1, "", "partition_key_range"], [8, 1, 1, "", "partition_time_window"], [8, 1, 1, "", "pdb"], [8, 1, 1, "", "resources"], [8, 1, 1, "", "retry_number"], [8, 1, 1, "", "run_config"], [8, 1, 1, "", "run_id"], [8, 1, 1, "", "selected_asset_check_keys"], [8, 1, 1, "", "selected_asset_keys"], [8, 1, 1, "", "selected_output_names"]], "dagster.OpVersionContext": [[62, 2, 1, "", "op_config"], [62, 2, 1, "", "op_def"]], "dagster.Output": [[63, 1, 1, "", "data_version"], [63, 1, 1, "", "output_name"], [63, 1, 1, "", "value"]], "dagster.OutputContext": [[12, 3, 1, "", "add_output_metadata"], [12, 1, 1, "", "asset_key"], [12, 1, 1, "", "asset_partition_key"], [12, 1, 1, "", "asset_partition_key_range"], [12, 1, 1, "", "asset_partition_keys"], [12, 1, 1, "", "asset_partitions_def"], [12, 1, 1, "", "asset_partitions_time_window"], [12, 1, 1, "", "config"], [12, 1, 1, "", "dagster_type"], [12, 3, 1, "", "get_asset_identifier"], [12, 3, 1, "", "get_identifier"], [12, 1, 1, "", 
"has_asset_key"], [12, 1, 1, "", "has_asset_partitions"], [12, 1, 1, "", "has_partition_key"], [12, 1, 1, "", "log"], [12, 3, 1, "", "log_event"], [12, 1, 1, "", "mapping_key"], [12, 1, 1, "", "metadata"], [12, 1, 1, "", "name"], [12, 1, 1, "", "op_def"], [12, 1, 1, "", "partition_key"], [12, 1, 1, "", "resource_config"], [12, 1, 1, "", "resources"], [12, 1, 1, "", "run_id"], [12, 1, 1, "", "step_key"], [12, 1, 1, "", "version"]], "dagster.PartitionKeyRange": [[64, 2, 1, "", "end"], [64, 2, 1, "", "start"]], "dagster.PartitionMapping": [[64, 3, 1, "", "get_downstream_partitions_for_partitions"], [64, 3, 1, "", "get_upstream_mapped_partitions_result_for_partitions"]], "dagster.PartitionedConfig": [[64, 3, 1, "", "get_partition_keys"], [64, 1, 1, "", "partitions_def"], [64, 1, 1, "", "run_config_for_partition_fn"], [64, 1, 1, "", "run_config_for_partition_key_fn"], [64, 1, 1, "", "tags_for_partition_fn"], [64, 1, 1, "", "tags_for_partition_key_fn"]], "dagster.PartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"]], "dagster.PathMetadataValue": [[63, 1, 1, "", "value"]], "dagster.PythonArtifactMetadataValue": [[63, 1, 1, "", "value"]], "dagster.RepositoryData": [[65, 3, 1, "", "get_all_jobs"], [65, 3, 1, "", "get_all_schedules"], [65, 3, 1, "", "get_all_sensors"], [65, 3, 1, "", "get_assets_defs_by_key"], [65, 3, 1, "", "get_job"], [65, 3, 1, "", "get_job_names"], [65, 3, 1, "", "get_schedule"], [65, 3, 1, "", "get_schedule_names"], [65, 3, 1, "", "get_sensor"], [65, 3, 1, "", "get_sensor_names"], [65, 3, 1, "", "get_source_assets_by_key"], [65, 3, 1, "", "has_job"], [65, 3, 1, "", "has_schedule"], [65, 3, 1, "", "has_sensor"]], "dagster.RepositoryDefinition": [[65, 1, 1, "", "description"], [65, 3, 1, "", "get_all_jobs"], [65, 3, 1, "", "get_asset_value_loader"], [65, 3, 1, "", "get_job"], [65, 3, 1, "", "get_schedule_def"], [65, 3, 1, "", "get_sensor_def"], [65, 3, 1, "", "has_job"], [65, 3, 1, "", "has_schedule_def"], [65, 3, 1, "", "has_sensor_def"], [65, 1, 
1, "", "job_names"], [65, 3, 1, "", "load_asset_value"], [65, 1, 1, "", "metadata"], [65, 1, 1, "", "name"], [65, 1, 1, "", "schedule_defs"], [65, 1, 1, "", "sensor_defs"]], "dagster.ResourceDefinition": [[66, 1, 1, "", "description"], [66, 3, 1, "", "hardcoded_resource"], [66, 3, 1, "", "mock_resource"], [66, 3, 1, "", "none_resource"], [66, 1, 1, "", "required_resource_keys"], [66, 3, 1, "", "string_resource"], [66, 1, 1, "", "version"]], "dagster.ResourceVersionContext": [[62, 2, 1, "", "resource_config"], [62, 2, 1, "", "resource_def"]], "dagster.RunFailureSensorContext": [[67, 2, 1, "", "dagster_run"], [67, 1, 1, "", "failure_event"], [67, 3, 1, "", "get_step_failure_events"], [67, 2, 1, "", "sensor_name"]], "dagster.RunRequest": [[67, 2, 1, "", "asset_selection"], [67, 2, 1, "", "job_name"], [67, 2, 1, "", "partition_key"], [67, 2, 1, "", "run_key"], [67, 2, 1, "", "stale_assets_only"], [67, 2, 1, "", "tags"]], "dagster.RunStatusSensorContext": [[67, 1, 1, "", "dagster_event"], [67, 1, 1, "", "dagster_run"], [67, 1, 1, "", "instance"], [67, 1, 1, "", "log"], [67, 1, 1, "", "partition_key"], [67, 1, 1, "", "sensor_name"]], "dagster.ScheduleDefinition": [[67, 1, 1, "", "cron_schedule"], [67, 1, 1, "", "default_status"], [67, 1, 1, "", "description"], [67, 1, 1, "", "environment_vars"], [67, 1, 1, "", "execution_timezone"], [67, 1, 1, "", "job"], [67, 1, 1, "", "job_name"], [67, 1, 1, "", "name"], [67, 1, 1, "", "required_resource_keys"]], "dagster.ScheduleEvaluationContext": [[67, 1, 1, "", "instance"], [67, 1, 1, "", "resources"], [67, 1, 1, "", "scheduled_execution_time"]], "dagster.SensorDefinition": [[67, 1, 1, "", "default_status"], [67, 1, 1, "", "description"], [67, 1, 1, "", "job"], [67, 1, 1, "", "job_name"], [67, 1, 1, "", "jobs"], [67, 1, 1, "", "minimum_interval_seconds"], [67, 1, 1, "", "name"], [67, 1, 1, "", "required_resource_keys"]], "dagster.SensorResult": [[67, 2, 1, "", "asset_events"], [67, 2, 1, "", "cursor"], [67, 2, 1, "", 
"run_requests"], [67, 2, 1, "", "skip_reason"]], "dagster.SkipReason": [[67, 2, 1, "", "skip_message"]], "dagster.SourceAsset": [[2, 2, 1, "", "description"], [2, 2, 1, "", "io_manager_def"], [2, 2, 1, "", "io_manager_key"], [2, 1, 1, "", "is_observable"], [2, 2, 1, "", "key"], [2, 2, 1, "", "metadata"], [2, 2, 1, "", "observe_fn"], [2, 1, 1, "", "op"], [2, 2, 1, "", "partitions_def"], [2, 2, 1, "", "resource_defs"]], "dagster.SourceHashVersionStrategy": [[62, 3, 1, "", "get_op_version"], [62, 3, 1, "", "get_resource_version"]], "dagster.StaticPartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"]], "dagster.TableMetadataValue": [[63, 3, 1, "", "infer_column_type"], [63, 1, 1, "", "value"]], "dagster.TableSchema": [[63, 3, 1, "", "from_name_type_dict"]], "dagster.TableSchemaMetadataValue": [[63, 1, 1, "", "value"]], "dagster.TextMetadataValue": [[63, 1, 1, "", "value"]], "dagster.TimeWindow": [[64, 2, 1, "", "end"], [64, 2, 1, "", "start"]], "dagster.TimeWindowPartitionMapping": [[64, 2, 1, "", "allow_nonexistent_upstream_partitions"], [64, 2, 1, "", "end_offset"], [64, 2, 1, "", "start_offset"]], "dagster.TimeWindowPartitionsDefinition": [[64, 1, 1, "", "day_offset"], [64, 3, 1, "", "get_cron_schedule"], [64, 1, 1, "", "hour_offset"], [64, 1, 1, "", "minute_offset"], [64, 1, 1, "", "schedule_type"]], "dagster.TypeCheckContext": [[8, 1, 1, "", "log"], [8, 1, 1, "", "resources"], [8, 1, 1, "", "run_id"]], "dagster.UrlMetadataValue": [[63, 1, 1, "", "value"]], "dagster.VersionStrategy": [[62, 3, 1, "", "get_op_version"], [62, 3, 1, "", "get_resource_version"]], "dagster._core": [[7, 7, 0, "-", "errors"]], "dagster._core.errors": [[11, 6, 1, "", "user_code_error_boundary"]], "dagster._core.instance": [[11, 0, 1, "", "InstanceRef"]], "dagster._core.launcher": [[11, 0, 1, "", "DefaultRunLauncher"], [11, 0, 1, "", "RunLauncher"]], "dagster._core.run_coordinator": [[11, 0, 1, "", "DefaultRunCoordinator"], [11, 5, 1, "", "QueuedRunCoordinator"]], 
"dagster._core.scheduler": [[67, 5, 1, "", "DagsterDaemonScheduler"], [11, 0, 1, "", "Scheduler"]], "dagster._core.storage.base_storage": [[11, 0, 1, "", "DagsterStorage"]], "dagster._core.storage.captured_log_manager": [[11, 0, 1, "", "CapturedLogManager"]], "dagster._core.storage.compute_log_manager": [[11, 0, 1, "", "ComputeLogManager"]], "dagster._core.storage.dagster_run": [[11, 0, 1, "", "RunRecord"]], "dagster._core.storage.event_log": [[11, 0, 1, "", "AssetRecord"], [11, 0, 1, "", "ConsolidatedSqliteEventLogStorage"], [11, 0, 1, "", "EventLogStorage"], [11, 0, 1, "", "SqlEventLogStorage"], [11, 0, 1, "", "SqliteEventLogStorage"]], "dagster._core.storage.file_manager": [[11, 0, 1, "", "FileManager"]], "dagster._core.storage.file_manager.FileManager": [[11, 3, 1, "", "copy_handle_to_local_temp"], [11, 3, 1, "", "delete_local_temp"], [11, 3, 1, "", "read"], [11, 3, 1, "", "read_data"], [11, 3, 1, "", "write"], [11, 3, 1, "", "write_data"]], "dagster._core.storage.local_compute_log_manager": [[11, 0, 1, "", "LocalComputeLogManager"]], "dagster._core.storage.noop_compute_log_manager": [[11, 0, 1, "", "NoOpComputeLogManager"]], "dagster._core.storage.root": [[11, 0, 1, "", "LocalArtifactStorage"]], "dagster._core.storage.runs": [[11, 0, 1, "", "RunStorage"], [11, 0, 1, "", "SqlRunStorage"], [11, 0, 1, "", "SqliteRunStorage"]], "dagster._core.storage.schedules": [[11, 0, 1, "", "ScheduleStorage"], [11, 0, 1, "", "SqlScheduleStorage"], [11, 0, 1, "", "SqliteScheduleStorage"]], "dagster._loggers": [[61, 6, 1, "", "colored_console_logger"], [61, 6, 1, "", "json_console_logger"]], "dagster._serdes": [[11, 0, 1, "", "ConfigurableClass"], [11, 0, 1, "", "ConfigurableClassData"]], "dagster._utils.forked_pdb": [[69, 0, 1, "", "ForkedPdb"]], "dagster_airbyte": [[14, 0, 1, "", "AirbyteConnection"], [14, 0, 1, "", "AirbyteDestination"], [14, 0, 1, "", "AirbyteManagedElementReconciler"], [14, 5, 1, "", "AirbyteResource"], [14, 0, 1, "", "AirbyteSource"], [14, 0, 1, "", 
"AirbyteSyncMode"], [14, 5, 1, "", "airbyte_resource"], [14, 5, 1, "", "airbyte_sync_op"], [14, 6, 1, "", "build_airbyte_assets"], [14, 6, 1, "", "load_assets_from_airbyte_instance"], [14, 6, 1, "", "load_assets_from_airbyte_project"], [14, 6, 1, "", "load_assets_from_connections"]], "dagster_airbyte.AirbyteConnection": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteManagedElementReconciler": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteSyncMode": [[14, 3, 1, "", "full_refresh_append"], [14, 3, 1, "", "full_refresh_overwrite"], [14, 3, 1, "", "incremental_append"], [14, 3, 1, "", "incremental_append_dedup"]], "dagster_airbyte.managed.generated.destinations": [[14, 0, 1, "", "AmazonSqsDestination"], [14, 0, 1, "", "AwsDatalakeDestination"], [14, 0, 1, "", "AzureBlobStorageDestination"], [14, 0, 1, "", "BigqueryDenormalizedDestination"], [14, 0, 1, "", "BigqueryDestination"], [14, 0, 1, "", "CassandraDestination"], [14, 0, 1, "", "ClickhouseDestination"], [14, 0, 1, "", "CsvDestination"], [14, 0, 1, "", "DatabricksDestination"], [14, 0, 1, "", "DynamodbDestination"], [14, 0, 1, "", "ElasticsearchDestination"], [14, 0, 1, "", "FireboltDestination"], [14, 0, 1, "", "FirestoreDestination"], [14, 0, 1, "", "GcsDestination"], [14, 0, 1, "", "GoogleSheetsDestination"], [14, 0, 1, "", "JdbcDestination"], [14, 0, 1, "", "KafkaDestination"], [14, 0, 1, "", "KeenDestination"], [14, 0, 1, "", "KinesisDestination"], [14, 0, 1, "", "KvdbDestination"], [14, 0, 1, "", "LocalJsonDestination"], [14, 0, 1, "", "MariadbColumnstoreDestination"], [14, 0, 1, "", "MeilisearchDestination"], [14, 0, 1, "", "MongodbDestination"], [14, 0, 1, "", "MqttDestination"], [14, 0, 1, "", "MssqlDestination"], [14, 0, 1, "", "MysqlDestination"], [14, 0, 1, "", "OracleDestination"], [14, 0, 1, "", "PostgresDestination"], [14, 0, 1, "", "PubsubDestination"], 
[14, 0, 1, "", "PulsarDestination"], [14, 0, 1, "", "R2Destination"], [14, 0, 1, "", "RabbitmqDestination"], [14, 0, 1, "", "RedisDestination"], [14, 0, 1, "", "RedshiftDestination"], [14, 0, 1, "", "RocksetDestination"], [14, 0, 1, "", "S3Destination"], [14, 0, 1, "", "ScaffoldDestinationPythonDestination"], [14, 0, 1, "", "ScyllaDestination"], [14, 0, 1, "", "SftpJsonDestination"], [14, 0, 1, "", "SnowflakeDestination"], [14, 0, 1, "", "SqliteDestination"], [14, 0, 1, "", "TidbDestination"]], "dagster_airbyte.managed.generated.destinations.AmazonSqsDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination": [[14, 0, 1, "", "IAMRole"], [14, 0, 1, "", "IAMUser"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination.IAMRole": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination.IAMUser": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination": [[14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", "JSONLinesNewlineDelimitedJSON"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination": [[14, 0, 1, "", "GCSStaging"], [14, 0, 1, "", "HMACKey"], [14, 0, 1, "", "StandardInserts"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination.GCSStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination.HMACKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination.StandardInserts": 
[[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination": [[14, 0, 1, "", "GCSStaging"], [14, 0, 1, "", "HMACKey"], [14, 0, 1, "", "StandardInserts"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination.GCSStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination.HMACKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination.StandardInserts": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.CassandraDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ClickhouseDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.CsvDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.DatabricksDestination": [[14, 0, 1, "", "AmazonS3"], [14, 0, 1, "", "AzureBlobStorage"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.DatabricksDestination.AmazonS3": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.DatabricksDestination.AzureBlobStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.DynamodbDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination": [[14, 0, 1, "", "ApiKeySecret"], [14, 0, 1, "", "None_"], [14, 0, 1, "", "UsernamePassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination.ApiKeySecret": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination.None_": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination.UsernamePassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FireboltDestination": [[14, 0, 1, "", 
"ExternalTableViaS3"], [14, 0, 1, "", "SQLInserts"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FireboltDestination.ExternalTableViaS3": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FireboltDestination.SQLInserts": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FirestoreDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination": [[14, 0, 1, "", "AvroApacheAvro"], [14, 0, 1, "", "Bzip2"], [14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", "Deflate"], [14, 0, 1, "", "GZIP"], [14, 0, 1, "", "HMACKey"], [14, 0, 1, "", "JSONLinesNewlineDelimitedJSON"], [14, 0, 1, "", "NoCompression"], [14, 0, 1, "", "ParquetColumnarStorage"], [14, 0, 1, "", "Snappy"], [14, 0, 1, "", "Xz"], [14, 0, 1, "", "Zstandard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.AvroApacheAvro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Bzip2": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Deflate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.GZIP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.HMACKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.NoCompression": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.ParquetColumnarStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Snappy": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.GcsDestination.Xz": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Zstandard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GoogleSheetsDestination": [[14, 0, 1, "", "AuthenticationViaGoogleOAuth"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GoogleSheetsDestination.AuthenticationViaGoogleOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.JdbcDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination": [[14, 0, 1, "", "PLAINTEXT"], [14, 0, 1, "", "SASLPLAINTEXT"], [14, 0, 1, "", "SASLSSL"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination.PLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination.SASLPLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination.SASLSSL": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KeenDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KinesisDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KvdbDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.LocalJsonDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MariadbColumnstoreDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MeilisearchDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination": [[14, 0, 1, "", "LoginPassword"], [14, 0, 1, "", "MongoDBAtlas"], [14, 0, 1, "", "None_"], [14, 0, 1, "", "ReplicaSet"], [14, 0, 1, "", "StandaloneMongoDbInstance"], [14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.MongodbDestination.LoginPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.MongoDBAtlas": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.None_": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.ReplicaSet": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.StandaloneMongoDbInstance": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MqttDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination": [[14, 0, 1, "", "EncryptedTrustServerCertificate"], [14, 0, 1, "", "EncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination.EncryptedTrustServerCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination.EncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MysqlDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination": [[14, 0, 1, "", "NativeNetworkEncryptionNNE"], [14, 0, 1, "", "TLSEncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination.NativeNetworkEncryptionNNE": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination.TLSEncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination.Unencrypted": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.PostgresDestination": [[14, 0, 1, "", "Allow"], [14, 0, 1, "", "Disable"], [14, 0, 1, "", "Prefer"], [14, 0, 1, "", "Require"], [14, 0, 1, "", "VerifyCa"], [14, 0, 1, "", "VerifyFull"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Allow": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Disable": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Prefer": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Require": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.VerifyCa": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.VerifyFull": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PubsubDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PulsarDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination": [[14, 0, 1, "", "AvroApacheAvro"], [14, 0, 1, "", "Bzip2"], [14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", "Deflate"], [14, 0, 1, "", "GZIP"], [14, 0, 1, "", "JSONLinesNewlineDelimitedJSON"], [14, 0, 1, "", "NoCompression"], [14, 0, 1, "", "Snappy"], [14, 0, 1, "", "Xz"], [14, 0, 1, "", "Zstandard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.AvroApacheAvro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Bzip2": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Deflate": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.R2Destination.GZIP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.NoCompression": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Snappy": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Xz": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Zstandard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RabbitmqDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedisDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination": [[14, 0, 1, "", "AESCBCEnvelopeEncryption"], [14, 0, 1, "", "NoEncryption"], [14, 0, 1, "", "S3Staging"], [14, 0, 1, "", "Standard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination.AESCBCEnvelopeEncryption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination.NoEncryption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination.S3Staging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RocksetDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination": [[14, 0, 1, "", "AvroApacheAvro"], [14, 0, 1, "", "Bzip2"], [14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", "Deflate"], [14, 0, 1, "", "GZIP"], [14, 0, 1, "", "JSONLinesNewlineDelimitedJSON"], [14, 0, 1, "", "NoCompression"], [14, 0, 1, "", "ParquetColumnarStorage"], [14, 0, 1, "", "Snappy"], [14, 0, 1, 
"", "Xz"], [14, 0, 1, "", "Zstandard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.AvroApacheAvro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Bzip2": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Deflate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.GZIP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.NoCompression": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.ParquetColumnarStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Snappy": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Xz": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Zstandard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ScaffoldDestinationPythonDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ScyllaDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SftpJsonDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination": [[14, 0, 1, "", "AESCBCEnvelopeEncryption"], [14, 0, 1, "", "AWSS3Staging"], [14, 0, 1, "", "AzureBlobStorageStaging"], [14, 0, 1, "", "GoogleCloudStorageStaging"], [14, 0, 1, "", "KeyPairAuthentication"], [14, 0, 1, "", "NoEncryption"], [14, 0, 1, "", "OAuth20"], [14, 0, 1, "", "RecommendedInternalStaging"], [14, 0, 1, "", "SelectAnotherOption"], [14, 0, 1, "", 
"UsernameAndPassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.AESCBCEnvelopeEncryption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.AWSS3Staging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.AzureBlobStorageStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.GoogleCloudStorageStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.KeyPairAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.NoEncryption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.RecommendedInternalStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.SelectAnotherOption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.UsernameAndPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SqliteDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.TidbDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources": [[14, 0, 1, "", "AdjustSource"], [14, 0, 1, "", "AirtableSource"], [14, 0, 1, "", "AmazonAdsSource"], [14, 0, 1, "", "AmazonSellerPartnerSource"], [14, 0, 1, "", "AmazonSqsSource"], [14, 0, 1, "", "AmplitudeSource"], [14, 0, 1, "", "ApifyDatasetSource"], [14, 0, 1, "", "AppfollowSource"], [14, 0, 1, "", "AppsflyerSource"], [14, 0, 1, "", "AppstoreSingerSource"], [14, 0, 1, "", "AsanaSource"], [14, 0, 1, "", "AwsCloudtrailSource"], [14, 0, 1, "", "AzureTableSource"], [14, 0, 1, "", 
"BambooHrSource"], [14, 0, 1, "", "BigcommerceSource"], [14, 0, 1, "", "BigquerySource"], [14, 0, 1, "", "BingAdsSource"], [14, 0, 1, "", "BraintreeSource"], [14, 0, 1, "", "CartSource"], [14, 0, 1, "", "ChargebeeSource"], [14, 0, 1, "", "ChargifySource"], [14, 0, 1, "", "ChartmogulSource"], [14, 0, 1, "", "ClickhouseSource"], [14, 0, 1, "", "CloseComSource"], [14, 0, 1, "", "CockroachdbSource"], [14, 0, 1, "", "CommercetoolsSource"], [14, 0, 1, "", "ConfluenceSource"], [14, 0, 1, "", "CourierSource"], [14, 0, 1, "", "Db2Source"], [14, 0, 1, "", "DelightedSource"], [14, 0, 1, "", "DixaSource"], [14, 0, 1, "", "DockerhubSource"], [14, 0, 1, "", "DriftSource"], [14, 0, 1, "", "Dv360Source"], [14, 0, 1, "", "E2eTestSource"], [14, 0, 1, "", "ElasticsearchSource"], [14, 0, 1, "", "ExchangeRatesSource"], [14, 0, 1, "", "FacebookMarketingSource"], [14, 0, 1, "", "FacebookPagesSource"], [14, 0, 1, "", "FakerSource"], [14, 0, 1, "", "FaunaSource"], [14, 0, 1, "", "FileSecureSource"], [14, 0, 1, "", "FileSource"], [14, 0, 1, "", "FireboltSource"], [14, 0, 1, "", "FlexportSource"], [14, 0, 1, "", "FreshcallerSource"], [14, 0, 1, "", "FreshdeskSource"], [14, 0, 1, "", "FreshsalesSource"], [14, 0, 1, "", "FreshserviceSource"], [14, 0, 1, "", "GithubSource"], [14, 0, 1, "", "GitlabSource"], [14, 0, 1, "", "GlassfrogSource"], [14, 0, 1, "", "GocardlessSource"], [14, 0, 1, "", "GoogleAdsSource"], [14, 0, 1, "", "GoogleAnalyticsDataApiSource"], [14, 0, 1, "", "GoogleAnalyticsV4Source"], [14, 0, 1, "", "GoogleDirectorySource"], [14, 0, 1, "", "GoogleSearchConsoleSource"], [14, 0, 1, "", "GoogleSheetsSource"], [14, 0, 1, "", "GoogleWorkspaceAdminReportsSource"], [14, 0, 1, "", "GreenhouseSource"], [14, 0, 1, "", "GutendexSource"], [14, 0, 1, "", "HarvestSource"], [14, 0, 1, "", "HellobatonSource"], [14, 0, 1, "", "HubplannerSource"], [14, 0, 1, "", "HubspotSource"], [14, 0, 1, "", "InsightlySource"], [14, 0, 1, "", "InstagramSource"], [14, 0, 1, "", "IntercomSource"], [14, 0, 1, "", 
"IterableSource"], [14, 0, 1, "", "JdbcSource"], [14, 0, 1, "", "JiraSource"], [14, 0, 1, "", "KafkaSource"], [14, 0, 1, "", "KlaviyoSource"], [14, 0, 1, "", "KustomerSingerSource"], [14, 0, 1, "", "KyribaSource"], [14, 0, 1, "", "LemlistSource"], [14, 0, 1, "", "LeverHiringSource"], [14, 0, 1, "", "LinkedinAdsSource"], [14, 0, 1, "", "LinkedinPagesSource"], [14, 0, 1, "", "LinnworksSource"], [14, 0, 1, "", "LookerSource"], [14, 0, 1, "", "MailchimpSource"], [14, 0, 1, "", "MailgunSource"], [14, 0, 1, "", "MarketoSource"], [14, 0, 1, "", "MetabaseSource"], [14, 0, 1, "", "MicrosoftTeamsSource"], [14, 0, 1, "", "MixpanelSource"], [14, 0, 1, "", "MondaySource"], [14, 0, 1, "", "MongodbSource"], [14, 0, 1, "", "MongodbV2Source"], [14, 0, 1, "", "MssqlSource"], [14, 0, 1, "", "MyHoursSource"], [14, 0, 1, "", "MysqlSource"], [14, 0, 1, "", "NetsuiteSource"], [14, 0, 1, "", "NotionSource"], [14, 0, 1, "", "OktaSource"], [14, 0, 1, "", "OnesignalSource"], [14, 0, 1, "", "OpenweatherSource"], [14, 0, 1, "", "OracleSource"], [14, 0, 1, "", "OrbSource"], [14, 0, 1, "", "OrbitSource"], [14, 0, 1, "", "OutreachSource"], [14, 0, 1, "", "PardotSource"], [14, 0, 1, "", "PaypalTransactionSource"], [14, 0, 1, "", "PaystackSource"], [14, 0, 1, "", "PersistiqSource"], [14, 0, 1, "", "PinterestSource"], [14, 0, 1, "", "PipedriveSource"], [14, 0, 1, "", "PivotalTrackerSource"], [14, 0, 1, "", "PlaidSource"], [14, 0, 1, "", "PokeapiSource"], [14, 0, 1, "", "PostgresSource"], [14, 0, 1, "", "PosthogSource"], [14, 0, 1, "", "PrestashopSource"], [14, 0, 1, "", "PrimetricSource"], [14, 0, 1, "", "PythonHttpTutorialSource"], [14, 0, 1, "", "QualarooSource"], [14, 0, 1, "", "QuickbooksSingerSource"], [14, 0, 1, "", "RechargeSource"], [14, 0, 1, "", "RecurlySource"], [14, 0, 1, "", "RedshiftSource"], [14, 0, 1, "", "RetentlySource"], [14, 0, 1, "", "RkiCovidSource"], [14, 0, 1, "", "S3Source"], [14, 0, 1, "", "SalesforceSource"], [14, 0, 1, "", "SalesloftSource"], [14, 0, 1, "", 
"ScaffoldJavaJdbcSource"], [14, 0, 1, "", "ScaffoldSourceHttpSource"], [14, 0, 1, "", "ScaffoldSourcePythonSource"], [14, 0, 1, "", "SearchMetricsSource"], [14, 0, 1, "", "SendgridSource"], [14, 0, 1, "", "SentrySource"], [14, 0, 1, "", "SftpSource"], [14, 0, 1, "", "ShopifySource"], [14, 0, 1, "", "ShortioSource"], [14, 0, 1, "", "SlackSource"], [14, 0, 1, "", "SmartsheetsSource"], [14, 0, 1, "", "SnapchatMarketingSource"], [14, 0, 1, "", "SnowflakeSource"], [14, 0, 1, "", "SquareSource"], [14, 0, 1, "", "StockTickerApiTutorialSource"], [14, 0, 1, "", "StravaSource"], [14, 0, 1, "", "StripeSource"], [14, 0, 1, "", "SurveymonkeySource"], [14, 0, 1, "", "TalkdeskExploreSource"], [14, 0, 1, "", "TempoSource"], [14, 0, 1, "", "TidbSource"], [14, 0, 1, "", "TiktokMarketingSource"], [14, 0, 1, "", "TimelySource"], [14, 0, 1, "", "TplcentralSource"], [14, 0, 1, "", "TrelloSource"], [14, 0, 1, "", "TwilioSource"], [14, 0, 1, "", "TypeformSource"], [14, 0, 1, "", "UsCensusSource"], [14, 0, 1, "", "WebflowSource"], [14, 0, 1, "", "WhiskyHunterSource"], [14, 0, 1, "", "WoocommerceSource"], [14, 0, 1, "", "WrikeSource"], [14, 0, 1, "", "YahooFinancePriceSource"], [14, 0, 1, "", "YandexMetricaSource"], [14, 0, 1, "", "YoutubeAnalyticsSource"], [14, 0, 1, "", "ZendeskChatSource"], [14, 0, 1, "", "ZendeskSunshineSource"], [14, 0, 1, "", "ZendeskSupportSource"], [14, 0, 1, "", "ZendeskTalkSource"], [14, 0, 1, "", "ZenefitsSource"], [14, 0, 1, "", "ZenloopSource"], [14, 0, 1, "", "ZohoCrmSource"], [14, 0, 1, "", "ZoomSingerSource"], [14, 0, 1, "", "ZuoraSource"]], "dagster_airbyte.managed.generated.sources.AdjustSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AirtableSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AmazonAdsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AmazonSellerPartnerSource": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.AmazonSqsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AmplitudeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ApifyDatasetSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AppfollowSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AppsflyerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AppstoreSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AsanaSource": [[14, 0, 1, "", "OAuthCredentials"], [14, 0, 1, "", "PATCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AsanaSource.OAuthCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AsanaSource.PATCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AwsCloudtrailSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AzureTableSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BambooHrSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BigcommerceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BigquerySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BingAdsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BraintreeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CartSource": [[14, 0, 1, "", "CentralAPIRouter"], [14, 0, 1, "", "SingleStoreAccessToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CartSource.CentralAPIRouter": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CartSource.SingleStoreAccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ChargebeeSource": [[14, 
3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ChargifySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ChartmogulSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ClickhouseSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CloseComSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CockroachdbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CommercetoolsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ConfluenceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CourierSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Db2Source": [[14, 0, 1, "", "TLSEncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Db2Source.TLSEncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Db2Source.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DelightedSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DixaSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DockerhubSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DriftSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DriftSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DriftSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Dv360Source": [[14, 0, 1, "", "Oauth2Credentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Dv360Source.Oauth2Credentials": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.E2eTestSource": [[14, 0, 1, "", "MultiSchema"], [14, 0, 1, "", "SingleSchema"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.E2eTestSource.MultiSchema": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.E2eTestSource.SingleSchema": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource": [[14, 0, 1, "", "ApiKeySecret"], [14, 0, 1, "", "None_"], [14, 0, 1, "", "UsernamePassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource.ApiKeySecret": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource.None_": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource.UsernamePassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ExchangeRatesSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FacebookMarketingSource": [[14, 0, 1, "", "InsightConfig"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FacebookMarketingSource.InsightConfig": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FacebookPagesSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FakerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource": [[14, 0, 1, "", "Collection"], [14, 0, 1, "", "Disabled"], [14, 0, 1, "", "Enabled"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource.Collection": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource.Disabled": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource.Enabled": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource": [[14, 0, 1, "", "AzBlobAzureBlobStorage"], [14, 0, 1, "", "GCSGoogleCloudStorage"], [14, 
0, 1, "", "HTTPSPublicWeb"], [14, 0, 1, "", "S3AmazonWebServices"], [14, 0, 1, "", "SCPSecureCopyProtocol"], [14, 0, 1, "", "SFTPSecureFileTransferProtocol"], [14, 0, 1, "", "SSHSecureShell"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.AzBlobAzureBlobStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.GCSGoogleCloudStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.HTTPSPublicWeb": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.S3AmazonWebServices": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.SCPSecureCopyProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.SFTPSecureFileTransferProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.SSHSecureShell": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource": [[14, 0, 1, "", "AzBlobAzureBlobStorage"], [14, 0, 1, "", "GCSGoogleCloudStorage"], [14, 0, 1, "", "HTTPSPublicWeb"], [14, 0, 1, "", "LocalFilesystemLimited"], [14, 0, 1, "", "S3AmazonWebServices"], [14, 0, 1, "", "SCPSecureCopyProtocol"], [14, 0, 1, "", "SFTPSecureFileTransferProtocol"], [14, 0, 1, "", "SSHSecureShell"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.AzBlobAzureBlobStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.GCSGoogleCloudStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.HTTPSPublicWeb": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.LocalFilesystemLimited": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.S3AmazonWebServices": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.FileSource.SCPSecureCopyProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.SFTPSecureFileTransferProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.SSHSecureShell": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FireboltSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FlexportSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshcallerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshdeskSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshsalesSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshserviceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GithubSource": [[14, 0, 1, "", "OAuthCredentials"], [14, 0, 1, "", "PATCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GithubSource.OAuthCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GithubSource.PATCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GitlabSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GlassfrogSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GocardlessSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAdsSource": [[14, 0, 1, "", "CustomGAQLQueriesEntry"], [14, 0, 1, "", "GoogleCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAdsSource.CustomGAQLQueriesEntry": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAdsSource.GoogleCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource": [[14, 0, 1, "", 
"AuthenticateViaGoogleOauth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source": [[14, 0, 1, "", "AuthenticateViaGoogleOauth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleDirectorySource": [[14, 0, 1, "", "ServiceAccountKey"], [14, 0, 1, "", "SignInViaGoogleOAuth"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleDirectorySource.ServiceAccountKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleDirectorySource.SignInViaGoogleOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource": [[14, 0, 1, "", "OAuth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource.OAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSheetsSource": [[14, 0, 1, "", "AuthenticateViaGoogleOAuth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSheetsSource.AuthenticateViaGoogleOAuth": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.GoogleSheetsSource.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleWorkspaceAdminReportsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GreenhouseSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GutendexSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HarvestSource": [[14, 0, 1, "", "AuthenticateViaHarvestOAuth"], [14, 0, 1, "", "AuthenticateWithPersonalAccessToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HarvestSource.AuthenticateViaHarvestOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HarvestSource.AuthenticateWithPersonalAccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HellobatonSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubplannerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource": [[14, 0, 1, "", "APIKey"], [14, 0, 1, "", "OAuth"], [14, 0, 1, "", "PrivateAPP"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource.APIKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource.OAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource.PrivateAPP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.InsightlySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.InstagramSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.IntercomSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.IterableSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.JdbcSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.JiraSource": 
[[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource": [[14, 0, 1, "", "AVRO"], [14, 0, 1, "", "JSON"], [14, 0, 1, "", "ManuallyAssignAListOfPartitions"], [14, 0, 1, "", "PLAINTEXT"], [14, 0, 1, "", "SASLPLAINTEXT"], [14, 0, 1, "", "SASLSSL"], [14, 0, 1, "", "SubscribeToAllTopicsMatchingSpecifiedPattern"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.AVRO": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.JSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.ManuallyAssignAListOfPartitions": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.PLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.SASLPLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.SASLSSL": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KlaviyoSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KustomerSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KyribaSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LemlistSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LeverHiringSource": [[14, 0, 1, "", "OAuthCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LeverHiringSource.OAuthCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinAdsSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinAdsSource.AccessToken": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.LinkedinAdsSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinPagesSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinPagesSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinPagesSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinnworksSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LookerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailchimpSource": [[14, 0, 1, "", "APIKey"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailchimpSource.APIKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailchimpSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailgunSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MarketoSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MetabaseSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource": [[14, 0, 1, "", "AuthenticateViaMicrosoft"], [14, 0, 1, "", "AuthenticateViaMicrosoftOAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource.AuthenticateViaMicrosoft": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MixpanelSource": [[14, 0, 1, "", "ProjectSecret"], [14, 0, 1, "", "ServiceAccount"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MixpanelSource.ProjectSecret": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.MixpanelSource.ServiceAccount": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MondaySource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MondaySource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MondaySource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source": [[14, 0, 1, "", "MongoDBAtlas"], [14, 0, 1, "", "ReplicaSet"], [14, 0, 1, "", "StandaloneMongoDbInstance"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source.MongoDBAtlas": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source.ReplicaSet": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source.StandaloneMongoDbInstance": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource": [[14, 0, 1, "", "EncryptedTrustServerCertificate"], [14, 0, 1, "", "EncryptedVerifyCertificate"], [14, 0, 1, "", "LogicalReplicationCDC"], [14, 0, 1, "", "Standard"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.EncryptedTrustServerCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.EncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.LogicalReplicationCDC": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MyHoursSource": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.MysqlSource": [[14, 0, 1, "", "LogicalReplicationCDC"], [14, 0, 1, "", "Preferred"], [14, 0, 1, "", "Required"], [14, 0, 1, "", "Standard"], [14, 0, 1, "", "VerifyCA"], [14, 0, 1, "", "VerifyIdentity"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.LogicalReplicationCDC": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.Preferred": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.Required": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.VerifyCA": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.VerifyIdentity": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.NetsuiteSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.NotionSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.NotionSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.NotionSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OktaSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OktaSource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OktaSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OnesignalSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OpenweatherSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource": [[14, 0, 1, "", "NativeNetworkEncryptionNNE"], [14, 0, 1, "", "ServiceName"], [14, 0, 1, "", "SystemIDSID"], [14, 0, 1, "", 
"TLSEncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.NativeNetworkEncryptionNNE": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.ServiceName": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.SystemIDSID": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.TLSEncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OrbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OrbitSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OutreachSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PardotSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PaypalTransactionSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PaystackSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PersistiqSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PinterestSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PinterestSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PinterestSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PipedriveSource": [[14, 0, 1, "", "APIKeyAuthentication"], [14, 0, 1, "", "SignInViaPipedriveOAuth"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PipedriveSource.APIKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PipedriveSource.SignInViaPipedriveOAuth": [[14, 3, 1, "", 
"__init__"]], "dagster_airbyte.managed.generated.sources.PivotalTrackerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PlaidSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PokeapiSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource": [[14, 0, 1, "", "Allow"], [14, 0, 1, "", "Disable"], [14, 0, 1, "", "LogicalReplicationCDC"], [14, 0, 1, "", "NoTunnel"], [14, 0, 1, "", "PasswordAuthentication"], [14, 0, 1, "", "Prefer"], [14, 0, 1, "", "Require"], [14, 0, 1, "", "SSHKeyAuthentication"], [14, 0, 1, "", "Standard"], [14, 0, 1, "", "VerifyCa"], [14, 0, 1, "", "VerifyFull"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Allow": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Disable": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.LogicalReplicationCDC": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.NoTunnel": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.PasswordAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Prefer": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Require": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.SSHKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.VerifyCa": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.VerifyFull": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PosthogSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PrestashopSource": [[14, 
3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PrimetricSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PythonHttpTutorialSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.QualarooSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.QuickbooksSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RechargeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RecurlySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RedshiftSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RetentlySource": [[14, 0, 1, "", "AuthenticateViaRetentlyOAuth"], [14, 0, 1, "", "AuthenticateWithAPIToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RetentlySource.AuthenticateViaRetentlyOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RetentlySource.AuthenticateWithAPIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RkiCovidSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source": [[14, 0, 1, "", "Avro"], [14, 0, 1, "", "CSV"], [14, 0, 1, "", "Jsonl"], [14, 0, 1, "", "Parquet"], [14, 0, 1, "", "S3AmazonWebServices"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.Avro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.CSV": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.Jsonl": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.Parquet": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.S3AmazonWebServices": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SalesforceSource": [[14, 0, 1, "", "FilterSalesforceObjectsEntry"], [14, 3, 1, "", 
"__init__"]], "dagster_airbyte.managed.generated.sources.SalesforceSource.FilterSalesforceObjectsEntry": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SalesloftSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ScaffoldJavaJdbcSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ScaffoldSourceHttpSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ScaffoldSourcePythonSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SearchMetricsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SendgridSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SentrySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SftpSource": [[14, 0, 1, "", "PasswordAuthentication"], [14, 0, 1, "", "SSHKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SftpSource.PasswordAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SftpSource.SSHKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShopifySource": [[14, 0, 1, "", "APIPassword"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShopifySource.APIPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShopifySource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShortioSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SlackSource": [[14, 0, 1, "", "APITokenCredentials"], [14, 0, 1, "", "DefaultOAuth20Authorization"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SlackSource.APITokenCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SlackSource.DefaultOAuth20Authorization": [[14, 3, 1, "", 
"__init__"]], "dagster_airbyte.managed.generated.sources.SmartsheetsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnapchatMarketingSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnowflakeSource": [[14, 0, 1, "", "OAuth20"], [14, 0, 1, "", "UsernameAndPassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnowflakeSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnowflakeSource.UsernameAndPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SquareSource": [[14, 0, 1, "", "APIKey"], [14, 0, 1, "", "OauthAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SquareSource.APIKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SquareSource.OauthAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.StockTickerApiTutorialSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.StravaSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.StripeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SurveymonkeySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TalkdeskExploreSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TempoSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TidbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TiktokMarketingSource": [[14, 0, 1, "", "OAuth20"], [14, 0, 1, "", "SandboxAccessToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TiktokMarketingSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TiktokMarketingSource.SandboxAccessToken": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.TimelySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TplcentralSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TrelloSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TwilioSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TypeformSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.UsCensusSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WebflowSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WhiskyHunterSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WoocommerceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WrikeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.YahooFinancePriceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.YandexMetricaSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.YoutubeAnalyticsSource": [[14, 0, 1, "", "AuthenticateViaOAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.YoutubeAnalyticsSource.AuthenticateViaOAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskChatSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskChatSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskChatSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSunshineSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSunshineSource.APIToken": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.ZendeskSunshineSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSupportSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSupportSource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSupportSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskTalkSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskTalkSource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskTalkSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZenefitsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZenloopSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZohoCrmSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZoomSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZuoraSource": [[14, 3, 1, "", "__init__"]], "dagster_airflow": [[15, 0, 1, "", "DagsterCloudOperator"], [15, 0, 1, "", "DagsterOperator"], [15, 6, 1, "", "load_assets_from_airflow_dag"], [15, 6, 1, "", "make_dagster_definitions_from_airflow_dag_bag"], [15, 6, 1, "", "make_dagster_definitions_from_airflow_dags_path"], [15, 6, 1, "", "make_dagster_job_from_airflow_dag"], [15, 6, 1, "", "make_ephemeral_airflow_db_resource"], [15, 6, 1, "", "make_persistent_airflow_db_resource"], [15, 6, 1, "", "make_schedules_and_jobs_from_airflow_dag_bag"]], "dagster_aws.cloudwatch": [[16, 5, 1, "", "cloudwatch_logger"]], "dagster_aws.ecs": [[16, 5, 1, "", "EcsRunLauncher"]], "dagster_aws.emr": [[16, 5, 1, "", "EmrClusterState"], [16, 0, 1, "", "EmrError"], [16, 0, 1, "", "EmrJobRunner"], 
[16, 5, 1, "", "EmrStepState"], [16, 5, 1, "", "emr_pyspark_step_launcher"]], "dagster_aws.redshift": [[16, 5, 1, "", "FakeRedshiftClientResource"], [16, 5, 1, "", "RedshiftClientResource"], [16, 5, 1, "", "fake_redshift_resource"], [16, 5, 1, "", "redshift_resource"]], "dagster_aws.s3": [[16, 5, 1, "", "ConfigurablePickledObjectS3IOManager"], [16, 0, 1, "", "S3ComputeLogManager"], [16, 5, 1, "", "S3Coordinate"], [16, 0, 1, "", "S3FileHandle"], [16, 5, 1, "", "S3FileManagerResource"], [16, 5, 1, "", "S3PickleIOManager"], [16, 5, 1, "", "S3Resource"], [16, 5, 1, "", "s3_file_manager"], [16, 5, 1, "", "s3_pickle_io_manager"], [16, 5, 1, "", "s3_resource"]], "dagster_aws.secretsmanager": [[16, 5, 1, "", "SecretsManagerResource"], [16, 5, 1, "", "SecretsManagerSecretsResource"], [16, 5, 1, "", "secretsmanager_resource"], [16, 5, 1, "", "secretsmanager_secrets_resource"]], "dagster_azure.adls2": [[17, 0, 1, "", "ADLS2FileHandle"], [17, 5, 1, "", "ADLS2PickleIOManager"], [17, 5, 1, "", "ADLS2Resource"], [17, 5, 1, "", "ConfigurablePickledObjectADLS2IOManager"], [17, 5, 1, "", "FakeADLS2Resource"], [17, 5, 1, "", "adls2_file_manager"], [17, 5, 1, "", "adls2_pickle_io_manager"], [17, 5, 1, "", "adls2_resource"]], "dagster_azure.blob": [[17, 0, 1, "", "AzureBlobComputeLogManager"]], "dagster_celery": [[18, 5, 1, "", "celery_executor"]], "dagster_celery_docker": [[19, 5, 1, "", "celery_docker_executor"]], "dagster_celery_k8s": [[20, 5, 1, "", "CeleryK8sRunLauncher"], [20, 5, 1, "", "celery_k8s_job_executor"]], "dagster_census": [[21, 0, 1, "", "CensusOutput"], [21, 0, 1, "", "CensusResource"], [21, 5, 1, "", "census_resource"], [21, 5, 1, "", "census_trigger_sync_op"]], "dagster_census.CensusOutput": [[21, 2, 1, "", "destination"], [21, 2, 1, "", "source"], [21, 2, 1, "", "sync_run"]], "dagster_dask": [[22, 5, 1, "", "dask_executor"]], "dagster_databricks": [[23, 0, 1, "", "DatabricksClient"], [23, 5, 1, "", "DatabricksClientResource"], [23, 0, 1, "", "DatabricksError"], 
[23, 6, 1, "", "create_databricks_run_now_op"], [23, 6, 1, "", "create_databricks_submit_run_op"], [23, 5, 1, "", "databricks_client"], [23, 5, 1, "", "databricks_pyspark_step_launcher"]], "dagster_databricks.DatabricksClient": [[23, 1, 1, "", "api_client"], [23, 1, 1, "", "client"], [23, 1, 1, "", "workspace_client"]], "dagster_datadog": [[24, 5, 1, "", "DatadogResource"], [24, 5, 1, "", "datadog_resource"]], "dagster_datahub": [[25, 5, 1, "", "DatahubKafkaEmitterResource"], [25, 5, 1, "", "DatahubRESTEmitterResource"], [25, 5, 1, "", "datahub_kafka_emitter"], [25, 5, 1, "", "datahub_rest_emitter"]], "dagster_dbt": [[26, 4, 1, "", "DagsterDbtCliFatalRuntimeError"], [26, 4, 1, "", "DagsterDbtCliHandledRuntimeError"], [26, 4, 1, "", "DagsterDbtCliOutputsNotFoundError"], [26, 4, 1, "", "DagsterDbtCliRuntimeError"], [26, 4, 1, "", "DagsterDbtCliUnexpectedOutputError"], [26, 4, 1, "", "DagsterDbtError"], [26, 0, 1, "", "DagsterDbtTranslator"], [26, 0, 1, "", "DbtCliEventMessage"], [26, 0, 1, "", "DbtCliInvocation"], [26, 0, 1, "", "DbtCliOutput"], [26, 0, 1, "", "DbtCliResource"], [26, 0, 1, "", "DbtCloudClientResource"], [26, 0, 1, "", "DbtManifestAssetSelection"], [26, 0, 1, "", "DbtOutput"], [26, 0, 1, "", "DbtResource"], [26, 6, 1, "", "build_dbt_asset_selection"], [26, 6, 1, "", "build_schedule_from_dbt_selection"], [26, 6, 1, "", "dbt_assets"], [26, 5, 1, "", "dbt_cli_resource"], [26, 5, 1, "", "dbt_cloud_resource"], [26, 5, 1, "", "dbt_cloud_run_op"], [26, 6, 1, "", "dbt_compile_op"], [26, 6, 1, "", "dbt_docs_generate_op"], [26, 6, 1, "", "dbt_ls_op"], [26, 5, 1, "", "dbt_run_op"], [26, 6, 1, "", "dbt_seed_op"], [26, 6, 1, "", "dbt_snapshot_op"], [26, 6, 1, "", "dbt_test_op"], [26, 6, 1, "", "default_group_from_dbt_resource_props"], [26, 6, 1, "", "default_metadata_from_dbt_resource_props"], [26, 6, 1, "", "get_asset_key_for_model"], [26, 6, 1, "", "get_asset_key_for_source"], [26, 6, 1, "", "get_asset_keys_by_output_name_for_source"], [26, 6, 1, "", 
"group_from_dbt_resource_props_fallback_to_directory"], [26, 6, 1, "", "load_assets_from_dbt_cloud_job"], [26, 6, 1, "", "load_assets_from_dbt_manifest"], [26, 6, 1, "", "load_assets_from_dbt_project"]], "dagster_dbt.DagsterDbtCliUnexpectedOutputError": [[26, 2, 1, "", "invalid_line_nos"]], "dagster_dbt.DagsterDbtTranslator": [[26, 3, 1, "", "get_asset_key"], [26, 3, 1, "", "get_auto_materialize_policy"], [26, 3, 1, "", "get_description"], [26, 3, 1, "", "get_freshness_policy"], [26, 3, 1, "", "get_group_name"], [26, 3, 1, "", "get_metadata"]], "dagster_dbt.DbtCliEventMessage": [[26, 3, 1, "", "to_default_asset_events"]], "dagster_dbt.DbtCliInvocation": [[26, 3, 1, "", "get_artifact"], [26, 3, 1, "", "is_successful"], [26, 3, 1, "", "stream"], [26, 3, 1, "", "stream_raw_events"], [26, 3, 1, "", "wait"]], "dagster_dbt.DbtCliOutput": [[26, 2, 1, "", "command"], [26, 2, 1, "", "docs_url"], [26, 2, 1, "", "logs"], [26, 2, 1, "", "raw_output"], [26, 2, 1, "", "result"], [26, 2, 1, "", "return_code"]], "dagster_dbt.DbtCliResource": [[26, 3, 1, "", "cli"], [26, 2, 1, "", "global_config_flags"], [26, 2, 1, "", "profile"], [26, 2, 1, "", "profiles_dir"], [26, 2, 1, "", "project_dir"], [26, 2, 1, "", "target"]], "dagster_dbt.utils": [[26, 6, 1, "", "generate_materializations"]], "dagster_docker": [[27, 5, 1, "", "DockerRunLauncher"], [27, 5, 1, "", "docker_container_op"], [27, 5, 1, "", "docker_executor"], [27, 6, 1, "", "execute_docker_container"]], "dagster_duckdb": [[28, 5, 1, "", "DuckDBIOManager"], [28, 5, 1, "", "DuckDBResource"], [28, 5, 1, "", "build_duckdb_io_manager"]], "dagster_duckdb_pandas": [[29, 5, 1, "", "DuckDBPandasIOManager"], [29, 0, 1, "", "DuckDBPandasTypeHandler"], [29, 5, 1, "", "duckdb_pandas_io_manager"]], "dagster_duckdb_polars": [[30, 5, 1, "", "DuckDBPolarsIOManager"], [30, 0, 1, "", "DuckDBPolarsTypeHandler"], [30, 5, 1, "", "duckdb_polars_io_manager"]], "dagster_duckdb_pyspark": [[31, 5, 1, "", "DuckDBPySparkIOManager"], [31, 0, 1, "", 
"DuckDBPySparkTypeHandler"], [31, 5, 1, "", "duckdb_pyspark_io_manager"]], "dagster_embedded_elt.sling": [[32, 0, 1, "", "SlingResource"], [32, 6, 1, "", "build_sling_asset"]], "dagster_embedded_elt.sling.resources": [[32, 0, 1, "", "SlingSourceConnection"], [32, 0, 1, "", "SlingTargetConnection"]], "dagster_fivetran": [[33, 5, 1, "", "FivetranResource"], [33, 6, 1, "", "build_fivetran_assets"], [33, 5, 1, "", "fivetran_resource"], [33, 5, 1, "", "fivetran_sync_op"], [33, 6, 1, "", "load_assets_from_fivetran_instance"]], "dagster_gcp": [[34, 0, 1, "", "BigQueryError"], [34, 5, 1, "", "BigQueryIOManager"], [34, 5, 1, "", "BigQueryResource"], [34, 5, 1, "", "ConfigurablePickledObjectGCSIOManager"], [34, 5, 1, "", "DataprocResource"], [34, 0, 1, "", "GCSFileHandle"], [34, 5, 1, "", "GCSFileManagerResource"], [34, 5, 1, "", "GCSPickleIOManager"], [34, 5, 1, "", "GCSResource"], [34, 5, 1, "", "bigquery_resource"], [34, 6, 1, "", "bq_create_dataset"], [34, 6, 1, "", "bq_delete_dataset"], [34, 6, 1, "", "bq_op_for_queries"], [34, 5, 1, "", "build_bigquery_io_manager"], [34, 5, 1, "", "dataproc_op"], [34, 5, 1, "", "dataproc_resource"], [34, 5, 1, "", "gcs_file_manager"], [34, 5, 1, "", "gcs_pickle_io_manager"], [34, 5, 1, "", "gcs_resource"], [34, 6, 1, "", "import_df_to_bq"], [34, 6, 1, "", "import_file_to_bq"], [34, 6, 1, "", "import_gcs_paths_to_bq"]], "dagster_gcp.gcs": [[34, 0, 1, "", "GCSComputeLogManager"]], "dagster_gcp_pandas": [[35, 5, 1, "", "BigQueryPandasIOManager"], [35, 0, 1, "", "BigQueryPandasTypeHandler"], [35, 5, 1, "", "bigquery_pandas_io_manager"]], "dagster_gcp_pyspark": [[36, 5, 1, "", "BigQueryPySparkIOManager"], [36, 0, 1, "", "BigQueryPySparkTypeHandler"], [36, 5, 1, "", "bigquery_pyspark_io_manager"]], "dagster_ge": [[37, 6, 1, "", "ge_validation_op_factory"]], "dagster_github": [[38, 5, 1, "", "GithubResource"], [38, 5, 1, "", "github_resource"]], "dagster_graphql": [[39, 0, 1, "", "DagsterGraphQLClient"], [39, 4, 1, "", 
"DagsterGraphQLClientError"], [39, 0, 1, "", "InvalidOutputErrorInfo"], [39, 0, 1, "", "ReloadRepositoryLocationInfo"], [39, 0, 1, "", "ReloadRepositoryLocationStatus"]], "dagster_graphql.DagsterGraphQLClient": [[39, 3, 1, "", "get_run_status"], [39, 3, 1, "", "reload_repository_location"], [39, 3, 1, "", "shutdown_repository_location"], [39, 3, 1, "", "submit_job_execution"]], "dagster_k8s": [[40, 5, 1, "", "K8sRunLauncher"], [40, 6, 1, "", "execute_k8s_job"], [40, 5, 1, "", "k8s_job_executor"], [40, 5, 1, "", "k8s_job_op"]], "dagster_mlflow": [[41, 5, 1, "", "end_mlflow_on_run_finished"], [41, 5, 1, "", "mlflow_tracking"]], "dagster_msteams": [[42, 5, 1, "", "MSTeamsResource"], [42, 6, 1, "", "make_teams_on_run_failure_sensor"], [42, 5, 1, "", "msteams_resource"], [42, 5, 1, "", "teams_on_failure"], [42, 5, 1, "", "teams_on_success"]], "dagster_mysql": [[43, 0, 1, "", "MySQLEventLogStorage"], [43, 0, 1, "", "MySQLRunStorage"], [43, 0, 1, "", "MySQLScheduleStorage"]], "dagster_pagerduty": [[44, 5, 1, "", "PagerDutyService"], [44, 5, 1, "", "pagerduty_resource"]], "dagster_pandas": [[45, 5, 1, "", "DataFrame"], [45, 0, 1, "", "PandasColumn"], [45, 0, 1, "", "RowCountConstraint"], [45, 0, 1, "", "StrictColumnsConstraint"], [45, 6, 1, "", "create_dagster_pandas_dataframe_type"]], "dagster_pandera": [[46, 6, 1, "", "pandera_schema_to_dagster_type"]], "dagster_papertrail": [[47, 5, 1, "", "papertrail_logger"]], "dagster_postgres": [[48, 5, 1, "", "PostgresEventLogStorage"], [48, 5, 1, "", "PostgresRunStorage"], [48, 5, 1, "", "PostgresScheduleStorage"]], "dagster_prometheus": [[49, 5, 1, "", "PrometheusResource"], [49, 5, 1, "", "prometheus_resource"]], "dagster_prometheus.resources": [[49, 0, 1, "", "PrometheusClient"]], "dagster_pyspark": [[50, 5, 1, "", "PySparkResource"], [50, 5, 1, "", "pyspark_resource"]], "dagster_shell": [[51, 6, 1, "", "create_shell_command_op"], [51, 6, 1, "", "create_shell_script_op"], [51, 6, 1, "", "execute_shell_command"], [51, 6, 1, "", 
"execute_shell_script"], [51, 6, 1, "", "shell_op"]], "dagster_slack": [[52, 5, 1, "", "SlackResource"], [52, 6, 1, "", "make_slack_on_freshness_policy_status_change_sensor"], [52, 6, 1, "", "make_slack_on_run_failure_sensor"], [52, 5, 1, "", "slack_on_failure"], [52, 5, 1, "", "slack_on_success"], [52, 5, 1, "", "slack_resource"]], "dagster_snowflake": [[53, 0, 1, "", "SnowflakeConnection"], [53, 5, 1, "", "SnowflakeIOManager"], [53, 5, 1, "", "SnowflakeResource"], [53, 5, 1, "", "build_snowflake_io_manager"], [53, 6, 1, "", "snowflake_op_for_query"], [53, 5, 1, "", "snowflake_resource"]], "dagster_snowflake.SnowflakeConnection": [[53, 3, 1, "", "execute_queries"], [53, 3, 1, "", "execute_query"], [53, 3, 1, "", "get_connection"], [53, 3, 1, "", "load_table_from_local_parquet"]], "dagster_snowflake_pandas": [[54, 5, 1, "", "SnowflakePandasIOManager"], [54, 0, 1, "", "SnowflakePandasTypeHandler"], [54, 5, 1, "", "snowflake_pandas_io_manager"]], "dagster_snowflake_pyspark": [[55, 5, 1, "", "SnowflakePySparkIOManager"], [55, 0, 1, "", "SnowflakePySparkTypeHandler"], [55, 5, 1, "", "snowflake_pyspark_io_manager"]], "dagster_spark": [[56, 0, 1, "", "SparkOpError"], [56, 6, 1, "", "construct_spark_shell_command"], [56, 6, 1, "", "create_spark_op"], [56, 6, 1, "", "define_spark_config"], [56, 5, 1, "", "spark_resource"]], "dagster_ssh": [[57, 5, 1, "", "ssh_resource"]], "dagster_twilio": [[58, 5, 1, "", "TwilioResource"], [58, 5, 1, "", "twilio_resource"]], "dagster_wandb": [[59, 0, 1, "", "SerializationModule"], [59, 0, 1, "", "WandbArtifactConfiguration"], [59, 4, 1, "", "WandbArtifactsIOManagerError"], [59, 6, 1, "", "run_launch_agent"], [59, 6, 1, "", "run_launch_job"], [59, 5, 1, "", "wandb_artifacts_io_manager"], [59, 5, 1, "", "wandb_resource"]], "dagstermill": [[60, 0, 1, "", "ConfigurableLocalOutputNotebookIOManager"], [60, 0, 1, "", "DagstermillError"], [60, 0, 1, "", "DagstermillExecutionContext"], [60, 6, 1, "", "define_dagstermill_asset"], [60, 6, 1, "", 
"define_dagstermill_op"], [60, 6, 1, "", "get_context"], [60, 6, 1, "", "yield_event"], [60, 6, 1, "", "yield_result"]], "dagstermill.DagstermillExecutionContext": [[60, 1, 1, "", "job_def"], [60, 1, 1, "", "job_name"], [60, 1, 1, "", "logging_tags"], [60, 1, 1, "", "op_config"], [60, 1, 1, "", "op_def"], [60, 1, 1, "", "run"], [60, 1, 1, "", "run_config"], [60, 1, 1, "", "run_id"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "property", "Python property"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "method", "Python method"], "4": ["py", "exception", "Python exception"], "5": ["py", "data", "Python data"], "6": ["py", "function", "Python function"], "7": ["py", "module", "Python module"], "8": ["std", "cmdoption", "program option"]}, "objtypes": {"0": "py:class", "1": "py:property", "2": "py:attribute", "3": "py:method", "4": "py:exception", "5": "py:data", "6": "py:function", "7": "py:module", "8": "std:cmdoption"}, "terms": {"0": [1, 2, 3, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 21, 23, 26, 33, 34, 37, 40, 42, 45, 50, 52, 59, 61, 63, 64, 65, 67, 69], "00": [14, 64, 67], "000z": 14, "00z": 14, "01": [12, 14, 26, 64, 67], "0123456789abcdef0123456789abcdef": 44, "01t00": 14, "01t13": 14, "02": [64, 67], "03": [14, 64, 67], "04": [40, 64, 67], "05": [64, 67], "06": [12, 14, 40, 64, 67], "07": [64, 67], "08": [8, 18], "09": 14, "1": [2, 3, 4, 5, 8, 9, 11, 13, 14, 15, 16, 22, 23, 26, 34, 37, 40, 50, 53, 59, 63, 64, 65, 67, 68, 69], "10": [11, 14, 16, 21, 23, 26, 33, 34, 46, 50, 57, 64, 67], "100": [14, 16, 34, 50, 64], "1000": [14, 63], "10000": 65, "1000000": 14, "1001": 24, "1035": 34, "11": [14, 23, 53, 64, 67], "11000": 65, "12": [14, 37, 53, 64, 67], "1200": 34, "123": 64, "1234": [4, 23, 24, 26], "127": [3, 40], "13": [53, 64, 67], "13t20": 14, "14": [14, 18, 64, 67], "15": [14, 16, 50, 64, 67], "15000": 3, "1521": 14, "15mb": 14, "16": [64, 67], "17": [14, 40], "18": [20, 40], "180": 14, "19": [64, 67], "1bf2": 59, "1m": [16, 
50], "2": [2, 3, 4, 5, 9, 11, 12, 13, 14, 15, 16, 17, 23, 26, 34, 40, 42, 50, 52, 53, 59, 63, 64, 65, 67, 69], "20": [8, 14, 34, 40, 64, 67], "200": 14, "2000": 14, "20000": [16, 50], "200m": [16, 50], "2017": [14, 18], "2018": 14, "2020": [14, 40, 64, 67], "2021": 14, "2022": [14, 26, 64, 67], "2023": [8, 12, 26, 64], "2048m": [16, 50], "20t00": 14, "21": [8, 23, 26], "21t21": 40, "22": [8, 57], "2200": 23, "23": [8, 67], "2344535": 14, "2344535_sb1": 14, "24": [8, 14, 64], "2484": 14, "24t03": 14, "25": [8, 14, 21, 26, 33, 67], "2546": [16, 50], "25t00": 14, "26": [8, 14, 24, 64, 67], "27": [64, 67], "28000m": 23, "29": 34, "2auto": 34, "2g": [16, 50], "2gb": [16, 50], "3": [4, 8, 9, 12, 13, 14, 21, 23, 26, 33, 40, 53, 59, 63, 64, 65, 67, 68], "30": [2, 14, 16, 17, 34, 49, 57, 59, 67], "300": 39, "3000": [3, 14, 39, 42, 52], "300mb": [16, 50], "30z": 14, "31": [14, 64], "32": 34, "3333": 3, "3339": 14, "360": 14, "3600": 3, "364": 14, "39": 14, "4": [8, 18, 19, 20, 22, 23, 34, 53, 63], "420e": 14, "44b5": 59, "45": [23, 67], "465": 69, "4815": [16, 50], "5": [1, 8, 9, 12, 14, 16, 17, 22, 23, 34, 50, 61, 63, 64, 67, 68, 69], "50": [14, 16, 50], "5000": 41, "500gb": 34, "500m": [16, 50], "512m": [16, 50], "5432": [11, 15, 48], "54321": 26, "5439": 16, "5672": 18, "587": 69, "6": [16, 40, 50, 64, 67], "60": [11, 14, 26, 39, 42, 53, 59], "60000": 14, "63": 34, "6313": [16, 50], "6379": 40, "64": 23, "7": [34, 38, 40], "77777": 26, "7e4df022": 59, "8": [51, 59], "8000": 14, "8601": 14, "86400": [20, 23], "87b7fe85": 14, "8d74": 14, "9": [14, 26, 34, 64, 67], "90": 14, "914": 14, "95590a": 40, "999": 24, "9am": 2, "A": [1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 26, 27, 32, 33, 34, 35, 36, 39, 44, 45, 47, 50, 51, 53, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "AND": 11, "As": [5, 23, 24, 39, 45, 68], "At": [2, 16, 50, 67], "But": [16, 50], "By": [2, 4, 8, 9, 11, 14, 16, 17, 18, 19, 20, 21, 23, 26, 27, 33, 34, 38, 40, 42, 50, 52, 53, 59, 67], 
"For": [2, 3, 4, 8, 9, 11, 12, 13, 14, 15, 16, 20, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 42, 45, 50, 53, 54, 55, 59, 63, 64, 66, 67, 68], "IF": 53, "INTO": 14, "If": [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39, 40, 42, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "In": [2, 5, 8, 9, 11, 12, 14, 16, 18, 19, 20, 26, 28, 29, 30, 31, 34, 35, 36, 40, 45, 50, 51, 53, 54, 55, 60, 63, 65, 67, 68], "Ins": 51, "It": [1, 2, 5, 7, 8, 14, 16, 21, 23, 26, 33, 34, 45, 50, 51, 52, 59, 60, 68], "Its": [2, 16, 50], "No": 34, "Not": [3, 14, 16, 50], "ONE": 3, "OR": 4, "On": [14, 34, 40], "One": [7, 14, 26], "Or": [5, 8, 11], "Such": 69, "That": 2, "The": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 32, 33, 34, 35, 36, 39, 40, 42, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "Then": [14, 18, 19, 20, 27, 40, 52], "There": [5, 7, 14, 34, 42, 44, 64], "These": [1, 2, 5, 6, 8, 9, 10, 11, 13, 16, 23, 40, 45, 50, 60, 63, 66, 67, 68], "To": [2, 8, 9, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 39, 40, 42, 44, 48, 50, 52, 53, 54, 55, 59, 61, 66, 67, 69], "Will": [8, 14, 20, 34, 40, 64], "With": [16, 17, 20, 34], "_": [2, 4, 6, 12, 14, 34, 45, 60, 68, 69], "__executor_name__": 8, "__fieldvaluesentinel": 4, "__file__": [6, 51, 69], "__init__": [12, 14, 65, 68], "__input_name__": 8, "__logger_name__": 8, "__main__": 18, "__name__": [18, 68], "__op_name__": 8, "__resource_name__": 8, "_add_on": 63, "_asset_selection_data": [8, 9], "_assets_def": 8, "_clean": 6, "_clean_nam": [14, 33], "_cm_scope_ent": 67, "_config": 4, "_connect": 12, "_construct_job_def_from_yaml_fil": 65, "_context": [45, 63, 68], "_core": [2, 6, 11, 13, 14, 26, 27, 33, 34, 37, 40, 45, 63, 67], "_databricks_run_now_op": 23, "_databricks_submit_run_op": 23, 
"_default_failure_email_bodi": 69, "_default_failure_email_subject": 69, "_default_failure_messag": 42, "_default_failure_message_text_fn": 52, "_default_freshness_message_text_fn": 52, "_get_path": 12, "_graph_": 9, "_job": 13, "_kei": 4, "_kwarg": 11, "_logger": 61, "_required_resource_kei": 2, "_resourc": 67, "_s3_bucket": 68, "_s3_kei": 68, "_schedul": 67, "_serd": 11, "_seven": [16, 17, 34], "_subset_selection_data": [8, 13], "_util": 69, "_was_explicitly_provided_resourc": [8, 13], "_yaml_directori": 65, "_yaml_file_for_job_nam": 65, "a22c": 14, "a383": 59, "a_downstream": 64, "a_job": 5, "a_multi_asset": 8, "a_op": 42, "a_prefix": [16, 25, 38, 66], "a_repo": 5, "a_resourc": 5, "a_schedul": 5, "a_sensor": 5, "a_str": 4, "abc": [2, 64], "abcdef": 16, "abil": [11, 63], "abl": [4, 5, 8, 13, 14, 16, 18, 19, 20, 23, 50], "abort": [16, 50], "aborttransact": 14, "about": [1, 2, 3, 6, 8, 9, 11, 12, 14, 16, 21, 26, 33, 39, 50, 52, 60, 63], "abov": [14, 16, 23, 28, 29, 30, 31, 34, 35, 36, 38, 40, 50, 53, 54, 55, 63], "absolut": [16, 23, 50], "abstract": [11, 12, 16, 17, 26, 34, 62, 63, 64, 65], "abstractset": [2, 9, 10, 15, 63, 68], "acceler": 34, "accept": [1, 2, 4, 7, 8, 11, 12, 13, 16, 23, 45, 47, 50, 60, 61, 63, 64, 66, 67], "accept_term": 14, "access": [2, 3, 7, 8, 11, 12, 14, 16, 17, 23, 25, 26, 34, 38, 41, 44, 45, 50, 53, 54, 55, 58, 60, 64, 65, 66, 68], "access_kei": 14, "access_key_id": 14, "access_key_kei": 23, "access_token": 14, "accesskei": 14, "accesstoken": 14, "accident": 2, "accord": [2, 4, 7, 11, 16, 34, 50], "accordingli": [16, 50], "account": [14, 16, 17, 20, 23, 24, 26, 34, 35, 36, 38, 40, 47, 50, 53, 54, 55, 58], "account_id": [14, 26, 34], "account_manag": 24, "account_nam": 17, "account_sid": [14, 58], "acct": 14, "accur": [16, 50], "accuraci": 14, "accurateblockthreshold": [16, 50], "ach": 27, "achiev": [8, 16, 50], "ack": [14, 16, 50], "acknowledg": 14, "acl": 23, "acquir": [14, 23], "across": [2, 6, 8, 11, 13, 14, 16, 27, 34, 40, 50, 59, 63, 
64, 67], "act": [14, 16, 34, 50], "action": [14, 16, 34, 50], "action_breakdown": 14, "action_list_oper": 37, "action_on_failur": 16, "activ": [14, 23, 34, 47, 53, 59], "actual": [3, 4, 8, 12, 16, 50, 63, 67], "acycl": [8, 9], "ad": [2, 8, 12, 14, 15, 16, 20, 23, 26, 34, 40, 50, 64], "adapt": 26, "add": [4, 8, 11, 12, 14, 15, 16, 23, 26, 28, 29, 30, 31, 34, 35, 36, 39, 40, 41, 42, 48, 53, 54, 55, 59, 66, 67], "add_attach": 42, "add_dagster_env_vari": 23, "add_dynamic_partit": [11, 64], "add_metadata": 8, "add_metadata_two_output": 8, "add_on": [8, 9, 13, 63], "add_output_metadata": [8, 12, 26], "add_to_environ": 16, "adddynamicpartitionsrequest": 67, "addfil": [16, 50], "addit": [7, 11, 12, 14, 16, 18, 19, 20, 23, 26, 39, 40, 46, 50, 60, 63, 64, 67], "addition": [60, 63], "additional_arg": 18, "additional_field": 14, "additional_metr": 14, "additional_reader_opt": 14, "address": [14, 16, 22, 34, 50, 69], "adgroupadreport": 14, "adjust": 14, "adjustsourc": 14, "adl": [17, 23], "adls2": [17, 23], "adls2_client": 17, "adls2_file_manag": 17, "adls2_file_system": 17, "adls2_pickle_io_manag": 17, "adls2_prefix": 17, "adls2_resourc": 17, "adls2filehandl": 17, "adls2pickleiomanag": 17, "adls2resourc": 17, "admin": [14, 34], "administr": [14, 26, 38], "adset": 14, "advanc": [11, 16, 26, 50, 67], "advance_all_cursor": 67, "advance_cursor": 67, "advanced_opt": 14, "advantag": 64, "advertis": [14, 16, 50], "advertiser_id": 14, "advis": [18, 26], "aes256": 16, "aescbcenvelopeencrypt": 14, "affect": [16, 23, 50], "after": [2, 3, 12, 14, 16, 21, 23, 26, 33, 34, 35, 36, 40, 50, 52, 53, 63, 64, 66, 67], "after_cursor": 11, "after_cursor_partit": 67, "after_timestamp": 11, "afterward": 8, "ag": 3, "again": [14, 52, 67], "against": [2, 3, 7, 8, 9, 11, 13, 14, 16, 26, 40, 51, 53, 68], "agent": [24, 59], "aggreg": [14, 16, 50, 67], "ago": [2, 14], "agre": 14, "ahead": [16, 50], "ai": 59, "aim": 17, "airbyte_asset": 14, "airbyte_connect": 14, "airbyte_host": 14, "airbyte_inst": 14, 
"airbyte_password": 14, "airbyte_port": 14, "airbyte_resourc": 14, "airbyte_sync_op": 14, "airbyte_usernam": 14, "airbyteconnect": 14, "airbyteconnectionmetadata": 14, "airbytecontain": 14, "airbytedestin": 14, "airbytedestinationnamespac": 14, "airbytehq": 14, "airbyteio": 14, "airbytemanagedelementreconcil": 14, "airbyteoutput": 14, "airbyteresourc": 14, "airbytesourc": 14, "airbytesyncmod": 14, "airflow_db": 15, "airflow_execution_d": 15, "airflow_hom": 15, "airline_demo": 69, "airtabl": 14, "airtablesourc": 14, "album": 24, "alert": [42, 44, 52, 67, 69], "alert_email_password": 69, "algorithm": [16, 50], "alia": [4, 8, 9, 59, 63, 65], "alias": [8, 9], "align": [8, 18, 19, 20], "aliv": [11, 16, 50], "all": [2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 27, 33, 34, 37, 39, 40, 41, 42, 46, 48, 50, 52, 53, 63, 64, 65, 66, 67, 68, 69], "all_asset": 2, "all_asset_check": 2, "all_dbt_asset": 26, "all_ev": 8, "all_partitions_materi": 67, "alloc": [16, 50], "allow": [1, 2, 3, 4, 8, 9, 11, 12, 13, 14, 16, 21, 23, 26, 33, 38, 40, 42, 45, 46, 50, 51, 52, 59, 60, 62, 63, 64, 65, 66, 69], "allow_host_key_chang": 57, "allow_missing_partit": 12, "allow_nonexistent_upstream_partit": 64, "allow_retri": [26, 63], "allpartitionmap": 64, "allpartitionsmap": 64, "almost": 14, "aloha": 4, "along": [16, 23, 26, 50, 51], "alongsid": 26, "alreadi": [5, 11, 14, 16, 23, 39, 40, 50], "also": [2, 3, 5, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 26, 27, 34, 38, 40, 42, 44, 46, 50, 52, 53, 59, 61, 62, 63, 64, 66, 67, 69], "alter": 6, "altern": [1, 4, 11, 16, 53, 69], "alwai": [2, 8, 9, 13, 14, 16, 34, 40, 45, 60, 67, 68, 69], "am": [4, 64, 67], "amazon": [14, 16, 23], "amazonadssourc": 14, "amazonaw": [14, 16, 23, 40], "amazonec": 16, "amazons3": [14, 23], "amazonsellerpartnersourc": 14, "amazonsqsdestin": 14, "amazonsqssourc": 14, "america": [2, 26, 53, 64, 67], "amount": [14, 16, 23, 50, 59], "amplitud": 14, "amplitudesourc": 14, "amqp": 18, "an": [1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 41, 42, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69], "an_asset": 8, "an_existing_mlflow_run_id": 41, "an_op": 52, "analyt": [14, 34], "ancestor": [2, 8, 9, 13], "ani": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 29, 30, 31, 32, 33, 34, 38, 39, 40, 41, 42, 45, 47, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "annot": [4, 5, 28, 29, 30, 31, 34, 53, 54, 55, 60, 67, 68], "anonym": 63, "anoth": [7, 8, 9, 11, 14, 16, 26, 50, 68], "another_asset": 8, "answer": 14, "ant": [16, 50], "anyth": 14, "apach": [16, 34, 50, 56], "api": [2, 4, 5, 6, 7, 8, 9, 11, 14, 15, 16, 17, 21, 24, 26, 33, 34, 38, 39, 41, 42, 44, 45, 50, 52, 59, 61, 62, 63, 64, 66, 67, 68, 69], "api3": 14, "api_cli": 23, "api_kei": [14, 21, 24, 33, 59], "api_password": 14, "api_secret": [14, 16, 33], "api_serv": 14, "api_stepconfig": 16, "api_token": 14, "api_url": 14, "apicli": 23, "apidoc": 14, "apifi": 14, "apifydatasetsourc": 14, "apigroup": 40, "apikei": 14, "apikeyauthent": 14, "apikeyid": 14, "apikeysecret": 14, "apipassword": 14, "apirefer": 16, "apitoken": 14, "apitokencredenti": 14, "app": [14, 16, 19, 20, 24, 38, 50, 52], "app_id": [14, 16, 50], "app_kei": 24, "appauthexampl": 16, "appear": [4, 8, 12, 16, 23, 38, 50], "append": [14, 21, 23, 63], "appfollow": 14, "appfollowsourc": 14, "appid": 14, "appl": 64, "appli": [1, 2, 5, 8, 9, 11, 13, 14, 15, 16, 20, 23, 26, 27, 33, 40, 50, 59, 63, 64, 66, 67], "applic": [14, 16, 20, 23, 24, 34, 38, 40, 50, 56, 64], "application_argu": 56, "application_id": 14, "application_jar": 56, "application_secret": 14, "applylimitperuniquevalu": [11, 27, 40], "appropri": [4, 9, 14, 19, 20, 33, 40, 53, 54, 55, 68], "appsflyer": 14, "appsflyersourc": 14, "appstor": 14, "appstoresingersourc": 14, "ar": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 
17, 18, 19, 20, 23, 26, 27, 33, 34, 39, 40, 43, 44, 45, 47, 48, 50, 51, 52, 53, 59, 60, 61, 63, 64, 65, 66, 67, 68], "arbitrari": [1, 2, 4, 6, 7, 8, 9, 11, 13, 23, 39, 51, 63, 65, 66], "archiv": [14, 34], "archiveuri": 34, "arctic": 64, "aren": [2, 52], "arg": [3, 7, 8, 13, 15, 16, 19, 26, 34, 39, 40, 61], "argument": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, 26, 32, 34, 40, 45, 50, 51, 61, 63, 64, 65, 66, 67, 68], "arn": [14, 16], "around": [11, 12, 16, 17, 18, 19, 20, 50], "arouting_kei": 44, "arrai": [4, 7, 14, 16], "arrang": [8, 9], "arriv": 14, "articl": [14, 18], "artifact": [3, 8, 11, 16, 26, 50, 59, 63], "artifactid": [16, 50], "artifactori": [16, 50], "as_dagster_typ": [63, 68], "asana": 14, "asanasourc": 14, "ascend": [11, 14, 67], "asia": 34, "asid": [16, 50], "ask": [16, 50], "asktimeout": [16, 50], "assembl": 9, "assert": [4, 8, 11, 16, 66, 68], "assert_failur": 68, "assert_success": 68, "asset": [4, 5, 9, 11, 12, 13, 15, 16, 17, 21, 23, 25, 28, 29, 30, 31, 34, 35, 36, 38, 42, 52, 53, 54, 55, 59, 60, 62, 64, 65, 66, 67, 68], "asset0": 2, "asset0_valu": 2, "asset1": [2, 5, 8, 12, 16, 17, 34, 64, 65, 66, 67], "asset1_job": [64, 67], "asset1_job_schedul": [64, 67], "asset1_result": 2, "asset1_with_foo": 66, "asset2": [2, 5, 8, 12, 16, 17, 34, 64, 65, 66], "asset2_result": 2, "asset2_with_foo": 66, "asset_1": 67, "asset_2": 67, "asset_a": 67, "asset_b": 67, "asset_check": [1, 2, 5], "asset_check_select": 11, "asset_checks_def": 8, "asset_config": 8, "asset_def": 32, "asset_dep": 2, "asset_entri": 11, "asset_ev": 67, "asset_graph": [8, 9], "asset_info": 12, "asset_kei": [1, 2, 5, 8, 11, 12, 16, 17, 26, 33, 34, 52, 63, 65, 67], "asset_key_for_input": 8, "asset_key_for_output": 8, "asset_key_list": 2, "asset_key_prefix": [14, 21, 26, 33, 60], "asset_lay": [8, 9, 13], "asset_materi": [8, 12], "asset_materialization_fn": 67, "asset_materialization_plan": 8, "asset_materializt": 8, "asset_observ": 8, "asset_on": 5, "asset_one_check_on": 5, 
"asset_partit": [11, 63], "asset_partition_kei": 12, "asset_partition_key_for_input": 8, "asset_partition_key_for_output": 8, "asset_partition_key_rang": [8, 12], "asset_partition_key_range_for_input": 8, "asset_partition_key_range_for_output": 8, "asset_partition_keys_for_input": 8, "asset_partition_keys_for_output": 8, "asset_partitions_def": 12, "asset_partitions_def_for_input": 8, "asset_partitions_def_for_output": 8, "asset_partitions_subset": 12, "asset_partitions_time_window": 12, "asset_partitions_time_window_for_input": 8, "asset_partitions_time_window_for_output": [8, 26], "asset_select": [8, 11, 13, 52, 67], "asset_sensor": 67, "asset_spec": 32, "asset_that_uses_writ": [16, 25, 38, 66], "asset_to_invok": 8, "asset_two": 5, "asset_valu": 8, "asset_with_config": 4, "assetcheckevalu": 67, "assetcheckkei": 1, "assetcheckresult": [1, 26], "assetchecksdefinit": [1, 5, 8], "assetchecksever": 1, "assetcheckspec": [1, 2], "assetdefinit": 2, "assetdep": 2, "assetexecutioncontext": [8, 26], "assetin": [2, 5, 8, 12, 28, 29, 30, 31, 34, 35, 36, 53, 54, 55, 60, 64, 65], "assetkei": [1, 2, 5, 8, 11, 12, 14, 15, 16, 17, 26, 33, 34, 60, 63, 65, 67], "assetlay": [8, 9], "assetmateri": [2, 8, 11, 12, 26, 60, 63, 67], "assetobserv": [8, 12, 26, 67], "assetout": [2, 8, 26], "assetpartitionstatu": 11, "assetrecord": 11, "assets_def": [2, 8], "assets_defs_by_kei": [2, 67], "assetscheckspec": 2, "assetsdefinit": [1, 2, 5, 8, 14, 15, 26, 33, 60, 65, 67], "assetselect": [2, 8, 26, 52, 67], "assetsensordefinit": 67, "assetspec": [2, 32], "assetvalueload": 2, "assign": [2, 11, 12, 14, 16, 17, 26, 34, 59], "assist": 24, "associ": [2, 3, 5, 6, 8, 9, 10, 11, 12, 14, 26, 33, 34, 63, 64, 65, 67], "asst": 2, "assum": [2, 11, 14, 20, 26, 40, 63, 64], "assumpt": 16, "async": [12, 63], "athlet": 14, "athlete_id": 14, "atla": 14, "atlant": 64, "atlassian": 14, "attach": [1, 2, 8, 9, 12, 13, 14, 16, 17, 26, 33, 34, 39, 46, 51, 60, 63, 64, 67, 68], "attempt": [2, 7, 8, 11, 14, 16, 46, 49, 50, 
57, 63, 64], "attempt_num": 63, "attit": 23, "attribut": [2, 3, 7, 8, 14, 23, 34, 63, 66, 67], "attributes_to_return": 14, "attribution_window": 14, "audit": [16, 50], "auth": [14, 23, 34, 39, 40], "auth_method": 14, "auth_sourc": 14, "auth_ssh_kei": 14, "auth_token": [14, 26, 58], "auth_typ": 14, "auth_url": 14, "auth_user_password": 14, "authent": [14, 16, 17, 23, 34, 35, 36, 50, 53, 54, 55, 58, 59], "authenticateviagoogleoauth": 14, "authenticateviaharvestoauth": 14, "authenticateviamicrosoft": 14, "authenticateviamicrosoftoauth20": 14, "authenticateviaoauth20": 14, "authenticateviaretentlyoauth": 14, "authenticatewithapitoken": 14, "authenticatewithpersonalaccesstoken": 14, "authenticationmethod": 14, "authenticationviagoogleoauth": 14, "author": [8, 9, 11, 14, 44, 63], "author_year_end": 14, "author_year_start": 14, "auto": [2, 14, 26, 34], "auto_commit_interval_m": 14, "auto_materialize_polici": [2, 26], "auto_materialize_policies_by_kei": 2, "auto_materialize_policies_by_output_nam": 2, "auto_observe_interval_minut": 2, "auto_offset_reset": 14, "autocommit": [16, 53], "autom": [38, 40], "automat": [2, 3, 4, 5, 7, 8, 14, 16, 23, 26, 33, 39, 46, 47, 50, 59, 61, 66], "automaterializepolici": [2, 14, 26], "automaterializerul": 2, "automatic_reconnect": 14, "autosc": 23, "autoscal": 23, "avail": [3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 23, 26, 27, 34, 39, 40, 47, 50, 60, 61, 66, 67, 68], "avoid": [5, 11, 14, 16, 23, 34, 35, 36, 50, 53, 54, 55, 63, 65, 67], "avro": 14, "avroapacheavro": 14, "aw": [14, 23, 40], "aws_access_kei": 14, "aws_access_key_id": [14, 16, 41], "aws_account_id": [14, 16, 40], "aws_attribut": 23, "aws_environ": 14, "aws_key_id": 14, "aws_key_secret": 14, "aws_region": 16, "aws_region_nam": 14, "aws_secret_access_kei": [14, 16, 41], "aws_secret_kei": 14, "aws_session_token": 16, "awsavail": 23, "awscloudtrailsourc": 14, "awsdatalakedestin": 14, "awss3stag": 14, "azblobazureblobstorag": 14, "azur": [14, 23], 
"azure_blob_storage_account_kei": 14, "azure_blob_storage_account_nam": 14, "azure_blob_storage_container_nam": 14, "azure_blob_storage_endpoint_domain_nam": 14, "azure_blob_storage_output_buffer_s": 14, "azure_blob_storage_sas_token": 14, "azure_data_lake_storage_kei": 17, "azureblobcomputelogmanag": 17, "azureblobstorag": 14, "azureblobstoragedestin": 14, "azureblobstoragestag": 14, "azuredatabrick": 23, "azuretablesourc": 14, "b": [2, 8, 11, 15, 59, 63, 64, 67], "b30e7ede77df": 14, "back": [8, 11, 16, 17, 18, 20, 26, 34, 40, 43, 48, 50, 53, 54, 55, 60, 67], "backend": [16, 19, 20, 50], "backendconnectiontimeout": [16, 50], "backfil": [2, 3, 8, 67], "backfill_polici": 2, "backfillpolici": [2, 64], "background": [3, 14, 18], "backlog": [16, 50], "backoff": 63, "backoff_delai": 63, "backpressur": [16, 50], "backward": [16, 50], "bad": 4, "badg": 2, "balthazar": 18, "bamboo": 14, "bamboohr": 14, "bamboohrsourc": 14, "bar": [4, 8, 11, 12, 16, 23, 24, 33, 34, 50, 63, 65, 66, 68], "bare": [4, 7], "base": [2, 4, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 25, 26, 28, 29, 30, 31, 34, 35, 36, 38, 40, 42, 43, 45, 48, 50, 51, 52, 53, 54, 55, 59, 60, 62, 63, 64, 66, 67, 68, 69], "base64": [34, 35, 36, 53, 54, 55], "base_dir": [11, 12, 16, 17, 34, 59, 60], "base_id": 14, "base_path": 12, "base_storag": 11, "base_url": 14, "baselin": 59, "basemodel": 4, "bash": 51, "basi": 40, "basic": [8, 9, 14, 33, 34], "basicprofil": [16, 50], "bat": 8, "batch": [14, 16, 40, 50], "batch_kwarg": 37, "batch_siz": 14, "batching_en": 14, "batching_max_messag": 14, "batching_max_publish_delai": 14, "baz": [8, 12, 33, 63], "bb852df4077e": 59, "bce": 14, "bearer": 14, "bearer_token": 14, "becaus": [2, 7, 8, 11, 16, 17, 23, 26, 50, 67], "becom": [4, 7, 14, 45, 52, 61, 66, 68], "been": [2, 7, 8, 11, 13, 14, 16, 50, 65, 67], "befor": [2, 4, 5, 11, 14, 16, 20, 21, 23, 26, 33, 34, 35, 36, 39, 40, 49, 50, 51, 52, 53, 63, 64, 67], "before_cursor": 11, "before_timestamp": 11, "begin": [14, 16, 50, 
52], "begin_tim": 14, "behalf": [34, 67], "behav": 14, "behavior": [8, 12, 13, 14, 15, 16, 23, 26, 50, 63], "behind": [16, 50, 67], "being": [1, 2, 4, 8, 9, 10, 11, 12, 14, 16, 23, 26, 34, 50, 61, 63, 66, 67, 68], "belong": [2, 8, 9, 10, 14, 16, 23, 34, 50, 67], "below": [4, 16, 23, 26, 34, 38, 40, 50, 53], "bertovi\u0107": 18, "bespok": 65, "best": 15, "beta": 34, "better": [14, 16, 50], "between": [2, 8, 9, 11, 14, 16, 21, 26, 28, 32, 33, 34, 50, 51, 52, 53, 59, 63, 64, 66, 67], "beyond": [14, 16, 50, 63], "big": 14, "big_query_client_buffer_size_mb": 14, "bigcommerc": 14, "bigcommercesourc": 14, "bigger": [14, 16, 50], "bigqueri": 14, "bigquery_io_manag": 34, "bigquery_pandas_io_manag": 35, "bigquery_pyspark_io_manag": 36, "bigquery_resourc": 34, "bigquerydenormalizeddestin": 14, "bigquerydestin": 14, "bigqueryerror": 34, "bigqueryiomanag": [34, 35, 36], "bigquerypandasiomanag": 35, "bigquerypandastypehandl": [34, 35], "bigquerypysparkiomanag": 36, "bigquerypysparktypehandl": 36, "bigqueryresourc": 34, "bigquerysourc": 14, "bigtabl": 34, "billion": 14, "bin": 40, "binari": [16, 34, 50], "binaryio": 11, "bind": [5, 13, 16, 50, 53], "bindaddress": [16, 50], "bing": 14, "bingadssourc": 14, "binlog": 14, "birth": 14, "bit": 2, "bitnami": 40, "blacklist": [16, 50], "blank": [11, 14, 33], "blob": [8, 14, 17, 23, 26], "block": [11, 14, 16, 17, 18, 34, 48, 50, 52], "block_if_queue_ful": 14, "block_siz": 14, "block_size_mb": 14, "blockinterv": [16, 50], "blockmanag": [16, 50], "blockmanagerslavetimeoutm": [16, 50], "blocks_fn": 52, "blocksiz": [16, 50], "blog": [18, 58], "blue": 24, "board": 14, "board_id": 14, "bodi": [2, 7, 8, 9, 12, 13, 14, 38, 39, 52, 61, 62, 63, 66, 69], "book": 14, "bookmark": 14, "bookshelv": 14, "bool": [1, 2, 4, 7, 8, 9, 11, 13, 14, 15, 16, 20, 21, 23, 26, 33, 34, 39, 40, 42, 45, 48, 50, 52, 53, 60, 62, 63, 64, 65, 67, 68, 69], "boolean": [4, 8, 62, 63, 67], "boolmetadatavaluy": 63, "boolsourc": [4, 14, 16, 25, 26, 33, 42, 53, 54, 55, 57], 
"boostrap": 25, "boot": 34, "bootdisksizegb": 34, "bootdisktyp": 34, "bootstrap": [3, 14, 16, 25, 40], "bootstrap_serv": 14, "born": 14, "bot": [14, 52], "both": [4, 5, 8, 11, 12, 14, 15, 16, 17, 23, 24, 25, 26, 37, 38, 40, 50, 53, 54, 55, 66, 67], "boto": 16, "boto3": 16, "botocor": 16, "bound": [2, 5, 14, 16, 50, 64, 67], "boundari": [5, 7, 8, 11, 13, 63], "bq": 34, "bq_create_dataset": 34, "bq_delete_dataset": 34, "bq_op_for_queri": 34, "braintre": 14, "braintreesourc": 14, "branch": [14, 53], "brand": 14, "break": [2, 14, 15, 26, 63, 66], "breakdown": 14, "breakpoint": 69, "brew": 40, "bridg": [16, 50], "broadcast": [16, 50], "broker": [14, 19, 20], "broker_host": 14, "broker_port": 14, "broker_url": 18, "browser": 24, "bu": [16, 50], "bucket": [4, 14, 16, 23, 34, 35, 36, 68], "bucket_bi": 11, "bucket_id": 14, "bucket_nam": 14, "bucket_prefix": [4, 14], "buffer": [14, 16, 50, 51], "buffer_memori": 14, "buffer_s": 14, "bufferediobas": 60, "buffers": [14, 16, 50], "bug": [53, 54, 55], "build": [3, 8, 9, 10, 12, 13, 14, 26, 28, 32, 33, 34, 40, 53, 61, 63, 66, 67], "build_add_request": 64, "build_airbyte_asset": 14, "build_asset_context": 8, "build_bigquery_io_manag": 34, "build_dbt_asset_select": 26, "build_duckdb_io_manag": 28, "build_fivetran_asset": 33, "build_freshness_policy_sensor_context": 67, "build_hook_context": 10, "build_init_logger_context": 61, "build_init_resource_context": 66, "build_input_context": 12, "build_multi_asset_sensor_context": 67, "build_op_context": [8, 16], "build_output_context": 12, "build_reconstructable_job": [8, 13], "build_resourc": 66, "build_run_status_sensor_context": 67, "build_schedule_context": 67, "build_schedule_from_dbt_select": 26, "build_schedule_from_partitioned_job": [64, 67], "build_sensor_context": 67, "build_sling_asset": 32, "build_snowflake_io_manag": 53, "buildkit": 40, "buildup": [16, 50], "built": [4, 8, 11, 14, 16, 26, 32, 45, 50, 60], "builtin": [4, 21, 69], "bulk": 34, "bundl": [14, 16], "busi": [14, 68], 
"busybox": [27, 40], "button": 14, "bypass": [26, 40, 63], "bypass_cach": 26, "bypassmergethreshold": [16, 50], "byte": [11, 14, 16, 50, 63], "bzip2": 14, "c": [2, 40, 64], "c_": 60, "ca": [14, 16], "ca_certif": 14, "ca_certificate_path": 25, "cach": [2, 3, 14, 16, 26, 50, 53, 59, 65], "cachableassetsdefinit": 2, "cache_column_metadata": 53, "cache_duration_in_minut": 59, "cache_typ": 14, "cacheabl": 2, "cacheableassetsdefinit": [2, 5, 26], "cachedexecutoridletimeout": [16, 50], "cadenc": [64, 67], "calcul": [2, 16, 50, 63, 64], "calculate_byt": 63, "call": [2, 3, 4, 7, 8, 11, 12, 13, 14, 16, 17, 22, 23, 26, 27, 34, 37, 40, 46, 47, 50, 53, 60, 61, 62, 64, 65, 67, 68], "call_user_provided_funct": 11, "callabl": [4, 10, 11, 13, 14, 16, 26, 33, 42, 45, 47, 52, 61, 63, 64, 65, 66, 67, 68, 69], "callback": 10, "caller": [16, 34, 50], "callercontext": [16, 50], "camelcas": 40, "campaign": 14, "can": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 42, 43, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "can_attach_to": 23, "can_manag": 23, "can_manage_run": 23, "can_restart": 23, "can_subset": 2, "can_view": 23, "cancel": [11, 14, 23], "cancel_and_wait": 16, "cancel_run": 23, "cancel_sync_on_run_termin": 14, "candid": 2, "canned_acl": 23, "cannot": [2, 4, 7, 8, 14, 16, 17, 23, 34, 35, 36, 39, 50, 59, 60, 63], "canon": 14, "capabl": [20, 40], "capac": [16, 34, 50], "captur": [11, 12, 14, 16, 17, 26, 34, 69], "capture_log": 26, "captured_log_manag": 11, "capturedlogmanag": 11, "card": 42, "care": [2, 8, 9, 13], "cart": 14, "cartsourc": 14, "case": [8, 9, 11, 12, 14, 16, 18, 19, 20, 26, 27, 33, 40, 45, 50, 51, 63, 64, 65, 68, 69], "cassandra": 14, "cassandradestin": 14, "cat": [34, 35, 36, 51, 53, 54, 55], "catalog": [14, 26, 67], "catch": 7, "categori": 14, "caus": [2, 3, 10, 14, 16, 34, 50], "caution": [14, 16, 50], "cdc": 14, "ce": 14, "celeri": 40, 
"celery_docker_executor": 19, "celery_docker_job_executor": 19, "celery_enabled_job": [18, 19, 20], "celery_executor": 18, "celery_k8s_job_executor": 20, "celeryk8srunlaunch": 20, "celeryq": [18, 19, 20], "censu": 14, "census_api_kei": 21, "census_resourc": 21, "census_sync_op": 21, "census_trigger_sync_op": 21, "censusoutput": 21, "censusresourc": 21, "center": 14, "central": [8, 10, 16, 50, 61], "central1": 34, "centralapirout": 14, "cereal": 14, "cereals_connect": 14, "cereals_csv_sourc": 14, "cert": 16, "certain": [1, 2, 8, 11, 13, 14, 16, 33, 45, 50, 51, 53, 60, 63], "certif": [14, 16, 34, 42], "chain": [16, 50], "chang": [2, 3, 11, 14, 16, 18, 34, 40, 52, 53, 62, 64, 67], "changelog": 14, "channel": [3, 14, 26, 42, 52], "channel_filt": 14, "char": 14, "charact": [14, 16, 23, 34, 50, 59], "chargebe": 14, "chargebeesourc": 14, "chargifi": 14, "chargifysourc": 14, "chartmogul": 14, "chartmogulsourc": 14, "chat": [14, 52], "chat_postmessag": [26, 52], "check": [2, 4, 5, 7, 8, 11, 14, 16, 23, 26, 37, 45, 46, 50, 51, 59, 60, 62, 63, 65, 66, 67, 68], "check_cluster_everi": 16, "check_dagster_typ": 68, "check_nam": [1, 24], "check_result": 2, "check_spec": 2, "check_specs_by_output_nam": 2, "checker": 4, "checkerror": [45, 68], "checkpoint": [16, 50], "checkpointinterv": [16, 50], "checks_for_asset": 2, "checksum": [16, 50], "child": [4, 7, 8, 9, 13, 63, 69], "children": 2, "choic": 13, "choos": [14, 23], "chose": 14, "chosen": [14, 16], "chunk": [8, 11, 14, 16, 50], "cid": 14, "circumst": [16, 50], "cl": [26, 68], "claim": 40, "class": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 43, 45, 46, 47, 48, 49, 50, 53, 54, 55, 56, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "class_nam": 11, "classestoregist": [16, 50], "classmethod": [14, 26], "classpath": [16, 34, 50], "clean": [14, 16, 50, 66], "clean_sess": 14, "cleancheckpoint": [16, 50], "cleaned_custom": 26, "cleaner": [16, 50], 
"cleanup": [16, 40, 50, 66], "clear": [3, 16, 50, 67], "cli": [5, 14, 40], "click": [14, 52], "clickhous": 14, "clickhousedestin": 14, "clickhousesourc": 14, "client": [3, 14, 16, 17, 22, 23, 34, 41, 50, 52, 53, 59], "client_certif": 14, "client_credenti": 14, "client_dns_lookup": 14, "client_id": 14, "client_kei": 14, "client_key_password": 14, "client_prefetch_thread": 53, "client_secret": 14, "client_session_keep_al": 53, "clientnam": 14, "clone": [6, 16, 50], "cloneconf": [16, 50], "close": [8, 11, 14, 16, 50, 53, 64], "closecomsourc": 14, "closefileafterwrit": [16, 50], "cloud": [11, 12, 14, 15, 23, 34, 39, 53], "cloudflar": 14, "cloudtrail": 14, "cloudwatch_logg": 16, "cluster": [11, 14, 16, 20, 22, 23, 34, 50], "cluster_config": 34, "cluster_config_dict": 34, "cluster_config_json_path": 34, "cluster_config_yaml_path": 34, "cluster_id": 16, "cluster_log_conf": 23, "cluster_nam": 34, "cluster_permiss": 23, "cluster_url": 14, "clusterconfig": 34, "clusternam": 34, "cmd": 27, "cn": 4, "coars": [16, 50], "coarser": 64, "cockroachdb": 14, "cockroachdbsourc": 14, "code": [2, 3, 5, 7, 8, 10, 11, 14, 16, 23, 26, 34, 38, 44, 50, 51, 59, 60, 61, 62, 63, 66, 67], "code_server_log_level": 3, "code_vers": [2, 6, 63], "codec": [14, 16, 50], "codelocationselector": 67, "codelocationsensor": 52, "coeercibletoassetkei": 2, "coerc": [5, 63], "coercibletoassetkei": 8, "coercibletoassetkeyprefix": [14, 33], "cogroup": [16, 50], "cohort": 14, "col": 63, "col_a": 63, "col_b": 63, "collect": [4, 7, 8, 9, 12, 14, 16, 50, 60, 63, 64], "collis": 34, "color": [24, 26], "colored_console_logg": [8, 13, 61], "column": [1, 2, 11, 14, 28, 29, 30, 31, 34, 35, 36, 45, 46, 53, 54, 55, 60, 63], "columnstor": 14, "com": [14, 16, 18, 19, 20, 23, 24, 26, 34, 38, 40, 42, 50, 52, 58, 59, 63, 69], "coma": 14, "combin": [1, 2, 9, 26, 51, 63], "come": [2, 9, 14, 16, 50, 67], "comma": [14, 16, 25, 50], "command": [3, 14, 16, 18, 19, 20, 26, 27, 34, 35, 36, 40, 50, 51, 53, 54, 55, 56], "comment": 14, 
"commercetool": 14, "commercetoolssourc": 14, "commit": 14, "committ": [16, 50], "committransact": 14, "common": [2, 11, 14, 18, 19, 20, 34, 40], "commonli": 64, "commun": [11, 14, 16, 21, 33, 34, 50, 59, 63, 68], "compani": [14, 40, 52], "companynam": 14, "compar": 59, "compat": [12, 14, 16, 17, 34, 50], "compil": [15, 26], "complet": [8, 9, 11, 12, 13, 14, 16, 17, 20, 21, 23, 26, 33, 34, 35, 36, 40, 45, 50, 59, 63, 67], "completekei": 69, "complex": 65, "complex_repositori": 65, "complex_solid": 69, "complexrepositorydata": 65, "complic": [5, 69], "compon": [4, 8, 9, 11, 12, 14, 16, 17, 21, 26, 33, 34, 48], "compos": [2, 18, 26, 60, 63], "compose_fn": [2, 9, 13], "composit": [6, 9, 14], "compress": [14, 16, 50, 57], "compression_codec": 14, "compression_level": 14, "compression_typ": 14, "comput": [1, 2, 6, 7, 8, 9, 10, 12, 14, 16, 17, 20, 23, 26, 33, 39, 40, 47, 50, 59, 60, 61, 62, 63, 67, 68], "compute_fn": [2, 63], "compute_kind": [1, 2], "compute_log": [16, 17, 34], "compute_log_manag": [11, 16, 17, 34], "compute_logs_data": 11, "computelogmanag": 11, "computemetadata": 34, "con": 2, "concaten": [2, 60], "concept": [8, 20, 26, 40], "concert": [7, 20], "concis": [16, 50], "concret": [9, 11], "concurr": [3, 8, 11, 12, 14, 16, 27, 40, 50], "condit": [11, 14, 67], "conf": [16, 23, 34, 50], "config": [1, 2, 3, 5, 7, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 29, 30, 31, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 65, 66, 67, 68, 69], "config_dict": [4, 32], "config_field": [8, 66], "config_fil": 69, "config_fn": 4, "config_from_fil": 69, "config_from_pkg_resourc": 69, "config_from_yaml_str": 69, "config_map": [8, 9, 13], "config_or_config_fn": 11, "config_schema": [1, 2, 4, 6, 11, 12, 16, 47, 60, 61, 63, 65, 66, 68], "config_sourc": [18, 19, 20], "config_valu": [4, 7, 68], "config_yaml": [11, 18], "configbucket": 34, "configmap": [2, 4, 8, 9, 13, 20, 40], "configmapenvsourc": [20, 40], 
"configschema": [1, 2, 4, 11, 12, 16, 47, 60, 61, 63, 66, 68], "configu": 4, "configur": [1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 39, 40, 42, 43, 47, 48, 50, 51, 52, 53, 54, 55, 56, 59, 60, 61, 64, 65, 66, 67, 69], "configurableclass": 11, "configurableclassdata": [11, 16, 17, 34], "configurabledefinit": [4, 11], "configurableexternaliomanag": 12, "configurableiomanag": 12, "configurableiomanagerfactori": 12, "configurablelocaloutputnotebookiomanag": 60, "configurablepickledobjectadls2iomanag": 17, "configurablepickledobjectgcsiomanag": 34, "configurablepickledobjects3iomanag": 16, "configurableresourc": [16, 25, 38, 66], "conflict": [8, 16, 34, 39, 50, 66], "confluenc": 14, "confluencesourc": 14, "conform": [2, 4, 8, 9, 13, 15, 34, 39, 63], "confus": [11, 63], "conjunct": 4, "conn": [28, 53], "conn_str": 11, "connect": [3, 9, 11, 12, 14, 15, 16, 17, 19, 22, 25, 26, 27, 32, 33, 34, 39, 40, 42, 49, 50, 52, 53, 57, 58, 63, 66], "connect_timeout": [14, 16], "connect_timeout_sec": 25, "connection_data": 14, "connection_directori": 14, "connection_filt": 14, "connection_id": 14, "connection_str": 32, "connection_to_asset_key_fn": 14, "connection_to_auto_materialize_policy_fn": 14, "connection_to_freshness_policy_fn": 14, "connection_to_group_fn": 14, "connection_to_io_manager_key_fn": 14, "connection_typ": 14, "connectionerror": 39, "connectiontimeout": [16, 50], "connector": [14, 26, 33, 42, 53], "connector_filt": 33, "connector_id": 33, "connector_to_asset_key_fn": 33, "connector_to_group_fn": 33, "connector_to_io_manager_key_fn": 33, "consecut": [14, 16, 50], "consequ": [8, 13], "conserv": [8, 13], "consid": [11, 14, 16, 23, 50, 51, 59, 67, 68], "consider": [16, 50], "consist": [2, 12, 26, 63, 64], "consol": [8, 13, 14, 16, 26, 34, 50], "consolid": 11, "consolidatedsqliteeventlogstorag": 11, "constant": 4, "constitu": [2, 8, 9, 46], "constraint": [2, 39, 45, 63], "construct": [2, 7, 8, 9, 11, 
12, 13, 14, 15, 18, 19, 20, 26, 45, 46, 51, 53, 56, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "construct_spark_shell_command": 56, "constructor": [6, 7, 16, 18, 19, 20, 50, 63, 65, 69], "consult": 11, "consum": [2, 11, 14, 16, 50, 59, 63, 67], "consumer_kei": 14, "consumer_secret": 14, "consumpt": [16, 50], "contact": 18, "contain": [2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 42, 45, 46, 50, 52, 53, 54, 55, 59, 60, 63, 64, 65, 66, 67, 68, 69], "container_config": [20, 40], "container_context": 3, "container_imag": 3, "container_kwarg": [19, 27], "container_nam": 16, "content": [2, 5, 8, 14, 16, 23, 26, 50, 53, 65], "context": [2, 3, 4, 5, 6, 7, 9, 10, 11, 13, 16, 17, 20, 23, 24, 26, 27, 34, 39, 40, 41, 42, 45, 47, 50, 51, 52, 53, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "context_": [45, 68], "context_manager_resourc": [8, 10, 12, 66], "contextlib": 66, "contextmanag": 66, "contigu": 8, "continu": [14, 34], "continueonfailur": 34, "contrain": 4, "contrast": [20, 27, 40], "contribut": 11, "control": [11, 14, 16, 23, 50, 63, 64, 65, 67], "conveni": [5, 26, 61, 69], "convent": [14, 15], "convers": [14, 64], "conversion_window_dai": 14, "convert": [4, 26, 46, 53, 54, 55, 67, 69], "cool": [16, 17, 34, 63], "coordin": [16, 50], "copi": [2, 11, 14, 16, 23, 34, 50, 52, 63, 66], "copy_handle_to_local_temp": 11, "copyright": 14, "core": [2, 7, 9, 11, 16, 20, 34, 40, 47, 50, 61, 63, 66, 67], "core_concept": 37, "corpor": 14, "correct": [2, 7, 63], "correctli": [16, 23, 40, 50], "correpond": 2, "correspond": [2, 3, 4, 6, 8, 11, 12, 14, 15, 18, 19, 20, 21, 23, 26, 33, 34, 40, 63, 64, 65, 66, 67], "corrupt": [16, 50], "cost": [16, 50], "costli": 65, "could": [2, 4, 5, 14, 16, 23, 50], "count": [14, 16, 24, 45], "counter": 14, "counter_id": 14, "countri": 14, "country_cod": 14, "coupl": [2, 14], "courier": 14, "couriersourc": 14, "cover": 12, "covid": 14, "cowboytyp": 4, "cpu": [16, 50], "cpu_count": 8, "crash": 
[16, 50], "creat": [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 22, 23, 24, 26, 27, 32, 33, 34, 35, 36, 38, 40, 42, 44, 50, 52, 53, 58, 59, 60, 63, 64, 65, 66, 67, 68, 69], "create_assets_for_normalization_t": 14, "create_dagster_pandas_dataframe_typ": 45, "create_databricks_run_now_op": 23, "create_databricks_submit_run_op": 23, "create_dataset": 59, "create_db_connect": 2, "create_fresh_databas": 53, "create_io_manag": 12, "create_issu": 38, "create_k8s_job_task": 20, "create_registered_model": 41, "create_repository_using_definitions_arg": 5, "create_shell_command": 51, "create_shell_command_op": 51, "create_shell_script_op": 51, "create_spark_op": 56, "create_task": 18, "create_timestamp": 11, "created_aft": 11, "created_at": 14, "created_befor": 11, "creation": [5, 14, 16, 19, 23, 27, 40, 50, 65], "creativ": 14, "credenti": [14, 16, 17, 20, 23, 34, 35, 36, 40, 52], "credential_typ": 14, "credentials_json": 14, "credentials_titl": 14, "criteria": [1, 2, 8, 9, 13, 14, 33, 51, 60, 63], "critic": [3, 61], "crm": 14, "cron": [2, 11, 26, 64, 67], "cron_schedul": [2, 26, 64, 65, 67], "cron_schedule_timezon": 2, "cron_up_to_date_asset": 2, "cross": [8, 11, 13, 23, 34, 64], "crossrealmtrustadminserv": 34, "crossrealmtrustkdc": 34, "crossrealmtrustrealm": 34, "crossrealmtrustsharedpassworduri": 34, "csv": [4, 14, 32], "csv_loader": 12, "csv_loader_kei": 12, "csvcommaseparatedvalu": 14, "csvdestin": 14, "curiou": 11, "curl": [14, 34], "currenc": 14, "current": [2, 3, 8, 10, 11, 12, 14, 16, 20, 21, 23, 26, 32, 33, 34, 40, 42, 50, 52, 62, 63, 64, 66, 67, 69], "current_tim": [8, 13, 64], "current_valu": 7, "curri": 4, "cursor": [3, 11, 14, 53, 67], "cursor_field": 14, "cursor_from_latest_materi": 67, "custom": [2, 4, 8, 11, 12, 14, 16, 20, 23, 26, 34, 35, 36, 39, 40, 45, 50, 53, 63, 65, 68], "custom_group_prefix": 26, "custom_insight": 14, "custom_instance_class_data": 11, "custom_queri": 14, "custom_report": 14, "custom_reports_field": 14, 
"custom_reports_include_default_field": 14, "custom_service_account": 34, "custom_tag": 23, "custom_typ": 63, "customdagsterdbttransl": 26, "customer_id": 14, "customgaqlqueriesentri": 14, "cwd": 51, "cyclic": 66, "d": [2, 3, 14, 18, 33, 51, 53, 64, 67], "d180": 14, "d30": 14, "d7": 14, "d90": 14, "d9971c84d44d47f382a2928c8c161faa": 40, "daemon": [5, 11, 34, 67], "dag": [8, 9, 15, 16, 40, 50], "dag_bag": 15, "dag_path": 15, "dag_run": 15, "dag_run_config": 15, "dagbag": 15, "daggraph": [16, 50], "dagit": [3, 42, 52], "dagit_base_url": [42, 52], "dagredi": 40, "dagrun": 15, "dagster": [1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 27, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "dagster_airbyt": 14, "dagster_airflow": 15, "dagster_attribut": 3, "dagster_auto_materialize_polici": 26, "dagster_aw": [11, 16, 68], "dagster_azur": 17, "dagster_bigquery_panda": [34, 35, 36], "dagster_celeri": [18, 19, 20], "dagster_celery_broker_host": [18, 19, 20], "dagster_celery_dock": 19, "dagster_celery_k8": [18, 20], "dagster_censu": 21, "dagster_conn_id": 15, "dagster_container_context": 3, "dagster_container_imag": 3, "dagster_daemon_log_level": 3, "dagster_dask": 22, "dagster_databrick": 23, "dagster_datadog": 24, "dagster_datahub": 25, "dagster_dbt": 26, "dagster_dbt_transl": 26, "dagster_dock": 27, "dagster_docker_imag": 40, "dagster_docker_image_tag": 40, "dagster_docker_repositori": 40, "dagster_duckdb": [28, 29, 30, 31], "dagster_duckdb_panda": [28, 29], "dagster_duckdb_polar": 30, "dagster_duckdb_pyspark": 31, "dagster_embedded_elt": 32, "dagster_empty_working_directori": 3, "dagster_etl": 32, "dagster_ev": [11, 26, 60, 67], "dagster_event_typ": 11, "dagster_exampl": 69, "dagster_fivetran": 33, "dagster_freshness_polici": 26, "dagster_g": 37, "dagster_gcp": [34, 35, 36], "dagster_gcp_panda": 35, "dagster_gcp_pyspark": 36, "dagster_github": 38, "dagster_graphql": 39, "dagster_grpc_host": 3, "dagster_grpc_port": 3, "dagster_grpc_socket": 3, "dagster_handl": 61, "dagster_hom": [3, 
8, 11, 12, 19, 20, 40, 43, 48, 59], "dagster_imag": 40, "dagster_inject_env_vars_from_inst": 3, "dagster_inst": 67, "dagster_instance_ref": 3, "dagster_k8": [20, 40], "dagster_lazy_load_user_cod": 3, "dagster_location_nam": 3, "dagster_log_level": 3, "dagster_mlflow": 41, "dagster_module_nam": 3, "dagster_msteam": 42, "dagster_mysql": [11, 43], "dagster_package_nam": 3, "dagster_pagerduti": 44, "dagster_panda": [37, 45], "dagster_pandera": 46, "dagster_papertrail": 47, "dagster_pg_password": [20, 40], "dagster_postgr": [11, 48], "dagster_prometheu": 49, "dagster_pyspark": 50, "dagster_python_fil": 3, "dagster_redshift_password": 16, "dagster_run": [8, 11, 42, 52, 63, 66, 67], "dagster_shel": 51, "dagster_slack": [26, 52], "dagster_snowflak": [53, 54, 55], "dagster_snowflake_panda": [53, 54, 55], "dagster_snowflake_pyspark": [53, 54, 55], "dagster_spark": 56, "dagster_ssh": 57, "dagster_stag": 23, "dagster_test": 40, "dagster_twilio": 58, "dagster_typ": [2, 4, 5, 6, 7, 8, 9, 12, 37, 45, 63, 65, 68], "dagster_type_load": [8, 45, 68], "dagster_use_python_environment_entry_point": 3, "dagster_wandb": 59, "dagster_webserv": 3, "dagster_webserver_log_level": 3, "dagster_webserver_port": 3, "dagster_working_directori": 3, "dagsterassetmetadatavalu": 63, "dagstercloudoper": 15, "dagsterconfigmappingfunctionerror": 7, "dagsterdaemonschedul": 67, "dagsterdbtclifatalruntimeerror": 26, "dagsterdbtclihandledruntimeerror": 26, "dagsterdbtclioutputsnotfounderror": 26, "dagsterdbtcliruntimeerror": 26, "dagsterdbtcliunexpectedoutputerror": 26, "dagsterdbterror": 26, "dagsterdbttransl": 26, "dagstererror": 7, "dagsterev": [8, 10, 11, 67], "dagstereventloginvalidforrun": 7, "dagstereventtyp": [8, 11], "dagsterexecutionstepexecutionerror": [7, 11], "dagsterexecutionstepnotfounderror": 7, "dagstergraphqlcli": 39, "dagstergraphqlclienterror": 39, "dagsterinst": [3, 7, 8, 9, 10, 11, 13, 64, 66, 67], "dagsterinstanceoverrid": 11, "dagsterinvalidconfigdefinitionerror": 7, 
"dagsterinvalidconfigerror": [4, 7], "dagsterinvaliddefinitionerror": 7, "dagsterinvalidinvocationerror": 26, "dagsterinvalidsubseterror": 7, "dagsterinvariantviolationerror": [7, 8, 66, 69], "dagsterlogmanag": [8, 11, 51, 61, 66], "dagstermil": [8, 13], "dagstermillerror": 60, "dagstermillexecutioncontext": 60, "dagsteroper": 15, "dagsterpanderadatafram": 46, "dagsterresourcefunctionerror": 7, "dagsterrun": [8, 11, 60, 67], "dagsterrunconflict": 39, "dagsterrunmetadatavalu": 63, "dagsterrunnotfounderror": 7, "dagsterrunreact": 67, "dagsterrunstatu": [11, 39, 67], "dagsterstepoutputnotfounderror": 7, "dagsterstorag": 11, "dagstersubprocesserror": 7, "dagstertyp": [2, 7, 8, 9, 12, 16, 37, 45, 46, 63, 68], "dagstertypecheckdidnotpass": 7, "dagstertypecheckerror": 7, "dagstertypekind": [45, 68], "dagstertypeload": [45, 68], "dagstertypeloadercontext": 68, "dagsterunknownresourceerror": 7, "dagsterunmetexecutorrequirementserror": 7, "dagsterusercodeexecutionerror": [7, 11], "dagsterwebserv": 40, "dai": [14, 59, 64, 67], "daili": [14, 16, 50, 64, 67], "daily_123": 64, "daily_dbt_assets_schedul": 26, "daily_partitioned_config": [64, 67], "dailypartitiondefinit": 26, "dailypartitionsdefinit": [8, 26, 64, 67], "dash": 14, "dashboard": [14, 16, 50, 58, 63], "dashboard_url": 63, "dask_enabled_job": 22, "dask_executor": 22, "data": [1, 2, 4, 6, 8, 9, 11, 12, 14, 16, 17, 20, 23, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 45, 46, 47, 50, 51, 53, 54, 55, 60, 61, 62, 63, 64, 66, 68], "data_queri": 14, "data_sourc": 14, "data_source_typ": 14, "data_to_sync": 14, "data_vers": [2, 63], "databas": [2, 3, 11, 12, 14, 15, 16, 26, 28, 29, 30, 31, 32, 34, 35, 36, 40, 43, 53, 54, 55, 63, 64, 66, 67], "database_schema": 14, "databrick": 14, "databricks_cli": 23, "databricks_host": 23, "databricks_http_path": 14, "databricks_job_configur": 23, "databricks_job_id": 23, "databricks_personal_access_token": 14, "databricks_port": 14, "databricks_pyspark_step_launch": 23, 
"databricks_resource_kei": 23, "databricks_server_hostnam": 14, "databricks_token": 23, "databrickscli": 23, "databricksclientresourc": 23, "databricksdestin": 14, "databrickserror": 23, "datacent": [14, 23], "datadog_cli": 24, "datadog_op": 24, "datadog_resourc": 24, "datadogcli": 24, "datadoghq": 24, "datadogpi": 24, "datadogresourc": 24, "datafram": [1, 2, 28, 29, 30, 31, 34, 35, 36, 37, 45, 46, 50, 53, 54, 55, 60], "dataframe_constraint": 45, "dataframe_load": 45, "dataframeconstraint": 45, "dataframeschema": 46, "datahub_kafka_emitt": 25, "datahub_rest_emitt": 25, "datahubkafkaemitterresourc": 25, "datahubrestemitterresourc": 25, "datalak": 14, "datalakeservicecli": 17, "dataproc_op": 34, "dataproc_resourc": 34, "dataproccli": 34, "dataprocresourc": 34, "dataproven": 8, "dataset": [14, 34, 35, 36, 37, 59, 60, 63], "dataset_id": 14, "dataset_loc": 14, "dataset_nam": 14, "datasetid": 14, "datasourc": 37, "datasource_nam": 37, "datatyp": 14, "datavers": 63, "date": [2, 3, 14, 16, 23, 52, 62, 64, 67], "date_from": 14, "date_ranges_start_d": 14, "date_window_s": 14, "datetim": [8, 11, 13, 64, 67], "day_of_month": [64, 67], "day_of_week": [64, 67], "day_offset": [64, 67], "db": [3, 14, 15, 23, 28], "db2": 14, "db2sourc": 14, "db3": 23, "db_name": [11, 43, 48], "db_password": 12, "db_pool_recycl": 3, "db_statement_timeout": 3, "dbf": 23, "dbt": [1, 2], "dbt_asset": 26, "dbt_build_arg": 26, "dbt_build_invoc": 26, "dbt_cli_invoc": 26, "dbt_cli_resourc": 26, "dbt_cloud": 26, "dbt_cloud_account_id": 26, "dbt_cloud_api_token": 26, "dbt_cloud_asset": 26, "dbt_cloud_auth_token": 26, "dbt_cloud_host": 26, "dbt_cloud_job_id": 26, "dbt_cloud_resourc": 26, "dbt_cloud_run_op": 26, "dbt_cloud_sandbox": 26, "dbt_compile_op": 26, "dbt_docs_generate_op": 26, "dbt_event": 26, "dbt_exclud": 26, "dbt_execut": 26, "dbt_ls_op": 26, "dbt_macro_arg": 26, "dbt_output": 26, "dbt_profiles_dir": 26, "dbt_project": 26, "dbt_project_dir": 26, "dbt_resource_kei": 26, "dbt_resource_prop": 26, 
"dbt_run_invoc": 26, "dbt_run_op": 26, "dbt_seed_op": 26, "dbt_select": 26, "dbt_snapshot_op": 26, "dbt_test_op": 26, "dbt_var": 26, "dbtcliclientresourc": 26, "dbtclieventmessag": 26, "dbtcliinvoc": 26, "dbtclioutput": 26, "dbtcliresourc": 26, "dbtcloudclientresourc": 26, "dbtmanifestassetselect": 26, "dbtoutput": 26, "dbtresourc": 26, "dbtrpcoutput": 26, "dbtypehandl": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "dc_region": 14, "dd": [14, 24], "dd_job": 24, "ddt00": 14, "ddthh": 14, "dead": [16, 50], "deal": 5, "debian": 34, "debug": [8, 11, 14, 21, 23, 26, 34, 40, 61, 69], "debugg": 69, "debugrunpayload": 11, "decid": [16, 50], "decis": 2, "declar": [5, 8, 9, 14, 15, 16, 17, 63], "decor": [1, 2, 4, 8, 9, 10, 11, 12, 13, 24, 26, 45, 51, 60, 61, 62, 63, 64, 65, 66, 67, 68], "decorated_fn": [10, 64], "decreas": [16, 50], "decrement": 24, "dedic": 59, "deduct": 14, "dedup": 14, "dedupl": 14, "deeplink": [42, 52, 69], "def": [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 42, 44, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "default": [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 28, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 48, 49, 50, 51, 52, 53, 54, 55, 57, 59, 60, 63, 64, 65, 66, 67, 68, 69], "default_asset_key_fn": 26, "default_auto_materialize_policy_fn": 26, "default_azure_credenti": 17, "default_freshness_policy_fn": 26, "default_group_from_dbt_resource_prop": 26, "default_load_typ": [28, 34, 53], "default_metadata_from_dbt_resource_prop": 26, "default_provid": 4, "default_statu": [42, 52, 64, 67, 69], "default_tag": 23, "default_valu": [4, 6, 63], "defaultazurecredenti": 17, "defaultcor": [16, 50], "defaultoauth20author": 14, "defaultruncoordin": 11, "defaultrunlaunch": 11, "defaultschedulestatu": [64, 67], "defaultsensorstatu": [42, 52, 67, 69], "defin": [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 
18, 19, 20, 22, 23, 26, 27, 28, 32, 33, 34, 40, 45, 46, 47, 52, 53, 59, 60, 62, 64, 65, 66, 67, 68], "define_asset_job": [2, 5, 64, 67], "define_dagstermill_asset": 60, "define_dagstermill_op": 60, "define_job": 8, "define_my_job": [8, 13], "define_pipelin": 3, "define_repo": 3, "define_spark_config": 56, "definit": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 42, 49, 51, 52, 53, 54, 55, 60, 61, 62, 63, 65, 66, 67, 68], "defint": [5, 65], "deflat": 14, "delai": [14, 16, 50, 63], "deleg": [18, 20, 61], "delet": [3, 11, 14, 16, 34, 35, 36, 40, 49, 50, 59, 67], "delete_dynamic_partit": [11, 64], "delete_local_temp": 11, "delete_messag": 14, "delete_run": 11, "delete_unmentioned_resourc": 14, "deletedynamicpartitionsrequest": 67, "delight": 14, "delightedsourc": 14, "delimit": [14, 25], "delin": [64, 67], "deliv": [14, 23], "delivery_timeout_m": 14, "delta": 14, "demand": 23, "deni": 18, "denibertov": 18, "denorm": 14, "dep": [2, 8, 14, 26, 60], "depen": 15, "depend": [2, 6, 8, 13, 14, 15, 16, 26, 34, 50, 51, 52, 60, 64, 66, 68], "depende": 9, "dependency_kei": 2, "dependency_structur": 9, "dependencydefinit": [8, 9], "deploi": [16, 23, 34, 40, 50], "deploy": [1, 3, 14, 15, 16, 19, 20, 22, 34, 35, 36, 39, 40, 50, 53, 59], "deploy_local_job_packag": 16, "deploy_local_pipeline_packag": 16, "deploy_mod": 56, "deployment_nam": 15, "deploymod": [16, 50], "deprec": [2, 3, 8, 14, 16, 23, 50, 52, 57, 60, 64, 67, 69], "depth": 2, "deqeueu": 11, "dequeu": 11, "dequeue_interval_second": 11, "dequeue_num_work": 11, "dequeue_use_thread": 11, "deriv": [2, 11, 14, 26], "descend": [8, 9, 11, 13, 14], "describ": [2, 4, 8, 9, 11, 12, 13, 14, 23, 26, 32, 39, 40, 62, 63], "descript": [1, 2, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 26, 33, 45, 46, 47, 50, 51, 56, 59, 60, 61, 63, 64, 65, 66, 67, 68], "descriptions_by_kei": 2, "descriptions_by_output_nam": 2, "descriptor": 63, "deseri": 67, "deserialization_strategi": 14, 
"deserialization_typ": 14, "design": 2, "desir": [14, 26, 40], "dest_tabl": 32, "dest_tbl": 32, "destin": [21, 23, 33], "destination_configur": 14, "destination_default": 14, "destination_namespac": 14, "destination_path": 14, "destination_t": [14, 33], "destination_typ": 14, "destruct": [16, 50], "detail": [2, 3, 14, 16, 18, 19, 20, 21, 23, 26, 33, 46, 50, 52, 53, 54, 55, 63, 67], "detect": [14, 16, 50], "determin": [2, 3, 6, 8, 9, 12, 13, 14, 16, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 45, 53, 54, 55, 63, 64, 67, 68], "determinist": [2, 63, 68], "dev": [4, 16, 18, 19, 20, 23, 26, 40], "dev_s3": 4, "develop": [11, 14, 34, 38, 53, 60], "developer_token": 14, "devic": 14, "devstorag": 34, "df": [12, 16, 17, 34, 53, 63], "dfoo": 34, "dict": [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 32, 33, 34, 37, 39, 40, 41, 45, 48, 50, 51, 52, 53, 60, 63, 64, 65, 66, 67, 68, 69], "dictionari": [1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 14, 16, 23, 26, 33, 34, 46, 51, 60, 63, 64, 65, 66, 67, 69], "dictionary_encod": 14, "dictionary_page_size_kb": 14, "did": 69, "differ": [2, 4, 5, 8, 12, 13, 14, 16, 18, 19, 20, 26, 48, 50, 51, 63, 64, 67], "digit": 14, "dimens": [8, 12, 14, 64, 67], "dimension": 64, "dimension_nam": 64, "dimensionpartitionmap": 64, "dir": [11, 16, 26, 34, 50], "direct": [3, 8, 9, 13, 14, 16, 50], "directli": [2, 3, 4, 5, 8, 9, 11, 12, 13, 15, 16, 17, 24, 26, 34, 35, 36, 43, 48, 50, 51, 53, 60, 61, 62, 63, 64, 65, 66, 67, 68], "directori": [3, 8, 11, 12, 14, 15, 16, 17, 18, 23, 26, 34, 50, 51, 53, 59, 60, 65, 69], "dirnam": 6, "disabl": [3, 11, 14, 16, 18, 19, 20, 23, 26, 27, 40, 50, 53, 59], "disable_schedule_on_trigg": [26, 33], "disable_ssl_verif": 25, "disallow": 11, "discard": 2, "discern": 2, "disconnect": 14, "discount": 14, "discov": 14, "discover_field": 14, "discoveri": 14, "discret": [8, 9, 13, 64], "disk": [11, 14, 16, 23, 34, 50, 63], "diskconfig": 34, "dispatch": [8, 10, 39, 59, 61], "displai": [1, 2, 3, 4, 6, 11, 
14, 16, 23, 26, 45, 50, 52, 59, 60, 63, 65, 67, 68], "display_nam": 68, "display_raw_sql": 26, "distcp": 34, "distinguish": [6, 8, 11, 14, 45, 68], "distribut": [16, 17, 20, 22, 24, 34, 40], "divid": [16, 50], "dixa": 14, "dixasourc": 14, "dkr": 40, "dn": 14, "do": [2, 4, 5, 7, 11, 12, 14, 16, 21, 26, 28, 29, 30, 31, 34, 35, 36, 50, 53, 54, 55, 59, 60, 62, 69], "do_it_al": [8, 9], "do_some_transform": 2, "do_some_work": 51, "do_someth": [10, 12], "do_something_on_failur": 10, "do_something_on_success": 10, "do_stuff": [12, 23], "do_stuff_partit": [64, 67], "do_stuff_partitioned_schedul": [64, 67], "doc": [8, 11, 14, 16, 18, 19, 20, 23, 24, 26, 32, 33, 34, 37, 40, 42, 52, 56], "docker": [14, 18, 20, 40], "docker_container_op": 27, "docker_executor": 27, "docker_image_tag": 41, "docker_job": 27, "docker_password": 19, "docker_usernam": 14, "dockerhub": 14, "dockerhubsourc": 14, "dockerrunlaunch": 27, "docs_url": 26, "docstr": 63, "document": [3, 4, 11, 14, 16, 17, 23, 24, 26, 32, 33, 34, 50, 52, 53, 54, 55, 56, 59, 69], "doe": [1, 2, 7, 8, 9, 11, 13, 14, 16, 23, 26, 27, 33, 39, 40, 45, 50, 53, 60, 63, 64, 66, 67, 68], "doesn": [3, 14, 53], "dog": 8, "dogstatsd": 24, "domain": 14, "domain_id": 14, "domain_nam": 14, "domain_region": 14, "domain_url": 14, "don": [1, 14, 67], "done": [8, 11, 43, 46, 48, 68], "dot": [2, 26, 63, 66], "doubl": [2, 16, 50], "double_quot": 14, "down": [3, 8, 9, 13, 14, 16, 18, 23, 24, 39, 50, 59, 66], "download": [3, 11, 14, 17, 53], "downstream": [1, 2, 6, 8, 9, 12, 13, 14, 26, 28, 29, 30, 31, 33, 34, 35, 36, 53, 54, 55, 60, 63, 64, 67], "downstream_asset": 2, "downstream_mappings_by_upstream_dimens": 64, "downstream_partition_keys_by_upstream_partition_kei": 64, "downstream_partitions_def": 64, "downstream_partitions_subset": 64, "downtim": 67, "draw": [20, 40], "drift": 14, "driftsourc": 14, "drive": [14, 34], "driver": [14, 16, 23, 34, 50], "driver_node_type_id": 23, "driverloglevel": 34, "drop": [16, 50, 53], "drop_databas": 53, "dry": 
40, "dspark": 23, "duckdb_io_manag": 28, "duckdb_pandas_io_manag": 29, "duckdb_polars_io_manag": 30, "duckdb_pyspark_io_manag": 31, "duckdbiomanag": [28, 29, 30, 31], "duckdbpandasiomanag": 29, "duckdbpandastypehandl": [28, 29], "duckdbpolarsiomanag": 30, "duckdbpolarstypehandl": 30, "duckdbpysparkiomanag": 31, "duckdbpysparktypehandl": 31, "duckdbresourc": 28, "due": [14, 16, 50], "dump": [1, 2, 8, 9, 13, 16, 26, 33, 50, 51, 60, 63], "dump_profil": [16, 50], "dunderfil": 69, "duplic": [14, 67], "durabl": 14, "durat": [8, 14, 16, 22, 50], "dure": [2, 7, 8, 10, 11, 14, 16, 39, 50, 51, 53, 60, 61, 62, 63, 66, 67, 68], "dv": 14, "dv360sourc": 14, "dynam": [2, 8, 9, 13, 14, 16, 23, 50, 51, 53, 60, 64, 65, 67], "dynamic_partitioned_config": 64, "dynamic_partitions_request": [64, 67], "dynamic_partitions_stor": [8, 13, 64], "dynamicalloc": [16, 50], "dynamicout": 6, "dynamicoutput": [6, 8], "dynamicpartitionsdefinit": [8, 11, 13, 64], "dynamicpartitionsstor": [8, 13, 64], "dynamodb": 14, "dynamodb_endpoint": 14, "dynamodb_region": 14, "dynamodb_table_name_prefix": 14, "dynamodbdestin": 14, "e": [1, 2, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 28, 34, 39, 40, 45, 46, 50, 52, 53, 59, 61, 63, 64, 65, 67, 68], "e2": 14, "e2etestsourc": 14, "each": [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 26, 27, 28, 33, 34, 40, 46, 50, 53, 60, 61, 62, 63, 64, 65, 66, 67], "eager": [2, 11, 26], "earlier": 67, "earliest": [14, 67], "easi": [4, 8, 11, 13, 26, 59], "easier": 16, "easiest": [11, 12], "easili": [12, 14, 47], "east": [14, 16], "east1": 34, "eb": 23, "ebs_volume_count": 23, "ebs_volume_iop": 23, "ebs_volume_s": 23, "ebs_volume_throughput": 23, "ebs_volume_typ": 23, "ebsvolumetyp": 23, "echo": [27, 40, 51, 63], "echo_2": 63, "echo_data": 51, "echo_graph": 51, "echo_op": 51, "ecr": 40, "ecsrunlaunch": 16, "edg": 9, "edit": [2, 8, 9, 13, 14], "effect": [2, 11, 14, 16, 40, 50, 63], "effici": [5, 16, 50, 65], "eg": [3, 14, 16, 26, 50], "egg": [16, 34, 
50], "either": [2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 23, 39, 40, 45, 49, 50, 51, 52, 64, 66, 67, 68, 69], "ek": 40, "elaps": [14, 52, 67], "elasticsearch": 14, "elasticsearchdestin": 14, "elasticsearchsourc": 14, "element": [4, 7, 12, 16, 40, 50, 51], "elimin": 3, "els": [14, 26, 34], "email": [14, 40, 69], "email_body_fn": 69, "email_from": 69, "email_on_job_failur": 69, "email_on_run_failur": 69, "email_password": 69, "email_subject_fn": 69, "email_to": 69, "embed": 69, "emit": [2, 8, 11, 14, 23, 26], "emit_f": 9, "emit_metadata": 63, "empti": [3, 14, 16, 17, 20, 34, 40], "emr_pyspark_step_launch": 16, "emr_stag": 16, "emrclusterst": 16, "emrerror": 16, "emrjobrunn": 16, "emrstepst": 16, "en": [4, 14, 18, 19, 20, 23, 27, 37, 42], "enabl": [3, 8, 9, 11, 13, 14, 16, 18, 19, 20, 23, 26, 27, 34, 50, 53, 62, 63], "enable_auto_commit": 14, "enable_elastic_disk": 23, "enable_encrypt": 23, "enable_experimental_stream": 14, "enable_idempot": 14, "enablecompress": [16, 50], "enablekerbero": 34, "encapsul": [2, 34, 40, 63], "encod": [1, 2, 3, 8, 9, 13, 14, 23, 33, 34, 35, 36, 51, 53, 54, 55, 60, 63, 69], "encount": 23, "encrypt": [14, 23, 34, 53, 54, 55], "encryptedtrustservercertif": 14, "encryptedverifycertif": 14, "encryption_algorithm": 14, "encryption_typ": 23, "encryptionconfig": 34, "end": [4, 8, 9, 14, 18, 26, 38, 41, 59, 63, 64, 67], "end_dat": [14, 64], "end_mlflow_on_run_finish": 41, "end_offset": [2, 8, 64, 67], "end_tim": [11, 14], "endpoint": [14, 16, 23, 50], "endpoint_url": 16, "enforc": [7, 14, 16, 45, 50], "enforce_ord": 45, "engag": 14, "engin": [8, 11, 14, 34, 53], "engine_ev": 8, "engineev": 11, "enough": [1, 3, 16, 50], "enqueu": 11, "enrich": 14, "ensur": [8, 11, 13, 14, 15, 23, 24, 40, 51, 53, 54, 55, 64, 67], "entail": 18, "enter": 14, "enterpris": 38, "entir": [14, 16, 23, 50, 64], "entireti": [16, 50], "entiti": [16, 50, 59], "entri": [2, 3, 4, 11, 12, 14, 16, 26, 28, 29, 30, 31, 34, 35, 36, 50, 53, 54, 55, 60, 63], "entry_data": 63, 
"entry_point": 59, "entrypoint": [8, 27], "enum": [4, 7, 16, 39, 51, 64], "enum_valu": 4, "enumer": [14, 16], "enumvalu": 4, "env": [3, 4, 14, 17, 19, 21, 26, 33, 34, 35, 36, 40, 41, 51, 53, 54, 55, 59], "env_config_map": [20, 40], "env_secret": [20, 40], "env_to_tag": 41, "env_var": [16, 19, 20, 27, 40], "env_vari": 23, "envfrom": [20, 40], "environ": [3, 4, 8, 11, 12, 13, 14, 15, 16, 19, 20, 23, 27, 34, 38, 39, 40, 41, 43, 48, 50, 51, 65, 67, 69], "environemnt": 27, "environment": 23, "environment_var": 67, "envvar": [12, 14, 16, 23, 32, 34, 35, 36, 42, 52, 53, 54, 55], "ephemer": [3, 8, 9, 11, 13, 15, 16, 18, 34, 37, 66, 67], "ephemeral_storag": 16, "equal": [2, 14, 16, 23, 50, 64, 67], "equival": [2, 4, 5, 9, 34, 65], "error": [1, 2, 3, 4, 8, 11, 12, 14, 16, 21, 23, 33, 39, 42, 44, 46, 50, 52, 53, 60, 61, 63, 64, 66, 67, 69], "error_cl": 11, "error_info": 11, "error_object": 39, "error_strings_by_step_kei": 67, "error_toler": 45, "es": 2, "escape_char": 14, "especi": [16, 18, 19, 20, 50], "essenti": [14, 16, 50, 64], "establish": [14, 68], "estim": [16, 50], "etc": [2, 8, 14, 16, 20, 26, 34, 40, 50, 69], "eu": [14, 34], "eur": 14, "evalu": [2, 8, 13, 52, 65, 67], "evaluation_fn": 67, "even": [2, 14, 16, 26, 33, 50, 53, 63, 66, 67], "event": [2, 3, 7, 8, 10, 12, 14, 16, 20, 24, 26, 40, 42, 43, 44, 48, 50, 52, 60, 67, 69], "event_act": 44, "event_list": 8, "event_log": [11, 43, 48], "event_log_entri": 11, "event_log_storag": [11, 43, 48], "event_records_filt": 11, "event_specific_data": [8, 67], "event_storag": 11, "event_storage_data": 11, "event_typ": [8, 11], "event_type_valu": 8, "eventlog": [16, 50], "eventlogentri": [11, 67], "eventlogrecord": [11, 67], "eventlogstorag": 11, "eventqueu": [16, 50], "eventrecordsfilt": 11, "eventu": [11, 63], "eventv2_cr": 44, "everi": [16, 18, 19, 20, 23, 50, 53, 57, 64, 67], "everyth": 3, "evict": [16, 50], "ex": [20, 40], "exact": [14, 16, 45, 50], "exactli": [2, 4, 14, 23], "examin": 69, "exampl": [1, 2, 3, 4, 5, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "example_adls2_op": 17, "example_job": 16, "example_mapping_kei": 6, "example_prometheus_op": 49, "example_redshift_asset": 16, "example_redshift_op": 16, "example_s3_op": 16, "example_secretsmanager_op": 16, "example_secretsmanager_secrets_op": 16, "example_secretsmanager_secrets_op_2": 16, "exampleenum": 4, "exc_info": 7, "exceed": [2, 16, 50], "except": [1, 2, 4, 7, 8, 9, 10, 13, 14, 16, 23, 26, 34, 39, 40, 50, 51, 53, 59, 61, 63], "excess": [16, 50], "exchang": 14, "exchangeratesapi": 14, "exchangeratessourc": 14, "excit": 11, "exclud": [2, 14, 16, 26, 33, 50, 64], "exclude_environment_credenti": 17, "execut": [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 28, 33, 34, 39, 40, 50, 51, 53, 59, 60, 62, 64, 66, 67, 68, 69], "execute_docker_contain": 27, "execute_in_process": [4, 7, 8, 9, 13, 15, 16, 24, 38, 41, 42, 44, 52, 53, 67], "execute_job": 8, "execute_k8s_job": 40, "execute_plan": 18, "execute_queri": [16, 53], "execute_shell_command": 51, "execute_shell_script": 51, "executeinprocessresult": [8, 9, 13], "executejobresult": 8, "execution_d": 15, "execution_data": [11, 68], "execution_fn": 67, "execution_plan": 11, "execution_plan_snapshot_id": 11, "execution_timezon": [26, 67], "executionplan": 11, "executor": [4, 5, 7, 13, 16, 17, 18, 19, 20, 22, 23, 27, 34, 40, 50, 63, 66], "executor_config": 11, "executor_creation_fn": 11, "executor_def": [2, 8, 9, 11, 13, 18, 19, 20, 22, 27, 40], "executor_id": [16, 50], "executorallocationratio": [16, 50], "executordefinit": [2, 4, 5, 8, 9, 11, 13, 18, 19, 20, 22, 27, 40], "executoridletimeout": [16, 50], "executorrequir": 11, "exhaust": 14, "exist": [3, 7, 8, 9, 11, 13, 14, 16, 20, 22, 23, 26, 34, 39, 45, 50, 53, 62, 64, 66, 67, 68], "exit": [3, 11, 16, 26, 34, 50, 51, 66], "expand": 14, 
"expand_issue_changelog": 14, "expect": [1, 2, 8, 11, 14, 16, 18, 19, 20, 26, 33, 34, 39, 45, 50, 51, 60, 63, 64], "expectationresult": [8, 60, 63], "expens": [11, 14, 16, 50], "expensive_job": 65, "expensive_schedul": 65, "experi": [14, 40, 41, 59], "experienc": 67, "experiment": [2, 14, 15, 50, 60, 63, 66, 67, 68], "experiment_nam": 41, "experimentalwarn": 69, "expir": 14, "explicit": [64, 68], "explicitli": [5, 8, 9, 11, 12, 13, 14, 63, 64, 68, 69], "explictli": 5, "explor": [14, 23, 60], "exponenti": [14, 63], "export": [3, 14, 23, 40, 67], "expos": [4, 14, 18, 19, 20, 21, 26, 33, 40, 61], "express": [14, 16, 45, 50, 63], "ext": 11, "ext_id": 14, "extend": [9, 12, 16, 20, 40, 50, 64, 67], "extens": 11, "extern": [2, 11, 16, 34, 40, 42, 50, 52, 59, 66, 67, 68, 69], "external_job_origin": 11, "external_version_fn": 68, "externaliomanag": 12, "externaltablevias3": 14, "extra": [4, 16, 23, 25, 26, 41, 50], "extra_head": 25, "extra_tag": 41, "extraclasspath": [16, 50], "extract": [14, 34, 46], "extrajavaopt": [16, 23, 50], "extralibrarypath": [16, 50], "extralisten": [16, 50], "extras_requir": 11, "extrem": 14, "f": [3, 4, 15, 16, 25, 34, 38, 42, 51, 52, 65, 66, 69], "face": [16, 50], "facebook": 14, "facebookmarketingsourc": 14, "facebookpagessourc": 14, "facil": 68, "facilit": 40, "facility_id": 14, "factori": [13, 23, 32, 51, 53], "fail": [1, 2, 3, 7, 10, 11, 14, 16, 20, 21, 23, 26, 33, 34, 39, 40, 42, 45, 46, 50, 52, 60, 63, 66, 67, 68, 69], "fail_pod_on_run_failur": [20, 40], "failed_run_id": 8, "failur": [6, 8, 10, 11, 13, 14, 16, 23, 26, 39, 42, 50, 51, 52, 60, 63, 67, 69], "failure_ev": [42, 52, 67, 69], "failure_hook": 10, "failure_sampl": 46, "failure_typ": 39, "fair": [16, 50], "fake": 14, "fake_redshift_resourc": 16, "fakeadls2resourc": 17, "fakepassword": 14, "faker": 14, "fakeredshiftclientresourc": 16, "fakersourc": 14, "fall": [20, 26, 34, 40, 64, 67], "fallback": [11, 52], "fals": [2, 4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 21, 23, 25, 26, 33, 34, 39, 
40, 42, 45, 50, 52, 53, 54, 55, 57, 60, 62, 63, 64, 67, 68, 69], "fan": [9, 23], "fan_in_index": 9, "far": 14, "fast": [16, 18, 19, 20, 50], "faster": [14, 16, 50], "fatal": [26, 34, 61], "fauna": 14, "faunasourc": 14, "favor": [26, 52, 62, 67, 69], "fd": 11, "fe": 14, "feasibl": 14, "featur": [11, 12, 14, 16, 18, 23, 34, 50], "feature_nam": 60, "feedback": 40, "femal": 24, "fetch": [8, 11, 13, 14, 16, 23, 33, 50, 53, 64, 67], "fetch_files_from_slack": 2, "fetch_result": [16, 53], "fetch_secret": 16, "fetch_thumbnail_imag": 14, "fetchfailur": [16, 50], "fetchtimeout": [16, 50], "few": [5, 14, 16, 50], "fewer": [14, 16, 50], "fh_1": 11, "fh_2": 11, "fi": 34, "field": [2, 4, 6, 7, 11, 14, 15, 16, 20, 23, 26, 40, 43, 48, 50, 51, 59, 63, 65, 67], "field_alias": 4, "field_util": 4, "fifo": 14, "file": [2, 3, 4, 6, 8, 12, 14, 15, 18, 20, 26, 32, 35, 36, 40, 43, 48, 50, 51, 53, 57, 59, 60, 63, 65, 69], "file_handl": 11, "file_manag": [11, 68], "file_nam": 65, "file_name_pattern": 14, "file_obj": 11, "file_pattern": 14, "file_relative_path": [6, 51, 69], "file_result": 6, "file_system": 17, "file_typ": 14, "filehandl": [11, 68], "filemanag": [11, 16, 17, 34], "filenam": [6, 16], "filenotfounderror": 69, "fileoutputcommitt": [16, 50], "filepath": [12, 16, 17, 34, 63], "files_in_directori": 6, "files_pipelin": 11, "filesecuresourc": 14, "filesourc": 14, "filesystem": [2, 8, 9, 11, 12, 13, 16, 17, 34, 50, 63], "filesystemiomanag": 12, "filetyp": 14, "fileuri": 34, "fileystem": 16, "fill": [8, 14], "filter": [3, 11, 14, 16, 33, 45, 50, 59, 68], "filter1": [16, 50], "filtersalesforceobjectsentri": 14, "final": [12, 14, 16, 17, 26, 34, 35, 36, 50, 52, 53], "final_foobar_st": [14, 33], "financ": 14, "find": [11, 14, 15, 16, 18, 26, 38, 40, 44, 50, 67], "fine": [38, 65], "finer": 64, "finish": [11, 16, 23, 41, 50, 52], "fire": [2, 52, 64, 67], "firebolt": 14, "fireboltdestin": 14, "fireboltsourc": 14, "firestor": 14, "firestoredestin": 14, "firewal": [16, 50], "first": [2, 3, 7, 
8, 14, 16, 23, 24, 25, 38, 40, 44, 45, 46, 50, 51, 52, 53, 54, 55, 59, 63, 64, 65, 67, 68], "first_asset": 8, "first_on_demand": 23, "first_op": [27, 40], "fit": [16, 50], "fivetran_api_kei": 33, "fivetran_api_secret": 33, "fivetran_asset": 33, "fivetran_inst": 33, "fivetran_resourc": 33, "fivetran_sync_op": 33, "fivetranconnectormetadata": 33, "fivetranoutput": 33, "fivetranresourc": 33, "fix": [3, 14, 16, 50], "fix_m": 14, "fixed_server_id": 3, "fixtur": 40, "flag": [2, 3, 11, 14, 18, 26, 34, 40, 45, 53, 67], "flake": 63, "flakey_oper": 63, "flat_asset_kei": 63, "flatten": 14, "flavor": 34, "flexibl": [26, 63], "flexport": 14, "flexportsourc": 14, "float": [1, 2, 4, 6, 7, 11, 14, 16, 20, 21, 23, 25, 26, 33, 34, 35, 36, 42, 45, 50, 52, 63, 67, 68], "floatmetadatavalu": 63, "flow": [8, 9, 14, 16, 45, 63, 68], "flower": [18, 40], "flush": [14, 16, 50], "fmt": [64, 67], "fn": 66, "folder": [3, 14, 26], "folder_path": 14, "follow": [1, 2, 4, 6, 8, 9, 11, 12, 14, 16, 17, 18, 19, 20, 22, 27, 34, 38, 40, 48, 50, 59, 63, 64, 66, 67], "foo": [4, 8, 10, 11, 12, 13, 16, 23, 24, 26, 33, 34, 42, 50, 52, 63, 65, 66, 68], "foo_and_downstream_select": 26, "foo_job": [8, 13], "foo_job_arg": 13, "foo_job_kwarg": 13, "foo_resourc": 66, "foo_select": 26, "foobar": [4, 14, 21, 33], "footprint": [14, 16, 50], "forc": [16, 23, 50], "force_full_sync": 21, "fork": [16, 50, 69], "forked_pdb": [8, 69], "forkedpdb": [8, 69], "form": [9, 14, 16, 20, 23, 27, 40, 46, 49, 50, 65], "form_id": 14, "format": [2, 4, 8, 9, 10, 13, 14, 16, 26, 34, 39, 42, 45, 50, 52, 53, 59, 64, 65, 67, 68], "format_typ": 14, "forward": [7, 14, 16, 19, 40, 50], "forward_log": 14, "found": [7, 8, 11, 14, 16, 18, 23, 26, 38, 39, 40, 50, 58, 62, 63], "foundat": 63, "four": [16, 18, 50], "fqn": 26, "fraction": [16, 50], "fragment": [8, 11], "framework": [1, 2, 7, 8, 9, 11, 32, 33, 34, 39, 51, 60, 63], "free": [14, 16, 50], "freeli": 7, "frequenc": [14, 16, 50], "frequent": [14, 16, 18, 19, 20, 23, 50], "fresh": [2, 3, 14, 
26, 52, 67], "fresh_asset": 2, "freshcal": 14, "freshcallersourc": 14, "freshdesk": 14, "freshdesksourc": 14, "freshness_polici": [2, 14, 26, 67], "freshness_policies_by_kei": 2, "freshness_policies_by_output_nam": 2, "freshness_policy_sensor": 67, "freshness_policy_sensor_fn": 67, "freshness_policy_sensor_to_invok": 67, "freshnesspolici": [2, 14, 26, 52, 67], "freshnesspolicysensorcontext": [52, 67], "freshnesspolicysensordefinit": 67, "freshsal": 14, "freshsalessourc": 14, "freshservic": 14, "freshservicesourc": 14, "friend": 11, "from": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69], "from_asset_kei": 67, "from_def": 66, "from_dynamic_map": 9, "from_failur": 8, "from_graph": 2, "from_name_type_dict": 63, "from_op": 2, "from_panda": 53, "from_val": 66, "front": [16, 50], "frozenset": 56, "fruit": 64, "fs": 17, "fs_io_manag": 12, "fsspec": 12, "full": [14, 16, 18, 21, 23, 26, 27, 32, 34, 50, 59, 63, 64], "full_control": 34, "full_job": [27, 40], "full_refresh": [14, 26, 32], "full_refresh_append": 14, "full_refresh_overwrit": 14, "fulli": [11, 16, 50], "fulltrac": 40, "function": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 21, 23, 26, 27, 33, 34, 40, 42, 45, 46, 47, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "further": [16, 23, 50, 67], "futur": [2, 11, 26, 63, 65, 66, 67], "g": [1, 2, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 28, 34, 39, 40, 45, 50, 52, 53, 59, 61, 63, 64, 65, 67, 68], "ga4": 14, "gain": 11, "garbag": [12, 16, 50], "gatewai": [14, 49], "gather": [6, 8, 13], "gaug": 24, "gave": 5, "gb": 34, "gc": [14, 16, 23, 35, 36, 50], "gceclusterconfig": 34, "gcepdkmskeynam": 34, "gcloud": 40, "gcp": [14, 40], "gcp_credenti": [34, 35, 36], "gcp_project": [34, 35, 36], "gcs_bucket": 34, "gcs_bucket_nam": 14, "gcs_bucket_path": 14, 
"gcs_bucket_region": 14, "gcs_file_manag": 34, "gcs_kei": 34, "gcs_pickle_io_manag": 34, "gcs_prefix": 34, "gcs_resourc": 34, "gcscomputelogmanag": 34, "gcsdestin": 14, "gcsfilehandl": 34, "gcsfilemanagerresourc": 34, "gcsgooglecloudstorag": 14, "gcspickleiomanag": 34, "gcsresourc": 34, "gcsstage": 14, "ge_data_context": 37, "ge_validation_op_factori": 37, "gen": 17, "gen2": 17, "gender": 24, "gener": [2, 3, 7, 8, 9, 11, 12, 15, 16, 17, 20, 21, 23, 26, 33, 34, 37, 39, 40, 45, 46, 50, 51, 53, 63, 64, 67, 68], "generate_materi": 26, "get": [3, 4, 5, 8, 11, 12, 14, 15, 16, 18, 20, 23, 24, 26, 34, 39, 40, 50, 53, 60, 65, 67, 69], "get_all_job": 65, "get_all_schedul": 65, "get_all_sensor": 65, "get_artifact": 26, "get_asset_identifi": 12, "get_asset_kei": [11, 26], "get_asset_key_for_model": 26, "get_asset_key_for_sourc": 26, "get_asset_keys_by_output_name_for_sourc": 26, "get_asset_proven": 8, "get_asset_record": 11, "get_asset_value_load": [2, 5, 65], "get_assets_defs_by_kei": 65, "get_auto_materialize_polici": 26, "get_batch": 37, "get_client": [16, 26, 34, 38, 42, 52], "get_connect": [28, 53], "get_context": 60, "get_cron_schedul": 64, "get_cursor_partit": 67, "get_dagster_ev": 11, "get_dagster_logg": 69, "get_dependencies_and_map": 9, "get_descript": 26, "get_downstream_partition_kei": 67, "get_downstream_partitions_for_partit": 64, "get_dynamic_partit": 11, "get_event_record": 11, "get_freshness_polici": 26, "get_group_nam": 26, "get_identifi": 12, "get_job": 65, "get_job_def": 5, "get_job_failure_ev": 67, "get_job_nam": 65, "get_job_success_ev": 67, "get_latest_materialization_code_vers": 11, "get_latest_materialization_ev": 11, "get_mapping_kei": 8, "get_metadata": [12, 26], "get_node_depend": 9, "get_on": 53, "get_op_vers": 62, "get_partition_kei": 64, "get_partition_map": 2, "get_query_statu": 53, "get_repo_id": 38, "get_resource_vers": 62, "get_run": 23, "get_run_by_id": 11, "get_run_record": 11, "get_run_record_by_id": 11, "get_run_statu": 39, "get_schedul": 
65, "get_schedule_def": [5, 65], "get_schedule_nam": 65, "get_secret_valu": 16, "get_sensor": 65, "get_sensor_def": [5, 65], "get_sensor_nam": 65, "get_source_assets_by_kei": 65, "get_status_by_partit": 11, "get_step_failure_ev": 67, "get_system_temp_directori": [16, 17, 34], "get_tag": 8, "get_trailing_unconsumed_ev": 67, "get_upstream_mapped_partitions_result_for_partit": 64, "get_upstream_partitions_for_partit": 64, "get_v2_aggs_grouped_locale_us_market_stocks__d": 14, "getdbt": 26, "getenv": [16, 18, 19, 20, 38, 42, 52, 69], "giant": [16, 50], "gib": [16, 23], "github": [14, 59], "github_app_id": 38, "github_app_private_rsa_kei": 38, "github_hostnam": 38, "github_installation_id": 38, "github_job": 38, "github_op": 38, "github_private_kei": 38, "github_resourc": 38, "githubresourc": 38, "githubsourc": 14, "gitlab": 14, "gitlabsourc": 14, "give": [8, 16, 17, 23, 39, 49, 50, 53], "given": [2, 8, 11, 12, 13, 14, 15, 16, 21, 23, 26, 33, 39, 42, 45, 46, 50, 52, 62, 63, 64, 65, 66, 67, 68], "gke": 40, "glassfrog": 14, "glassfrogsourc": 14, "glob": [16, 50, 69], "global": [11, 14, 26, 27, 34, 40, 66], "global_config_flag": 26, "globstar": 14, "gm": 25, "gmail": 69, "go": [2, 14, 16, 26, 50, 52, 63], "gocardless": 14, "gocardless_environ": 14, "gocardless_vers": 14, "gocardlesssourc": 14, "goe": [16, 50], "good": [4, 14, 16, 38, 45, 50], "goodby": [27, 40], "googl": [14, 34], "google_application_credenti": [34, 35, 36], "google_auth_credenti": [34, 35, 36], "googleadssourc": 14, "googleanalyticsdataapisourc": 14, "googleanalyticsv4sourc": 14, "googleapi": 34, "googlecloudstoragestag": 14, "googlecredenti": 14, "googledirectorysourc": 14, "googlesearchconsolesourc": 14, "googlesheetsdestin": 14, "googlesheetssourc": 14, "googleworkspaceadminreportssourc": 14, "govern": 14, "gp3": 23, "gql": 39, "grab": 11, "gracefulli": [16, 50], "grain": [16, 50, 65], "grandchild": 63, "grandchildren": 2, "grandpar": 2, "grant": [14, 34, 38], "granular": 14, "graph": [2, 4, 6, 7, 13, 
14, 16, 33, 50, 51, 53], "graph_a": 9, "graph_asset": 2, "graph_def": [2, 8, 9, 13], "graph_input_descript": 9, "graph_input_nam": 9, "graph_multi_asset": 2, "graph_output_descript": 9, "graph_output_nam": 9, "graphdefinit": [2, 4, 7, 8, 9, 13, 42, 52, 61, 63, 67, 69], "graphin": 9, "graphout": 9, "graphql": [11, 15, 40, 42, 43, 48, 52, 67, 69], "graphx": [16, 50], "great_expect": 37, "greater": [11, 14, 16, 23, 50], "greatexpect": 37, "greenhous": 14, "greenhousesourc": 14, "greeting_op": 4, "greetingconfig": 4, "group": [2, 5, 14, 16, 26, 33, 34, 40, 60, 65], "group_from_dbt_resource_props_fallback_to_directori": 26, "group_id": 14, "group_nam": [2, 14, 33, 60], "group_names_by_kei": 2, "group_names_by_output_nam": 2, "group_str": 2, "groupid": [16, 50], "grow": [16, 50], "grpc": 11, "grpc_host": 3, "grpc_port": 3, "grpc_socket": 3, "gs": 34, "gsc": 14, "gserviceaccount": 34, "guarante": [11, 16, 23, 50], "guess": 14, "guest": [18, 19, 20], "guid": [14, 15, 16, 22, 25, 28, 29, 30, 31, 34, 35, 36, 38, 40, 45, 46, 50, 53, 54, 55, 60], "gutenberg": 14, "gutendex": 14, "gutendexsourc": 14, "gz": 34, "gzip": 14, "h": [3, 14, 34, 40], "ha": [1, 2, 4, 7, 8, 11, 12, 13, 14, 16, 17, 23, 26, 28, 34, 39, 42, 45, 50, 53, 62, 63, 64, 65, 66, 67, 68, 69], "had": [14, 16, 34, 35, 36, 50, 53], "hadoop": [16, 34, 50], "hadoopjob": 34, "halt": 2, "hand": [14, 16, 50, 52], "handi": 47, "handl": [2, 12, 16, 33, 50, 51, 60, 61, 63, 64], "handle_output": [12, 62], "handled_output": [8, 12], "handler": [16, 28, 29, 30, 31, 34, 47, 53, 61], "hang": 53, "happen": [7, 16, 50, 59, 67], "happi": 11, "hard": [11, 14, 16, 34, 50, 61], "hardcod": [12, 66], "hardcoded_io_manag": 12, "hardcoded_resourc": 66, "harvest": 14, "harvestsourc": 14, "has_asset_checks_def": 8, "has_asset_kei": [11, 12], "has_asset_partit": 12, "has_assets_def": 8, "has_dynamic_partit": 11, "has_error": 63, "has_input_nam": 12, "has_job": 65, "has_output": 62, "has_partition_kei": [8, 12], "has_repository_load_data": 11, 
"has_schedul": 65, "has_schedule_def": 65, "has_sensor": 65, "has_sensor_def": 65, "has_specified_executor": [8, 13], "has_specified_logg": [8, 13], "has_tag": 8, "has_unique_nam": 68, "hash": [14, 62], "hash_cod": 14, "have": [1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 16, 23, 26, 27, 33, 34, 40, 45, 47, 50, 52, 53, 54, 55, 59, 62, 63, 64, 65, 66, 67, 68, 69], "haw": 4, "hcf": 34, "hdf": [16, 34, 50], "hdfs_user_guid": 34, "header": 39, "heap": [16, 50], "heartbeat": [16, 50], "heartbeat_timeout": 3, "heartbeatinterv": [16, 50], "hei": 52, "hello": [4, 8, 27, 40, 42, 51, 63, 66], "hello_op": [4, 69], "hello_world": [8, 51, 61, 63], "hello_world_with_default": 4, "hellobaton": 14, "hellobatonsourc": 14, "helloconfig": 4, "help": [2, 3, 11, 12, 14, 16, 18, 23, 26, 32, 50, 65, 66], "helper": [12, 66], "here": [5, 8, 9, 11, 13, 14, 16, 18, 19, 20, 23, 24, 26, 32, 34, 38, 40, 44, 50, 52, 63], "heterogen": 5, "heurist": 15, "hh": 14, "hierarch": 63, "high": [14, 16, 50], "higher": [8, 16, 50], "highest": 14, "highlight": 63, "highlycompressedmapstatu": [16, 50], "hint": [12, 68], "hire": 14, "histogram": 24, "histor": [3, 7, 14], "histori": [3, 11, 14, 16, 50, 64], "hit": [14, 16, 50], "hive": 34, "hivejob": 34, "hmac": 14, "hmac_key_access_id": 14, "hmac_key_secret": 14, "hmackei": 14, "hoc": 15, "hold": [4, 14, 26, 63], "home": [19, 26, 34], "honor": 53, "honua": 4, "hood": 4, "hook": [2, 8, 9, 11, 13, 41, 42, 52, 63], "hook_def": [8, 9, 10, 13, 42, 52, 63], "hook_fn": 10, "hook_to_invok": 10, "hook_url": 42, "hookcontext": [10, 42, 52], "hookdefinit": [9, 10, 41, 42, 52], "hope": [18, 19, 20], "host": [3, 14, 16, 23, 24, 26, 32, 39, 50, 57, 59], "host1": 14, "host2": 14, "hostnam": [3, 11, 14, 16, 18, 26, 34, 38, 39, 43, 48, 50, 69], "hostnameincertif": 14, "hour": [2, 14, 34, 59, 64, 67], "hour_of_dai": [64, 67], "hour_offset": [64, 67], "hourli": [16, 50, 64, 67], "hourly_partitioned_config": [64, 67], "hourlypartitionsdefinit": 64, "hous": [16, 50], "how": [1, 2, 4, 
8, 9, 11, 12, 13, 14, 16, 18, 23, 24, 26, 28, 33, 34, 40, 42, 49, 50, 52, 53, 59, 64, 67], "howev": [4, 16, 39, 50, 51, 65], "hr": 14, "html": [14, 16, 18, 19, 20, 23, 27, 34, 37, 50, 56], "http": [2, 8, 14, 16, 18, 19, 20, 23, 24, 26, 27, 32, 34, 37, 38, 39, 40, 41, 42, 49, 50, 52, 53, 56, 58, 59, 63, 64, 67, 69], "http_proxi": 42, "https_proxi": 42, "httpspublicweb": 14, "hub": 14, "hubplann": 14, "hubplannersourc": 14, "hubspot": 14, "hubspotsourc": 14, "human": [2, 4, 8, 9, 12, 16, 47, 51, 61, 63, 65, 66, 67], "hunter": 14, "hunter42": 32, "hydrat": [11, 16], "hyperparamet": 59, "hyphen": 34, "i": [2, 4, 8, 12, 14, 16, 18, 23, 26, 28, 29, 30, 31, 35, 36, 46, 50, 54, 55, 61, 63, 65, 69], "iam": [14, 34], "iamrol": 14, "iamus": 14, "iana": [2, 14, 26, 64, 67], "id": [3, 8, 10, 11, 12, 14, 15, 16, 17, 21, 23, 26, 32, 33, 34, 38, 39, 40, 41, 42, 50, 52, 59, 61, 63, 66, 67, 69], "idea": [16, 38, 50], "idefinitionconfigschema": 63, "idempot": [11, 14, 18, 19, 20, 23], "idempotency_token": 23, "ident": [14, 63, 64, 66, 68], "identif": 14, "identifi": [1, 2, 6, 8, 9, 12, 13, 14, 23, 45, 59, 63, 64, 66, 67, 68], "identitypartitionmap": 64, "idl": [14, 16, 50], "ie": [2, 4, 6, 8, 15, 24, 60], "ietf": 34, "ifnotpres": 40, "ignor": [4, 8, 11, 12, 13, 14, 15, 16, 23, 34, 50, 67], "ignore_handled_error": 26, "ignore_weekend": 14, "ijob": 11, "illeg": [16, 50], "imag": [3, 19, 20, 27, 34, 40], "image_nam": [19, 20], "image_pull_polici": [20, 40], "image_pull_secret": [20, 40], "image_vers": 34, "imagepullpolici": 40, "imagepullsecret": 40, "imageuri": 34, "imagevers": 34, "imagin": 2, "immedi": [11, 16, 50, 62, 63], "immun": [16, 50], "immut": [8, 9], "impact": [14, 16, 50], "imperi": 14, "implement": [8, 9, 11, 12, 13, 14, 16, 17, 18, 20, 21, 26, 27, 33, 34, 40, 45, 50, 62, 63, 64, 67], "implementor": 11, "import": [1, 2, 3, 4, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 40, 41, 42, 49, 50, 51, 52, 53, 54, 55, 59, 
60, 61, 63, 64, 66, 67, 69], "import_df_to_bq": 34, "import_file_to_bq": 34, "import_gcs_paths_to_bq": 34, "imprecis": [16, 50], "improv": [11, 16, 50, 53, 59], "in1": 13, "in_asset_kei": 2, "in_process": [8, 20], "in_process_executor": 8, "inaccuraci": 14, "inbound": [16, 50], "includ": [1, 2, 3, 7, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 23, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 39, 40, 42, 44, 46, 50, 52, 53, 54, 55, 63, 64, 67, 68, 69], "include_checksum": 14, "include_delet": 14, "include_deleted_object": 14, "include_exampl": 15, "include_self": 2, "include_sidecar": 16, "include_sourc": 2, "inclus": 64, "incom": [16, 42, 50], "incompat": [7, 63], "incorpor": 2, "incorrect": 34, "increas": [14, 16, 50, 53], "increment": [14, 16, 24, 32, 50], "incremental_append": 14, "incremental_append_dedup": 14, "inculd": 9, "incur": 11, "indefinit": 53, "independ": [34, 66], "index": [3, 9, 11, 16, 50, 63, 64, 65, 67], "indian": 64, "indic": [2, 3, 4, 7, 8, 11, 12, 14, 23, 26, 34, 39, 45, 53, 63, 67], "individu": [2, 5, 6, 8, 14], "ineffici": 14, "infer": [2, 9, 12, 14, 26, 34, 39, 63], "infer_column_typ": 63, "infer_datatyp": 14, "infer_missing_t": 33, "infer_timestamp": 14, "infinit": [11, 16, 50, 53], "info": [3, 8, 11, 12, 14, 16, 18, 26, 34, 38, 50, 51, 61, 63, 66, 69], "inform": [1, 2, 3, 8, 9, 11, 12, 13, 14, 15, 16, 19, 21, 23, 26, 27, 33, 34, 39, 42, 50, 52, 53, 60, 62, 63, 67, 69], "infrastructur": 15, "ingest": [14, 62], "ingest_start": 14, "inherit": [5, 7, 8, 11, 13, 29, 30, 31, 45, 61, 63, 68, 69], "init": [23, 45, 66, 68], "init_context": [11, 12, 16, 47, 61, 66], "init_script": 23, "initexecutorcontext": 11, "initi": [2, 4, 7, 8, 11, 12, 14, 16, 17, 22, 23, 26, 33, 41, 50, 60, 61, 66, 67, 69], "initial_waiting_second": 14, "initialexecutor": [16, 50], "initializationact": 34, "initialr": [16, 50], "initloggercontext": [16, 47, 61], "initresourcecontext": [12, 66], "inittransact": 14, "inject": [3, 7, 20, 40, 60, 63], "inlin": [14, 51], 
"inmemoryiomanag": 12, "inner": 11, "inner_nod": 8, "inner_typ": 4, "input": [2, 4, 5, 7, 8, 9, 11, 13, 14, 16, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 45, 50, 51, 53, 54, 55, 60, 63, 65, 66, 68], "input1": 12, "input_config_schema": 12, "input_dagster_typ": 37, "input_def": [9, 63], "input_manag": 12, "input_manager_kei": [2, 12, 63], "input_map": [8, 9], "input_nam": 8, "input_t": 23, "input_valu": [8, 9, 13], "inputcontext": [12, 63], "inputdefinit": [9, 12, 45, 63, 68], "inputmanag": [12, 63], "inputmanagerdefinit": 12, "inputmap": [8, 9], "ins": [2, 8, 9, 11, 12, 28, 29, 30, 31, 34, 35, 36, 51, 53, 54, 55, 60, 63, 64, 68], "insensit": [11, 14], "insert": 14, "insid": [1, 2, 5, 12, 14, 16, 26, 27, 33, 34, 40, 50, 59, 65, 67], "insight": 14, "insightconfig": 14, "insightli": 14, "insightlysourc": 14, "insights_lookback_window": 14, "inspect": 5, "insqlitev": 11, "inst_data": [11, 16, 17, 34, 43], "instagram": 14, "instagramsourc": 14, "instal": [14, 16, 23, 34, 38, 40, 44, 52], "install_default_librari": 23, "installation_id": 38, "instanc": [2, 4, 5, 7, 8, 9, 10, 12, 13, 14, 16, 20, 23, 25, 26, 33, 34, 40, 41, 42, 45, 48, 50, 52, 53, 59, 61, 62, 64, 65, 66, 67, 68, 69], "instance_api_url": 14, "instance_config_map": [20, 40], "instance_for_test": [8, 67], "instance_pool_id": 23, "instance_profile_arn": 23, "instance_ref": [3, 67], "instance_typ": [11, 14], "instance_url_prefix": 14, "instanceof": 68, "instanceref": [3, 11, 67], "instancetyp": 11, "instanti": [2, 5, 6, 8, 11, 12, 16, 17, 26, 34, 43, 47, 48, 53, 61, 66, 67], "instead": [2, 3, 4, 7, 8, 9, 11, 13, 14, 16, 17, 18, 19, 20, 23, 26, 34, 42, 45, 50, 51, 52, 60, 64, 65, 67, 68, 69], "instruct": [14, 26, 38, 40, 44], "insuffici": [16, 50], "int": [1, 2, 4, 6, 7, 8, 9, 11, 14, 16, 17, 21, 23, 26, 34, 39, 40, 45, 51, 52, 59, 63, 64, 65, 66, 67, 68, 69], "integ": [4, 11, 14, 26, 61, 63], "integr": [14, 15, 16, 21, 23, 24, 25, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 44, 47, 49, 50, 52, 53, 54, 55, 57, 58, 
59, 60], "intend": [2, 8, 9, 11, 16, 20, 26, 39, 63, 65, 68], "intens": 23, "intent": 11, "inter": 3, "interact": [5, 8, 13, 14, 16, 19, 23, 26, 28, 32, 34, 37, 50, 60], "intercom": 14, "intercomsourc": 14, "interfac": [3, 11, 12, 14, 16, 21, 26, 33, 42], "intermedi": [11, 16, 50], "intern": [2, 3, 8, 12, 16, 26, 33, 34, 39, 43, 45, 48, 50, 51, 68], "internal_asset_dep": 2, "internal_ip_onli": 34, "internaliponli": 34, "interpol": [16, 50], "interrupt": [14, 16, 50], "intersect": 2, "interv": [2, 11, 14, 16, 17, 34, 50, 64, 67], "intmetadatavalu": 63, "intro": 14, "introduc": [5, 18, 19, 20], "introduct": [5, 18], "intsourc": [4, 11, 14, 16, 23, 25, 26, 27, 33, 38, 40, 43, 48, 49, 50, 53, 57, 67], "intuit": 16, "invalid": [7, 26, 39, 51, 63, 64, 69], "invalid_line_no": 26, "invalid_output_nam": 39, "invalid_step_kei": 39, "invalidoutputerror": 39, "invalidoutputerrorinfo": 39, "invalidsteperror": 39, "invari": 7, "invoc": [2, 8, 9, 10, 13, 26, 40, 51, 61, 66, 67], "invok": [2, 4, 5, 6, 8, 9, 10, 11, 13, 16, 19, 20, 26, 41, 47, 51, 61, 65, 67], "io": [2, 8, 14, 16, 17, 18, 20, 26, 27, 28, 32, 33, 34, 37, 38, 40, 50, 53, 59, 60, 62, 63], "io_manag": [2, 8, 9, 12, 13, 14, 16, 17, 26, 28, 29, 30, 31, 33, 34, 35, 36, 53, 54, 55, 63], "io_manager_def": 2, "io_manager_kei": [2, 6, 12, 14, 26, 33, 59, 60, 63], "iomanag": [2, 5, 12, 14, 26, 33, 59, 62, 65], "iomanagerdefinit": [2, 5, 12, 16, 17, 28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "iomanagerdefnit": 12, "iop": 23, "ip": [14, 16, 34, 50], "ipynb": [60, 63], "iri": 60, "iris_dataset": 60, "iris_kmean": 60, "iris_kmeans_notebook": 60, "irrespect": 14, "is_asset_materialization_plan": 8, "is_asset_observ": 8, "is_builtin": [45, 68], "is_dagster_ev": 11, "is_engine_ev": 8, "is_expectation_result": 8, "is_failur": [8, 11], "is_failure_or_cancel": 11, "is_fan_in": 9, "is_finish": 11, "is_handled_output": 8, "is_hook_ev": 8, "is_loaded_input": 8, "is_observ": 2, "is_own": 23, "is_pres": 63, "is_requir": [2, 4, 6, 45, 63], 
"is_resource_init_failur": 8, "is_resume_retri": 11, "is_sandbox": 14, "is_step_ev": 8, "is_step_failur": 8, "is_step_materi": 8, "is_step_restart": 8, "is_step_skip": 8, "is_step_start": 8, "is_step_success": 8, "is_step_up_for_retri": 8, "is_success": [11, 26], "is_successful_output": 8, "is_user_code_error": 7, "is_valid": 63, "isinst": [26, 68], "isn": [27, 40], "iso": 14, "iso8601": 14, "isoformat": 26, "isol": 40, "ispreempt": 34, "issu": [3, 14, 16, 26, 34, 35, 36, 44, 50, 53, 54, 55], "issuer": 14, "issuer_id": 14, "item": [2, 4, 6, 14, 26, 60], "iter": [2, 5, 11, 14, 16, 26, 50, 66, 67], "iterablesourc": 14, "its": [2, 3, 6, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 27, 33, 40, 46, 50, 52, 60, 62, 63, 64, 65, 67], "itself": [3, 4, 7, 8, 9, 12, 13, 14, 16, 50], "ivi": [16, 50], "ivy2": [16, 50], "ivyset": [16, 50], "jaffle_shop": 26, "jar": [16, 34, 50], "jar_file_uri": 34, "jarfileuri": 34, "java": [14, 16, 50], "javaseri": [16, 50], "javax": [16, 50], "jdbc": 14, "jdbc_url": 14, "jdbc_url_param": 14, "jdbcdestin": 14, "jdbcsourc": 14, "jira": 14, "jirasourc": 14, "jitter": 63, "jni": [16, 50], "job": [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 33, 34, 38, 39, 40, 41, 42, 44, 47, 49, 50, 51, 52, 53, 56, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "job1": 13, "job_code_origin": 11, "job_config": [20, 34, 40], "job_context": 60, "job_def": [8, 60, 61, 68], "job_for_datadog_op": 24, "job_id": [23, 26], "job_imag": [20, 40], "job_metadata": [20, 40], "job_nam": [8, 10, 11, 12, 15, 26, 39, 42, 52, 60, 65, 67, 69], "job_namespac": [20, 40], "job_permiss": 23, "job_runn": 40, "job_scoped_clust": 34, "job_select": [42, 52, 67, 69], "job_snapshot_id": 11, "job_spec_config": [20, 40], "job_timeout_in_second": 34, "job_wait_timeout": 20, "jobconfigvalidationinvalid": 39, "jobdefinit": [2, 5, 7, 8, 9, 13, 15, 17, 42, 52, 60, 61, 63, 65, 67, 69], "jobexecutionresult": 8, "jobfactori": 13, "jobid": 34, "jobnotfounderror": 39, 
"jobs_client": 23, "jobsapi": 23, "jobselector": [42, 52, 67, 69], "jobspec": 40, "jog": [64, 67], "join": [6, 12, 14, 16, 50], "join_channel": 14, "joinpath": 26, "json": [1, 2, 3, 4, 6, 8, 9, 13, 14, 26, 32, 33, 34, 50, 51, 60, 63, 67], "json_console_logg": 61, "json_credentials_envvar": 34, "json_log_format": 26, "json_repr": 14, "jsonl": 14, "jsonlinesnewlinedelimitedjson": 14, "jsonmetadatavalu": 63, "july_asset": 67, "june": 64, "jupyt": [8, 13, 60], "just": [4, 5, 8, 14, 16, 20, 27, 28, 29, 30, 31, 34, 35, 36, 40, 53, 54, 55, 59], "jvm": [16, 23, 50], "jwt": 14, "k": [16, 50], "k8": 20, "k8s_job": 40, "k8s_job_executor": 40, "k8s_job_nam": 40, "k8s_job_op": 40, "k8srunlaunch": [20, 40], "kafka": [14, 16, 25, 50], "kafkadestin": 14, "kafkaproduc": 14, "kafkasourc": 14, "kb": [16, 50], "kdc": 34, "kdcdbkeyuri": 34, "keen": 14, "keendestin": 14, "keep": [2, 14, 16, 40, 50, 53, 64, 67], "keep_files_in_gcs_bucket": 14, "keepal": 57, "keepalive_interv": 57, "kei": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 44, 45, 50, 52, 53, 54, 55, 57, 59, 60, 64, 65, 66, 67, 68], "kept": 59, "kerber": 34, "kerbero": 34, "kerberosconfig": 34, "key1": 14, "key2": 14, "key3": 14, "key_encrypting_kei": 14, "key_fil": 57, "key_id": 14, "key_label_nam": 4, "key_prefix": [2, 14, 26, 28, 29, 30, 31, 33, 34, 35, 36, 53, 54, 55, 60], "key_store_password": 14, "key_str": 57, "key_typ": 4, "keypairauthent": 14, "keypassworduri": 34, "keyprefixdagsterdbttransl": 26, "keys_by_dimens": 64, "keys_by_input_nam": 2, "keys_by_output_nam": 2, "keyspac": 14, "keystor": 34, "keystorepassworduri": 34, "keystoreuri": 34, "keyword": [2, 4, 17, 19, 32, 60, 66], "kib": [16, 50], "kick": 2, "kill": [16, 50], "killblacklistedexecutor": [16, 50], "killen": [16, 50], "killtimeout": [16, 50], "kind": [1, 2, 6, 8, 13, 14, 45, 60, 64, 68], "kinesi": 14, "kinesisdestin": 14, "kit": 52, "klaviyo": 14, 
"klaviyosourc": 14, "km": [23, 34], "kms_kei": 23, "kmskeyuri": 34, "know": [2, 8, 13, 14, 16, 18, 50, 51, 53], "known": [14, 64], "known_stat": [11, 68], "kryo": [16, 50], "kryoregistr": [16, 50], "kryoseri": [16, 50], "kube": 22, "kubeconfig": [20, 40], "kubeconfig_fil": [20, 40], "kubectl": 40, "kubernet": [11, 16, 18, 22, 39, 50, 59], "kustom": 14, "kustomersingersourc": 14, "kvdb": 14, "kvdbdestin": 14, "kwarg": [4, 7, 8, 9, 11, 13, 14, 15, 17, 51, 61, 66, 68], "kyriba": 14, "kyribasourc": 14, "l": [3, 18], "label": [1, 2, 6, 14, 20, 34, 40, 45, 59, 63], "lack": [16, 50], "lag": 14, "lake": [14, 17, 23], "lakeformation_database_nam": 14, "lambda": [8, 13, 14, 33, 69], "lang": 14, "languag": 14, "larg": [2, 5, 14, 16, 23, 26, 50], "larger": [16, 50, 64], "last": [2, 8, 11, 14, 16, 34, 35, 36, 50, 64, 67], "last_completion_tim": 67, "last_run_kei": 67, "lastli": 2, "lastpartitionmap": 64, "lat": 14, "late": 52, "latenc": [16, 50], "later": [16, 50], "latest": [2, 11, 14, 16, 18, 19, 20, 23, 34, 40, 50, 56, 67], "latest_consumed_event_id": 67, "latest_event_partit": 67, "latest_event_storage_id": 67, "latest_materialization_by_partit": 67, "latest_materialization_records_by_kei": 67, "latest_materialization_records_by_partit": 67, "latest_materialization_records_by_partition_and_asset": 67, "latitud": 14, "latter": 11, "launch": [3, 8, 11, 13, 14, 16, 19, 20, 21, 23, 26, 27, 33, 40, 50, 59, 64, 67], "launcher": [3, 16, 20, 27, 40], "launchpipelineexecut": 3, "lazi": [2, 3, 26, 46, 65], "lazili": 65, "lazy_loaded_repositori": 65, "lead": [2, 14, 16, 50], "leader": [14, 16, 50], "learn": [2, 26], "least": [2, 14, 16, 50], "leav": [4, 9, 14, 16, 17, 40, 50], "ledger": 14, "left": [2, 11, 14, 33, 40, 52], "legaci": [5, 37], "lemlist": 14, "lemlistsourc": 14, "len": 63, "length": [4, 7, 14, 16, 34, 50, 53], "lengthi": 11, "less": [11, 14, 16, 18, 19, 20, 23, 50], "lesson": 14, "let": [14, 16, 18, 50], "letter": [2, 14, 34, 60, 63], "level": [1, 3, 4, 5, 8, 9, 11, 12, 
13, 14, 16, 18, 23, 26, 34, 40, 45, 50, 61, 63, 65, 67, 69], "lever": 14, "leverhiringsourc": 14, "lib": 34, "libjar": 34, "librari": [11, 14, 15, 16, 18, 21, 23, 24, 25, 26, 28, 29, 30, 31, 33, 35, 36, 38, 39, 40, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61], "lifetim": [23, 34], "like": [2, 3, 4, 8, 11, 13, 14, 16, 20, 23, 26, 33, 38, 39, 40, 50, 51, 53, 59, 61, 63, 66], "limit": [2, 11, 14, 16, 17, 27, 40, 50, 67], "line": [14, 16, 26, 50], "lineag": [16, 50, 63], "linear": 63, "linger_m": 14, "link": [14, 26], "linkedin": 14, "linkedinadssourc": 14, "linkedinpagessourc": 14, "linnwork": 14, "linnworkssourc": 14, "lint": 40, "list": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 23, 25, 26, 27, 32, 33, 34, 40, 41, 42, 45, 50, 52, 53, 59, 60, 63, 64, 65, 67, 68, 69], "list_file_system": 17, "list_objects_v2": 16, "list_vers": 3, "listdir": 65, "listen": [14, 16, 18, 19, 20, 50, 59], "listenerbu": [16, 50], "liter": [4, 26], "littl": [16, 50], "live": [2, 3, 12, 14, 16, 50, 67], "liveupd": [16, 50], "ll": [14, 18, 23, 24, 25, 38, 40, 44, 47, 52, 59], "load": [1, 2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 20, 26, 28, 29, 30, 31, 33, 34, 35, 36, 40, 43, 48, 50, 51, 53, 54, 55, 60, 63, 64, 65, 67, 68, 69], "load_asset_valu": [2, 5, 65], "load_assets_from_airbyte_inst": 14, "load_assets_from_airbyte_project": 14, "load_assets_from_airflow_dag": 15, "load_assets_from_connect": 14, "load_assets_from_current_modul": [2, 59], "load_assets_from_dbt_cloud_job": 26, "load_assets_from_dbt_manifest": 26, "load_assets_from_dbt_project": 26, "load_assets_from_fivetran_inst": 33, "load_assets_from_modul": 2, "load_assets_from_package_modul": 2, "load_assets_from_package_nam": 2, "load_dict": 68, "load_from_path": 12, "load_incluster_config": [20, 40], "load_input": [2, 5, 12, 62, 65], "load_iri": 60, "load_kube_config": [20, 40], "load_table_from_local_parquet": 53, "loadabl": 5, "loaded_input": 8, "loader": [4, 5, 45, 65, 68], "loader_vers": 
68, "loading_method": 14, "loadrepositori": 3, "local": [3, 8, 11, 12, 14, 16, 17, 18, 19, 22, 23, 26, 27, 34, 49, 50, 59], "local_artifact_storag": [11, 12], "local_artifact_storage_data": 11, "local_bas": 69, "local_compute_log_manag": 11, "local_dagster_job_package_path": 23, "local_dir": [16, 17, 34, 50], "local_disk0": 23, "local_file_manag": 11, "local_job_package_path": 16, "local_json_destin": 14, "local_pipeline_package_path": [16, 23], "local_temp": 11, "local_warehous": 69, "localartifactstorag": 11, "localclust": 22, "localcomputelogmanag": 11, "localfilehandl": 11, "localfilesystemlimit": 14, "localhost": [3, 14, 15, 16, 18, 19, 20, 39, 41, 42, 44, 52], "localjsondestin": 14, "localobjectrefer": 40, "locat": [2, 3, 5, 11, 14, 15, 16, 20, 23, 25, 34, 35, 36, 39, 40, 50, 60, 67], "location_nam": [3, 67], "log": [3, 7, 8, 10, 12, 14, 16, 17, 18, 20, 21, 23, 26, 40, 41, 43, 47, 48, 50, 51, 53, 59, 60, 66, 67, 69], "log_ev": [8, 12], "log_group_nam": 16, "log_kei": 11, "log_level": [3, 16], "log_manag": [8, 11, 12, 66, 68], "log_materi": 8, "log_param": 41, "log_stream_nam": 16, "logblockupd": [16, 50], "logconf": [16, 50], "logger": [4, 5, 8, 9, 13, 16, 21, 26, 47, 51, 60, 67, 69], "logger_config": [16, 47, 61], "logger_def": [8, 9, 13, 60, 61], "logger_fn": [16, 47, 61], "logger_to_init": 61, "loggerdefinit": [4, 5, 8, 9, 13, 16, 47, 60, 61], "logging_tag": [8, 60], "loggingconfig": 34, "logic": [2, 5, 11, 12, 14, 16, 47, 50, 61, 63, 66, 68], "logicalreplicationcdc": 14, "login": [14, 16, 23, 40, 53, 54, 55], "login_customer_id": 14, "login_timeout": 53, "loginpassword": 14, "loglevel": 18, "logs_batch_s": 14, "logwrit": 34, "lon": 14, "long": [11, 14, 16, 17, 23, 34, 40, 49, 50, 59, 64, 67], "longer": [16, 23, 40, 50, 63, 67], "longform": [16, 50], "longitud": 14, "look": [2, 4, 9, 11, 14, 26, 67], "lookback": 14, "lookback_window": 14, "lookback_window_dai": 14, "looker": 14, "lookersourc": 14, "lookup": [14, 16, 50], "lookuptimeout": [16, 50], "loop": 
[14, 16, 40, 50], "los_angel": [2, 26, 53, 64, 67], "loss": 14, "lost": [14, 16, 50], "lot": [14, 16, 50], "low": 23, "lower": [16, 23, 50], "lowercas": [18, 19, 20], "lowest": 14, "ls": 26, "lsf": 22, "lwa_app_id": 14, "lwa_client_secret": 14, "lz4": [16, 50], "lz4compressioncodec": [16, 50], "lzf": [16, 50], "lzfcompressioncodec": [16, 50], "m": [3, 16, 40, 50, 64, 67], "machin": [2, 3, 16, 34, 50, 59], "machineri": [11, 43, 45, 48, 63, 68], "machinetyp": 34, "machinetypeuri": 34, "maco": 40, "macro": 26, "made": [8, 9, 14, 16, 50, 60, 61, 66, 67], "magic": [11, 66], "magic_word": 11, "magicmock": [17, 66], "mai": [1, 2, 4, 7, 8, 9, 10, 11, 13, 14, 16, 17, 18, 19, 20, 23, 26, 27, 33, 34, 40, 50, 51, 53, 60, 61, 63, 64, 65, 66, 67, 69], "mailchimp": 14, "mailchimpsourc": 14, "mailgun": 14, "mailgunsourc": 14, "main": [14, 23, 32, 34, 40, 45, 50, 52, 64], "main_class": 56, "mainclass": 34, "mainjarfileuri": 34, "mainli": 2, "mainpythonfileuri": 34, "maintain": [3, 11, 59], "majmin": 40, "make": [3, 4, 8, 9, 11, 12, 14, 16, 17, 18, 19, 20, 23, 26, 34, 38, 40, 50, 53, 59, 65, 66], "make_bar_job": [8, 13], "make_dagster_definit": 15, "make_dagster_definition_from_airflow_dag_bag": 15, "make_dagster_definitions_from_airflow_dag_bag": 15, "make_dagster_definitions_from_airflow_dags_path": 15, "make_dagster_definitions_from_airflow_example_dag": 15, "make_dagster_job_from_airflow_dag": 15, "make_dagster_repo": 15, "make_definition_from_dag_bag": 15, "make_definitions_from_dir": 15, "make_email_on_run_failure_sensor": 69, "make_ephemeral_airflow_db_resourc": 15, "make_expensive_job": 65, "make_expensive_schedul": 65, "make_job": 13, "make_my_t": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "make_persistent_airflow_db_resourc": 15, "make_python_type_usable_as_dagster_typ": 68, "make_repo_from_dir": 15, "make_schedules_and_jobs_from_airflow_dag_bag": 15, "make_slack_on_freshness_policy_status_change_sensor": 52, "make_slack_on_run_failure_sensor": 52, 
"make_teams_on_run_failure_sensor": 42, "make_values_resourc": [59, 66], "malform": 7, "man": 24, "manag": [2, 3, 7, 8, 10, 18, 20, 23, 26, 28, 29, 30, 31, 33, 35, 36, 40, 50, 54, 55, 60, 61, 62, 63, 66], "managed_logg": 61, "managedgroupconfig": 34, "mani": [5, 8, 11, 14, 16, 20, 23, 40, 50, 52, 63, 65, 67, 68], "manifest": 26, "manifest_json": 26, "manipul": 11, "manner": 66, "manual": [2, 8, 9, 14, 16, 23, 50, 53, 59, 63], "manuallyassignalistofpartit": 14, "map": [2, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 20, 26, 27, 33, 34, 45, 50, 51, 53, 60, 63, 65, 66, 67, 68], "map_config_op": 4, "mapped_node_input_nam": 9, "mapped_node_nam": 9, "mapped_node_output_nam": 9, "mapped_op": 6, "mappedinputplacehold": 9, "mapping_kei": [6, 8, 12], "mapr": 34, "mapreduc": [16, 34, 50], "mapreducetutori": 34, "maps_x": 9, "mariadb": 14, "mariadbcolumnstoredestin": 14, "mark": [4, 16, 20, 50, 64, 67], "markdown": [45, 52, 63, 68], "markdownmetadatavalu": 63, "market": [2, 14], "marketing_job": 2, "marketo": 14, "marketosourc": 14, "master": [14, 16, 34, 40, 50], "master_url": 56, "masterconfig": 34, "match": [1, 2, 7, 8, 11, 14, 16, 23, 33, 42, 45, 50, 60, 62, 63, 64, 66, 67], "materi": [1, 2, 3, 4, 5, 11, 12, 14, 16, 17, 21, 26, 32, 33, 34, 60, 63, 67], "materializ": 2, "materialization_records_by_kei": 67, "materialization_records_for_kei": 67, "materialize_on_miss": 2, "materialize_on_parent_upd": 2, "materialize_on_required_for_fresh": 2, "materialize_to_memori": 8, "materializeresult": 2, "matter": [2, 16, 50, 67], "maven": [16, 50], "max": [3, 14, 16, 23, 50, 63], "max_attempt": 16, "max_batch_s": 14, "max_block_m": 14, "max_catchup_run": 67, "max_completion_wait_time_second": 23, "max_concurr": [8, 27, 40], "max_concurrent_run": 11, "max_dat": 26, "max_in_flight_requests_per_connect": 14, "max_job": 59, "max_materializations_per_minut": 2, "max_messag": 14, "max_padding_size_mb": 14, "max_partitions_per_run": 64, "max_pending_messag": 14, 
"max_pending_messages_across_partit": 14, "max_poll_record": 14, "max_records_process": 14, "max_request_s": 14, "max_retri": [8, 9, 63], "max_tick_retri": 67, "max_user_code_failure_retri": 11, "max_wait_second": 14, "max_wait_tim": 14, "max_wait_time_second": 23, "max_work": [3, 23], "maxattempt": [16, 50], "maxblocksinflightperaddress": [16, 50], "maxchunksbeingtransf": [16, 50], "maxconsecutiveattempt": [16, 50], "maxexecutor": [16, 50], "maxfailedexecutorspernod": [16, 50], "maxfailedtasksperexecutor": [16, 50], "maxfailur": [16, 50], "maxfailuresperhour": 34, "maxim": [16, 40, 50], "maximum": [2, 3, 11, 14, 16, 21, 23, 26, 33, 34, 50, 63, 64, 67], "maximum_lag_minut": [2, 26, 67], "maxpartitionbyt": [16, 50], "maxrat": [16, 50], "maxrateperpartit": [16, 50], "maxregisteredresourceswaitingtim": [16, 50], "maxremoteblocksizefetchtomem": [16, 50], "maxreqsinflight": [16, 50], "maxresults": [16, 50], "maxretainedfil": [16, 50], "maxretri": [16, 50], "maxsiz": [16, 50], "maxsizeinflight": [16, 50], "maxtaskattemptsperexecutor": [16, 50], "maxtaskattemptspernod": [16, 50], "mb": [16, 50], "md": [14, 63], "md_str": 63, "mdash": 14, "me": 14, "mean": [2, 4, 6, 8, 12, 13, 14, 16, 42, 50, 52, 64, 67, 69], "meant": [2, 7, 8, 45, 68], "measur": [14, 16, 50], "mechan": [5, 14, 16, 18, 50, 53, 54, 55], "median": [16, 50], "meet": [1, 2, 8, 9, 13, 33, 51, 60, 63], "megabyt": 14, "meilisearch": 14, "meilisearchdestin": 14, "meltano": 14, "mem_io_manag": [8, 12], "member": [7, 11, 14, 65], "membership": 14, "memoiz": [3, 12, 13], "memoizableiomanag": 62, "memoizaton": [8, 9], "memoized_run_tag": 62, "memori": [8, 9, 11, 12, 13, 14, 16, 17, 23, 28, 34, 50, 53], "memory_onli": [16, 50], "memory_only_s": [16, 50], "memoryfract": [16, 50], "memorymapthreshold": [16, 50], "memoryoverhead": [16, 50], "mention": 14, "menu": 14, "merchant_id": 14, "merg": [16, 50], "mesag": 14, "meso": [16, 50], "mesos_sandbox": [16, 50], "messag": [4, 7, 8, 10, 11, 14, 16, 26, 39, 42, 47, 50, 52, 
59, 61, 67, 69], "message_body_kei": 14, "message_delai": 14, "message_fn": [42, 52], "message_group_id": 14, "message_interval_m": 14, "message_qo": 14, "message_retain": 14, "messageformat": 14, "met": [11, 67], "meta": [14, 26, 33, 40], "metabas": 14, "metabasesourc": 14, "metadata": [1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 25, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39, 40, 46, 50, 51, 53, 54, 55, 59, 60, 62, 65, 68], "metadata_by_kei": 2, "metadata_by_output_nam": 2, "metadata_by_table_nam": 33, "metadata_fn": 45, "metadatachangeev": 25, "metadatachangeevent_v4": 25, "metadatachangepropos": 25, "metadatachangeproposal_v1": 25, "metadataentri": 63, "metadatamap": 65, "metadatauserinput": [2, 26, 33], "metadatavalu": [1, 2, 6, 8, 9, 13, 45, 63], "method": [1, 2, 6, 8, 11, 12, 13, 14, 15, 16, 21, 24, 26, 29, 30, 31, 33, 34, 35, 36, 37, 41, 46, 47, 52, 53, 59, 61, 62, 63, 64, 66, 67], "metric": [14, 16, 24, 34, 49, 50], "metrica": 14, "mgmt": 16, "mib": [16, 23, 50], "microsoft": 14, "microsoftteam": 42, "microsoftteamssourc": 14, "midnight": [64, 67], "might": [4, 16, 50, 51], "migrat": 3, "mileston": [16, 50], "milli": 14, "millisecond": [3, 14, 16, 50], "min": [14, 23], "min_dat": 26, "min_work": 23, "minexecutor": [16, 50], "minim": [16, 50], "minimum": [14, 16, 23, 50, 52, 67], "minimum_interval_second": [52, 67], "minrateperpartit": [16, 50], "minregisteredresourcesratio": [16, 50], "minut": [2, 14, 16, 23, 34, 50, 52, 64, 67], "minute_of_hour": [64, 67], "minute_offset": [64, 67], "minutes_overdu": [52, 67], "mirror": [14, 24], "miss": [2, 12, 26, 67], "missing_column": 63, "missing_th": 63, "mitig": [16, 50], "mix": 2, "mixin": 11, "mixpanel": 14, "mixpanelsourc": 14, "ml": 59, "ml_model_for_each_ocean": 64, "mlf_exampl": 41, "mlflow_op": 41, "mlflow_s3_endpoint_url": 41, "mlflow_track": 41, "mlflow_tracking_uri": 41, "mlflowclient": 41, "mlop": 59, "mm": 14, "mnt": 19, "moab": 22, "mock": [10, 12, 17, 66], "mock_catalog": 14, "mock_resourc": 66, "mode": 
[2, 3, 8, 9, 11, 14, 16, 32, 46, 50, 51, 53], "model": [2, 4, 15, 26, 59], "model_nam": 26, "modifi": [14, 18, 19, 20, 26, 63], "modul": [2, 3, 5, 8, 11, 13, 14, 15, 16, 17, 18, 19, 20, 34, 40, 43, 48, 59, 61, 63, 69], "module_nam": [3, 11], "moduletyp": 2, "moment": 14, "mondai": [14, 64], "mondaysourc": 14, "mongo": 14, "mongodb": 14, "mongodbatla": 14, "mongodbdestin": 14, "mongodbsourc": 14, "mongodbv2sourc": 14, "monitor": [14, 15, 16, 21, 26, 33, 42, 50, 52, 67, 69], "monitor_all_repositori": [42, 52, 67, 69], "monitored_asset": 67, "monitored_job": [42, 52, 67, 69], "month": [64, 67], "monthli": [64, 67], "monthly_partitioned_config": [64, 67], "monthlypartitionsdefinit": 64, "more": [2, 5, 7, 8, 14, 15, 16, 18, 23, 26, 34, 38, 42, 50, 52, 53, 54, 55, 63, 65, 66, 67, 69], "most": [2, 8, 14, 16, 18, 19, 20, 23, 45, 50, 67, 68, 69], "mostli": [11, 17], "mount": [14, 16, 20, 40], "mqtt": 14, "mqttdestin": 14, "mr": 34, "ms": [14, 16, 42, 50], "msg": [61, 63], "msg_fn": 11, "mssql": 14, "mssqldestin": 14, "mssqlsourc": 14, "msteams_resourc": 42, "msteamsresourc": 42, "much": [14, 16, 40, 50], "multi": [2, 8, 9, 14, 16, 18, 26, 50, 64, 67], "multi_asset": [1, 2, 8, 26, 33], "multi_asset_sensor": 67, "multi_or_in_process_executor": [2, 8, 9, 13], "multi_out": 63, "multi_run": 64, "multiassetsensordefinit": 67, "multiassetsensorevaluationcontext": 67, "multidependencydefinit": 9, "multipartit": [64, 67], "multipartitionkei": 64, "multipartitionmap": 64, "multipartitionsdefinit": [8, 12, 64], "multipartitionsmap": 64, "multipl": [1, 2, 3, 5, 8, 9, 12, 13, 14, 15, 16, 17, 18, 20, 26, 33, 34, 40, 50, 53, 60, 63, 64, 65, 67], "multipli": [16, 50], "multiprocess": [8, 69], "multiprocess_executor": [8, 13], "multischema": 14, "multitosingledimensionpartitionmap": 64, "must": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 26, 33, 34, 35, 36, 39, 40, 42, 45, 50, 51, 59, 60, 62, 63, 64, 66, 67, 68], "mutat": [3, 23, 39], "my": [1, 11, 12, 14, 16, 
17, 23, 26, 34, 40, 63], "my_airbyte_job": 14, "my_airbyte_resourc": 14, "my_artifact": 59, "my_asset": [1, 2, 4, 8, 34], "my_asset_check": 2, "my_asset_has_enough_row": 1, "my_asset_sensor": 67, "my_assets_list": 2, "my_aws_key_id": 41, "my_bool_with_metadata": 4, "my_census_job": 21, "my_census_resourc": 21, "my_channel": 52, "my_composed_airbyte_job": 14, "my_composed_fivetran_job": 33, "my_custom_dbt_run": 26, "my_custom_metadata": 26, "my_custom_metadata_valu": 26, "my_custom_tag": 26, "my_dag_bag": 15, "my_dagster_job": 15, "my_dagster_queu": 59, "my_dashboard": 63, "my_databas": [11, 48, 53, 54, 55], "my_dataset": [34, 35, 36, 63], "my_db": [28, 29, 30, 31], "my_dbt_asset": 26, "my_dbt_cli_job": 26, "my_dbt_cloud_job": 26, "my_dbt_cloud_resourc": 26, "my_dbt_op": 26, "my_dbt_output": 26, "my_default_str": 4, "my_downstream_op": 9, "my_echo_op": 51, "my_ent": 59, "my_experi": 41, "my_explicit_paramet": 4, "my_fivetran_job": 33, "my_fivetran_resourc": 33, "my_funct": [2, 63], "my_graph": [8, 9, 13, 51], "my_graph_alia": [8, 9], "my_hook": [8, 9], "my_hostnam": [11, 48], "my_implicit_paramet": 4, "my_int_asset": 2, "my_int_list": 4, "my_int_var": 66, "my_io_manag": 12, "my_io_manager_kei": 12, "my_job": [9, 12, 16, 17, 18, 34, 42, 49, 52, 64, 65, 66, 67, 69], "my_launched_job": 59, "my_message_fn": [42, 52, 69], "my_modul": [18, 19, 20], "my_op": [12, 50, 66], "my_org": 26, "my_other_explicit_paramet": 4, "my_other_t": 63, "my_password": [11, 48], "my_polici": 2, "my_prefix": 26, "my_project": [40, 59], "my_pyspark_resourc": 50, "my_repo": [19, 20, 28, 29, 30, 31, 42, 52, 65, 69], "my_repositori": 59, "my_return_n_": 65, "my_s3_endpoint": 41, "my_sas_token": 17, "my_schedul": 65, "my_schema": [28, 29, 30, 31, 53, 54, 55], "my_secret": 41, "my_select": 26, "my_sensor": [64, 67], "my_shell_op": 51, "my_simple_airbyte_job": 14, "my_simple_census_job": 21, "my_simple_fivetran_job": 33, "my_slack_token": 52, "my_snowflake_job": 53, "my_sourc": 26, "my_spark_job": 
50, "my_storage_account": 17, "my_str": 4, "my_str_var": 66, "my_string_asset": 2, "my_subdomain": 14, "my_tabl": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55, 63, 67], "my_table_a": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "my_table_schema": 63, "my_tag": [8, 9], "my_text_label": 63, "my_upstream_asset": 2, "my_upstream_graph": 9, "my_upstream_op": 9, "my_us": 19, "my_usernam": [11, 48], "my_valu": [8, 9, 26], "my_vari": 26, "my_wandb_job": 59, "my_wandb_resourc": 59, "my_warehous": [53, 55], "myassetconfig": 4, "mybigqueryiomanag": [34, 35, 36], "myclass": 63, "mycompani": [20, 40], "mycoolsit": [52, 63, 69], "mycorp": [16, 34], "mydbtconfig": 26, "myduckdbiomanag": [28, 29, 30, 31], "myexternaliomanag": 12, "myhourssourc": 14, "myiomanag": 12, "mymodul": 13, "mypermissiveopconfig": 4, "myregistrynam": 40, "myshopifi": 14, "mysit": 14, "mysnowflakeiomanag": [53, 54, 55], "mysql": 14, "mysql_db": 43, "mysql_url": 43, "mysqldestin": 14, "mysqleventlogstorag": [11, 43], "mysqlrunstorag": [11, 43], "mysqlschedulestorag": [11, 43], "mysqlsourc": 14, "mytabl": 63, "n": [3, 14, 15, 18, 46, 65], "n1": 34, "n_worker": 22, "na": 14, "naiv": 34, "name": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 46, 50, 51, 52, 53, 54, 55, 56, 59, 60, 63, 64, 65, 67, 68, 69], "name1": [16, 50], "name2": [16, 50], "name_type_dict": 63, "named_repo": 5, "namedtemporaryfil": 11, "namedtupl": 60, "namespac": [14, 20, 40], "narrow": 67, "nativ": [14, 16, 50, 59], "nativenetworkencryptionnn": 14, "navig": 14, "ndaysago": 14, "necessari": [9, 11, 14, 16, 23, 26, 50, 59, 68], "need": [2, 5, 8, 9, 11, 13, 14, 16, 17, 18, 23, 24, 25, 26, 27, 34, 38, 40, 44, 45, 47, 50, 52, 53, 59, 63, 65, 67, 68, 69], "needs_input": 9, "neg": [8, 11, 14, 16, 50, 63], "neither": [16, 34, 45, 46, 68], "nest": [8, 9, 13, 41], "net": [14, 16, 50], "netsuit": 14, "netsuitesourc": 14, "netti": [16, 50], "network": [11, 14, 16, 
17, 19, 27, 34, 50], "network_timeout": 53, "network_uri": 34, "networkuri": 34, "never": [2, 8, 14, 16, 18, 19, 20, 21, 26, 33, 50, 67], "new": [2, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 20, 23, 26, 40, 44, 50, 52, 65, 66], "new_clust": 23, "newer": [16, 50], "newli": [11, 14], "newlin": [34, 35, 36, 53, 54, 55], "newlines_in_valu": 14, "next": [6, 14, 63, 67], "nf": [16, 50], "no_host_key_check": 57, "no_permiss": 23, "nocompress": 14, "nodatim": 14, "node": [2, 4, 8, 9, 13, 14, 16, 17, 23, 26, 34, 50], "node_a": 9, "node_b": 9, "node_def": [2, 8, 9], "node_handl": [8, 60], "node_info_to_asset_kei": 26, "node_info_to_auto_materialize_policy_fn": 26, "node_info_to_definition_metadata_fn": 26, "node_info_to_descript": 26, "node_info_to_freshness_policy_fn": 26, "node_info_to_group_fn": 26, "node_info_to_metadata": 26, "node_input_source_asset": [8, 9], "node_str": 8, "node_typ": 23, "node_type_id": 23, "nodedefinit": [2, 8, 9], "nodehandl": 8, "nodeinvoc": [8, 9], "noe": 23, "noencrypt": 14, "nois": 52, "non": [2, 4, 8, 14, 16, 19, 26, 27, 34, 50, 51, 64, 67], "non_argument_dep": [2, 8, 60], "non_scalar_schema": 4, "noncancel": [16, 50], "none": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 49, 51, 52, 53, 54, 55, 56, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "none_": 14, "none_resourc": 66, "noneabl": [4, 7], "nonetyp": 4, "noop_compute_log_manag": 11, "noopcomputelogmanag": 11, "nor": [34, 45, 68], "normal": [2, 14, 26], "normalization_t": 14, "normalize_data": 14, "normalized_nam": 15, "nosigint": 69, "note": [2, 4, 8, 11, 12, 14, 16, 17, 18, 19, 20, 21, 23, 26, 27, 34, 35, 36, 38, 39, 43, 48, 50, 51, 53, 55, 61, 63, 65, 67], "notebook": [8, 13, 60, 63], "notebook_path": [23, 60, 63], "notebook_task": 23, "notebookmetadatavalu": 63, "noth": [26, 51, 66, 67, 68], "nothing_int_job": 68, "nothing_job": 68, "notic": [14, 61], "notif": 52, 
"notify_when_back_on_tim": 52, "notion": 14, "notionsourc": 14, "notunnel": 14, "novaluesentinel": [2, 6, 63], "now": [8, 13, 23, 38, 39], "np": 60, "ntype": 68, "null": [1, 14, 63], "nullabl": 63, "num": [8, 9, 16, 50, 63], "num_allowed_row": 45, "num_failur": 46, "num_row": [1, 63], "num_work": 23, "number": [2, 3, 5, 8, 11, 14, 16, 21, 22, 23, 26, 27, 33, 34, 39, 40, 45, 46, 50, 52, 53, 60, 63, 64, 67], "numconnectionsperp": [16, 50], "numer": 53, "numeric_event_properties_kei": 14, "numinst": 34, "numlocalssd": 34, "numpi": [53, 60], "numrbackendthread": [16, 50], "numretri": [16, 50], "o": [2, 3, 12, 14, 28, 29, 30, 31, 35, 36, 54, 55], "oar": 22, "oauth": [14, 16, 23, 50], "oauth2": 14, "oauth20": 14, "oauth2accesstoken": 40, "oauth2credenti": 14, "oauth_client_id": 23, "oauth_client_secret": 23, "oauth_credenti": 23, "oauthauthent": 14, "oauthcredenti": 14, "obj": 12, "object": [1, 2, 4, 5, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 23, 26, 27, 32, 33, 34, 37, 39, 40, 42, 45, 46, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, 66, 67, 68], "object_typ": 14, "objectadmin": 34, "objectmeta": 40, "objectstreamreset": [16, 50], "observ": [2, 6, 26, 67], "observable_source_asset": [2, 5], "observation_job": 2, "observe_fn": 2, "obtain": 14, "occasion": [16, 50], "occur": [2, 7, 8, 9, 11, 13, 14, 16, 34, 39, 50, 51, 59, 63, 64, 67, 68], "oceans_partitions_def": 64, "ocsp": 53, "ocsp_response_cache_filenam": 53, "octavia": 14, "off": [2, 3, 8, 16, 33, 34, 40, 50, 62], "offer": [16, 50], "offheap": [16, 50], "offici": [3, 14, 23, 39], "offset": [14, 16, 50, 64], "often": [1, 2, 7, 16, 23, 50], "ok": 4, "okta": 14, "oktasourc": 14, "old": [2, 5, 8, 16, 50], "older": [5, 16, 50], "omit": [2, 4, 14, 16, 34, 39, 45, 50, 53, 54, 55, 68, 69], "onc": [2, 4, 11, 14, 38, 44, 52, 64, 66, 67, 68], "one": [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 26, 27, 28, 34, 39, 40, 50, 53, 59, 61, 63, 64, 65, 67, 68, 69], "ones": [2, 23, 26, 64, 67], "onesign": 14, 
"onesignalsourc": 14, "ongo": 11, "onli": [2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 21, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68], "onlin": 24, "onto": 40, "ontolog": 5, "onward": 14, "oom": [16, 50], "op": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 24, 25, 28, 29, 30, 31, 35, 36, 37, 38, 41, 42, 44, 45, 49, 50, 51, 52, 54, 55, 60, 62, 65, 66, 67, 68, 69], "op1": 23, "op_a": [9, 12], "op_b": [9, 12], "op_c": 9, "op_config": [4, 6, 8, 10, 60, 62, 65], "op_def": [2, 8, 12, 60, 62, 68], "op_definit": [14, 26, 27, 33, 34, 40], "op_except": 10, "op_nam": [26, 60], "op_output_valu": 10, "op_retry_polici": [8, 9, 13], "op_select": [8, 9, 11, 13, 39], "op_tag": [1, 2, 33, 60], "op_to_invok": 8, "op_with_config": 4, "opdefinit": [2, 4, 8, 10, 12, 14, 21, 23, 26, 27, 33, 34, 40, 51, 53, 60, 62, 63], "open": [4, 8, 11, 14, 16, 38, 50, 51, 59, 64], "opencostinbyt": [16, 50], "openweath": 14, "openweathersourc": 14, "oper": [2, 8, 12, 13, 14, 15, 16, 21, 26, 33, 37, 38, 39, 50, 53], "opexecutioncontext": [8, 26, 51], "oppos": 2, "opt": [20, 40], "optim": [14, 16, 23, 50], "option": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "optionalcompon": 34, "opversioncontext": 62, "oracl": 14, "oracledestin": 14, "oraclesourc": 14, "orb": 14, "orbit": 14, "orbitsourc": 14, "orbsourc": 14, "orchestr": [6, 11, 26, 40, 59], "order": [2, 4, 8, 11, 12, 14, 16, 17, 20, 23, 26, 40, 45, 46, 50, 59, 63, 64, 67, 68], "order_bi": 11, "ordinari": 16, "ore": 67, "org": [2, 14, 16, 26, 34, 50, 56, 64, 67], "org_id": 14, "organ": [2, 5, 14, 15, 33, 59, 60, 63], "organization_id": 15, "origin": [2, 4, 7, 8, 9, 11, 20, 40], "original_exc_info": 7, "original_root": 7, "os": [6, 14, 16, 18, 19, 20, 38, 42, 52, 63, 
65, 69], "other": [2, 3, 4, 5, 7, 8, 9, 11, 13, 14, 16, 20, 26, 33, 50, 53, 63, 65, 69], "other_asset": 66, "other_expensive_job": 65, "other_nam": 9, "other_op": [14, 33], "other_op_a": [8, 9, 13], "other_op_b": [8, 9, 13], "other_result": 9, "otherwis": [8, 9, 11, 12, 14, 16, 20, 26, 33, 34, 35, 36, 40, 50, 53, 62, 63, 67], "our": [11, 14], "out": [2, 3, 6, 8, 9, 12, 14, 16, 21, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 39, 40, 50, 52, 53, 54, 55, 60], "outcom": 14, "outcome_nam": 14, "outer_graph": 8, "outgo": 14, "outliv": 40, "output": [1, 2, 3, 6, 7, 8, 9, 10, 11, 14, 16, 17, 21, 23, 25, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 42, 45, 50, 51, 52, 53, 54, 55, 60, 62, 63, 66, 68, 69], "output_captur": [8, 11, 68], "output_config_schema": 12, "output_def": [9, 51, 63], "output_for_asset_kei": 8, "output_for_nod": 8, "output_log": 51, "output_map": [8, 9], "output_nam": [6, 8, 12, 26, 60, 63], "output_notebook_io_manag": 60, "output_notebook_nam": 60, "output_obj": 8, "output_requir": 2, "output_t": 23, "output_valu": 8, "outputcontext": [12, 62], "outputdefinit": [9, 12, 45, 63, 68], "outputmap": [8, 9], "outreach": 14, "outreachsourc": 14, "outsid": [2, 5, 8, 11, 12, 16, 40, 50, 64, 66], "over": [2, 3, 5, 8, 14, 16, 23, 24, 39, 50, 64, 65, 67], "overdu": 67, "overestim": [16, 50], "overhead": [16, 50], "overlap": 64, "overload": 23, "overrid": [2, 4, 5, 8, 9, 11, 13, 15, 16, 23, 26, 37, 40, 50, 51, 64, 67], "overridden": [8, 16, 26, 40, 42, 50, 52, 67, 68, 69], "overriden": 26, "overview": [14, 20, 23, 40, 63], "overwrit": [12, 14, 16, 17, 34, 50, 64], "overwritten": [2, 8, 9, 13, 34, 60], "own": [2, 5, 7, 8, 13, 16, 26, 27, 40, 50, 63, 66], "owner": [23, 38], "p": [3, 40], "p8": [53, 54, 55], "pa": [46, 53], "pacif": 64, "pack": [16, 50], "packag": [2, 3, 11, 16, 23, 32, 34, 50, 69], "package_modul": 2, "package_nam": [2, 3], "packet": 57, "page": [14, 16, 24, 26, 32, 38, 50, 52], "page_id": 14, "page_s": 14, "page_size_for_large_stream": 14, 
"page_size_kb": 14, "pagerduty_op": 44, "pagerduty_resourc": 44, "pagerduty_test": 44, "pagerdutyservic": 44, "pagin": [11, 14], "pair": [8, 11, 13, 14, 20, 23, 26, 27, 40, 67], "panda": [1, 28, 34, 36, 46, 53, 55, 60], "pandascolumn": 45, "pandera_schema_to_dagster_typ": 46, "panel": 14, "papermil": 60, "papertrail_logg": 47, "parallel": [16, 34, 40, 50], "param": [3, 11, 16, 48, 50], "paramet": [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 26, 27, 28, 32, 33, 34, 37, 39, 40, 42, 45, 46, 47, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "parameter": [2, 8, 9, 13, 26, 67], "parametr": 8, "paramiko": 57, "paramstyl": 53, "pardot": 14, "pardot_business_unit_id": 14, "pardotsourc": 14, "parent": [2, 12, 16, 17, 21, 26, 34, 41, 63], "parent_run_id": [8, 11, 41], "parquet": [4, 14, 53], "parquetcolumnarstorag": 14, "pars": [4, 7, 8, 10, 11, 14, 26, 62, 68, 69], "part": [2, 4, 6, 14, 38, 42, 63, 66, 67], "parti": 11, "partial": [4, 16, 17, 34], "partially_specified_config": 4, "particular": [1, 2, 8, 11, 16, 27, 40, 50, 64, 66, 67], "partit": [2, 3, 5, 8, 9, 11, 12, 13, 14, 16, 26, 50, 60, 63, 65], "partition_dimension_nam": 64, "partition_fn": 64, "partition_kei": [2, 5, 8, 11, 12, 13, 64, 65, 67], "partition_key_rang": [8, 64], "partition_key_to_vars_fn": 26, "partition_map": [2, 8, 64], "partition_time_window": 8, "partition_x": 67, "partitiondefinit": 26, "partitiondimensiondefinit": 64, "partitioned_config": [8, 13], "partitionedconfig": [8, 9, 13, 64, 67], "partitionkeyrang": [8, 64], "partitionmap": [2, 12, 64], "partitions_def": [2, 8, 9, 11, 13, 26, 60, 64, 67], "partitions_def_nam": [11, 67], "partitions_subset": 64, "partitionsdefinit": [2, 8, 9, 11, 12, 13, 26, 60, 64], "partitionsfor": 14, "partitionshop_dbt_asset": 26, "partitionssubset": 64, "partner": 14, "partner_id": 14, "pass": [1, 2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 34, 39, 40, 41, 42, 45, 50, 51, 52, 53, 60, 61, 62, 
63, 64, 65, 66, 67, 68], "password": [11, 12, 14, 16, 19, 20, 27, 32, 34, 40, 43, 48, 53, 54, 55, 57, 69], "passwordauthent": 14, "past": [11, 14, 23, 52, 64, 67], "patcredenti": 14, "path": [2, 3, 6, 11, 12, 14, 15, 16, 17, 18, 20, 23, 26, 28, 29, 30, 31, 32, 34, 40, 50, 51, 53, 54, 55, 60, 63, 67, 69], "path_desc": [11, 68], "path_pattern": 14, "path_prefix": [3, 12, 14], "pathlib": [12, 26], "pathlik": 63, "pathmetadatavalu": 63, "pattern": [11, 14, 69], "paus": [14, 16, 50], "pawel": 18, "pawelzni": 18, "payload": [11, 42, 44], "paypal": 14, "paypaltransactionsourc": 14, "paystack": 14, "paystacksourc": 14, "pb": 22, "pd": [2, 28, 29, 34, 35, 36, 53, 54, 55, 60], "pdb": [8, 69], "peer": 6, "pem": 16, "pend": [14, 16, 18, 50], "pendingnodeinvoc": 10, "pendulum": 64, "peopl": 50, "per": [1, 2, 5, 8, 11, 12, 13, 14, 16, 22, 23, 27, 34, 40, 50, 67], "percentag": 23, "perform": [2, 3, 7, 11, 14, 16, 23, 38, 50, 53, 62, 63, 64, 65], "period": [11, 14, 16, 50, 64, 67], "period_in_dai": 14, "periodicgc": [16, 50], "perman": 14, "permiss": [4, 7, 11, 14, 16, 17, 18, 19, 20, 22, 23, 27, 34, 38, 40, 41, 48, 50, 53, 68], "permissiveconfig": 4, "permit": [8, 11, 14, 64], "persist": [2, 8, 11, 14, 15, 16, 17, 23, 34, 40, 50, 63, 67], "persistiq": 14, "persistiqsourc": 14, "person": [14, 25, 38], "personal_access_token": 14, "pg": 20, "phase": 14, "photo": 24, "phrase": 14, "pick": [14, 18, 19, 20, 34], "pickl": [4, 12, 16, 17, 34], "pid": 8, "piec": [11, 16, 23, 46, 50], "pig": 34, "pigjob": 34, "pinterest": 14, "pinterestsourc": 14, "pip": 44, "pipe": [51, 64], "pipedr": 14, "pipedrivesourc": 14, "pipelin": [3, 7, 8, 11, 16, 23, 32, 39, 53, 59, 66, 69], "pipeline_def": 61, "pipeline_run": 69, "pipelineconfigurationinvalid": 39, "pipelinenotfounderror": 39, "pipelinerun": 66, "pivot": 14, "pivotaltrackersourc": 14, "pkg_resourc": 69, "pkg_resource_def": 69, "pl": 30, "place": [2, 11, 12, 14, 16, 23, 26, 40, 45, 50], "placehold": 4, "placement": 34, "plai": [14, 67], "plaid": 
14, "plaid_env": 14, "plaidsourc": 14, "plain": [52, 69], "plaintext": 14, "plan": [11, 14, 16, 20], "plan_context": 11, "plan_data": [11, 68], "plane": [16, 50], "planorchestrationcontext": 11, "platform": [14, 26, 34, 42], "pleas": [11, 14, 16, 50], "plu": [64, 67], "plug": 11, "pluggabl": [4, 11], "plugin": [11, 14, 35, 36, 54, 55], "plus_minu": 63, "pm": 67, "pod": [18, 20, 39, 40], "pod_spec_config": [20, 40], "pod_template_spec_metadata": [20, 40], "podspec": 40, "point": [2, 3, 11, 12, 14, 16, 18, 19, 20, 34, 35, 36, 50], "pointer": 8, "pokeapi": 14, "pokeapisourc": 14, "pokemon": 14, "pokemon_nam": 14, "polici": [1, 2, 8, 9, 11, 13, 14, 20, 23, 26, 40, 52, 60, 63, 67], "policy_id": 23, "poll": [14, 16, 21, 23, 26, 33, 50, 59], "poll_interv": [14, 21, 26, 33], "poll_interval_sec": 23, "poll_interval_second": 23, "poll_timeout": [14, 21, 26, 33], "polling_tim": 14, "polling_timeout": 11, "pollinginterv": [16, 50], "polygon": 14, "pool": [3, 14, 16, 23, 50], "poor": [16, 50], "popen": [26, 51], "popul": [8, 14, 59, 63], "popular": 14, "port": [3, 11, 14, 16, 23, 32, 39, 40, 43, 47, 48, 50, 57, 69], "port1": 14, "port2": 14, "port_numb": 39, "portion": 14, "posit": [8, 14, 16, 50, 63, 64, 67, 69], "possibl": [4, 11, 14, 16, 20, 23, 40, 50, 64], "post": [4, 14, 18, 42, 44, 52], "post_messag": 42, "postgr": [11, 14, 20, 32, 40], "postgres_airflow_db": 15, "postgres_connection_str": 32, "postgres_db": [11, 48], "postgres_password": 32, "postgres_password_secret": [20, 40], "postgres_url": 48, "postgresdestin": 14, "postgreseventlogstorag": [11, 48], "postgresql": [15, 20, 32, 40], "postgresrunstorag": [11, 48], "postgresschedulestorag": [11, 48], "postgressourc": 14, "posthog": 14, "posthogsourc": 14, "postmessag": 52, "potenti": [8, 11, 14, 16, 50], "power": 11, "pq": 53, "pre": [4, 14, 16, 26, 32, 50], "preambl": 7, "preced": [9, 12, 16, 17, 34, 50], "predefin": [3, 63], "predict": [8, 13], "preemptibl": 34, "prefer": [8, 9, 14, 24, 63, 67], "preferdirectbuf": 
[16, 50], "prefix": [2, 3, 11, 12, 14, 16, 17, 18, 21, 23, 25, 26, 33, 34, 35, 36, 38, 53, 60, 66], "pregel": [16, 50], "preload": 11, "preparedata": 23, "prepend": [2, 16, 50], "presenc": [2, 4, 8, 45, 63], "present": [2, 4, 8, 9, 11, 14, 16, 20, 23, 26, 33, 34, 38, 40, 44, 50, 52, 62, 63, 65], "preserv": [11, 16, 50, 67], "pressur": [16, 50], "prestashop": 14, "prestashopsourc": 14, "pretti": 38, "prev_sync_tim": 14, "prevent": [16, 50], "preview": [3, 34], "previou": [7, 8, 11, 12, 14, 16, 17, 34, 50, 62, 63, 67], "previous_minutes_overdu": 67, "price": [14, 23], "primari": [14, 32], "primarili": [16, 39, 50, 67], "primary_kei": [14, 32], "primetr": 14, "primetricsourc": 14, "primit": [4, 7, 8, 9, 13], "princip": [23, 34], "print": [3, 4, 16, 25, 26, 38, 40, 60, 63, 66], "printgcdetail": 23, "prior": [14, 67], "prioriti": 8, "priv": 40, "privat": [14, 23, 34, 38, 40, 53, 54, 55], "private_kei": [14, 53, 54, 55], "private_key_password": [14, 53, 54, 55], "private_key_path": [53, 54, 55], "private_token": 14, "privateapp": 14, "privatekei": 14, "privileg": 14, "proactiv": [16, 50], "problem": 26, "proce": 7, "process": [2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 16, 20, 23, 26, 29, 30, 31, 35, 36, 37, 40, 50, 53, 54, 55, 59, 67], "process_directori": 6, "process_fil": 6, "prod_slack_cli": 2, "produc": [1, 2, 6, 7, 8, 9, 11, 12, 14, 26, 33, 63, 66, 67, 68, 69], "producer_nam": 14, "producer_sync": 14, "product": [14, 18, 53, 59, 64, 65], "product_catalog": 14, "profil": [14, 16, 23, 26, 50], "profile_nam": 16, "profiles_dir": 26, "program": [16, 34, 50], "programat": [8, 9, 14, 21, 26, 33, 38, 63], "programmat": [45, 68], "progress": [11, 14, 16, 21, 26, 33, 40, 50, 67], "project": [2, 14, 34, 35, 36, 59], "project_and_instance_metadata": 34, "project_dir": [14, 26], "project_id": [14, 26, 34], "project_kei": 14, "project_nam": 26, "project_timezon": 14, "projectid": 34, "projectsecret": 14, "prometheus_cli": 49, "prometheus_resourc": 49, "prometheuscli": 49, 
"prometheusresourc": 49, "promot": 2, "propag": 12, "proper": [16, 50], "properli": [16, 50, 53], "properti": [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 23, 26, 34, 45, 46, 50, 60, 61, 63, 64, 65, 66, 67, 68, 69], "property_id": 14, "protect": [16, 50], "protocol": [14, 24, 42, 69], "proven": 8, "provid": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 47, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 69], "provis": [23, 34, 38, 44], "proxi": [16, 42, 50], "psycopg2": 15, "ptat": [16, 50], "public": [14, 16, 19, 23, 24, 27, 28, 29, 30, 31, 34, 35, 36, 50, 53, 54, 55], "public_kei": 14, "publish": [14, 24], "publisher_sync": 14, "pubsub": 14, "pubsubdestin": 14, "pull": [14, 16, 20, 27, 38, 40, 67], "pulsar": 14, "pulsardestin": 14, "purchas": 14, "purg": 59, "purge_staging_data": 14, "purpos": [2, 8, 10, 11, 14, 16, 50, 61, 66, 67], "push": [25, 40, 49], "push_to_gatewai": 49, "pushgatewai": 49, "put": [16, 50, 52, 63], "putobjectacl": 23, "py": [3, 15, 16, 27, 34, 50, 59], "py310": 40, "pyamqp": [18, 19, 20], "pyarrow": 53, "pydant": 4, "pyfil": [16, 50], "pyformat": 53, "pyproject": 3, "pyspark": [16, 23, 34, 35], "pyspark_resourc": 50, "pysparkjob": 34, "pysparkresourc": 50, "pytest": 40, "python": [2, 3, 5, 7, 8, 9, 11, 13, 14, 15, 16, 18, 23, 24, 25, 26, 27, 34, 45, 50, 51, 53, 58, 59, 60, 61, 63, 65, 68, 69], "python_artifact": 63, "python_fil": 3, "python_logging_levels_nam": 11, "python_modul": [18, 40], "python_param": 23, "python_typ": [2, 5, 65, 68], "python_valu": 4, "pythonartifactmetadatavalu": 63, "pythonerror": 39, "pythonfileuri": 34, "pythonhttptutorialsourc": 14, "pythonobjectdagstertyp": [63, 68], "pythonpath": [16, 50], "q": 18, "qmark": 53, "qualaroo": 14, "qualaroosourc": 14, "qualifi": [16, 50], "qualiti": [14, 63], "quantil": [16, 50], "queri": [2, 3, 8, 9, 11, 13, 14, 16, 24, 32, 34, 35, 36, 39, 53], 
"query1": 34, "query2": 34, "query3": 34, "query4": 34, "query_id": 53, "query_param": 14, "query_path": 14, "queryabl": 67, "queryfileuri": 34, "querylist": 34, "queu": 14, "queue": [11, 14, 16, 18, 20, 50, 59], "queue_url": 14, "queuedruncoordin": 11, "quick": 14, "quickbook": 14, "quickbookssingersourc": 14, "quickli": 3, "quickstart": [25, 38], "quit": [16, 40, 50], "quote_char": 14, "quux": [11, 33], "r": [3, 11, 16, 50], "r1": 34, "r2": [14, 34], "r2destin": 14, "rabbitmq": [14, 18, 19, 20, 40], "rabbitmqdestin": 14, "rack": [16, 50], "rais": [2, 4, 7, 8, 9, 11, 12, 13, 14, 21, 23, 26, 33, 39, 40, 42, 51, 52, 53, 60, 63, 64, 66, 67, 69], "raise_on_error": [7, 8, 9, 13, 26], "random": [14, 63], "randomli": [40, 63], "rang": [8, 12, 14, 16, 50, 64, 65, 69], "rapidli": [16, 50], "rapidoc": 14, "rasset_key_prefix": 26, "rate": [2, 14, 16, 50], "rather": [2, 5, 16, 40, 46, 50, 53, 63, 66, 68], "ratio": [16, 50], "raw": [5, 11, 14, 16, 20, 26, 40, 50, 53, 54, 55], "raw_conn": 53, "raw_data": 26, "raw_ev": 26, "raw_output": 26, "rawmetadatavalu": [1, 2, 8, 9, 12, 13, 63, 65], "rb": 11, "rbac": 40, "rbackend": [16, 50], "rdd": [16, 50], "re": [8, 11, 12, 14, 16, 23, 24, 26, 27, 50, 52], "reach": [11, 14, 16, 18, 50], "reachabl": [14, 33], "react": 67, "read": [2, 3, 4, 11, 14, 16, 23, 28, 29, 30, 31, 34, 35, 36, 38, 50, 51, 53, 54, 55, 59], "read_al": 14, "read_csv": 12, "read_data": 11, "read_fil": 11, "read_materi": 67, "read_text": 26, "read_timeout_sec": 25, "read_writ": 34, "readabl": [2, 4, 8, 9, 12, 16, 47, 51, 61, 63, 65, 66, 67, 68], "reader_opt": 14, "readi": 68, "readm": 14, "readonli": 34, "readrc": 69, "readthedoc": [18, 27], "real": 18, "realm": [14, 34], "realm_id": 14, "reaper": [16, 50], "reason": [7, 39, 67], "rebuild": 3, "receipt": 69, "receiv": [3, 7, 11, 14, 16, 45, 50, 51, 68], "receive_buffer_byt": 14, "receive_processed_config_valu": 4, "recent": [2, 8, 14, 67], "recharg": 14, "rechargesourc": 14, "reclaim": [16, 50], "recommend": [2, 4, 11, 
14, 15, 16, 23, 26, 33, 50, 59, 64, 68], "recommendedinternalstag": 14, "recon_job": 11, "reconcil": 14, "reconcili": 2, "reconnect": 14, "reconstruct": [16, 50], "reconstruct_context": 8, "reconstruct_job": 13, "reconstructable_arg": 13, "reconstructable_bar_job": [8, 13], "reconstructable_foo_job": [8, 13], "reconstructable_kwarg": 13, "reconstructablejob": [8, 13], "reconstructor_function_nam": 13, "reconstructor_module_nam": 13, "reconstructor_working_directori": 13, "record": [2, 8, 11, 14, 16, 21, 50, 63, 67], "records_per_slic": 14, "records_per_sync": 14, "recov": [16, 50], "recoveri": [8, 13, 14, 16, 50, 67], "recoverymod": [16, 50], "recurli": 14, "recurlysourc": 14, "recurs": [2, 4, 7], "recycl": 3, "redact": [16, 50], "redi": [14, 18, 19, 20], "redirect": 14, "redirect_uri": 14, "redisdestin": 14, "redshift": 14, "redshift_configur": 16, "redshift_resourc": 16, "redshiftclientresourc": 16, "redshiftdestin": 14, "redshiftsourc": 14, "reduc": [14, 16, 26, 50], "reducebykei": [16, 50], "redund": [16, 50], "reexecut": 8, "reexecution_opt": 8, "reexecutionopt": 8, "ref": [3, 11, 23, 26], "refabl": 26, "refer": [2, 11, 14, 16, 17, 20, 23, 24, 26, 28, 29, 30, 31, 34, 35, 36, 37, 40, 44, 50, 52, 53, 54, 55, 56, 63], "referenc": [16, 63], "referencetrack": [16, 50], "reflect": 14, "refresh": [14, 26, 32, 39], "refresh_token": 14, "regardless": [16, 50], "regener": 14, "regex": [16, 50], "region": [14, 16, 23, 34, 40, 50], "region_nam": [14, 16], "regist": [14, 16, 50, 59], "registr": [16, 50], "registrationrequir": [16, 50], "registri": [19, 25, 27, 40, 59], "regress": [16, 50], "regular": [2, 11, 12, 14, 68], "reindex": 3, "rel": [6, 23, 64, 69], "relat": [2, 8, 14, 16, 23, 26, 28, 29, 30, 31, 32, 34, 35, 36, 50, 53, 54, 55, 60, 63], "relationship": 34, "relative_path": 69, "relaunch": [16, 50], "releas": [2, 14, 26, 34, 63, 66], "relev": [3, 7, 8, 14, 16, 26, 37, 38, 50, 52, 67], "reli": [14, 65], "reliabl": 23, "reload": [14, 39], "reload_repository_loc": 39, 
"reloadnotsupport": 39, "reloadrepositorylocationinfo": 39, "reloadrepositorylocationstatu": 39, "remain": [8, 9, 64], "remaind": 23, "rememb": [14, 16, 50], "remot": [3, 11, 15, 16, 23, 34, 39, 40, 50, 57], "remote_host": 57, "remote_port": 57, "remov": [2, 9, 13, 16, 23, 26, 42, 50, 52, 59, 63, 64, 67, 69], "renam": [16, 17, 34, 59], "render": [14, 16, 50], "render_field": 14, "renew": 14, "repeat": [4, 11, 14], "repeat_word": 4, "repeated_cal": 14, "repeatedli": 14, "repl": [8, 13], "replac": [8, 13, 14, 16, 50, 53, 67], "replai": [16, 50], "replenish": [16, 50], "repli": 69, "replic": [14, 16, 50], "replica": [14, 16, 50], "replica_set": 14, "replicaset": 14, "replication_end_d": 14, "replication_method": 14, "replication_slot": 14, "replication_start_d": 14, "repo": [14, 16, 40, 50, 59], "repo_location_nam": 20, "repo_nam": 38, "repo_own": 38, "report": [11, 14, 26, 34], "report_generation_max_retri": 14, "report_granular": 14, "report_opt": 14, "report_wait_timeout": 14, "reports_start_d": 14, "repositori": [3, 5, 14, 15, 16, 20, 26, 27, 28, 29, 30, 31, 33, 38, 39, 40, 42, 50, 52, 59, 67, 69], "repository_data": 65, "repository_def": 67, "repository_load_data": 65, "repository_location_nam": 39, "repository_nam": [15, 39, 67], "repository_vers": 65, "repositorydata": 65, "repositorydefinit": [9, 15, 65, 67], "repositorylocationloadfailur": 39, "repositorylocationnotfound": 39, "repositoryselector": [42, 52, 67, 69], "repostitori": 15, "repostitory_location_nam": 15, "repres": [1, 2, 6, 8, 9, 11, 12, 14, 16, 26, 33, 50, 59, 62, 63, 64, 66, 67], "represent": [2, 4, 8, 11, 16, 17, 26, 34, 46, 63, 64, 68], "request": [3, 8, 12, 13, 14, 16, 21, 23, 26, 33, 34, 38, 39, 42, 50, 53, 63], "request_additional_param": 14, "request_asset": 67, "request_job": 67, "request_max_retri": [14, 21, 26, 33], "request_retry_delai": [14, 21, 26, 33], "request_timeout": 14, "request_timeout_m": 14, "requests_per_minut": 14, "requir": [1, 2, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
17, 18, 19, 20, 21, 23, 26, 33, 34, 40, 45, 50, 51, 53, 54, 55, 59, 60, 63, 64, 66, 67, 68], "require_update_for_all_parent_partit": 2, "required_but_nonexistent_partition_kei": 64, "required_multi_asset_neighbor": 2, "required_resource_kei": [1, 2, 7, 10, 11, 12, 16, 17, 23, 24, 26, 41, 42, 45, 50, 51, 52, 53, 56, 60, 63, 66, 67, 68], "rerais": 7, "resend": 14, "reserv": [2, 12, 60], "reset": [14, 16, 50], "resid": [3, 11, 16, 23, 50, 67], "resolut": [16, 50], "resolv": [2, 3, 4, 5, 7, 14, 16, 50, 66], "resolve_canonical_bootstrap_servers_onli": 14, "resolve_standoff": 4, "resolved_op_select": 11, "resourc": [1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 21, 24, 25, 28, 29, 30, 31, 35, 36, 38, 40, 41, 44, 45, 49, 50, 51, 52, 54, 55, 58, 60, 62, 63, 65, 67, 68, 69], "resource_config": [2, 12, 62, 65, 66], "resource_config_by_kei": [59, 66], "resource_def": [1, 2, 8, 9, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 26, 33, 34, 38, 41, 42, 44, 50, 52, 53, 59, 60, 62, 66, 67], "resource_fn": [7, 12, 66], "resource_funct": 12, "resource_init_failur": 8, "resource_keys_to_init": 60, "resource_nam": [7, 66], "resource_str": 69, "resource_to_init": 66, "resource_typ": 26, "resourceadd": 66, "resourcedefinit": [2, 4, 5, 7, 8, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 25, 26, 28, 33, 34, 38, 41, 42, 44, 49, 50, 52, 53, 56, 57, 58, 59, 60, 62, 65, 66, 67], "resourceparam": [16, 24], "resources_config": 8, "resourceversioncontext": 62, "respect": [2, 3, 5, 12, 18, 23, 40, 53, 63], "respond": [23, 51], "respons": [3, 8, 11, 12, 13, 14, 24, 26, 33, 53, 64], "rest": [14, 21, 23, 26, 27, 33, 34, 40], "restart": [3, 14, 16, 23, 34, 39, 50, 63], "restrict": [23, 34], "result": [1, 2, 3, 4, 6, 9, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 24, 26, 33, 34, 39, 50, 51, 53, 60, 61, 63, 64, 66, 67, 68, 69], "resum": [14, 59], "retain": [14, 16, 50], "retainedbatch": [16, 23, 50], "retaineddeadexecutor": [16, 50], "retaineddriv": [16, 50], "retainedexecut": [16, 50], "retainedexecutor": [16, 
50], "retainedjob": [16, 50], "retainedrootrdd": [16, 50], "retainedstag": [16, 50], "retainedtask": [16, 50], "retent": 14, "retentlysourc": 14, "rethrown": 7, "retri": [1, 2, 8, 9, 11, 13, 14, 16, 18, 19, 20, 21, 23, 26, 27, 33, 40, 50, 60, 63, 67], "retriev": [8, 11, 12, 14, 20, 23, 26, 33, 34, 35, 36, 38, 40, 51, 53, 54, 55, 65], "retry_backoff_m": 14, "retry_max_tim": 25, "retry_method": 25, "retry_mod": 11, "retry_numb": 8, "retry_polici": [1, 2, 8, 9, 60, 63], "retry_status_cod": 25, "retrymod": 11, "retrypolici": [1, 2, 8, 9, 13, 60, 63], "retryrequest": [60, 63], "retrywait": [16, 50], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39, 42, 45, 46, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "return_cod": 26, "return_n": 65, "return_n_": 65, "return_on": [8, 9, 13], "reus": [16, 34, 50, 59], "reusabl": 11, "revers": [16, 50], "reverseproxi": [16, 50], "reverseproxyurl": [16, 50], "reviv": [16, 50], "rewritten": [16, 50], "rfc": [14, 34], "rfc1035": 34, "rfc3339": 14, "rich": 18, "right": 14, "rigidli": [16, 50], "rki": 14, "rkicovidsourc": 14, "rm": [16, 50], "rockset": 14, "rocksetdestin": 14, "role": [4, 14, 23, 32, 34, 53, 54, 55], "role_arn": 14, "roll": [16, 50], "root": [2, 11, 14, 16, 23, 34, 40, 50], "root_run_id": 11, "rootlogg": 34, "rootprincipalpassworduri": 34, "rouberol": 18, "rout": [14, 44], "routing_kei": [14, 44], "row": [1, 14, 45, 46, 63], "row_batch_s": 14, "rowcountconstraint": 45, "rpc": [14, 16, 18, 19, 20, 50], "rsa": 38, "rsa_kei": [53, 54, 55], "rule": [2, 4, 7, 40, 45, 68], "rules_to_add": 2, "rules_to_remov": 2, "run": [1, 2, 4, 7, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23, 24, 26, 27, 34, 35, 36, 37, 38, 39, 41, 42, 43, 48, 50, 51, 52, 59, 60, 61, 62, 63, 64, 66, 69], "run_before_shell_op": 51, "run_config": [4, 8, 9, 11, 13, 15, 16, 23, 24, 39, 41, 44, 52, 53, 60, 65, 67], "run_config_fn": 67, "run_config_for_partition_fn": 64, 
"run_config_for_partition_key_fn": 64, "run_coordin": 11, "run_coordinator_data": 11, "run_dat": 26, "run_dbt_nightly_sync": 26, "run_ecs_tag": 16, "run_failure_sensor": 67, "run_fn": 24, "run_id": [7, 8, 9, 10, 11, 12, 13, 39, 59, 60, 61, 63, 66, 69], "run_k8s_config": [20, 40], "run_kei": [8, 13, 65, 67], "run_launch": [11, 20, 40], "run_launch_ag": 59, "run_launch_agent_exampl": 59, "run_launch_job": 59, "run_launch_job_exampl": 59, "run_launcher_data": 11, "run_look_id": 14, "run_nam": [23, 59], "run_now": 23, "run_now_op": 23, "run_request": [64, 67], "run_request_for_partit": [8, 13], "run_resourc": 16, "run_result": 26, "run_results_json": 26, "run_results_path": 26, "run_resultsjson": 26, "run_statu": [42, 67], "run_status_sensor": 67, "run_status_sensor_fn": 67, "run_status_sensor_to_invok": 67, "run_storag": [11, 43, 48], "run_storage_data": 11, "run_tag": 59, "run_task": 16, "run_task_kwarg": 16, "run_updated_aft": 11, "runawai": [16, 50], "runconfig": [4, 13, 26], "runconfigdata": 39, "runconflict": 39, "runcoordin": 11, "runfailuresensorcontext": [42, 52, 67, 69], "runlaunch": [11, 16, 20, 27, 40], "runnabl": 69, "runner": [14, 40], "runnow": 23, "runrecord": 11, "runrequest": [8, 13, 64, 65, 67], "runs_client": 23, "runsapi": 23, "runsfilt": 11, "runshardedeventscursor": 11, "runstatussensorcontext": 67, "runstatussensordefinit": 67, "runstorag": 11, "runtim": [2, 4, 6, 7, 8, 9, 13, 16, 23, 26, 34, 45, 50, 61, 64, 68], "runtime_metadata_fn": 26, "s": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 20, 23, 26, 34, 37, 38, 39, 40, 41, 46, 50, 52, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68], "s3": [11, 14, 23, 41, 50, 68], "s3_access_key_id": 14, "s3_bucket": [14, 16, 68], "s3_bucket_nam": 14, "s3_bucket_path": 14, "s3_bucket_region": 14, "s3_endpoint": 14, "s3_file": 16, "s3_file_manag": 16, "s3_job_package_path": 16, "s3_kei": [16, 68], "s3_path": 68, "s3_path_format": 14, "s3_pickle_io_manag": 16, "s3_pipeline_package_path": 16, "s3_prefix": 16, 
"s3_region": 14, "s3_resourc": 16, "s3_secret_access_kei": 14, "s3amazonwebservic": 14, "s3computelogmanag": [11, 16], "s3coordin": 16, "s3destin": 14, "s3filehandl": [16, 68], "s3filemanagerresourc": 16, "s3pickleiomanag": 16, "s3resourc": [4, 16], "s3sourc": 14, "s3stage": 14, "sa": 17, "safe": [16, 50, 52, 66], "safe_mod": 15, "safeguard": 2, "safeti": [16, 50], "salesforc": 14, "salesforcesourc": 14, "salesloft": 14, "salesloftsourc": 14, "same": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 16, 18, 23, 26, 27, 33, 40, 50, 59, 63, 64, 66, 67, 68], "same_as_sourc": 14, "sampl": [6, 14, 34], "sandbox": 14, "sandboxaccesstoken": 14, "sanit": [14, 33, 63], "sas_token": 14, "sasl_jaas_config": 14, "sasl_mechan": 14, "saslplaintext": 14, "saslssl": 14, "satisfi": [2, 4, 8, 11, 66], "satur": [16, 50], "saturdai": [64, 67], "save": [16, 37, 50, 59], "save_notebook_on_failur": 60, "saveashadoopfil": [16, 50], "sc": [16, 50], "scaffold": [3, 14], "scaffold_config": 3, "scaffold_java_jdbc": 14, "scaffolddestinationpythondestin": 14, "scaffoldjavajdbcsourc": 14, "scaffoldsourcehttpsourc": 14, "scaffoldsourcepythonsourc": 14, "scala": [16, 50], "scala2": 23, "scalabl": 14, "scalar": [4, 8], "scalar_typ": 4, "scalarunion": 4, "scale": [16, 23, 50], "scan": [16, 50], "scenario": [14, 16, 50], "schedul": [2, 5, 8, 13, 15, 16, 20, 22, 26, 33, 34, 40, 42, 48, 50, 52, 65], "schedule_def": 65, "schedule_nam": [65, 67], "schedule_storag": [11, 43, 48], "schedule_storage_data": 11, "schedule_typ": 64, "scheduled_execution_tim": 67, "scheduledefinit": [5, 15, 26, 65, 67], "scheduleevaluationcontext": 67, "scheduler_data": 11, "scheduler_nam": [20, 40], "schedulerbacklogtimeout": [16, 50], "schedulestorag": 11, "scheduletyp": 64, "schema": [1, 2, 4, 7, 9, 11, 12, 13, 14, 16, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 39, 45, 46, 47, 53, 54, 55, 60, 61, 63, 66, 68], "schema1": 33, "schema2": 33, "schema_by_table_nam": 14, "schema_nam": 33, "schema_registry_config": 25, 
"schema_registry_password": 14, "schema_registry_url": [14, 25], "schema_registry_usernam": 14, "schemamodel": 46, "scheme": [2, 14, 16, 48, 49, 50], "scope": [8, 13, 16, 23, 27, 34, 38, 40, 47, 50, 61, 66], "scoped_resources_build": 8, "scpsecurecopyprotocol": 14, "scratch": [16, 50], "script": [14, 16, 23, 34, 50, 51], "scriptvari": 34, "scroll": 14, "scylla": 14, "scylladestin": 14, "sda": 15, "sdk": 23, "seacrh": 14, "search": [14, 16, 26, 50], "searchmetr": 14, "searchmetricssourc": 14, "second": [11, 14, 16, 17, 20, 21, 23, 26, 33, 34, 39, 40, 42, 50, 51, 52, 53, 57, 63, 64, 67], "second_asset": 8, "second_op": [27, 40], "secondaryworkerconfig": 34, "seconds_to_wait": 63, "secret": [2, 4, 8, 9, 11, 13, 14, 16, 17, 20, 23, 33, 40], "secret_access_kei": 14, "secret_bool_op": 4, "secret_int_op": 4, "secret_job": 4, "secret_kei": [14, 17], "secret_key_kei": 23, "secret_op": 4, "secret_scop": 23, "secretid": 16, "secrets_in_environ": 16, "secrets_load": 11, "secrets_loader_data": 11, "secrets_tag": 16, "secrets_to_env_vari": 23, "secretsmanager_resourc": 16, "secretsmanager_secrets_resourc": 16, "secretsmanagerresourc": 16, "secretsmanagersecretsresourc": 16, "section": [14, 16, 20, 40, 50], "secur": [3, 14, 20, 23, 34, 40], "securili": 38, "security_context": [20, 40], "security_protocol": 14, "securityconfig": 34, "see": [11, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 33, 34, 37, 38, 40, 42, 46, 50, 52, 53, 54, 55, 56, 58, 59, 62], "seed": [14, 26], "seek": [11, 16, 50], "seem": 38, "segment": 2, "seldom": 14, "select": [2, 3, 4, 8, 9, 13, 14, 16, 26, 28, 34, 51, 52, 53, 64, 67], "select_properties_by_default": 14, "selectanotheropt": 14, "selected_asset": 2, "selected_asset_check_kei": [2, 8], "selected_asset_kei": [2, 8], "selected_output_nam": [8, 26], "selected_unique_id": 26, "selector": [4, 7, 17, 18, 19, 20, 22, 23, 27, 40], "self": [12, 14, 16, 25, 34, 38, 50, 65, 66, 68], "self_dependent_asset": 8, "seller": 14, "semicolon": 34, "send": [3, 11, 14, 
16, 18, 24, 42, 49, 50, 52, 57, 69], "send_buffer_byt": 14, "send_messag": 10, "send_timeout_m": 14, "sender": 69, "sendgrid": 14, "sendgridsourc": 14, "sendoffsetstotransact": 14, "sens": [18, 19, 20], "sensit": [4, 11, 14, 16, 34, 50], "sensor": [2, 5, 8, 11, 13, 52, 64, 65, 69], "sensor_def": 65, "sensor_nam": [65, 67], "sensordefinit": [5, 65, 67], "sensorevaluationcontext": 67, "sensorresult": [64, 67], "sent": [3, 14, 16, 23, 42, 50, 52, 67, 69], "sentri": 14, "sentrysourc": 14, "separ": [2, 5, 6, 8, 11, 12, 14, 16, 34, 40, 48, 50, 64, 65], "sequenc": [1, 2, 5, 8, 9, 11, 12, 13, 14, 27, 28, 29, 30, 31, 34, 35, 36, 53, 54, 55, 60, 63, 64, 65, 67], "sequenti": [23, 67], "serd": [11, 34], "seri": [2, 11, 18, 53], "serial": [3, 5, 11, 14, 16, 17, 34, 50, 59, 67], "serializ": [1, 2, 6, 11, 13, 16, 17, 34, 50, 63], "serializable_error_info_from_exc_info": 11, "serializableerrorinfo": 11, "serializationmodul": 59, "serv": [3, 16, 26, 39, 50, 64], "server": [3, 5, 11, 14, 16, 23, 24, 25, 34, 39, 40, 41, 42, 50, 53, 59, 69], "server_address": 14, "server_telemetry_id": 25, "server_time_zon": 14, "serversideencrypt": 16, "servic": [3, 14, 16, 20, 23, 26, 34, 35, 36, 40, 44, 50, 59], "service_account_info": 14, "service_account_json": 14, "service_account_nam": [20, 40], "service_check": 24, "service_nam": 14, "serviceaccount": [14, 34], "serviceaccountkei": 14, "serviceaccountkeyauthent": 14, "serviceaccountscop": 34, "servicenam": 14, "servlet": [16, 50], "session": [14, 16, 50, 53], "session_token": 14, "set": [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "set_dagster_hom": 8, "set_trac": [8, 69], "setup": [14, 16, 33, 41, 50], "sever": [1, 4, 8, 14, 16, 23, 44], "sftp": 14, "sftpjsondestin": 14, "sftpsecurefiletransferprotocol": 14, "sftpsourc": 14, "sge": 22, "sh": [40, 51], "shape": [1, 4, 7], "shard": 
[11, 14], "shardcount": 14, "share": [14, 16, 17, 34, 38, 50, 64, 66, 67], "shared_kei": 14, "sheet": 14, "shell": [16, 26, 34, 35, 36, 50, 53, 54, 55], "shell_command": 51, "shell_command_op": 51, "shell_op": 51, "shell_script_path": 51, "shellopconfig": 51, "shift": 64, "shim": [45, 60, 68], "shop": 14, "shopifi": 14, "shopifysourc": 14, "short": [2, 14, 16, 34, 50, 59, 63], "shortio": 14, "shortiosourc": 14, "should": [2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23, 26, 33, 34, 39, 40, 43, 45, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "should_autocreate_t": 48, "should_execut": 67, "show": [3, 12, 16, 50, 69], "show_profil": [16, 50], "show_url_onli": 16, "showconsoleprogress": [16, 50], "shown": [14, 23, 34, 35, 36, 53], "shrink": [16, 50], "shuffl": [16, 23, 50], "shut": [3, 16, 18, 39, 50], "shutdown": [3, 16, 50], "shutdown_repository_loc": 39, "shutdownrepositorylocationinfo": 39, "sid": [14, 58], "side": [8, 13, 14, 16, 23, 50, 52, 53, 63], "sidecar": 16, "sign": 34, "signal": [3, 18], "signatur": [1, 2, 9, 63], "signific": [11, 16, 50], "significantli": [16, 50], "signinviagoogleoauth": 14, "signinviapipedriveoauth": 14, "signup": 2, "silenc": [16, 50], "silent": 11, "similar": 13, "simpl": [2, 6, 26, 32, 53, 65], "simple_job": 65, "simple_repositori": 65, "simpler": [16, 26, 50], "simpli": [2, 16, 26, 50], "simultan": [16, 50], "sinc": [2, 3, 8, 11, 13, 14, 16, 18, 23, 34, 50, 51, 67], "singer": 14, "singl": [2, 3, 4, 8, 9, 12, 13, 14, 16, 17, 20, 23, 26, 27, 34, 40, 50, 51, 52, 63, 64, 66, 67, 68], "single_run": 64, "singleschema": 14, "singlestoreaccesstoken": 14, "singular": 26, "sink": 2, "site": [7, 14, 16, 34, 50], "site_api_kei": 14, "site_id": 14, "site_url": 14, "situat": [16, 50], "size": [14, 16, 23, 26, 34, 50, 63], "sk_iri": 60, "sk_live": 14, "skip": [1, 2, 12, 14, 16, 50, 67, 69], "skip_empty_fil": 16, "skip_messag": 67, "skip_on_all_parents_not_upd": 2, "skip_on_not_all_parents_upd": 2, 
"skip_on_parent_miss": 2, "skip_on_parent_outd": 2, "skip_reason": 67, "skippabl": 2, "skipreason": 67, "sklearn": 60, "slack": [10, 11, 14, 26], "slack_client": [2, 26], "slack_fil": 2, "slack_files_t": 2, "slack_job": 52, "slack_message_on_failur": 10, "slack_message_on_success": 10, "slack_on_failur": 52, "slack_on_freshness_polici": 52, "slack_on_run_failur": 52, "slack_on_success": 52, "slack_op": 52, "slack_resourc": 52, "slack_sdk": 52, "slack_token": 52, "slackresourc": [26, 52], "slacksourc": 14, "slash": 14, "sleep": 68, "slice": [14, 34, 53], "slice_rang": 14, "slightli": 67, "sling_resourc": 32, "sling_resource_kei": 32, "slingdata": 32, "slingmod": 32, "slingresourc": 32, "slingsourceconnect": 32, "slingtargetconnect": 32, "slow": [16, 18, 19, 20, 40, 50], "slower": [16, 50], "slowli": [16, 50], "slug": 14, "slurm": 22, "small": [16, 50], "smaller": [14, 16, 50], "smartsheet": 14, "smartsheetssourc": 14, "smtp": 69, "smtp_host": 69, "smtp_port": 69, "smtp_type": 69, "snake_cas": 40, "snapchat": 14, "snapchatmarketingsourc": 14, "snappi": [14, 16, 50], "snappycompressioncodec": [16, 50], "snapshot": [11, 26], "snapshot_id": 11, "snapshot_isol": 14, "snippet": 34, "snowflak": [14, 32, 33], "snowflake_account": [53, 54, 55], "snowflake_connection_resourc": 53, "snowflake_databas": 53, "snowflake_io_manag": 53, "snowflake_op_for_queri": 53, "snowflake_pandas_io_manag": 54, "snowflake_password": [53, 55], "snowflake_pyspark_io_manag": 55, "snowflake_resourc": 53, "snowflake_schema": 53, "snowflake_us": 53, "snowflake_warehous": 53, "snowflakecomput": 14, "snowflakeconnect": 53, "snowflakedestin": 14, "snowflakeiomanag": [53, 54, 55], "snowflakepandasiomanag": 54, "snowflakepandastypehandl": [53, 54, 55], "snowflakepysparkiomanag": 55, "snowflakepysparktypehandl": [53, 54, 55], "snowflakeresourc": 53, "snowflakesourc": 14, "so": [2, 3, 4, 8, 9, 11, 12, 13, 14, 16, 17, 23, 24, 26, 34, 50, 63, 64, 67, 68], "so_rcvbuf": 14, "so_sndbuf": 14, "socket": [3, 14, 
16, 50], "socket_connection_setup_timeout_m": 14, "socket_connection_setup_timeout_max_m": 14, "softwar": [1, 26, 34, 64], "softwareconfig": 34, "solid": [11, 12, 15, 18, 26, 34, 39, 42, 69], "some": [1, 2, 4, 11, 12, 14, 16, 17, 18, 20, 21, 23, 26, 39, 40, 50, 67, 69], "some_asset": 67, "some_celery_backend_url": 20, "some_celery_broker_url": 20, "some_config": 4, "some_config1": 4, "some_config2": 4, "some_directori": 65, "some_graph": 9, "some_job": 65, "some_kei": 33, "some_model_nam": 41, "some_modul": 3, "some_op": [8, 9, 13, 14, 33], "some_param": 41, "some_resourc": 5, "some_run_id": 39, "some_secret": 33, "some_sensor": 65, "some_validation_fn": 63, "someon": [3, 4], "someth": 69, "sometim": [14, 40], "somewher": 52, "sonnest": [64, 67], "soon": [14, 52], "soonest": [64, 67], "sort": [4, 11, 14, 16, 50], "sourc": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "source_asset": 2, "source_assets_by_kei": 2, "source_configur": 14, "source_connect": 32, "source_key_prefix": [2, 26], "source_nam": 26, "source_opt": 32, "source_stream": 32, "source_typ": 14, "sourceasset": [1, 2, 5, 8, 14, 60, 65], "sourceassetobservefunct": 2, "sourcehashversionstrategi": 62, "southern": 64, "space": [11, 14, 16, 23, 50], "span": 64, "spark": [1, 2, 16, 23, 26, 34, 50], "spark_conf": [23, 50, 56], "spark_config": [16, 50], "spark_daemon_java_opt": 23, "spark_env_var": 23, "spark_hom": 56, "spark_local_dir": [16, 23, 50], "spark_local_ip": [16, 50], "spark_resourc": 56, "spark_sess": 50, "spark_vers": 23, "spark_worker_memori": 23, "sparkconf": [16, 34, 50], "sparkcontext": [16, 50], "sparkjob": 34, "sparklisten": [16, 50], "sparkoperror": 56, "sparkr": [16, 50], "sparkr_driver_r": [16, 50], "sparksess": [34, 35, 36, 50], "sparksqljob": 34, "spars": [16, 50], "spawn": [3, 
20], "spec": [2, 23, 40], "special": [16, 50, 59], "specif": [2, 3, 4, 8, 9, 10, 11, 14, 16, 18, 21, 22, 23, 26, 27, 28, 29, 30, 31, 34, 35, 36, 40, 42, 48, 50, 52, 53, 54, 55, 59, 60, 64, 68], "specifi": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 42, 45, 50, 51, 52, 53, 54, 55, 60, 61, 63, 64, 66, 67, 68, 69], "specificpartitionspartitionmap": 64, "specul": [16, 23, 50], "speed": [14, 16, 50, 59], "spill": [16, 50], "spin": [5, 51, 65], "splendidrunstorag": 11, "split": [14, 33, 64, 67], "spot": 23, "spot_bid_price_perc": 23, "spread": [14, 16, 34, 50], "spreadsheet": 14, "spreadsheet_id": 14, "spun": 3, "sq": 14, "sql": [11, 14, 16, 26, 31, 34, 50, 53, 55], "sql_queri": [34, 53], "sqlalchemi": [3, 15, 53], "sqleventlogstorag": 11, "sqlinsert": 14, "sqlite": [11, 14], "sqlitedestin": 14, "sqliteeventlogstorag": 11, "sqliterunstorag": 11, "sqliteschedulestorag": 11, "sqlrunstorag": 11, "sqlschedulestorag": 11, "squar": 14, "squaresourc": 14, "src": [50, 53], "ss": 14, "ssd": 34, "sse": 23, "ssh": [14, 22, 23], "ssh_kei": 14, "ssh_port": 57, "ssh_public_kei": 23, "ssh_resourc": 57, "sshkeyauthent": 14, "sshsecureshel": 14, "ssl": [3, 14, 16, 34, 42, 69], "ssl_certif": 14, "ssl_method": 14, "ssl_mode": 14, "sslmode": 16, "ssz": 14, "st": 14, "stabil": [16, 50], "stabl": [11, 18, 19, 20, 27, 40], "stack": [7, 11, 67], "stackoverflowerror": [16, 50], "stage": [14, 16, 17, 34, 50], "staging_bucket": 16, "staging_prefix": [16, 23], "stale": 67, "stale_assets_onli": 67, "standalon": [14, 16, 50], "standalonemongodbinst": 14, "standard": [2, 8, 9, 11, 13, 14, 16, 34, 35, 36, 50, 61, 63], "standardinsert": 14, "start": [3, 5, 8, 14, 15, 16, 19, 20, 22, 25, 26, 40, 42, 45, 50, 51, 52, 59, 63, 64, 67, 69], "start_aft": [14, 33], "start_dat": [14, 26, 64, 67], "start_datetim": 14, "start_offset": [2, 8, 64], "start_tim": [11, 14], "starttl": 69, "stat": [16, 50], "state": [2, 7, 11, 14, 16, 17, 21, 23, 26, 34, 
39, 50, 52, 63, 67], "state_filt": 14, "statement": [3, 14], "static": [1, 2, 6, 11, 12, 63, 64, 65, 66, 69], "static_partitioned_config": 64, "staticmethod": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "staticpartitionmap": 64, "staticpartitionsdefinit": 64, "statu": [2, 3, 11, 14, 16, 34, 39, 40, 42, 50, 52, 63, 67, 69], "status": [3, 11], "stderr": [3, 11, 16, 17, 34, 51], "stderrfrom": 23, "stdin": 69, "stdout": [3, 11, 16, 17, 23, 26, 34, 51, 69], "step": [1, 7, 8, 10, 12, 16, 17, 18, 19, 20, 27, 34, 38, 39, 40, 41, 42, 50, 52, 62, 63, 67, 68, 69], "step_context": 12, "step_execution_context": [8, 10], "step_expectation_result": 8, "step_failur": 8, "step_handl": 8, "step_k8s_config": 40, "step_kei": [8, 10, 11, 12, 39, 67, 69], "step_keys_to_execut": 11, "step_kind_valu": 8, "step_output": 8, "step_restart": 8, "step_select": 8, "step_skip": 8, "step_start": 8, "step_success": 8, "step_up_for_retri": 8, "stepexecutioncontext": [11, 12], "stepkind": 8, "steplaunch": 11, "steprunref": 11, "still": [4, 5, 16, 34, 50], "stock": 14, "stock_tick": 14, "stocktickerapitutorialsourc": 14, "stop": [3, 11, 14, 16, 23, 26, 42, 46, 50, 52, 59, 64, 67, 69], "stopgap": 5, "stopgracefullyonshutdown": [16, 50], "storag": [2, 3, 7, 12, 14, 16, 17, 20, 23, 34, 39, 40, 41, 43, 48, 50, 59, 60, 67], "storage_access_kei": 14, "storage_account": [14, 17], "storage_account_key_kei": 23, "storage_account_nam": [14, 23], "storage_data": 11, "storage_endpoint_suffix": 14, "storage_id": 11, "storagefract": [16, 50], "storagelevel": [16, 50], "store": [2, 3, 8, 11, 12, 14, 16, 17, 23, 29, 30, 31, 34, 35, 36, 38, 40, 50, 51, 52, 53, 54, 55, 60, 63, 64, 67], "store_fil": 2, "store_files_in_t": 2, "store_hash": 14, "store_nam": 14, "store_timestamps_as_str": [53, 54, 55], "str": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 32, 33, 34, 37, 38, 39, 40, 42, 45, 47, 51, 52, 53, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "str_param": 68, 
"straightforward": 68, "strategi": [4, 8, 13, 14, 16, 50, 62], "strava": 14, "stravasourc": 14, "stream": [11, 14, 16, 23, 26, 32, 50, 51], "stream_config": 14, "stream_dupl": 14, "stream_nam": 14, "stream_raw_ev": 26, "stream_schema": 14, "streamingcontext": [16, 50], "streams_criteria": 14, "strict": [11, 16, 17, 18, 19, 20, 22, 23, 25, 27, 34, 40, 48], "strict_column_list": 45, "strictcolumnsconstraint": 45, "strictli": [16, 23, 50], "string": [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 21, 23, 26, 27, 32, 33, 34, 38, 39, 40, 41, 45, 46, 50, 51, 52, 53, 54, 55, 57, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69], "string_event_properties_kei": 14, "string_resourc": 66, "stringifi": 67, "stringio": 11, "stringsourc": [4, 7, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 42, 43, 44, 48, 49, 50, 53, 54, 55, 57, 58, 59], "stripe": 14, "stripesourc": 14, "structur": [3, 8, 9, 11, 12, 16, 25, 26, 38, 50, 61, 63, 66, 69], "structured_asset_kei": 63, "structured_asset_key_2": 63, "stub": 63, "stuff": 69, "style": 5, "sub": [2, 8, 11, 14, 16, 50], "sub0": 34, "subclass": [4, 7, 11, 12, 16, 25, 38, 50, 62, 64, 65, 66, 68, 69], "subdirectori": [14, 26], "subdomain": 14, "subject": [2, 11, 14, 64, 69], "subminor": 34, "submiss": 34, "submit": [16, 20, 23, 34, 39, 50, 56, 67], "submit_job_execut": 39, "submit_run": 23, "submit_run_op": 23, "submodul": 11, "subnet": 34, "subnetwork": 34, "subnetwork_uri": 34, "subnetworkuri": 34, "subprocess": [26, 51], "subscrib": 14, "subscribetoalltopicsmatchingspecifiedpattern": 14, "subscript": 14, "subsequ": [9, 12, 14, 16, 17, 23, 34, 40, 50, 53], "subset": [2, 7, 8, 11, 14, 21, 26, 33, 64], "substanti": [16, 50], "succe": [23, 40], "succeed": [10, 26, 63], "success": [1, 2, 7, 8, 10, 14, 16, 21, 26, 33, 39, 42, 45, 50, 52, 53, 63, 68], "success_hook": 10, "successfulli": [11, 14, 21, 26, 33], "suffix": [14, 16, 50], "suggest": [23, 26], "suit": 37, "suitabl": [8, 11, 16, 17, 34, 
51], "suite_nam": 37, "sum": 14, "summari": 44, "summarize_directori": 6, "sundai": [64, 67], "sunshin": 14, "super": [26, 41], "supervis": [16, 50], "suppli": [2, 4, 7, 8, 9, 13, 20, 26, 33, 40, 67, 69], "support": [2, 6, 8, 11, 12, 13, 14, 16, 17, 21, 23, 24, 25, 26, 34, 38, 44, 50, 51, 52, 61, 63, 64, 67], "suppress": [3, 26], "sure": [14, 16, 38, 40, 50, 59], "surfac": [3, 16], "surpris": 2, "survei": 14, "survey_group_id": 14, "survey_id": 14, "surveygroup": 14, "surveymonkei": 14, "surveymonkeysourc": 14, "sustainedschedulerbacklogtimeout": [16, 50], "svc": [24, 40], "svv": 40, "switch": [2, 8, 9, 13, 14], "sy": 7, "symbol": [3, 14, 16, 50], "sync": [14, 21, 32, 33], "sync_foobar": [14, 21, 33], "sync_id": 21, "sync_lag_minut": 14, "sync_produc": 14, "sync_run": 21, "synchron": [7, 8, 11, 14, 39], "syntax": [8, 14, 26, 68], "synthes": 62, "system": [5, 7, 8, 11, 13, 14, 16, 17, 18, 23, 32, 39, 50, 51, 63, 67], "systemidsid": 14, "t": [1, 2, 3, 8, 9, 12, 13, 14, 16, 18, 26, 27, 40, 50, 52, 53, 59, 67], "t_partitionsdefinit": 64, "tab": [14, 33, 44, 69], "tabl": [2, 14, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 46, 53, 54, 55], "table1": 33, "table2": 33, "table_nam": [14, 26, 33], "table_object": 32, "table_schema": 63, "tablecolumn": 63, "tablecolumnconstraint": 63, "tableconstraint": 63, "tablemetadatavalu": 63, "tablerecord": 63, "tableschema": 63, "tableschemametadatavalu": 63, "tabular": 63, "tag": [1, 2, 8, 9, 11, 13, 14, 15, 16, 23, 24, 26, 27, 33, 34, 39, 40, 41, 51, 59, 60, 62, 63, 64, 67], "tag_concurrency_limit": [11, 27, 40], "tags_fn": 67, "tags_for_partition_fn": [64, 67], "tags_for_partition_key_fn": 64, "tagsmor": 23, "take": [2, 5, 9, 11, 14, 16, 22, 23, 26, 33, 37, 40, 42, 45, 50, 51, 52, 53, 63, 64, 65, 66, 67, 68, 69], "taken": 33, "talk": 14, "talkdesk": 14, "talkdeskexploresourc": 14, "tandem": 8, "tap": 14, "tar": 34, "target": [2, 8, 13, 14, 16, 20, 26, 32, 40, 50, 60, 64, 67], "target_connect": 32, "target_dir": 26, "target_object": 32, 
"target_opt": 32, "target_path": 26, "task": [15, 16, 19, 20, 23, 34, 40, 50], "task_definit": 16, "task_ids_by_asset_kei": 15, "taskdefinit": 16, "tax": 14, "tbc": 14, "tcp": 14, "team": [3, 11, 14, 65], "teams_job": 42, "teams_on_failur": 42, "teams_on_run_failur": 42, "teams_on_success": 42, "teams_op": 42, "teams_webhook_url": 42, "teamsclient": 42, "teardown": 66, "technolog": 26, "teh": [28, 53], "tell": [2, 8, 9, 11, 14, 34, 35, 36], "temp": 11, "temp_dir": 8, "temp_fil": 51, "temp_file_writ": 51, "tempdir": 11, "tempfil": 11, "tempo": 14, "temporari": [8, 11, 12, 34, 35, 36, 51, 59], "temporary_gcs_bucket": [34, 35, 36], "temposourc": 14, "ten": 53, "tenanc": 14, "tenant": 14, "tenant_endpoint": 14, "tenant_id": 14, "tend": [16, 50], "term": [5, 14, 23, 63, 64], "termin": [14, 23, 34, 53], "test": [7, 8, 11, 12, 14, 17, 26, 32, 34, 50, 53, 61, 63, 66, 67], "test_project": 40, "test_top": 14, "test_valu": 4, "text": [3, 16, 25, 26, 38, 42, 52, 63, 66, 69], "text_fn": 52, "text_messag": 42, "text_metadata": 63, "text_usag": 52, "textio": 11, "textmetadatavalu": 63, "tgtlifetimehour": 34, "tgz": 34, "th": 45, "than": [2, 4, 5, 11, 14, 16, 18, 19, 20, 23, 26, 34, 38, 40, 46, 50, 63, 64, 66, 68], "thank": 40, "the_asset": 66, "the_graph": [8, 9], "the_job": [8, 9], "the_resourc": 66, "the_schedul": 67, "the_sensor": 67, "thei": [2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 17, 23, 33, 50, 63, 66, 67, 68, 69], "them": [2, 3, 4, 5, 7, 11, 12, 14, 16, 23, 26, 33, 34, 40, 45, 50, 51, 59, 61, 63, 67, 68, 69], "themselv": [8, 9], "therefor": [26, 33], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "thin": [23, 24, 38, 44, 52], "thing": [14, 16, 38, 50, 69], "third": 11, "those": [2, 5, 8, 9, 12, 13, 14, 16, 23, 26, 33, 50, 53, 61, 63, 64, 67], "though": [14, 16, 
50], "thousand": [16, 50], "thrash": 34, "thread": [3, 8, 11, 14, 16, 22, 50, 53], "threaddump": [16, 50], "threads_per_work": 22, "three": [12, 16, 17, 34], "threshold": [16, 45, 50], "through": [2, 4, 8, 13, 14, 16, 23, 26, 32, 40, 45, 50, 61, 63, 68, 69], "throughput": 23, "throw": [2, 4, 7, 14, 16, 23, 39, 50], "thrown": [2, 7, 8, 10, 45, 66, 68], "thu": [1, 2, 11], "thumbnail_data_url": 14, "thumbnail_url": 14, "tick": [2, 8, 13, 64, 67], "ticker": 14, "ticket": 34, "tidb": 14, "tidbdestin": 14, "tidbsourc": 14, "tight": 14, "tighter": 38, "tightli": [16, 50], "tiktok": 14, "tiktokmarketingsourc": 14, "till": 14, "time": [2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 18, 19, 21, 23, 24, 26, 27, 33, 34, 39, 50, 51, 52, 53, 54, 55, 59, 62, 63, 64, 67, 68], "time_incr": 14, "time_window": 26, "timelysourc": 14, "timeout": [3, 14, 16, 22, 23, 34, 35, 36, 39, 40, 42, 49, 50, 53, 57], "timeout_second": 23, "timestamp": [11, 14, 67], "timewindow": [8, 64], "timewindowpartitionmap": [2, 8, 64], "timewindowpartitionsdefinit": [8, 12, 64], "timezon": [2, 14, 26, 53, 54, 55, 64, 67], "titl": [14, 38, 46], "tl": [14, 42], "tlsencryptedverifycertif": 14, "tmp": [16, 17, 32, 34, 40, 50, 51], "to_asset_kei": 67, "to_default_asset_ev": 26, "to_job": [8, 9, 13, 61], "to_source_asset": 2, "to_sql": 2, "to_str": 67, "todai": 14, "todo": 14, "togeth": [14, 59, 63], "toggl": [14, 62], "token": [14, 15, 16, 17, 23, 25, 26, 38, 39, 40, 52, 58], "token_kei": 14, "token_secret": 14, "token_uri": 14, "toml": 3, "too": [8, 14, 16, 40, 50], "tool": [5, 11, 23, 45, 63, 68], "top": [3, 4, 5, 8, 9, 13, 21, 33, 65, 67], "top_level_resourc": 65, "topic": [14, 25, 52], "topic_id": 14, "topic_namespac": 14, "topic_partit": 14, "topic_pattern": 14, "topic_rout": 25, "topic_ten": 14, "topic_test": 14, "topic_typ": 14, "torn": [8, 66], "torrentbroadcastfactori": [16, 50], "total": [14, 16, 23, 46, 50, 64], "touch": 11, "toward": [14, 16], "tpl_kei": 14, "tplcentral": 14, "tplcentralsourc": 14, 
"trace": [3, 7, 11, 61, 67], "track": [14, 16, 41, 50, 59, 63, 67], "tracker": 14, "trail": 14, "trailing_unconsumed_partitioned_event_id": 67, "train": 59, "transact": 14, "transfer": [16, 50], "transform": [11, 14, 16, 50], "transform_word": 4, "transformation_prior": 14, "transient": [11, 14, 16, 50], "transit": 53, "translat": [14, 26, 28, 34, 53], "transmiss": 14, "transport": 39, "travers": 20, "treat": 68, "tree": 14, "trello": 14, "trellosourc": 14, "tri": [16, 50], "trigger": [2, 10, 16, 21, 23, 26, 42, 44, 50, 52, 67], "true": [2, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 20, 21, 23, 26, 27, 33, 34, 40, 42, 45, 48, 50, 52, 53, 54, 55, 57, 60, 62, 63, 64, 67, 68, 69], "trust": 34, "truststor": 34, "truststorepassworduri": 34, "truststoreuri": 34, "try": [14, 16, 18, 39, 50, 63], "tune": [14, 16, 50], "tunnel": 14, "tunnel_host": 14, "tunnel_method": 14, "tunnel_port": 14, "tunnel_us": 14, "tunnel_user_password": 14, "tupl": [7, 8, 11, 13, 51, 63, 67, 68], "turn": [3, 16, 40, 50, 65, 67], "tutori": 14, "twilio": 14, "twilio_resourc": 58, "twilioresourc": 58, "twiliosourc": 14, "two": [2, 9, 12, 14, 16, 17, 34, 46, 63, 64, 66, 67, 68], "txt": [11, 34, 51], "type": [1, 2, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 20, 21, 23, 28, 29, 30, 31, 32, 34, 37, 39, 40, 45, 46, 51, 52, 53, 54, 55, 59, 60, 61, 62, 64, 65, 66, 67, 69], "type_check": 68, "type_check_fn": [45, 63, 68], "type_handl": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "typecheck": [7, 45, 46, 60, 63, 68], "typecheckcontext": [8, 45, 68], "typeform": 14, "typeformsourc": 14, "typehint": 9, "typic": [2, 5, 7, 8, 11, 16, 40, 50, 52, 65], "typing_typ": [2, 5, 45, 65, 68], "u6nxl7": 14, "ubuntu": 23, "ud": 3, "udf": 34, "ugli": 4, "ui": [2, 3, 4, 8, 9, 11, 13, 14, 15, 16, 23, 26, 33, 39, 50, 59, 60, 63, 64, 65, 67, 69], "uksouth": 23, "unacknowledg": 14, "uncondition": [16, 50], "unconnect": 68, "unconsum": 67, "under": [4, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 34, 40, 50, 63], "underestim": [16, 50], 
"underli": [1, 2, 8, 9, 14, 16, 17, 18, 19, 20, 23, 26, 51, 60, 61, 63, 66, 68], "underneath": [12, 26], "underscor": [2, 14, 34], "understand": 14, "underutil": 23, "undocu": 14, "unencrypt": [14, 53, 54, 55], "unexpect": [7, 16, 50], "unexpected_field_behavior": 14, "unifi": [16, 50], "uniform": [11, 61], "uninstal": 40, "union": [1, 2, 4, 5, 6, 8, 9, 11, 12, 14, 16, 17, 18, 19, 20, 21, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 40, 41, 42, 45, 46, 51, 52, 53, 54, 55, 60, 63, 64, 65, 67, 68, 69], "uniqu": [1, 2, 3, 4, 6, 8, 9, 11, 12, 14, 16, 17, 18, 24, 27, 34, 40, 45, 59, 62, 63, 68], "unique_id": 26, "unique_nam": 68, "unit": [8, 9, 14, 16, 40, 50, 63, 67], "univers": [12, 14], "unix": 11, "unknown": 7, "unless": [2, 8, 11, 14, 16, 50, 63, 64, 67], "unlik": [2, 4, 13, 63], "unlimit": [14, 16, 50], "unpartit": 2, "unpersist": [16, 50], "unreach": 51, "unrecover": 63, "unregist": [16, 50], "unrel": 26, "unresolvedassetjob": 67, "unresolvedassetjobdefinit": [2, 5, 42, 67], "unresolvedpartitionedassetscheduledefinit": 5, "unrol": [16, 50], "unrollfract": [16, 50], "unsaf": [16, 50], "unsatisfi": 66, "unset": [23, 34, 35, 36, 64], "unsign": 16, "unspecifi": [4, 14, 34, 63], "unstructur": 11, "unsuccess": [14, 21, 33], "until": [3, 14, 16, 21, 23, 26, 33, 50, 59, 66, 67], "until_todai": 14, "untitl": 23, "unus": [16, 45, 50, 51, 68], "unusu": [16, 50], "unwil": [16, 50], "unzip": 23, "up": [2, 3, 4, 5, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 23, 25, 38, 44, 45, 46, 47, 49, 50, 51, 53, 62, 65, 66, 67, 69], "up_for_retri": 63, "upathiomanag": 12, "updat": [2, 11, 14, 16, 26, 32, 33, 40, 50, 67], "update_cursor": 67, "update_kei": 32, "update_timestamp": 11, "updated_aft": 11, "updated_befor": 11, "upload": [14, 16, 17, 23, 34, 35, 36], "upload_extra_arg": 16, "upload_interv": [16, 17, 34], "uploading_method": 14, "upon": [8, 11, 14, 23, 40, 62, 66], "upper": [2, 14, 16, 50], "upsert": 14, "upstream": [2, 8, 9, 12, 13, 15, 51, 60, 63, 64, 67], 
"upstream_asset": [2, 8, 14], "upstream_dependencies_by_asset_kei": 15, "upstream_output": 12, "upstream_partitions_def": 64, "upstream_partitions_subset": 64, "upstream_python_asset": 26, "upstream_source_asset": 2, "upstreampartitionsresult": 64, "uri": [14, 15, 34, 41, 53, 59], "url": [3, 14, 16, 17, 18, 19, 20, 23, 26, 27, 34, 39, 42, 47, 49, 50, 52, 63, 69], "url_bas": 14, "urlmetadatavalu": 63, "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69], "us2": 14, "usa": 14, "usabl": [11, 63, 68], "usable_as_dagster_typ": [63, 68], "usag": [4, 5, 11, 12, 14, 15, 16, 17, 25, 34, 38, 39, 41, 50, 52, 63, 65, 66], "uscensussourc": 14, "use_all_dns_ip": 14, "use_build_command": 26, "use_current_ecs_task_config": 16, "use_emphemeral_airflow_db": 15, "use_http": [14, 39], "use_pandas_result": 53, "use_ssl": 16, "use_tl": 14, "use_unsigned_sess": 16, "usefetchcach": [16, 50], "uselegacymod": [16, 50], "usepassword": 40, "user": [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 26, 32, 33, 34, 37, 38, 39, 43, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 65, 66, 67, 68], "user1": 19, "user_ag": 14, "user_auth_kei": 14, "user_code_error_boundari": [7, 11], "user_code_failure_retry_delai": 11, "user_login": 14, "user_login_id": 14, "user_messag": 11, "user_nam": 14, "user_secret": 14, "user_token": 15, "useraccount": 34, "userclasspathfirst": [16, 50], "userdeploy": 40, "userguid": [18, 19, 20], "usernam": [11, 12, 14, 16, 19, 27, 40, 43, 48, 57], "usernameandpassword": 14, "usernamepassword": 14, "usptream": 64, "usr": 34, "usual": [12, 14, 15, 16, 26, 50, 65], "utc": [2, 14, 15, 53, 54, 55, 67], "utc_date_str": 15, "utc_execution_date_str": 15, "utf": 51, "util": [2, 6, 8, 12, 16, 17, 25, 27, 38, 39, 40, 45, 51, 63, 66, 67], "utilis": 14, "uvicorn": 
3, "uvicorn_log_level": 3, "v": [3, 14], "v1": [16, 20, 34, 40], "v2": [14, 26, 44], "v3": 14, "v4": [14, 38], "valid": [2, 4, 8, 11, 14, 16, 34, 37, 45, 46, 50, 53, 60, 64, 67, 68], "validate_default_paramet": 53, "validate_run_config": 8, "validate_t": 63, "validateoutputspec": [16, 50], "validation_operator_nam": 37, "validation_operators_and_act": 37, "valu": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 36, 39, 40, 41, 42, 43, 45, 46, 48, 49, 50, 51, 53, 54, 55, 57, 59, 60, 63, 64, 65, 66, 67, 68], "value1": 14, "value2": 14, "value3": 14, "var": [3, 19, 26], "vari": 14, "variabl": [3, 4, 5, 8, 11, 12, 16, 19, 20, 23, 26, 27, 34, 38, 40, 41, 43, 48, 50, 53, 67], "variant": [6, 16, 50], "variat": [18, 19, 20], "varieti": [26, 39], "variou": [16, 34, 38, 50, 67], "ve": 18, "vendor": 14, "verb": 40, "verbos": [16, 23, 50], "verbose_log": 23, "veri": [8, 11, 13, 16, 50], "verifi": [1, 14, 16, 42], "verify_cert_path": 16, "verifyca": 14, "verifyful": 14, "verifyident": 14, "versa": 64, "version": [2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 34, 40, 42, 50, 52, 63, 64, 66, 67, 68, 69], "version_strategi": [8, 9, 13], "versionstrategi": [8, 9, 13, 62], "very_cool_packag": 11, "very_secret_env_vari": 4, "very_secret_env_variable_bool": 4, "very_secret_env_variable_int": 4, "via": [3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 17, 18, 23, 29, 30, 31, 34, 35, 36, 40, 42, 49, 50, 51, 52, 53, 54, 55, 60, 64, 66, 67, 69], "viabl": 11, "vice": 64, "video": 14, "view": [3, 14, 24, 26, 40, 64], "view_id": 14, "viewabl": [2, 8, 9, 13, 67], "violat": [7, 52], "virtual": 14, "virtual_host": 14, "visibility_timeout": 14, "visibl": [2, 14, 40], "visit": 14, "visitor": 24, "visual": [45, 68], "vm": [16, 34, 50], "vol1": 19, "vol2": 19, "volum": [14, 19, 20, 23, 40], "volume_mount": [20, 40], "volumemount": [20, 40], "vs": [38, 40], "vvv": 40, "w": [3, 11, 16, 40, 50, 51, 59], "w2": 18, "wa": [1, 2, 4, 5, 6, 7, 8, 
11, 13, 14, 16, 20, 26, 40, 50, 60, 62, 63, 64, 67], "wai": [5, 8, 11, 12, 13, 14, 16, 32, 40, 50, 64, 66, 68], "wait": [2, 3, 11, 14, 16, 20, 21, 23, 26, 33, 34, 40, 50, 52, 63, 68], "wait_for_log": [16, 23], "wait_int": 68, "wal": [16, 50], "walk": 6, "wandb_api_kei": 59, "wandb_artifact_configur": 59, "wandb_artifacts_io_manag": 59, "wandb_artifacts_manag": 59, "wandb_config": 59, "wandb_resourc": 59, "wandbartifactconfigur": 59, "wandbartifactsiomanagererror": 59, "want": [2, 4, 5, 8, 11, 14, 16, 18, 19, 20, 23, 26, 27, 33, 37, 38, 39, 40, 41, 42, 50, 52, 59, 64, 65, 67, 69], "warehous": [14, 53, 54, 55], "warm": 40, "warn": [1, 2, 3, 14, 16, 24, 26, 50, 61, 64], "warn_after_minutes_overdu": 52, "warn_error": 26, "warn_on_step_context_us": 12, "wast": [16, 50], "wave": 52, "wb": 11, "we": [5, 11, 12, 14, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 33, 40, 45, 50, 51, 59, 66], "weak": [16, 50], "weather": 14, "web": [3, 5, 14, 16, 18, 23, 50], "webclient": 52, "webflow": 14, "webflowsourc": 14, "webhook": 42, "webserv": [5, 11, 15, 40, 42, 43, 48, 52, 69], "webserver_base_url": [42, 52, 69], "websit": [14, 59], "week": [64, 67], "weekend": 14, "weekli": [64, 67], "weekly_abc": 64, "weekly_partitioned_config": [64, 67], "weeklypartitionsdefinit": 64, "well": [2, 4, 7, 8, 9, 11, 13, 16, 23, 33, 40, 41, 50, 63], "were": [2, 5, 8, 11, 13, 14, 18, 19, 20, 59, 67], "west": [14, 16, 23, 40], "wget": 34, "what": [2, 5, 8, 9, 11, 14, 16, 37, 50, 63, 65, 67], "when": [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "whenev": [2, 8, 9, 13, 14, 16, 47, 52, 61, 63], "where": [2, 3, 6, 8, 10, 11, 14, 16, 19, 20, 26, 33, 34, 37, 39, 40, 45, 50, 51, 63, 64, 67, 68], "wherea": 14, "whether": [2, 3, 4, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 27, 34, 39, 40, 42, 50, 52, 53, 54, 55, 60, 62, 63, 67, 68, 69], 
"which": [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 47, 50, 51, 52, 53, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "while": [2, 4, 7, 8, 11, 16, 23, 26, 50, 62, 64], "whiski": 14, "whiskyhuntersourc": 14, "whitelist": 11, "whitespac": 14, "who": [3, 5, 11, 14, 16, 50], "whole": [11, 12, 16, 50], "whom": 4, "whose": [2, 4, 7, 8, 9, 12, 14, 16, 60, 62, 63, 64, 68, 69], "why": 67, "wide": 46, "willstor": 34, "window": [8, 12, 14, 34, 64, 67], "window_in_dai": 14, "wipe": 11, "wipe_asset": 11, "wise": 46, "wish": [11, 16, 50, 59, 63], "with_hook": [8, 9, 13, 42, 52, 63], "with_prefix": 26, "with_resourc": [5, 28, 29, 30, 31, 33, 59, 66], "with_retry_polici": [8, 9, 63], "with_rul": 2, "with_top_level_resourc": [8, 13], "with_wandb": 59, "within": [1, 2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 20, 23, 24, 25, 26, 27, 34, 38, 39, 40, 41, 44, 45, 47, 50, 51, 59, 60, 61, 62, 63, 65, 66, 67, 68], "withorb": 14, "without": [2, 3, 4, 7, 8, 11, 14, 16, 18, 22, 29, 30, 31, 34, 39, 50, 53, 54, 55, 67, 68], "without_check": 2, "without_rul": 2, "won": [12, 14, 16, 18, 26, 50], "woocommerc": 14, "woocommercesourc": 14, "word": [4, 14, 63], "wordcount": 34, "work": [3, 6, 11, 14, 16, 17, 20, 26, 34, 40, 50, 51, 52, 62, 67, 69], "worker": [3, 11, 14, 16, 19, 20, 22, 23, 34, 40, 50], "worker_main": 18, "workerconfig": 34, "workflow": 26, "working_directori": 3, "workload": [14, 16, 23, 40, 50], "workspac": [3, 14, 23, 33, 52], "workspace_cli": 23, "workspace_id": [14, 23], "workspacecli": 23, "world": [4, 8, 51, 66], "would": [2, 11, 12, 14, 16, 17, 18, 23, 26, 33, 34, 63, 64], "wrap": [4, 7, 8, 11, 12, 13, 17, 51, 60, 63, 66], "wrapper": [12, 23, 24, 26, 38, 44, 52], "wrike": 14, "wrike_inst": 14, "wrikesourc": 14, "write": [2, 8, 9, 11, 13, 14, 16, 17, 18, 23, 28, 29, 30, 31, 34, 35, 36, 38, 50, 51, 53, 54, 55, 66, 67], "write_csv": 12, "write_data": 11, "write_fil": 11, "write_parquet_fil": 53, 
"write_t": 53, "writeaheadlog": [16, 50], "writeif": 34, "writer": [16, 25, 38, 66], "writerresourc": [16, 25, 38, 66], "written": [14, 16, 27, 34, 35, 36, 40, 50, 51, 59, 60], "www": [2, 19, 26, 34, 58, 64, 67], "x": [9, 14, 23, 64], "xloggc": [16, 50], "xml": 34, "xmlfor": 34, "xmx": [16, 50], "xx": 23, "xxx": 69, "xz": 14, "y": [3, 18, 23, 64, 67], "yahoo": 14, "yahoofinancepricesourc": 14, "yaml": [3, 8, 11, 12, 14, 16, 17, 18, 19, 20, 34, 40, 43, 48, 59, 65, 69], "yaml_directori": 65, "yaml_str": 69, "yamlfil": 14, "yandex": 14, "yandexmetricasourc": 14, "yarn": [16, 22, 34, 50], "ye": [4, 16, 50], "year": 14, "yesterdai": [2, 14], "yet": [65, 67], "yield": [4, 6, 8, 9, 11, 14, 21, 26, 33, 37, 60, 63, 65, 66, 67], "yield_ev": 60, "yield_materi": [14, 21, 26, 33], "yield_result": 60, "yml": [26, 37], "you": [1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "your": [1, 2, 3, 8, 11, 12, 13, 14, 15, 16, 17, 19, 20, 23, 26, 27, 34, 35, 36, 37, 38, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 58, 59, 60, 66, 67, 69], "your_kei": 24, "your_org_her": 39, "your_service_account": 40, "yourself": 11, "yout": 58, "youtub": 14, "youtubeanalyticssourc": 14, "youwil": 44, "yyyi": 14, "z": [14, 34], "zadrozni": 18, "zendesk": 14, "zendesk_sunshin": 14, "zendeskchatsourc": 14, "zendesksunshinesourc": 14, "zendesksupportsourc": 14, "zendesktalksourc": 14, "zenefit": 14, "zenefitssourc": 14, "zenloop": 14, "zenloopsourc": 14, "zero": [14, 16, 23, 26, 34, 50, 51, 64], "zip": [16, 23, 34, 50], "zoho": 14, "zohocrmsourc": 14, "zone": [2, 14, 23, 26, 34, 53, 64, 67], "zone_id": 23, "zoneuri": 34, "zookeep": [16, 50], "zoom": 14, "zoomsingersourc": 14, "zstandard": 14, "zstd": [16, 50], "zstdcompressioncodec": [16, 50], "zuora": 14, "zuorasourc": 14, "\u4e16\u754c": 4, "\u4f60\u597d": 4}, "titles": ["Home", 
"Asset Checks (Experimental)", "Software-Defined Assets", "Dagster CLI", "Config", "Definitions", "Dynamic Mapping & Collect", "Errors", "Execution", "Graphs", "Hooks", "Internals", "IO Managers", "Jobs", "Airbyte (dagster-airbyte)", "Airflow (dagster-airflow)", "AWS (dagster-aws)", "Azure (dagster-azure)", "Celery (dagster-celery)", "Orchestration on Celery + Docker", "Orchestration on Celery + Kubernetes", "Census (dagster-census)", "Dask (dagster-dask)", "Databricks (dagster-databricks)", "Datadog (dagster-datadog)", "Datahub (dagster-datahub)", "dbt (dagster-dbt)", "Orchestration on Docker", "DuckDB (dagster-duckdb)", "DuckDB + Pandas (dagster-duckdb-pandas)", "DuckDB + Polars (dagster-duckdb-polars)", "DuckDB + PySpark (dagster-duckdb-pyspark)", "embedded-elt (dagster-embedded-elt)", "Fivetran (dagster-fivetran)", "GCP (dagster-gcp)", "GCP + Pandas (dagster-gcp-pandas)", "GCP + PySpark (dagster-gcp-pyspark)", "Great Expectations (dagster-ge)", "GitHub (dagster-github)", "GraphQL (dagster-graphql)", "Kubernetes (dagster-k8s)", "MLflow (dagster-mlflow)", "Microsoft Teams (dagster-msteams)", "MySQL (dagster-mysql)", "PagerDuty (dagster-pagerduty)", "Pandas (dagster-pandas)", "Pandera (dagster-pandera)", "Papertrail (dagster-papertrail)", "PostgreSQL (dagster-postgres)", "Prometheus (dagster-prometheus)", "Pyspark (dagster-pyspark)", "Shell (dagster-shell)", "Slack (dagster-slack)", "Snowflake (dagster-snowflake)", "Snowflake with Pandas (dagster-snowflake-pandas)", "Snowflake with PySpark (dagster-snowflake-pyspark)", "Spark (dagster-spark)", "SSH / SFTP (dagster-ssh)", "Twilio (dagster-twilio)", "Weights & Biases (dagster-wandb)", "Dagstermill", "Loggers", "Job-Level Versioning and Memoization (Deprecated)", "Ops", "Partitions Definitions", "Repositories", "Resources", "Run Requests", "Types", "Utilities"], "titleterms": {"A": 40, "Ins": 63, "about": 40, "access": 40, "airbyt": 14, "airflow": 15, "an": [40, 61], "api": [3, 18, 19, 20, 23, 27, 40, 51], "app": 18, 
"asset": [1, 2, 3, 8, 14, 26, 32, 33, 63], "aw": 16, "azur": 17, "backend": 18, "backfil": 64, "best": 18, "bias": 59, "bigqueri": [34, 35, 36], "broker": 18, "built": [12, 61, 68], "celeri": [18, 19, 20], "censu": 21, "chart": 40, "check": 1, "cli": [3, 18, 26], "client": 39, "cloud": 26, "cloudwatch": 16, "cluster": 40, "collect": 6, "comput": [11, 34], "config": [4, 8, 14, 59, 64], "configur": [8, 18], "context": [8, 12], "coordin": 11, "core": 26, "custom": [18, 61], "daemon": 3, "dagster": [3, 4, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "dagstermil": 60, "dask": 22, "databrick": 23, "datadog": 24, "datahub": 25, "dataproc": 34, "dbt": 26, "debug": 3, "defin": [2, 61, 63], "definit": [5, 64], "depend": 9, "deprec": [26, 62], "destin": 14, "dev": 3, "develop": 40, "docker": [19, 27], "duckdb": [28, 29, 30, 31], "dump": 3, "dynam": 6, "ec": 16, "elt": 32, "embed": 32, "emr": 16, "enabl": 40, "error": [7, 26, 59], "event": [11, 63], "except": 11, "execut": [8, 38, 63], "executor": [8, 11], "exist": 40, "expect": 37, "experiment": [1, 11, 12, 16, 17, 34, 64], "explicit": 9, "faster": 40, "file": [11, 16, 17, 34], "fivetran": 33, "from": [15, 40, 61], "gc": 34, "gcp": [34, 35, 36], "gcr": 40, "ge": 37, "gener": 14, "get": 44, "github": 38, "googl": [35, 36], "graph": [8, 9], "graphql": [3, 38, 39], "great": 37, "grpc": 3, "handl": 11, "heartbeat": 3, "helm": 40, "hook": 10, "i": [17, 34, 53, 59], "input": 12, "instanc": [3, 11], "intern": 11, "io": 12, "issu": 38, "job": [3, 8, 13, 62], "k8": 40, "kei": 63, "kind": 40, "kubernet": [20, 40], "launcher": [11, 23], "legaci": [4, 12, 14, 16, 17, 23, 24, 25, 28, 29, 30, 31, 33, 34, 35, 36, 38, 42, 44, 49, 50, 52, 53, 54, 55, 56, 58, 66], "level": 62, "link": 59, "list": 18, "local": 40, "log": [11, 34, 61], "logger": 61, "make": 68, "manag": [11, 12, 14, 16, 17, 34, 53, 59], "manual": 40, 
"map": [6, 64], "materi": 8, "memoiz": 62, "metadata": 63, "microsoft": 42, "minikub": 40, "mlflow": 41, "monitor": 18, "msteam": 42, "mysql": 43, "new": 68, "note": 40, "o": [17, 34, 53, 59], "op": [14, 23, 26, 27, 33, 34, 40, 53, 59, 61, 63], "orchestr": [15, 19, 20, 27], "other": [18, 23, 34], "out": 63, "output": 12, "pagerduti": 44, "panda": [29, 35, 45, 54], "pandera": 46, "papertrail": 47, "partit": [64, 67], "polar": 30, "polici": 64, "post": 38, "postgr": 48, "postgresql": 48, "practic": 18, "project": [3, 26], "prometheu": 49, "pvc": 40, "pyspark": [31, 36, 50, 55], "python": [4, 39, 40, 66], "queri": 38, "quickstart": 18, "reconstruct": [8, 13], "redi": 40, "redshift": 16, "repositori": 65, "request": 67, "resourc": [14, 17, 23, 26, 32, 33, 34, 42, 53, 59, 66], "result": 8, "run": [3, 8, 11, 15, 40, 67], "s3": 16, "scaffold": 26, "schedul": [3, 11, 64, 67], "schema": 8, "secretsmanag": 16, "sensor": [3, 42, 67], "setup": 40, "sftp": 57, "shell": 51, "slack": 52, "sling": 32, "snowflak": [53, 54, 55], "softwar": 2, "sourc": 14, "spark": 56, "ssh": 57, "start": [18, 44], "step": [11, 23], "storag": 11, "system": [4, 66], "tabl": 63, "task": 18, "team": 42, "termin": 18, "test": [16, 40, 68], "twilio": 58, "type": [4, 26, 63, 68], "us": 59, "util": [4, 26, 69], "valid": 40, "version": 62, "wandb": 59, "webserv": 3, "weight": 59, "wipe": 3, "worker": 18, "your": 18}} \ No newline at end of file +{"docnames": ["index", "sections/api/apidocs/asset-checks", "sections/api/apidocs/assets", "sections/api/apidocs/cli", "sections/api/apidocs/config", "sections/api/apidocs/definitions", "sections/api/apidocs/dynamic", "sections/api/apidocs/errors", "sections/api/apidocs/execution", "sections/api/apidocs/graphs", "sections/api/apidocs/hooks", "sections/api/apidocs/internals", "sections/api/apidocs/io-managers", "sections/api/apidocs/jobs", "sections/api/apidocs/libraries/dagster-airbyte", "sections/api/apidocs/libraries/dagster-airflow", 
"sections/api/apidocs/libraries/dagster-aws", "sections/api/apidocs/libraries/dagster-azure", "sections/api/apidocs/libraries/dagster-celery", "sections/api/apidocs/libraries/dagster-celery-docker", "sections/api/apidocs/libraries/dagster-celery-k8s", "sections/api/apidocs/libraries/dagster-census", "sections/api/apidocs/libraries/dagster-dask", "sections/api/apidocs/libraries/dagster-databricks", "sections/api/apidocs/libraries/dagster-datadog", "sections/api/apidocs/libraries/dagster-datahub", "sections/api/apidocs/libraries/dagster-dbt", "sections/api/apidocs/libraries/dagster-docker", "sections/api/apidocs/libraries/dagster-duckdb", "sections/api/apidocs/libraries/dagster-duckdb-pandas", "sections/api/apidocs/libraries/dagster-duckdb-polars", "sections/api/apidocs/libraries/dagster-duckdb-pyspark", "sections/api/apidocs/libraries/dagster-embedded-elt", "sections/api/apidocs/libraries/dagster-fivetran", "sections/api/apidocs/libraries/dagster-gcp", "sections/api/apidocs/libraries/dagster-gcp-pandas", "sections/api/apidocs/libraries/dagster-gcp-pyspark", "sections/api/apidocs/libraries/dagster-ge", "sections/api/apidocs/libraries/dagster-github", "sections/api/apidocs/libraries/dagster-graphql", "sections/api/apidocs/libraries/dagster-k8s", "sections/api/apidocs/libraries/dagster-mlflow", "sections/api/apidocs/libraries/dagster-msteams", "sections/api/apidocs/libraries/dagster-mysql", "sections/api/apidocs/libraries/dagster-pagerduty", "sections/api/apidocs/libraries/dagster-pandas", "sections/api/apidocs/libraries/dagster-pandera", "sections/api/apidocs/libraries/dagster-papertrail", "sections/api/apidocs/libraries/dagster-postgres", "sections/api/apidocs/libraries/dagster-prometheus", "sections/api/apidocs/libraries/dagster-pyspark", "sections/api/apidocs/libraries/dagster-shell", "sections/api/apidocs/libraries/dagster-slack", "sections/api/apidocs/libraries/dagster-snowflake", "sections/api/apidocs/libraries/dagster-snowflake-pandas", 
"sections/api/apidocs/libraries/dagster-snowflake-pyspark", "sections/api/apidocs/libraries/dagster-spark", "sections/api/apidocs/libraries/dagster-ssh", "sections/api/apidocs/libraries/dagster-twilio", "sections/api/apidocs/libraries/dagster-wandb", "sections/api/apidocs/libraries/dagstermill", "sections/api/apidocs/loggers", "sections/api/apidocs/memoization", "sections/api/apidocs/ops", "sections/api/apidocs/partitions", "sections/api/apidocs/repositories", "sections/api/apidocs/resources", "sections/api/apidocs/schedules-sensors", "sections/api/apidocs/types", "sections/api/apidocs/utilities"], "envversion": {"sphinx": 56, "sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1}, "filenames": ["index.rst", "sections/api/apidocs/asset-checks.rst", "sections/api/apidocs/assets.rst", "sections/api/apidocs/cli.rst", "sections/api/apidocs/config.rst", "sections/api/apidocs/definitions.rst", "sections/api/apidocs/dynamic.rst", "sections/api/apidocs/errors.rst", "sections/api/apidocs/execution.rst", "sections/api/apidocs/graphs.rst", "sections/api/apidocs/hooks.rst", "sections/api/apidocs/internals.rst", "sections/api/apidocs/io-managers.rst", "sections/api/apidocs/jobs.rst", "sections/api/apidocs/libraries/dagster-airbyte.rst", "sections/api/apidocs/libraries/dagster-airflow.rst", "sections/api/apidocs/libraries/dagster-aws.rst", "sections/api/apidocs/libraries/dagster-azure.rst", "sections/api/apidocs/libraries/dagster-celery.rst", "sections/api/apidocs/libraries/dagster-celery-docker.rst", "sections/api/apidocs/libraries/dagster-celery-k8s.rst", "sections/api/apidocs/libraries/dagster-census.rst", "sections/api/apidocs/libraries/dagster-dask.rst", "sections/api/apidocs/libraries/dagster-databricks.rst", 
"sections/api/apidocs/libraries/dagster-datadog.rst", "sections/api/apidocs/libraries/dagster-datahub.rst", "sections/api/apidocs/libraries/dagster-dbt.rst", "sections/api/apidocs/libraries/dagster-docker.rst", "sections/api/apidocs/libraries/dagster-duckdb.rst", "sections/api/apidocs/libraries/dagster-duckdb-pandas.rst", "sections/api/apidocs/libraries/dagster-duckdb-polars.rst", "sections/api/apidocs/libraries/dagster-duckdb-pyspark.rst", "sections/api/apidocs/libraries/dagster-embedded-elt.rst", "sections/api/apidocs/libraries/dagster-fivetran.rst", "sections/api/apidocs/libraries/dagster-gcp.rst", "sections/api/apidocs/libraries/dagster-gcp-pandas.rst", "sections/api/apidocs/libraries/dagster-gcp-pyspark.rst", "sections/api/apidocs/libraries/dagster-ge.rst", "sections/api/apidocs/libraries/dagster-github.rst", "sections/api/apidocs/libraries/dagster-graphql.rst", "sections/api/apidocs/libraries/dagster-k8s.rst", "sections/api/apidocs/libraries/dagster-mlflow.rst", "sections/api/apidocs/libraries/dagster-msteams.rst", "sections/api/apidocs/libraries/dagster-mysql.rst", "sections/api/apidocs/libraries/dagster-pagerduty.rst", "sections/api/apidocs/libraries/dagster-pandas.rst", "sections/api/apidocs/libraries/dagster-pandera.rst", "sections/api/apidocs/libraries/dagster-papertrail.rst", "sections/api/apidocs/libraries/dagster-postgres.rst", "sections/api/apidocs/libraries/dagster-prometheus.rst", "sections/api/apidocs/libraries/dagster-pyspark.rst", "sections/api/apidocs/libraries/dagster-shell.rst", "sections/api/apidocs/libraries/dagster-slack.rst", "sections/api/apidocs/libraries/dagster-snowflake.rst", "sections/api/apidocs/libraries/dagster-snowflake-pandas.rst", "sections/api/apidocs/libraries/dagster-snowflake-pyspark.rst", "sections/api/apidocs/libraries/dagster-spark.rst", "sections/api/apidocs/libraries/dagster-ssh.rst", "sections/api/apidocs/libraries/dagster-twilio.rst", "sections/api/apidocs/libraries/dagster-wandb.rst", 
"sections/api/apidocs/libraries/dagstermill.rst", "sections/api/apidocs/loggers.rst", "sections/api/apidocs/memoization.rst", "sections/api/apidocs/ops.rst", "sections/api/apidocs/partitions.rst", "sections/api/apidocs/repositories.rst", "sections/api/apidocs/resources.rst", "sections/api/apidocs/schedules-sensors.rst", "sections/api/apidocs/types.rst", "sections/api/apidocs/utilities.rst"], "objects": {"dagster": [[67, 0, 1, "", "AddDynamicPartitionsRequest"], [64, 0, 1, "", "AllPartitionMapping"], [4, 0, 1, "", "Array"], [1, 0, 1, "", "AssetCheckKey"], [1, 0, 1, "", "AssetCheckResult"], [1, 0, 1, "", "AssetCheckSeverity"], [1, 0, 1, "", "AssetCheckSpec"], [2, 0, 1, "", "AssetDep"], [8, 0, 1, "", "AssetExecutionContext"], [2, 0, 1, "", "AssetIn"], [63, 0, 1, "", "AssetKey"], [63, 0, 1, "", "AssetMaterialization"], [2, 0, 1, "", "AssetOut"], [2, 0, 1, "", "AssetSelection"], [67, 0, 1, "", "AssetSensorDefinition"], [2, 0, 1, "", "AssetSpec"], [2, 0, 1, "", "AssetValueLoader"], [2, 0, 1, "", "AssetsDefinition"], [2, 0, 1, "", "AutoMaterializePolicy"], [2, 0, 1, "", "AutoMaterializeRule"], [64, 0, 1, "", "BackfillPolicy"], [63, 0, 1, "", "Backoff"], [4, 2, 1, "", "BoolSource"], [4, 0, 1, "", "Config"], [4, 0, 1, "", "ConfigMapping"], [4, 0, 1, "", "ConfigSchema"], [12, 0, 1, "", "ConfigurableIOManager"], [12, 0, 1, "", "ConfigurableIOManagerFactory"], [66, 0, 1, "", "ConfigurableResource"], [63, 0, 1, "", "DagsterAssetMetadataValue"], [7, 4, 1, "", "DagsterConfigMappingFunctionError"], [7, 4, 1, "", "DagsterError"], [8, 0, 1, "", "DagsterEvent"], [7, 4, 1, "", "DagsterEventLogInvalidForRun"], [8, 0, 1, "", "DagsterEventType"], [7, 4, 1, "", "DagsterExecutionStepExecutionError"], [7, 4, 1, "", "DagsterExecutionStepNotFoundError"], [11, 0, 1, "", "DagsterInstance"], [7, 4, 1, "", "DagsterInvalidConfigDefinitionError"], [7, 4, 1, "", "DagsterInvalidConfigError"], [7, 4, 1, "", "DagsterInvalidDefinitionError"], [7, 4, 1, "", "DagsterInvalidSubsetError"], [7, 4, 1, "", 
"DagsterInvariantViolationError"], [61, 0, 1, "", "DagsterLogManager"], [7, 4, 1, "", "DagsterResourceFunctionError"], [11, 0, 1, "", "DagsterRun"], [63, 0, 1, "", "DagsterRunMetadataValue"], [7, 4, 1, "", "DagsterRunNotFoundError"], [11, 0, 1, "", "DagsterRunStatus"], [7, 4, 1, "", "DagsterStepOutputNotFoundError"], [7, 4, 1, "", "DagsterSubprocessError"], [68, 0, 1, "", "DagsterType"], [7, 4, 1, "", "DagsterTypeCheckDidNotPass"], [7, 4, 1, "", "DagsterTypeCheckError"], [68, 0, 1, "", "DagsterTypeLoader"], [68, 0, 1, "", "DagsterTypeLoaderContext"], [7, 4, 1, "", "DagsterUnknownResourceError"], [7, 4, 1, "", "DagsterUnmetExecutorRequirementsError"], [7, 4, 1, "", "DagsterUserCodeExecutionError"], [64, 0, 1, "", "DailyPartitionsDefinition"], [5, 0, 1, "", "Definitions"], [67, 0, 1, "", "DeleteDynamicPartitionsRequest"], [9, 0, 1, "", "DependencyDefinition"], [6, 0, 1, "", "DynamicOut"], [6, 0, 1, "", "DynamicOutput"], [64, 0, 1, "", "DynamicPartitionsDefinition"], [4, 0, 1, "", "Enum"], [4, 0, 1, "", "EnumValue"], [11, 0, 1, "", "EventLogEntry"], [11, 0, 1, "", "EventLogRecord"], [11, 0, 1, "", "EventRecordsFilter"], [8, 0, 1, "", "ExecuteInProcessResult"], [11, 0, 1, "", "Executor"], [11, 0, 1, "", "ExecutorDefinition"], [63, 0, 1, "", "ExpectationResult"], [69, 0, 1, "", "ExperimentalWarning"], [63, 0, 1, "", "Failure"], [4, 0, 1, "", "Field"], [11, 0, 1, "", "FileHandle"], [12, 5, 1, "", "FilesystemIOManager"], [63, 0, 1, "", "FloatMetadataValue"], [2, 0, 1, "", "FreshnessPolicy"], [67, 0, 1, "", "FreshnessPolicySensorDefinition"], [9, 0, 1, "", "GraphDefinition"], [9, 0, 1, "", "GraphIn"], [9, 0, 1, "", "GraphOut"], [10, 0, 1, "", "HookContext"], [10, 0, 1, "", "HookDefinition"], [64, 0, 1, "", "HourlyPartitionsDefinition"], [12, 0, 1, "", "IOManager"], [12, 0, 1, "", "IOManagerDefinition"], [64, 0, 1, "", "IdentityPartitionMapping"], [63, 0, 1, "", "In"], [12, 5, 1, "", "InMemoryIOManager"], [11, 0, 1, "", "InitExecutorContext"], [61, 0, 1, "", 
"InitLoggerContext"], [66, 0, 1, "", "InitResourceContext"], [12, 0, 1, "", "InputContext"], [12, 0, 1, "", "InputManager"], [12, 0, 1, "", "InputManagerDefinition"], [9, 0, 1, "", "InputMapping"], [63, 0, 1, "", "IntMetadataValue"], [4, 2, 1, "", "IntSource"], [63, 0, 1, "", "Jitter"], [13, 0, 1, "", "JobDefinition"], [8, 0, 1, "", "JobExecutionResult"], [67, 0, 1, "", "JobSelector"], [63, 0, 1, "", "JsonMetadataValue"], [64, 0, 1, "", "LastPartitionMapping"], [11, 0, 1, "", "LocalFileHandle"], [61, 0, 1, "", "LoggerDefinition"], [62, 2, 1, "", "MEMOIZED_RUN_TAG"], [4, 0, 1, "", "Map"], [63, 0, 1, "", "MarkdownMetadataValue"], [2, 0, 1, "", "MaterializeResult"], [62, 0, 1, "", "MemoizableIOManager"], [63, 0, 1, "", "MetadataEntry"], [63, 0, 1, "", "MetadataValue"], [64, 0, 1, "", "MonthlyPartitionsDefinition"], [67, 0, 1, "", "MultiAssetSensorDefinition"], [67, 0, 1, "", "MultiAssetSensorEvaluationContext"], [9, 0, 1, "", "MultiDependencyDefinition"], [64, 0, 1, "", "MultiPartitionKey"], [64, 0, 1, "", "MultiPartitionMapping"], [64, 0, 1, "", "MultiPartitionsDefinition"], [64, 0, 1, "", "MultiToSingleDimensionPartitionMapping"], [9, 0, 1, "", "NodeInvocation"], [4, 0, 1, "", "Noneable"], [63, 0, 1, "", "NotebookMetadataValue"], [68, 2, 1, "", "Nothing"], [63, 0, 1, "", "OpDefinition"], [8, 0, 1, "", "OpExecutionContext"], [62, 0, 1, "", "OpVersionContext"], [63, 0, 1, "", "Out"], [63, 0, 1, "", "Output"], [12, 0, 1, "", "OutputContext"], [9, 0, 1, "", "OutputMapping"], [64, 0, 1, "", "PartitionKeyRange"], [64, 0, 1, "", "PartitionMapping"], [64, 0, 1, "", "PartitionedConfig"], [64, 0, 1, "", "PartitionsDefinition"], [63, 0, 1, "", "PathMetadataValue"], [4, 0, 1, "", "Permissive"], [4, 0, 1, "", "PermissiveConfig"], [63, 0, 1, "", "PythonArtifactMetadataValue"], [68, 6, 1, "", "PythonObjectDagsterType"], [8, 0, 1, "", "ReexecutionOptions"], [65, 0, 1, "", "RepositoryData"], [65, 0, 1, "", "RepositoryDefinition"], [67, 0, 1, "", "RepositorySelector"], [66, 0, 1, "", 
"ResourceDefinition"], [62, 0, 1, "", "ResourceVersionContext"], [63, 0, 1, "", "RetryPolicy"], [63, 0, 1, "", "RetryRequested"], [4, 0, 1, "", "RunConfig"], [67, 0, 1, "", "RunFailureSensorContext"], [67, 0, 1, "", "RunRequest"], [11, 0, 1, "", "RunShardedEventsCursor"], [67, 0, 1, "", "RunStatusSensorContext"], [67, 0, 1, "", "RunStatusSensorDefinition"], [11, 0, 1, "", "RunsFilter"], [4, 0, 1, "", "ScalarUnion"], [67, 0, 1, "", "ScheduleDefinition"], [67, 0, 1, "", "ScheduleEvaluationContext"], [4, 0, 1, "", "Selector"], [67, 0, 1, "", "SensorDefinition"], [67, 0, 1, "", "SensorResult"], [4, 0, 1, "", "Shape"], [67, 0, 1, "", "SkipReason"], [2, 0, 1, "", "SourceAsset"], [62, 0, 1, "", "SourceHashVersionStrategy"], [64, 0, 1, "", "SpecificPartitionsPartitionMapping"], [64, 0, 1, "", "StaticPartitionMapping"], [64, 0, 1, "", "StaticPartitionsDefinition"], [11, 0, 1, "", "StepExecutionContext"], [11, 0, 1, "", "StepLauncher"], [11, 0, 1, "", "StepRunRef"], [4, 2, 1, "", "StringSource"], [63, 0, 1, "", "TableColumn"], [63, 0, 1, "", "TableColumnConstraints"], [63, 0, 1, "", "TableConstraints"], [63, 0, 1, "", "TableMetadataValue"], [63, 0, 1, "", "TableRecord"], [63, 0, 1, "", "TableSchema"], [63, 0, 1, "", "TableSchemaMetadataValue"], [63, 0, 1, "", "TextMetadataValue"], [64, 0, 1, "", "TimeWindow"], [64, 0, 1, "", "TimeWindowPartitionMapping"], [64, 0, 1, "", "TimeWindowPartitionsDefinition"], [63, 0, 1, "", "TypeCheck"], [8, 0, 1, "", "TypeCheckContext"], [12, 0, 1, "", "UPathIOManager"], [63, 0, 1, "", "UrlMetadataValue"], [62, 0, 1, "", "VersionStrategy"], [64, 0, 1, "", "WeeklyPartitionsDefinition"], [2, 6, 1, "", "asset"], [1, 6, 1, "", "asset_check"], [67, 6, 1, "", "asset_sensor"], [8, 6, 1, "", "build_asset_context"], [67, 6, 1, "", "build_freshness_policy_sensor_context"], [10, 6, 1, "", "build_hook_context"], [61, 6, 1, "", "build_init_logger_context"], [66, 6, 1, "", "build_init_resource_context"], [12, 6, 1, "", "build_input_context"], [67, 6, 1, "", 
"build_multi_asset_sensor_context"], [8, 6, 1, "", "build_op_context"], [12, 6, 1, "", "build_output_context"], [13, 6, 1, "", "build_reconstructable_job"], [66, 6, 1, "", "build_resources"], [67, 6, 1, "", "build_run_status_sensor_context"], [67, 6, 1, "", "build_schedule_context"], [67, 6, 1, "", "build_schedule_from_partitioned_job"], [67, 6, 1, "", "build_sensor_context"], [68, 6, 1, "", "check_dagster_type"], [69, 6, 1, "", "config_from_files"], [69, 6, 1, "", "config_from_pkg_resources"], [69, 6, 1, "", "config_from_yaml_strings"], [4, 6, 1, "", "configured"], [5, 6, 1, "", "create_repository_using_definitions_args"], [68, 6, 1, "", "dagster_type_loader"], [64, 6, 1, "", "daily_partitioned_config"], [2, 6, 1, "", "define_asset_job"], [64, 6, 1, "", "dynamic_partitioned_config"], [8, 6, 1, "", "execute_job"], [11, 6, 1, "", "executor"], [10, 6, 1, "", "failure_hook"], [69, 6, 1, "", "file_relative_path"], [67, 6, 1, "", "freshness_policy_sensor"], [12, 5, 1, "", "fs_io_manager"], [69, 6, 1, "", "get_dagster_logger"], [9, 6, 1, "", "graph"], [2, 6, 1, "", "graph_asset"], [2, 6, 1, "", "graph_multi_asset"], [64, 6, 1, "", "hourly_partitioned_config"], [8, 5, 1, "", "in_process_executor"], [12, 6, 1, "", "input_manager"], [8, 6, 1, "", "instance_for_test"], [12, 6, 1, "", "io_manager"], [13, 6, 1, "", "job"], [2, 6, 1, "", "load_assets_from_current_module"], [2, 6, 1, "", "load_assets_from_modules"], [2, 6, 1, "", "load_assets_from_package_module"], [2, 6, 1, "", "load_assets_from_package_name"], [11, 5, 1, "", "local_file_manager"], [61, 6, 1, "", "logger"], [69, 6, 1, "", "make_email_on_run_failure_sensor"], [68, 6, 1, "", "make_python_type_usable_as_dagster_type"], [66, 6, 1, "", "make_values_resource"], [8, 6, 1, "", "materialize"], [8, 6, 1, "", "materialize_to_memory"], [12, 5, 1, "", "mem_io_manager"], [64, 6, 1, "", "monthly_partitioned_config"], [2, 6, 1, "", "multi_asset"], [67, 6, 1, "", "multi_asset_sensor"], [8, 5, 1, "", 
"multi_or_in_process_executor"], [8, 5, 1, "", "multiprocess_executor"], [63, 6, 1, "", "op"], [8, 0, 1, "", "reconstructable"], [65, 5, 1, "", "repository"], [66, 6, 1, "", "resource"], [67, 6, 1, "", "run_failure_sensor"], [67, 6, 1, "", "run_status_sensor"], [67, 6, 1, "", "schedule"], [67, 6, 1, "", "sensor"], [64, 6, 1, "", "static_partitioned_config"], [10, 6, 1, "", "success_hook"], [68, 6, 1, "", "usable_as_dagster_type"], [8, 6, 1, "", "validate_run_config"], [64, 6, 1, "", "weekly_partitioned_config"], [66, 6, 1, "", "with_resources"]], "dagster-api-grpc": [[3, 8, 1, "cmdoption-dagster-api-grpc-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-api-grpc-container-context", "--container-context"], [3, 8, 1, "cmdoption-dagster-api-grpc-container-image", "--container-image"], [3, 8, 1, "cmdoption-dagster-api-grpc-empty-working-directory", "--empty-working-directory"], [3, 8, 1, "cmdoption-dagster-api-grpc-fixed-server-id", "--fixed-server-id"], [3, 8, 1, "cmdoption-dagster-api-grpc-heartbeat", "--heartbeat"], [3, 8, 1, "cmdoption-dagster-api-grpc-heartbeat-timeout", "--heartbeat-timeout"], [3, 8, 1, "cmdoption-dagster-api-grpc-h", "--host"], [3, 8, 1, "cmdoption-dagster-api-grpc-inject-env-vars-from-instance", "--inject-env-vars-from-instance"], [3, 8, 1, "cmdoption-dagster-api-grpc-instance-ref", "--instance-ref"], [3, 8, 1, "cmdoption-dagster-api-grpc-lazy-load-user-code", "--lazy-load-user-code"], [3, 8, 1, "cmdoption-dagster-api-grpc-location-name", "--location-name"], [3, 8, 1, "cmdoption-dagster-api-grpc-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-api-grpc-n", "--max-workers"], [3, 8, 1, "cmdoption-dagster-api-grpc-n", "--max_workers"], [3, 8, 1, "cmdoption-dagster-api-grpc-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-api-grpc-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-api-grpc-p", "--port"], [3, 8, 1, "cmdoption-dagster-api-grpc-f", "--python-file"], [3, 8, 1, "cmdoption-dagster-api-grpc-s", "--socket"], [3, 8, 1, 
"cmdoption-dagster-api-grpc-use-python-environment-entry-point", "--use-python-environment-entry-point"], [3, 8, 1, "cmdoption-dagster-api-grpc-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-api-grpc-a", "-a"], [3, 8, 1, "cmdoption-dagster-api-grpc-d", "-d"], [3, 8, 1, "cmdoption-dagster-api-grpc-f", "-f"], [3, 8, 1, "cmdoption-dagster-api-grpc-h", "-h"], [3, 8, 1, "cmdoption-dagster-api-grpc-m", "-m"], [3, 8, 1, "cmdoption-dagster-api-grpc-n", "-n"], [3, 8, 1, "cmdoption-dagster-api-grpc-p", "-p"], [3, 8, 1, "cmdoption-dagster-api-grpc-s", "-s"]], "dagster-celery-worker-list": [[18, 8, 1, "cmdoption-dagster-celery-worker-list-y", "--config-yaml"], [18, 8, 1, "cmdoption-dagster-celery-worker-list-y", "-y"]], "dagster-celery-worker-start": [[18, 8, 1, "cmdoption-dagster-celery-worker-start-A", "--app"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-d", "--background"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-y", "--config-yaml"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-i", "--includes"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-l", "--loglevel"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-n", "--name"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-q", "--queue"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-A", "-A"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-d", "-d"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-i", "-i"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-l", "-l"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-n", "-n"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-q", "-q"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-y", "-y"], [18, 8, 1, "cmdoption-dagster-celery-worker-start-arg-ADDITIONAL_ARGS", "ADDITIONAL_ARGS"]], "dagster-celery-worker-terminate": [[18, 8, 1, "cmdoption-dagster-celery-worker-terminate-a", "--all"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-y", "--config-yaml"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-a", 
"-a"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-y", "-y"], [18, 8, 1, "cmdoption-dagster-celery-worker-terminate-arg-NAME", "NAME"]], "dagster-daemon-run": [[3, 8, 1, "cmdoption-dagster-daemon-run-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-daemon-run-code-server-log-level", "--code-server-log-level"], [3, 8, 1, "cmdoption-dagster-daemon-run-empty-workspace", "--empty-workspace"], [3, 8, 1, "cmdoption-dagster-daemon-run-grpc-host", "--grpc-host"], [3, 8, 1, "cmdoption-dagster-daemon-run-grpc-port", "--grpc-port"], [3, 8, 1, "cmdoption-dagster-daemon-run-grpc-socket", "--grpc-socket"], [3, 8, 1, "cmdoption-dagster-daemon-run-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-daemon-run-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-daemon-run-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-daemon-run-f", "--python-file"], [3, 8, 1, "cmdoption-dagster-daemon-run-use-ssl", "--use-ssl"], [3, 8, 1, "cmdoption-dagster-daemon-run-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-daemon-run-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-daemon-run-a", "-a"], [3, 8, 1, "cmdoption-dagster-daemon-run-d", "-d"], [3, 8, 1, "cmdoption-dagster-daemon-run-f", "-f"], [3, 8, 1, "cmdoption-dagster-daemon-run-m", "-m"], [3, 8, 1, "cmdoption-dagster-daemon-run-w", "-w"]], "dagster-dbt-project-scaffold": [[26, 8, 1, "cmdoption-dagster-dbt-project-scaffold-dbt-project-dir", "--dbt-project-dir"], [26, 8, 1, "cmdoption-dagster-dbt-project-scaffold-project-name", "--project-name"]], "dagster-dev": [[3, 8, 1, "cmdoption-dagster-dev-code-server-log-level", "--code-server-log-level"], [3, 8, 1, "cmdoption-dagster-dev-h", "--dagit-host"], [3, 8, 1, "cmdoption-dagster-dev-p", "--dagit-port"], [3, 8, 1, "cmdoption-dagster-dev-h", "--host"], [3, 8, 1, "cmdoption-dagster-dev-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-dev-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-dev-p", "--port"], [3, 8, 1, "cmdoption-dagster-dev-f", 
"--python-file"], [3, 8, 1, "cmdoption-dagster-dev-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-dev-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-dev-d", "-d"], [3, 8, 1, "cmdoption-dagster-dev-f", "-f"], [3, 8, 1, "cmdoption-dagster-dev-h", "-h"], [3, 8, 1, "cmdoption-dagster-dev-m", "-m"], [3, 8, 1, "cmdoption-dagster-dev-p", "-p"], [3, 8, 1, "cmdoption-dagster-dev-w", "-w"]], "dagster-graphql": [[3, 8, 1, "cmdoption-dagster-graphql-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-graphql-empty-workspace", "--empty-workspace"], [3, 8, 1, "cmdoption-dagster-graphql-ephemeral-instance", "--ephemeral-instance"], [3, 8, 1, "cmdoption-dagster-graphql-f", "--file"], [3, 8, 1, "cmdoption-dagster-graphql-grpc-host", "--grpc-host"], [3, 8, 1, "cmdoption-dagster-graphql-grpc-port", "--grpc-port"], [3, 8, 1, "cmdoption-dagster-graphql-grpc-socket", "--grpc-socket"], [3, 8, 1, "cmdoption-dagster-graphql-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-graphql-o", "--output"], [3, 8, 1, "cmdoption-dagster-graphql-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-graphql-p", "--predefined"], [3, 8, 1, "cmdoption-dagster-graphql-0", "--python-file"], [3, 8, 1, "cmdoption-dagster-graphql-r", "--remote"], [3, 8, 1, "cmdoption-dagster-graphql-t", "--text"], [3, 8, 1, "cmdoption-dagster-graphql-use-ssl", "--use-ssl"], [3, 8, 1, "cmdoption-dagster-graphql-v", "--variables"], [3, 8, 1, "cmdoption-dagster-graphql-version", "--version"], [3, 8, 1, "cmdoption-dagster-graphql-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-graphql-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-graphql-a", "-a"], [3, 8, 1, "cmdoption-dagster-graphql-d", "-d"], [3, 8, 1, "cmdoption-dagster-graphql-0", "-f"], [3, 8, 1, "cmdoption-dagster-graphql-m", "-m"], [3, 8, 1, "cmdoption-dagster-graphql-o", "-o"], [3, 8, 1, "cmdoption-dagster-graphql-p", "-p"], [3, 8, 1, "cmdoption-dagster-graphql-r", "-r"], [3, 8, 1, "cmdoption-dagster-graphql-t", "-t"], [3, 8, 1, 
"cmdoption-dagster-graphql-v", "-v"], [3, 8, 1, "cmdoption-dagster-graphql-w", "-w"]], "dagster-webserver": [[3, 8, 1, "cmdoption-dagster-webserver-a", "--attribute"], [3, 8, 1, "cmdoption-dagster-webserver-code-server-log-level", "--code-server-log-level"], [3, 8, 1, "cmdoption-dagster-webserver-dagster-log-level", "--dagster-log-level"], [3, 8, 1, "cmdoption-dagster-webserver-db-pool-recycle", "--db-pool-recycle"], [3, 8, 1, "cmdoption-dagster-webserver-db-statement-timeout", "--db-statement-timeout"], [3, 8, 1, "cmdoption-dagster-webserver-empty-workspace", "--empty-workspace"], [3, 8, 1, "cmdoption-dagster-webserver-grpc-host", "--grpc-host"], [3, 8, 1, "cmdoption-dagster-webserver-grpc-port", "--grpc-port"], [3, 8, 1, "cmdoption-dagster-webserver-grpc-socket", "--grpc-socket"], [3, 8, 1, "cmdoption-dagster-webserver-h", "--host"], [3, 8, 1, "cmdoption-dagster-webserver-uvicorn-log-level", "--log-level"], [3, 8, 1, "cmdoption-dagster-webserver-m", "--module-name"], [3, 8, 1, "cmdoption-dagster-webserver-package-name", "--package-name"], [3, 8, 1, "cmdoption-dagster-webserver-l", "--path-prefix"], [3, 8, 1, "cmdoption-dagster-webserver-p", "--port"], [3, 8, 1, "cmdoption-dagster-webserver-f", "--python-file"], [3, 8, 1, "cmdoption-dagster-webserver-read-only", "--read-only"], [3, 8, 1, "cmdoption-dagster-webserver-suppress-warnings", "--suppress-warnings"], [3, 8, 1, "cmdoption-dagster-webserver-use-ssl", "--use-ssl"], [3, 8, 1, "cmdoption-dagster-webserver-uvicorn-log-level", "--uvicorn-log-level"], [3, 8, 1, "cmdoption-dagster-webserver-version", "--version"], [3, 8, 1, "cmdoption-dagster-webserver-d", "--working-directory"], [3, 8, 1, "cmdoption-dagster-webserver-w", "--workspace"], [3, 8, 1, "cmdoption-dagster-webserver-a", "-a"], [3, 8, 1, "cmdoption-dagster-webserver-d", "-d"], [3, 8, 1, "cmdoption-dagster-webserver-f", "-f"], [3, 8, 1, "cmdoption-dagster-webserver-h", "-h"], [3, 8, 1, "cmdoption-dagster-webserver-l", "-l"], [3, 8, 1, 
"cmdoption-dagster-webserver-m", "-m"], [3, 8, 1, "cmdoption-dagster-webserver-p", "-p"], [3, 8, 1, "cmdoption-dagster-webserver-w", "-w"]], "dagster.Array": [[4, 1, 1, "", "description"]], "dagster.AssetCheckResult": [[1, 2, 1, "", "asset_key"], [1, 2, 1, "", "check_name"], [1, 2, 1, "", "metadata"], [1, 2, 1, "", "passed"], [1, 2, 1, "", "severity"]], "dagster.AssetDep": [[2, 2, 1, "", "asset"], [2, 2, 1, "", "partition_mapping"]], "dagster.AssetIn": [[2, 2, 1, "", "dagster_type"], [2, 2, 1, "", "key"], [2, 2, 1, "", "key_prefix"], [2, 2, 1, "", "metadata"], [2, 2, 1, "", "partition_mapping"]], "dagster.AssetMaterialization": [[63, 3, 1, "", "file"]], "dagster.AssetOut": [[2, 2, 1, "", "auto_materialize_policy"], [2, 2, 1, "", "backfill_policy"], [2, 2, 1, "", "code_version"], [2, 2, 1, "", "dagster_type"], [2, 2, 1, "", "description"], [2, 2, 1, "", "freshness_policy"], [2, 2, 1, "", "group_name"], [2, 2, 1, "", "io_manager_key"], [2, 2, 1, "", "is_required"], [2, 2, 1, "", "key"], [2, 2, 1, "", "key_prefix"], [2, 2, 1, "", "metadata"]], "dagster.AssetSelection": [[2, 3, 1, "", "all"], [2, 3, 1, "", "all_asset_checks"], [2, 3, 1, "", "assets"], [2, 3, 1, "", "checks"], [2, 3, 1, "", "checks_for_assets"], [2, 3, 1, "", "downstream"], [2, 3, 1, "", "groups"], [2, 3, 1, "", "key_prefixes"], [2, 3, 1, "", "keys"], [2, 3, 1, "", "required_multi_asset_neighbors"], [2, 3, 1, "", "roots"], [2, 3, 1, "", "sinks"], [2, 3, 1, "", "sources"], [2, 3, 1, "", "upstream"], [2, 3, 1, "", "upstream_source_assets"], [2, 3, 1, "", "without_checks"]], "dagster.AssetSensorDefinition": [[67, 1, 1, "", "asset_key"]], "dagster.AssetSpec": [[2, 2, 1, "", "auto_materialize_policy"], [2, 2, 1, "", "backfill_policy"], [2, 2, 1, "", "code_version"], [2, 2, 1, "", "deps"], [2, 2, 1, "", "description"], [2, 2, 1, "", "freshness_policy"], [2, 2, 1, "", "group_name"], [2, 2, 1, "", "key"], [2, 2, 1, "", "metadata"], [2, 2, 1, "", "skippable"]], "dagster.AssetValueLoader": [[2, 3, 1, "", 
"load_asset_value"]], "dagster.AssetsDefinition": [[2, 1, 1, "", "asset_deps"], [2, 1, 1, "", "can_subset"], [2, 1, 1, "", "check_specs"], [2, 1, 1, "", "dependency_keys"], [2, 1, 1, "", "descriptions_by_key"], [2, 3, 1, "", "from_graph"], [2, 3, 1, "", "from_op"], [2, 3, 1, "", "get_partition_mapping"], [2, 1, 1, "", "group_names_by_key"], [2, 1, 1, "", "key"], [2, 1, 1, "", "keys"], [2, 1, 1, "", "node_def"], [2, 1, 1, "", "op"], [2, 1, 1, "", "partitions_def"], [2, 1, 1, "", "required_resource_keys"], [2, 1, 1, "", "resource_defs"], [2, 3, 1, "", "to_source_asset"], [2, 3, 1, "", "to_source_assets"]], "dagster.AutoMaterializePolicy": [[2, 3, 1, "", "eager"], [2, 3, 1, "", "lazy"], [2, 3, 1, "", "with_rules"], [2, 3, 1, "", "without_rules"]], "dagster.AutoMaterializeRule": [[2, 3, 1, "", "materialize_on_missing"], [2, 3, 1, "", "materialize_on_parent_updated"], [2, 3, 1, "", "materialize_on_required_for_freshness"], [2, 2, 1, "", "require_update_for_all_parent_partitions"], [2, 3, 1, "", "skip_on_not_all_parents_updated"], [2, 3, 1, "", "skip_on_parent_missing"], [2, 3, 1, "", "skip_on_parent_outdated"]], "dagster.BackfillPolicy": [[64, 3, 1, "", "multi_run"], [64, 3, 1, "", "single_run"]], "dagster.DagsterAssetMetadataValue": [[63, 1, 1, "", "value"]], "dagster.DagsterError": [[7, 1, 1, "", "is_user_code_error"]], "dagster.DagsterEvent": [[8, 1, 1, "", "asset_key"], [8, 2, 1, "", "event_specific_data"], [8, 1, 1, "", "event_type"], [8, 2, 1, "", "event_type_value"], [8, 1, 1, "", "is_asset_materialization_planned"], [8, 1, 1, "", "is_asset_observation"], [8, 1, 1, "", "is_engine_event"], [8, 1, 1, "", "is_expectation_result"], [8, 1, 1, "", "is_failure"], [8, 1, 1, "", "is_handled_output"], [8, 1, 1, "", "is_hook_event"], [8, 1, 1, "", "is_loaded_input"], [8, 1, 1, "", "is_resource_init_failure"], [8, 1, 1, "", "is_step_event"], [8, 1, 1, "", "is_step_failure"], [8, 1, 1, "", "is_step_materialization"], [8, 1, 1, "", "is_step_restarted"], [8, 1, 1, "", 
"is_step_skipped"], [8, 1, 1, "", "is_step_start"], [8, 1, 1, "", "is_step_success"], [8, 1, 1, "", "is_step_up_for_retry"], [8, 1, 1, "", "is_successful_output"], [8, 2, 1, "", "job_name"], [8, 2, 1, "", "logging_tags"], [8, 2, 1, "", "message"], [8, 2, 1, "", "node_handle"], [8, 1, 1, "", "partition"], [8, 2, 1, "", "pid"], [8, 2, 1, "", "step_key"], [8, 2, 1, "", "step_kind_value"]], "dagster.DagsterInstance": [[11, 3, 1, "", "add_dynamic_partitions"], [11, 3, 1, "", "delete_dynamic_partition"], [11, 3, 1, "", "delete_run"], [11, 3, 1, "", "ephemeral"], [11, 3, 1, "", "get"], [11, 3, 1, "", "get_asset_keys"], [11, 3, 1, "", "get_asset_records"], [11, 3, 1, "", "get_dynamic_partitions"], [11, 3, 1, "", "get_event_records"], [11, 3, 1, "", "get_latest_materialization_code_versions"], [11, 3, 1, "", "get_latest_materialization_event"], [11, 3, 1, "", "get_run_by_id"], [11, 3, 1, "", "get_run_record_by_id"], [11, 3, 1, "", "get_run_records"], [11, 3, 1, "", "get_status_by_partition"], [11, 3, 1, "", "has_asset_key"], [11, 3, 1, "", "has_dynamic_partition"], [11, 3, 1, "", "local_temp"], [11, 3, 1, "", "wipe_assets"]], "dagster.DagsterRun": [[11, 1, 1, "", "is_failure"], [11, 1, 1, "", "is_failure_or_canceled"], [11, 1, 1, "", "is_finished"], [11, 1, 1, "", "is_resume_retry"], [11, 1, 1, "", "is_success"]], "dagster.DagsterRunMetadataValue": [[63, 1, 1, "", "value"]], "dagster.DagsterType": [[68, 1, 1, "", "description"], [68, 1, 1, "", "display_name"], [68, 1, 1, "", "has_unique_name"], [68, 1, 1, "", "loader"], [68, 1, 1, "", "required_resource_keys"], [68, 3, 1, "", "type_check"], [68, 1, 1, "", "typing_type"], [68, 1, 1, "", "unique_name"]], "dagster.DagsterTypeLoaderContext": [[68, 1, 1, "", "job_def"], [68, 1, 1, "", "op_def"], [68, 1, 1, "", "resources"]], "dagster.DagsterUserCodeExecutionError": [[7, 1, 1, "", "is_user_code_error"]], "dagster.Definitions": [[5, 3, 1, "", "get_asset_value_loader"], [5, 3, 1, "", "get_job_def"], [5, 3, 1, "", 
"get_schedule_def"], [5, 3, 1, "", "get_sensor_def"], [5, 3, 1, "", "load_asset_value"]], "dagster.DependencyDefinition": [[9, 3, 1, "", "is_fan_in"]], "dagster.DynamicOutput": [[6, 1, 1, "", "mapping_key"], [6, 1, 1, "", "output_name"], [6, 1, 1, "", "value"]], "dagster.DynamicPartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"]], "dagster.EventLogEntry": [[11, 1, 1, "", "dagster_event_type"], [11, 3, 1, "", "get_dagster_event"], [11, 1, 1, "", "is_dagster_event"], [11, 1, 1, "", "message"]], "dagster.ExecuteInProcessResult": [[8, 1, 1, "", "all_events"], [8, 3, 1, "", "asset_value"], [8, 1, 1, "", "dagster_run"], [8, 1, 1, "", "job_def"], [8, 3, 1, "", "output_for_node"], [8, 3, 1, "", "output_value"], [8, 1, 1, "", "run_id"]], "dagster.Executor": [[11, 3, 1, "", "execute"], [11, 1, 1, "", "retries"]], "dagster.ExecutorDefinition": [[11, 3, 1, "", "configured"], [11, 1, 1, "", "description"], [11, 1, 1, "", "executor_creation_fn"], [11, 1, 1, "", "name"]], "dagster.Field": [[4, 1, 1, "", "default_provided"], [4, 1, 1, "", "default_value"], [4, 1, 1, "", "description"], [4, 1, 1, "", "is_required"]], "dagster.FileHandle": [[11, 1, 1, "", "path_desc"]], "dagster.GraphDefinition": [[9, 3, 1, "", "alias"], [9, 1, 1, "", "config_mapping"], [9, 3, 1, "", "execute_in_process"], [9, 1, 1, "", "input_mappings"], [9, 1, 1, "", "name"], [9, 1, 1, "", "output_mappings"], [9, 3, 1, "", "tag"], [9, 1, 1, "", "tags"], [9, 3, 1, "", "to_job"], [9, 3, 1, "", "with_hooks"], [9, 3, 1, "", "with_retry_policy"]], "dagster.HookContext": [[10, 1, 1, "", "hook_def"], [10, 1, 1, "", "instance"], [10, 1, 1, "", "job_name"], [10, 1, 1, "", "log"], [10, 1, 1, "", "op_config"], [10, 1, 1, "", "op_exception"], [10, 1, 1, "", "op_output_values"], [10, 1, 1, "", "required_resource_keys"], [10, 1, 1, "", "resources"], [10, 1, 1, "", "run_id"], [10, 1, 1, "", "step_key"]], "dagster.IOManager": [[12, 3, 1, "", "handle_output"], [12, 3, 1, "", "load_input"]], "dagster.IOManagerDefinition": 
[[12, 3, 1, "", "hardcoded_io_manager"]], "dagster.InitExecutorContext": [[11, 2, 1, "", "executor_config"], [11, 2, 1, "", "executor_def"], [11, 2, 1, "", "instance"], [11, 2, 1, "", "job"]], "dagster.InitLoggerContext": [[61, 1, 1, "", "logger_config"], [61, 1, 1, "", "logger_def"], [61, 1, 1, "", "run_id"]], "dagster.InitResourceContext": [[66, 1, 1, "", "instance"], [66, 1, 1, "", "log"], [66, 1, 1, "", "log_manager"], [66, 1, 1, "", "resource_config"], [66, 1, 1, "", "resource_def"], [66, 1, 1, "", "resources"], [66, 1, 1, "", "run_id"]], "dagster.InputContext": [[12, 1, 1, "", "asset_key"], [12, 1, 1, "", "asset_partition_key"], [12, 1, 1, "", "asset_partition_key_range"], [12, 1, 1, "", "asset_partition_keys"], [12, 1, 1, "", "asset_partitions_def"], [12, 1, 1, "", "asset_partitions_time_window"], [12, 1, 1, "", "config"], [12, 1, 1, "", "dagster_type"], [12, 3, 1, "", "get_asset_identifier"], [12, 3, 1, "", "get_identifier"], [12, 1, 1, "", "has_asset_key"], [12, 1, 1, "", "has_asset_partitions"], [12, 1, 1, "", "has_input_name"], [12, 1, 1, "", "has_partition_key"], [12, 1, 1, "", "log"], [12, 1, 1, "", "metadata"], [12, 1, 1, "", "name"], [12, 1, 1, "", "op_def"], [12, 1, 1, "", "partition_key"], [12, 1, 1, "", "resource_config"], [12, 1, 1, "", "resources"], [12, 1, 1, "", "upstream_output"]], "dagster.JobDefinition": [[13, 1, 1, "", "config_mapping"], [13, 3, 1, "", "execute_in_process"], [13, 1, 1, "", "executor_def"], [13, 1, 1, "", "has_specified_executor"], [13, 1, 1, "", "has_specified_loggers"], [13, 1, 1, "", "loggers"], [13, 1, 1, "", "partitioned_config"], [13, 1, 1, "", "partitions_def"], [13, 1, 1, "", "resource_defs"], [13, 3, 1, "", "run_request_for_partition"], [13, 3, 1, "", "with_hooks"], [13, 3, 1, "", "with_top_level_resources"]], "dagster.JobExecutionResult": [[8, 1, 1, "", "all_events"], [8, 1, 1, "", "dagster_run"], [8, 1, 1, "", "job_def"], [8, 3, 1, "", "output_for_node"], [8, 3, 1, "", "output_value"], [8, 1, 1, "", "run_id"]], 
"dagster.JsonMetadataValue": [[63, 1, 1, "", "value"]], "dagster.LocalFileHandle": [[11, 1, 1, "", "path"], [11, 1, 1, "", "path_desc"]], "dagster.LoggerDefinition": [[61, 1, 1, "", "config_schema"], [61, 1, 1, "", "description"], [61, 1, 1, "", "logger_fn"]], "dagster.Map": [[4, 1, 1, "", "key_label_name"]], "dagster.MarkdownMetadataValue": [[63, 1, 1, "", "value"]], "dagster.MaterializeResult": [[2, 2, 1, "", "asset_key"], [2, 2, 1, "", "metadata"]], "dagster.MemoizableIOManager": [[62, 3, 1, "", "has_output"]], "dagster.MetadataValue": [[63, 3, 1, "", "asset"], [63, 3, 1, "", "bool"], [63, 3, 1, "", "dagster_run"], [63, 3, 1, "", "float"], [63, 3, 1, "", "int"], [63, 3, 1, "", "json"], [63, 3, 1, "", "md"], [63, 3, 1, "", "notebook"], [63, 3, 1, "", "null"], [63, 3, 1, "", "path"], [63, 3, 1, "", "python_artifact"], [63, 3, 1, "", "table"], [63, 3, 1, "", "table_schema"], [63, 3, 1, "", "text"], [63, 3, 1, "", "url"], [63, 1, 1, "", "value"]], "dagster.MultiAssetSensorEvaluationContext": [[67, 3, 1, "", "advance_all_cursors"], [67, 3, 1, "", "advance_cursor"], [67, 3, 1, "", "all_partitions_materialized"], [67, 1, 1, "", "asset_keys"], [67, 1, 1, "", "assets_defs_by_key"], [67, 2, 1, "", "cursor"], [67, 2, 1, "", "definitions"], [67, 3, 1, "", "get_cursor_partition"], [67, 3, 1, "", "get_downstream_partition_keys"], [67, 3, 1, "", "get_trailing_unconsumed_events"], [67, 2, 1, "", "instance"], [67, 2, 1, "", "instance_ref"], [67, 2, 1, "", "last_completion_time"], [67, 2, 1, "", "last_run_key"], [67, 3, 1, "", "latest_materialization_records_by_key"], [67, 3, 1, "", "latest_materialization_records_by_partition"], [67, 3, 1, "", "latest_materialization_records_by_partition_and_asset"], [67, 3, 1, "", "materialization_records_for_key"], [67, 2, 1, "", "monitored_assets"], [67, 2, 1, "", "repository_def"], [67, 2, 1, "", "repository_name"]], "dagster.MultiDependencyDefinition": [[9, 3, 1, "", "get_dependencies_and_mappings"], [9, 3, 1, "", "get_node_dependencies"], 
[9, 3, 1, "", "is_fan_in"]], "dagster.MultiPartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"], [64, 2, 1, "", "partitions_defs"]], "dagster.NotebookMetadataValue": [[63, 1, 1, "", "value"]], "dagster.OpDefinition": [[63, 3, 1, "", "alias"], [63, 1, 1, "", "config_schema"], [63, 1, 1, "", "ins"], [63, 1, 1, "", "name"], [63, 1, 1, "", "outs"], [63, 1, 1, "", "required_resource_keys"], [63, 1, 1, "", "retry_policy"], [63, 3, 1, "", "tag"], [63, 1, 1, "", "tags"], [63, 1, 1, "", "version"], [63, 3, 1, "", "with_hooks"], [63, 3, 1, "", "with_retry_policy"]], "dagster.OpExecutionContext": [[8, 3, 1, "", "add_output_metadata"], [8, 1, 1, "", "asset_checks_def"], [8, 1, 1, "", "asset_key"], [8, 3, 1, "", "asset_key_for_input"], [8, 3, 1, "", "asset_key_for_output"], [8, 3, 1, "", "asset_partition_key_for_input"], [8, 3, 1, "", "asset_partition_key_for_output"], [8, 1, 1, "", "asset_partition_key_range"], [8, 3, 1, "", "asset_partition_key_range_for_input"], [8, 3, 1, "", "asset_partition_key_range_for_output"], [8, 3, 1, "", "asset_partition_keys_for_input"], [8, 3, 1, "", "asset_partition_keys_for_output"], [8, 3, 1, "", "asset_partitions_def_for_input"], [8, 3, 1, "", "asset_partitions_def_for_output"], [8, 3, 1, "", "asset_partitions_time_window_for_input"], [8, 3, 1, "", "asset_partitions_time_window_for_output"], [8, 1, 1, "", "assets_def"], [8, 3, 1, "", "get_asset_provenance"], [8, 3, 1, "", "get_mapping_key"], [8, 3, 1, "", "get_tag"], [8, 1, 1, "", "has_asset_checks_def"], [8, 1, 1, "", "has_assets_def"], [8, 1, 1, "", "has_partition_key"], [8, 3, 1, "", "has_tag"], [8, 1, 1, "", "instance"], [8, 1, 1, "", "job_def"], [8, 1, 1, "", "job_name"], [8, 1, 1, "", "log"], [8, 3, 1, "", "log_event"], [8, 1, 1, "", "op_config"], [8, 1, 1, "", "op_def"], [8, 3, 1, "", "output_for_asset_key"], [8, 1, 1, "", "partition_key"], [8, 1, 1, "", "partition_key_range"], [8, 1, 1, "", "partition_time_window"], [8, 1, 1, "", "pdb"], [8, 1, 1, "", "resources"], [8, 1, 1, 
"", "retry_number"], [8, 1, 1, "", "run_config"], [8, 1, 1, "", "run_id"], [8, 1, 1, "", "selected_asset_check_keys"], [8, 1, 1, "", "selected_asset_keys"], [8, 1, 1, "", "selected_output_names"]], "dagster.OpVersionContext": [[62, 2, 1, "", "op_config"], [62, 2, 1, "", "op_def"]], "dagster.Output": [[63, 1, 1, "", "data_version"], [63, 1, 1, "", "output_name"], [63, 1, 1, "", "value"]], "dagster.OutputContext": [[12, 3, 1, "", "add_output_metadata"], [12, 1, 1, "", "asset_key"], [12, 1, 1, "", "asset_partition_key"], [12, 1, 1, "", "asset_partition_key_range"], [12, 1, 1, "", "asset_partition_keys"], [12, 1, 1, "", "asset_partitions_def"], [12, 1, 1, "", "asset_partitions_time_window"], [12, 1, 1, "", "config"], [12, 1, 1, "", "dagster_type"], [12, 3, 1, "", "get_asset_identifier"], [12, 3, 1, "", "get_identifier"], [12, 1, 1, "", "has_asset_key"], [12, 1, 1, "", "has_asset_partitions"], [12, 1, 1, "", "has_partition_key"], [12, 1, 1, "", "log"], [12, 3, 1, "", "log_event"], [12, 1, 1, "", "mapping_key"], [12, 1, 1, "", "metadata"], [12, 1, 1, "", "name"], [12, 1, 1, "", "op_def"], [12, 1, 1, "", "partition_key"], [12, 1, 1, "", "resource_config"], [12, 1, 1, "", "resources"], [12, 1, 1, "", "run_id"], [12, 1, 1, "", "step_key"], [12, 1, 1, "", "version"]], "dagster.PartitionKeyRange": [[64, 2, 1, "", "end"], [64, 2, 1, "", "start"]], "dagster.PartitionMapping": [[64, 3, 1, "", "get_downstream_partitions_for_partitions"], [64, 3, 1, "", "get_upstream_mapped_partitions_result_for_partitions"]], "dagster.PartitionedConfig": [[64, 3, 1, "", "get_partition_keys"], [64, 1, 1, "", "partitions_def"], [64, 1, 1, "", "run_config_for_partition_fn"], [64, 1, 1, "", "run_config_for_partition_key_fn"], [64, 1, 1, "", "tags_for_partition_fn"], [64, 1, 1, "", "tags_for_partition_key_fn"]], "dagster.PartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"]], "dagster.PathMetadataValue": [[63, 1, 1, "", "value"]], "dagster.PythonArtifactMetadataValue": [[63, 1, 1, "", 
"value"]], "dagster.RepositoryData": [[65, 3, 1, "", "get_all_jobs"], [65, 3, 1, "", "get_all_schedules"], [65, 3, 1, "", "get_all_sensors"], [65, 3, 1, "", "get_assets_defs_by_key"], [65, 3, 1, "", "get_job"], [65, 3, 1, "", "get_job_names"], [65, 3, 1, "", "get_schedule"], [65, 3, 1, "", "get_schedule_names"], [65, 3, 1, "", "get_sensor"], [65, 3, 1, "", "get_sensor_names"], [65, 3, 1, "", "get_source_assets_by_key"], [65, 3, 1, "", "has_job"], [65, 3, 1, "", "has_schedule"], [65, 3, 1, "", "has_sensor"]], "dagster.RepositoryDefinition": [[65, 1, 1, "", "description"], [65, 3, 1, "", "get_all_jobs"], [65, 3, 1, "", "get_asset_value_loader"], [65, 3, 1, "", "get_job"], [65, 3, 1, "", "get_schedule_def"], [65, 3, 1, "", "get_sensor_def"], [65, 3, 1, "", "has_job"], [65, 3, 1, "", "has_schedule_def"], [65, 3, 1, "", "has_sensor_def"], [65, 1, 1, "", "job_names"], [65, 3, 1, "", "load_asset_value"], [65, 1, 1, "", "metadata"], [65, 1, 1, "", "name"], [65, 1, 1, "", "schedule_defs"], [65, 1, 1, "", "sensor_defs"]], "dagster.ResourceDefinition": [[66, 1, 1, "", "description"], [66, 3, 1, "", "hardcoded_resource"], [66, 3, 1, "", "mock_resource"], [66, 3, 1, "", "none_resource"], [66, 1, 1, "", "required_resource_keys"], [66, 3, 1, "", "string_resource"], [66, 1, 1, "", "version"]], "dagster.ResourceVersionContext": [[62, 2, 1, "", "resource_config"], [62, 2, 1, "", "resource_def"]], "dagster.RunFailureSensorContext": [[67, 2, 1, "", "dagster_run"], [67, 1, 1, "", "failure_event"], [67, 3, 1, "", "get_step_failure_events"], [67, 2, 1, "", "sensor_name"]], "dagster.RunRequest": [[67, 2, 1, "", "asset_selection"], [67, 2, 1, "", "job_name"], [67, 2, 1, "", "partition_key"], [67, 2, 1, "", "run_key"], [67, 2, 1, "", "stale_assets_only"], [67, 2, 1, "", "tags"]], "dagster.RunStatusSensorContext": [[67, 1, 1, "", "dagster_event"], [67, 1, 1, "", "dagster_run"], [67, 1, 1, "", "instance"], [67, 1, 1, "", "log"], [67, 1, 1, "", "partition_key"], [67, 1, 1, "", "sensor_name"]], 
"dagster.ScheduleDefinition": [[67, 1, 1, "", "cron_schedule"], [67, 1, 1, "", "default_status"], [67, 1, 1, "", "description"], [67, 1, 1, "", "environment_vars"], [67, 1, 1, "", "execution_timezone"], [67, 1, 1, "", "job"], [67, 1, 1, "", "job_name"], [67, 1, 1, "", "name"], [67, 1, 1, "", "required_resource_keys"]], "dagster.ScheduleEvaluationContext": [[67, 1, 1, "", "instance"], [67, 1, 1, "", "resources"], [67, 1, 1, "", "scheduled_execution_time"]], "dagster.SensorDefinition": [[67, 1, 1, "", "default_status"], [67, 1, 1, "", "description"], [67, 1, 1, "", "job"], [67, 1, 1, "", "job_name"], [67, 1, 1, "", "jobs"], [67, 1, 1, "", "minimum_interval_seconds"], [67, 1, 1, "", "name"], [67, 1, 1, "", "required_resource_keys"]], "dagster.SensorResult": [[67, 2, 1, "", "asset_events"], [67, 2, 1, "", "cursor"], [67, 2, 1, "", "run_requests"], [67, 2, 1, "", "skip_reason"]], "dagster.SkipReason": [[67, 2, 1, "", "skip_message"]], "dagster.SourceAsset": [[2, 2, 1, "", "description"], [2, 2, 1, "", "io_manager_def"], [2, 2, 1, "", "io_manager_key"], [2, 1, 1, "", "is_observable"], [2, 2, 1, "", "key"], [2, 2, 1, "", "metadata"], [2, 2, 1, "", "observe_fn"], [2, 1, 1, "", "op"], [2, 2, 1, "", "partitions_def"], [2, 2, 1, "", "resource_defs"]], "dagster.SourceHashVersionStrategy": [[62, 3, 1, "", "get_op_version"], [62, 3, 1, "", "get_resource_version"]], "dagster.StaticPartitionsDefinition": [[64, 3, 1, "", "get_partition_keys"]], "dagster.TableMetadataValue": [[63, 3, 1, "", "infer_column_type"], [63, 1, 1, "", "value"]], "dagster.TableSchema": [[63, 3, 1, "", "from_name_type_dict"]], "dagster.TableSchemaMetadataValue": [[63, 1, 1, "", "value"]], "dagster.TextMetadataValue": [[63, 1, 1, "", "value"]], "dagster.TimeWindow": [[64, 2, 1, "", "end"], [64, 2, 1, "", "start"]], "dagster.TimeWindowPartitionMapping": [[64, 2, 1, "", "allow_nonexistent_upstream_partitions"], [64, 2, 1, "", "end_offset"], [64, 2, 1, "", "start_offset"]], 
"dagster.TimeWindowPartitionsDefinition": [[64, 1, 1, "", "day_offset"], [64, 3, 1, "", "get_cron_schedule"], [64, 1, 1, "", "hour_offset"], [64, 1, 1, "", "minute_offset"], [64, 1, 1, "", "schedule_type"]], "dagster.TypeCheckContext": [[8, 1, 1, "", "log"], [8, 1, 1, "", "resources"], [8, 1, 1, "", "run_id"]], "dagster.UrlMetadataValue": [[63, 1, 1, "", "value"]], "dagster.VersionStrategy": [[62, 3, 1, "", "get_op_version"], [62, 3, 1, "", "get_resource_version"]], "dagster._core": [[7, 7, 0, "-", "errors"]], "dagster._core.errors": [[11, 6, 1, "", "user_code_error_boundary"]], "dagster._core.instance": [[11, 0, 1, "", "InstanceRef"]], "dagster._core.launcher": [[11, 0, 1, "", "DefaultRunLauncher"], [11, 0, 1, "", "RunLauncher"]], "dagster._core.run_coordinator": [[11, 0, 1, "", "DefaultRunCoordinator"], [11, 5, 1, "", "QueuedRunCoordinator"]], "dagster._core.scheduler": [[67, 5, 1, "", "DagsterDaemonScheduler"], [11, 0, 1, "", "Scheduler"]], "dagster._core.storage.base_storage": [[11, 0, 1, "", "DagsterStorage"]], "dagster._core.storage.captured_log_manager": [[11, 0, 1, "", "CapturedLogManager"]], "dagster._core.storage.compute_log_manager": [[11, 0, 1, "", "ComputeLogManager"]], "dagster._core.storage.dagster_run": [[11, 0, 1, "", "RunRecord"]], "dagster._core.storage.event_log": [[11, 0, 1, "", "AssetRecord"], [11, 0, 1, "", "ConsolidatedSqliteEventLogStorage"], [11, 0, 1, "", "EventLogStorage"], [11, 0, 1, "", "SqlEventLogStorage"], [11, 0, 1, "", "SqliteEventLogStorage"]], "dagster._core.storage.file_manager": [[11, 0, 1, "", "FileManager"]], "dagster._core.storage.file_manager.FileManager": [[11, 3, 1, "", "copy_handle_to_local_temp"], [11, 3, 1, "", "delete_local_temp"], [11, 3, 1, "", "read"], [11, 3, 1, "", "read_data"], [11, 3, 1, "", "write"], [11, 3, 1, "", "write_data"]], "dagster._core.storage.local_compute_log_manager": [[11, 0, 1, "", "LocalComputeLogManager"]], "dagster._core.storage.noop_compute_log_manager": [[11, 0, 1, "", 
"NoOpComputeLogManager"]], "dagster._core.storage.root": [[11, 0, 1, "", "LocalArtifactStorage"]], "dagster._core.storage.runs": [[11, 0, 1, "", "RunStorage"], [11, 0, 1, "", "SqlRunStorage"], [11, 0, 1, "", "SqliteRunStorage"]], "dagster._core.storage.schedules": [[11, 0, 1, "", "ScheduleStorage"], [11, 0, 1, "", "SqlScheduleStorage"], [11, 0, 1, "", "SqliteScheduleStorage"]], "dagster._loggers": [[61, 6, 1, "", "colored_console_logger"], [61, 6, 1, "", "json_console_logger"]], "dagster._serdes": [[11, 0, 1, "", "ConfigurableClass"], [11, 0, 1, "", "ConfigurableClassData"]], "dagster._utils.forked_pdb": [[69, 0, 1, "", "ForkedPdb"]], "dagster_airbyte": [[14, 0, 1, "", "AirbyteConnection"], [14, 0, 1, "", "AirbyteDestination"], [14, 0, 1, "", "AirbyteManagedElementReconciler"], [14, 5, 1, "", "AirbyteResource"], [14, 0, 1, "", "AirbyteSource"], [14, 0, 1, "", "AirbyteSyncMode"], [14, 5, 1, "", "airbyte_resource"], [14, 5, 1, "", "airbyte_sync_op"], [14, 6, 1, "", "build_airbyte_assets"], [14, 6, 1, "", "load_assets_from_airbyte_instance"], [14, 6, 1, "", "load_assets_from_airbyte_project"], [14, 6, 1, "", "load_assets_from_connections"]], "dagster_airbyte.AirbyteConnection": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteManagedElementReconciler": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.AirbyteSyncMode": [[14, 3, 1, "", "full_refresh_append"], [14, 3, 1, "", "full_refresh_overwrite"], [14, 3, 1, "", "incremental_append"], [14, 3, 1, "", "incremental_append_dedup"]], "dagster_airbyte.managed.generated.destinations": [[14, 0, 1, "", "AmazonSqsDestination"], [14, 0, 1, "", "AwsDatalakeDestination"], [14, 0, 1, "", "AzureBlobStorageDestination"], [14, 0, 1, "", "BigqueryDenormalizedDestination"], [14, 0, 1, "", "BigqueryDestination"], [14, 0, 1, "", "CassandraDestination"], [14, 0, 1, "", "ClickhouseDestination"], [14, 0, 1, 
"", "CsvDestination"], [14, 0, 1, "", "DatabricksDestination"], [14, 0, 1, "", "DynamodbDestination"], [14, 0, 1, "", "ElasticsearchDestination"], [14, 0, 1, "", "FireboltDestination"], [14, 0, 1, "", "FirestoreDestination"], [14, 0, 1, "", "GcsDestination"], [14, 0, 1, "", "GoogleSheetsDestination"], [14, 0, 1, "", "JdbcDestination"], [14, 0, 1, "", "KafkaDestination"], [14, 0, 1, "", "KeenDestination"], [14, 0, 1, "", "KinesisDestination"], [14, 0, 1, "", "KvdbDestination"], [14, 0, 1, "", "LocalJsonDestination"], [14, 0, 1, "", "MariadbColumnstoreDestination"], [14, 0, 1, "", "MeilisearchDestination"], [14, 0, 1, "", "MongodbDestination"], [14, 0, 1, "", "MqttDestination"], [14, 0, 1, "", "MssqlDestination"], [14, 0, 1, "", "MysqlDestination"], [14, 0, 1, "", "OracleDestination"], [14, 0, 1, "", "PostgresDestination"], [14, 0, 1, "", "PubsubDestination"], [14, 0, 1, "", "PulsarDestination"], [14, 0, 1, "", "R2Destination"], [14, 0, 1, "", "RabbitmqDestination"], [14, 0, 1, "", "RedisDestination"], [14, 0, 1, "", "RedshiftDestination"], [14, 0, 1, "", "RocksetDestination"], [14, 0, 1, "", "S3Destination"], [14, 0, 1, "", "ScaffoldDestinationPythonDestination"], [14, 0, 1, "", "ScyllaDestination"], [14, 0, 1, "", "SftpJsonDestination"], [14, 0, 1, "", "SnowflakeDestination"], [14, 0, 1, "", "SqliteDestination"], [14, 0, 1, "", "TidbDestination"]], "dagster_airbyte.managed.generated.destinations.AmazonSqsDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination": [[14, 0, 1, "", "IAMRole"], [14, 0, 1, "", "IAMUser"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination.IAMRole": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination.IAMUser": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination": [[14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", 
"JSONLinesNewlineDelimitedJSON"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination": [[14, 0, 1, "", "GCSStaging"], [14, 0, 1, "", "HMACKey"], [14, 0, 1, "", "StandardInserts"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination.GCSStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination.HMACKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination.StandardInserts": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination": [[14, 0, 1, "", "GCSStaging"], [14, 0, 1, "", "HMACKey"], [14, 0, 1, "", "StandardInserts"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination.GCSStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination.HMACKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.BigqueryDestination.StandardInserts": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.CassandraDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ClickhouseDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.CsvDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.DatabricksDestination": [[14, 0, 1, "", "AmazonS3"], [14, 0, 1, "", "AzureBlobStorage"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.DatabricksDestination.AmazonS3": [[14, 3, 1, "", 
"__init__"]], "dagster_airbyte.managed.generated.destinations.DatabricksDestination.AzureBlobStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.DynamodbDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination": [[14, 0, 1, "", "ApiKeySecret"], [14, 0, 1, "", "None_"], [14, 0, 1, "", "UsernamePassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination.ApiKeySecret": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination.None_": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ElasticsearchDestination.UsernamePassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FireboltDestination": [[14, 0, 1, "", "ExternalTableViaS3"], [14, 0, 1, "", "SQLInserts"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FireboltDestination.ExternalTableViaS3": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FireboltDestination.SQLInserts": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.FirestoreDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination": [[14, 0, 1, "", "AvroApacheAvro"], [14, 0, 1, "", "Bzip2"], [14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", "Deflate"], [14, 0, 1, "", "GZIP"], [14, 0, 1, "", "HMACKey"], [14, 0, 1, "", "JSONLinesNewlineDelimitedJSON"], [14, 0, 1, "", "NoCompression"], [14, 0, 1, "", "ParquetColumnarStorage"], [14, 0, 1, "", "Snappy"], [14, 0, 1, "", "Xz"], [14, 0, 1, "", "Zstandard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.AvroApacheAvro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Bzip2": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.GcsDestination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Deflate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.GZIP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.HMACKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.NoCompression": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.ParquetColumnarStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Snappy": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Xz": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GcsDestination.Zstandard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GoogleSheetsDestination": [[14, 0, 1, "", "AuthenticationViaGoogleOAuth"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.GoogleSheetsDestination.AuthenticationViaGoogleOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.JdbcDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination": [[14, 0, 1, "", "PLAINTEXT"], [14, 0, 1, "", "SASLPLAINTEXT"], [14, 0, 1, "", "SASLSSL"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination.PLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination.SASLPLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KafkaDestination.SASLSSL": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.KeenDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KinesisDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.KvdbDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.LocalJsonDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MariadbColumnstoreDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MeilisearchDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination": [[14, 0, 1, "", "LoginPassword"], [14, 0, 1, "", "MongoDBAtlas"], [14, 0, 1, "", "None_"], [14, 0, 1, "", "ReplicaSet"], [14, 0, 1, "", "StandaloneMongoDbInstance"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.LoginPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.MongoDBAtlas": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.None_": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.ReplicaSet": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MongodbDestination.StandaloneMongoDbInstance": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MqttDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination": [[14, 0, 1, "", "EncryptedTrustServerCertificate"], [14, 0, 1, "", "EncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination.EncryptedTrustServerCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination.EncryptedVerifyCertificate": [[14, 3, 1, "", 
"__init__"]], "dagster_airbyte.managed.generated.destinations.MssqlDestination.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.MysqlDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination": [[14, 0, 1, "", "NativeNetworkEncryptionNNE"], [14, 0, 1, "", "TLSEncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination.NativeNetworkEncryptionNNE": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination.TLSEncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.OracleDestination.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination": [[14, 0, 1, "", "Allow"], [14, 0, 1, "", "Disable"], [14, 0, 1, "", "Prefer"], [14, 0, 1, "", "Require"], [14, 0, 1, "", "VerifyCa"], [14, 0, 1, "", "VerifyFull"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Allow": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Disable": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Prefer": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.Require": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.VerifyCa": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PostgresDestination.VerifyFull": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PubsubDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.PulsarDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination": [[14, 0, 1, "", 
"AvroApacheAvro"], [14, 0, 1, "", "Bzip2"], [14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", "Deflate"], [14, 0, 1, "", "GZIP"], [14, 0, 1, "", "JSONLinesNewlineDelimitedJSON"], [14, 0, 1, "", "NoCompression"], [14, 0, 1, "", "Snappy"], [14, 0, 1, "", "Xz"], [14, 0, 1, "", "Zstandard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.AvroApacheAvro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Bzip2": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Deflate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.GZIP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.NoCompression": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Snappy": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Xz": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.R2Destination.Zstandard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RabbitmqDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedisDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination": [[14, 0, 1, "", "AESCBCEnvelopeEncryption"], [14, 0, 1, "", "NoEncryption"], [14, 0, 1, "", "S3Staging"], [14, 0, 1, "", "Standard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination.AESCBCEnvelopeEncryption": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.RedshiftDestination.NoEncryption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination.S3Staging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RedshiftDestination.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.RocksetDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination": [[14, 0, 1, "", "AvroApacheAvro"], [14, 0, 1, "", "Bzip2"], [14, 0, 1, "", "CSVCommaSeparatedValues"], [14, 0, 1, "", "Deflate"], [14, 0, 1, "", "GZIP"], [14, 0, 1, "", "JSONLinesNewlineDelimitedJSON"], [14, 0, 1, "", "NoCompression"], [14, 0, 1, "", "ParquetColumnarStorage"], [14, 0, 1, "", "Snappy"], [14, 0, 1, "", "Xz"], [14, 0, 1, "", "Zstandard"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.AvroApacheAvro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Bzip2": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.CSVCommaSeparatedValues": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Deflate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.GZIP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.JSONLinesNewlineDelimitedJSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.NoCompression": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.ParquetColumnarStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Snappy": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.S3Destination.Xz": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.S3Destination.Zstandard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ScaffoldDestinationPythonDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.ScyllaDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SftpJsonDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination": [[14, 0, 1, "", "AESCBCEnvelopeEncryption"], [14, 0, 1, "", "AWSS3Staging"], [14, 0, 1, "", "AzureBlobStorageStaging"], [14, 0, 1, "", "GoogleCloudStorageStaging"], [14, 0, 1, "", "KeyPairAuthentication"], [14, 0, 1, "", "NoEncryption"], [14, 0, 1, "", "OAuth20"], [14, 0, 1, "", "RecommendedInternalStaging"], [14, 0, 1, "", "SelectAnotherOption"], [14, 0, 1, "", "UsernameAndPassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.AESCBCEnvelopeEncryption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.AWSS3Staging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.AzureBlobStorageStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.GoogleCloudStorageStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.KeyPairAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.NoEncryption": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.RecommendedInternalStaging": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SnowflakeDestination.SelectAnotherOption": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.destinations.SnowflakeDestination.UsernameAndPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.SqliteDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.destinations.TidbDestination": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources": [[14, 0, 1, "", "AdjustSource"], [14, 0, 1, "", "AirtableSource"], [14, 0, 1, "", "AmazonAdsSource"], [14, 0, 1, "", "AmazonSellerPartnerSource"], [14, 0, 1, "", "AmazonSqsSource"], [14, 0, 1, "", "AmplitudeSource"], [14, 0, 1, "", "ApifyDatasetSource"], [14, 0, 1, "", "AppfollowSource"], [14, 0, 1, "", "AppsflyerSource"], [14, 0, 1, "", "AppstoreSingerSource"], [14, 0, 1, "", "AsanaSource"], [14, 0, 1, "", "AwsCloudtrailSource"], [14, 0, 1, "", "AzureTableSource"], [14, 0, 1, "", "BambooHrSource"], [14, 0, 1, "", "BigcommerceSource"], [14, 0, 1, "", "BigquerySource"], [14, 0, 1, "", "BingAdsSource"], [14, 0, 1, "", "BraintreeSource"], [14, 0, 1, "", "CartSource"], [14, 0, 1, "", "ChargebeeSource"], [14, 0, 1, "", "ChargifySource"], [14, 0, 1, "", "ChartmogulSource"], [14, 0, 1, "", "ClickhouseSource"], [14, 0, 1, "", "CloseComSource"], [14, 0, 1, "", "CockroachdbSource"], [14, 0, 1, "", "CommercetoolsSource"], [14, 0, 1, "", "ConfluenceSource"], [14, 0, 1, "", "CourierSource"], [14, 0, 1, "", "Db2Source"], [14, 0, 1, "", "DelightedSource"], [14, 0, 1, "", "DixaSource"], [14, 0, 1, "", "DockerhubSource"], [14, 0, 1, "", "DriftSource"], [14, 0, 1, "", "Dv360Source"], [14, 0, 1, "", "E2eTestSource"], [14, 0, 1, "", "ElasticsearchSource"], [14, 0, 1, "", "ExchangeRatesSource"], [14, 0, 1, "", "FacebookMarketingSource"], [14, 0, 1, "", "FacebookPagesSource"], [14, 0, 1, "", "FakerSource"], [14, 0, 1, "", "FaunaSource"], [14, 0, 1, "", "FileSecureSource"], [14, 0, 1, "", "FileSource"], [14, 0, 1, "", "FireboltSource"], [14, 0, 1, "", "FlexportSource"], [14, 0, 1, "", "FreshcallerSource"], [14, 0, 1, "", 
"FreshdeskSource"], [14, 0, 1, "", "FreshsalesSource"], [14, 0, 1, "", "FreshserviceSource"], [14, 0, 1, "", "GithubSource"], [14, 0, 1, "", "GitlabSource"], [14, 0, 1, "", "GlassfrogSource"], [14, 0, 1, "", "GocardlessSource"], [14, 0, 1, "", "GoogleAdsSource"], [14, 0, 1, "", "GoogleAnalyticsDataApiSource"], [14, 0, 1, "", "GoogleAnalyticsV4Source"], [14, 0, 1, "", "GoogleDirectorySource"], [14, 0, 1, "", "GoogleSearchConsoleSource"], [14, 0, 1, "", "GoogleSheetsSource"], [14, 0, 1, "", "GoogleWorkspaceAdminReportsSource"], [14, 0, 1, "", "GreenhouseSource"], [14, 0, 1, "", "GutendexSource"], [14, 0, 1, "", "HarvestSource"], [14, 0, 1, "", "HellobatonSource"], [14, 0, 1, "", "HubplannerSource"], [14, 0, 1, "", "HubspotSource"], [14, 0, 1, "", "InsightlySource"], [14, 0, 1, "", "InstagramSource"], [14, 0, 1, "", "IntercomSource"], [14, 0, 1, "", "IterableSource"], [14, 0, 1, "", "JdbcSource"], [14, 0, 1, "", "JiraSource"], [14, 0, 1, "", "KafkaSource"], [14, 0, 1, "", "KlaviyoSource"], [14, 0, 1, "", "KustomerSingerSource"], [14, 0, 1, "", "KyribaSource"], [14, 0, 1, "", "LemlistSource"], [14, 0, 1, "", "LeverHiringSource"], [14, 0, 1, "", "LinkedinAdsSource"], [14, 0, 1, "", "LinkedinPagesSource"], [14, 0, 1, "", "LinnworksSource"], [14, 0, 1, "", "LookerSource"], [14, 0, 1, "", "MailchimpSource"], [14, 0, 1, "", "MailgunSource"], [14, 0, 1, "", "MarketoSource"], [14, 0, 1, "", "MetabaseSource"], [14, 0, 1, "", "MicrosoftTeamsSource"], [14, 0, 1, "", "MixpanelSource"], [14, 0, 1, "", "MondaySource"], [14, 0, 1, "", "MongodbSource"], [14, 0, 1, "", "MongodbV2Source"], [14, 0, 1, "", "MssqlSource"], [14, 0, 1, "", "MyHoursSource"], [14, 0, 1, "", "MysqlSource"], [14, 0, 1, "", "NetsuiteSource"], [14, 0, 1, "", "NotionSource"], [14, 0, 1, "", "OktaSource"], [14, 0, 1, "", "OnesignalSource"], [14, 0, 1, "", "OpenweatherSource"], [14, 0, 1, "", "OracleSource"], [14, 0, 1, "", "OrbSource"], [14, 0, 1, "", "OrbitSource"], [14, 0, 1, "", "OutreachSource"], [14, 0, 1, "", 
"PardotSource"], [14, 0, 1, "", "PaypalTransactionSource"], [14, 0, 1, "", "PaystackSource"], [14, 0, 1, "", "PersistiqSource"], [14, 0, 1, "", "PinterestSource"], [14, 0, 1, "", "PipedriveSource"], [14, 0, 1, "", "PivotalTrackerSource"], [14, 0, 1, "", "PlaidSource"], [14, 0, 1, "", "PokeapiSource"], [14, 0, 1, "", "PostgresSource"], [14, 0, 1, "", "PosthogSource"], [14, 0, 1, "", "PrestashopSource"], [14, 0, 1, "", "PrimetricSource"], [14, 0, 1, "", "PythonHttpTutorialSource"], [14, 0, 1, "", "QualarooSource"], [14, 0, 1, "", "QuickbooksSingerSource"], [14, 0, 1, "", "RechargeSource"], [14, 0, 1, "", "RecurlySource"], [14, 0, 1, "", "RedshiftSource"], [14, 0, 1, "", "RetentlySource"], [14, 0, 1, "", "RkiCovidSource"], [14, 0, 1, "", "S3Source"], [14, 0, 1, "", "SalesforceSource"], [14, 0, 1, "", "SalesloftSource"], [14, 0, 1, "", "ScaffoldJavaJdbcSource"], [14, 0, 1, "", "ScaffoldSourceHttpSource"], [14, 0, 1, "", "ScaffoldSourcePythonSource"], [14, 0, 1, "", "SearchMetricsSource"], [14, 0, 1, "", "SendgridSource"], [14, 0, 1, "", "SentrySource"], [14, 0, 1, "", "SftpSource"], [14, 0, 1, "", "ShopifySource"], [14, 0, 1, "", "ShortioSource"], [14, 0, 1, "", "SlackSource"], [14, 0, 1, "", "SmartsheetsSource"], [14, 0, 1, "", "SnapchatMarketingSource"], [14, 0, 1, "", "SnowflakeSource"], [14, 0, 1, "", "SquareSource"], [14, 0, 1, "", "StockTickerApiTutorialSource"], [14, 0, 1, "", "StravaSource"], [14, 0, 1, "", "StripeSource"], [14, 0, 1, "", "SurveymonkeySource"], [14, 0, 1, "", "TalkdeskExploreSource"], [14, 0, 1, "", "TempoSource"], [14, 0, 1, "", "TidbSource"], [14, 0, 1, "", "TiktokMarketingSource"], [14, 0, 1, "", "TimelySource"], [14, 0, 1, "", "TplcentralSource"], [14, 0, 1, "", "TrelloSource"], [14, 0, 1, "", "TwilioSource"], [14, 0, 1, "", "TypeformSource"], [14, 0, 1, "", "UsCensusSource"], [14, 0, 1, "", "WebflowSource"], [14, 0, 1, "", "WhiskyHunterSource"], [14, 0, 1, "", "WoocommerceSource"], [14, 0, 1, "", "WrikeSource"], [14, 0, 1, "", 
"YahooFinancePriceSource"], [14, 0, 1, "", "YandexMetricaSource"], [14, 0, 1, "", "YoutubeAnalyticsSource"], [14, 0, 1, "", "ZendeskChatSource"], [14, 0, 1, "", "ZendeskSunshineSource"], [14, 0, 1, "", "ZendeskSupportSource"], [14, 0, 1, "", "ZendeskTalkSource"], [14, 0, 1, "", "ZenefitsSource"], [14, 0, 1, "", "ZenloopSource"], [14, 0, 1, "", "ZohoCrmSource"], [14, 0, 1, "", "ZoomSingerSource"], [14, 0, 1, "", "ZuoraSource"]], "dagster_airbyte.managed.generated.sources.AdjustSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AirtableSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AmazonAdsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AmazonSellerPartnerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AmazonSqsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AmplitudeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ApifyDatasetSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AppfollowSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AppsflyerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AppstoreSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AsanaSource": [[14, 0, 1, "", "OAuthCredentials"], [14, 0, 1, "", "PATCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AsanaSource.OAuthCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AsanaSource.PATCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AwsCloudtrailSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.AzureTableSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BambooHrSource": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.BigcommerceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BigquerySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BingAdsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.BraintreeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CartSource": [[14, 0, 1, "", "CentralAPIRouter"], [14, 0, 1, "", "SingleStoreAccessToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CartSource.CentralAPIRouter": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CartSource.SingleStoreAccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ChargebeeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ChargifySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ChartmogulSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ClickhouseSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CloseComSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CockroachdbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CommercetoolsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ConfluenceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.CourierSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Db2Source": [[14, 0, 1, "", "TLSEncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Db2Source.TLSEncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Db2Source.Unencrypted": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.DelightedSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DixaSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DockerhubSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DriftSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DriftSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.DriftSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Dv360Source": [[14, 0, 1, "", "Oauth2Credentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.Dv360Source.Oauth2Credentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.E2eTestSource": [[14, 0, 1, "", "MultiSchema"], [14, 0, 1, "", "SingleSchema"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.E2eTestSource.MultiSchema": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.E2eTestSource.SingleSchema": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource": [[14, 0, 1, "", "ApiKeySecret"], [14, 0, 1, "", "None_"], [14, 0, 1, "", "UsernamePassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource.ApiKeySecret": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource.None_": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ElasticsearchSource.UsernamePassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ExchangeRatesSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FacebookMarketingSource": [[14, 0, 1, "", "InsightConfig"], [14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.FacebookMarketingSource.InsightConfig": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FacebookPagesSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FakerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource": [[14, 0, 1, "", "Collection"], [14, 0, 1, "", "Disabled"], [14, 0, 1, "", "Enabled"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource.Collection": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource.Disabled": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FaunaSource.Enabled": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource": [[14, 0, 1, "", "AzBlobAzureBlobStorage"], [14, 0, 1, "", "GCSGoogleCloudStorage"], [14, 0, 1, "", "HTTPSPublicWeb"], [14, 0, 1, "", "S3AmazonWebServices"], [14, 0, 1, "", "SCPSecureCopyProtocol"], [14, 0, 1, "", "SFTPSecureFileTransferProtocol"], [14, 0, 1, "", "SSHSecureShell"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.AzBlobAzureBlobStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.GCSGoogleCloudStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.HTTPSPublicWeb": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.S3AmazonWebServices": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.SCPSecureCopyProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.SFTPSecureFileTransferProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSecureSource.SSHSecureShell": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource": [[14, 0, 1, "", 
"AzBlobAzureBlobStorage"], [14, 0, 1, "", "GCSGoogleCloudStorage"], [14, 0, 1, "", "HTTPSPublicWeb"], [14, 0, 1, "", "LocalFilesystemLimited"], [14, 0, 1, "", "S3AmazonWebServices"], [14, 0, 1, "", "SCPSecureCopyProtocol"], [14, 0, 1, "", "SFTPSecureFileTransferProtocol"], [14, 0, 1, "", "SSHSecureShell"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.AzBlobAzureBlobStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.GCSGoogleCloudStorage": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.HTTPSPublicWeb": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.LocalFilesystemLimited": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.S3AmazonWebServices": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.SCPSecureCopyProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.SFTPSecureFileTransferProtocol": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FileSource.SSHSecureShell": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FireboltSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FlexportSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshcallerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshdeskSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshsalesSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.FreshserviceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GithubSource": [[14, 0, 1, "", "OAuthCredentials"], [14, 0, 1, "", "PATCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GithubSource.OAuthCredentials": [[14, 3, 1, "", 
"__init__"]], "dagster_airbyte.managed.generated.sources.GithubSource.PATCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GitlabSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GlassfrogSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GocardlessSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAdsSource": [[14, 0, 1, "", "CustomGAQLQueriesEntry"], [14, 0, 1, "", "GoogleCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAdsSource.CustomGAQLQueriesEntry": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAdsSource.GoogleCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource": [[14, 0, 1, "", "AuthenticateViaGoogleOauth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source": [[14, 0, 1, "", "AuthenticateViaGoogleOauth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleDirectorySource": [[14, 0, 1, "", "ServiceAccountKey"], [14, 0, 1, "", "SignInViaGoogleOAuth"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleDirectorySource.ServiceAccountKey": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.GoogleDirectorySource.SignInViaGoogleOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource": [[14, 0, 1, "", "OAuth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource.OAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSheetsSource": [[14, 0, 1, "", "AuthenticateViaGoogleOAuth"], [14, 0, 1, "", "ServiceAccountKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSheetsSource.AuthenticateViaGoogleOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleSheetsSource.ServiceAccountKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GoogleWorkspaceAdminReportsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GreenhouseSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.GutendexSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HarvestSource": [[14, 0, 1, "", "AuthenticateViaHarvestOAuth"], [14, 0, 1, "", "AuthenticateWithPersonalAccessToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HarvestSource.AuthenticateViaHarvestOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HarvestSource.AuthenticateWithPersonalAccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HellobatonSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubplannerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource": [[14, 0, 1, "", "APIKey"], [14, 0, 1, "", "OAuth"], [14, 0, 
1, "", "PrivateAPP"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource.APIKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource.OAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.HubspotSource.PrivateAPP": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.InsightlySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.InstagramSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.IntercomSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.IterableSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.JdbcSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.JiraSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource": [[14, 0, 1, "", "AVRO"], [14, 0, 1, "", "JSON"], [14, 0, 1, "", "ManuallyAssignAListOfPartitions"], [14, 0, 1, "", "PLAINTEXT"], [14, 0, 1, "", "SASLPLAINTEXT"], [14, 0, 1, "", "SASLSSL"], [14, 0, 1, "", "SubscribeToAllTopicsMatchingSpecifiedPattern"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.AVRO": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.JSON": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.ManuallyAssignAListOfPartitions": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.PLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.SASLPLAINTEXT": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.SASLSSL": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.KlaviyoSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KustomerSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.KyribaSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LemlistSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LeverHiringSource": [[14, 0, 1, "", "OAuthCredentials"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LeverHiringSource.OAuthCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinAdsSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinAdsSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinAdsSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinPagesSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinPagesSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinkedinPagesSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LinnworksSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.LookerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailchimpSource": [[14, 0, 1, "", "APIKey"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailchimpSource.APIKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailchimpSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MailgunSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MarketoSource": [[14, 3, 
1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MetabaseSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource": [[14, 0, 1, "", "AuthenticateViaMicrosoft"], [14, 0, 1, "", "AuthenticateViaMicrosoftOAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource.AuthenticateViaMicrosoft": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MixpanelSource": [[14, 0, 1, "", "ProjectSecret"], [14, 0, 1, "", "ServiceAccount"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MixpanelSource.ProjectSecret": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MixpanelSource.ServiceAccount": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MondaySource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MondaySource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MondaySource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source": [[14, 0, 1, "", "MongoDBAtlas"], [14, 0, 1, "", "ReplicaSet"], [14, 0, 1, "", "StandaloneMongoDbInstance"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source.MongoDBAtlas": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source.ReplicaSet": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MongodbV2Source.StandaloneMongoDbInstance": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource": [[14, 0, 1, "", "EncryptedTrustServerCertificate"], [14, 0, 1, "", 
"EncryptedVerifyCertificate"], [14, 0, 1, "", "LogicalReplicationCDC"], [14, 0, 1, "", "Standard"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.EncryptedTrustServerCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.EncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.LogicalReplicationCDC": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MssqlSource.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MyHoursSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource": [[14, 0, 1, "", "LogicalReplicationCDC"], [14, 0, 1, "", "Preferred"], [14, 0, 1, "", "Required"], [14, 0, 1, "", "Standard"], [14, 0, 1, "", "VerifyCA"], [14, 0, 1, "", "VerifyIdentity"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.LogicalReplicationCDC": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.Preferred": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.Required": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.VerifyCA": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.MysqlSource.VerifyIdentity": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.NetsuiteSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.NotionSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.NotionSource.AccessToken": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.NotionSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OktaSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OktaSource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OktaSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OnesignalSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OpenweatherSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource": [[14, 0, 1, "", "NativeNetworkEncryptionNNE"], [14, 0, 1, "", "ServiceName"], [14, 0, 1, "", "SystemIDSID"], [14, 0, 1, "", "TLSEncryptedVerifyCertificate"], [14, 0, 1, "", "Unencrypted"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.NativeNetworkEncryptionNNE": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.ServiceName": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.SystemIDSID": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.TLSEncryptedVerifyCertificate": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OracleSource.Unencrypted": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OrbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OrbitSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.OutreachSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PardotSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PaypalTransactionSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PaystackSource": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.PersistiqSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PinterestSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PinterestSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PinterestSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PipedriveSource": [[14, 0, 1, "", "APIKeyAuthentication"], [14, 0, 1, "", "SignInViaPipedriveOAuth"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PipedriveSource.APIKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PipedriveSource.SignInViaPipedriveOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PivotalTrackerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PlaidSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PokeapiSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource": [[14, 0, 1, "", "Allow"], [14, 0, 1, "", "Disable"], [14, 0, 1, "", "LogicalReplicationCDC"], [14, 0, 1, "", "NoTunnel"], [14, 0, 1, "", "PasswordAuthentication"], [14, 0, 1, "", "Prefer"], [14, 0, 1, "", "Require"], [14, 0, 1, "", "SSHKeyAuthentication"], [14, 0, 1, "", "Standard"], [14, 0, 1, "", "VerifyCa"], [14, 0, 1, "", "VerifyFull"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Allow": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Disable": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.LogicalReplicationCDC": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.NoTunnel": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.PostgresSource.PasswordAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Prefer": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Require": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.SSHKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.Standard": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.VerifyCa": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PostgresSource.VerifyFull": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PosthogSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PrestashopSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PrimetricSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.PythonHttpTutorialSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.QualarooSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.QuickbooksSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RechargeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RecurlySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RedshiftSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RetentlySource": [[14, 0, 1, "", "AuthenticateViaRetentlyOAuth"], [14, 0, 1, "", "AuthenticateWithAPIToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RetentlySource.AuthenticateViaRetentlyOAuth": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.RetentlySource.AuthenticateWithAPIToken": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.RkiCovidSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source": [[14, 0, 1, "", "Avro"], [14, 0, 1, "", "CSV"], [14, 0, 1, "", "Jsonl"], [14, 0, 1, "", "Parquet"], [14, 0, 1, "", "S3AmazonWebServices"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.Avro": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.CSV": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.Jsonl": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.Parquet": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.S3Source.S3AmazonWebServices": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SalesforceSource": [[14, 0, 1, "", "FilterSalesforceObjectsEntry"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SalesforceSource.FilterSalesforceObjectsEntry": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SalesloftSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ScaffoldJavaJdbcSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ScaffoldSourceHttpSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ScaffoldSourcePythonSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SearchMetricsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SendgridSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SentrySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SftpSource": [[14, 0, 1, "", "PasswordAuthentication"], [14, 0, 1, "", "SSHKeyAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SftpSource.PasswordAuthentication": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.SftpSource.SSHKeyAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShopifySource": [[14, 0, 1, "", "APIPassword"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShopifySource.APIPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShopifySource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ShortioSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SlackSource": [[14, 0, 1, "", "APITokenCredentials"], [14, 0, 1, "", "DefaultOAuth20Authorization"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SlackSource.APITokenCredentials": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SlackSource.DefaultOAuth20Authorization": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SmartsheetsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnapchatMarketingSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnowflakeSource": [[14, 0, 1, "", "OAuth20"], [14, 0, 1, "", "UsernameAndPassword"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnowflakeSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SnowflakeSource.UsernameAndPassword": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SquareSource": [[14, 0, 1, "", "APIKey"], [14, 0, 1, "", "OauthAuthentication"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SquareSource.APIKey": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SquareSource.OauthAuthentication": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.StockTickerApiTutorialSource": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.StravaSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.StripeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.SurveymonkeySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TalkdeskExploreSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TempoSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TidbSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TiktokMarketingSource": [[14, 0, 1, "", "OAuth20"], [14, 0, 1, "", "SandboxAccessToken"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TiktokMarketingSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TiktokMarketingSource.SandboxAccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TimelySource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TplcentralSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TrelloSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TwilioSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.TypeformSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.UsCensusSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WebflowSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WhiskyHunterSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WoocommerceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.WrikeSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.YahooFinancePriceSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.YandexMetricaSource": [[14, 3, 1, 
"", "__init__"]], "dagster_airbyte.managed.generated.sources.YoutubeAnalyticsSource": [[14, 0, 1, "", "AuthenticateViaOAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.YoutubeAnalyticsSource.AuthenticateViaOAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskChatSource": [[14, 0, 1, "", "AccessToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskChatSource.AccessToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskChatSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSunshineSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSunshineSource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSunshineSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSupportSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSupportSource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskSupportSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskTalkSource": [[14, 0, 1, "", "APIToken"], [14, 0, 1, "", "OAuth20"], [14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskTalkSource.APIToken": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZendeskTalkSource.OAuth20": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZenefitsSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZenloopSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZohoCrmSource": [[14, 3, 1, "", "__init__"]], 
"dagster_airbyte.managed.generated.sources.ZoomSingerSource": [[14, 3, 1, "", "__init__"]], "dagster_airbyte.managed.generated.sources.ZuoraSource": [[14, 3, 1, "", "__init__"]], "dagster_airflow": [[15, 0, 1, "", "DagsterCloudOperator"], [15, 0, 1, "", "DagsterOperator"], [15, 6, 1, "", "load_assets_from_airflow_dag"], [15, 6, 1, "", "make_dagster_definitions_from_airflow_dag_bag"], [15, 6, 1, "", "make_dagster_definitions_from_airflow_dags_path"], [15, 6, 1, "", "make_dagster_job_from_airflow_dag"], [15, 6, 1, "", "make_ephemeral_airflow_db_resource"], [15, 6, 1, "", "make_persistent_airflow_db_resource"], [15, 6, 1, "", "make_schedules_and_jobs_from_airflow_dag_bag"]], "dagster_aws.cloudwatch": [[16, 5, 1, "", "cloudwatch_logger"]], "dagster_aws.ecs": [[16, 5, 1, "", "EcsRunLauncher"]], "dagster_aws.emr": [[16, 5, 1, "", "EmrClusterState"], [16, 0, 1, "", "EmrError"], [16, 0, 1, "", "EmrJobRunner"], [16, 5, 1, "", "EmrStepState"], [16, 5, 1, "", "emr_pyspark_step_launcher"]], "dagster_aws.redshift": [[16, 5, 1, "", "FakeRedshiftClientResource"], [16, 5, 1, "", "RedshiftClientResource"], [16, 5, 1, "", "fake_redshift_resource"], [16, 5, 1, "", "redshift_resource"]], "dagster_aws.s3": [[16, 5, 1, "", "ConfigurablePickledObjectS3IOManager"], [16, 0, 1, "", "S3ComputeLogManager"], [16, 5, 1, "", "S3Coordinate"], [16, 0, 1, "", "S3FileHandle"], [16, 5, 1, "", "S3FileManagerResource"], [16, 5, 1, "", "S3PickleIOManager"], [16, 5, 1, "", "S3Resource"], [16, 5, 1, "", "s3_file_manager"], [16, 5, 1, "", "s3_pickle_io_manager"], [16, 5, 1, "", "s3_resource"]], "dagster_aws.secretsmanager": [[16, 5, 1, "", "SecretsManagerResource"], [16, 5, 1, "", "SecretsManagerSecretsResource"], [16, 5, 1, "", "secretsmanager_resource"], [16, 5, 1, "", "secretsmanager_secrets_resource"]], "dagster_azure.adls2": [[17, 0, 1, "", "ADLS2FileHandle"], [17, 5, 1, "", "ADLS2PickleIOManager"], [17, 5, 1, "", "ADLS2Resource"], [17, 5, 1, "", "ConfigurablePickledObjectADLS2IOManager"], [17, 5, 1, 
"", "FakeADLS2Resource"], [17, 5, 1, "", "adls2_file_manager"], [17, 5, 1, "", "adls2_pickle_io_manager"], [17, 5, 1, "", "adls2_resource"]], "dagster_azure.blob": [[17, 0, 1, "", "AzureBlobComputeLogManager"]], "dagster_celery": [[18, 5, 1, "", "celery_executor"]], "dagster_celery_docker": [[19, 5, 1, "", "celery_docker_executor"]], "dagster_celery_k8s": [[20, 5, 1, "", "CeleryK8sRunLauncher"], [20, 5, 1, "", "celery_k8s_job_executor"]], "dagster_census": [[21, 0, 1, "", "CensusOutput"], [21, 0, 1, "", "CensusResource"], [21, 5, 1, "", "census_resource"], [21, 5, 1, "", "census_trigger_sync_op"]], "dagster_census.CensusOutput": [[21, 2, 1, "", "destination"], [21, 2, 1, "", "source"], [21, 2, 1, "", "sync_run"]], "dagster_dask": [[22, 5, 1, "", "dask_executor"]], "dagster_databricks": [[23, 0, 1, "", "DatabricksClient"], [23, 5, 1, "", "DatabricksClientResource"], [23, 0, 1, "", "DatabricksError"], [23, 6, 1, "", "create_databricks_run_now_op"], [23, 6, 1, "", "create_databricks_submit_run_op"], [23, 5, 1, "", "databricks_client"], [23, 5, 1, "", "databricks_pyspark_step_launcher"]], "dagster_databricks.DatabricksClient": [[23, 1, 1, "", "api_client"], [23, 1, 1, "", "client"], [23, 1, 1, "", "workspace_client"]], "dagster_datadog": [[24, 5, 1, "", "DatadogResource"], [24, 5, 1, "", "datadog_resource"]], "dagster_datahub": [[25, 5, 1, "", "DatahubKafkaEmitterResource"], [25, 5, 1, "", "DatahubRESTEmitterResource"], [25, 5, 1, "", "datahub_kafka_emitter"], [25, 5, 1, "", "datahub_rest_emitter"]], "dagster_dbt": [[26, 4, 1, "", "DagsterDbtCliFatalRuntimeError"], [26, 4, 1, "", "DagsterDbtCliHandledRuntimeError"], [26, 4, 1, "", "DagsterDbtCliOutputsNotFoundError"], [26, 4, 1, "", "DagsterDbtCliRuntimeError"], [26, 4, 1, "", "DagsterDbtCliUnexpectedOutputError"], [26, 4, 1, "", "DagsterDbtError"], [26, 0, 1, "", "DagsterDbtTranslator"], [26, 0, 1, "", "DbtCliEventMessage"], [26, 0, 1, "", "DbtCliInvocation"], [26, 0, 1, "", "DbtCliOutput"], [26, 0, 1, "", 
"DbtCliResource"], [26, 0, 1, "", "DbtCloudClientResource"], [26, 0, 1, "", "DbtManifestAssetSelection"], [26, 0, 1, "", "DbtOutput"], [26, 0, 1, "", "DbtResource"], [26, 6, 1, "", "build_dbt_asset_selection"], [26, 6, 1, "", "build_schedule_from_dbt_selection"], [26, 6, 1, "", "dbt_assets"], [26, 5, 1, "", "dbt_cli_resource"], [26, 5, 1, "", "dbt_cloud_resource"], [26, 5, 1, "", "dbt_cloud_run_op"], [26, 6, 1, "", "dbt_compile_op"], [26, 6, 1, "", "dbt_docs_generate_op"], [26, 6, 1, "", "dbt_ls_op"], [26, 5, 1, "", "dbt_run_op"], [26, 6, 1, "", "dbt_seed_op"], [26, 6, 1, "", "dbt_snapshot_op"], [26, 6, 1, "", "dbt_test_op"], [26, 6, 1, "", "default_group_from_dbt_resource_props"], [26, 6, 1, "", "default_metadata_from_dbt_resource_props"], [26, 6, 1, "", "get_asset_key_for_model"], [26, 6, 1, "", "get_asset_key_for_source"], [26, 6, 1, "", "get_asset_keys_by_output_name_for_source"], [26, 6, 1, "", "group_from_dbt_resource_props_fallback_to_directory"], [26, 6, 1, "", "load_assets_from_dbt_cloud_job"], [26, 6, 1, "", "load_assets_from_dbt_manifest"], [26, 6, 1, "", "load_assets_from_dbt_project"]], "dagster_dbt.DagsterDbtCliUnexpectedOutputError": [[26, 2, 1, "", "invalid_line_nos"]], "dagster_dbt.DagsterDbtTranslator": [[26, 3, 1, "", "get_asset_key"], [26, 3, 1, "", "get_auto_materialize_policy"], [26, 3, 1, "", "get_description"], [26, 3, 1, "", "get_freshness_policy"], [26, 3, 1, "", "get_group_name"], [26, 3, 1, "", "get_metadata"]], "dagster_dbt.DbtCliEventMessage": [[26, 3, 1, "", "to_default_asset_events"]], "dagster_dbt.DbtCliInvocation": [[26, 3, 1, "", "get_artifact"], [26, 3, 1, "", "is_successful"], [26, 3, 1, "", "stream"], [26, 3, 1, "", "stream_raw_events"], [26, 3, 1, "", "wait"]], "dagster_dbt.DbtCliOutput": [[26, 2, 1, "", "command"], [26, 2, 1, "", "docs_url"], [26, 2, 1, "", "logs"], [26, 2, 1, "", "raw_output"], [26, 2, 1, "", "result"], [26, 2, 1, "", "return_code"]], "dagster_dbt.DbtCliResource": [[26, 3, 1, "", "cli"], [26, 2, 1, "", 
"global_config_flags"], [26, 2, 1, "", "profile"], [26, 2, 1, "", "profiles_dir"], [26, 2, 1, "", "project_dir"], [26, 2, 1, "", "target"]], "dagster_dbt.utils": [[26, 6, 1, "", "generate_materializations"]], "dagster_docker": [[27, 5, 1, "", "DockerRunLauncher"], [27, 5, 1, "", "docker_container_op"], [27, 5, 1, "", "docker_executor"], [27, 6, 1, "", "execute_docker_container"]], "dagster_duckdb": [[28, 5, 1, "", "DuckDBIOManager"], [28, 5, 1, "", "DuckDBResource"], [28, 5, 1, "", "build_duckdb_io_manager"]], "dagster_duckdb_pandas": [[29, 5, 1, "", "DuckDBPandasIOManager"], [29, 0, 1, "", "DuckDBPandasTypeHandler"], [29, 5, 1, "", "duckdb_pandas_io_manager"]], "dagster_duckdb_polars": [[30, 5, 1, "", "DuckDBPolarsIOManager"], [30, 0, 1, "", "DuckDBPolarsTypeHandler"], [30, 5, 1, "", "duckdb_polars_io_manager"]], "dagster_duckdb_pyspark": [[31, 5, 1, "", "DuckDBPySparkIOManager"], [31, 0, 1, "", "DuckDBPySparkTypeHandler"], [31, 5, 1, "", "duckdb_pyspark_io_manager"]], "dagster_embedded_elt.sling": [[32, 0, 1, "", "SlingResource"], [32, 6, 1, "", "build_sling_asset"]], "dagster_embedded_elt.sling.resources": [[32, 0, 1, "", "SlingSourceConnection"], [32, 0, 1, "", "SlingTargetConnection"]], "dagster_fivetran": [[33, 5, 1, "", "FivetranResource"], [33, 6, 1, "", "build_fivetran_assets"], [33, 5, 1, "", "fivetran_resource"], [33, 5, 1, "", "fivetran_sync_op"], [33, 6, 1, "", "load_assets_from_fivetran_instance"]], "dagster_gcp": [[34, 0, 1, "", "BigQueryError"], [34, 5, 1, "", "BigQueryIOManager"], [34, 5, 1, "", "BigQueryResource"], [34, 5, 1, "", "ConfigurablePickledObjectGCSIOManager"], [34, 5, 1, "", "DataprocResource"], [34, 0, 1, "", "GCSFileHandle"], [34, 5, 1, "", "GCSFileManagerResource"], [34, 5, 1, "", "GCSPickleIOManager"], [34, 5, 1, "", "GCSResource"], [34, 5, 1, "", "bigquery_resource"], [34, 6, 1, "", "bq_create_dataset"], [34, 6, 1, "", "bq_delete_dataset"], [34, 6, 1, "", "bq_op_for_queries"], [34, 5, 1, "", "build_bigquery_io_manager"], [34, 5, 1, 
"", "dataproc_op"], [34, 5, 1, "", "dataproc_resource"], [34, 5, 1, "", "gcs_file_manager"], [34, 5, 1, "", "gcs_pickle_io_manager"], [34, 5, 1, "", "gcs_resource"], [34, 6, 1, "", "import_df_to_bq"], [34, 6, 1, "", "import_file_to_bq"], [34, 6, 1, "", "import_gcs_paths_to_bq"]], "dagster_gcp.gcs": [[34, 0, 1, "", "GCSComputeLogManager"]], "dagster_gcp_pandas": [[35, 5, 1, "", "BigQueryPandasIOManager"], [35, 0, 1, "", "BigQueryPandasTypeHandler"], [35, 5, 1, "", "bigquery_pandas_io_manager"]], "dagster_gcp_pyspark": [[36, 5, 1, "", "BigQueryPySparkIOManager"], [36, 0, 1, "", "BigQueryPySparkTypeHandler"], [36, 5, 1, "", "bigquery_pyspark_io_manager"]], "dagster_ge": [[37, 6, 1, "", "ge_validation_op_factory"]], "dagster_github": [[38, 5, 1, "", "GithubResource"], [38, 5, 1, "", "github_resource"]], "dagster_graphql": [[39, 0, 1, "", "DagsterGraphQLClient"], [39, 4, 1, "", "DagsterGraphQLClientError"], [39, 0, 1, "", "InvalidOutputErrorInfo"], [39, 0, 1, "", "ReloadRepositoryLocationInfo"], [39, 0, 1, "", "ReloadRepositoryLocationStatus"]], "dagster_graphql.DagsterGraphQLClient": [[39, 3, 1, "", "get_run_status"], [39, 3, 1, "", "reload_repository_location"], [39, 3, 1, "", "shutdown_repository_location"], [39, 3, 1, "", "submit_job_execution"]], "dagster_k8s": [[40, 5, 1, "", "K8sRunLauncher"], [40, 6, 1, "", "execute_k8s_job"], [40, 5, 1, "", "k8s_job_executor"], [40, 5, 1, "", "k8s_job_op"]], "dagster_mlflow": [[41, 5, 1, "", "end_mlflow_on_run_finished"], [41, 5, 1, "", "mlflow_tracking"]], "dagster_msteams": [[42, 5, 1, "", "MSTeamsResource"], [42, 6, 1, "", "make_teams_on_run_failure_sensor"], [42, 5, 1, "", "msteams_resource"], [42, 5, 1, "", "teams_on_failure"], [42, 5, 1, "", "teams_on_success"]], "dagster_mysql": [[43, 0, 1, "", "MySQLEventLogStorage"], [43, 0, 1, "", "MySQLRunStorage"], [43, 0, 1, "", "MySQLScheduleStorage"]], "dagster_pagerduty": [[44, 5, 1, "", "PagerDutyService"], [44, 5, 1, "", "pagerduty_resource"]], "dagster_pandas": [[45, 5, 1, 
"", "DataFrame"], [45, 0, 1, "", "PandasColumn"], [45, 0, 1, "", "RowCountConstraint"], [45, 0, 1, "", "StrictColumnsConstraint"], [45, 6, 1, "", "create_dagster_pandas_dataframe_type"]], "dagster_pandera": [[46, 6, 1, "", "pandera_schema_to_dagster_type"]], "dagster_papertrail": [[47, 5, 1, "", "papertrail_logger"]], "dagster_postgres": [[48, 5, 1, "", "PostgresEventLogStorage"], [48, 5, 1, "", "PostgresRunStorage"], [48, 5, 1, "", "PostgresScheduleStorage"]], "dagster_prometheus": [[49, 5, 1, "", "PrometheusResource"], [49, 5, 1, "", "prometheus_resource"]], "dagster_prometheus.resources": [[49, 0, 1, "", "PrometheusClient"]], "dagster_pyspark": [[50, 5, 1, "", "PySparkResource"], [50, 5, 1, "", "pyspark_resource"]], "dagster_shell": [[51, 6, 1, "", "create_shell_command_op"], [51, 6, 1, "", "create_shell_script_op"], [51, 6, 1, "", "execute_shell_command"], [51, 6, 1, "", "execute_shell_script"], [51, 6, 1, "", "shell_op"]], "dagster_slack": [[52, 5, 1, "", "SlackResource"], [52, 6, 1, "", "make_slack_on_freshness_policy_status_change_sensor"], [52, 6, 1, "", "make_slack_on_run_failure_sensor"], [52, 5, 1, "", "slack_on_failure"], [52, 5, 1, "", "slack_on_success"], [52, 5, 1, "", "slack_resource"]], "dagster_snowflake": [[53, 0, 1, "", "SnowflakeConnection"], [53, 5, 1, "", "SnowflakeIOManager"], [53, 5, 1, "", "SnowflakeResource"], [53, 5, 1, "", "build_snowflake_io_manager"], [53, 6, 1, "", "snowflake_op_for_query"], [53, 5, 1, "", "snowflake_resource"]], "dagster_snowflake.SnowflakeConnection": [[53, 3, 1, "", "execute_queries"], [53, 3, 1, "", "execute_query"], [53, 3, 1, "", "get_connection"], [53, 3, 1, "", "load_table_from_local_parquet"]], "dagster_snowflake_pandas": [[54, 5, 1, "", "SnowflakePandasIOManager"], [54, 0, 1, "", "SnowflakePandasTypeHandler"], [54, 5, 1, "", "snowflake_pandas_io_manager"]], "dagster_snowflake_pyspark": [[55, 5, 1, "", "SnowflakePySparkIOManager"], [55, 0, 1, "", "SnowflakePySparkTypeHandler"], [55, 5, 1, "", 
"snowflake_pyspark_io_manager"]], "dagster_spark": [[56, 0, 1, "", "SparkOpError"], [56, 6, 1, "", "construct_spark_shell_command"], [56, 6, 1, "", "create_spark_op"], [56, 6, 1, "", "define_spark_config"], [56, 5, 1, "", "spark_resource"]], "dagster_ssh": [[57, 5, 1, "", "ssh_resource"]], "dagster_twilio": [[58, 5, 1, "", "TwilioResource"], [58, 5, 1, "", "twilio_resource"]], "dagster_wandb": [[59, 0, 1, "", "SerializationModule"], [59, 0, 1, "", "WandbArtifactConfiguration"], [59, 4, 1, "", "WandbArtifactsIOManagerError"], [59, 6, 1, "", "run_launch_agent"], [59, 6, 1, "", "run_launch_job"], [59, 5, 1, "", "wandb_artifacts_io_manager"], [59, 5, 1, "", "wandb_resource"]], "dagstermill": [[60, 0, 1, "", "ConfigurableLocalOutputNotebookIOManager"], [60, 0, 1, "", "DagstermillError"], [60, 0, 1, "", "DagstermillExecutionContext"], [60, 6, 1, "", "define_dagstermill_asset"], [60, 6, 1, "", "define_dagstermill_op"], [60, 6, 1, "", "get_context"], [60, 6, 1, "", "yield_event"], [60, 6, 1, "", "yield_result"]], "dagstermill.DagstermillExecutionContext": [[60, 1, 1, "", "job_def"], [60, 1, 1, "", "job_name"], [60, 1, 1, "", "logging_tags"], [60, 1, 1, "", "op_config"], [60, 1, 1, "", "op_def"], [60, 1, 1, "", "run"], [60, 1, 1, "", "run_config"], [60, 1, 1, "", "run_id"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "property", "Python property"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "method", "Python method"], "4": ["py", "exception", "Python exception"], "5": ["py", "data", "Python data"], "6": ["py", "function", "Python function"], "7": ["py", "module", "Python module"], "8": ["std", "cmdoption", "program option"]}, "objtypes": {"0": "py:class", "1": "py:property", "2": "py:attribute", "3": "py:method", "4": "py:exception", "5": "py:data", "6": "py:function", "7": "py:module", "8": "std:cmdoption"}, "terms": {"0": [1, 2, 3, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 21, 23, 26, 33, 34, 37, 40, 42, 45, 50, 52, 59, 61, 63, 64, 65, 67, 
69], "00": [14, 64, 67], "000z": 14, "00z": 14, "01": [12, 14, 26, 64, 67], "0123456789abcdef0123456789abcdef": 44, "01t00": 14, "01t13": 14, "02": [64, 67], "03": [14, 64, 67], "04": [40, 64, 67], "05": [64, 67], "06": [12, 14, 40, 64, 67], "07": [64, 67], "08": [8, 18], "09": 14, "1": [2, 3, 4, 5, 8, 9, 11, 13, 14, 15, 16, 22, 23, 26, 34, 37, 40, 50, 53, 59, 63, 64, 65, 67, 68, 69], "10": [11, 14, 16, 21, 23, 26, 33, 34, 46, 50, 57, 64, 67], "100": [14, 16, 34, 50, 64], "1000": [14, 63], "10000": 65, "1000000": 14, "1001": 24, "1035": 34, "11": [14, 23, 53, 64, 67], "11000": 65, "12": [14, 37, 53, 64, 67], "1200": 34, "123": 64, "1234": [4, 23, 24, 26], "127": [3, 40], "13": [53, 64, 67], "13t20": 14, "14": [14, 18, 64, 67], "15": [14, 16, 50, 64, 67], "15000": 3, "1521": 14, "15mb": 14, "16": [64, 67], "17": [14, 40], "18": [20, 40], "180": 14, "19": [64, 67], "1bf2": 59, "1m": [16, 50], "2": [2, 3, 4, 5, 9, 11, 12, 13, 14, 15, 16, 17, 23, 26, 34, 40, 42, 50, 52, 53, 59, 63, 64, 65, 67, 69], "20": [8, 14, 34, 40, 64, 67], "200": 14, "2000": 14, "20000": [16, 50], "200m": [16, 50], "2017": [14, 18], "2018": 14, "2020": [14, 40, 64, 67], "2021": 14, "2022": [14, 26, 64, 67], "2023": [8, 12, 26, 64], "2048m": [16, 50], "20t00": 14, "21": [8, 23, 26], "21t21": 40, "22": [8, 57], "2200": 23, "23": [8, 67], "2344535": 14, "2344535_sb1": 14, "24": [8, 14, 64], "2484": 14, "24t03": 14, "25": [8, 14, 21, 26, 33, 67], "2546": [16, 50], "25t00": 14, "26": [8, 14, 24, 64, 67], "27": [64, 67], "28000m": 23, "29": 34, "2auto": 34, "2g": [16, 50], "2gb": [16, 50], "3": [4, 8, 9, 12, 13, 14, 21, 23, 26, 33, 40, 53, 59, 63, 64, 65, 67, 68], "30": [2, 14, 16, 17, 34, 49, 57, 59, 67], "300": 39, "3000": [3, 14, 39, 42, 52], "300mb": [16, 50], "30z": 14, "31": [14, 64], "32": 34, "3333": 3, "3339": 14, "360": 14, "3600": 3, "364": 14, "39": 14, "4": [8, 18, 19, 20, 22, 23, 34, 53, 63], "420e": 14, "44b5": 59, "45": [23, 67], "465": 69, "4815": [16, 50], "5": [1, 8, 9, 12, 14, 16, 
17, 22, 23, 34, 50, 61, 63, 64, 67, 68, 69], "50": [14, 16, 50], "5000": 41, "500gb": 34, "500m": [16, 50], "512m": [16, 50], "5432": [11, 15, 48], "54321": 26, "5439": 16, "5672": 18, "587": 69, "6": [16, 40, 50, 64, 67], "60": [11, 14, 26, 39, 42, 53, 59], "60000": 14, "63": 34, "6313": [16, 50], "6379": 40, "64": 23, "7": [34, 38, 40], "77777": 26, "7e4df022": 59, "8": [51, 59], "8000": 14, "8601": 14, "86400": [20, 23], "87b7fe85": 14, "8d74": 14, "9": [14, 26, 34, 64, 67], "90": 14, "914": 14, "95590a": 40, "999": 24, "9am": 2, "A": [1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 26, 27, 32, 33, 34, 35, 36, 39, 44, 45, 47, 50, 51, 53, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "AND": 11, "As": [5, 23, 24, 39, 45, 68], "At": [2, 16, 50, 67], "But": [16, 50], "By": [2, 4, 8, 9, 11, 14, 16, 17, 18, 19, 20, 21, 23, 26, 27, 33, 34, 38, 40, 42, 50, 52, 53, 59, 67], "For": [2, 3, 4, 8, 9, 11, 12, 13, 14, 15, 16, 20, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 42, 45, 50, 53, 54, 55, 59, 63, 64, 66, 67, 68], "IF": 53, "INTO": 14, "If": [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39, 40, 42, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "In": [2, 5, 8, 9, 11, 12, 14, 16, 18, 19, 20, 26, 28, 29, 30, 31, 34, 35, 36, 40, 45, 50, 51, 53, 54, 55, 60, 63, 65, 67, 68], "Ins": 51, "It": [1, 2, 5, 7, 8, 14, 16, 21, 23, 26, 33, 34, 45, 50, 51, 52, 59, 60, 68], "Its": [2, 16, 50], "No": 34, "Not": [3, 14, 16, 50], "ONE": 3, "OR": 4, "On": [14, 34, 40], "One": [7, 14, 26], "Or": [5, 8, 11], "Such": 69, "That": 2, "The": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 32, 33, 34, 35, 36, 39, 40, 42, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "Then": [14, 18, 19, 20, 27, 40, 52], "There": [5, 7, 14, 34, 42, 44, 64], "These": [1, 2, 5, 6, 8, 9, 10, 11, 13, 16, 
23, 40, 45, 50, 60, 63, 66, 67, 68], "To": [2, 8, 9, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 39, 40, 42, 44, 48, 50, 52, 53, 54, 55, 59, 61, 66, 67, 69], "Will": [8, 14, 20, 34, 40, 64], "With": [16, 17, 20, 34], "_": [2, 4, 6, 12, 14, 34, 45, 60, 68, 69], "__executor_name__": 8, "__fieldvaluesentinel": 4, "__file__": [6, 51, 69], "__init__": [12, 14, 65, 68], "__input_name__": 8, "__logger_name__": 8, "__main__": 18, "__name__": [18, 68], "__op_name__": 8, "__resource_name__": 8, "_add_on": 63, "_asset_selection_data": [8, 9], "_assets_def": 8, "_clean": 6, "_clean_nam": [14, 33], "_cm_scope_ent": 67, "_config": 4, "_connect": 12, "_construct_job_def_from_yaml_fil": 65, "_context": [45, 63, 68], "_core": [2, 6, 11, 13, 14, 26, 27, 33, 34, 37, 40, 45, 63, 67], "_databricks_run_now_op": 23, "_databricks_submit_run_op": 23, "_default_failure_email_bodi": 69, "_default_failure_email_subject": 69, "_default_failure_messag": 42, "_default_failure_message_text_fn": 52, "_default_freshness_message_text_fn": 52, "_get_path": 12, "_graph_": 9, "_job": 13, "_kei": 4, "_kwarg": 11, "_logger": 61, "_required_resource_kei": 2, "_resourc": 67, "_s3_bucket": 68, "_s3_kei": 68, "_schedul": 67, "_serd": 11, "_seven": [16, 17, 34], "_subset_selection_data": [8, 13], "_util": 69, "_was_explicitly_provided_resourc": [8, 13], "_yaml_directori": 65, "_yaml_file_for_job_nam": 65, "a22c": 14, "a383": 59, "a_downstream": 64, "a_job": 5, "a_multi_asset": 8, "a_op": 42, "a_prefix": [16, 25, 38, 66], "a_repo": 5, "a_resourc": 5, "a_schedul": 5, "a_sensor": 5, "a_str": 4, "abc": [2, 64], "abcdef": 16, "abil": [11, 63], "abl": [4, 5, 8, 13, 14, 16, 18, 19, 20, 23, 50], "abort": [16, 50], "aborttransact": 14, "about": [1, 2, 3, 6, 8, 9, 11, 12, 14, 16, 21, 26, 33, 39, 50, 52, 60, 63], "abov": [14, 16, 23, 28, 29, 30, 31, 34, 35, 36, 38, 40, 50, 53, 54, 55, 63], "absolut": [16, 23, 50], "abstract": [11, 12, 16, 17, 26, 34, 62, 63, 64, 65], 
"abstractset": [2, 9, 10, 15, 63, 68], "acceler": 34, "accept": [1, 2, 4, 7, 8, 11, 12, 13, 16, 23, 45, 47, 50, 60, 61, 63, 64, 66, 67], "accept_term": 14, "access": [2, 3, 7, 8, 11, 12, 14, 16, 17, 23, 25, 26, 34, 38, 41, 44, 45, 50, 53, 54, 55, 58, 60, 64, 65, 66, 68], "access_kei": 14, "access_key_id": 14, "access_key_kei": 23, "access_token": 14, "accesskei": 14, "accesstoken": 14, "accident": 2, "accord": [2, 4, 7, 11, 16, 34, 50], "accordingli": [16, 50], "account": [14, 16, 17, 20, 23, 24, 26, 34, 35, 36, 38, 40, 47, 50, 53, 54, 55, 58], "account_id": [14, 26, 34], "account_manag": 24, "account_nam": 17, "account_sid": [14, 58], "acct": 14, "accur": [16, 50], "accuraci": 14, "accurateblockthreshold": [16, 50], "ach": 27, "achiev": [8, 16, 50], "ack": [14, 16, 50], "acknowledg": 14, "acl": 23, "acquir": [14, 23], "across": [2, 6, 8, 11, 13, 14, 16, 27, 34, 40, 50, 59, 63, 64, 67], "act": [14, 16, 34, 50], "action": [14, 16, 34, 50], "action_breakdown": 14, "action_list_oper": 37, "action_on_failur": 16, "activ": [14, 23, 34, 47, 53, 59], "actual": [3, 4, 8, 12, 16, 50, 63, 67], "acycl": [8, 9], "ad": [2, 8, 12, 14, 15, 16, 20, 23, 26, 34, 40, 50, 64], "adapt": 26, "add": [4, 8, 11, 12, 14, 15, 16, 23, 26, 28, 29, 30, 31, 34, 35, 36, 39, 40, 41, 42, 48, 53, 54, 55, 59, 66, 67], "add_attach": 42, "add_dagster_env_vari": 23, "add_dynamic_partit": [11, 64], "add_metadata": 8, "add_metadata_two_output": 8, "add_on": [8, 9, 13, 63], "add_output_metadata": [8, 12, 26], "add_to_environ": 16, "adddynamicpartitionsrequest": 67, "addfil": [16, 50], "addit": [7, 11, 12, 14, 16, 18, 19, 20, 23, 26, 39, 40, 46, 50, 60, 63, 64, 67], "addition": [60, 63], "additional_arg": 18, "additional_field": 14, "additional_metr": 14, "additional_reader_opt": 14, "address": [14, 16, 22, 34, 50, 69], "adgroupadreport": 14, "adjust": 14, "adjustsourc": 14, "adl": [17, 23], "adls2": [17, 23], "adls2_client": 17, "adls2_file_manag": 17, "adls2_file_system": 17, "adls2_pickle_io_manag": 17, 
"adls2_prefix": 17, "adls2_resourc": 17, "adls2filehandl": 17, "adls2pickleiomanag": 17, "adls2resourc": 17, "admin": [14, 34], "administr": [14, 26, 38], "adset": 14, "advanc": [11, 16, 26, 50, 67], "advance_all_cursor": 67, "advance_cursor": 67, "advanced_opt": 14, "advantag": 64, "advertis": [14, 16, 50], "advertiser_id": 14, "advis": [18, 26], "aes256": 16, "aescbcenvelopeencrypt": 14, "affect": [16, 23, 50], "after": [2, 3, 12, 14, 16, 21, 23, 26, 33, 34, 35, 36, 40, 50, 52, 53, 63, 64, 66, 67], "after_cursor": 11, "after_cursor_partit": 67, "after_timestamp": 11, "afterward": 8, "ag": 3, "again": [14, 52, 67], "against": [2, 3, 7, 8, 9, 11, 13, 14, 16, 26, 40, 51, 53, 68], "agent": [24, 59], "aggreg": [14, 16, 50, 67], "ago": [2, 14], "agre": 14, "ahead": [16, 50], "ai": 59, "aim": 17, "airbyte_asset": 14, "airbyte_connect": 14, "airbyte_host": 14, "airbyte_inst": 14, "airbyte_password": 14, "airbyte_port": 14, "airbyte_resourc": 14, "airbyte_sync_op": 14, "airbyte_usernam": 14, "airbyteconnect": 14, "airbyteconnectionmetadata": 14, "airbytecontain": 14, "airbytedestin": 14, "airbytedestinationnamespac": 14, "airbytehq": 14, "airbyteio": 14, "airbytemanagedelementreconcil": 14, "airbyteoutput": 14, "airbyteresourc": 14, "airbytesourc": 14, "airbytesyncmod": 14, "airflow_db": 15, "airflow_execution_d": 15, "airflow_hom": 15, "airline_demo": 69, "airtabl": 14, "airtablesourc": 14, "album": 24, "alert": [42, 44, 52, 67, 69], "alert_email_password": 69, "algorithm": [16, 50], "alia": [4, 8, 9, 59, 63, 65], "alias": [8, 9], "align": [8, 18, 19, 20], "aliv": [11, 16, 50], "all": [2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 27, 33, 34, 37, 39, 40, 41, 42, 46, 48, 50, 52, 53, 63, 64, 65, 66, 67, 68, 69], "all_asset": 2, "all_asset_check": 2, "all_dbt_asset": 26, "all_ev": 8, "all_partitions_materi": 67, "alloc": [16, 50], "allow": [1, 2, 3, 4, 8, 9, 11, 12, 13, 14, 16, 21, 23, 26, 33, 38, 40, 42, 45, 46, 50, 51, 52, 59, 60, 62, 63, 64, 65, 66, 69], 
"allow_host_key_chang": 57, "allow_missing_partit": 12, "allow_nonexistent_upstream_partit": 64, "allow_retri": [26, 63], "allpartitionmap": 64, "allpartitionsmap": 64, "almost": 14, "aloha": 4, "along": [16, 23, 26, 50, 51], "alongsid": 26, "alreadi": [5, 11, 14, 16, 23, 39, 40, 50], "also": [2, 3, 5, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 26, 27, 34, 38, 40, 42, 44, 46, 50, 52, 53, 59, 61, 62, 63, 64, 66, 67, 69], "alter": 6, "altern": [1, 4, 11, 16, 53, 69], "alwai": [2, 8, 9, 13, 14, 16, 34, 40, 45, 60, 67, 68, 69], "am": [4, 64, 67], "amazon": [14, 16, 23], "amazonadssourc": 14, "amazonaw": [14, 16, 23, 40], "amazonec": 16, "amazons3": [14, 23], "amazonsellerpartnersourc": 14, "amazonsqsdestin": 14, "amazonsqssourc": 14, "america": [2, 26, 53, 64, 67], "amount": [14, 16, 23, 50, 59], "amplitud": 14, "amplitudesourc": 14, "amqp": 18, "an": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 41, 42, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69], "an_asset": 8, "an_existing_mlflow_run_id": 41, "an_op": 52, "analyt": [14, 34], "ancestor": [2, 8, 9, 13], "ani": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 29, 30, 31, 32, 33, 34, 38, 39, 40, 41, 42, 45, 47, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "annot": [4, 5, 28, 29, 30, 31, 34, 53, 54, 55, 60, 67, 68], "anonym": 63, "anoth": [7, 8, 9, 11, 14, 16, 26, 50, 68], "another_asset": 8, "answer": 14, "ant": [16, 50], "anyth": 14, "apach": [16, 34, 50, 56], "api": [2, 4, 5, 6, 7, 8, 9, 11, 14, 15, 16, 17, 21, 24, 26, 33, 34, 38, 39, 41, 42, 44, 45, 50, 52, 59, 61, 62, 63, 64, 66, 67, 68, 69], "api3": 14, "api_cli": 23, "api_kei": [14, 21, 24, 33, 59], "api_password": 14, "api_secret": [14, 16, 33], "api_serv": 14, "api_stepconfig": 16, "api_token": 14, "api_url": 14, "apicli": 23, "apidoc": 14, "apifi": 14, 
"apifydatasetsourc": 14, "apigroup": 40, "apikei": 14, "apikeyauthent": 14, "apikeyid": 14, "apikeysecret": 14, "apipassword": 14, "apirefer": 16, "apitoken": 14, "apitokencredenti": 14, "app": [14, 16, 19, 20, 24, 38, 50, 52], "app_id": [14, 16, 50], "app_kei": 24, "appauthexampl": 16, "appear": [4, 8, 12, 16, 23, 38, 50], "append": [14, 21, 23, 63], "appfollow": 14, "appfollowsourc": 14, "appid": 14, "appl": 64, "appli": [1, 2, 5, 8, 9, 11, 13, 14, 15, 16, 20, 23, 26, 27, 33, 40, 50, 59, 63, 64, 66, 67], "applic": [14, 16, 20, 23, 24, 34, 38, 40, 50, 56, 64], "application_argu": 56, "application_id": 14, "application_jar": 56, "application_secret": 14, "applylimitperuniquevalu": [11, 27, 40], "appropri": [4, 9, 14, 19, 20, 33, 40, 53, 54, 55, 68], "appsflyer": 14, "appsflyersourc": 14, "appstor": 14, "appstoresingersourc": 14, "ar": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 23, 26, 27, 33, 34, 39, 40, 43, 44, 45, 47, 48, 50, 51, 52, 53, 59, 60, 61, 63, 64, 65, 66, 67, 68], "arbitrari": [1, 2, 4, 6, 7, 8, 9, 11, 13, 23, 39, 51, 63, 65, 66], "archiv": [14, 34], "archiveuri": 34, "arctic": 64, "aren": [2, 14, 52], "arg": [3, 7, 8, 13, 15, 16, 19, 26, 34, 39, 40, 61], "argument": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, 26, 32, 34, 40, 45, 50, 51, 61, 63, 64, 65, 66, 67, 68], "arn": [14, 16], "around": [11, 12, 16, 17, 18, 19, 20, 50], "arouting_kei": 44, "arrai": [4, 7, 14, 16], "arrang": [8, 9], "arriv": 14, "articl": [14, 18], "artifact": [3, 8, 11, 16, 26, 50, 59, 63], "artifactid": [16, 50], "artifactori": [16, 50], "as_dagster_typ": [63, 68], "asana": 14, "asanasourc": 14, "ascend": [11, 14, 67], "asia": 34, "asid": [16, 50], "ask": [16, 50], "asktimeout": [16, 50], "assembl": 9, "assert": [4, 8, 11, 16, 66, 68], "assert_failur": 68, "assert_success": 68, "asset": [4, 5, 9, 11, 12, 13, 15, 16, 17, 21, 23, 25, 28, 29, 30, 31, 34, 35, 36, 38, 42, 52, 53, 54, 55, 59, 60, 62, 64, 65, 66, 67, 68], "asset0": 2, "asset0_valu": 2, 
"asset1": [2, 5, 8, 12, 16, 17, 34, 64, 65, 66, 67], "asset1_job": [64, 67], "asset1_job_schedul": [64, 67], "asset1_result": 2, "asset1_with_foo": 66, "asset2": [2, 5, 8, 12, 16, 17, 34, 64, 65, 66], "asset2_result": 2, "asset2_with_foo": 66, "asset_1": 67, "asset_2": 67, "asset_a": 67, "asset_b": 67, "asset_check": [1, 2, 5], "asset_check_select": 11, "asset_checks_def": 8, "asset_config": 8, "asset_def": 32, "asset_dep": 2, "asset_entri": 11, "asset_ev": 67, "asset_graph": [8, 9], "asset_info": 12, "asset_kei": [1, 2, 5, 8, 11, 12, 16, 17, 26, 33, 34, 52, 63, 65, 67], "asset_key_for_input": 8, "asset_key_for_output": 8, "asset_key_list": 2, "asset_key_prefix": [14, 21, 26, 33, 60], "asset_lay": [8, 9, 13], "asset_materi": [8, 12], "asset_materialization_fn": 67, "asset_materialization_plan": 8, "asset_materializt": 8, "asset_observ": 8, "asset_on": 5, "asset_one_check_on": 5, "asset_partit": [11, 63], "asset_partition_kei": 12, "asset_partition_key_for_input": 8, "asset_partition_key_for_output": 8, "asset_partition_key_rang": [8, 12], "asset_partition_key_range_for_input": 8, "asset_partition_key_range_for_output": 8, "asset_partition_keys_for_input": 8, "asset_partition_keys_for_output": 8, "asset_partitions_def": 12, "asset_partitions_def_for_input": 8, "asset_partitions_def_for_output": 8, "asset_partitions_subset": 12, "asset_partitions_time_window": 12, "asset_partitions_time_window_for_input": 8, "asset_partitions_time_window_for_output": [8, 26], "asset_select": [8, 11, 13, 52, 67], "asset_sensor": 67, "asset_spec": 32, "asset_that_uses_writ": [16, 25, 38, 66], "asset_to_invok": 8, "asset_two": 5, "asset_valu": 8, "asset_with_config": 4, "assetcheckevalu": 67, "assetcheckkei": 1, "assetcheckresult": [1, 26], "assetchecksdefinit": [1, 5, 8], "assetchecksever": 1, "assetcheckspec": [1, 2], "assetdefinit": 2, "assetdep": 2, "assetexecutioncontext": [8, 26], "assetin": [2, 5, 8, 12, 28, 29, 30, 31, 34, 35, 36, 53, 54, 55, 60, 64, 65], "assetkei": [1, 2, 5, 
8, 11, 12, 14, 15, 16, 17, 26, 33, 34, 60, 63, 65, 67], "assetlay": [8, 9], "assetmateri": [2, 8, 11, 12, 26, 60, 63, 67], "assetobserv": [8, 12, 26, 67], "assetout": [2, 8, 26], "assetpartitionstatu": 11, "assetrecord": 11, "assets_def": [2, 8], "assets_defs_by_kei": [2, 67], "assetscheckspec": 2, "assetsdefinit": [1, 2, 5, 8, 14, 15, 26, 33, 60, 65, 67], "assetselect": [2, 8, 26, 52, 67], "assetsensordefinit": 67, "assetspec": [2, 32], "assetvalueload": 2, "assign": [2, 11, 12, 14, 16, 17, 26, 34, 59], "assist": 24, "associ": [2, 3, 5, 6, 8, 9, 10, 11, 12, 14, 26, 33, 34, 63, 64, 65, 67], "asst": 2, "assum": [2, 11, 14, 20, 26, 40, 63, 64], "assumpt": 16, "async": [12, 63], "athlet": 14, "athlete_id": 14, "atla": 14, "atlant": 64, "atlassian": 14, "attach": [1, 2, 8, 9, 12, 13, 14, 16, 17, 26, 33, 34, 39, 46, 51, 60, 63, 64, 67, 68], "attempt": [2, 7, 8, 11, 14, 16, 46, 49, 50, 57, 63, 64], "attempt_num": 63, "attit": 23, "attribut": [2, 3, 7, 8, 14, 23, 34, 63, 66, 67], "attributes_to_return": 14, "attribution_window": 14, "audit": [16, 50], "auth": [14, 23, 34, 39, 40], "auth_method": 14, "auth_sourc": 14, "auth_ssh_kei": 14, "auth_token": [14, 26, 58], "auth_typ": 14, "auth_url": 14, "auth_user_password": 14, "authent": [14, 16, 17, 23, 34, 35, 36, 50, 53, 54, 55, 58, 59], "authenticateviagoogleoauth": 14, "authenticateviaharvestoauth": 14, "authenticateviamicrosoft": 14, "authenticateviamicrosoftoauth20": 14, "authenticateviaoauth20": 14, "authenticateviaretentlyoauth": 14, "authenticatewithapitoken": 14, "authenticatewithpersonalaccesstoken": 14, "authenticationmethod": 14, "authenticationviagoogleoauth": 14, "author": [8, 9, 11, 14, 44, 63], "author_year_end": 14, "author_year_start": 14, "auto": [2, 14, 26, 34], "auto_commit_interval_m": 14, "auto_materialize_polici": [2, 26], "auto_materialize_policies_by_kei": 2, "auto_materialize_policies_by_output_nam": 2, "auto_observe_interval_minut": 2, "auto_offset_reset": 14, "autocommit": [16, 53], "autom": [38, 
40], "automat": [2, 3, 4, 5, 7, 8, 14, 16, 23, 26, 33, 39, 46, 47, 50, 59, 61, 66], "automaterializepolici": [2, 14, 26], "automaterializerul": 2, "automatic_reconnect": 14, "autosc": 23, "autoscal": 23, "avail": [3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 23, 26, 27, 34, 39, 40, 47, 50, 60, 61, 66, 67, 68], "avoid": [5, 11, 14, 16, 23, 34, 35, 36, 50, 53, 54, 55, 63, 65, 67], "avro": 14, "avroapacheavro": 14, "aw": [14, 23, 40], "aws_access_kei": 14, "aws_access_key_id": [14, 16, 41], "aws_account_id": [14, 16, 40], "aws_attribut": 23, "aws_environ": 14, "aws_key_id": 14, "aws_key_secret": 14, "aws_region": 16, "aws_region_nam": 14, "aws_secret_access_kei": [14, 16, 41], "aws_secret_kei": 14, "aws_session_token": 16, "awsavail": 23, "awscloudtrailsourc": 14, "awsdatalakedestin": 14, "awss3stag": 14, "azblobazureblobstorag": 14, "azur": [14, 23], "azure_blob_storage_account_kei": 14, "azure_blob_storage_account_nam": 14, "azure_blob_storage_container_nam": 14, "azure_blob_storage_endpoint_domain_nam": 14, "azure_blob_storage_output_buffer_s": 14, "azure_blob_storage_sas_token": 14, "azure_data_lake_storage_kei": 17, "azureblobcomputelogmanag": 17, "azureblobstorag": 14, "azureblobstoragedestin": 14, "azureblobstoragestag": 14, "azuredatabrick": 23, "azuretablesourc": 14, "b": [2, 8, 11, 15, 59, 63, 64, 67], "b30e7ede77df": 14, "back": [8, 11, 16, 17, 18, 20, 26, 34, 40, 43, 48, 50, 53, 54, 55, 60, 67], "backend": [16, 19, 20, 50], "backendconnectiontimeout": [16, 50], "backfil": [2, 3, 8, 67], "backfill_polici": 2, "backfillpolici": [2, 64], "background": [3, 14, 18], "backlog": [16, 50], "backoff": 63, "backoff_delai": 63, "backpressur": [16, 50], "backward": [16, 50], "bad": 4, "badg": 2, "balthazar": 18, "bamboo": 14, "bamboohr": 14, "bamboohrsourc": 14, "bar": [4, 8, 11, 12, 16, 23, 24, 33, 34, 50, 63, 65, 66, 68], "bare": [4, 7], "base": [2, 4, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 25, 26, 28, 29, 30, 31, 34, 35, 36, 38, 40, 42, 43, 45, 48, 50, 
51, 52, 53, 54, 55, 59, 60, 62, 63, 64, 66, 67, 68, 69], "base64": [34, 35, 36, 53, 54, 55], "base_dir": [11, 12, 16, 17, 34, 59, 60], "base_id": 14, "base_path": 12, "base_storag": 11, "base_url": 14, "baselin": 59, "basemodel": 4, "bash": 51, "basi": 40, "basic": [8, 9, 14, 33, 34], "basicprofil": [16, 50], "bat": 8, "batch": [14, 16, 40, 50], "batch_kwarg": 37, "batch_siz": 14, "batching_en": 14, "batching_max_messag": 14, "batching_max_publish_delai": 14, "baz": [8, 12, 33, 63], "bb852df4077e": 59, "bce": 14, "bearer": 14, "bearer_token": 14, "becaus": [2, 7, 8, 11, 16, 17, 23, 26, 50, 67], "becom": [4, 7, 14, 45, 52, 61, 66, 68], "been": [2, 7, 8, 11, 13, 14, 16, 50, 65, 67], "befor": [2, 4, 5, 11, 14, 16, 20, 21, 23, 26, 33, 34, 35, 36, 39, 40, 49, 50, 51, 52, 53, 63, 64, 67], "before_cursor": 11, "before_timestamp": 11, "begin": [14, 16, 50, 52], "begin_tim": 14, "behalf": [34, 67], "behav": 14, "behavior": [8, 12, 13, 14, 15, 16, 23, 26, 50, 63], "behind": [16, 50, 67], "being": [1, 2, 4, 8, 9, 10, 11, 12, 14, 16, 23, 26, 34, 50, 61, 63, 66, 67, 68], "belong": [2, 8, 9, 10, 14, 16, 23, 34, 50, 67], "below": [4, 16, 23, 26, 34, 38, 40, 50, 53], "bertovi\u0107": 18, "bespok": 65, "best": 15, "beta": 34, "better": [14, 16, 50], "between": [2, 8, 9, 11, 14, 16, 21, 26, 28, 32, 33, 34, 50, 51, 52, 53, 59, 63, 64, 66, 67], "beyond": [14, 16, 50, 63], "big": 14, "big_query_client_buffer_size_mb": 14, "bigcommerc": 14, "bigcommercesourc": 14, "bigger": [14, 16, 50], "bigqueri": 14, "bigquery_io_manag": 34, "bigquery_pandas_io_manag": 35, "bigquery_pyspark_io_manag": 36, "bigquery_resourc": 34, "bigquerydenormalizeddestin": 14, "bigquerydestin": 14, "bigqueryerror": 34, "bigqueryiomanag": [34, 35, 36], "bigquerypandasiomanag": 35, "bigquerypandastypehandl": [34, 35], "bigquerypysparkiomanag": 36, "bigquerypysparktypehandl": 36, "bigqueryresourc": 34, "bigquerysourc": 14, "bigtabl": 34, "billion": 14, "bin": 40, "binari": [16, 34, 50], "binaryio": 11, "bind": [5, 13, 
16, 50, 53], "bindaddress": [16, 50], "bing": 14, "bingadssourc": 14, "binlog": 14, "birth": 14, "bit": 2, "bitnami": 40, "blacklist": [16, 50], "blank": [11, 14, 33], "blob": [8, 14, 17, 23, 26], "block": [11, 14, 16, 17, 18, 34, 48, 50, 52], "block_if_queue_ful": 14, "block_siz": 14, "block_size_mb": 14, "blockinterv": [16, 50], "blockmanag": [16, 50], "blockmanagerslavetimeoutm": [16, 50], "blocks_fn": 52, "blocksiz": [16, 50], "blog": [18, 58], "blue": 24, "board": 14, "board_id": 14, "bodi": [2, 7, 8, 9, 12, 13, 14, 38, 39, 52, 61, 62, 63, 66, 69], "book": 14, "bookmark": 14, "bookshelv": 14, "bool": [1, 2, 4, 7, 8, 9, 11, 13, 14, 15, 16, 20, 21, 23, 26, 33, 34, 39, 40, 42, 45, 48, 50, 52, 53, 60, 62, 63, 64, 65, 67, 68, 69], "boolean": [4, 8, 62, 63, 67], "boolmetadatavaluy": 63, "boolsourc": [4, 14, 16, 25, 26, 33, 42, 53, 54, 55, 57], "boostrap": 25, "boot": 34, "bootdisksizegb": 34, "bootdisktyp": 34, "bootstrap": [3, 14, 16, 25, 40], "bootstrap_serv": 14, "born": 14, "bot": [14, 52], "both": [4, 5, 8, 11, 12, 14, 15, 16, 17, 23, 24, 25, 26, 37, 38, 40, 50, 53, 54, 55, 66, 67], "boto": 16, "boto3": 16, "botocor": 16, "bound": [2, 5, 14, 16, 50, 64, 67], "boundari": [5, 7, 8, 11, 13, 63], "bq": 34, "bq_create_dataset": 34, "bq_delete_dataset": 34, "bq_op_for_queri": 34, "braintre": 14, "braintreesourc": 14, "branch": [14, 53], "brand": 14, "break": [2, 14, 15, 26, 63, 66], "breakdown": 14, "breakpoint": 69, "brew": 40, "bridg": [16, 50], "broadcast": [16, 50], "broker": [14, 19, 20], "broker_host": 14, "broker_port": 14, "broker_url": 18, "browser": 24, "bu": [16, 50], "bucket": [4, 14, 16, 23, 34, 35, 36, 68], "bucket_bi": 11, "bucket_id": 14, "bucket_nam": 14, "bucket_prefix": [4, 14], "buffer": [14, 16, 50, 51], "buffer_memori": 14, "buffer_s": 14, "bufferediobas": 60, "buffers": [14, 16, 50], "bug": [53, 54, 55], "build": [3, 8, 9, 10, 12, 13, 14, 26, 28, 32, 33, 34, 40, 53, 61, 63, 66, 67], "build_add_request": 64, "build_airbyte_asset": 14, 
"build_asset_context": 8, "build_bigquery_io_manag": 34, "build_dbt_asset_select": 26, "build_duckdb_io_manag": 28, "build_fivetran_asset": 33, "build_freshness_policy_sensor_context": 67, "build_hook_context": 10, "build_init_logger_context": 61, "build_init_resource_context": 66, "build_input_context": 12, "build_multi_asset_sensor_context": 67, "build_op_context": [8, 16], "build_output_context": 12, "build_reconstructable_job": [8, 13], "build_resourc": 66, "build_run_status_sensor_context": 67, "build_schedule_context": 67, "build_schedule_from_dbt_select": 26, "build_schedule_from_partitioned_job": [64, 67], "build_sensor_context": 67, "build_sling_asset": 32, "build_snowflake_io_manag": 53, "buildkit": 40, "buildup": [16, 50], "built": [4, 8, 11, 14, 16, 26, 32, 45, 50, 60], "builtin": [4, 21, 69], "bulk": 34, "bundl": [14, 16], "busi": [14, 68], "busybox": [27, 40], "button": 14, "bypass": [26, 40, 63], "bypass_cach": 26, "bypassmergethreshold": [16, 50], "byte": [11, 14, 16, 50, 63], "bzip2": 14, "c": [2, 40, 64], "c_": 60, "ca": [14, 16], "ca_certif": 14, "ca_certificate_path": 25, "cach": [2, 3, 14, 16, 26, 50, 53, 59, 65], "cachableassetsdefinit": 2, "cache_column_metadata": 53, "cache_duration_in_minut": 59, "cache_typ": 14, "cacheabl": 2, "cacheableassetsdefinit": [2, 5, 26], "cachedexecutoridletimeout": [16, 50], "cadenc": [64, 67], "calcul": [2, 16, 50, 63, 64], "calculate_byt": 63, "call": [2, 3, 4, 7, 8, 11, 12, 13, 14, 16, 17, 22, 23, 26, 27, 34, 37, 40, 46, 47, 50, 53, 60, 61, 62, 64, 65, 67, 68], "call_user_provided_funct": 11, "callabl": [4, 10, 11, 13, 14, 16, 26, 33, 42, 45, 47, 52, 61, 63, 64, 65, 66, 67, 68, 69], "callback": 10, "caller": [16, 34, 50], "callercontext": [16, 50], "camelcas": 40, "campaign": 14, "can": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 42, 43, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 
69], "can_attach_to": 23, "can_manag": 23, "can_manage_run": 23, "can_restart": 23, "can_subset": 2, "can_view": 23, "cancel": [11, 14, 23], "cancel_and_wait": 16, "cancel_run": 23, "cancel_sync_on_run_termin": 14, "candid": 2, "canned_acl": 23, "cannot": [2, 4, 7, 8, 14, 16, 17, 23, 34, 35, 36, 39, 50, 59, 60, 63], "canon": 14, "capabl": [20, 40], "capac": [16, 34, 50], "captur": [11, 12, 14, 16, 17, 26, 34, 69], "capture_log": 26, "captured_log_manag": 11, "capturedlogmanag": 11, "card": 42, "care": [2, 8, 9, 13], "cart": 14, "cartsourc": 14, "case": [8, 9, 11, 12, 14, 16, 18, 19, 20, 26, 27, 33, 40, 45, 50, 51, 63, 64, 65, 68, 69], "cassandra": 14, "cassandradestin": 14, "cat": [34, 35, 36, 51, 53, 54, 55], "catalog": [14, 26, 67], "catch": 7, "categori": 14, "caus": [2, 3, 10, 14, 16, 34, 50], "caution": [14, 16, 50], "cdc": 14, "ce": 14, "celeri": 40, "celery_docker_executor": 19, "celery_docker_job_executor": 19, "celery_enabled_job": [18, 19, 20], "celery_executor": 18, "celery_k8s_job_executor": 20, "celeryk8srunlaunch": 20, "celeryq": [18, 19, 20], "censu": 14, "census_api_kei": 21, "census_resourc": 21, "census_sync_op": 21, "census_trigger_sync_op": 21, "censusoutput": 21, "censusresourc": 21, "center": 14, "central": [8, 10, 16, 50, 61], "central1": 34, "centralapirout": 14, "cereal": 14, "cereals_connect": 14, "cereals_csv_sourc": 14, "cert": 16, "certain": [1, 2, 8, 11, 13, 14, 16, 33, 45, 50, 51, 53, 60, 63], "certif": [14, 16, 34, 42], "chain": [16, 50], "chang": [2, 3, 11, 14, 16, 18, 34, 40, 52, 53, 62, 64, 67], "changelog": 14, "channel": [3, 14, 26, 42, 52], "channel_filt": 14, "char": 14, "charact": [14, 16, 23, 34, 50, 59], "chargebe": 14, "chargebeesourc": 14, "chargifi": 14, "chargifysourc": 14, "chartmogul": 14, "chartmogulsourc": 14, "chat": [14, 52], "chat_postmessag": [26, 52], "check": [2, 4, 5, 7, 8, 11, 14, 16, 23, 26, 37, 45, 46, 50, 51, 59, 60, 62, 63, 65, 66, 67, 68], "check_cluster_everi": 16, "check_dagster_typ": 68, "check_nam": 
[1, 24], "check_result": 2, "check_spec": 2, "check_specs_by_output_nam": 2, "checker": 4, "checkerror": [45, 68], "checkpoint": [16, 50], "checkpointinterv": [16, 50], "checks_for_asset": 2, "checksum": [16, 50], "child": [4, 7, 8, 9, 13, 63, 69], "children": 2, "choic": 13, "choos": [14, 23], "chose": 14, "chosen": [14, 16], "chunk": [8, 11, 14, 16, 50], "cid": 14, "circumst": [16, 50], "cl": [26, 68], "claim": 40, "class": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 43, 45, 46, 47, 48, 49, 50, 53, 54, 55, 56, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "class_nam": 11, "classestoregist": [16, 50], "classmethod": [14, 26], "classpath": [16, 34, 50], "clean": [14, 16, 50, 66], "clean_sess": 14, "cleancheckpoint": [16, 50], "cleaned_custom": 26, "cleaner": [16, 50], "cleanup": [16, 40, 50, 66], "clear": [3, 16, 50, 67], "cli": [5, 14, 40], "click": [14, 52], "clickhous": 14, "clickhousedestin": 14, "clickhousesourc": 14, "client": [3, 14, 16, 17, 22, 23, 34, 41, 50, 52, 53, 59], "client_certif": 14, "client_credenti": 14, "client_dns_lookup": 14, "client_id": 14, "client_kei": 14, "client_key_password": 14, "client_prefetch_thread": 53, "client_secret": 14, "client_session_keep_al": 53, "clientnam": 14, "clone": [6, 16, 50], "cloneconf": [16, 50], "close": [8, 11, 14, 16, 50, 53, 64], "closecomsourc": 14, "closefileafterwrit": [16, 50], "cloud": [11, 12, 14, 15, 23, 34, 39, 53], "cloudflar": 14, "cloudtrail": 14, "cloudwatch_logg": 16, "cluster": [11, 14, 16, 20, 22, 23, 34, 50], "cluster_config": 34, "cluster_config_dict": 34, "cluster_config_json_path": 34, "cluster_config_yaml_path": 34, "cluster_id": 16, "cluster_log_conf": 23, "cluster_nam": 34, "cluster_permiss": 23, "cluster_url": 14, "clusterconfig": 34, "clusternam": 34, "cmd": 27, "cn": 4, "coars": [16, 50], "coarser": 64, "cockroachdb": 14, "cockroachdbsourc": 14, "code": [2, 3, 5, 7, 8, 10, 11, 14, 16, 23, 26, 34, 38, 44, 
50, 51, 59, 60, 61, 62, 63, 66, 67], "code_server_log_level": 3, "code_vers": [2, 6, 63], "codec": [14, 16, 50], "codelocationselector": 67, "codelocationsensor": 52, "coeercibletoassetkei": 2, "coerc": [5, 63], "coercibletoassetkei": 8, "coercibletoassetkeyprefix": [14, 33], "cogroup": [16, 50], "cohort": 14, "col": 63, "col_a": 63, "col_b": 63, "collect": [4, 7, 8, 9, 12, 14, 16, 50, 60, 63, 64], "collis": 34, "color": [24, 26], "colored_console_logg": [8, 13, 61], "column": [1, 2, 11, 14, 28, 29, 30, 31, 34, 35, 36, 45, 46, 53, 54, 55, 60, 63], "columnstor": 14, "com": [14, 16, 18, 19, 20, 23, 24, 26, 34, 38, 40, 42, 50, 52, 58, 59, 63, 69], "coma": 14, "combin": [1, 2, 9, 26, 51, 63], "come": [2, 9, 14, 16, 50, 67], "comma": [14, 16, 25, 50], "command": [3, 14, 16, 18, 19, 20, 26, 27, 34, 35, 36, 40, 50, 51, 53, 54, 55, 56], "comment": 14, "commercetool": 14, "commercetoolssourc": 14, "commit": 14, "committ": [16, 50], "committransact": 14, "common": [2, 11, 14, 18, 19, 20, 34, 40], "commonli": 64, "commun": [11, 14, 16, 21, 33, 34, 50, 59, 63, 68], "compani": [14, 40, 52], "companynam": 14, "compar": 59, "compat": [12, 14, 16, 17, 34, 50], "compil": [15, 26], "complet": [8, 9, 11, 12, 13, 14, 16, 17, 20, 21, 23, 26, 33, 34, 35, 36, 40, 45, 50, 59, 63, 67], "completekei": 69, "complex": 65, "complex_repositori": 65, "complex_solid": 69, "complexrepositorydata": 65, "complic": [5, 69], "compon": [4, 8, 9, 11, 12, 14, 16, 17, 21, 26, 33, 34, 48], "compos": [2, 18, 26, 60, 63], "compose_fn": [2, 9, 13], "composit": [6, 9, 14], "compress": [14, 16, 50, 57], "compression_codec": 14, "compression_level": 14, "compression_typ": 14, "comput": [1, 2, 6, 7, 8, 9, 10, 12, 14, 16, 17, 20, 23, 26, 33, 39, 40, 47, 50, 59, 60, 61, 62, 63, 67, 68], "compute_fn": [2, 63], "compute_kind": [1, 2], "compute_log": [16, 17, 34], "compute_log_manag": [11, 16, 17, 34], "compute_logs_data": 11, "computelogmanag": 11, "computemetadata": 34, "con": 2, "concaten": [2, 60], "concept": [8, 
20, 26, 40], "concert": [7, 20], "concis": [16, 50], "concret": [9, 11], "concurr": [3, 8, 11, 12, 14, 16, 27, 40, 50], "condit": [11, 14, 67], "conf": [16, 23, 34, 50], "config": [1, 2, 3, 5, 7, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 29, 30, 31, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 65, 66, 67, 68, 69], "config_dict": [4, 32], "config_field": [8, 66], "config_fil": 69, "config_fn": 4, "config_from_fil": 69, "config_from_pkg_resourc": 69, "config_from_yaml_str": 69, "config_map": [8, 9, 13], "config_or_config_fn": 11, "config_schema": [1, 2, 4, 6, 11, 12, 16, 47, 60, 61, 63, 65, 66, 68], "config_sourc": [18, 19, 20], "config_valu": [4, 7, 68], "config_yaml": [11, 18], "configbucket": 34, "configmap": [2, 4, 8, 9, 13, 20, 40], "configmapenvsourc": [20, 40], "configschema": [1, 2, 4, 11, 12, 16, 47, 60, 61, 63, 66, 68], "configu": 4, "configur": [1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 39, 40, 42, 43, 47, 48, 50, 51, 52, 53, 54, 55, 56, 59, 60, 61, 64, 65, 66, 67, 69], "configurableclass": 11, "configurableclassdata": [11, 16, 17, 34], "configurabledefinit": [4, 11], "configurableexternaliomanag": 12, "configurableiomanag": 12, "configurableiomanagerfactori": 12, "configurablelocaloutputnotebookiomanag": 60, "configurablepickledobjectadls2iomanag": 17, "configurablepickledobjectgcsiomanag": 34, "configurablepickledobjects3iomanag": 16, "configurableresourc": [16, 25, 38, 66], "conflict": [8, 16, 34, 39, 50, 66], "confluenc": 14, "confluencesourc": 14, "conform": [2, 4, 8, 9, 13, 15, 34, 39, 63], "confus": [11, 63], "conjunct": 4, "conn": [28, 53], "conn_str": 11, "connect": [3, 9, 11, 12, 14, 15, 16, 17, 19, 22, 25, 26, 27, 32, 33, 34, 39, 40, 42, 49, 50, 52, 53, 57, 58, 63, 66], "connect_timeout": [14, 16], "connect_timeout_sec": 25, "connection_data": 14, "connection_directori": 14, "connection_filt": 14, 
"connection_id": 14, "connection_str": 32, "connection_to_asset_key_fn": 14, "connection_to_auto_materialize_policy_fn": 14, "connection_to_freshness_policy_fn": 14, "connection_to_group_fn": 14, "connection_to_io_manager_key_fn": 14, "connection_typ": 14, "connectionerror": 39, "connectiontimeout": [16, 50], "connector": [14, 26, 33, 42, 53], "connector_filt": 33, "connector_id": 33, "connector_to_asset_key_fn": 33, "connector_to_group_fn": 33, "connector_to_io_manager_key_fn": 33, "consecut": [14, 16, 50], "consequ": [8, 13], "conserv": [8, 13], "consid": [11, 14, 16, 23, 50, 51, 59, 67, 68], "consider": [16, 50], "consist": [2, 12, 26, 63, 64], "consol": [8, 13, 14, 16, 26, 34, 50], "consolid": 11, "consolidatedsqliteeventlogstorag": 11, "constant": 4, "constitu": [2, 8, 9, 46], "constraint": [2, 39, 45, 63], "construct": [2, 7, 8, 9, 11, 12, 13, 14, 15, 18, 19, 20, 26, 45, 46, 51, 53, 56, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "construct_spark_shell_command": 56, "constructor": [6, 7, 16, 18, 19, 20, 50, 63, 65, 69], "consult": 11, "consum": [2, 11, 14, 16, 50, 59, 63, 67], "consumer_kei": 14, "consumer_secret": 14, "consumpt": [16, 50], "contact": 18, "contain": [2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 42, 45, 46, 50, 52, 53, 54, 55, 59, 60, 63, 64, 65, 66, 67, 68, 69], "container_config": [20, 40], "container_context": 3, "container_imag": 3, "container_kwarg": [19, 27], "container_nam": 16, "content": [2, 5, 8, 14, 16, 23, 26, 50, 53, 65], "context": [2, 3, 4, 5, 6, 7, 9, 10, 11, 13, 16, 17, 20, 23, 24, 26, 27, 34, 39, 40, 41, 42, 45, 47, 50, 51, 52, 53, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "context_": [45, 68], "context_manager_resourc": [8, 10, 12, 66], "contextlib": 66, "contextmanag": 66, "contigu": 8, "continu": [14, 34], "continueonfailur": 34, "contrain": 4, "contrast": [20, 27, 40], "contribut": 11, "control": [11, 14, 16, 23, 50, 63, 64, 65, 67], "conveni": [5, 26, 61, 
69], "convent": [14, 15], "convers": [14, 64], "conversion_window_dai": 14, "convert": [4, 26, 46, 53, 54, 55, 67, 69], "cool": [16, 17, 34, 63], "coordin": [16, 50], "copi": [2, 11, 14, 16, 23, 34, 50, 52, 63, 66], "copy_handle_to_local_temp": 11, "copyright": 14, "core": [2, 7, 9, 11, 16, 20, 34, 40, 47, 50, 61, 63, 66, 67], "core_concept": 37, "corpor": 14, "correct": [2, 7, 63], "correctli": [16, 23, 40, 50], "correpond": 2, "correspond": [2, 3, 4, 6, 8, 11, 12, 14, 15, 18, 19, 20, 21, 23, 26, 33, 34, 40, 63, 64, 65, 66, 67], "corrupt": [16, 50], "cost": [16, 50], "costli": 65, "could": [2, 4, 5, 14, 16, 23, 50], "count": [14, 16, 24, 45], "counter": 14, "counter_id": 14, "countri": 14, "country_cod": 14, "coupl": [2, 14], "courier": 14, "couriersourc": 14, "cover": 12, "covid": 14, "cowboytyp": 4, "cpu": [16, 50], "cpu_count": 8, "crash": [16, 50], "creat": [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 22, 23, 24, 26, 27, 32, 33, 34, 35, 36, 38, 40, 42, 44, 50, 52, 53, 58, 59, 60, 63, 64, 65, 66, 67, 68, 69], "create_assets_for_normalization_t": 14, "create_dagster_pandas_dataframe_typ": 45, "create_databricks_run_now_op": 23, "create_databricks_submit_run_op": 23, "create_dataset": 59, "create_db_connect": 2, "create_fresh_databas": 53, "create_io_manag": 12, "create_issu": 38, "create_k8s_job_task": 20, "create_registered_model": 41, "create_repository_using_definitions_arg": 5, "create_shell_command": 51, "create_shell_command_op": 51, "create_shell_script_op": 51, "create_spark_op": 56, "create_task": 18, "create_timestamp": 11, "created_aft": 11, "created_at": 14, "created_befor": 11, "creation": [5, 14, 16, 19, 23, 27, 40, 50, 65], "creativ": 14, "credenti": [14, 16, 17, 20, 23, 34, 35, 36, 40, 52], "credential_typ": 14, "credentials_json": 14, "credentials_titl": 14, "criteria": [1, 2, 8, 9, 13, 14, 33, 51, 60, 63], "critic": [3, 61], "crm": 14, "cron": [2, 11, 26, 64, 67], "cron_schedul": [2, 26, 64, 65, 67], "cron_schedule_timezon": 2, 
"cron_up_to_date_asset": 2, "cross": [8, 11, 13, 23, 34, 64], "crossrealmtrustadminserv": 34, "crossrealmtrustkdc": 34, "crossrealmtrustrealm": 34, "crossrealmtrustsharedpassworduri": 34, "csv": [4, 14, 32], "csv_loader": 12, "csv_loader_kei": 12, "csvcommaseparatedvalu": 14, "csvdestin": 14, "curiou": 11, "curl": [14, 34], "currenc": 14, "current": [2, 3, 8, 10, 11, 12, 14, 16, 20, 21, 23, 26, 32, 33, 34, 40, 42, 50, 52, 62, 63, 64, 66, 67, 69], "current_tim": [8, 13, 64], "current_valu": 7, "curri": 4, "cursor": [3, 11, 14, 53, 67], "cursor_field": 14, "cursor_from_latest_materi": 67, "custom": [2, 4, 8, 11, 12, 14, 16, 20, 23, 26, 34, 35, 36, 39, 40, 45, 50, 53, 63, 65, 68], "custom_group_prefix": 26, "custom_insight": 14, "custom_instance_class_data": 11, "custom_queri": 14, "custom_report": 14, "custom_reports_field": 14, "custom_reports_include_default_field": 14, "custom_service_account": 34, "custom_tag": 23, "custom_typ": 63, "customdagsterdbttransl": 26, "customer_id": 14, "customgaqlqueriesentri": 14, "cwd": 51, "cyclic": 66, "d": [2, 3, 14, 18, 33, 51, 53, 64, 67], "d180": 14, "d30": 14, "d7": 14, "d90": 14, "d9971c84d44d47f382a2928c8c161faa": 40, "daemon": [5, 11, 34, 67], "dag": [8, 9, 15, 16, 40, 50], "dag_bag": 15, "dag_path": 15, "dag_run": 15, "dag_run_config": 15, "dagbag": 15, "daggraph": [16, 50], "dagit": [3, 42, 52], "dagit_base_url": [42, 52], "dagredi": 40, "dagrun": 15, "dagster": [1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 27, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "dagster_airbyt": 14, "dagster_airflow": 15, "dagster_attribut": 3, "dagster_auto_materialize_polici": 26, "dagster_aw": [11, 16, 68], "dagster_azur": 17, "dagster_bigquery_panda": [34, 35, 36], "dagster_celeri": [18, 19, 20], "dagster_celery_broker_host": [18, 19, 20], "dagster_celery_dock": 19, "dagster_celery_k8": [18, 20], "dagster_censu": 21, "dagster_conn_id": 15, "dagster_container_context": 3, "dagster_container_imag": 3, "dagster_daemon_log_level": 3, 
"dagster_dask": 22, "dagster_databrick": 23, "dagster_datadog": 24, "dagster_datahub": 25, "dagster_dbt": 26, "dagster_dbt_transl": 26, "dagster_dock": 27, "dagster_docker_imag": 40, "dagster_docker_image_tag": 40, "dagster_docker_repositori": 40, "dagster_duckdb": [28, 29, 30, 31], "dagster_duckdb_panda": [28, 29], "dagster_duckdb_polar": 30, "dagster_duckdb_pyspark": 31, "dagster_embedded_elt": 32, "dagster_empty_working_directori": 3, "dagster_etl": 32, "dagster_ev": [11, 26, 60, 67], "dagster_event_typ": 11, "dagster_exampl": 69, "dagster_fivetran": 33, "dagster_freshness_polici": 26, "dagster_g": 37, "dagster_gcp": [34, 35, 36], "dagster_gcp_panda": 35, "dagster_gcp_pyspark": 36, "dagster_github": 38, "dagster_graphql": 39, "dagster_grpc_host": 3, "dagster_grpc_port": 3, "dagster_grpc_socket": 3, "dagster_handl": 61, "dagster_hom": [3, 8, 11, 12, 19, 20, 40, 43, 48, 59], "dagster_imag": 40, "dagster_inject_env_vars_from_inst": 3, "dagster_inst": 67, "dagster_instance_ref": 3, "dagster_k8": [20, 40], "dagster_lazy_load_user_cod": 3, "dagster_location_nam": 3, "dagster_log_level": 3, "dagster_mlflow": 41, "dagster_module_nam": 3, "dagster_msteam": 42, "dagster_mysql": [11, 43], "dagster_package_nam": 3, "dagster_pagerduti": 44, "dagster_panda": [37, 45], "dagster_pandera": 46, "dagster_papertrail": 47, "dagster_pg_password": [20, 40], "dagster_postgr": [11, 48], "dagster_prometheu": 49, "dagster_pyspark": 50, "dagster_python_fil": 3, "dagster_redshift_password": 16, "dagster_run": [8, 11, 42, 52, 63, 66, 67], "dagster_shel": 51, "dagster_slack": [26, 52], "dagster_snowflak": [53, 54, 55], "dagster_snowflake_panda": [53, 54, 55], "dagster_snowflake_pyspark": [53, 54, 55], "dagster_spark": 56, "dagster_ssh": 57, "dagster_stag": 23, "dagster_test": 40, "dagster_twilio": 58, "dagster_typ": [2, 4, 5, 6, 7, 8, 9, 12, 37, 45, 63, 65, 68], "dagster_type_load": [8, 45, 68], "dagster_use_python_environment_entry_point": 3, "dagster_wandb": 59, "dagster_webserv": 3, 
"dagster_webserver_log_level": 3, "dagster_webserver_port": 3, "dagster_working_directori": 3, "dagsterassetmetadatavalu": 63, "dagstercloudoper": 15, "dagsterconfigmappingfunctionerror": 7, "dagsterdaemonschedul": 67, "dagsterdbtclifatalruntimeerror": 26, "dagsterdbtclihandledruntimeerror": 26, "dagsterdbtclioutputsnotfounderror": 26, "dagsterdbtcliruntimeerror": 26, "dagsterdbtcliunexpectedoutputerror": 26, "dagsterdbterror": 26, "dagsterdbttransl": 26, "dagstererror": 7, "dagsterev": [8, 10, 11, 67], "dagstereventloginvalidforrun": 7, "dagstereventtyp": [8, 11], "dagsterexecutionstepexecutionerror": [7, 11], "dagsterexecutionstepnotfounderror": 7, "dagstergraphqlcli": 39, "dagstergraphqlclienterror": 39, "dagsterinst": [3, 7, 8, 9, 10, 11, 13, 64, 66, 67], "dagsterinstanceoverrid": 11, "dagsterinvalidconfigdefinitionerror": 7, "dagsterinvalidconfigerror": [4, 7], "dagsterinvaliddefinitionerror": 7, "dagsterinvalidinvocationerror": 26, "dagsterinvalidsubseterror": 7, "dagsterinvariantviolationerror": [7, 8, 66, 69], "dagsterlogmanag": [8, 11, 51, 61, 66], "dagstermil": [8, 13], "dagstermillerror": 60, "dagstermillexecutioncontext": 60, "dagsteroper": 15, "dagsterpanderadatafram": 46, "dagsterresourcefunctionerror": 7, "dagsterrun": [8, 11, 60, 67], "dagsterrunconflict": 39, "dagsterrunmetadatavalu": 63, "dagsterrunnotfounderror": 7, "dagsterrunreact": 67, "dagsterrunstatu": [11, 39, 67], "dagsterstepoutputnotfounderror": 7, "dagsterstorag": 11, "dagstersubprocesserror": 7, "dagstertyp": [2, 7, 8, 9, 12, 16, 37, 45, 46, 63, 68], "dagstertypecheckdidnotpass": 7, "dagstertypecheckerror": 7, "dagstertypekind": [45, 68], "dagstertypeload": [45, 68], "dagstertypeloadercontext": 68, "dagsterunknownresourceerror": 7, "dagsterunmetexecutorrequirementserror": 7, "dagsterusercodeexecutionerror": [7, 11], "dagsterwebserv": 40, "dai": [14, 59, 64, 67], "daili": [14, 16, 50, 64, 67], "daily_123": 64, "daily_dbt_assets_schedul": 26, "daily_partitioned_config": [64, 67], 
"dailypartitiondefinit": 26, "dailypartitionsdefinit": [8, 26, 64, 67], "dash": 14, "dashboard": [14, 16, 50, 58, 63], "dashboard_url": 63, "dask_enabled_job": 22, "dask_executor": 22, "data": [1, 2, 4, 6, 8, 9, 11, 12, 14, 16, 17, 20, 23, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 45, 46, 47, 50, 51, 53, 54, 55, 60, 61, 62, 63, 64, 66, 68], "data_queri": 14, "data_sourc": 14, "data_source_typ": 14, "data_to_sync": 14, "data_vers": [2, 63], "databas": [2, 3, 11, 12, 14, 15, 16, 26, 28, 29, 30, 31, 32, 34, 35, 36, 40, 43, 53, 54, 55, 63, 64, 66, 67], "database_schema": 14, "databrick": 14, "databricks_cli": 23, "databricks_host": 23, "databricks_http_path": 14, "databricks_job_configur": 23, "databricks_job_id": 23, "databricks_personal_access_token": 14, "databricks_port": 14, "databricks_pyspark_step_launch": 23, "databricks_resource_kei": 23, "databricks_server_hostnam": 14, "databricks_token": 23, "databrickscli": 23, "databricksclientresourc": 23, "databricksdestin": 14, "databrickserror": 23, "datacent": [14, 23], "datadog_cli": 24, "datadog_op": 24, "datadog_resourc": 24, "datadogcli": 24, "datadoghq": 24, "datadogpi": 24, "datadogresourc": 24, "datafram": [1, 2, 28, 29, 30, 31, 34, 35, 36, 37, 45, 46, 50, 53, 54, 55, 60], "dataframe_constraint": 45, "dataframe_load": 45, "dataframeconstraint": 45, "dataframeschema": 46, "datahub_kafka_emitt": 25, "datahub_rest_emitt": 25, "datahubkafkaemitterresourc": 25, "datahubrestemitterresourc": 25, "datalak": 14, "datalakeservicecli": 17, "dataproc_op": 34, "dataproc_resourc": 34, "dataproccli": 34, "dataprocresourc": 34, "dataproven": 8, "dataset": [14, 34, 35, 36, 37, 59, 60, 63], "dataset_id": 14, "dataset_loc": 14, "dataset_nam": 14, "datasetid": 14, "datasourc": 37, "datasource_nam": 37, "datatyp": 14, "datavers": 63, "date": [2, 3, 14, 16, 23, 52, 62, 64, 67], "date_from": 14, "date_ranges_start_d": 14, "date_window_s": 14, "datetim": [8, 11, 13, 64, 67], "day_of_month": [64, 67], "day_of_week": [64, 67], 
"day_offset": [64, 67], "db": [3, 14, 15, 23, 28], "db2": 14, "db2sourc": 14, "db3": 23, "db_name": [11, 43, 48], "db_password": 12, "db_pool_recycl": 3, "db_statement_timeout": 3, "dbf": 23, "dbt": [1, 2], "dbt_asset": 26, "dbt_build_arg": 26, "dbt_build_invoc": 26, "dbt_cli_invoc": 26, "dbt_cli_resourc": 26, "dbt_cloud": 26, "dbt_cloud_account_id": 26, "dbt_cloud_api_token": 26, "dbt_cloud_asset": 26, "dbt_cloud_auth_token": 26, "dbt_cloud_host": 26, "dbt_cloud_job_id": 26, "dbt_cloud_resourc": 26, "dbt_cloud_run_op": 26, "dbt_cloud_sandbox": 26, "dbt_compile_op": 26, "dbt_docs_generate_op": 26, "dbt_event": 26, "dbt_exclud": 26, "dbt_execut": 26, "dbt_ls_op": 26, "dbt_macro_arg": 26, "dbt_output": 26, "dbt_profiles_dir": 26, "dbt_project": 26, "dbt_project_dir": 26, "dbt_resource_kei": 26, "dbt_resource_prop": 26, "dbt_run_invoc": 26, "dbt_run_op": 26, "dbt_seed_op": 26, "dbt_select": 26, "dbt_snapshot_op": 26, "dbt_test_op": 26, "dbt_var": 26, "dbtcliclientresourc": 26, "dbtclieventmessag": 26, "dbtcliinvoc": 26, "dbtclioutput": 26, "dbtcliresourc": 26, "dbtcloudclientresourc": 26, "dbtmanifestassetselect": 26, "dbtoutput": 26, "dbtresourc": 26, "dbtrpcoutput": 26, "dbtypehandl": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "dc_region": 14, "dd": [14, 24], "dd_job": 24, "ddt00": 14, "ddthh": 14, "dead": [16, 50], "deal": 5, "debian": 34, "debug": [8, 11, 14, 21, 23, 26, 34, 40, 61, 69], "debugg": 69, "debugrunpayload": 11, "decid": [16, 50], "decis": 2, "declar": [5, 8, 9, 14, 15, 16, 17, 63], "decor": [1, 2, 4, 8, 9, 10, 11, 12, 13, 24, 26, 45, 51, 60, 61, 62, 63, 64, 65, 66, 67, 68], "decorated_fn": [10, 64], "decreas": [16, 50], "decrement": 24, "dedic": 59, "deduct": 14, "dedup": 14, "dedupl": 14, "deeplink": [42, 52, 69], "def": [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 42, 44, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "default": [1, 2, 
3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 28, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 48, 49, 50, 51, 52, 53, 54, 55, 57, 59, 60, 63, 64, 65, 66, 67, 68, 69], "default_asset_key_fn": 26, "default_auto_materialize_policy_fn": 26, "default_azure_credenti": 17, "default_freshness_policy_fn": 26, "default_group_from_dbt_resource_prop": 26, "default_load_typ": [28, 34, 53], "default_metadata_from_dbt_resource_prop": 26, "default_provid": 4, "default_statu": [42, 52, 64, 67, 69], "default_tag": 23, "default_valu": [4, 6, 63], "defaultazurecredenti": 17, "defaultcor": [16, 50], "defaultoauth20author": 14, "defaultruncoordin": 11, "defaultrunlaunch": 11, "defaultschedulestatu": [64, 67], "defaultsensorstatu": [42, 52, 67, 69], "defin": [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 22, 23, 26, 27, 28, 32, 33, 34, 40, 45, 46, 47, 52, 53, 59, 60, 62, 64, 65, 66, 67, 68], "define_asset_job": [2, 5, 64, 67], "define_dagstermill_asset": 60, "define_dagstermill_op": 60, "define_job": 8, "define_my_job": [8, 13], "define_pipelin": 3, "define_repo": 3, "define_spark_config": 56, "definit": [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 42, 49, 51, 52, 53, 54, 55, 60, 61, 62, 63, 65, 66, 67, 68], "defint": [5, 65], "deflat": 14, "delai": [14, 16, 50, 63], "deleg": [18, 20, 61], "delet": [3, 11, 14, 16, 34, 35, 36, 40, 49, 50, 59, 67], "delete_dynamic_partit": [11, 64], "delete_local_temp": 11, "delete_messag": 14, "delete_run": 11, "delete_unmentioned_resourc": 14, "deletedynamicpartitionsrequest": 67, "delight": 14, "delightedsourc": 14, "delimit": [14, 25], "delin": [64, 67], "deliv": [14, 23], "delivery_timeout_m": 14, "delta": 14, "demand": 23, "deni": 18, "denibertov": 18, "denorm": 14, "dep": [2, 8, 14, 26, 60], "depen": 15, "depend": [2, 6, 8, 13, 14, 15, 16, 26, 34, 50, 51, 52, 60, 64, 66, 68], "depende": 9, "dependency_kei": 2, "dependency_structur": 9, 
"dependencydefinit": [8, 9], "deploi": [16, 23, 34, 40, 50], "deploy": [1, 3, 14, 15, 16, 19, 20, 22, 34, 35, 36, 39, 40, 50, 53, 59], "deploy_local_job_packag": 16, "deploy_local_pipeline_packag": 16, "deploy_mod": 56, "deployment_nam": 15, "deploymod": [16, 50], "deprec": [2, 3, 8, 14, 16, 23, 50, 52, 57, 60, 64, 67, 69], "depth": 2, "deqeueu": 11, "dequeu": 11, "dequeue_interval_second": 11, "dequeue_num_work": 11, "dequeue_use_thread": 11, "deriv": [2, 11, 14, 26], "descend": [8, 9, 11, 13, 14], "describ": [2, 4, 8, 9, 11, 12, 13, 14, 23, 26, 32, 39, 40, 62, 63], "descript": [1, 2, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 26, 33, 45, 46, 47, 50, 51, 56, 59, 60, 61, 63, 64, 65, 66, 67, 68], "descriptions_by_kei": 2, "descriptions_by_output_nam": 2, "descriptor": 63, "deseri": 67, "deserialization_strategi": 14, "deserialization_typ": 14, "design": 2, "desir": [14, 26, 40], "dest_tabl": 32, "dest_tbl": 32, "destin": [21, 23, 33], "destination_configur": 14, "destination_default": 14, "destination_namespac": 14, "destination_path": 14, "destination_t": [14, 33], "destination_typ": 14, "destruct": [16, 50], "detail": [2, 3, 14, 16, 18, 19, 20, 21, 23, 26, 33, 46, 50, 52, 53, 54, 55, 63, 67], "detect": [14, 16, 50], "determin": [2, 3, 6, 8, 9, 12, 13, 14, 16, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 45, 53, 54, 55, 63, 64, 67, 68], "determinist": [2, 63, 68], "dev": [4, 16, 18, 19, 20, 23, 26, 40], "dev_s3": 4, "develop": [11, 14, 34, 38, 53, 60], "developer_token": 14, "devic": 14, "devstorag": 34, "df": [12, 16, 17, 34, 53, 63], "dfoo": 34, "dict": [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 32, 33, 34, 37, 39, 40, 41, 45, 48, 50, 51, 52, 53, 60, 63, 64, 65, 66, 67, 68, 69], "dictionari": [1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 14, 16, 23, 26, 33, 34, 46, 51, 60, 63, 64, 65, 66, 67, 69], "dictionary_encod": 14, "dictionary_page_size_kb": 14, "did": 69, "differ": [2, 4, 5, 8, 12, 13, 14, 16, 18, 19, 20, 26, 48, 50, 51, 63, 64, 67], 
"digit": 14, "dimens": [8, 12, 14, 64, 67], "dimension": 64, "dimension_nam": 64, "dimensionpartitionmap": 64, "dir": [11, 16, 26, 34, 50], "direct": [3, 8, 9, 13, 14, 16, 50], "directli": [2, 3, 4, 5, 8, 9, 11, 12, 13, 15, 16, 17, 24, 26, 34, 35, 36, 43, 48, 50, 51, 53, 60, 61, 62, 63, 64, 65, 66, 67, 68], "directori": [3, 8, 11, 12, 14, 15, 16, 17, 18, 23, 26, 34, 50, 51, 53, 59, 60, 65, 69], "dirnam": 6, "disabl": [3, 11, 14, 16, 18, 19, 20, 23, 26, 27, 40, 50, 53, 59], "disable_schedule_on_trigg": [26, 33], "disable_ssl_verif": 25, "disallow": 11, "discard": 2, "discern": 2, "disconnect": 14, "discount": 14, "discov": 14, "discover_field": 14, "discoveri": 14, "discret": [8, 9, 13, 64], "disk": [11, 14, 16, 23, 34, 50, 63], "diskconfig": 34, "dispatch": [8, 10, 39, 59, 61], "displai": [1, 2, 3, 4, 6, 11, 14, 16, 23, 26, 45, 50, 52, 59, 60, 63, 65, 67, 68], "display_nam": 68, "display_raw_sql": 26, "distcp": 34, "distinguish": [6, 8, 11, 14, 45, 68], "distribut": [16, 17, 20, 22, 24, 34, 40], "divid": [16, 50], "dixa": 14, "dixasourc": 14, "dkr": 40, "dn": 14, "do": [2, 4, 5, 7, 11, 12, 14, 16, 21, 26, 28, 29, 30, 31, 34, 35, 36, 50, 53, 54, 55, 59, 60, 62, 69], "do_it_al": [8, 9], "do_some_transform": 2, "do_some_work": 51, "do_someth": [10, 12], "do_something_on_failur": 10, "do_something_on_success": 10, "do_stuff": [12, 23], "do_stuff_partit": [64, 67], "do_stuff_partitioned_schedul": [64, 67], "doc": [8, 11, 14, 16, 18, 19, 20, 23, 24, 26, 32, 33, 34, 37, 40, 42, 52, 56], "docker": [14, 18, 20, 40], "docker_container_op": 27, "docker_executor": 27, "docker_image_tag": 41, "docker_job": 27, "docker_password": 19, "docker_usernam": 14, "dockerhub": 14, "dockerhubsourc": 14, "dockerrunlaunch": 27, "docs_url": 26, "docstr": 63, "document": [3, 4, 11, 14, 16, 17, 23, 24, 26, 32, 33, 34, 50, 52, 53, 54, 55, 56, 59, 69], "doe": [1, 2, 7, 8, 9, 11, 13, 14, 16, 23, 26, 27, 33, 39, 40, 45, 50, 53, 60, 63, 64, 66, 67, 68], "doesn": [3, 14, 53], "dog": 8, "dogstatsd": 
24, "domain": 14, "domain_id": 14, "domain_nam": 14, "domain_region": 14, "domain_url": 14, "don": [1, 14, 67], "done": [8, 11, 43, 46, 48, 68], "dot": [2, 26, 63, 66], "doubl": [2, 16, 50], "double_quot": 14, "down": [3, 8, 9, 13, 14, 16, 18, 23, 24, 39, 50, 59, 66], "download": [3, 11, 14, 17, 53], "downstream": [1, 2, 6, 8, 9, 12, 13, 14, 26, 28, 29, 30, 31, 33, 34, 35, 36, 53, 54, 55, 60, 63, 64, 67], "downstream_asset": 2, "downstream_mappings_by_upstream_dimens": 64, "downstream_partition_keys_by_upstream_partition_kei": 64, "downstream_partitions_def": 64, "downstream_partitions_subset": 64, "downtim": 67, "draw": [20, 40], "drift": 14, "driftsourc": 14, "drive": [14, 34], "driver": [14, 16, 23, 34, 50], "driver_node_type_id": 23, "driverloglevel": 34, "drop": [16, 50, 53], "drop_databas": 53, "dry": 40, "dspark": 23, "duckdb_io_manag": 28, "duckdb_pandas_io_manag": 29, "duckdb_polars_io_manag": 30, "duckdb_pyspark_io_manag": 31, "duckdbiomanag": [28, 29, 30, 31], "duckdbpandasiomanag": 29, "duckdbpandastypehandl": [28, 29], "duckdbpolarsiomanag": 30, "duckdbpolarstypehandl": 30, "duckdbpysparkiomanag": 31, "duckdbpysparktypehandl": 31, "duckdbresourc": 28, "due": [14, 16, 50], "dump": [1, 2, 8, 9, 13, 16, 26, 33, 50, 51, 60, 63], "dump_profil": [16, 50], "dunderfil": 69, "duplic": [14, 67], "durabl": 14, "durat": [8, 14, 16, 22, 50], "dure": [2, 7, 8, 10, 11, 14, 16, 39, 50, 51, 53, 60, 61, 62, 63, 66, 67, 68], "dv": 14, "dv360sourc": 14, "dynam": [2, 8, 9, 13, 14, 16, 23, 50, 51, 53, 60, 64, 65, 67], "dynamic_partitioned_config": 64, "dynamic_partitions_request": [64, 67], "dynamic_partitions_stor": [8, 13, 64], "dynamicalloc": [16, 50], "dynamicout": 6, "dynamicoutput": [6, 8], "dynamicpartitionsdefinit": [8, 11, 13, 64], "dynamicpartitionsstor": [8, 13, 64], "dynamodb": 14, "dynamodb_endpoint": 14, "dynamodb_region": 14, "dynamodb_table_name_prefix": 14, "dynamodbdestin": 14, "e": [1, 2, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 28, 34, 39, 40, 
45, 46, 50, 52, 53, 59, 61, 63, 64, 65, 67, 68], "e2": 14, "e2etestsourc": 14, "each": [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 26, 27, 28, 33, 34, 40, 46, 50, 53, 60, 61, 62, 63, 64, 65, 66, 67], "eager": [2, 11, 26], "earlier": 67, "earliest": [14, 67], "easi": [4, 8, 11, 13, 26, 59], "easier": 16, "easiest": [11, 12], "easili": [12, 14, 47], "east": [14, 16], "east1": 34, "eb": 23, "ebs_volume_count": 23, "ebs_volume_iop": 23, "ebs_volume_s": 23, "ebs_volume_throughput": 23, "ebs_volume_typ": 23, "ebsvolumetyp": 23, "echo": [27, 40, 51, 63], "echo_2": 63, "echo_data": 51, "echo_graph": 51, "echo_op": 51, "ecr": 40, "ecsrunlaunch": 16, "edg": 9, "edit": [2, 8, 9, 13, 14], "effect": [2, 11, 14, 16, 40, 50, 63], "effici": [5, 16, 50, 65], "eg": [3, 14, 16, 26, 50], "egg": [16, 34, 50], "either": [2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 23, 39, 40, 45, 49, 50, 51, 52, 64, 66, 67, 68, 69], "ek": 40, "elaps": [14, 52, 67], "elasticsearch": 14, "elasticsearchdestin": 14, "elasticsearchsourc": 14, "element": [4, 7, 12, 16, 40, 50, 51], "elimin": 3, "els": [14, 26, 34], "email": [14, 40, 69], "email_body_fn": 69, "email_from": 69, "email_on_job_failur": 69, "email_on_run_failur": 69, "email_password": 69, "email_subject_fn": 69, "email_to": 69, "embed": 69, "emit": [2, 8, 11, 14, 23, 26], "emit_f": 9, "emit_metadata": 63, "empti": [3, 14, 16, 17, 20, 34, 40], "emr_pyspark_step_launch": 16, "emr_stag": 16, "emrclusterst": 16, "emrerror": 16, "emrjobrunn": 16, "emrstepst": 16, "en": [4, 14, 18, 19, 20, 23, 27, 37, 42], "enabl": [3, 8, 9, 11, 13, 14, 16, 18, 19, 20, 23, 26, 27, 34, 50, 53, 62, 63], "enable_auto_commit": 14, "enable_elastic_disk": 23, "enable_encrypt": 23, "enable_experimental_stream": 14, "enable_idempot": 14, "enablecompress": [16, 50], "enablekerbero": 34, "encapsul": [2, 34, 40, 63], "encod": [1, 2, 3, 8, 9, 13, 14, 23, 33, 34, 35, 36, 51, 53, 54, 55, 60, 63, 69], "encount": 23, "encrypt": [14, 23, 34, 53, 54, 55], 
"encryptedtrustservercertif": 14, "encryptedverifycertif": 14, "encryption_algorithm": 14, "encryption_typ": 23, "encryptionconfig": 34, "end": [4, 8, 9, 14, 18, 26, 38, 41, 59, 63, 64, 67], "end_dat": [14, 64], "end_mlflow_on_run_finish": 41, "end_offset": [2, 8, 64, 67], "end_tim": [11, 14], "endpoint": [14, 16, 23, 50], "endpoint_url": 16, "enforc": [7, 14, 16, 45, 50], "enforce_ord": 45, "engag": 14, "engin": [8, 11, 14, 34, 53], "engine_ev": 8, "engineev": 11, "enough": [1, 3, 16, 50], "enqueu": 11, "enrich": 14, "ensur": [8, 11, 13, 14, 15, 23, 24, 40, 51, 53, 54, 55, 64, 67], "entail": 18, "enter": 14, "enterpris": 38, "entir": [14, 16, 23, 50, 64], "entireti": [16, 50], "entiti": [16, 50, 59], "entri": [2, 3, 4, 11, 12, 14, 16, 26, 28, 29, 30, 31, 34, 35, 36, 50, 53, 54, 55, 60, 63], "entry_data": 63, "entry_point": 59, "entrypoint": [8, 27], "enum": [4, 7, 16, 39, 51, 64], "enum_valu": 4, "enumer": [14, 16], "enumvalu": 4, "env": [3, 4, 14, 17, 19, 21, 26, 33, 34, 35, 36, 40, 41, 51, 53, 54, 55, 59], "env_config_map": [20, 40], "env_secret": [20, 40], "env_to_tag": 41, "env_var": [16, 19, 20, 27, 40], "env_vari": 23, "envfrom": [20, 40], "environ": [3, 4, 8, 11, 12, 13, 14, 15, 16, 19, 20, 23, 27, 34, 38, 39, 40, 41, 43, 48, 50, 51, 65, 67, 69], "environemnt": 27, "environment": 23, "environment_var": 67, "envvar": [12, 14, 16, 23, 32, 34, 35, 36, 42, 52, 53, 54, 55], "ephemer": [3, 8, 9, 11, 13, 15, 16, 18, 34, 37, 66, 67], "ephemeral_storag": 16, "equal": [2, 14, 16, 23, 50, 64, 67], "equival": [2, 4, 5, 9, 34, 65], "error": [1, 2, 3, 4, 8, 11, 12, 14, 16, 21, 23, 33, 39, 42, 44, 46, 50, 52, 53, 60, 61, 63, 64, 66, 67, 69], "error_cl": 11, "error_info": 11, "error_object": 39, "error_strings_by_step_kei": 67, "error_toler": 45, "es": 2, "escape_char": 14, "especi": [16, 18, 19, 20, 50], "essenti": [14, 16, 50, 64], "establish": [14, 68], "estim": [16, 50], "etc": [2, 8, 14, 16, 20, 26, 34, 40, 50, 69], "eu": [14, 34], "eur": 14, "evalu": [2, 8, 13, 52, 
65, 67], "evaluation_fn": 67, "even": [2, 14, 16, 26, 33, 50, 53, 63, 66, 67], "event": [2, 3, 7, 8, 10, 12, 14, 16, 20, 24, 26, 40, 42, 43, 44, 48, 50, 52, 60, 67, 69], "event_act": 44, "event_list": 8, "event_log": [11, 43, 48], "event_log_entri": 11, "event_log_storag": [11, 43, 48], "event_records_filt": 11, "event_specific_data": [8, 67], "event_storag": 11, "event_storage_data": 11, "event_typ": [8, 11], "event_type_valu": 8, "eventlog": [16, 50], "eventlogentri": [11, 67], "eventlogrecord": [11, 67], "eventlogstorag": 11, "eventqueu": [16, 50], "eventrecordsfilt": 11, "eventu": [11, 63], "eventv2_cr": 44, "everi": [16, 18, 19, 20, 23, 50, 53, 57, 64, 67], "everyth": 3, "evict": [16, 50], "ex": [20, 40], "exact": [14, 16, 45, 50], "exactli": [2, 4, 14, 23], "examin": 69, "exampl": [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "example_adls2_op": 17, "example_job": 16, "example_mapping_kei": 6, "example_prometheus_op": 49, "example_redshift_asset": 16, "example_redshift_op": 16, "example_s3_op": 16, "example_secretsmanager_op": 16, "example_secretsmanager_secrets_op": 16, "example_secretsmanager_secrets_op_2": 16, "exampleenum": 4, "exc_info": 7, "exceed": [2, 16, 50], "except": [1, 2, 4, 7, 8, 9, 10, 13, 14, 16, 23, 26, 34, 39, 40, 50, 51, 53, 59, 61, 63], "excess": [16, 50], "exchang": 14, "exchangeratesapi": 14, "exchangeratessourc": 14, "excit": 11, "exclud": [2, 14, 16, 26, 33, 50, 64], "exclude_environment_credenti": 17, "execut": [1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 28, 33, 34, 39, 40, 50, 51, 53, 59, 60, 62, 64, 66, 67, 68, 69], "execute_docker_contain": 27, "execute_in_process": [4, 7, 8, 9, 13, 15, 16, 24, 38, 41, 42, 44, 52, 53, 67], "execute_job": 8, "execute_k8s_job": 40, "execute_plan": 18, "execute_queri": [16, 53], 
"execute_shell_command": 51, "execute_shell_script": 51, "executeinprocessresult": [8, 9, 13], "executejobresult": 8, "execution_d": 15, "execution_data": [11, 68], "execution_fn": 67, "execution_plan": 11, "execution_plan_snapshot_id": 11, "execution_timezon": [26, 67], "executionplan": 11, "executor": [4, 5, 7, 13, 16, 17, 18, 19, 20, 22, 23, 27, 34, 40, 50, 63, 66], "executor_config": 11, "executor_creation_fn": 11, "executor_def": [2, 8, 9, 11, 13, 18, 19, 20, 22, 27, 40], "executor_id": [16, 50], "executorallocationratio": [16, 50], "executordefinit": [2, 4, 5, 8, 9, 11, 13, 18, 19, 20, 22, 27, 40], "executoridletimeout": [16, 50], "executorrequir": 11, "exhaust": 14, "exist": [3, 7, 8, 9, 11, 13, 14, 16, 20, 22, 23, 26, 34, 39, 45, 50, 53, 62, 64, 66, 67, 68], "exit": [3, 11, 16, 26, 34, 50, 51, 66], "expand": 14, "expand_issue_changelog": 14, "expect": [1, 2, 8, 11, 14, 16, 18, 19, 20, 26, 33, 34, 39, 45, 50, 51, 60, 63, 64], "expectationresult": [8, 60, 63], "expens": [11, 14, 16, 50], "expensive_job": 65, "expensive_schedul": 65, "experi": [14, 40, 41, 59], "experienc": 67, "experiment": [2, 14, 15, 50, 60, 63, 66, 67, 68], "experiment_nam": 41, "experimentalwarn": 69, "expir": 14, "explicit": [64, 68], "explicitli": [5, 8, 9, 11, 12, 13, 14, 63, 64, 68, 69], "explictli": 5, "explor": [14, 23, 60], "exponenti": [14, 63], "export": [3, 14, 23, 40, 67], "expos": [4, 14, 18, 19, 20, 21, 26, 33, 40, 61], "express": [14, 16, 45, 50, 63], "ext": 11, "ext_id": 14, "extend": [9, 12, 16, 20, 40, 50, 64, 67], "extens": 11, "extern": [2, 11, 16, 34, 40, 42, 50, 52, 59, 66, 67, 68, 69], "external_job_origin": 11, "external_version_fn": 68, "externaliomanag": 12, "externaltablevias3": 14, "extra": [4, 16, 23, 25, 26, 41, 50], "extra_head": 25, "extra_tag": 41, "extraclasspath": [16, 50], "extract": [14, 34, 46], "extrajavaopt": [16, 23, 50], "extralibrarypath": [16, 50], "extralisten": [16, 50], "extras_requir": 11, "extrem": 14, "f": [3, 4, 15, 16, 25, 34, 38, 42, 51, 
52, 65, 66, 69], "face": [16, 50], "facebook": 14, "facebookmarketingsourc": 14, "facebookpagessourc": 14, "facil": 68, "facilit": 40, "facility_id": 14, "factori": [13, 23, 32, 51, 53], "fail": [1, 2, 3, 7, 10, 11, 14, 16, 20, 21, 23, 26, 33, 34, 39, 40, 42, 45, 46, 50, 52, 60, 63, 66, 67, 68, 69], "fail_pod_on_run_failur": [20, 40], "failed_run_id": 8, "failur": [6, 8, 10, 11, 13, 14, 16, 23, 26, 39, 42, 50, 51, 52, 60, 63, 67, 69], "failure_ev": [42, 52, 67, 69], "failure_hook": 10, "failure_sampl": 46, "failure_typ": 39, "fair": [16, 50], "fake": 14, "fake_redshift_resourc": 16, "fakeadls2resourc": 17, "fakepassword": 14, "faker": 14, "fakeredshiftclientresourc": 16, "fakersourc": 14, "fall": [20, 26, 34, 40, 64, 67], "fallback": [11, 52], "fals": [2, 4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 21, 23, 25, 26, 33, 34, 39, 40, 42, 45, 50, 52, 53, 54, 55, 57, 60, 62, 63, 64, 67, 68, 69], "fan": [9, 23], "fan_in_index": 9, "far": 14, "fast": [16, 18, 19, 20, 50], "faster": [14, 16, 50], "fatal": [26, 34, 61], "fauna": 14, "faunasourc": 14, "favor": [26, 52, 62, 67, 69], "fd": 11, "fe": 14, "feasibl": 14, "featur": [11, 12, 14, 16, 18, 23, 34, 50], "feature_nam": 60, "feedback": 40, "femal": 24, "fetch": [8, 11, 13, 14, 16, 23, 33, 50, 53, 64, 67], "fetch_files_from_slack": 2, "fetch_result": [16, 53], "fetch_secret": 16, "fetch_thumbnail_imag": 14, "fetchfailur": [16, 50], "fetchtimeout": [16, 50], "few": [5, 14, 16, 50], "fewer": [14, 16, 50], "fh_1": 11, "fh_2": 11, "fi": 34, "field": [2, 4, 6, 7, 11, 14, 15, 16, 20, 23, 26, 40, 43, 48, 50, 51, 59, 63, 65, 67], "field_alias": 4, "field_util": 4, "fifo": 14, "file": [2, 3, 4, 6, 8, 12, 14, 15, 18, 20, 26, 32, 35, 36, 40, 43, 48, 50, 51, 53, 57, 59, 60, 63, 65, 69], "file_handl": 11, "file_manag": [11, 68], "file_nam": 65, "file_name_pattern": 14, "file_obj": 11, "file_pattern": 14, "file_relative_path": [6, 51, 69], "file_result": 6, "file_system": 17, "file_typ": 14, "filehandl": [11, 68], "filemanag": [11, 16, 17, 34], 
"filenam": [6, 16], "filenotfounderror": 69, "fileoutputcommitt": [16, 50], "filepath": [12, 16, 17, 34, 63], "files_in_directori": 6, "files_pipelin": 11, "filesecuresourc": 14, "filesourc": 14, "filesystem": [2, 8, 9, 11, 12, 13, 16, 17, 34, 50, 63], "filesystemiomanag": 12, "filetyp": 14, "fileuri": 34, "fileystem": 16, "fill": [8, 14], "filter": [3, 11, 14, 16, 33, 45, 50, 59, 68], "filter1": [16, 50], "filtersalesforceobjectsentri": 14, "final": [12, 14, 16, 17, 26, 34, 35, 36, 50, 52, 53], "final_foobar_st": [14, 33], "financ": 14, "find": [11, 14, 15, 16, 18, 26, 38, 40, 44, 50, 67], "fine": [38, 65], "finer": 64, "finish": [11, 16, 23, 41, 50, 52], "fire": [2, 52, 64, 67], "firebolt": 14, "fireboltdestin": 14, "fireboltsourc": 14, "firestor": 14, "firestoredestin": 14, "firewal": [16, 50], "first": [2, 3, 7, 8, 14, 16, 23, 24, 25, 38, 40, 44, 45, 46, 50, 51, 52, 53, 54, 55, 59, 63, 64, 65, 67, 68], "first_asset": 8, "first_on_demand": 23, "first_op": [27, 40], "fit": [16, 50], "fivetran_api_kei": 33, "fivetran_api_secret": 33, "fivetran_asset": 33, "fivetran_inst": 33, "fivetran_resourc": 33, "fivetran_sync_op": 33, "fivetranconnectormetadata": 33, "fivetranoutput": 33, "fivetranresourc": 33, "fix": [3, 14, 16, 50], "fix_m": 14, "fixed_server_id": 3, "fixtur": 40, "flag": [2, 3, 11, 14, 18, 26, 34, 40, 45, 53, 67], "flake": 63, "flakey_oper": 63, "flat_asset_kei": 63, "flatten": 14, "flavor": 34, "flexibl": [26, 63], "flexport": 14, "flexportsourc": 14, "float": [1, 2, 4, 6, 7, 11, 14, 16, 20, 21, 23, 25, 26, 33, 34, 35, 36, 42, 45, 50, 52, 63, 67, 68], "floatmetadatavalu": 63, "flow": [8, 9, 14, 16, 45, 63, 68], "flower": [18, 40], "flush": [14, 16, 50], "fmt": [64, 67], "fn": 66, "folder": [3, 14, 26], "folder_path": 14, "follow": [1, 2, 4, 6, 8, 9, 11, 12, 14, 16, 17, 18, 19, 20, 22, 27, 34, 38, 40, 48, 50, 59, 63, 64, 66, 67], "foo": [4, 8, 10, 11, 12, 13, 16, 23, 24, 26, 33, 34, 42, 50, 52, 63, 65, 66, 68], "foo_and_downstream_select": 26, "foo_job": 
[8, 13], "foo_job_arg": 13, "foo_job_kwarg": 13, "foo_resourc": 66, "foo_select": 26, "foobar": [4, 14, 21, 33], "footprint": [14, 16, 50], "forc": [16, 23, 50], "force_full_sync": 21, "fork": [16, 50, 69], "forked_pdb": [8, 69], "forkedpdb": [8, 69], "form": [9, 14, 16, 20, 23, 27, 40, 46, 49, 50, 65], "form_id": 14, "format": [2, 4, 8, 9, 10, 13, 14, 16, 26, 34, 39, 42, 45, 50, 52, 53, 59, 64, 65, 67, 68], "format_typ": 14, "forward": [7, 14, 16, 19, 40, 50], "forward_log": 14, "found": [7, 8, 11, 14, 16, 18, 23, 26, 38, 39, 40, 50, 58, 62, 63], "foundat": 63, "four": [16, 18, 50], "fqn": 26, "fraction": [16, 50], "fragment": [8, 11], "framework": [1, 2, 7, 8, 9, 11, 32, 33, 34, 39, 51, 60, 63], "free": [14, 16, 50], "freeli": 7, "frequenc": [14, 16, 50], "frequent": [14, 16, 18, 19, 20, 23, 50], "fresh": [2, 3, 14, 26, 52, 67], "fresh_asset": 2, "freshcal": 14, "freshcallersourc": 14, "freshdesk": 14, "freshdesksourc": 14, "freshness_polici": [2, 14, 26, 67], "freshness_policies_by_kei": 2, "freshness_policies_by_output_nam": 2, "freshness_policy_sensor": 67, "freshness_policy_sensor_fn": 67, "freshness_policy_sensor_to_invok": 67, "freshnesspolici": [2, 14, 26, 52, 67], "freshnesspolicysensorcontext": [52, 67], "freshnesspolicysensordefinit": 67, "freshsal": 14, "freshsalessourc": 14, "freshservic": 14, "freshservicesourc": 14, "friend": 11, "from": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69], "from_asset_kei": 67, "from_def": 66, "from_dynamic_map": 9, "from_failur": 8, "from_graph": 2, "from_name_type_dict": 63, "from_op": 2, "from_panda": 53, "from_val": 66, "front": [16, 50], "frozenset": 56, "fruit": 64, "fs": 17, "fs_io_manag": 12, "fsspec": 12, "full": [14, 16, 18, 21, 23, 26, 27, 32, 34, 50, 59, 63, 64], "full_control": 34, "full_job": [27, 40], "full_refresh": 
[14, 26, 32], "full_refresh_append": 14, "full_refresh_overwrit": 14, "fulli": [11, 16, 50], "fulltrac": 40, "function": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 21, 23, 26, 27, 33, 34, 40, 42, 45, 46, 47, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "further": [16, 23, 50, 67], "futur": [2, 11, 26, 63, 65, 66, 67], "g": [1, 2, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 28, 34, 39, 40, 45, 50, 52, 53, 59, 61, 63, 64, 65, 67, 68], "ga4": 14, "gain": 11, "garbag": [12, 16, 50], "gatewai": [14, 49], "gather": [6, 8, 13], "gaug": 24, "gave": 5, "gb": 34, "gc": [14, 16, 23, 35, 36, 50], "gceclusterconfig": 34, "gcepdkmskeynam": 34, "gcloud": 40, "gcp": [14, 40], "gcp_credenti": [34, 35, 36], "gcp_project": [34, 35, 36], "gcs_bucket": 34, "gcs_bucket_nam": 14, "gcs_bucket_path": 14, "gcs_bucket_region": 14, "gcs_file_manag": 34, "gcs_kei": 34, "gcs_pickle_io_manag": 34, "gcs_prefix": 34, "gcs_resourc": 34, "gcscomputelogmanag": 34, "gcsdestin": 14, "gcsfilehandl": 34, "gcsfilemanagerresourc": 34, "gcsgooglecloudstorag": 14, "gcspickleiomanag": 34, "gcsresourc": 34, "gcsstage": 14, "ge_data_context": 37, "ge_validation_op_factori": 37, "gen": 17, "gen2": 17, "gender": 24, "gener": [2, 3, 7, 8, 9, 11, 12, 15, 16, 17, 20, 21, 23, 26, 33, 34, 37, 39, 40, 45, 46, 50, 51, 53, 63, 64, 67, 68], "generate_materi": 26, "get": [3, 4, 5, 8, 11, 12, 14, 15, 16, 18, 20, 23, 24, 26, 34, 39, 40, 50, 53, 60, 65, 67, 69], "get_all_job": 65, "get_all_schedul": 65, "get_all_sensor": 65, "get_artifact": 26, "get_asset_identifi": 12, "get_asset_kei": [11, 26], "get_asset_key_for_model": 26, "get_asset_key_for_sourc": 26, "get_asset_keys_by_output_name_for_sourc": 26, "get_asset_proven": 8, "get_asset_record": 11, "get_asset_value_load": [2, 5, 65], "get_assets_defs_by_kei": 65, "get_auto_materialize_polici": 26, "get_batch": 37, "get_client": [16, 26, 34, 38, 42, 52], "get_connect": [28, 53], "get_context": 60, "get_cron_schedul": 64, "get_cursor_partit": 67, 
"get_dagster_ev": 11, "get_dagster_logg": 69, "get_dependencies_and_map": 9, "get_descript": 26, "get_downstream_partition_kei": 67, "get_downstream_partitions_for_partit": 64, "get_dynamic_partit": 11, "get_event_record": 11, "get_freshness_polici": 26, "get_group_nam": 26, "get_identifi": 12, "get_job": 65, "get_job_def": 5, "get_job_failure_ev": 67, "get_job_nam": 65, "get_job_success_ev": 67, "get_latest_materialization_code_vers": 11, "get_latest_materialization_ev": 11, "get_mapping_kei": 8, "get_metadata": [12, 26], "get_node_depend": 9, "get_on": 53, "get_op_vers": 62, "get_partition_kei": 64, "get_partition_map": 2, "get_query_statu": 53, "get_repo_id": 38, "get_resource_vers": 62, "get_run": 23, "get_run_by_id": 11, "get_run_record": 11, "get_run_record_by_id": 11, "get_run_statu": 39, "get_schedul": 65, "get_schedule_def": [5, 65], "get_schedule_nam": 65, "get_secret_valu": 16, "get_sensor": 65, "get_sensor_def": [5, 65], "get_sensor_nam": 65, "get_source_assets_by_kei": 65, "get_status_by_partit": 11, "get_step_failure_ev": 67, "get_system_temp_directori": [16, 17, 34], "get_tag": 8, "get_trailing_unconsumed_ev": 67, "get_upstream_mapped_partitions_result_for_partit": 64, "get_upstream_partitions_for_partit": 64, "get_v2_aggs_grouped_locale_us_market_stocks__d": 14, "getdbt": 26, "getenv": [16, 18, 19, 20, 38, 42, 52, 69], "giant": [16, 50], "gib": [16, 23], "github": [14, 59], "github_app_id": 38, "github_app_private_rsa_kei": 38, "github_hostnam": 38, "github_installation_id": 38, "github_job": 38, "github_op": 38, "github_private_kei": 38, "github_resourc": 38, "githubresourc": 38, "githubsourc": 14, "gitlab": 14, "gitlabsourc": 14, "give": [8, 16, 17, 23, 39, 49, 50, 53], "given": [2, 8, 11, 12, 13, 14, 15, 16, 21, 23, 26, 33, 39, 42, 45, 46, 50, 52, 62, 63, 64, 65, 66, 67, 68], "gke": 40, "glassfrog": 14, "glassfrogsourc": 14, "glob": [16, 50, 69], "global": [11, 14, 26, 27, 34, 40, 66], "global_config_flag": 26, "globstar": 14, "gm": 25, "gmail": 
69, "go": [2, 14, 16, 26, 50, 52, 63], "gocardless": 14, "gocardless_environ": 14, "gocardless_vers": 14, "gocardlesssourc": 14, "goe": [16, 50], "good": [4, 14, 16, 38, 45, 50], "goodby": [27, 40], "googl": [14, 34], "google_application_credenti": [34, 35, 36], "google_auth_credenti": [34, 35, 36], "googleadssourc": 14, "googleanalyticsdataapisourc": 14, "googleanalyticsv4sourc": 14, "googleapi": 34, "googlecloudstoragestag": 14, "googlecredenti": 14, "googledirectorysourc": 14, "googlesearchconsolesourc": 14, "googlesheetsdestin": 14, "googlesheetssourc": 14, "googleworkspaceadminreportssourc": 14, "govern": 14, "gp3": 23, "gql": 39, "grab": 11, "gracefulli": [16, 50], "grain": [16, 50, 65], "grandchild": 63, "grandchildren": 2, "grandpar": 2, "grant": [14, 34, 38], "granular": 14, "graph": [2, 4, 6, 7, 13, 14, 16, 33, 50, 51, 53], "graph_a": 9, "graph_asset": 2, "graph_def": [2, 8, 9, 13], "graph_input_descript": 9, "graph_input_nam": 9, "graph_multi_asset": 2, "graph_output_descript": 9, "graph_output_nam": 9, "graphdefinit": [2, 4, 7, 8, 9, 13, 42, 52, 61, 63, 67, 69], "graphin": 9, "graphout": 9, "graphql": [11, 15, 40, 42, 43, 48, 52, 67, 69], "graphx": [16, 50], "great_expect": 37, "greater": [11, 14, 16, 23, 50], "greatexpect": 37, "greenhous": 14, "greenhousesourc": 14, "greeting_op": 4, "greetingconfig": 4, "group": [2, 5, 14, 16, 26, 33, 34, 40, 60, 65], "group_from_dbt_resource_props_fallback_to_directori": 26, "group_id": 14, "group_nam": [2, 14, 33, 60], "group_names_by_kei": 2, "group_names_by_output_nam": 2, "group_str": 2, "groupid": [16, 50], "grow": [16, 50], "grpc": 11, "grpc_host": 3, "grpc_port": 3, "grpc_socket": 3, "gs": 34, "gsc": 14, "gserviceaccount": 34, "guarante": [11, 16, 23, 50], "guess": 14, "guest": [18, 19, 20], "guid": [14, 15, 16, 22, 25, 28, 29, 30, 31, 34, 35, 36, 38, 40, 45, 46, 50, 53, 54, 55, 60], "gutenberg": 14, "gutendex": 14, "gutendexsourc": 14, "gz": 34, "gzip": 14, "h": [3, 14, 34, 40], "ha": [1, 2, 4, 7, 8, 11, 12, 
13, 14, 16, 17, 23, 26, 28, 34, 39, 42, 45, 50, 53, 62, 63, 64, 65, 66, 67, 68, 69], "had": [14, 16, 34, 35, 36, 50, 53], "hadoop": [16, 34, 50], "hadoopjob": 34, "halt": 2, "hand": [14, 16, 50, 52], "handi": 47, "handl": [2, 12, 16, 33, 50, 51, 60, 61, 63, 64], "handle_output": [12, 62], "handled_output": [8, 12], "handler": [16, 28, 29, 30, 31, 34, 47, 53, 61], "hang": 53, "happen": [7, 16, 50, 59, 67], "happi": 11, "hard": [11, 14, 16, 34, 50, 61], "hardcod": [12, 66], "hardcoded_io_manag": 12, "hardcoded_resourc": 66, "harvest": 14, "harvestsourc": 14, "has_asset_checks_def": 8, "has_asset_kei": [11, 12], "has_asset_partit": 12, "has_assets_def": 8, "has_dynamic_partit": 11, "has_error": 63, "has_input_nam": 12, "has_job": 65, "has_output": 62, "has_partition_kei": [8, 12], "has_repository_load_data": 11, "has_schedul": 65, "has_schedule_def": 65, "has_sensor": 65, "has_sensor_def": 65, "has_specified_executor": [8, 13], "has_specified_logg": [8, 13], "has_tag": 8, "has_unique_nam": 68, "hash": [14, 62], "hash_cod": 14, "have": [1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 16, 23, 26, 27, 33, 34, 40, 45, 47, 50, 52, 53, 54, 55, 59, 62, 63, 64, 65, 66, 67, 68, 69], "haw": 4, "hcf": 34, "hdf": [16, 34, 50], "hdfs_user_guid": 34, "header": 39, "heap": [16, 50], "heartbeat": [16, 50], "heartbeat_timeout": 3, "heartbeatinterv": [16, 50], "hei": 52, "hello": [4, 8, 27, 40, 42, 51, 63, 66], "hello_op": [4, 69], "hello_world": [8, 51, 61, 63], "hello_world_with_default": 4, "hellobaton": 14, "hellobatonsourc": 14, "helloconfig": 4, "help": [2, 3, 11, 12, 14, 16, 18, 23, 26, 32, 50, 65, 66], "helper": [12, 66], "here": [5, 8, 9, 11, 13, 14, 16, 18, 19, 20, 23, 24, 26, 32, 34, 38, 40, 44, 50, 52, 63], "heterogen": 5, "heurist": 15, "hh": 14, "hierarch": 63, "high": [14, 16, 50], "higher": [8, 16, 50], "highest": 14, "highlight": 63, "highlycompressedmapstatu": [16, 50], "hint": [12, 68], "hire": 14, "histogram": 24, "histor": [3, 7, 14], "histori": [3, 11, 14, 16, 50, 64], 
"hit": [14, 16, 50], "hive": 34, "hivejob": 34, "hmac": 14, "hmac_key_access_id": 14, "hmac_key_secret": 14, "hmackei": 14, "hoc": 15, "hold": [4, 14, 26, 63], "home": [19, 26, 34], "honor": 53, "honua": 4, "hood": 4, "hook": [2, 8, 9, 11, 13, 41, 42, 52, 63], "hook_def": [8, 9, 10, 13, 42, 52, 63], "hook_fn": 10, "hook_to_invok": 10, "hook_url": 42, "hookcontext": [10, 42, 52], "hookdefinit": [9, 10, 41, 42, 52], "hope": [18, 19, 20], "host": [3, 14, 16, 23, 24, 26, 32, 39, 50, 57, 59], "host1": 14, "host2": 14, "hostnam": [3, 11, 14, 16, 18, 26, 34, 38, 39, 43, 48, 50, 69], "hostnameincertif": 14, "hour": [2, 14, 34, 59, 64, 67], "hour_of_dai": [64, 67], "hour_offset": [64, 67], "hourli": [16, 50, 64, 67], "hourly_partitioned_config": [64, 67], "hourlypartitionsdefinit": 64, "hous": [16, 50], "how": [1, 2, 4, 8, 9, 11, 12, 13, 14, 16, 18, 23, 24, 26, 28, 33, 34, 40, 42, 49, 50, 52, 53, 59, 64, 67], "howev": [4, 16, 39, 50, 51, 65], "hr": 14, "html": [14, 16, 18, 19, 20, 23, 27, 34, 37, 50, 56], "http": [2, 8, 14, 16, 18, 19, 20, 23, 24, 26, 27, 32, 34, 37, 38, 39, 40, 41, 42, 49, 50, 52, 53, 56, 58, 59, 63, 64, 67, 69], "http_proxi": 42, "https_proxi": 42, "httpspublicweb": 14, "hub": 14, "hubplann": 14, "hubplannersourc": 14, "hubspot": 14, "hubspotsourc": 14, "human": [2, 4, 8, 9, 12, 16, 47, 51, 61, 63, 65, 66, 67], "hunter": 14, "hunter42": 32, "hydrat": [11, 16], "hyperparamet": 59, "hyphen": 34, "i": [2, 4, 8, 12, 14, 16, 18, 23, 26, 28, 29, 30, 31, 35, 36, 46, 50, 54, 55, 61, 63, 65, 69], "iam": [14, 34], "iamrol": 14, "iamus": 14, "iana": [2, 14, 26, 64, 67], "id": [3, 8, 10, 11, 12, 14, 15, 16, 17, 21, 23, 26, 32, 33, 34, 38, 39, 40, 41, 42, 50, 52, 59, 61, 63, 66, 67, 69], "idea": [16, 38, 50], "idefinitionconfigschema": 63, "idempot": [11, 14, 18, 19, 20, 23], "idempotency_token": 23, "ident": [14, 63, 64, 66, 68], "identif": 14, "identifi": [1, 2, 6, 8, 9, 12, 13, 14, 23, 45, 59, 63, 64, 66, 67, 68], "identitypartitionmap": 64, "idl": [14, 16, 50], 
"ie": [2, 4, 6, 8, 15, 24, 60], "ietf": 34, "ifnotpres": 40, "ignor": [4, 8, 11, 12, 13, 14, 15, 16, 23, 34, 50, 67], "ignore_handled_error": 26, "ignore_weekend": 14, "ijob": 11, "illeg": [16, 50], "imag": [3, 19, 20, 27, 34, 40], "image_nam": [19, 20], "image_pull_polici": [20, 40], "image_pull_secret": [20, 40], "image_vers": 34, "imagepullpolici": 40, "imagepullsecret": 40, "imageuri": 34, "imagevers": 34, "imagin": 2, "immedi": [11, 16, 50, 62, 63], "immun": [16, 50], "immut": [8, 9], "impact": [14, 16, 50], "imperi": 14, "implement": [8, 9, 11, 12, 13, 14, 16, 17, 18, 20, 21, 26, 27, 33, 34, 40, 45, 50, 62, 63, 64, 67], "implementor": 11, "import": [1, 2, 3, 4, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 40, 41, 42, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 66, 67, 69], "import_df_to_bq": 34, "import_file_to_bq": 34, "import_gcs_paths_to_bq": 34, "imprecis": [16, 50], "improv": [11, 16, 50, 53, 59], "in1": 13, "in_asset_kei": 2, "in_process": [8, 20], "in_process_executor": 8, "inaccuraci": 14, "inbound": [16, 50], "includ": [1, 2, 3, 7, 8, 9, 11, 13, 14, 15, 16, 18, 19, 20, 23, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 39, 40, 42, 44, 46, 50, 52, 53, 54, 55, 63, 64, 67, 68, 69], "include_checksum": 14, "include_delet": 14, "include_deleted_object": 14, "include_exampl": 15, "include_self": 2, "include_sidecar": 16, "include_sourc": 2, "inclus": 64, "incom": [16, 42, 50], "incompat": [7, 63], "incorpor": 2, "incorrect": 34, "increas": [14, 16, 50, 53], "increment": [14, 16, 24, 32, 50], "incremental_append": 14, "incremental_append_dedup": 14, "inculd": 9, "incur": 11, "indefinit": 53, "independ": [34, 66], "index": [3, 9, 11, 16, 50, 63, 64, 65, 67], "indian": 64, "indic": [2, 3, 4, 7, 8, 11, 12, 14, 23, 26, 34, 39, 45, 53, 63, 67], "individu": [2, 5, 6, 8, 14], "ineffici": 14, "infer": [2, 9, 12, 14, 26, 34, 39, 63], "infer_column_typ": 63, "infer_datatyp": 14, "infer_missing_t": 33, 
"infer_timestamp": 14, "infinit": [11, 16, 50, 53], "info": [3, 8, 11, 12, 14, 16, 18, 26, 34, 38, 50, 51, 61, 63, 66, 69], "inform": [1, 2, 3, 8, 9, 11, 12, 13, 14, 15, 16, 19, 21, 23, 26, 27, 33, 34, 39, 42, 50, 52, 53, 60, 62, 63, 67, 69], "infrastructur": 15, "ingest": [14, 62], "ingest_start": 14, "inherit": [5, 7, 8, 11, 13, 29, 30, 31, 45, 61, 63, 68, 69], "init": [23, 45, 66, 68], "init_context": [11, 12, 16, 47, 61, 66], "init_script": 23, "initexecutorcontext": 11, "initi": [2, 4, 7, 8, 11, 12, 14, 16, 17, 22, 23, 26, 33, 41, 50, 60, 61, 66, 67, 69], "initial_waiting_second": 14, "initialexecutor": [16, 50], "initializationact": 34, "initialr": [16, 50], "initloggercontext": [16, 47, 61], "initresourcecontext": [12, 66], "inittransact": 14, "inject": [3, 7, 20, 40, 60, 63], "inlin": [14, 51], "inmemoryiomanag": 12, "inner": 11, "inner_nod": 8, "inner_typ": 4, "input": [2, 4, 5, 7, 8, 9, 11, 13, 14, 16, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 45, 50, 51, 53, 54, 55, 60, 63, 65, 66, 68], "input1": 12, "input_config_schema": 12, "input_dagster_typ": 37, "input_def": [9, 63], "input_manag": 12, "input_manager_kei": [2, 12, 63], "input_map": [8, 9], "input_nam": 8, "input_t": 23, "input_valu": [8, 9, 13], "inputcontext": [12, 63], "inputdefinit": [9, 12, 45, 63, 68], "inputmanag": [12, 63], "inputmanagerdefinit": 12, "inputmap": [8, 9], "ins": [2, 8, 9, 11, 12, 28, 29, 30, 31, 34, 35, 36, 51, 53, 54, 55, 60, 63, 64, 68], "insensit": [11, 14], "insert": 14, "insid": [1, 2, 5, 12, 14, 16, 26, 27, 33, 34, 40, 50, 59, 65, 67], "insight": 14, "insightconfig": 14, "insightli": 14, "insightlysourc": 14, "insights_lookback_window": 14, "inspect": 5, "insqlitev": 11, "inst_data": [11, 16, 17, 34, 43], "instagram": 14, "instagramsourc": 14, "instal": [14, 16, 23, 34, 38, 40, 44, 52], "install_default_librari": 23, "installation_id": 38, "instanc": [2, 4, 5, 7, 8, 9, 10, 12, 13, 14, 16, 20, 23, 25, 26, 33, 34, 40, 41, 42, 45, 48, 50, 52, 53, 59, 61, 62, 64, 65, 66, 
67, 68, 69], "instance_api_url": 14, "instance_config_map": [20, 40], "instance_for_test": [8, 67], "instance_pool_id": 23, "instance_profile_arn": 23, "instance_ref": [3, 67], "instance_typ": [11, 14], "instance_url_prefix": 14, "instanceof": 68, "instanceref": [3, 11, 67], "instancetyp": 11, "instanti": [2, 5, 6, 8, 11, 12, 16, 17, 26, 34, 43, 47, 48, 53, 61, 66, 67], "instead": [2, 3, 4, 7, 8, 9, 11, 13, 14, 16, 17, 18, 19, 20, 23, 26, 34, 42, 45, 50, 51, 52, 60, 64, 65, 67, 68, 69], "instruct": [14, 26, 38, 40, 44], "insuffici": [16, 50], "int": [1, 2, 4, 6, 7, 8, 9, 11, 14, 16, 17, 21, 23, 26, 34, 39, 40, 45, 51, 52, 59, 63, 64, 65, 66, 67, 68, 69], "integ": [4, 11, 14, 26, 61, 63], "integr": [14, 15, 16, 21, 23, 24, 25, 28, 29, 30, 31, 32, 33, 35, 36, 38, 40, 44, 47, 49, 50, 52, 53, 54, 55, 57, 58, 59, 60], "intend": [2, 8, 9, 11, 16, 20, 26, 39, 63, 65, 68], "intens": 23, "intent": 11, "inter": 3, "interact": [5, 8, 13, 14, 16, 19, 23, 26, 28, 32, 34, 37, 50, 60], "intercom": 14, "intercomsourc": 14, "interfac": [3, 11, 12, 14, 16, 21, 26, 33, 42], "intermedi": [11, 16, 50], "intern": [2, 3, 8, 12, 16, 26, 33, 34, 39, 43, 45, 48, 50, 51, 68], "internal_asset_dep": 2, "internal_ip_onli": 34, "internaliponli": 34, "interpol": [16, 50], "interrupt": [14, 16, 50], "intersect": 2, "interv": [2, 11, 14, 16, 17, 34, 50, 64, 67], "intmetadatavalu": 63, "intro": 14, "introduc": [5, 18, 19, 20], "introduct": [5, 18], "intsourc": [4, 11, 14, 16, 23, 25, 26, 27, 33, 38, 40, 43, 48, 49, 50, 53, 57, 67], "intuit": 16, "invalid": [7, 26, 39, 51, 63, 64, 69], "invalid_line_no": 26, "invalid_output_nam": 39, "invalid_step_kei": 39, "invalidoutputerror": 39, "invalidoutputerrorinfo": 39, "invalidsteperror": 39, "invari": 7, "invoc": [2, 8, 9, 10, 13, 26, 40, 51, 61, 66, 67], "invok": [2, 4, 5, 6, 8, 9, 10, 11, 13, 16, 19, 20, 26, 41, 47, 51, 61, 65, 67], "io": [2, 8, 14, 16, 17, 18, 20, 26, 27, 28, 32, 33, 34, 37, 38, 40, 50, 53, 59, 60, 62, 63], "io_manag": [2, 8, 9, 12, 13, 
14, 16, 17, 26, 28, 29, 30, 31, 33, 34, 35, 36, 53, 54, 55, 63], "io_manager_def": 2, "io_manager_kei": [2, 6, 12, 14, 26, 33, 59, 60, 63], "iomanag": [2, 5, 12, 14, 26, 33, 59, 62, 65], "iomanagerdefinit": [2, 5, 12, 16, 17, 28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "iomanagerdefnit": 12, "iop": 23, "ip": [14, 16, 34, 50], "ipynb": [60, 63], "iri": 60, "iris_dataset": 60, "iris_kmean": 60, "iris_kmeans_notebook": 60, "irrespect": 14, "is_asset_materialization_plan": 8, "is_asset_observ": 8, "is_builtin": [45, 68], "is_dagster_ev": 11, "is_engine_ev": 8, "is_expectation_result": 8, "is_failur": [8, 11], "is_failure_or_cancel": 11, "is_fan_in": 9, "is_finish": 11, "is_handled_output": 8, "is_hook_ev": 8, "is_loaded_input": 8, "is_observ": 2, "is_own": 23, "is_pres": 63, "is_requir": [2, 4, 6, 45, 63], "is_resource_init_failur": 8, "is_resume_retri": 11, "is_sandbox": 14, "is_step_ev": 8, "is_step_failur": 8, "is_step_materi": 8, "is_step_restart": 8, "is_step_skip": 8, "is_step_start": 8, "is_step_success": 8, "is_step_up_for_retri": 8, "is_success": [11, 26], "is_successful_output": 8, "is_user_code_error": 7, "is_valid": 63, "isinst": [26, 68], "isn": [27, 40], "iso": 14, "iso8601": 14, "isoformat": 26, "isol": 40, "ispreempt": 34, "issu": [3, 14, 16, 26, 34, 35, 36, 44, 50, 53, 54, 55], "issuer": 14, "issuer_id": 14, "item": [2, 4, 6, 14, 26, 60], "iter": [2, 5, 11, 14, 16, 26, 50, 66, 67], "iterablesourc": 14, "its": [2, 3, 6, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 27, 33, 40, 46, 50, 52, 60, 62, 63, 64, 65, 67], "itself": [3, 4, 7, 8, 9, 12, 13, 14, 16, 50], "ivi": [16, 50], "ivy2": [16, 50], "ivyset": [16, 50], "jaffle_shop": 26, "jar": [16, 34, 50], "jar_file_uri": 34, "jarfileuri": 34, "java": [14, 16, 50], "javaseri": [16, 50], "javax": [16, 50], "jdbc": 14, "jdbc_url": 14, "jdbc_url_param": 14, "jdbcdestin": 14, "jdbcsourc": 14, "jira": 14, "jirasourc": 14, "jitter": 63, "jni": [16, 50], "job": [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 26, 27, 33, 34, 38, 39, 40, 41, 42, 44, 47, 49, 50, 51, 52, 53, 56, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "job1": 13, "job_code_origin": 11, "job_config": [20, 34, 40], "job_context": 60, "job_def": [8, 60, 61, 68], "job_for_datadog_op": 24, "job_id": [23, 26], "job_imag": [20, 40], "job_metadata": [20, 40], "job_nam": [8, 10, 11, 12, 15, 26, 39, 42, 52, 60, 65, 67, 69], "job_namespac": [20, 40], "job_permiss": 23, "job_runn": 40, "job_scoped_clust": 34, "job_select": [42, 52, 67, 69], "job_snapshot_id": 11, "job_spec_config": [20, 40], "job_timeout_in_second": 34, "job_wait_timeout": 20, "jobconfigvalidationinvalid": 39, "jobdefinit": [2, 5, 7, 8, 9, 13, 15, 17, 42, 52, 60, 61, 63, 65, 67, 69], "jobexecutionresult": 8, "jobfactori": 13, "jobid": 34, "jobnotfounderror": 39, "jobs_client": 23, "jobsapi": 23, "jobselector": [42, 52, 67, 69], "jobspec": 40, "jog": [64, 67], "join": [6, 12, 14, 16, 50], "join_channel": 14, "joinpath": 26, "json": [1, 2, 3, 4, 6, 8, 9, 13, 14, 26, 32, 33, 34, 50, 51, 60, 63, 67], "json_console_logg": 61, "json_credentials_envvar": 34, "json_log_format": 26, "json_repr": 14, "jsonl": 14, "jsonlinesnewlinedelimitedjson": 14, "jsonmetadatavalu": 63, "july_asset": 67, "june": 64, "jupyt": [8, 13, 60], "just": [4, 5, 8, 14, 16, 20, 27, 28, 29, 30, 31, 34, 35, 36, 40, 53, 54, 55, 59], "jvm": [16, 23, 50], "jwt": 14, "k": [16, 50], "k8": 20, "k8s_job": 40, "k8s_job_executor": 40, "k8s_job_nam": 40, "k8s_job_op": 40, "k8srunlaunch": [20, 40], "kafka": [14, 16, 25, 50], "kafkadestin": 14, "kafkaproduc": 14, "kafkasourc": 14, "kb": [16, 50], "kdc": 34, "kdcdbkeyuri": 34, "keen": 14, "keendestin": 14, "keep": [2, 14, 16, 40, 50, 53, 64, 67], "keep_files_in_gcs_bucket": 14, "keepal": 57, "keepalive_interv": 57, "kei": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 44, 45, 50, 52, 53, 54, 55, 57, 59, 60, 64, 65, 66, 67, 68], 
"kept": 59, "kerber": 34, "kerbero": 34, "kerberosconfig": 34, "key1": 14, "key2": 14, "key3": 14, "key_encrypting_kei": 14, "key_fil": 57, "key_id": 14, "key_label_nam": 4, "key_prefix": [2, 14, 26, 28, 29, 30, 31, 33, 34, 35, 36, 53, 54, 55, 60], "key_store_password": 14, "key_str": 57, "key_typ": 4, "keypairauthent": 14, "keypassworduri": 34, "keyprefixdagsterdbttransl": 26, "keys_by_dimens": 64, "keys_by_input_nam": 2, "keys_by_output_nam": 2, "keyspac": 14, "keystor": 34, "keystorepassworduri": 34, "keystoreuri": 34, "keyword": [2, 4, 17, 19, 32, 60, 66], "kib": [16, 50], "kick": 2, "kill": [16, 50], "killblacklistedexecutor": [16, 50], "killen": [16, 50], "killtimeout": [16, 50], "kind": [1, 2, 6, 8, 13, 14, 45, 60, 64, 68], "kinesi": 14, "kinesisdestin": 14, "kit": 52, "klaviyo": 14, "klaviyosourc": 14, "km": [23, 34], "kms_kei": 23, "kmskeyuri": 34, "know": [2, 8, 13, 14, 16, 18, 50, 51, 53], "known": [14, 64], "known_stat": [11, 68], "kryo": [16, 50], "kryoregistr": [16, 50], "kryoseri": [16, 50], "kube": 22, "kubeconfig": [20, 40], "kubeconfig_fil": [20, 40], "kubectl": 40, "kubernet": [11, 16, 18, 22, 39, 50, 59], "kustom": 14, "kustomersingersourc": 14, "kvdb": 14, "kvdbdestin": 14, "kwarg": [4, 7, 8, 9, 11, 13, 14, 15, 17, 51, 61, 66, 68], "kyriba": 14, "kyribasourc": 14, "l": [3, 18], "label": [1, 2, 6, 14, 20, 34, 40, 45, 59, 63], "lack": [16, 50], "lag": 14, "lake": [14, 17, 23], "lakeformation_database_nam": 14, "lambda": [8, 13, 14, 33, 69], "lang": 14, "languag": 14, "larg": [2, 5, 14, 16, 23, 26, 50], "larger": [16, 50, 64], "last": [2, 8, 11, 14, 16, 34, 35, 36, 50, 64, 67], "last_completion_tim": 67, "last_run_kei": 67, "lastli": 2, "lastpartitionmap": 64, "lat": 14, "late": 52, "latenc": [16, 50], "later": [16, 50], "latest": [2, 11, 14, 16, 18, 19, 20, 23, 34, 40, 50, 56, 67], "latest_consumed_event_id": 67, "latest_event_partit": 67, "latest_event_storage_id": 67, "latest_materialization_by_partit": 67, 
"latest_materialization_records_by_kei": 67, "latest_materialization_records_by_partit": 67, "latest_materialization_records_by_partition_and_asset": 67, "latitud": 14, "latter": 11, "launch": [3, 8, 11, 13, 14, 16, 19, 20, 21, 23, 26, 27, 33, 40, 50, 59, 64, 67], "launcher": [3, 16, 20, 27, 40], "launchpipelineexecut": 3, "lazi": [2, 3, 26, 46, 65], "lazili": 65, "lazy_loaded_repositori": 65, "lead": [2, 14, 16, 50], "leader": [14, 16, 50], "learn": [2, 26], "least": [2, 14, 16, 50], "leav": [4, 9, 14, 16, 17, 40, 50], "ledger": 14, "left": [2, 11, 14, 33, 40, 52], "legaci": [5, 37], "lemlist": 14, "lemlistsourc": 14, "len": 63, "length": [4, 7, 14, 16, 34, 50, 53], "lengthi": 11, "less": [11, 14, 16, 18, 19, 20, 23, 50], "lesson": 14, "let": [14, 16, 18, 50], "letter": [2, 14, 34, 60, 63], "level": [1, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 18, 23, 26, 34, 40, 45, 50, 61, 63, 65, 67, 69], "lever": 14, "leverhiringsourc": 14, "lib": 34, "libjar": 34, "librari": [11, 14, 15, 16, 18, 21, 23, 24, 25, 26, 28, 29, 30, 31, 33, 35, 36, 38, 39, 40, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61], "lifetim": [23, 34], "like": [2, 3, 4, 8, 11, 13, 14, 16, 20, 23, 26, 33, 38, 39, 40, 50, 51, 53, 59, 61, 63, 66], "limit": [2, 11, 14, 16, 17, 27, 40, 50, 67], "line": [14, 16, 26, 50], "lineag": [16, 50, 63], "linear": 63, "linger_m": 14, "link": [14, 26], "linkedin": 14, "linkedinadssourc": 14, "linkedinpagessourc": 14, "linnwork": 14, "linnworkssourc": 14, "lint": 40, "list": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 23, 25, 26, 27, 32, 33, 34, 40, 41, 42, 45, 50, 52, 53, 59, 60, 63, 64, 65, 67, 68, 69], "list_file_system": 17, "list_objects_v2": 16, "list_vers": 3, "listdir": 65, "listen": [14, 16, 18, 19, 20, 50, 59], "listenerbu": [16, 50], "liter": [4, 26], "littl": [16, 50], "live": [2, 3, 12, 14, 16, 50, 67], "liveupd": [16, 50], "ll": [14, 18, 23, 24, 25, 38, 40, 44, 47, 52, 59], "load": [1, 2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 
20, 26, 28, 29, 30, 31, 33, 34, 35, 36, 40, 43, 48, 50, 51, 53, 54, 55, 60, 63, 64, 65, 67, 68, 69], "load_asset_valu": [2, 5, 65], "load_assets_from_airbyte_inst": 14, "load_assets_from_airbyte_project": 14, "load_assets_from_airflow_dag": 15, "load_assets_from_connect": 14, "load_assets_from_current_modul": [2, 59], "load_assets_from_dbt_cloud_job": 26, "load_assets_from_dbt_manifest": 26, "load_assets_from_dbt_project": 26, "load_assets_from_fivetran_inst": 33, "load_assets_from_modul": 2, "load_assets_from_package_modul": 2, "load_assets_from_package_nam": 2, "load_dict": 68, "load_from_path": 12, "load_incluster_config": [20, 40], "load_input": [2, 5, 12, 62, 65], "load_iri": 60, "load_kube_config": [20, 40], "load_table_from_local_parquet": 53, "loadabl": 5, "loaded_input": 8, "loader": [4, 5, 45, 65, 68], "loader_vers": 68, "loading_method": 14, "loadrepositori": 3, "local": [3, 8, 11, 12, 14, 16, 17, 18, 19, 22, 23, 26, 27, 34, 49, 50, 59], "local_artifact_storag": [11, 12], "local_artifact_storage_data": 11, "local_bas": 69, "local_compute_log_manag": 11, "local_dagster_job_package_path": 23, "local_dir": [16, 17, 34, 50], "local_disk0": 23, "local_file_manag": 11, "local_job_package_path": 16, "local_json_destin": 14, "local_pipeline_package_path": [16, 23], "local_temp": 11, "local_warehous": 69, "localartifactstorag": 11, "localclust": 22, "localcomputelogmanag": 11, "localfilehandl": 11, "localfilesystemlimit": 14, "localhost": [3, 14, 15, 16, 18, 19, 20, 39, 41, 42, 44, 52], "localjsondestin": 14, "localobjectrefer": 40, "locat": [2, 3, 5, 11, 14, 15, 16, 20, 23, 25, 34, 35, 36, 39, 40, 50, 60, 67], "location_nam": [3, 67], "log": [3, 7, 8, 10, 12, 14, 16, 17, 18, 20, 21, 23, 26, 40, 41, 43, 47, 48, 50, 51, 53, 59, 60, 66, 67, 69], "log_ev": [8, 12], "log_group_nam": 16, "log_kei": 11, "log_level": [3, 16], "log_manag": [8, 11, 12, 66, 68], "log_materi": 8, "log_param": 41, "log_stream_nam": 16, "logblockupd": [16, 50], "logconf": [16, 50], "logger": 
[4, 5, 8, 9, 13, 16, 21, 26, 47, 51, 60, 67, 69], "logger_config": [16, 47, 61], "logger_def": [8, 9, 13, 60, 61], "logger_fn": [16, 47, 61], "logger_to_init": 61, "loggerdefinit": [4, 5, 8, 9, 13, 16, 47, 60, 61], "logging_tag": [8, 60], "loggingconfig": 34, "logic": [2, 5, 11, 12, 14, 16, 47, 50, 61, 63, 66, 68], "logicalreplicationcdc": 14, "login": [14, 16, 23, 40, 53, 54, 55], "login_customer_id": 14, "login_timeout": 53, "loginpassword": 14, "loglevel": 18, "logs_batch_s": 14, "logwrit": 34, "lon": 14, "long": [11, 14, 16, 17, 23, 34, 40, 49, 50, 59, 64, 67], "longer": [16, 23, 40, 50, 63, 67], "longform": [16, 50], "longitud": 14, "look": [2, 4, 9, 11, 14, 26, 67], "lookback": 14, "lookback_window": 14, "lookback_window_dai": 14, "looker": 14, "lookersourc": 14, "lookup": [14, 16, 50], "lookuptimeout": [16, 50], "loop": [14, 16, 40, 50], "los_angel": [2, 26, 53, 64, 67], "loss": 14, "lost": [14, 16, 50], "lot": [14, 16, 50], "low": 23, "lower": [16, 23, 50], "lowercas": [18, 19, 20], "lowest": 14, "ls": 26, "lsf": 22, "lwa_app_id": 14, "lwa_client_secret": 14, "lz4": [16, 50], "lz4compressioncodec": [16, 50], "lzf": [16, 50], "lzfcompressioncodec": [16, 50], "m": [3, 16, 40, 50, 64, 67], "machin": [2, 3, 16, 34, 50, 59], "machineri": [11, 43, 45, 48, 63, 68], "machinetyp": 34, "machinetypeuri": 34, "maco": 40, "macro": 26, "made": [8, 9, 14, 16, 50, 60, 61, 66, 67], "magic": [11, 66], "magic_word": 11, "magicmock": [17, 66], "mai": [1, 2, 4, 7, 8, 9, 10, 11, 13, 14, 16, 17, 18, 19, 20, 23, 26, 27, 33, 34, 40, 50, 51, 53, 60, 61, 63, 64, 65, 66, 67, 69], "mailchimp": 14, "mailchimpsourc": 14, "mailgun": 14, "mailgunsourc": 14, "main": [14, 23, 32, 34, 40, 45, 50, 52, 64], "main_class": 56, "mainclass": 34, "mainjarfileuri": 34, "mainli": 2, "mainpythonfileuri": 34, "maintain": [3, 11, 59], "majmin": 40, "make": [3, 4, 8, 9, 11, 12, 14, 16, 17, 18, 19, 20, 23, 26, 34, 38, 40, 50, 53, 59, 65, 66], "make_bar_job": [8, 13], "make_dagster_definit": 15, 
"make_dagster_definition_from_airflow_dag_bag": 15, "make_dagster_definitions_from_airflow_dag_bag": 15, "make_dagster_definitions_from_airflow_dags_path": 15, "make_dagster_definitions_from_airflow_example_dag": 15, "make_dagster_job_from_airflow_dag": 15, "make_dagster_repo": 15, "make_definition_from_dag_bag": 15, "make_definitions_from_dir": 15, "make_email_on_run_failure_sensor": 69, "make_ephemeral_airflow_db_resourc": 15, "make_expensive_job": 65, "make_expensive_schedul": 65, "make_job": 13, "make_my_t": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "make_persistent_airflow_db_resourc": 15, "make_python_type_usable_as_dagster_typ": 68, "make_repo_from_dir": 15, "make_schedules_and_jobs_from_airflow_dag_bag": 15, "make_slack_on_freshness_policy_status_change_sensor": 52, "make_slack_on_run_failure_sensor": 52, "make_teams_on_run_failure_sensor": 42, "make_values_resourc": [59, 66], "malform": 7, "man": 24, "manag": [2, 3, 7, 8, 10, 18, 20, 23, 26, 28, 29, 30, 31, 33, 35, 36, 40, 50, 54, 55, 60, 61, 62, 63, 66], "managed_logg": 61, "managedgroupconfig": 34, "mani": [5, 8, 11, 14, 16, 20, 23, 40, 50, 52, 63, 65, 67, 68], "manifest": 26, "manifest_json": 26, "manipul": 11, "manner": 66, "manual": [2, 8, 9, 14, 16, 23, 50, 53, 59, 63], "manuallyassignalistofpartit": 14, "map": [2, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 20, 26, 27, 33, 34, 45, 50, 51, 53, 60, 63, 65, 66, 67, 68], "map_config_op": 4, "mapped_node_input_nam": 9, "mapped_node_nam": 9, "mapped_node_output_nam": 9, "mapped_op": 6, "mappedinputplacehold": 9, "mapping_kei": [6, 8, 12], "mapr": 34, "mapreduc": [16, 34, 50], "mapreducetutori": 34, "maps_x": 9, "mariadb": 14, "mariadbcolumnstoredestin": 14, "mark": [4, 16, 20, 50, 64, 67], "markdown": [45, 52, 63, 68], "markdownmetadatavalu": 63, "market": [2, 14], "marketing_job": 2, "marketo": 14, "marketosourc": 14, "master": [14, 16, 34, 40, 50], "master_url": 56, "masterconfig": 34, "match": [1, 2, 7, 8, 11, 14, 16, 23, 33, 42, 45, 50, 60, 62, 63, 64, 66, 
67], "materi": [1, 2, 3, 4, 5, 11, 12, 14, 16, 17, 21, 26, 32, 33, 34, 60, 63, 67], "materializ": 2, "materialization_records_by_kei": 67, "materialization_records_for_kei": 67, "materialize_on_miss": 2, "materialize_on_parent_upd": 2, "materialize_on_required_for_fresh": 2, "materialize_to_memori": 8, "materializeresult": 2, "matter": [2, 16, 50, 67], "maven": [16, 50], "max": [3, 14, 16, 23, 50, 63], "max_attempt": 16, "max_batch_s": 14, "max_block_m": 14, "max_catchup_run": 67, "max_completion_wait_time_second": 23, "max_concurr": [8, 27, 40], "max_concurrent_run": 11, "max_dat": 26, "max_in_flight_requests_per_connect": 14, "max_job": 59, "max_materializations_per_minut": 2, "max_messag": 14, "max_padding_size_mb": 14, "max_partitions_per_run": 64, "max_pending_messag": 14, "max_pending_messages_across_partit": 14, "max_poll_record": 14, "max_records_process": 14, "max_request_s": 14, "max_retri": [8, 9, 63], "max_tick_retri": 67, "max_user_code_failure_retri": 11, "max_wait_second": 14, "max_wait_tim": 14, "max_wait_time_second": 23, "max_work": [3, 23], "maxattempt": [16, 50], "maxblocksinflightperaddress": [16, 50], "maxchunksbeingtransf": [16, 50], "maxconsecutiveattempt": [16, 50], "maxexecutor": [16, 50], "maxfailedexecutorspernod": [16, 50], "maxfailedtasksperexecutor": [16, 50], "maxfailur": [16, 50], "maxfailuresperhour": 34, "maxim": [16, 40, 50], "maximum": [2, 3, 11, 14, 16, 21, 23, 26, 33, 34, 50, 63, 64, 67], "maximum_lag_minut": [2, 26, 67], "maxpartitionbyt": [16, 50], "maxrat": [16, 50], "maxrateperpartit": [16, 50], "maxregisteredresourceswaitingtim": [16, 50], "maxremoteblocksizefetchtomem": [16, 50], "maxreqsinflight": [16, 50], "maxresults": [16, 50], "maxretainedfil": [16, 50], "maxretri": [16, 50], "maxsiz": [16, 50], "maxsizeinflight": [16, 50], "maxtaskattemptsperexecutor": [16, 50], "maxtaskattemptspernod": [16, 50], "mb": [16, 50], "md": [14, 63], "md_str": 63, "mdash": 14, "me": 14, "mean": [2, 4, 6, 8, 12, 13, 14, 16, 42, 50, 52, 
64, 67, 69], "meant": [2, 7, 8, 45, 68], "measur": [14, 16, 50], "mechan": [5, 14, 16, 18, 50, 53, 54, 55], "median": [16, 50], "meet": [1, 2, 8, 9, 13, 33, 51, 60, 63], "megabyt": 14, "meilisearch": 14, "meilisearchdestin": 14, "meltano": 14, "mem_io_manag": [8, 12], "member": [7, 11, 14, 65], "membership": 14, "memoiz": [3, 12, 13], "memoizableiomanag": 62, "memoizaton": [8, 9], "memoized_run_tag": 62, "memori": [8, 9, 11, 12, 13, 14, 16, 17, 23, 28, 34, 50, 53], "memory_onli": [16, 50], "memory_only_s": [16, 50], "memoryfract": [16, 50], "memorymapthreshold": [16, 50], "memoryoverhead": [16, 50], "mention": 14, "menu": 14, "merchant_id": 14, "merg": [16, 50], "mesag": 14, "meso": [16, 50], "mesos_sandbox": [16, 50], "messag": [4, 7, 8, 10, 11, 14, 16, 26, 39, 42, 47, 50, 52, 59, 61, 67, 69], "message_body_kei": 14, "message_delai": 14, "message_fn": [42, 52], "message_group_id": 14, "message_interval_m": 14, "message_qo": 14, "message_retain": 14, "messageformat": 14, "met": [11, 67], "meta": [14, 26, 33, 40], "metabas": 14, "metabasesourc": 14, "metadata": [1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 25, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39, 40, 46, 50, 51, 53, 54, 55, 59, 60, 62, 65, 68], "metadata_by_kei": 2, "metadata_by_output_nam": 2, "metadata_by_table_nam": 33, "metadata_fn": 45, "metadatachangeev": 25, "metadatachangeevent_v4": 25, "metadatachangepropos": 25, "metadatachangeproposal_v1": 25, "metadataentri": 63, "metadatamap": 65, "metadatauserinput": [2, 26, 33], "metadatavalu": [1, 2, 6, 8, 9, 13, 45, 63], "method": [1, 2, 6, 8, 11, 12, 13, 14, 15, 16, 21, 24, 26, 29, 30, 31, 33, 34, 35, 36, 37, 41, 46, 47, 52, 53, 59, 61, 62, 63, 64, 66, 67], "metric": [14, 16, 24, 34, 49, 50], "metrica": 14, "mgmt": 16, "mib": [16, 23, 50], "microsoft": 14, "microsoftteam": 42, "microsoftteamssourc": 14, "midnight": [64, 67], "might": [4, 16, 50, 51], "migrat": 3, "mileston": [16, 50], "milli": 14, "millisecond": [3, 14, 16, 50], "min": [14, 23], "min_dat": 26, 
"min_work": 23, "minexecutor": [16, 50], "minim": [16, 50], "minimum": [14, 16, 23, 50, 52, 67], "minimum_interval_second": [52, 67], "minrateperpartit": [16, 50], "minregisteredresourcesratio": [16, 50], "minut": [2, 14, 16, 23, 34, 50, 52, 64, 67], "minute_of_hour": [64, 67], "minute_offset": [64, 67], "minutes_overdu": [52, 67], "mirror": [14, 24], "miss": [2, 12, 26, 67], "missing_column": 63, "missing_th": 63, "mitig": [16, 50], "mix": 2, "mixin": 11, "mixpanel": 14, "mixpanelsourc": 14, "ml": 59, "ml_model_for_each_ocean": 64, "mlf_exampl": 41, "mlflow_op": 41, "mlflow_s3_endpoint_url": 41, "mlflow_track": 41, "mlflow_tracking_uri": 41, "mlflowclient": 41, "mlop": 59, "mm": 14, "mnt": 19, "moab": 22, "mock": [10, 12, 17, 66], "mock_catalog": 14, "mock_resourc": 66, "mode": [2, 3, 8, 9, 11, 14, 16, 32, 46, 50, 51, 53], "model": [2, 4, 15, 26, 59], "model_nam": 26, "modifi": [14, 18, 19, 20, 26, 63], "modul": [2, 3, 5, 8, 11, 13, 14, 15, 16, 17, 18, 19, 20, 34, 40, 43, 48, 59, 61, 63, 69], "module_nam": [3, 11], "moduletyp": 2, "moment": 14, "mondai": [14, 64], "mondaysourc": 14, "mongo": 14, "mongodb": 14, "mongodbatla": 14, "mongodbdestin": 14, "mongodbsourc": 14, "mongodbv2sourc": 14, "monitor": [14, 15, 16, 21, 26, 33, 42, 50, 52, 67, 69], "monitor_all_repositori": [42, 52, 67, 69], "monitored_asset": 67, "monitored_job": [42, 52, 67, 69], "month": [64, 67], "monthli": [64, 67], "monthly_partitioned_config": [64, 67], "monthlypartitionsdefinit": 64, "more": [2, 5, 7, 8, 14, 15, 16, 18, 23, 26, 34, 38, 42, 50, 52, 53, 54, 55, 63, 65, 66, 67, 69], "most": [2, 8, 14, 16, 18, 19, 20, 23, 45, 50, 67, 68, 69], "mostli": [11, 17], "mount": [14, 16, 20, 40], "mqtt": 14, "mqttdestin": 14, "mr": 34, "ms": [14, 16, 42, 50], "msg": [61, 63], "msg_fn": 11, "mssql": 14, "mssqldestin": 14, "mssqlsourc": 14, "msteams_resourc": 42, "msteamsresourc": 42, "much": [14, 16, 40, 50], "multi": [2, 8, 9, 14, 16, 18, 26, 50, 64, 67], "multi_asset": [1, 2, 8, 26, 33], 
"multi_asset_sensor": 67, "multi_or_in_process_executor": [2, 8, 9, 13], "multi_out": 63, "multi_run": 64, "multiassetsensordefinit": 67, "multiassetsensorevaluationcontext": 67, "multidependencydefinit": 9, "multipartit": [64, 67], "multipartitionkei": 64, "multipartitionmap": 64, "multipartitionsdefinit": [8, 12, 64], "multipartitionsmap": 64, "multipl": [1, 2, 3, 5, 8, 9, 12, 13, 14, 15, 16, 17, 18, 20, 26, 33, 34, 40, 50, 53, 60, 63, 64, 65, 67], "multipli": [16, 50], "multiprocess": [8, 69], "multiprocess_executor": [8, 13], "multischema": 14, "multitosingledimensionpartitionmap": 64, "must": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 26, 33, 34, 35, 36, 39, 40, 42, 45, 50, 51, 59, 60, 62, 63, 64, 66, 67, 68], "mutat": [3, 23, 39], "my": [1, 11, 12, 14, 16, 17, 23, 26, 34, 40, 63], "my_airbyte_job": 14, "my_airbyte_resourc": 14, "my_artifact": 59, "my_asset": [1, 2, 4, 8, 34], "my_asset_check": 2, "my_asset_has_enough_row": 1, "my_asset_sensor": 67, "my_assets_list": 2, "my_aws_key_id": 41, "my_bool_with_metadata": 4, "my_census_job": 21, "my_census_resourc": 21, "my_channel": 52, "my_composed_airbyte_job": 14, "my_composed_fivetran_job": 33, "my_custom_dbt_run": 26, "my_custom_metadata": 26, "my_custom_metadata_valu": 26, "my_custom_tag": 26, "my_dag_bag": 15, "my_dagster_job": 15, "my_dagster_queu": 59, "my_dashboard": 63, "my_databas": [11, 48, 53, 54, 55], "my_dataset": [34, 35, 36, 63], "my_db": [28, 29, 30, 31], "my_dbt_asset": 26, "my_dbt_cli_job": 26, "my_dbt_cloud_job": 26, "my_dbt_cloud_resourc": 26, "my_dbt_op": 26, "my_dbt_output": 26, "my_default_str": 4, "my_downstream_op": 9, "my_echo_op": 51, "my_ent": 59, "my_experi": 41, "my_explicit_paramet": 4, "my_fivetran_job": 33, "my_fivetran_resourc": 33, "my_funct": [2, 63], "my_graph": [8, 9, 13, 51], "my_graph_alia": [8, 9], "my_hook": [8, 9], "my_hostnam": [11, 48], "my_implicit_paramet": 4, "my_int_asset": 2, "my_int_list": 4, "my_int_var": 66, "my_io_manag": 12, 
"my_io_manager_kei": 12, "my_job": [9, 12, 16, 17, 18, 34, 42, 49, 52, 64, 65, 66, 67, 69], "my_launched_job": 59, "my_message_fn": [42, 52, 69], "my_modul": [18, 19, 20], "my_op": [12, 50, 66], "my_org": 26, "my_other_explicit_paramet": 4, "my_other_t": 63, "my_password": [11, 48], "my_polici": 2, "my_prefix": 26, "my_project": [40, 59], "my_pyspark_resourc": 50, "my_repo": [19, 20, 28, 29, 30, 31, 42, 52, 65, 69], "my_repositori": 59, "my_return_n_": 65, "my_s3_endpoint": 41, "my_sas_token": 17, "my_schedul": 65, "my_schema": [28, 29, 30, 31, 53, 54, 55], "my_secret": 41, "my_select": 26, "my_sensor": [64, 67], "my_shell_op": 51, "my_simple_airbyte_job": 14, "my_simple_census_job": 21, "my_simple_fivetran_job": 33, "my_slack_token": 52, "my_snowflake_job": 53, "my_sourc": 26, "my_spark_job": 50, "my_storage_account": 17, "my_str": 4, "my_str_var": 66, "my_string_asset": 2, "my_subdomain": 14, "my_tabl": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55, 63, 67], "my_table_a": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "my_table_schema": 63, "my_tag": [8, 9], "my_text_label": 63, "my_upstream_asset": 2, "my_upstream_graph": 9, "my_upstream_op": 9, "my_us": 19, "my_usernam": [11, 48], "my_valu": [8, 9, 26], "my_vari": 26, "my_wandb_job": 59, "my_wandb_resourc": 59, "my_warehous": [53, 55], "myassetconfig": 4, "mybigqueryiomanag": [34, 35, 36], "myclass": 63, "mycompani": [20, 40], "mycoolsit": [52, 63, 69], "mycorp": [16, 34], "mydbtconfig": 26, "myduckdbiomanag": [28, 29, 30, 31], "myexternaliomanag": 12, "myhourssourc": 14, "myiomanag": 12, "mymodul": 13, "mypermissiveopconfig": 4, "myregistrynam": 40, "myshopifi": 14, "mysit": 14, "mysnowflakeiomanag": [53, 54, 55], "mysql": 14, "mysql_db": 43, "mysql_url": 43, "mysqldestin": 14, "mysqleventlogstorag": [11, 43], "mysqlrunstorag": [11, 43], "mysqlschedulestorag": [11, 43], "mysqlsourc": 14, "mytabl": 63, "n": [3, 14, 15, 18, 46, 65], "n1": 34, "n_worker": 22, "na": 14, "naiv": 34, "name": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 46, 50, 51, 52, 53, 54, 55, 56, 59, 60, 63, 64, 65, 67, 68, 69], "name1": [16, 50], "name2": [16, 50], "name_type_dict": 63, "named_repo": 5, "namedtemporaryfil": 11, "namedtupl": 60, "namespac": [14, 20, 40], "narrow": 67, "nativ": [14, 16, 50, 59], "nativenetworkencryptionnn": 14, "navig": 14, "ndaysago": 14, "necessari": [9, 11, 14, 16, 23, 26, 50, 59, 68], "need": [2, 5, 8, 9, 11, 13, 14, 16, 17, 18, 23, 24, 25, 26, 27, 34, 38, 40, 44, 45, 47, 50, 52, 53, 59, 63, 65, 67, 68, 69], "needs_input": 9, "neg": [8, 11, 14, 16, 50, 63], "neither": [16, 34, 45, 46, 68], "nest": [8, 9, 13, 41], "net": [14, 16, 50], "netsuit": 14, "netsuitesourc": 14, "netti": [16, 50], "network": [11, 14, 16, 17, 19, 27, 34, 50], "network_timeout": 53, "network_uri": 34, "networkuri": 34, "never": [2, 8, 14, 16, 18, 19, 20, 21, 26, 33, 50, 67], "new": [2, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 20, 23, 26, 40, 44, 50, 52, 65, 66], "new_clust": 23, "newer": [16, 50], "newli": [11, 14], "newlin": [34, 35, 36, 53, 54, 55], "newlines_in_valu": 14, "next": [6, 14, 63, 67], "nf": [16, 50], "no_host_key_check": 57, "no_permiss": 23, "nocompress": 14, "nodatim": 14, "node": [2, 4, 8, 9, 13, 14, 16, 17, 23, 26, 34, 50], "node_a": 9, "node_b": 9, "node_def": [2, 8, 9], "node_handl": [8, 60], "node_info_to_asset_kei": 26, "node_info_to_auto_materialize_policy_fn": 26, "node_info_to_definition_metadata_fn": 26, "node_info_to_descript": 26, "node_info_to_freshness_policy_fn": 26, "node_info_to_group_fn": 26, "node_info_to_metadata": 26, "node_input_source_asset": [8, 9], "node_str": 8, "node_typ": 23, "node_type_id": 23, "nodedefinit": [2, 8, 9], "nodehandl": 8, "nodeinvoc": [8, 9], "noe": 23, "noencrypt": 14, "nois": 52, "non": [2, 4, 8, 14, 16, 19, 26, 27, 34, 50, 51, 64, 67], "non_argument_dep": [2, 8, 60], "non_scalar_schema": 4, "noncancel": [16, 50], "none": [1, 2, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 49, 51, 52, 53, 54, 55, 56, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "none_": 14, "none_resourc": 66, "noneabl": [4, 7], "nonetyp": 4, "noop_compute_log_manag": 11, "noopcomputelogmanag": 11, "nor": [34, 45, 68], "normal": [2, 14, 26], "normalization_t": 14, "normalize_data": 14, "normalized_nam": 15, "nosigint": 69, "note": [2, 4, 8, 11, 12, 14, 16, 17, 18, 19, 20, 21, 23, 26, 27, 34, 35, 36, 38, 39, 43, 48, 50, 51, 53, 55, 61, 63, 65, 67], "notebook": [8, 13, 60, 63], "notebook_path": [23, 60, 63], "notebook_task": 23, "notebookmetadatavalu": 63, "noth": [26, 51, 66, 67, 68], "nothing_int_job": 68, "nothing_job": 68, "notic": [14, 61], "notif": 52, "notify_when_back_on_tim": 52, "notion": 14, "notionsourc": 14, "notunnel": 14, "novaluesentinel": [2, 6, 63], "now": [8, 13, 23, 38, 39], "np": 60, "ntype": 68, "null": [1, 14, 63], "nullabl": 63, "num": [8, 9, 16, 50, 63], "num_allowed_row": 45, "num_failur": 46, "num_row": [1, 63], "num_work": 23, "number": [2, 3, 5, 8, 11, 14, 16, 21, 22, 23, 26, 27, 33, 34, 39, 40, 45, 46, 50, 52, 53, 60, 63, 64, 67], "numconnectionsperp": [16, 50], "numer": 53, "numeric_event_properties_kei": 14, "numinst": 34, "numlocalssd": 34, "numpi": [53, 60], "numrbackendthread": [16, 50], "numretri": [16, 50], "o": [2, 3, 12, 14, 28, 29, 30, 31, 35, 36, 54, 55], "oar": 22, "oauth": [14, 16, 23, 50], "oauth2": 14, "oauth20": 14, "oauth2accesstoken": 40, "oauth2credenti": 14, "oauth_client_id": 23, "oauth_client_secret": 23, "oauth_credenti": 23, "oauthauthent": 14, "oauthcredenti": 14, "obj": 12, "object": [1, 2, 4, 5, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 23, 26, 27, 32, 33, 34, 37, 39, 40, 42, 45, 46, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, 66, 67, 68], "object_typ": 14, "objectadmin": 34, "objectmeta": 40, "objectstreamreset": [16, 50], "observ": [2, 6, 26, 67], "observable_source_asset": [2, 
5], "observation_job": 2, "observe_fn": 2, "obtain": 14, "occasion": [16, 50], "occur": [2, 7, 8, 9, 11, 13, 14, 16, 34, 39, 50, 51, 59, 63, 64, 67, 68], "oceans_partitions_def": 64, "ocsp": 53, "ocsp_response_cache_filenam": 53, "octavia": 14, "off": [2, 3, 8, 16, 33, 34, 40, 50, 62], "offer": [16, 50], "offheap": [16, 50], "offici": [3, 14, 23, 39], "offset": [14, 16, 50, 64], "often": [1, 2, 7, 16, 23, 50], "ok": 4, "okta": 14, "oktasourc": 14, "old": [2, 5, 8, 16, 50], "older": [5, 16, 50], "omit": [2, 4, 14, 16, 34, 39, 45, 50, 53, 54, 55, 68, 69], "onc": [2, 4, 11, 14, 38, 44, 52, 64, 66, 67, 68], "one": [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 26, 27, 28, 34, 39, 40, 50, 53, 59, 61, 63, 64, 65, 67, 68, 69], "ones": [2, 23, 26, 64, 67], "onesign": 14, "onesignalsourc": 14, "ongo": 11, "onli": [2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 21, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68], "onlin": 24, "onto": 40, "ontolog": 5, "onward": 14, "oom": [16, 50], "op": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 24, 25, 28, 29, 30, 31, 35, 36, 37, 38, 41, 42, 44, 45, 49, 50, 51, 52, 54, 55, 60, 62, 65, 66, 67, 68, 69], "op1": 23, "op_a": [9, 12], "op_b": [9, 12], "op_c": 9, "op_config": [4, 6, 8, 10, 60, 62, 65], "op_def": [2, 8, 12, 60, 62, 68], "op_definit": [14, 26, 27, 33, 34, 40], "op_except": 10, "op_nam": [26, 60], "op_output_valu": 10, "op_retry_polici": [8, 9, 13], "op_select": [8, 9, 11, 13, 39], "op_tag": [1, 2, 33, 60], "op_to_invok": 8, "op_with_config": 4, "opdefinit": [2, 4, 8, 10, 12, 14, 21, 23, 26, 27, 33, 34, 40, 51, 53, 60, 62, 63], "open": [4, 8, 11, 14, 16, 38, 50, 51, 59, 64], "opencostinbyt": [16, 50], "openweath": 14, "openweathersourc": 14, "oper": [2, 8, 12, 13, 14, 15, 16, 21, 26, 33, 37, 38, 39, 50, 53], "opexecutioncontext": [8, 26, 51], "oppos": 2, "opt": [20, 40], "optim": [14, 16, 23, 50], "option": [1, 2, 3, 4, 5, 
6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "optionalcompon": 34, "opversioncontext": 62, "oracl": 14, "oracledestin": 14, "oraclesourc": 14, "orb": 14, "orbit": 14, "orbitsourc": 14, "orbsourc": 14, "orchestr": [6, 11, 26, 40, 59], "order": [2, 4, 8, 11, 12, 14, 16, 17, 20, 23, 26, 40, 45, 46, 50, 59, 63, 64, 67, 68], "order_bi": 11, "ordinari": 16, "ore": 67, "org": [2, 14, 16, 26, 34, 50, 56, 64, 67], "org_id": 14, "organ": [2, 5, 14, 15, 33, 59, 60, 63], "organization_id": 15, "origin": [2, 4, 7, 8, 9, 11, 20, 40], "original_exc_info": 7, "original_root": 7, "os": [6, 14, 16, 18, 19, 20, 38, 42, 52, 63, 65, 69], "other": [2, 3, 4, 5, 7, 8, 9, 11, 13, 14, 16, 20, 26, 33, 50, 53, 63, 65, 69], "other_asset": 66, "other_expensive_job": 65, "other_nam": 9, "other_op": [14, 33], "other_op_a": [8, 9, 13], "other_op_b": [8, 9, 13], "other_result": 9, "otherwis": [8, 9, 11, 12, 14, 16, 20, 26, 33, 34, 35, 36, 40, 50, 53, 62, 63, 67], "our": [11, 14], "out": [2, 3, 6, 8, 9, 12, 14, 16, 21, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 39, 40, 50, 52, 53, 54, 55, 60], "outcom": 14, "outcome_nam": 14, "outer_graph": 8, "outgo": 14, "outliv": 40, "output": [1, 2, 3, 6, 7, 8, 9, 10, 11, 14, 16, 17, 21, 23, 25, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 42, 45, 50, 51, 52, 53, 54, 55, 60, 62, 63, 66, 68, 69], "output_captur": [8, 11, 68], "output_config_schema": 12, "output_def": [9, 51, 63], "output_for_asset_kei": 8, "output_for_nod": 8, "output_log": 51, "output_map": [8, 9], "output_nam": [6, 8, 12, 26, 60, 63], "output_notebook_io_manag": 60, "output_notebook_nam": 60, "output_obj": 8, "output_requir": 2, "output_t": 23, "output_valu": 8, "outputcontext": [12, 62], "outputdefinit": [9, 12, 45, 63, 68], "outputmap": [8, 9], "outreach": 14, "outreachsourc": 14, "outsid": [2, 5, 8, 11, 12, 16, 
40, 50, 64, 66], "over": [2, 3, 5, 8, 14, 16, 23, 24, 39, 50, 64, 65, 67], "overdu": 67, "overestim": [16, 50], "overhead": [16, 50], "overlap": 64, "overload": 23, "overrid": [2, 4, 5, 8, 9, 11, 13, 15, 16, 23, 26, 37, 40, 50, 51, 64, 67], "overridden": [8, 16, 26, 40, 42, 50, 52, 67, 68, 69], "overriden": 26, "overview": [14, 20, 23, 40, 63], "overwrit": [12, 14, 16, 17, 34, 50, 64], "overwritten": [2, 8, 9, 13, 34, 60], "own": [2, 5, 7, 8, 13, 16, 26, 27, 40, 50, 63, 66], "owner": [23, 38], "p": [3, 40], "p8": [53, 54, 55], "pa": [46, 53], "pacif": 64, "pack": [16, 50], "packag": [2, 3, 11, 16, 23, 32, 34, 50, 69], "package_modul": 2, "package_nam": [2, 3], "packet": 57, "page": [14, 16, 24, 26, 32, 38, 50, 52], "page_id": 14, "page_s": 14, "page_size_for_large_stream": 14, "page_size_kb": 14, "pagerduty_op": 44, "pagerduty_resourc": 44, "pagerduty_test": 44, "pagerdutyservic": 44, "pagin": [11, 14], "pair": [8, 11, 13, 14, 20, 23, 26, 27, 40, 67], "panda": [1, 28, 34, 36, 46, 53, 55, 60], "pandascolumn": 45, "pandera_schema_to_dagster_typ": 46, "panel": 14, "papermil": 60, "papertrail_logg": 47, "parallel": [16, 34, 40, 50], "param": [3, 11, 16, 48, 50], "paramet": [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 26, 27, 28, 32, 33, 34, 37, 39, 40, 42, 45, 46, 47, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "parameter": [2, 8, 9, 13, 26, 67], "parametr": 8, "paramiko": 57, "paramstyl": 53, "pardot": 14, "pardot_business_unit_id": 14, "pardotsourc": 14, "parent": [2, 12, 16, 17, 21, 26, 34, 41, 63], "parent_run_id": [8, 11, 41], "parquet": [4, 14, 53], "parquetcolumnarstorag": 14, "pars": [4, 7, 8, 10, 11, 14, 26, 62, 68, 69], "part": [2, 4, 6, 14, 38, 42, 63, 66, 67], "parti": 11, "partial": [4, 16, 17, 34], "partially_specified_config": 4, "particular": [1, 2, 8, 11, 16, 27, 40, 50, 64, 66, 67], "partit": [2, 3, 5, 8, 9, 11, 12, 13, 14, 16, 26, 50, 60, 63, 65], "partition_dimension_nam": 64, "partition_fn": 64, 
"partition_kei": [2, 5, 8, 11, 12, 13, 64, 65, 67], "partition_key_rang": [8, 64], "partition_key_to_vars_fn": 26, "partition_map": [2, 8, 64], "partition_time_window": 8, "partition_x": 67, "partitiondefinit": 26, "partitiondimensiondefinit": 64, "partitioned_config": [8, 13], "partitionedconfig": [8, 9, 13, 64, 67], "partitionkeyrang": [8, 64], "partitionmap": [2, 12, 64], "partitions_def": [2, 8, 9, 11, 13, 26, 60, 64, 67], "partitions_def_nam": [11, 67], "partitions_subset": 64, "partitionsdefinit": [2, 8, 9, 11, 12, 13, 26, 60, 64], "partitionsfor": 14, "partitionshop_dbt_asset": 26, "partitionssubset": 64, "partner": 14, "partner_id": 14, "pass": [1, 2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 34, 39, 40, 41, 42, 45, 50, 51, 52, 53, 60, 61, 62, 63, 64, 65, 66, 67, 68], "password": [11, 12, 14, 16, 19, 20, 27, 32, 34, 40, 43, 48, 53, 54, 55, 57, 69], "passwordauthent": 14, "past": [11, 14, 23, 52, 64, 67], "patcredenti": 14, "path": [2, 3, 6, 11, 12, 14, 15, 16, 17, 18, 20, 23, 26, 28, 29, 30, 31, 32, 34, 40, 50, 51, 53, 54, 55, 60, 63, 67, 69], "path_desc": [11, 68], "path_pattern": 14, "path_prefix": [3, 12, 14], "pathlib": [12, 26], "pathlik": 63, "pathmetadatavalu": 63, "pattern": [11, 14, 69], "paus": [14, 16, 50], "pawel": 18, "pawelzni": 18, "payload": [11, 42, 44], "paypal": 14, "paypaltransactionsourc": 14, "paystack": 14, "paystacksourc": 14, "pb": 22, "pd": [2, 28, 29, 34, 35, 36, 53, 54, 55, 60], "pdb": [8, 69], "peer": 6, "pem": 16, "pend": [14, 16, 18, 50], "pendingnodeinvoc": 10, "pendulum": 64, "peopl": 50, "per": [1, 2, 5, 8, 11, 12, 13, 14, 16, 22, 23, 27, 34, 40, 50, 67], "percentag": 23, "perform": [2, 3, 7, 11, 14, 16, 23, 38, 50, 53, 62, 63, 64, 65], "period": [11, 14, 16, 50, 64, 67], "period_in_dai": 14, "periodicgc": [16, 50], "perman": 14, "permiss": [4, 7, 11, 14, 16, 17, 18, 19, 20, 22, 23, 27, 34, 38, 40, 41, 48, 50, 53, 68], "permissiveconfig": 4, "permit": [8, 11, 14, 64], "persist": [2, 8, 11, 14, 
15, 16, 17, 23, 34, 40, 50, 63, 67], "persistiq": 14, "persistiqsourc": 14, "person": [14, 25, 38], "personal_access_token": 14, "pg": 20, "phase": 14, "photo": 24, "phrase": 14, "pick": [14, 18, 19, 20, 34], "pickl": [4, 12, 16, 17, 34], "pid": 8, "piec": [11, 16, 23, 46, 50], "pig": 34, "pigjob": 34, "pinterest": 14, "pinterestsourc": 14, "pip": 44, "pipe": [51, 64], "pipedr": 14, "pipedrivesourc": 14, "pipelin": [3, 7, 8, 11, 16, 23, 32, 39, 53, 59, 66, 69], "pipeline_def": 61, "pipeline_run": 69, "pipelineconfigurationinvalid": 39, "pipelinenotfounderror": 39, "pipelinerun": 66, "pivot": 14, "pivotaltrackersourc": 14, "pkg_resourc": 69, "pkg_resource_def": 69, "pl": 30, "place": [2, 11, 12, 14, 16, 23, 26, 40, 45, 50], "placehold": 4, "placement": 34, "plai": [14, 67], "plaid": 14, "plaid_env": 14, "plaidsourc": 14, "plain": [52, 69], "plaintext": 14, "plan": [11, 14, 16, 20], "plan_context": 11, "plan_data": [11, 68], "plane": [16, 50], "planorchestrationcontext": 11, "platform": [14, 26, 34, 42], "pleas": [11, 14, 16, 50], "plu": [64, 67], "plug": 11, "pluggabl": [4, 11], "plugin": [11, 14, 35, 36, 54, 55], "plus_minu": 63, "pm": 67, "pod": [18, 20, 39, 40], "pod_spec_config": [20, 40], "pod_template_spec_metadata": [20, 40], "podspec": 40, "point": [2, 3, 11, 12, 14, 16, 18, 19, 20, 34, 35, 36, 50], "pointer": 8, "pokeapi": 14, "pokeapisourc": 14, "pokemon": 14, "pokemon_nam": 14, "polici": [1, 2, 8, 9, 11, 13, 14, 20, 23, 26, 40, 52, 60, 63, 67], "policy_id": 23, "poll": [14, 16, 21, 23, 26, 33, 50, 59], "poll_interv": [14, 21, 26, 33], "poll_interval_sec": 23, "poll_interval_second": 23, "poll_timeout": [14, 21, 26, 33], "polling_tim": 14, "polling_timeout": 11, "pollinginterv": [16, 50], "polygon": 14, "pool": [3, 14, 16, 23, 50], "poor": [16, 50], "popen": [26, 51], "popul": [8, 14, 59, 63], "popular": 14, "port": [3, 11, 14, 16, 23, 32, 39, 40, 43, 47, 48, 50, 57, 69], "port1": 14, "port2": 14, "port_numb": 39, "portion": 14, "posit": [8, 14, 16, 50, 
63, 64, 67, 69], "possibl": [4, 11, 14, 16, 20, 23, 40, 50, 64], "post": [4, 14, 18, 42, 44, 52], "post_messag": 42, "postgr": [11, 14, 20, 32, 40], "postgres_airflow_db": 15, "postgres_connection_str": 32, "postgres_db": [11, 48], "postgres_password": 32, "postgres_password_secret": [20, 40], "postgres_url": 48, "postgresdestin": 14, "postgreseventlogstorag": [11, 48], "postgresql": [15, 20, 32, 40], "postgresrunstorag": [11, 48], "postgresschedulestorag": [11, 48], "postgressourc": 14, "posthog": 14, "posthogsourc": 14, "postmessag": 52, "potenti": [8, 11, 14, 16, 50], "power": 11, "pq": 53, "pre": [4, 14, 16, 26, 32, 50], "preambl": 7, "preced": [9, 12, 16, 17, 34, 50], "predefin": [3, 63], "predict": [8, 13], "preemptibl": 34, "prefer": [8, 9, 14, 24, 63, 67], "preferdirectbuf": [16, 50], "prefix": [2, 3, 11, 12, 14, 16, 17, 18, 21, 23, 25, 26, 33, 34, 35, 36, 38, 53, 60, 66], "pregel": [16, 50], "preload": 11, "preparedata": 23, "prepend": [2, 16, 50], "presenc": [2, 4, 8, 45, 63], "present": [2, 4, 8, 9, 11, 14, 16, 20, 23, 26, 33, 34, 38, 40, 44, 50, 52, 62, 63, 65], "preserv": [11, 16, 50, 67], "pressur": [16, 50], "prestashop": 14, "prestashopsourc": 14, "pretti": 38, "prev_sync_tim": 14, "prevent": [16, 50], "preview": [3, 34], "previou": [7, 8, 11, 12, 14, 16, 17, 34, 50, 62, 63, 67], "previous_minutes_overdu": 67, "price": [14, 23], "primari": [14, 32], "primarili": [16, 39, 50, 67], "primary_kei": [14, 32], "primetr": 14, "primetricsourc": 14, "primit": [4, 7, 8, 9, 13], "princip": [23, 34], "print": [3, 4, 16, 25, 26, 38, 40, 60, 63, 66], "printgcdetail": 23, "prior": [14, 67], "prioriti": 8, "priv": 40, "privat": [14, 23, 34, 38, 40, 53, 54, 55], "private_kei": [14, 53, 54, 55], "private_key_password": [14, 53, 54, 55], "private_key_path": [53, 54, 55], "private_token": 14, "privateapp": 14, "privatekei": 14, "privileg": 14, "proactiv": [16, 50], "problem": 26, "proce": 7, "process": [2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 16, 20, 23, 26, 29, 30, 31, 
35, 36, 37, 40, 50, 53, 54, 55, 59, 67], "process_directori": 6, "process_fil": 6, "prod_slack_cli": 2, "produc": [1, 2, 6, 7, 8, 9, 11, 12, 14, 26, 33, 63, 66, 67, 68, 69], "producer_nam": 14, "producer_sync": 14, "product": [14, 18, 53, 59, 64, 65], "product_catalog": 14, "profil": [14, 16, 23, 26, 50], "profile_nam": 16, "profiles_dir": 26, "program": [16, 34, 50], "programat": [8, 9, 14, 21, 26, 33, 38, 63], "programmat": [45, 68], "progress": [11, 14, 16, 21, 26, 33, 40, 50, 67], "project": [2, 14, 34, 35, 36, 59], "project_and_instance_metadata": 34, "project_dir": [14, 26], "project_id": [14, 26, 34], "project_kei": 14, "project_nam": 26, "project_timezon": 14, "projectid": 34, "projectsecret": 14, "prometheus_cli": 49, "prometheus_resourc": 49, "prometheuscli": 49, "prometheusresourc": 49, "promot": 2, "propag": 12, "proper": [16, 50], "properli": [16, 50, 53], "properti": [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 23, 26, 34, 45, 46, 50, 60, 61, 63, 64, 65, 66, 67, 68, 69], "property_id": 14, "protect": [16, 50], "protocol": [14, 24, 42, 69], "proven": 8, "provid": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 47, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 69], "provis": [23, 34, 38, 44], "proxi": [16, 42, 50], "psycopg2": 15, "ptat": [16, 50], "public": [14, 16, 19, 23, 24, 27, 28, 29, 30, 31, 34, 35, 36, 50, 53, 54, 55], "public_kei": 14, "publish": [14, 24], "publisher_sync": 14, "pubsub": 14, "pubsubdestin": 14, "pull": [14, 16, 20, 27, 38, 40, 67], "pulsar": 14, "pulsardestin": 14, "purchas": 14, "purg": 59, "purge_staging_data": 14, "purpos": [2, 8, 10, 11, 14, 16, 50, 61, 66, 67], "push": [25, 40, 49], "push_to_gatewai": 49, "pushgatewai": 49, "put": [16, 50, 52, 63], "putobjectacl": 23, "py": [3, 15, 16, 27, 34, 50, 59], "py310": 40, "pyamqp": [18, 19, 20], "pyarrow": 53, "pydant": 4, "pyfil": [16, 50], 
"pyformat": 53, "pyproject": 3, "pyspark": [16, 23, 34, 35], "pyspark_resourc": 50, "pysparkjob": 34, "pysparkresourc": 50, "pytest": 40, "python": [2, 3, 5, 7, 8, 9, 11, 13, 14, 15, 16, 18, 23, 24, 25, 26, 27, 34, 45, 50, 51, 53, 58, 59, 60, 61, 63, 65, 68, 69], "python_artifact": 63, "python_fil": 3, "python_logging_levels_nam": 11, "python_modul": [18, 40], "python_param": 23, "python_typ": [2, 5, 65, 68], "python_valu": 4, "pythonartifactmetadatavalu": 63, "pythonerror": 39, "pythonfileuri": 34, "pythonhttptutorialsourc": 14, "pythonobjectdagstertyp": [63, 68], "pythonpath": [16, 50], "q": 18, "qmark": 53, "qualaroo": 14, "qualaroosourc": 14, "qualifi": [16, 50], "qualiti": [14, 63], "quantil": [16, 50], "queri": [2, 3, 8, 9, 11, 13, 14, 16, 24, 32, 34, 35, 36, 39, 53], "query1": 34, "query2": 34, "query3": 34, "query4": 34, "query_id": 53, "query_param": 14, "query_path": 14, "queryabl": 67, "queryfileuri": 34, "querylist": 34, "queu": 14, "queue": [11, 14, 16, 18, 20, 50, 59], "queue_url": 14, "queuedruncoordin": 11, "quick": 14, "quickbook": 14, "quickbookssingersourc": 14, "quickli": 3, "quickstart": [25, 38], "quit": [16, 40, 50], "quote_char": 14, "quux": [11, 33], "r": [3, 11, 16, 50], "r1": 34, "r2": [14, 34], "r2destin": 14, "rabbitmq": [14, 18, 19, 20, 40], "rabbitmqdestin": 14, "rack": [16, 50], "rais": [2, 4, 7, 8, 9, 11, 12, 13, 14, 21, 23, 26, 33, 39, 40, 42, 51, 52, 53, 60, 63, 64, 66, 67, 69], "raise_on_error": [7, 8, 9, 13, 26], "random": [14, 63], "randomli": [40, 63], "rang": [8, 12, 14, 16, 50, 64, 65, 69], "rapidli": [16, 50], "rapidoc": 14, "rasset_key_prefix": 26, "rate": [2, 14, 16, 50], "rather": [2, 5, 16, 40, 46, 50, 53, 63, 66, 68], "ratio": [16, 50], "raw": [5, 11, 14, 16, 20, 26, 40, 50, 53, 54, 55], "raw_conn": 53, "raw_data": 26, "raw_ev": 26, "raw_output": 26, "rawmetadatavalu": [1, 2, 8, 9, 12, 13, 63, 65], "rb": 11, "rbac": 40, "rbackend": [16, 50], "rdd": [16, 50], "re": [8, 11, 12, 14, 16, 23, 24, 26, 27, 50, 52], "reach": 
[11, 14, 16, 18, 50], "reachabl": [14, 33], "react": 67, "read": [2, 3, 4, 11, 14, 16, 23, 28, 29, 30, 31, 34, 35, 36, 38, 50, 51, 53, 54, 55, 59], "read_al": 14, "read_csv": 12, "read_data": 11, "read_fil": 11, "read_materi": 67, "read_text": 26, "read_timeout_sec": 25, "read_writ": 34, "readabl": [2, 4, 8, 9, 12, 16, 47, 51, 61, 63, 65, 66, 67, 68], "reader_opt": 14, "readi": 68, "readm": 14, "readonli": 34, "readrc": 69, "readthedoc": [18, 27], "real": 18, "realm": [14, 34], "realm_id": 14, "reaper": [16, 50], "reason": [7, 39, 67], "rebuild": 3, "receipt": 69, "receiv": [3, 7, 11, 14, 16, 45, 50, 51, 68], "receive_buffer_byt": 14, "receive_processed_config_valu": 4, "recent": [2, 8, 14, 67], "recharg": 14, "rechargesourc": 14, "reclaim": [16, 50], "recommend": [2, 4, 11, 14, 15, 16, 23, 26, 33, 50, 59, 64, 68], "recommendedinternalstag": 14, "recon_job": 11, "reconcil": 14, "reconcili": 2, "reconnect": 14, "reconstruct": [16, 50], "reconstruct_context": 8, "reconstruct_job": 13, "reconstructable_arg": 13, "reconstructable_bar_job": [8, 13], "reconstructable_foo_job": [8, 13], "reconstructable_kwarg": 13, "reconstructablejob": [8, 13], "reconstructor_function_nam": 13, "reconstructor_module_nam": 13, "reconstructor_working_directori": 13, "record": [2, 8, 11, 14, 16, 21, 50, 63, 67], "records_per_slic": 14, "records_per_sync": 14, "recov": [16, 50], "recoveri": [8, 13, 14, 16, 50, 67], "recoverymod": [16, 50], "recurli": 14, "recurlysourc": 14, "recurs": [2, 4, 7], "recycl": 3, "redact": [16, 50], "redi": [14, 18, 19, 20], "redirect": 14, "redirect_uri": 14, "redisdestin": 14, "redshift": 14, "redshift_configur": 16, "redshift_resourc": 16, "redshiftclientresourc": 16, "redshiftdestin": 14, "redshiftsourc": 14, "reduc": [14, 16, 26, 50], "reducebykei": [16, 50], "redund": [16, 50], "reexecut": 8, "reexecution_opt": 8, "reexecutionopt": 8, "ref": [3, 11, 23, 26], "refabl": 26, "refer": [2, 11, 14, 16, 17, 20, 23, 24, 26, 28, 29, 30, 31, 34, 35, 36, 37, 40, 44, 
50, 52, 53, 54, 55, 56, 63], "referenc": [16, 63], "referencetrack": [16, 50], "reflect": 14, "refresh": [14, 26, 32, 39], "refresh_token": 14, "regardless": [16, 50], "regener": 14, "regex": [16, 50], "region": [14, 16, 23, 34, 40, 50], "region_nam": [14, 16], "regist": [14, 16, 50, 59], "registr": [16, 50], "registrationrequir": [16, 50], "registri": [19, 25, 27, 40, 59], "regress": [16, 50], "regular": [2, 11, 12, 14, 68], "reindex": 3, "rel": [6, 23, 64, 69], "relat": [2, 8, 14, 16, 23, 26, 28, 29, 30, 31, 32, 34, 35, 36, 50, 53, 54, 55, 60, 63], "relationship": 34, "relative_path": 69, "relaunch": [16, 50], "releas": [2, 14, 26, 34, 63, 66], "relev": [3, 7, 8, 14, 16, 26, 37, 38, 50, 52, 67], "reli": [14, 65], "reliabl": 23, "reload": [14, 39], "reload_repository_loc": 39, "reloadnotsupport": 39, "reloadrepositorylocationinfo": 39, "reloadrepositorylocationstatu": 39, "remain": [8, 9, 64], "remaind": 23, "rememb": [14, 16, 50], "remot": [3, 11, 15, 16, 23, 34, 39, 40, 50, 57], "remote_host": 57, "remote_port": 57, "remov": [2, 9, 13, 16, 23, 26, 42, 50, 52, 59, 63, 64, 67, 69], "renam": [16, 17, 34, 59], "render": [14, 16, 50], "render_field": 14, "renew": 14, "repeat": [4, 11, 14], "repeat_word": 4, "repeated_cal": 14, "repeatedli": 14, "repl": [8, 13], "replac": [8, 13, 14, 16, 50, 53, 67], "replai": [16, 50], "replenish": [16, 50], "repli": 69, "replic": [14, 16, 50], "replica": [14, 16, 50], "replica_set": 14, "replicaset": 14, "replication_end_d": 14, "replication_method": 14, "replication_slot": 14, "replication_start_d": 14, "repo": [14, 16, 40, 50, 59], "repo_location_nam": 20, "repo_nam": 38, "repo_own": 38, "report": [11, 14, 26, 34], "report_generation_max_retri": 14, "report_granular": 14, "report_opt": 14, "report_wait_timeout": 14, "reports_start_d": 14, "repositori": [3, 5, 14, 15, 16, 20, 26, 27, 28, 29, 30, 31, 33, 38, 39, 40, 42, 50, 52, 59, 67, 69], "repository_data": 65, "repository_def": 67, "repository_load_data": 65, 
"repository_location_nam": 39, "repository_nam": [15, 39, 67], "repository_vers": 65, "repositorydata": 65, "repositorydefinit": [9, 15, 65, 67], "repositorylocationloadfailur": 39, "repositorylocationnotfound": 39, "repositoryselector": [42, 52, 67, 69], "repostitori": 15, "repostitory_location_nam": 15, "repres": [1, 2, 6, 8, 9, 11, 12, 14, 16, 26, 33, 50, 59, 62, 63, 64, 66, 67], "represent": [2, 4, 8, 11, 16, 17, 26, 34, 46, 63, 64, 68], "request": [3, 8, 12, 13, 14, 16, 21, 23, 26, 33, 34, 38, 39, 42, 50, 53, 63], "request_additional_param": 14, "request_asset": 67, "request_job": 67, "request_max_retri": [14, 21, 26, 33], "request_retry_delai": [14, 21, 26, 33], "request_timeout": 14, "request_timeout_m": 14, "requests_per_minut": 14, "requir": [1, 2, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 33, 34, 40, 45, 50, 51, 53, 54, 55, 59, 60, 63, 64, 66, 67, 68], "require_update_for_all_parent_partit": 2, "required_but_nonexistent_partition_kei": 64, "required_multi_asset_neighbor": 2, "required_resource_kei": [1, 2, 7, 10, 11, 12, 16, 17, 23, 24, 26, 41, 42, 45, 50, 51, 52, 53, 56, 60, 63, 66, 67, 68], "rerais": 7, "resend": 14, "reserv": [2, 12, 60], "reset": [14, 16, 50], "resid": [3, 11, 16, 23, 50, 67], "resolut": [16, 50], "resolv": [2, 3, 4, 5, 7, 14, 16, 50, 66], "resolve_canonical_bootstrap_servers_onli": 14, "resolve_standoff": 4, "resolved_op_select": 11, "resourc": [1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 21, 24, 25, 28, 29, 30, 31, 35, 36, 38, 40, 41, 44, 45, 49, 50, 51, 52, 54, 55, 58, 60, 62, 63, 65, 67, 68, 69], "resource_config": [2, 12, 62, 65, 66], "resource_config_by_kei": [59, 66], "resource_def": [1, 2, 8, 9, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 26, 33, 34, 38, 41, 42, 44, 50, 52, 53, 59, 60, 62, 66, 67], "resource_fn": [7, 12, 66], "resource_funct": 12, "resource_init_failur": 8, "resource_keys_to_init": 60, "resource_nam": [7, 66], "resource_str": 69, "resource_to_init": 66, "resource_typ": 26, 
"resourceadd": 66, "resourcedefinit": [2, 4, 5, 7, 8, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 25, 26, 28, 33, 34, 38, 41, 42, 44, 49, 50, 52, 53, 56, 57, 58, 59, 60, 62, 65, 66, 67], "resourceparam": [16, 24], "resources_config": 8, "resourceversioncontext": 62, "respect": [2, 3, 5, 12, 18, 23, 40, 53, 63], "respond": [23, 51], "respons": [3, 8, 11, 12, 13, 14, 24, 26, 33, 53, 64], "rest": [14, 21, 23, 26, 27, 33, 34, 40], "restart": [3, 14, 16, 23, 34, 39, 50, 63], "restrict": [23, 34], "result": [1, 2, 3, 4, 6, 9, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 24, 26, 33, 34, 39, 50, 51, 53, 60, 61, 63, 64, 66, 67, 68, 69], "resum": [14, 59], "retain": [14, 16, 50], "retainedbatch": [16, 23, 50], "retaineddeadexecutor": [16, 50], "retaineddriv": [16, 50], "retainedexecut": [16, 50], "retainedexecutor": [16, 50], "retainedjob": [16, 50], "retainedrootrdd": [16, 50], "retainedstag": [16, 50], "retainedtask": [16, 50], "retent": 14, "retentlysourc": 14, "rethrown": 7, "retri": [1, 2, 8, 9, 11, 13, 14, 16, 18, 19, 20, 21, 23, 26, 27, 33, 40, 50, 60, 63, 67], "retriev": [8, 11, 12, 14, 20, 23, 26, 33, 34, 35, 36, 38, 40, 51, 53, 54, 55, 65], "retry_backoff_m": 14, "retry_max_tim": 25, "retry_method": 25, "retry_mod": 11, "retry_numb": 8, "retry_polici": [1, 2, 8, 9, 60, 63], "retry_status_cod": 25, "retrymod": 11, "retrypolici": [1, 2, 8, 9, 13, 60, 63], "retryrequest": [60, 63], "retrywait": [16, 50], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 23, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39, 42, 45, 46, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "return_cod": 26, "return_n": 65, "return_n_": 65, "return_on": [8, 9, 13], "reus": [16, 34, 50, 59], "reusabl": 11, "revers": [16, 50], "reverseproxi": [16, 50], "reverseproxyurl": [16, 50], "reviv": [16, 50], "rewritten": [16, 50], "rfc": [14, 34], "rfc1035": 34, "rfc3339": 14, "rich": 18, "right": 14, "rigidli": [16, 50], "rki": 14, "rkicovidsourc": 14, "rm": [16, 50], "rockset": 
14, "rocksetdestin": 14, "role": [4, 14, 23, 32, 34, 53, 54, 55], "role_arn": 14, "roll": [16, 50], "root": [2, 11, 14, 16, 23, 34, 40, 50], "root_run_id": 11, "rootlogg": 34, "rootprincipalpassworduri": 34, "rouberol": 18, "rout": [14, 44], "routing_kei": [14, 44], "row": [1, 14, 45, 46, 63], "row_batch_s": 14, "rowcountconstraint": 45, "rpc": [14, 16, 18, 19, 20, 50], "rsa": 38, "rsa_kei": [53, 54, 55], "rule": [2, 4, 7, 40, 45, 68], "rules_to_add": 2, "rules_to_remov": 2, "run": [1, 2, 4, 7, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23, 24, 26, 27, 34, 35, 36, 37, 38, 39, 41, 42, 43, 48, 50, 51, 52, 59, 60, 61, 62, 63, 64, 66, 69], "run_before_shell_op": 51, "run_config": [4, 8, 9, 11, 13, 15, 16, 23, 24, 39, 41, 44, 52, 53, 60, 65, 67], "run_config_fn": 67, "run_config_for_partition_fn": 64, "run_config_for_partition_key_fn": 64, "run_coordin": 11, "run_coordinator_data": 11, "run_dat": 26, "run_dbt_nightly_sync": 26, "run_ecs_tag": 16, "run_failure_sensor": 67, "run_fn": 24, "run_id": [7, 8, 9, 10, 11, 12, 13, 39, 59, 60, 61, 63, 66, 69], "run_k8s_config": [20, 40], "run_kei": [8, 13, 65, 67], "run_launch": [11, 20, 40], "run_launch_ag": 59, "run_launch_agent_exampl": 59, "run_launch_job": 59, "run_launch_job_exampl": 59, "run_launcher_data": 11, "run_look_id": 14, "run_nam": [23, 59], "run_now": 23, "run_now_op": 23, "run_request": [64, 67], "run_request_for_partit": [8, 13], "run_resourc": 16, "run_result": 26, "run_results_json": 26, "run_results_path": 26, "run_resultsjson": 26, "run_statu": [42, 67], "run_status_sensor": 67, "run_status_sensor_fn": 67, "run_status_sensor_to_invok": 67, "run_storag": [11, 43, 48], "run_storage_data": 11, "run_tag": 59, "run_task": 16, "run_task_kwarg": 16, "run_updated_aft": 11, "runawai": [16, 50], "runconfig": [4, 13, 26], "runconfigdata": 39, "runconflict": 39, "runcoordin": 11, "runfailuresensorcontext": [42, 52, 67, 69], "runlaunch": [11, 16, 20, 27, 40], "runnabl": 69, "runner": [14, 40], "runnow": 23, "runrecord": 
11, "runrequest": [8, 13, 64, 65, 67], "runs_client": 23, "runsapi": 23, "runsfilt": 11, "runshardedeventscursor": 11, "runstatussensorcontext": 67, "runstatussensordefinit": 67, "runstorag": 11, "runtim": [2, 4, 6, 7, 8, 9, 13, 16, 23, 26, 34, 45, 50, 61, 64, 68], "runtime_metadata_fn": 26, "s": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 20, 23, 26, 34, 37, 38, 39, 40, 41, 46, 50, 52, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68], "s3": [11, 14, 23, 41, 50, 68], "s3_access_key_id": 14, "s3_bucket": [14, 16, 68], "s3_bucket_nam": 14, "s3_bucket_path": 14, "s3_bucket_region": 14, "s3_endpoint": 14, "s3_file": 16, "s3_file_manag": 16, "s3_job_package_path": 16, "s3_kei": [16, 68], "s3_path": 68, "s3_path_format": 14, "s3_pickle_io_manag": 16, "s3_pipeline_package_path": 16, "s3_prefix": 16, "s3_region": 14, "s3_resourc": 16, "s3_secret_access_kei": 14, "s3amazonwebservic": 14, "s3computelogmanag": [11, 16], "s3coordin": 16, "s3destin": 14, "s3filehandl": [16, 68], "s3filemanagerresourc": 16, "s3pickleiomanag": 16, "s3resourc": [4, 16], "s3sourc": 14, "s3stage": 14, "sa": 17, "safe": [16, 50, 52, 66], "safe_mod": 15, "safeguard": 2, "safeti": [16, 50], "salesforc": 14, "salesforcesourc": 14, "salesloft": 14, "salesloftsourc": 14, "same": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 16, 18, 23, 26, 27, 33, 40, 50, 59, 63, 64, 66, 67, 68], "same_as_sourc": 14, "sampl": [6, 14, 34], "sandbox": 14, "sandboxaccesstoken": 14, "sanit": [14, 33, 63], "sas_token": 14, "sasl_jaas_config": 14, "sasl_mechan": 14, "saslplaintext": 14, "saslssl": 14, "satisfi": [2, 4, 8, 11, 66], "satur": [16, 50], "saturdai": [64, 67], "save": [16, 37, 50, 59], "save_notebook_on_failur": 60, "saveashadoopfil": [16, 50], "sc": [16, 50], "scaffold": [3, 14], "scaffold_config": 3, "scaffold_java_jdbc": 14, "scaffolddestinationpythondestin": 14, "scaffoldjavajdbcsourc": 14, "scaffoldsourcehttpsourc": 14, "scaffoldsourcepythonsourc": 14, "scala": [16, 50], "scala2": 23, "scalabl": 14, "scalar": [4, 
8], "scalar_typ": 4, "scalarunion": 4, "scale": [16, 23, 50], "scan": [16, 50], "scenario": [14, 16, 50], "schedul": [2, 5, 8, 13, 15, 16, 20, 22, 26, 33, 34, 40, 42, 48, 50, 52, 65], "schedule_def": 65, "schedule_nam": [65, 67], "schedule_storag": [11, 43, 48], "schedule_storage_data": 11, "schedule_typ": 64, "scheduled_execution_tim": 67, "scheduledefinit": [5, 15, 26, 65, 67], "scheduleevaluationcontext": 67, "scheduler_data": 11, "scheduler_nam": [20, 40], "schedulerbacklogtimeout": [16, 50], "schedulestorag": 11, "scheduletyp": 64, "schema": [1, 2, 4, 7, 9, 11, 12, 13, 14, 16, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 39, 45, 46, 47, 53, 54, 55, 60, 61, 63, 66, 68], "schema1": 33, "schema2": 33, "schema_by_table_nam": 14, "schema_nam": 33, "schema_registry_config": 25, "schema_registry_password": 14, "schema_registry_url": [14, 25], "schema_registry_usernam": 14, "schemamodel": 46, "scheme": [2, 14, 16, 48, 49, 50], "scope": [8, 13, 16, 23, 27, 34, 38, 40, 47, 50, 61, 66], "scoped_resources_build": 8, "scpsecurecopyprotocol": 14, "scratch": [16, 50], "script": [14, 16, 23, 34, 50, 51], "scriptvari": 34, "scroll": 14, "scylla": 14, "scylladestin": 14, "sda": 15, "sdk": 23, "seacrh": 14, "search": [14, 16, 26, 50], "searchmetr": 14, "searchmetricssourc": 14, "second": [11, 14, 16, 17, 20, 21, 23, 26, 33, 34, 39, 40, 42, 50, 51, 52, 53, 57, 63, 64, 67], "second_asset": 8, "second_op": [27, 40], "secondaryworkerconfig": 34, "seconds_to_wait": 63, "secret": [2, 4, 8, 9, 11, 13, 14, 16, 17, 20, 23, 33, 40], "secret_access_kei": 14, "secret_bool_op": 4, "secret_int_op": 4, "secret_job": 4, "secret_kei": [14, 17], "secret_key_kei": 23, "secret_op": 4, "secret_scop": 23, "secretid": 16, "secrets_in_environ": 16, "secrets_load": 11, "secrets_loader_data": 11, "secrets_tag": 16, "secrets_to_env_vari": 23, "secretsmanager_resourc": 16, "secretsmanager_secrets_resourc": 16, "secretsmanagerresourc": 16, "secretsmanagersecretsresourc": 16, "section": [14, 16, 20, 40, 
50], "secur": [3, 14, 20, 23, 34, 40], "securili": 38, "security_context": [20, 40], "security_protocol": 14, "securityconfig": 34, "see": [11, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 33, 34, 37, 38, 40, 42, 46, 50, 52, 53, 54, 55, 56, 58, 59, 62], "seed": [14, 26], "seek": [11, 16, 50], "seem": 38, "segment": 2, "seldom": 14, "select": [2, 3, 4, 8, 9, 13, 14, 16, 26, 28, 34, 51, 52, 53, 64, 67], "select_properties_by_default": 14, "selectanotheropt": 14, "selected_asset": 2, "selected_asset_check_kei": [2, 8], "selected_asset_kei": [2, 8], "selected_output_nam": [8, 26], "selected_unique_id": 26, "selector": [4, 7, 17, 18, 19, 20, 22, 23, 27, 40], "self": [12, 14, 16, 25, 34, 38, 50, 65, 66, 68], "self_dependent_asset": 8, "seller": 14, "semicolon": 34, "send": [3, 11, 14, 16, 18, 24, 42, 49, 50, 52, 57, 69], "send_buffer_byt": 14, "send_messag": 10, "send_timeout_m": 14, "sender": 69, "sendgrid": 14, "sendgridsourc": 14, "sendoffsetstotransact": 14, "sens": [18, 19, 20], "sensit": [4, 11, 14, 16, 34, 50], "sensor": [2, 5, 8, 11, 13, 52, 64, 65, 69], "sensor_def": 65, "sensor_nam": [65, 67], "sensordefinit": [5, 65, 67], "sensorevaluationcontext": 67, "sensorresult": [64, 67], "sent": [3, 14, 16, 23, 42, 50, 52, 67, 69], "sentri": 14, "sentrysourc": 14, "separ": [2, 5, 6, 8, 11, 12, 14, 16, 34, 40, 48, 50, 64, 65], "sequenc": [1, 2, 5, 8, 9, 11, 12, 13, 14, 27, 28, 29, 30, 31, 34, 35, 36, 53, 54, 55, 60, 63, 64, 65, 67], "sequenti": [23, 67], "serd": [11, 34], "seri": [2, 11, 18, 53], "serial": [3, 5, 11, 14, 16, 17, 34, 50, 59, 67], "serializ": [1, 2, 6, 11, 13, 16, 17, 34, 50, 63], "serializable_error_info_from_exc_info": 11, "serializableerrorinfo": 11, "serializationmodul": 59, "serv": [3, 16, 26, 39, 50, 64], "server": [3, 5, 11, 14, 16, 23, 24, 25, 34, 39, 40, 41, 42, 50, 53, 59, 69], "server_address": 14, "server_telemetry_id": 25, "server_time_zon": 14, "serversideencrypt": 16, "servic": [3, 14, 16, 20, 23, 26, 34, 35, 36, 40, 44, 50, 59], 
"service_account_info": 14, "service_account_json": 14, "service_account_nam": [20, 40], "service_check": 24, "service_nam": 14, "serviceaccount": [14, 34], "serviceaccountkei": 14, "serviceaccountkeyauthent": 14, "serviceaccountscop": 34, "servicenam": 14, "servlet": [16, 50], "session": [14, 16, 50, 53], "session_token": 14, "set": [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "set_dagster_hom": 8, "set_trac": [8, 69], "setup": [14, 16, 33, 41, 50], "sever": [1, 4, 8, 14, 16, 23, 44], "sftp": 14, "sftpjsondestin": 14, "sftpsecurefiletransferprotocol": 14, "sftpsourc": 14, "sge": 22, "sh": [40, 51], "shape": [1, 4, 7], "shard": [11, 14], "shardcount": 14, "share": [14, 16, 17, 34, 38, 50, 64, 66, 67], "shared_kei": 14, "sheet": 14, "shell": [16, 26, 34, 35, 36, 50, 53, 54, 55], "shell_command": 51, "shell_command_op": 51, "shell_op": 51, "shell_script_path": 51, "shellopconfig": 51, "shift": 64, "shim": [45, 60, 68], "shop": 14, "shopifi": 14, "shopifysourc": 14, "short": [2, 14, 16, 34, 50, 59, 63], "shortio": 14, "shortiosourc": 14, "should": [2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23, 26, 33, 34, 39, 40, 43, 45, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "should_autocreate_t": 48, "should_execut": 67, "show": [3, 12, 16, 50, 69], "show_profil": [16, 50], "show_url_onli": 16, "showconsoleprogress": [16, 50], "shown": [14, 23, 34, 35, 36, 53], "shrink": [16, 50], "shuffl": [16, 23, 50], "shut": [3, 16, 18, 39, 50], "shutdown": [3, 16, 50], "shutdown_repository_loc": 39, "shutdownrepositorylocationinfo": 39, "sid": [14, 58], "side": [8, 13, 14, 16, 23, 50, 52, 53, 63], "sidecar": 16, "sign": 34, "signal": [3, 18], "signatur": [1, 2, 9, 63], "signific": [11, 16, 50], "significantli": [16, 50], "signinviagoogleoauth": 14, "signinviapipedriveoauth": 14, 
"signup": 2, "silenc": [16, 50], "silent": 11, "similar": 13, "simpl": [2, 6, 26, 32, 53, 65], "simple_job": 65, "simple_repositori": 65, "simpler": [16, 26, 50], "simpli": [2, 16, 26, 50], "simultan": [16, 50], "sinc": [2, 3, 8, 11, 13, 14, 16, 18, 23, 34, 50, 51, 67], "singer": 14, "singl": [2, 3, 4, 8, 9, 12, 13, 14, 16, 17, 20, 23, 26, 27, 34, 40, 50, 51, 52, 63, 64, 66, 67, 68], "single_run": 64, "singleschema": 14, "singlestoreaccesstoken": 14, "singular": 26, "sink": 2, "site": [7, 14, 16, 34, 50], "site_api_kei": 14, "site_id": 14, "site_url": 14, "situat": [16, 50], "size": [14, 16, 23, 26, 34, 50, 63], "sk_iri": 60, "sk_live": 14, "skip": [1, 2, 12, 14, 16, 50, 67, 69], "skip_empty_fil": 16, "skip_messag": 67, "skip_on_all_parents_not_upd": 2, "skip_on_not_all_parents_upd": 2, "skip_on_parent_miss": 2, "skip_on_parent_outd": 2, "skip_reason": 67, "skippabl": 2, "skipreason": 67, "sklearn": 60, "slack": [10, 11, 14, 26], "slack_client": [2, 26], "slack_fil": 2, "slack_files_t": 2, "slack_job": 52, "slack_message_on_failur": 10, "slack_message_on_success": 10, "slack_on_failur": 52, "slack_on_freshness_polici": 52, "slack_on_run_failur": 52, "slack_on_success": 52, "slack_op": 52, "slack_resourc": 52, "slack_sdk": 52, "slack_token": 52, "slackresourc": [26, 52], "slacksourc": 14, "slash": 14, "sleep": 68, "slice": [14, 34, 53], "slice_rang": 14, "slightli": 67, "sling_resourc": 32, "sling_resource_kei": 32, "slingdata": 32, "slingmod": 32, "slingresourc": 32, "slingsourceconnect": 32, "slingtargetconnect": 32, "slow": [16, 18, 19, 20, 40, 50], "slower": [16, 50], "slowli": [16, 50], "slug": 14, "slurm": 22, "small": [16, 50], "smaller": [14, 16, 50], "smartsheet": 14, "smartsheetssourc": 14, "smtp": 69, "smtp_host": 69, "smtp_port": 69, "smtp_type": 69, "snake_cas": 40, "snapchat": 14, "snapchatmarketingsourc": 14, "snappi": [14, 16, 50], "snappycompressioncodec": [16, 50], "snapshot": [11, 26], "snapshot_id": 11, "snapshot_isol": 14, "snippet": 34, 
"snowflak": [14, 32, 33], "snowflake_account": [53, 54, 55], "snowflake_connection_resourc": 53, "snowflake_databas": 53, "snowflake_io_manag": 53, "snowflake_op_for_queri": 53, "snowflake_pandas_io_manag": 54, "snowflake_password": [53, 55], "snowflake_pyspark_io_manag": 55, "snowflake_resourc": 53, "snowflake_schema": 53, "snowflake_us": 53, "snowflake_warehous": 53, "snowflakecomput": 14, "snowflakeconnect": 53, "snowflakedestin": 14, "snowflakeiomanag": [53, 54, 55], "snowflakepandasiomanag": 54, "snowflakepandastypehandl": [53, 54, 55], "snowflakepysparkiomanag": 55, "snowflakepysparktypehandl": [53, 54, 55], "snowflakeresourc": 53, "snowflakesourc": 14, "so": [2, 3, 4, 8, 9, 11, 12, 13, 14, 16, 17, 23, 24, 26, 34, 50, 63, 64, 67, 68], "so_rcvbuf": 14, "so_sndbuf": 14, "socket": [3, 14, 16, 50], "socket_connection_setup_timeout_m": 14, "socket_connection_setup_timeout_max_m": 14, "softwar": [1, 26, 34, 64], "softwareconfig": 34, "solid": [11, 12, 15, 18, 26, 34, 39, 42, 69], "some": [1, 2, 4, 11, 12, 14, 16, 17, 18, 20, 21, 23, 26, 39, 40, 50, 67, 69], "some_asset": 67, "some_celery_backend_url": 20, "some_celery_broker_url": 20, "some_config": 4, "some_config1": 4, "some_config2": 4, "some_directori": 65, "some_graph": 9, "some_job": 65, "some_kei": 33, "some_model_nam": 41, "some_modul": 3, "some_op": [8, 9, 13, 14, 33], "some_param": 41, "some_resourc": 5, "some_run_id": 39, "some_secret": 33, "some_sensor": 65, "some_validation_fn": 63, "someon": [3, 4], "someth": 69, "sometim": [14, 40], "somewher": 52, "sonnest": [64, 67], "soon": [14, 52], "soonest": [64, 67], "sort": [4, 11, 14, 16, 50], "sourc": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "source_asset": 2, "source_assets_by_kei": 2, "source_configur": 14, "source_connect": 32, 
"source_key_prefix": [2, 26], "source_nam": 26, "source_opt": 32, "source_stream": 32, "source_typ": 14, "sourceasset": [1, 2, 5, 8, 14, 60, 65], "sourceassetobservefunct": 2, "sourcehashversionstrategi": 62, "southern": 64, "space": [11, 14, 16, 23, 50], "span": 64, "spark": [1, 2, 16, 23, 26, 34, 50], "spark_conf": [23, 50, 56], "spark_config": [16, 50], "spark_daemon_java_opt": 23, "spark_env_var": 23, "spark_hom": 56, "spark_local_dir": [16, 23, 50], "spark_local_ip": [16, 50], "spark_resourc": 56, "spark_sess": 50, "spark_vers": 23, "spark_worker_memori": 23, "sparkconf": [16, 34, 50], "sparkcontext": [16, 50], "sparkjob": 34, "sparklisten": [16, 50], "sparkoperror": 56, "sparkr": [16, 50], "sparkr_driver_r": [16, 50], "sparksess": [34, 35, 36, 50], "sparksqljob": 34, "spars": [16, 50], "spawn": [3, 20], "spec": [2, 23, 40], "special": [14, 16, 50, 59], "specif": [2, 3, 4, 8, 9, 10, 11, 14, 16, 18, 21, 22, 23, 26, 27, 28, 29, 30, 31, 34, 35, 36, 40, 42, 48, 50, 52, 53, 54, 55, 59, 60, 64, 68], "specifi": [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 23, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 40, 42, 45, 50, 51, 52, 53, 54, 55, 60, 61, 63, 64, 66, 67, 68, 69], "specificpartitionspartitionmap": 64, "specul": [16, 23, 50], "speed": [14, 16, 50, 59], "spill": [16, 50], "spin": [5, 51, 65], "splendidrunstorag": 11, "split": [14, 33, 64, 67], "spot": 23, "spot_bid_price_perc": 23, "spread": [14, 16, 34, 50], "spreadsheet": 14, "spreadsheet_id": 14, "spun": 3, "sq": 14, "sql": [11, 14, 16, 26, 31, 34, 50, 53, 55], "sql_queri": [34, 53], "sqlalchemi": [3, 15, 53], "sqleventlogstorag": 11, "sqlinsert": 14, "sqlite": [11, 14], "sqlitedestin": 14, "sqliteeventlogstorag": 11, "sqliterunstorag": 11, "sqliteschedulestorag": 11, "sqlrunstorag": 11, "sqlschedulestorag": 11, "squar": 14, "squaresourc": 14, "src": [50, 53], "ss": 14, "ssd": 34, "sse": 23, "ssh": [14, 22, 23], "ssh_kei": 14, "ssh_port": 57, "ssh_public_kei": 23, "ssh_resourc": 57, 
"sshkeyauthent": 14, "sshsecureshel": 14, "ssl": [3, 14, 16, 34, 42, 69], "ssl_certif": 14, "ssl_method": 14, "ssl_mode": 14, "sslmode": 16, "ssz": 14, "st": 14, "stabil": [16, 50], "stabl": [11, 18, 19, 20, 27, 40], "stack": [7, 11, 67], "stackoverflowerror": [16, 50], "stage": [14, 16, 17, 34, 50], "staging_bucket": 16, "staging_prefix": [16, 23], "stale": 67, "stale_assets_onli": 67, "standalon": [14, 16, 50], "standalonemongodbinst": 14, "standard": [2, 8, 9, 11, 13, 14, 16, 34, 35, 36, 50, 61, 63], "standardinsert": 14, "start": [3, 5, 8, 14, 15, 16, 19, 20, 22, 25, 26, 40, 42, 45, 50, 51, 52, 59, 63, 64, 67, 69], "start_aft": [14, 33], "start_dat": [14, 26, 64, 67], "start_datetim": 14, "start_offset": [2, 8, 64], "start_tim": [11, 14], "starttl": 69, "stat": [16, 50], "state": [2, 7, 11, 14, 16, 17, 21, 23, 26, 34, 39, 50, 52, 63, 67], "state_filt": 14, "statement": [3, 14], "static": [1, 2, 6, 11, 12, 63, 64, 65, 66, 69], "static_partitioned_config": 64, "staticmethod": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "staticpartitionmap": 64, "staticpartitionsdefinit": 64, "statu": [2, 3, 11, 14, 16, 34, 39, 40, 42, 50, 52, 63, 67, 69], "status": [3, 11], "stderr": [3, 11, 16, 17, 34, 51], "stderrfrom": 23, "stdin": 69, "stdout": [3, 11, 16, 17, 23, 26, 34, 51, 69], "step": [1, 7, 8, 10, 12, 16, 17, 18, 19, 20, 27, 34, 38, 39, 40, 41, 42, 50, 52, 62, 63, 67, 68, 69], "step_context": 12, "step_execution_context": [8, 10], "step_expectation_result": 8, "step_failur": 8, "step_handl": 8, "step_k8s_config": 40, "step_kei": [8, 10, 11, 12, 39, 67, 69], "step_keys_to_execut": 11, "step_kind_valu": 8, "step_output": 8, "step_restart": 8, "step_select": 8, "step_skip": 8, "step_start": 8, "step_success": 8, "step_up_for_retri": 8, "stepexecutioncontext": [11, 12], "stepkind": 8, "steplaunch": 11, "steprunref": 11, "still": [4, 5, 16, 34, 50], "stock": 14, "stock_tick": 14, "stocktickerapitutorialsourc": 14, "stop": [3, 11, 14, 16, 23, 26, 42, 46, 50, 52, 59, 64, 67, 69], 
"stopgap": 5, "stopgracefullyonshutdown": [16, 50], "storag": [2, 3, 7, 12, 14, 16, 17, 20, 23, 34, 39, 40, 41, 43, 48, 50, 59, 60, 67], "storage_access_kei": 14, "storage_account": [14, 17], "storage_account_key_kei": 23, "storage_account_nam": [14, 23], "storage_data": 11, "storage_endpoint_suffix": 14, "storage_id": 11, "storagefract": [16, 50], "storagelevel": [16, 50], "store": [2, 3, 8, 11, 12, 14, 16, 17, 23, 29, 30, 31, 34, 35, 36, 38, 40, 50, 51, 52, 53, 54, 55, 60, 63, 64, 67], "store_fil": 2, "store_files_in_t": 2, "store_hash": 14, "store_nam": 14, "store_timestamps_as_str": [53, 54, 55], "str": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 32, 33, 34, 37, 38, 39, 40, 42, 45, 47, 51, 52, 53, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "str_param": 68, "straightforward": 68, "strategi": [4, 8, 13, 14, 16, 50, 62], "strava": 14, "stravasourc": 14, "stream": [11, 14, 16, 23, 26, 32, 50, 51], "stream_config": 14, "stream_dupl": 14, "stream_nam": 14, "stream_raw_ev": 26, "stream_schema": 14, "stream_to_asset_map": 14, "streamingcontext": [16, 50], "streams_criteria": 14, "strict": [11, 16, 17, 18, 19, 20, 22, 23, 25, 27, 34, 40, 48], "strict_column_list": 45, "strictcolumnsconstraint": 45, "strictli": [16, 23, 50], "string": [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 21, 23, 26, 27, 32, 33, 34, 38, 39, 40, 41, 45, 46, 50, 51, 52, 53, 54, 55, 57, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69], "string_event_properties_kei": 14, "string_resourc": 66, "stringifi": 67, "stringio": 11, "stringsourc": [4, 7, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 38, 40, 41, 42, 43, 44, 48, 49, 50, 53, 54, 55, 57, 58, 59], "stripe": 14, "stripesourc": 14, "structur": [3, 8, 9, 11, 12, 16, 25, 26, 38, 50, 61, 63, 66, 69], "structured_asset_kei": 63, "structured_asset_key_2": 63, "stub": 63, "stuff": 69, "style": 5, "sub": [2, 8, 11, 14, 16, 50], "sub0": 34, "subclass": [4, 7, 11, 12, 16, 
25, 38, 50, 62, 64, 65, 66, 68, 69], "subdirectori": [14, 26], "subdomain": 14, "subject": [2, 11, 14, 64, 69], "subminor": 34, "submiss": 34, "submit": [16, 20, 23, 34, 39, 50, 56, 67], "submit_job_execut": 39, "submit_run": 23, "submit_run_op": 23, "submodul": 11, "subnet": 34, "subnetwork": 34, "subnetwork_uri": 34, "subnetworkuri": 34, "subprocess": [26, 51], "subscrib": 14, "subscribetoalltopicsmatchingspecifiedpattern": 14, "subscript": 14, "subsequ": [9, 12, 14, 16, 17, 23, 34, 40, 50, 53], "subset": [2, 7, 8, 11, 14, 21, 26, 33, 64], "substanti": [16, 50], "succe": [23, 40], "succeed": [10, 26, 63], "success": [2, 7, 8, 10, 14, 16, 21, 26, 33, 39, 42, 45, 50, 52, 53, 63, 68], "success_hook": 10, "successfulli": [11, 14, 21, 26, 33], "suffix": [14, 16, 50], "suggest": [23, 26], "suit": 37, "suitabl": [8, 11, 16, 17, 34, 51], "suite_nam": 37, "sum": 14, "summari": 44, "summarize_directori": 6, "sundai": [64, 67], "sunshin": 14, "super": [26, 41], "supervis": [16, 50], "suppli": [2, 4, 7, 8, 9, 13, 20, 26, 33, 40, 67, 69], "support": [2, 6, 8, 11, 12, 13, 14, 16, 17, 21, 23, 24, 25, 26, 34, 38, 44, 50, 51, 52, 61, 63, 64, 67], "suppress": [3, 26], "sure": [14, 16, 38, 40, 50, 59], "surfac": [3, 16], "surpris": 2, "survei": 14, "survey_group_id": 14, "survey_id": 14, "surveygroup": 14, "surveymonkei": 14, "surveymonkeysourc": 14, "sustainedschedulerbacklogtimeout": [16, 50], "svc": [24, 40], "svv": 40, "switch": [2, 8, 9, 13, 14], "sy": 7, "symbol": [3, 14, 16, 50], "sync": [14, 21, 32, 33], "sync_foobar": [14, 21, 33], "sync_id": 21, "sync_lag_minut": 14, "sync_produc": 14, "sync_run": 21, "synchron": [7, 8, 11, 14, 39], "syntax": [8, 14, 26, 68], "synthes": 62, "system": [5, 7, 8, 11, 13, 14, 16, 17, 18, 23, 32, 39, 50, 51, 63, 67], "systemidsid": 14, "t": [1, 2, 3, 8, 9, 12, 13, 14, 16, 18, 26, 27, 40, 50, 52, 53, 59, 67], "t_partitionsdefinit": 64, "tab": [14, 33, 44, 69], "tabl": [2, 14, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 46, 53, 54, 55], "table1": 
33, "table2": 33, "table_nam": [14, 26, 33], "table_schema": 63, "tablecolumn": 63, "tablecolumnconstraint": 63, "tableconstraint": 63, "tablemetadatavalu": 63, "tablerecord": 63, "tableschema": 63, "tableschemametadatavalu": 63, "tabular": 63, "tag": [1, 2, 8, 9, 11, 13, 14, 15, 16, 23, 24, 26, 27, 33, 34, 39, 40, 41, 51, 59, 60, 62, 63, 64, 67], "tag_concurrency_limit": [11, 27, 40], "tags_fn": 67, "tags_for_partition_fn": [64, 67], "tags_for_partition_key_fn": 64, "tagsmor": 23, "take": [2, 5, 9, 11, 14, 16, 22, 23, 26, 33, 37, 40, 42, 45, 50, 51, 52, 53, 63, 64, 65, 66, 67, 68, 69], "taken": 33, "talk": 14, "talkdesk": 14, "talkdeskexploresourc": 14, "tandem": 8, "tap": 14, "tar": 34, "target": [2, 8, 13, 14, 16, 20, 26, 32, 40, 50, 60, 64, 67], "target_connect": 32, "target_dir": 26, "target_object": 32, "target_opt": 32, "target_path": 26, "task": [15, 16, 19, 20, 23, 34, 40, 50], "task_definit": 16, "task_ids_by_asset_kei": 15, "taskdefinit": 16, "tax": 14, "tbc": 14, "tcp": 14, "team": [3, 11, 14, 65], "teams_job": 42, "teams_on_failur": 42, "teams_on_run_failur": 42, "teams_on_success": 42, "teams_op": 42, "teams_webhook_url": 42, "teamsclient": 42, "teardown": 66, "technolog": 26, "teh": [28, 53], "tell": [2, 8, 9, 11, 14, 34, 35, 36], "temp": 11, "temp_dir": 8, "temp_fil": 51, "temp_file_writ": 51, "tempdir": 11, "tempfil": 11, "tempo": 14, "temporari": [8, 11, 12, 34, 35, 36, 51, 59], "temporary_gcs_bucket": [34, 35, 36], "temposourc": 14, "ten": 53, "tenanc": 14, "tenant": 14, "tenant_endpoint": 14, "tenant_id": 14, "tend": [16, 50], "term": [5, 14, 23, 63, 64], "termin": [14, 23, 34, 53], "test": [7, 8, 11, 12, 14, 17, 26, 32, 34, 50, 53, 61, 63, 66, 67], "test_project": 40, "test_top": 14, "test_valu": 4, "text": [3, 16, 25, 26, 38, 42, 52, 63, 66, 69], "text_fn": 52, "text_messag": 42, "text_metadata": 63, "text_usag": 52, "textio": 11, "textmetadatavalu": 63, "tgtlifetimehour": 34, "tgz": 34, "th": 45, "than": [2, 4, 5, 11, 14, 16, 18, 19, 20, 23, 
26, 34, 38, 40, 46, 50, 63, 64, 66, 68], "thank": 40, "the_asset": 66, "the_graph": [8, 9], "the_job": [8, 9], "the_resourc": 66, "the_schedul": 67, "the_sensor": 67, "thei": [2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 17, 23, 33, 50, 63, 66, 67, 68, 69], "them": [2, 3, 4, 5, 7, 11, 12, 14, 16, 23, 26, 33, 34, 40, 45, 50, 51, 59, 61, 63, 67, 68, 69], "themselv": [8, 9], "therefor": [26, 33], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "thin": [23, 24, 38, 44, 52], "thing": [14, 16, 38, 50, 69], "third": 11, "those": [2, 5, 8, 9, 12, 13, 14, 16, 23, 26, 33, 50, 53, 61, 63, 64, 67], "though": [14, 16, 50], "thousand": [16, 50], "thrash": 34, "thread": [3, 8, 11, 14, 16, 22, 50, 53], "threaddump": [16, 50], "threads_per_work": 22, "three": [12, 16, 17, 34], "threshold": [16, 45, 50], "through": [2, 4, 8, 13, 14, 16, 23, 26, 32, 40, 45, 50, 61, 63, 68, 69], "throughput": 23, "throw": [2, 4, 7, 14, 16, 23, 39, 50], "thrown": [2, 7, 8, 10, 45, 66, 68], "thu": [1, 2, 11], "thumbnail_data_url": 14, "thumbnail_url": 14, "tick": [2, 8, 13, 64, 67], "ticker": 14, "ticket": 34, "tidb": 14, "tidbdestin": 14, "tidbsourc": 14, "tight": 14, "tighter": 38, "tightli": [16, 50], "tiktok": 14, "tiktokmarketingsourc": 14, "till": 14, "time": [2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 18, 19, 21, 23, 24, 26, 27, 33, 34, 39, 50, 51, 52, 53, 54, 55, 59, 62, 63, 64, 67, 68], "time_incr": 14, "time_window": 26, "timelysourc": 14, "timeout": [3, 14, 16, 22, 23, 34, 35, 36, 39, 40, 42, 49, 50, 53, 57], "timeout_second": 23, "timestamp": [11, 14, 67], "timewindow": [8, 64], "timewindowpartitionmap": [2, 8, 64], "timewindowpartitionsdefinit": [8, 12, 64], "timezon": [2, 14, 26, 53, 54, 55, 64, 67], "titl": [14, 38, 46], "tl": [14, 42], "tlsencryptedverifycertif": 14, "tmp": [16, 17, 
32, 34, 40, 50, 51], "to_asset_kei": 67, "to_default_asset_ev": 26, "to_job": [8, 9, 13, 61], "to_source_asset": 2, "to_sql": 2, "to_str": 67, "todai": 14, "todo": 14, "togeth": [14, 59, 63], "toggl": [14, 62], "token": [14, 15, 16, 17, 23, 25, 26, 38, 39, 40, 52, 58], "token_kei": 14, "token_secret": 14, "token_uri": 14, "toml": 3, "too": [8, 14, 16, 40, 50], "tool": [5, 11, 23, 45, 63, 68], "top": [3, 4, 5, 8, 9, 13, 21, 33, 65, 67], "top_level_resourc": 65, "topic": [14, 25, 52], "topic_id": 14, "topic_namespac": 14, "topic_partit": 14, "topic_pattern": 14, "topic_rout": 25, "topic_ten": 14, "topic_test": 14, "topic_typ": 14, "torn": [8, 66], "torrentbroadcastfactori": [16, 50], "total": [14, 16, 23, 46, 50, 64], "touch": 11, "toward": [14, 16], "tpl_kei": 14, "tplcentral": 14, "tplcentralsourc": 14, "trace": [3, 7, 11, 61, 67], "track": [14, 16, 41, 50, 59, 63, 67], "tracker": 14, "trail": 14, "trailing_unconsumed_partitioned_event_id": 67, "train": 59, "transact": 14, "transfer": [16, 50], "transform": [11, 14, 16, 50], "transform_word": 4, "transformation_prior": 14, "transient": [11, 14, 16, 50], "transit": 53, "translat": [14, 26, 28, 34, 53], "transmiss": 14, "transport": 39, "travers": 20, "treat": 68, "tree": 14, "trello": 14, "trellosourc": 14, "tri": [16, 50], "trigger": [2, 10, 16, 21, 23, 26, 42, 44, 50, 52, 67], "true": [2, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 20, 21, 23, 26, 27, 33, 34, 40, 42, 45, 48, 50, 52, 53, 54, 55, 57, 60, 62, 63, 64, 67, 68, 69], "trust": 34, "truststor": 34, "truststorepassworduri": 34, "truststoreuri": 34, "try": [14, 16, 18, 39, 50, 63], "tune": [14, 16, 50], "tunnel": 14, "tunnel_host": 14, "tunnel_method": 14, "tunnel_port": 14, "tunnel_us": 14, "tunnel_user_password": 14, "tupl": [7, 8, 11, 13, 51, 63, 67, 68], "turn": [3, 16, 40, 50, 65, 67], "tutori": 14, "twilio": 14, "twilio_resourc": 58, "twilioresourc": 58, "twiliosourc": 14, "two": [2, 9, 12, 14, 16, 17, 34, 46, 63, 64, 66, 67, 68], "txt": [11, 34, 51], 
"type": [1, 2, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 20, 21, 23, 28, 29, 30, 31, 32, 34, 37, 39, 40, 45, 46, 51, 52, 53, 54, 55, 59, 60, 61, 62, 64, 65, 66, 67, 69], "type_check": 68, "type_check_fn": [45, 63, 68], "type_handl": [28, 29, 30, 31, 34, 35, 36, 53, 54, 55], "typecheck": [7, 45, 46, 60, 63, 68], "typecheckcontext": [8, 45, 68], "typeform": 14, "typeformsourc": 14, "typehint": 9, "typic": [2, 5, 7, 8, 11, 16, 40, 50, 52, 65], "typing_typ": [2, 5, 45, 65, 68], "u6nxl7": 14, "ubuntu": 23, "ud": 3, "udf": 34, "ugli": 4, "ui": [2, 3, 4, 8, 9, 11, 13, 14, 15, 16, 23, 26, 33, 39, 50, 59, 60, 63, 64, 65, 67, 69], "uksouth": 23, "unacknowledg": 14, "uncondition": [16, 50], "unconnect": 68, "unconsum": 67, "under": [4, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 34, 40, 50, 63], "underestim": [16, 50], "underli": [1, 2, 8, 9, 14, 16, 17, 18, 19, 20, 23, 26, 51, 60, 61, 63, 66, 68], "underneath": [12, 26], "underscor": [2, 14, 34], "understand": 14, "underutil": 23, "undocu": 14, "unencrypt": [14, 53, 54, 55], "unexpect": [7, 16, 50], "unexpected_field_behavior": 14, "unifi": [16, 50], "uniform": [11, 61], "uninstal": 40, "union": [1, 2, 4, 5, 6, 8, 9, 11, 12, 14, 16, 17, 18, 19, 20, 21, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 40, 41, 42, 45, 46, 51, 52, 53, 54, 55, 60, 63, 64, 65, 67, 68, 69], "uniqu": [1, 2, 3, 4, 6, 8, 9, 11, 12, 14, 16, 17, 18, 24, 27, 34, 40, 45, 59, 62, 63, 68], "unique_id": 26, "unique_nam": 68, "unit": [8, 9, 14, 16, 40, 50, 63, 67], "univers": [12, 14], "unix": 11, "unknown": 7, "unless": [2, 8, 11, 14, 16, 50, 63, 64, 67], "unlik": [2, 4, 13, 63], "unlimit": [14, 16, 50], "unpartit": 2, "unpersist": [16, 50], "unreach": 51, "unrecover": 63, "unregist": [16, 50], "unrel": 26, "unresolvedassetjob": 67, "unresolvedassetjobdefinit": [2, 5, 42, 67], "unresolvedpartitionedassetscheduledefinit": 5, "unrol": [16, 50], "unrollfract": [16, 50], "unsaf": [16, 50], "unsatisfi": 66, "unset": [23, 34, 35, 36, 64], "unsign": 16, "unspecifi": 
[4, 14, 34, 63], "unstructur": 11, "unsuccess": [14, 21, 33], "until": [3, 14, 16, 21, 23, 26, 33, 50, 59, 66, 67], "until_todai": 14, "untitl": 23, "unus": [16, 45, 50, 51, 68], "unusu": [16, 50], "unwil": [16, 50], "unzip": 23, "up": [2, 3, 4, 5, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 23, 25, 38, 44, 45, 46, 47, 49, 50, 51, 53, 62, 65, 66, 67, 69], "up_for_retri": 63, "upathiomanag": 12, "updat": [2, 11, 14, 16, 26, 32, 33, 40, 50, 67], "update_cursor": 67, "update_kei": 32, "update_timestamp": 11, "updated_aft": 11, "updated_befor": 11, "upload": [14, 16, 17, 23, 34, 35, 36], "upload_extra_arg": 16, "upload_interv": [16, 17, 34], "uploading_method": 14, "upon": [8, 11, 14, 23, 40, 62, 66], "upper": [2, 14, 16, 50], "upsert": 14, "upstream": [2, 8, 9, 12, 13, 15, 51, 60, 63, 64, 67], "upstream_asset": [2, 8, 14], "upstream_dependencies_by_asset_kei": 15, "upstream_output": 12, "upstream_partitions_def": 64, "upstream_partitions_subset": 64, "upstream_python_asset": 26, "upstream_source_asset": 2, "upstreampartitionsresult": 64, "uri": [14, 15, 34, 41, 53, 59], "url": [3, 14, 16, 17, 18, 19, 20, 23, 26, 27, 34, 39, 42, 47, 49, 50, 52, 63, 69], "url_bas": 14, "urlmetadatavalu": 63, "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69], "us2": 14, "usa": 14, "usabl": [11, 63, 68], "usable_as_dagster_typ": [63, 68], "usag": [4, 5, 11, 12, 14, 15, 16, 17, 25, 34, 38, 39, 41, 50, 52, 63, 65, 66], "uscensussourc": 14, "use_all_dns_ip": 14, "use_build_command": 26, "use_current_ecs_task_config": 16, "use_emphemeral_airflow_db": 15, "use_http": [14, 39], "use_pandas_result": 53, "use_ssl": 16, "use_tl": 14, "use_unsigned_sess": 16, "usefetchcach": [16, 50], "uselegacymod": [16, 50], "usepassword": 40, "user": [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 21, 23, 24, 
26, 32, 33, 34, 37, 38, 39, 43, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 65, 66, 67, 68], "user1": 19, "user_ag": 14, "user_auth_kei": 14, "user_code_error_boundari": [7, 11], "user_code_failure_retry_delai": 11, "user_login": 14, "user_login_id": 14, "user_messag": 11, "user_nam": 14, "user_secret": 14, "user_token": 15, "useraccount": 34, "userclasspathfirst": [16, 50], "userdeploy": 40, "userguid": [18, 19, 20], "usernam": [11, 12, 14, 16, 19, 27, 40, 43, 48, 57], "usernameandpassword": 14, "usernamepassword": 14, "usptream": 64, "usr": 34, "usual": [12, 14, 15, 16, 26, 50, 65], "utc": [2, 14, 15, 53, 54, 55, 67], "utc_date_str": 15, "utc_execution_date_str": 15, "utf": 51, "util": [2, 6, 8, 12, 16, 17, 25, 27, 38, 39, 40, 45, 51, 63, 66, 67], "utilis": 14, "uvicorn": 3, "uvicorn_log_level": 3, "v": [3, 14], "v1": [16, 20, 34, 40], "v2": [14, 26, 44], "v3": 14, "v4": [14, 38], "valid": [2, 4, 8, 11, 14, 16, 34, 37, 45, 46, 50, 53, 60, 64, 67, 68], "validate_default_paramet": 53, "validate_run_config": 8, "validate_t": 63, "validateoutputspec": [16, 50], "validation_operator_nam": 37, "validation_operators_and_act": 37, "valu": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 36, 39, 40, 41, 42, 43, 45, 46, 48, 49, 50, 51, 53, 54, 55, 57, 59, 60, 63, 64, 65, 66, 67, 68], "value1": 14, "value2": 14, "value3": 14, "var": [3, 19, 26], "vari": 14, "variabl": [3, 4, 5, 8, 11, 12, 16, 19, 20, 23, 26, 27, 34, 38, 40, 41, 43, 48, 50, 53, 67], "variant": [6, 16, 50], "variat": [18, 19, 20], "varieti": [26, 39], "variou": [16, 34, 38, 50, 67], "ve": 18, "vendor": 14, "verb": 40, "verbos": [16, 23, 50], "verbose_log": 23, "veri": [8, 11, 13, 16, 50], "verifi": [1, 14, 16, 42], "verify_cert_path": 16, "verifyca": 14, "verifyful": 14, "verifyident": 14, "versa": 64, "version": [2, 3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 34, 40, 42, 50, 52, 63, 64, 66, 67, 68, 69], 
"version_strategi": [8, 9, 13], "versionstrategi": [8, 9, 13, 62], "very_cool_packag": 11, "very_secret_env_vari": 4, "very_secret_env_variable_bool": 4, "very_secret_env_variable_int": 4, "via": [3, 4, 5, 8, 9, 11, 12, 13, 14, 16, 17, 18, 23, 29, 30, 31, 34, 35, 36, 40, 42, 49, 50, 51, 52, 53, 54, 55, 60, 64, 66, 67, 69], "viabl": 11, "vice": 64, "video": 14, "view": [3, 14, 24, 26, 40, 64], "view_id": 14, "viewabl": [2, 8, 9, 13, 67], "violat": [7, 52], "virtual": 14, "virtual_host": 14, "visibility_timeout": 14, "visibl": [2, 14, 40], "visit": 14, "visitor": 24, "visual": [45, 68], "vm": [16, 34, 50], "vol1": 19, "vol2": 19, "volum": [14, 19, 20, 23, 40], "volume_mount": [20, 40], "volumemount": [20, 40], "vs": [38, 40], "vvv": 40, "w": [3, 11, 16, 40, 50, 51, 59], "w2": 18, "wa": [1, 2, 4, 5, 6, 7, 8, 11, 13, 14, 16, 20, 26, 40, 50, 60, 62, 63, 64, 67], "wai": [5, 8, 11, 12, 13, 14, 16, 32, 40, 50, 64, 66, 68], "wait": [2, 3, 11, 14, 16, 20, 21, 23, 26, 33, 34, 40, 50, 52, 63, 68], "wait_for_log": [16, 23], "wait_int": 68, "wal": [16, 50], "walk": 6, "wandb_api_kei": 59, "wandb_artifact_configur": 59, "wandb_artifacts_io_manag": 59, "wandb_artifacts_manag": 59, "wandb_config": 59, "wandb_resourc": 59, "wandbartifactconfigur": 59, "wandbartifactsiomanagererror": 59, "want": [2, 4, 5, 8, 11, 14, 16, 18, 19, 20, 23, 26, 27, 33, 37, 38, 39, 40, 41, 42, 50, 52, 59, 64, 65, 67, 69], "warehous": [14, 53, 54, 55], "warm": 40, "warn": [1, 2, 3, 14, 16, 24, 26, 50, 61, 64], "warn_after_minutes_overdu": 52, "warn_error": 26, "warn_on_step_context_us": 12, "wast": [16, 50], "wave": 52, "wb": 11, "we": [5, 11, 12, 14, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 33, 40, 45, 50, 51, 59, 66], "weak": [16, 50], "weather": 14, "web": [3, 5, 14, 16, 18, 23, 50], "webclient": 52, "webflow": 14, "webflowsourc": 14, "webhook": 42, "webserv": [5, 11, 15, 40, 42, 43, 48, 52, 69], "webserver_base_url": [42, 52, 69], "websit": [14, 59], "week": [64, 67], "weekend": 14, "weekli": [64, 67], 
"weekly_abc": 64, "weekly_partitioned_config": [64, 67], "weeklypartitionsdefinit": 64, "well": [2, 4, 7, 8, 9, 11, 13, 16, 23, 33, 40, 41, 50, 63], "were": [2, 5, 8, 11, 13, 14, 18, 19, 20, 59, 67], "west": [14, 16, 23, 40], "wget": 34, "what": [2, 5, 8, 9, 11, 14, 16, 37, 50, 63, 65, 67], "when": [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "whenev": [2, 8, 9, 13, 14, 16, 47, 52, 61, 63], "where": [2, 3, 6, 8, 10, 11, 14, 16, 19, 20, 26, 33, 34, 37, 39, 40, 45, 50, 51, 63, 64, 67, 68], "wherea": 14, "whether": [2, 3, 4, 8, 9, 11, 12, 13, 14, 16, 18, 19, 20, 23, 26, 27, 34, 39, 40, 42, 50, 52, 53, 54, 55, 60, 62, 63, 67, 68, 69], "which": [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 27, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 47, 50, 51, 52, 53, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], "while": [2, 4, 7, 8, 11, 16, 23, 26, 50, 62, 64], "whiski": 14, "whiskyhuntersourc": 14, "whitelist": 11, "whitespac": 14, "who": [3, 5, 11, 14, 16, 50], "whole": [11, 12, 16, 50], "whom": 4, "whose": [2, 4, 7, 8, 9, 12, 14, 16, 60, 62, 63, 64, 68, 69], "why": 67, "wide": 46, "willstor": 34, "window": [8, 12, 14, 34, 64, 67], "window_in_dai": 14, "wipe": 11, "wipe_asset": 11, "wise": 46, "wish": [11, 16, 50, 59, 63], "with_hook": [8, 9, 13, 42, 52, 63], "with_prefix": 26, "with_resourc": [5, 28, 29, 30, 31, 33, 59, 66], "with_retry_polici": [8, 9, 63], "with_rul": 2, "with_top_level_resourc": [8, 13], "with_wandb": 59, "within": [1, 2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 20, 23, 24, 25, 26, 27, 34, 38, 39, 40, 41, 44, 45, 47, 50, 51, 59, 60, 61, 62, 63, 65, 66, 67, 68], "withorb": 14, "without": [2, 3, 4, 7, 8, 11, 14, 16, 18, 22, 29, 30, 31, 34, 39, 50, 53, 54, 55, 67, 68], "without_check": 2, "without_rul": 2, "won": [12, 14, 16, 18, 26, 50], "woocommerc": 
14, "woocommercesourc": 14, "word": [4, 14, 63], "wordcount": 34, "work": [3, 6, 11, 14, 16, 17, 20, 26, 34, 40, 50, 51, 52, 62, 67, 69], "worker": [3, 11, 14, 16, 19, 20, 22, 23, 34, 40, 50], "worker_main": 18, "workerconfig": 34, "workflow": 26, "working_directori": 3, "workload": [14, 16, 23, 40, 50], "workspac": [3, 14, 23, 33, 52], "workspace_cli": 23, "workspace_id": [14, 23], "workspacecli": 23, "world": [4, 8, 51, 66], "would": [2, 11, 12, 14, 16, 17, 18, 23, 26, 33, 34, 63, 64], "wrap": [4, 7, 8, 11, 12, 13, 17, 51, 60, 63, 66], "wrapper": [12, 23, 24, 26, 38, 44, 52], "wrike": 14, "wrike_inst": 14, "wrikesourc": 14, "write": [2, 8, 9, 11, 13, 14, 16, 17, 18, 23, 28, 29, 30, 31, 34, 35, 36, 38, 50, 51, 53, 54, 55, 66, 67], "write_csv": 12, "write_data": 11, "write_fil": 11, "write_parquet_fil": 53, "write_t": 53, "writeaheadlog": [16, 50], "writeif": 34, "writer": [16, 25, 38, 66], "writerresourc": [16, 25, 38, 66], "written": [14, 16, 27, 34, 35, 36, 40, 50, 51, 59, 60], "www": [2, 19, 26, 34, 58, 64, 67], "x": [9, 14, 23, 64], "xloggc": [16, 50], "xml": 34, "xmlfor": 34, "xmx": [16, 50], "xx": 23, "xxx": 69, "xz": 14, "y": [3, 18, 23, 64, 67], "yahoo": 14, "yahoofinancepricesourc": 14, "yaml": [3, 8, 11, 12, 14, 16, 17, 18, 19, 20, 34, 40, 43, 48, 59, 65, 69], "yaml_directori": 65, "yaml_str": 69, "yamlfil": 14, "yandex": 14, "yandexmetricasourc": 14, "yarn": [16, 22, 34, 50], "ye": [4, 16, 50], "year": 14, "yesterdai": [2, 14], "yet": [65, 67], "yield": [4, 6, 8, 9, 11, 14, 21, 26, 33, 37, 60, 63, 65, 66, 67], "yield_ev": 60, "yield_materi": [14, 21, 26, 33], "yield_result": 60, "yml": [26, 37], "you": [1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69], "your": [1, 2, 3, 8, 11, 12, 13, 14, 15, 16, 17, 19, 20, 23, 26, 27, 34, 35, 36, 37, 38, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 54, 
55, 58, 59, 60, 66, 67, 69], "your_kei": 24, "your_org_her": 39, "your_service_account": 40, "yourself": 11, "yout": 58, "youtub": 14, "youtubeanalyticssourc": 14, "youwil": 44, "yyyi": 14, "z": [14, 34], "zadrozni": 18, "zendesk": 14, "zendesk_sunshin": 14, "zendeskchatsourc": 14, "zendesksunshinesourc": 14, "zendesksupportsourc": 14, "zendesktalksourc": 14, "zenefit": 14, "zenefitssourc": 14, "zenloop": 14, "zenloopsourc": 14, "zero": [14, 16, 23, 26, 34, 50, 51, 64], "zip": [16, 23, 34, 50], "zoho": 14, "zohocrmsourc": 14, "zone": [2, 14, 23, 26, 34, 53, 64, 67], "zone_id": 23, "zoneuri": 34, "zookeep": [16, 50], "zoom": 14, "zoomsingersourc": 14, "zstandard": 14, "zstd": [16, 50], "zstdcompressioncodec": [16, 50], "zuora": 14, "zuorasourc": 14, "\u4e16\u754c": 4, "\u4f60\u597d": 4}, "titles": ["Home", "Asset Checks (Experimental)", "Software-Defined Assets", "Dagster CLI", "Config", "Definitions", "Dynamic Mapping & Collect", "Errors", "Execution", "Graphs", "Hooks", "Internals", "IO Managers", "Jobs", "Airbyte (dagster-airbyte)", "Airflow (dagster-airflow)", "AWS (dagster-aws)", "Azure (dagster-azure)", "Celery (dagster-celery)", "Orchestration on Celery + Docker", "Orchestration on Celery + Kubernetes", "Census (dagster-census)", "Dask (dagster-dask)", "Databricks (dagster-databricks)", "Datadog (dagster-datadog)", "Datahub (dagster-datahub)", "dbt (dagster-dbt)", "Orchestration on Docker", "DuckDB (dagster-duckdb)", "DuckDB + Pandas (dagster-duckdb-pandas)", "DuckDB + Polars (dagster-duckdb-polars)", "DuckDB + PySpark (dagster-duckdb-pyspark)", "embedded-elt (dagster-embedded-elt)", "Fivetran (dagster-fivetran)", "GCP (dagster-gcp)", "GCP + Pandas (dagster-gcp-pandas)", "GCP + PySpark (dagster-gcp-pyspark)", "Great Expectations (dagster-ge)", "GitHub (dagster-github)", "GraphQL (dagster-graphql)", "Kubernetes (dagster-k8s)", "MLflow (dagster-mlflow)", "Microsoft Teams (dagster-msteams)", "MySQL (dagster-mysql)", "PagerDuty (dagster-pagerduty)", "Pandas 
(dagster-pandas)", "Pandera (dagster-pandera)", "Papertrail (dagster-papertrail)", "PostgreSQL (dagster-postgres)", "Prometheus (dagster-prometheus)", "Pyspark (dagster-pyspark)", "Shell (dagster-shell)", "Slack (dagster-slack)", "Snowflake (dagster-snowflake)", "Snowflake with Pandas (dagster-snowflake-pandas)", "Snowflake with PySpark (dagster-snowflake-pyspark)", "Spark (dagster-spark)", "SSH / SFTP (dagster-ssh)", "Twilio (dagster-twilio)", "Weights & Biases (dagster-wandb)", "Dagstermill", "Loggers", "Job-Level Versioning and Memoization (Deprecated)", "Ops", "Partitions Definitions", "Repositories", "Resources", "Run Requests", "Types", "Utilities"], "titleterms": {"A": 40, "Ins": 63, "about": 40, "access": 40, "airbyt": 14, "airflow": 15, "an": [40, 61], "api": [3, 18, 19, 20, 23, 27, 40, 51], "app": 18, "asset": [1, 2, 3, 8, 14, 26, 32, 33, 63], "aw": 16, "azur": 17, "backend": 18, "backfil": 64, "best": 18, "bias": 59, "bigqueri": [34, 35, 36], "broker": 18, "built": [12, 61, 68], "celeri": [18, 19, 20], "censu": 21, "chart": 40, "check": 1, "cli": [3, 18, 26], "client": 39, "cloud": 26, "cloudwatch": 16, "cluster": 40, "collect": 6, "comput": [11, 34], "config": [4, 8, 14, 59, 64], "configur": [8, 18], "context": [8, 12], "coordin": 11, "core": 26, "custom": [18, 61], "daemon": 3, "dagster": [3, 4, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], "dagstermil": 60, "dask": 22, "databrick": 23, "datadog": 24, "datahub": 25, "dataproc": 34, "dbt": 26, "debug": 3, "defin": [2, 61, 63], "definit": [5, 64], "depend": 9, "deprec": [26, 62], "destin": 14, "dev": 3, "develop": 40, "docker": [19, 27], "duckdb": [28, 29, 30, 31], "dump": 3, "dynam": 6, "ec": 16, "elt": 32, "embed": 32, "emr": 16, "enabl": 40, "error": [7, 26, 59], "event": [11, 63], "except": 11, "execut": [8, 38, 63], "executor": [8, 11], "exist": 40, "expect": 37, 
"experiment": [1, 11, 12, 16, 17, 34, 64], "explicit": 9, "faster": 40, "file": [11, 16, 17, 34], "fivetran": 33, "from": [15, 40, 61], "gc": 34, "gcp": [34, 35, 36], "gcr": 40, "ge": 37, "gener": 14, "get": 44, "github": 38, "googl": [35, 36], "graph": [8, 9], "graphql": [3, 38, 39], "great": 37, "grpc": 3, "handl": 11, "heartbeat": 3, "helm": 40, "hook": 10, "i": [17, 34, 53, 59], "input": 12, "instanc": [3, 11], "intern": 11, "io": 12, "issu": 38, "job": [3, 8, 13, 62], "k8": 40, "kei": 63, "kind": 40, "kubernet": [20, 40], "launcher": [11, 23], "legaci": [4, 12, 14, 16, 17, 23, 24, 25, 28, 29, 30, 31, 33, 34, 35, 36, 38, 42, 44, 49, 50, 52, 53, 54, 55, 56, 58, 66], "level": 62, "link": 59, "list": 18, "local": 40, "log": [11, 34, 61], "logger": 61, "make": 68, "manag": [11, 12, 14, 16, 17, 34, 53, 59], "manual": 40, "map": [6, 64], "materi": 8, "memoiz": 62, "metadata": 63, "microsoft": 42, "minikub": 40, "mlflow": 41, "monitor": 18, "msteam": 42, "mysql": 43, "new": 68, "note": 40, "o": [17, 34, 53, 59], "op": [14, 23, 26, 27, 33, 34, 40, 53, 59, 61, 63], "orchestr": [15, 19, 20, 27], "other": [18, 23, 34], "out": 63, "output": 12, "pagerduti": 44, "panda": [29, 35, 45, 54], "pandera": 46, "papertrail": 47, "partit": [64, 67], "polar": 30, "polici": 64, "post": 38, "postgr": 48, "postgresql": 48, "practic": 18, "project": [3, 26], "prometheu": 49, "pvc": 40, "pyspark": [31, 36, 50, 55], "python": [4, 39, 40, 66], "queri": 38, "quickstart": 18, "reconstruct": [8, 13], "redi": 40, "redshift": 16, "repositori": 65, "request": 67, "resourc": [14, 17, 23, 26, 32, 33, 34, 42, 53, 59, 66], "result": 8, "run": [3, 8, 11, 15, 40, 67], "s3": 16, "scaffold": 26, "schedul": [3, 11, 64, 67], "schema": 8, "secretsmanag": 16, "sensor": [3, 42, 67], "setup": 40, "sftp": 57, "shell": 51, "slack": 52, "sling": 32, "snowflak": [53, 54, 55], "softwar": 2, "sourc": 14, "spark": 56, "ssh": 57, "start": [18, 44], "step": [11, 23], "storag": 11, "system": [4, 66], "tabl": 63, "task": 
18, "team": 42, "termin": 18, "test": [16, 40, 68], "twilio": 58, "type": [4, 26, 63, 68], "us": 59, "util": [4, 26, 69], "valid": 40, "version": 62, "wandb": 59, "webserv": 3, "weight": 59, "wipe": 3, "worker": 18, "your": 18}} \ No newline at end of file diff --git a/docs/content/api/sections.json b/docs/content/api/sections.json index d72e61e3eb8cf..3aa4bb7bc58ea 100644 --- a/docs/content/api/sections.json +++ b/docs/content/api/sections.json @@ -1 +1 @@ -{"api": {"apidocs": {"asset-checks": {"alabaster_version": "0.7.13", "body": "
\n

Asset Checks (Experimental)\u00b6

\n

Dagster allows you to define and execute checks on your software-defined assets. Each asset check verifies some property of a data asset, e.g. that it has no null values in a particular column.

\n
\n
\n@dagster.asset_check(*, asset, name=None, description=None, required_resource_keys=None, resource_defs=None, config_schema=None, compute_kind=None, op_tags=None, retry_policy=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a definition for how to execute an asset check.

\n
\n
Parameters:
\n
    \n
  • asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]) \u2013 The\nasset that the check applies to.

  • \n
  • name (Optional[str]) \u2013 The name of the check. If not specified, the name of the decorated\nfunction will be used. Checks for the same asset must have unique names.

  • \n
  • description (Optional[str]) \u2013 The description of the check.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 A set of keys for resources that are required\nby the function that executes the check. These can alternatively be specified by\nincluding resource-typed parameters in the function signature.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the check\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that executes the check.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that executes\nthe check, e.g. \u201cdbt\u201d or \u201cspark\u201d.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that executes the check.

  • \n
\n
\n
\n

Produces an AssetChecksDefinition object.

\n

Example

\n
from dagster import asset, asset_check, AssetCheckResult\n\n@asset\ndef my_asset() -> None:\n    ...\n\n@asset_check(asset=my_asset, description="Check that my asset has enough rows")\ndef my_asset_has_enough_rows() -> AssetCheckResult:\n    num_rows = ...\n    return AssetCheckResult(success=num_rows > 5, metadata={"num_rows": num_rows})\n
\n
\n
\n
Example with a DataFrame Output:
from dagster import asset, asset_check, AssetCheckResult\nfrom pandas import DataFrame\n\n@asset\ndef my_asset() -> DataFrame:\n    ...\n\n@asset_check(asset=my_asset, description="Check that my asset has enough rows")\ndef my_asset_has_enough_rows(my_asset: DataFrame) -> AssetCheckResult:\n    num_rows = my_asset.shape[0]\n    return AssetCheckResult(success=num_rows > 5, metadata={"num_rows": num_rows})\n
\n
\n
\n
\n
\n\n
\n
\nclass dagster.AssetCheckResult(*, success, asset_key=None, check_name=None, metadata=None, severity=AssetCheckSeverity.ERROR)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

The result of an asset check.

\n
\n
\nasset_key\u00b6
\n

The asset key that was checked.

\n
\n
Type:
\n

Optional[AssetKey]

\n
\n
\n
\n\n
\n
\ncheck_name\u00b6
\n

The name of the check.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nsuccess\u00b6
\n

The pass/fail result of the check.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

\n
\n
Type:
\n

Optional[Dict[str, RawMetadataValue]]

\n
\n
\n
\n\n
\n
\nseverity\u00b6
\n

Severity of the check. Defaults to ERROR.

\n
\n
Type:
\n

AssetCheckSeverity

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetCheckSpec(name, *, asset, description=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Defines information about a check, except how to execute it.

\n

AssetCheckSpec is often used as an argument to decorators that decorate a function that can\nexecute multiple checks - e.g. @asset, and @multi_asset. It defines one of the checks that\nwill be executed inside that function.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the check.

  • \n
  • asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]) \u2013 The asset that\nthe check applies to.

  • \n
  • description (Optional[str]) \u2013 Description for the check.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.AssetCheckSeverity(value)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Severity level for an asset check.

\n

Severities:

\n
    \n
  • WARN: If the check fails, don\u2019t fail the step.

  • \n
  • ERROR: If the check fails, fail the step and, within the run, skip materialization of any\nassets that are downstream of the asset being checked.

  • \n
\n
\n\n
\n
\nclass dagster.AssetCheckKey(asset_key, name)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Check names are expected to be unique per-asset. Thus, this combination of asset key and\ncheck name uniquely identifies an asset check within a deployment.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/asset-checks", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../cli/", "title": "Dagster CLI"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../assets/", "title": "Software-Defined Assets"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/cli", "Dagster CLI", "N", "next"], ["sections/api/apidocs/assets", "Software-Defined Assets", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/asset-checks.rst.txt", "title": "Asset Checks (Experimental)", "toc": "\n"}, "assets": {"alabaster_version": "0.7.13", "body": "
\n

Software-Defined Assets\u00b6

\n

An asset is an object in persistent storage, such as a table, file, or persisted machine learning model. A software-defined asset is a Dagster object that couples an asset to the function and upstream assets that are used to produce its contents.

\n
\n
\n@dagster.asset(compute_fn=None, *, name=None, key_prefix=None, ins=None, deps=None, metadata=None, description=None, config_schema=None, required_resource_keys=None, resource_defs=None, io_manager_def=None, io_manager_key=None, compute_kind=None, dagster_type=None, partitions_def=None, op_tags=None, group_name=None, output_required=True, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, retry_policy=None, code_version=None, key=None, non_argument_deps=None, check_specs=None)[source]\u00b6
\n

Create a definition for how to compute an asset.

\n
\n
A software-defined asset is the combination of:
    \n
  1. An asset key, e.g. the name of a table.

  2. \n
  3. A function, which can be run to compute the contents of the asset.

  4. \n
  5. A set of upstream assets that are provided as inputs to the function when computing the asset.

  6. \n
\n
\n
\n

Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\nabout the upstream assets it depends on. The upstream assets are inferred from the arguments\nto the decorated function. The name of the argument designates the name of the upstream asset.

\n

An asset has an op inside it to represent the function that computes it. The name of the op\nwill be the segments of the asset key, separated by double-underscores.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the asset. If not provided, defaults to the name of the\ndecorated function. The asset\u2019s name must be a valid name in dagster (ie only contains\nletters, numbers, and _) and may not contain python reserved keywords.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name, which defaults to the name of\nthe decorated function. Each item in key_prefix must be a valid name in dagster (ie only\ncontains letters, numbers, and _) and may not contain python reserved keywords.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • deps (Optional[Sequence[Union[AssetDep, AssetsDefinition, SourceAsset, AssetKey, str]]]) \u2013 The assets that are upstream dependencies, but do not correspond to a parameter of the\ndecorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\nall assets created by the multi_asset will be created.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the asset\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata entries for the asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the op.

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used\nfor storing the output of the op as an asset, and for loading it in downstream ops\n(default: \u201cio_manager\u201d). Only one of io_manager_key and io_manager_def can be provided.

  • \n
  • io_manager_def (Optional[object]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) The IOManager used for\nstoring the output of the op as an asset, and for loading it in\ndownstream ops. Only one of io_manager_def and io_manager_key can be provided.

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in the Dagster UI as a badge on the asset.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 Allows specifying type validation functions that\nwill be executed on the output of the decorated function after it runs.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. If not provided,\nthe name \u201cdefault\u201d is used.

  • \n
  • resource_defs (Optional[Mapping[str, object]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A mapping of resource keys to resources. These resources\nwill be initialized during execution, and can be accessed from the\ncontext within the body of the function.

  • \n
  • output_required (bool) \u2013 Whether the decorated function will always materialize an asset.\nDefaults to True. If False, the function can return None, which will not be materialized to\nstorage and will halt execution of downstream assets.

  • \n
  • freshness_policy (FreshnessPolicy) \u2013 A constraint telling Dagster how often this asset is intended to be updated\nwith respect to its root data.

  • \n
  • auto_materialize_policy (AutoMaterializePolicy) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) Configure Dagster to automatically materialize\nthis asset according to its FreshnessPolicy and when upstream dependencies change.

  • \n
  • backfill_policy (BackfillPolicy) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) Configure Dagster to backfill this asset according to its\nBackfillPolicy.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that computes the asset.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code that generates this asset. In\ngeneral, versions should be set only for code that deterministically produces the same\noutput when given the same inputs.

  • \n
  • check_specs (Optional[Sequence[AssetCheckSpec]]) \u2013 (Experimental) Specs for asset checks that\nexecute in the decorated function after materializing the asset.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0.0. use deps instead.) Deprecated, use deps instead.\nSet of asset keys that are upstream dependencies, but do not pass an input to the asset.

  • \n
  • key (Optional[CoercibleToAssetKey]) \u2013 The key for this asset. If provided, cannot specify key_prefix or name.

  • \n
\n
\n
\n

Examples

\n
@asset\ndef my_asset(my_upstream_asset: int) -> int:\n    return my_upstream_asset + 1\n
\n
\n
\n\n
\n
\nclass dagster.MaterializeResult(*, asset_key=None, metadata=None, check_results=None, data_version=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An object representing a successful materialization of an asset. These can be returned from\n@asset and @multi_asset decorated functions to pass metadata or specify that specific assets were\nmaterialized.

\n
\n
\nasset_key\u00b6
\n

Optional in @asset, required in @multi_asset to discern which asset this refers to.

\n
\n
Type:
\n

Optional[AssetKey]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

Metadata to record with the corresponding AssetMaterialization event.

\n
\n
Type:
\n

Optional[MetadataUserInput]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetSpec(key, *, deps=None, description=None, metadata=None, skippable=False, group_name=None, code_version=None, freshness_policy=None, auto_materialize_policy=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Specifies the core attributes of an asset. This object is attached to the decorated\nfunction that defines how it is materialized.

\n
\n
\nkey\u00b6
\n

The unique identifier for this asset.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n
\ndeps\u00b6
\n

The asset keys for the upstream assets that\nmaterializing this asset depends on.

\n
\n
Type:
\n

Optional[AbstractSet[AssetKey]]

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

Human-readable description of this asset.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of static metadata for this asset.\nFor example, users can provide information about the database table this\nasset corresponds to.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nskippable\u00b6
\n

Whether this asset can be omitted during materialization, causing downstream\ndependencies to skip.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\ngroup_name\u00b6
\n

A string name used to organize multiple assets into groups. If\nnot provided, the name \u201cdefault\u201d is used.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ncode_version\u00b6
\n

The version of the code for this specific asset,\noverriding the code version of the materialization function

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nfreshness_policy\u00b6
\n

A policy which indicates how up to date this\nasset is intended to be.

\n
\n
Type:
\n

Optional[FreshnessPolicy]

\n
\n
\n
\n\n
\n
\nauto_materialize_policy\u00b6
\n

AutoMaterializePolicy to apply to\nthe specified asset.

\n
\n
Type:
\n

Optional[AutoMaterializePolicy]

\n
\n
\n
\n\n
\n
\nbackfill_policy\u00b6
\n

BackfillPolicy to apply to the specified asset.

\n
\n
Type:
\n

Optional[BackfillPolicy]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetDep(asset, *, partition_mapping=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Specifies a dependency on an upstream asset.

\n
\n
\nasset\u00b6
\n

The upstream asset to depend on.

\n
\n
Type:
\n

Union[AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset]

\n
\n
\n
\n\n
\n
\npartition_mapping\u00b6
\n

Defines what partitions to depend on in\nthe upstream asset. If not provided and the upstream asset is partitioned, defaults to\nthe default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

\n
\n
Type:
\n

Optional[PartitionMapping]

\n
\n
\n
\n\n

Examples

\n
upstream_asset = AssetSpec("upstream_asset")\ndownstream_asset = AssetSpec(\n    "downstream_asset",\n    deps=[\n        AssetDep(\n            upstream_asset,\n            partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1)\n        )\n    ]\n)\n
\n
\n
\n\n
\n
\nclass dagster.AssetIn(key=None, metadata=None, key_prefix=None, input_manager_key=None, partition_mapping=None, dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>)[source]\u00b6
\n

Defines an asset dependency.

\n
\n
\nkey_prefix\u00b6
\n

If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the input name. Only one of the \u201ckey_prefix\u201d and\n\u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str]]]

\n
\n
\n
\n\n
\n
\nkey\u00b6
\n

The asset\u2019s key. Only one of the\n\u201ckey_prefix\u201d and \u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str], AssetKey]]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of the metadata for the input.\nFor example, if you only need a subset of columns from an upstream table, you could\ninclude that in metadata and the IO manager that loads the upstream table could use the\nmetadata to determine which columns to load.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\npartition_mapping\u00b6
\n

Defines what partitions to depend on in\nthe upstream asset. If not provided, defaults to the default partition mapping for the\npartitions definition, which typically maps partition keys to the same partition keys\nin upstream assets.

\n
\n
Type:
\n

Optional[PartitionMapping]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

Allows specifying type validation functions that\nwill be executed on the input of the decorated function before it runs.

\n
\n
Type:
\n

DagsterType

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SourceAsset(key, metadata=None, io_manager_key=None, io_manager_def=None, description=None, partitions_def=None, group_name=None, resource_defs=None, observe_fn=None, *, auto_observe_interval_minutes=None, _required_resource_keys=None)[source]\u00b6
\n

A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.

\n
\n
\nkey\u00b6
\n

The key of the asset.

\n
\n
Type:
\n

Union[AssetKey, Sequence[str], str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

Metadata associated with the asset.

\n
\n
Type:
\n

Mapping[str, MetadataValue]

\n
\n
\n
\n\n
\n
\nio_manager_key\u00b6
\n

The key for the IOManager that will be used to load the contents of\nthe asset when it\u2019s used as an input to other assets inside a job.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nio_manager_def\u00b6
\n

(Experimental) The definition of the IOManager that will be used to load the contents of\nthe asset when it\u2019s used as an input to other assets inside a job.

\n
\n
Type:
\n

Optional[IOManagerDefinition]

\n
\n
\n
\n\n
\n
\nresource_defs\u00b6
\n

(Experimental) resource definitions that may be required by the dagster.IOManagerDefinition provided in the io_manager_def argument.

\n
\n
Type:
\n

Optional[Mapping[str, ResourceDefinition]]

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

The description of the asset.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npartitions_def\u00b6
\n

Defines the set of partition keys that\ncompose the asset.

\n
\n
Type:
\n

Optional[PartitionsDefinition]

\n
\n
\n
\n\n
\n
\nobserve_fn\u00b6
\n
\n
Type:
\n

Optional[SourceAssetObserveFunction]

\n
\n
\n
\n\n
\n
\nproperty is_observable\u00b6
\n

Whether the asset is observable.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty op\u00b6
\n

The OpDefinition associated with the observation function of an observable\nsource asset.

\n

Throws an error if the asset is not observable.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n\n
\n
\ndagster.define_asset_job(name, selection=None, config=None, description=None, tags=None, metadata=None, partitions_def=None, executor_def=None, hooks=None)[source]\u00b6
\n

Creates a definition of a job which will either materialize a selection of assets or observe\na selection of source assets. This will only be resolved to a JobDefinition once placed in a\ncode location.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name for the job.

  • \n
  • selection (Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]) \u2013

    The assets that will be materialized or observed when the job is run.

    \n

    The selected assets must all be included in the assets that are passed to the assets\nargument of the Definitions object that this job is included on.

    \n

    The string \u201cmy_asset*\u201d selects my_asset and all downstream assets within the code\nlocation. A list of strings represents the union of all assets selected by strings\nwithin the list.

    \n

    The selection will be resolved to a set of assets when the location is loaded. If the\nselection resolves to all source assets, the created job will perform source asset\nobservations. If the selection resolves to all regular assets, the created job will\nmaterialize assets. If the selection resolves to a mixed set of source assets and\nregular assets, an error will be thrown.

    \n

  • \n
  • config \u2013

    Describes how the Job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, which should return\nconfiguration in the standard format to configure the job.

    \n

  • \n
  • tags (Optional[Mapping[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Mapping[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the job.\nKeys are displayed string labels, and values are one of the following: string, float,\nint, JSON-serializable dict, JSON-serializable list, and one of the data classes\nreturned by a MetadataValue static method.

  • \n
  • description (Optional[str]) \u2013 A description for the Job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partitions for this job. All AssetDefinitions selected for this job\nmust have a matching PartitionsDefinition. If no PartitionsDefinition is provided, the\nPartitionsDefinition will be inferred from the selected AssetDefinitions.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
\n
\n
Returns:
\n

The job, which can be placed inside a code location.

\n
\n
Return type:
\n

UnresolvedAssetJobDefinition

\n
\n
\n

Examples

\n
# A job that targets all assets in the code location:\n@asset\ndef asset1():\n    ...\n\ndefs = Definitions(\n    assets=[asset1],\n    jobs=[define_asset_job("all_assets")],\n)\n\n# A job that targets a single asset\n@asset\ndef asset1():\n    ...\n\ndefs = Definitions(\n    assets=[asset1],\n    jobs=[define_asset_job("all_assets", selection=[asset1])],\n)\n\n# A job that targets all the assets in a group:\ndefs = Definitions(\n    assets=assets,\n    jobs=[define_asset_job("marketing_job", selection=AssetSelection.groups("marketing"))],\n)\n\n@observable_source_asset\ndef source_asset():\n    ...\n\n# A job that observes a source asset:\ndefs = Definitions(\n    assets=assets,\n    jobs=[define_asset_job("observation_job", selection=[source_asset])],\n)\n\n# Resources are supplied to the assets, not the job:\n@asset(required_resource_keys={"slack_client"})\ndef asset1():\n    ...\n\ndefs = Definitions(\n    assets=[asset1],\n    jobs=[define_asset_job("all_assets")],\n    resources={"slack_client": prod_slack_client},\n)\n
\n
\n
\n\n
\n
\nclass dagster.AssetSelection[source]\u00b6
\n

An AssetSelection defines a query over a set of assets and asset checks, normally all that are defined in a code location.

\n

You can use the \u201c|\u201d, \u201c&\u201d, and \u201c-\u201d operators to create unions, intersections, and differences of selections, respectively.

\n

AssetSelections are typically used with define_asset_job().

\n

By default, selecting assets will also select all of the asset checks that target those assets.

\n

Examples

\n
# Select all assets in group "marketing":\nAssetSelection.groups("marketing")\n\n# Select all assets in group "marketing", as well as the asset with key "promotion":\nAssetSelection.groups("marketing") | AssetSelection.keys("promotion")\n\n# Select all assets in group "marketing" that are downstream of asset "leads":\nAssetSelection.groups("marketing") & AssetSelection.keys("leads").downstream()\n\n# Select a list of assets:\nAssetSelection.assets(*my_assets_list)\n\n# Select all assets except for those in group "marketing"\nAssetSelection.all() - AssetSelection.groups("marketing")\n\n# Select all assets which are materialized by the same op as "projections":\nAssetSelection.keys("projections").required_multi_asset_neighbors()\n\n# Select all assets in group "marketing" and exclude their asset checks:\nAssetSelection.groups("marketing") - AssetSelection.all_asset_checks()\n\n# Select all asset checks that target a list of assets:\nAssetSelection.checks_for_assets(*my_assets_list)\n\n# Select a specific asset check:\nAssetSelection.checks(my_asset_check)\n
\n
\n
\n
\nstatic all()[source]\u00b6
\n

Returns a selection that includes all assets and asset checks.

\n
\n\n
\n
\nstatic all_asset_checks()[source]\u00b6
\n

Returns a selection that includes all asset checks.

\n
\n\n
\n
\nstatic assets(*assets_defs)[source]\u00b6
\n

Returns a selection that includes all of the provided assets and asset checks that target them.

\n
\n\n
\n
\nstatic checks(*asset_checks)[source]\u00b6
\n

Returns a selection that includes all of the provided asset checks.

\n
\n\n
\n
\nstatic checks_for_assets(*assets_defs)[source]\u00b6
\n

Returns a selection with the asset checks that target the provided assets.

\n
\n\n
\n
\ndownstream(depth=None, include_self=True)[source]\u00b6
\n

Returns a selection that includes all assets that are downstream of any of the assets in\nthis selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates through each\nasset in this selection and returns the union of all downstream assets.

\n
\n
depth (Optional[int]): If provided, then only include assets to the given depth. A depth

of 2 means all assets that are children or grandchildren of the assets in this\nselection.

\n
\n
include_self (bool): If True, then include the assets in this selection in the result.

If the include_self flag is False, return each downstream asset that is not part of the\noriginal selection. By default, set to True.

\n
\n
\n
\n\n
\n
\nstatic groups(*group_strs, include_sources=False)[source]\u00b6
\n

Returns a selection that includes materializable assets that belong to any of the\nprovided groups and all the asset checks that target them.

\n
\n
Parameters:
\n

include_sources (bool) \u2013 If True, then include source assets matching the group in the\nselection.

\n
\n
\n
\n\n
\n
\nstatic key_prefixes(*key_prefixes, include_sources=False)[source]\u00b6
\n

Returns a selection that includes assets that match any of the provided key prefixes and all the asset checks that target them.

\n
\n
Parameters:
\n

include_sources (bool) \u2013 If True, then include source assets matching the key prefix(es)\nin the selection.

\n
\n
\n

Examples

\n
# match any asset key where the first segment is equal to "a" or "b"\n# e.g. AssetKey(["a", "b", "c"]) would match, but AssetKey(["abc"]) would not.\nAssetSelection.key_prefixes("a", "b")\n\n# match any asset key where the first two segments are ["a", "b"] or ["a", "c"]\nAssetSelection.key_prefixes(["a", "b"], ["a", "c"])\n
\n
\n
\n\n
\n
\nstatic keys(*asset_keys)[source]\u00b6
\n

Returns a selection that includes assets with any of the provided keys and all asset checks that target them.

\n

Examples

\n
AssetSelection.keys(AssetKey(["a"]))\n\nAssetSelection.keys("a")\n\nAssetSelection.keys(AssetKey(["a"]), AssetKey(["b"]))\n\nAssetSelection.keys("a", "b")\n\nasset_key_list = [AssetKey(["a"]), AssetKey(["b"])]\nAssetSelection.keys(*asset_key_list)\n
\n
\n
\n\n
\n
\nrequired_multi_asset_neighbors()[source]\u00b6
\n

Given an asset selection in which some assets are output from a multi-asset compute op\nwhich cannot be subset, returns a new asset selection that contains all of the assets\nrequired to execute the original asset selection. Includes the asset checks targeting the returned assets.

\n
\n\n
\n
\nroots()[source]\u00b6
\n

Given an asset selection, returns a new asset selection that contains all of the root\nassets within the original asset selection. Includes the asset checks targeting the returned assets.

\n

A root asset is an asset that has no upstream dependencies within the asset selection.\nThe root asset can have downstream dependencies outside of the asset selection.

\n

Because mixed selections of source and materializable assets are currently not supported,\nkeys corresponding to SourceAssets will not be included as roots. To select source assets,\nuse the upstream_source_assets method.

\n
\n\n
\n
\nsinks()[source]\u00b6
\n

Given an asset selection, returns a new asset selection that contains all of the sink\nassets within the original asset selection. Includes the asset checks targeting the returned assets.

\n

A sink asset is an asset that has no downstream dependencies within the asset selection.\nThe sink asset can have downstream dependencies outside of the asset selection.

\n
\n\n
\n
\nsources()[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use AssetSelection.roots instead.\n \n

\n

Given an asset selection, returns a new asset selection that contains all of the root\nassets within the original asset selection. Includes the asset checks targeting the returned assets.

\n

A root asset is a materializable asset that has no upstream dependencies within the asset\nselection. The root asset can have downstream dependencies outside of the asset selection.

\n

Because mixed selections of source and materializable assets are currently not supported,\nkeys corresponding to SourceAssets will not be included as roots. To select source assets,\nuse the upstream_source_assets method.

\n
\n\n
\n
\nupstream(depth=None, include_self=True)[source]\u00b6
\n

Returns a selection that includes all materializable assets that are upstream of any of\nthe assets in this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates\nthrough each asset in this selection and returns the union of all upstream assets.

\n

Because mixed selections of source and materializable assets are currently not supported,\nkeys corresponding to SourceAssets will not be included as upstream of regular assets.

\n
\n
Parameters:
\n
    \n
  • depth (Optional[int]) \u2013 If provided, then only include assets to the given depth. A depth\nof 2 means all assets that are parents or grandparents of the assets in this\nselection.

  • \n
  • include_self (bool) \u2013 If True, then include the assets in this selection in the result.\nIf the include_self flag is False, return each upstream asset that is not part of the\noriginal selection. By default, set to True.

  • \n
\n
\n
\n
\n\n
\n
\nupstream_source_assets()[source]\u00b6
\n

Given an asset selection, returns a new asset selection that contains all of the source\nassets upstream of assets in the original selection. Includes the asset checks targeting the returned assets.

\n
\n\n
\n
\nwithout_checks()[source]\u00b6
\n

Removes all asset checks in the selection.

\n
\n\n
\n\n
\n
\nclass dagster.FreshnessPolicy(*, maximum_lag_minutes, cron_schedule=None, cron_schedule_timezone=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

A FreshnessPolicy specifies how up-to-date you want a given asset to be.

\n

Attaching a FreshnessPolicy to an asset definition encodes an expectation on the upstream data\nthat you expect to be incorporated into the current state of that asset at certain points in time.\nHow this is calculated differs depending on if the asset is unpartitioned or time-partitioned\n(other partitioning schemes are not supported).

\n

For time-partitioned assets, the current data time for the asset is simple to calculate. The\nupstream data that is incorporated into the asset is exactly the set of materialized partitions\nfor that asset. Thus, the current data time for the asset is simply the time up to which all\npartitions have been materialized.

\n

For unpartitioned assets, the current data time is based on the upstream materialization records\nthat were read to generate the current state of the asset. More specifically,\nimagine you have two assets, where B depends on A. If B has a FreshnessPolicy defined, this\nmeans that at time T, the most recent materialization of B should have come after a\nmaterialization of A which was no more than maximum_lag_minutes ago. This calculation is\nrecursive: any given asset is expected to incorporate up-to-date data from all of its upstream\nassets.

\n

It is assumed that all asset definitions with no upstream asset definitions consume from some\nalways-updating source. That is, if you materialize that asset at time T, it will incorporate\nall data up to time T.

\n

If cron_schedule is not defined, the given asset will be expected to incorporate upstream\ndata from no more than maximum_lag_minutes ago at all points in time. For example, \u201cThe events\ntable should always have data from at most 1 hour ago\u201d.

\n

If cron_schedule is defined, the given asset will be expected to incorporate upstream data\nfrom no more than maximum_lag_minutes ago at each cron schedule tick. For example, \u201cBy 9AM,\nthe signups table should contain all of yesterday\u2019s data\u201d.

\n

The freshness status of assets with policies defined will be visible in the UI. If you are using\nan asset reconciliation sensor, this sensor will kick off runs to help keep your assets up to\ndate with respect to their FreshnessPolicy.

\n
\n
Parameters:
\n
    \n
  • maximum_lag_minutes (float) \u2013 An upper bound for how old the data contained within this\nasset may be.

  • \n
  • cron_schedule (Optional[str]) \u2013 A cron schedule string (e.g. "0 1 * * *") specifying a\nseries of times by which the maximum_lag_minutes constraint must be satisfied. If\nno cron schedule is provided, then this constraint must be satisfied at all times.

  • \n
  • cron_schedule_timezone (Optional[str]) \u2013 Timezone in which the cron schedule should be evaluated.\nIf not specified, defaults to UTC. Supported strings for timezones are the ones provided\nby the IANA time zone database <https://www.iana.org/time-zones> - e.g.\n\u201cAmerica/Los_Angeles\u201d.

  • \n
\n
\n
\n
# At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n@asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\ndef fresh_asset():\n    ...\n\n# At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n@asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\ndef cron_up_to_date_asset():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster.AutoMaterializePolicy(rules, max_materializations_per_minute=1)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An AutoMaterializePolicy specifies how Dagster should attempt to keep an asset up-to-date.

\n

Each policy consists of a set of AutoMaterializeRules, which are used to determine whether an\nasset or a partition of an asset should or should not be auto-materialized.

\n

The most common policy is AutoMaterializePolicy.eager(), which consists of the following rules:

\n
    \n
  • \n
    AutoMaterializeRule.materialize_on_missing()

    Materialize an asset or a partition if it has never been materialized.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.materialize_on_parent_updated()

    Materialize an asset or a partition if one of its parents has been updated more recently\nthan it has.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.materialize_on_required_for_freshness()

    Materialize an asset or a partition if it is required to satisfy a freshness policy.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.skip_on_parent_outdated()

    Skip materializing an asset or partition if any of its parents have ancestors that have\nbeen materialized more recently.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.skip_on_parent_missing()

    Skip materializing an asset or a partition if any parent has never been materialized or\nobserved.

    \n
    \n
    \n
  • \n
\n

Policies can be customized by adding or removing rules. For example, if you\u2019d like to allow\nan asset to be materialized even if some of its parent partitions are missing:

\n
from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\nmy_policy = AutoMaterializePolicy.eager().without_rules(\n    AutoMaterializeRule.skip_on_parent_missing(),\n)\n
\n
\n

If you\u2019d like an asset to wait for all of its parents to be updated before materializing:

\n
from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\nmy_policy = AutoMaterializePolicy.eager().with_rules(\n    AutoMaterializeRule.skip_on_not_all_parents_updated(),\n)\n
\n
\n

Lastly, the max_materializations_per_minute parameter, which is set to 1 by default,\nrate-limits the number of auto-materializations that can occur for a particular asset within\na short time interval. This mainly matters for partitioned assets. Its purpose is to provide a\nsafeguard against \u201csurprise backfills\u201d, where user-error causes auto-materialize to be\naccidentally triggered for large numbers of partitions at once.

\n

Warning:

\n

Constructing an AutoMaterializePolicy directly is not recommended as the API is subject to change.\nAutoMaterializePolicy.eager() and AutoMaterializePolicy.lazy() are the recommended API.

\n
\n
\nstatic eager(max_materializations_per_minute=1)[source]\u00b6
\n

Constructs an eager AutoMaterializePolicy.

\n
\n
Parameters:
\n

max_materializations_per_minute (Optional[int]) \u2013 The maximum number of\nauto-materializations for this asset that may be initiated per minute. If this limit\nis exceeded, the partitions which would have been materialized will be discarded,\nand will require manual materialization in order to be updated. Defaults to 1.

\n
\n
\n
\n\n
\n
\nstatic lazy(max_materializations_per_minute=1)[source]\u00b6
\n

Constructs a lazy AutoMaterializePolicy.

\n
\n
Parameters:
\n

max_materializations_per_minute (Optional[int]) \u2013 The maximum number of\nauto-materializations for this asset that may be initiated per minute. If this limit\nis exceeded, the partitions which would have been materialized will be discarded,\nand will require manual materialization in order to be updated. Defaults to 1.

\n
\n
\n
\n\n
\n
\nwith_rules(*rules_to_add)[source]\u00b6
\n

Constructs a copy of this policy with the specified rules added.

\n
\n\n
\n
\nwithout_rules(*rules_to_remove)[source]\u00b6
\n

Constructs a copy of this policy with the specified rules removed. Raises an error\nif any of the arguments are not rules in this policy.

\n
\n\n
\n\n
\n
\nclass dagster.AutoMaterializeRule[source]\u00b6
\n

An AutoMaterializeRule defines a bit of logic which helps determine if a materialization\nshould be kicked off for a given asset partition.

\n

Each rule can have one of two decision types, MATERIALIZE (indicating that an asset partition\nshould be materialized) or SKIP (indicating that the asset partition should not be\nmaterialized).

\n

Materialize rules are evaluated first, and skip rules operate over the set of candidates that\nare produced by the materialize rules. Other than that, there is no ordering between rules.

\n
\n
\nstatic materialize_on_missing()[source]\u00b6
\n

Materialize an asset partition if it has never been materialized before. This rule will\nnot fire for non-root assets unless that asset\u2019s parents have been updated.

\n
\n\n
\n
\nstatic materialize_on_parent_updated()[source]\u00b6
\n

Materialize an asset partition if one of its parents has been updated more recently\nthan it has.

\n

Note: For time-partitioned or dynamic-partitioned assets downstream of an unpartitioned\nasset, this rule will only fire for the most recent partition of the downstream.

\n
\n\n
\n
\nstatic materialize_on_required_for_freshness()[source]\u00b6
\n

Materialize an asset partition if it is required to satisfy a freshness policy of this\nasset or one of its downstream assets.

\n

Note: This rule has no effect on partitioned assets.

\n
\n\n
\n
\nstatic skip_on_not_all_parents_updated(require_update_for_all_parent_partitions=False)[source]\u00b6
\n

Skip materializing an asset partition if any of its parents have not been updated since\nthe asset\u2019s last materialization.

\n
\n
\nrequire_update_for_all_parent_partitions\u00b6
\n

Applies only to an unpartitioned\nasset or an asset partition that depends on more than one partition in any upstream asset.\nIf true, requires all upstream partitions in each upstream asset to be materialized since\nthe downstream asset\u2019s last materialization in order to update it. If false, requires at\nleast one upstream partition in each upstream asset to be materialized since the downstream\nasset\u2019s last materialization in order to update it. Defaults to false.

\n
\n
Type:
\n

Optional[bool]

\n
\n
\n
\n\n
\n\n
\n
\nstatic skip_on_parent_missing()[source]\u00b6
\n

Skip materializing an asset partition if one of its parent asset partitions has never\nbeen materialized (for regular assets) or observed (for observable source assets).

\n
\n\n
\n
\nstatic skip_on_parent_outdated()[source]\u00b6
\n

Skip materializing an asset partition if any of its parents has not incorporated the\nlatest data from its ancestors.

\n
\n\n
\n\n
\n
\ndagster.load_assets_from_modules(modules, group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets and source assets from the given modules.

\n
\n
Parameters:
\n
    \n
  • modules (Iterable[ModuleType]) \u2013 The Python modules to look for assets inside.

  • \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets and source assets defined in the given modules.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset]]

\n
\n
\n
\n\n
\n
\ndagster.load_assets_from_current_module(group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets, source assets, and cacheable assets from the module where\nthis function is called.

\n
\n
Parameters:
\n
    \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets, source assets, and cacheable assets defined in the module.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]

\n
\n
\n
\n\n
\n
\ndagster.load_assets_from_package_module(package_module, group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets and source assets that includes all asset\ndefinitions, source assets, and cacheable assets in all sub-modules of the given package module.

\n

A package module is the result of importing a package.

\n
\n
Parameters:
\n
    \n
  • package_module (ModuleType) \u2013 The package module to look for assets inside.

  • \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets, source assets, and cacheable assets defined in the module.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]

\n
\n
\n
\n\n
\n
\ndagster.load_assets_from_package_name(package_name, group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets, source assets, and cacheable assets that includes all asset\ndefinitions and source assets in all sub-modules of the given package.

\n
\n
Parameters:
\n
    \n
  • package_name (str) \u2013 The name of a Python package to look for assets inside.

  • \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets, source assets, and cacheable assets defined in the module.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]

\n
\n
\n
\n\n
\n
\nclass dagster.AssetsDefinition(*, keys_by_input_name, keys_by_output_name, node_def, partitions_def=None, partition_mappings=None, asset_deps=None, selected_asset_keys=None, can_subset=False, resource_defs=None, group_names_by_key=None, metadata_by_key=None, freshness_policies_by_key=None, auto_materialize_policies_by_key=None, backfill_policy=None, descriptions_by_key=None, check_specs_by_output_name=None, selected_asset_check_keys=None)[source]\u00b6
\n

Defines a set of assets that are produced by the same op or graph.

\n

AssetsDefinitions are typically not instantiated directly, but rather produced using the\n@asset or @multi_asset decorators.

\n
\n
\nproperty asset_deps\u00b6
\n

Maps assets that are produced by this definition to assets that they depend on. The\ndependencies can be either \u201cinternal\u201d, meaning that they refer to other assets that are\nproduced by this definition, or \u201cexternal\u201d, meaning that they refer to assets that aren\u2019t\nproduced by this definition.

\n
\n\n
\n
\nproperty can_subset\u00b6
\n

If True, indicates that this AssetsDefinition may materialize any subset of its\nasset keys in a given computation (as opposed to being required to materialize all asset\nkeys).

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty check_specs\u00b6
\n

Returns the asset check specs defined on this AssetsDefinition, i.e. the checks that can\nbe executed while materializing the assets.

\n
\n
Return type:
\n

Iterable[AssetCheckSpec]

\n
\n
\n
\n\n
\n
\nproperty dependency_keys\u00b6
\n

The asset keys which are upstream of any asset included in this\nAssetsDefinition.

\n
\n
Type:
\n

Iterable[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty descriptions_by_key\u00b6
\n

Returns a mapping from the asset keys in this AssetsDefinition\nto the descriptions assigned to them. If there is no assigned description for a given AssetKey,\nit will not be present in this dictionary.

\n
\n
Type:
\n

Mapping[AssetKey, str]

\n
\n
\n
\n\n
\n
\nstatic from_graph(graph_def, *, keys_by_input_name=None, keys_by_output_name=None, key_prefix=None, internal_asset_deps=None, partitions_def=None, partition_mappings=None, resource_defs=None, group_name=None, group_names_by_output_name=None, descriptions_by_output_name=None, metadata_by_output_name=None, freshness_policies_by_output_name=None, auto_materialize_policies_by_output_name=None, backfill_policy=None, can_subset=False, check_specs=None)[source]\u00b6
\n

Constructs an AssetsDefinition from a GraphDefinition.

\n
\n
Parameters:
\n
    \n
  • graph_def (GraphDefinition) \u2013 The GraphDefinition that is an asset.

  • \n
  • keys_by_input_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the input\nnames of the decorated graph to their corresponding asset keys. If not provided,\nthe input asset keys will be created from the graph input names.

  • \n
  • keys_by_output_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the output\nnames of the decorated graph to their corresponding asset keys. If not provided,\nthe output asset keys will be created from the graph output names.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, key_prefix will be prepended\nto each key in keys_by_output_name. Each item in key_prefix must be a valid name in\ndagster (ie only contains letters, numbers, and _) and may not contain python\nreserved keywords.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by the graph depend on all assets that are consumed by that\ngraph. If this default is not correct, you pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\neither used as input to the asset or produced within the graph.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • partition_mappings (Optional[Mapping[str, PartitionMapping]]) \u2013 Defines how to map partition\nkeys for this asset to partition keys of upstream assets. Each key in the dictionary\ncorresponds to one of the input assets, and each value is a PartitionMapping.\nIf no entry is provided for a particular asset dependency, the partition mapping defaults\nto the default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A mapping of resource keys to resource definitions. These resources\nwill be initialized during execution, and can be accessed from the\nbody of ops in the graph during execution.

  • \n
  • group_name (Optional[str]) \u2013 A group name for the constructed asset. Assets without a\ngroup name are assigned to a group called \u201cdefault\u201d.

  • \n
  • group_names_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a group name to be\nassociated with some or all of the output assets for this node. Keys are names of the\noutputs, and values are the group name. Cannot be used with the group_name argument.

  • \n
  • descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a description to be\nassociated with each of the output assets for this graph.

  • \n
  • metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]) \u2013 Defines metadata to\nbe associated with each of the output assets for this node. Keys are names of the\noutputs, and values are dictionaries of metadata to be associated with the related\nasset.

  • \n
  • freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]) \u2013 Defines a\nFreshnessPolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the FreshnessPolicies to be attached\nto the associated asset.

  • \n
  • auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]) \u2013 Defines an\nAutoMaterializePolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\nto the associated asset.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 Defines this asset\u2019s BackfillPolicy

  • \n
\n
\n
\n
\n\n
\n
\nstatic from_op(op_def, *, keys_by_input_name=None, keys_by_output_name=None, key_prefix=None, internal_asset_deps=None, partitions_def=None, partition_mappings=None, group_name=None, group_names_by_output_name=None, descriptions_by_output_name=None, metadata_by_output_name=None, freshness_policies_by_output_name=None, auto_materialize_policies_by_output_name=None, backfill_policy=None, can_subset=False)[source]\u00b6
\n

Constructs an AssetsDefinition from an OpDefinition.

\n
\n
Parameters:
\n
    \n
  • op_def (OpDefinition) \u2013 The OpDefinition that is an asset.

  • \n
  • keys_by_input_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the input\nnames of the decorated op to their corresponding asset keys. If not provided,\nthe input asset keys will be created from the op input names.

  • \n
  • keys_by_output_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the output\nnames of the decorated op to their corresponding asset keys. If not provided,\nthe output asset keys will be created from the op output names.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, key_prefix will be prepended\nto each key in keys_by_output_name. Each item in key_prefix must be a valid name in\ndagster (ie only contains letters, numbers, and _) and may not contain python\nreserved keywords.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by the op depend on all assets that are consumed by that\nop. If this default is not correct, you pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\neither used as input to the asset or produced within the op.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • partition_mappings (Optional[Mapping[str, PartitionMapping]]) \u2013 Defines how to map partition\nkeys for this asset to partition keys of upstream assets. Each key in the dictionary\ncorresponds to one of the input assets, and each value is a PartitionMapping.\nIf no entry is provided for a particular asset dependency, the partition mapping defaults\nto the default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

  • \n
  • group_name (Optional[str]) \u2013 A group name for the constructed asset. Assets without a\ngroup name are assigned to a group called \u201cdefault\u201d.

  • \n
  • group_names_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a group name to be\nassociated with some or all of the output assets for this node. Keys are names of the\noutputs, and values are the group name. Cannot be used with the group_name argument.

  • \n
  • descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a description to be\nassociated with each of the output assets for this op.

  • \n
  • metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]) \u2013 Defines metadata to\nbe associated with each of the output assets for this node. Keys are names of the\noutputs, and values are dictionaries of metadata to be associated with the related\nasset.

  • \n
  • freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]) \u2013 Defines a\nFreshnessPolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the FreshnessPolicies to be attached\nto the associated asset.

  • \n
  • auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]) \u2013 Defines an\nAutoMaterializePolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\nto the associated asset.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 Defines this asset\u2019s BackfillPolicy

  • \n
\n
\n
\n
\n\n
\n
\nget_partition_mapping(in_asset_key)[source]\u00b6
\n

Returns the partition mapping between keys in this AssetsDefinition and a given input\nasset key (if any).

\n
\n\n
\n
\nproperty group_names_by_key\u00b6
\n

Returns a mapping from the asset keys in this AssetsDefinition\nto the group names assigned to them. If there is no assigned group name for a given AssetKey,\nit will not be present in this dictionary.

\n
\n
Type:
\n

Mapping[AssetKey, str]

\n
\n
\n
\n\n
\n
\nproperty key\u00b6
\n

The asset key associated with this AssetsDefinition. If this AssetsDefinition\nhas more than one asset key, this will produce an error.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n
\nproperty keys\u00b6
\n

The asset keys associated with this AssetsDefinition.

\n
\n
Type:
\n

AbstractSet[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty node_def\u00b6
\n

Returns the OpDefinition or GraphDefinition that is used to materialize\nthe assets in this AssetsDefinition.

\n
\n
Type:
\n

NodeDefinition

\n
\n
\n
\n\n
\n
\nproperty op\u00b6
\n

Returns the OpDefinition that is used to materialize the assets in this\nAssetsDefinition.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\nproperty partitions_def\u00b6
\n

The PartitionsDefinition for this AssetsDefinition (if any).

\n
\n
Type:
\n

Optional[PartitionsDefinition]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

The set of keys for resources that must be provided to this AssetsDefinition.

\n
\n
Type:
\n

Set[str]

\n
\n
\n
\n\n
\n
\nproperty resource_defs\u00b6
\n

A mapping from resource name to ResourceDefinition for\nthe resources bound to this AssetsDefinition.

\n
\n
Type:
\n

Mapping[str, ResourceDefinition]

\n
\n
\n
\n\n
\n
\nto_source_asset(key=None)[source]\u00b6
\n

Returns a representation of this asset as a SourceAsset.

\n

If this is a multi-asset, the \u201ckey\u201d argument allows selecting which asset to return a\nSourceAsset representation of.

\n
\n
Parameters:
\n

key (Optional[Union[str, Sequence[str], AssetKey]]) \u2013 If this is a multi-asset, select\nwhich asset to return a SourceAsset representation of. If not a multi-asset, this\ncan be left as None.

\n
\n
Returns:
\n

SourceAsset

\n
\n
\n
\n\n
\n
\nto_source_assets()[source]\u00b6
\n

Returns a SourceAsset for each asset in this definition.

\n

Each produced SourceAsset will have the same key, metadata, io_manager_key, etc. as the\ncorresponding asset.

\n
\n\n
\n\n
\n
\n@dagster.multi_asset(*, outs=None, name=None, ins=None, deps=None, description=None, config_schema=None, required_resource_keys=None, compute_kind=None, internal_asset_deps=None, partitions_def=None, backfill_policy=None, op_tags=None, can_subset=False, resource_defs=None, group_name=None, retry_policy=None, code_version=None, specs=None, check_specs=None, non_argument_deps=None)[source]\u00b6
\n

Create a combined definition of multiple assets that are computed using the same op and same\nupstream assets.

\n

Each argument to the decorated function references an upstream asset that this asset depends on.\nThe name of the argument designates the name of the upstream asset.

\n

You can set I/O managers keys, auto-materialize policies, freshness policies, group names, etc.\non an individual asset within the multi-asset by attaching them to the AssetOut\ncorresponding to that asset in the outs parameter.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the op.

  • \n
  • outs \u2013 (Optional[Dict[str, AssetOut]]): The AssetOuts representing the assets materialized by\nthis function. AssetOuts detail the output, IO management, and core asset properties.\nThis argument is required except when AssetSpecs are used.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]) \u2013 The assets that are upstream dependencies, but do not correspond to a parameter of the\ndecorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\nall assets created by the multi_asset will be created.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the asset\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the underlying op.

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in the Dagster UI as a badge on the asset.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by a multi_asset depend on all assets that are consumed by that\nmulti asset. If this default is not correct, you pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\nused as input to the asset or produced within the op.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 The backfill policy for the op that computes the asset.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • can_subset (bool) \u2013 If this asset\u2019s computation can emit a subset of the asset\nkeys based on the context.selected_assets argument. Defaults to False.

  • \n
  • resource_defs (Optional[Mapping[str, object]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A mapping of resource keys to resources. These resources\nwill be initialized during execution, and can be accessed from the\ncontext within the body of the function.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. This\ngroup name will be applied to all assets produced by this multi_asset.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that computes the asset.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code encapsulated by the multi-asset. If set,\nthis is used as a default code version for all defined assets.

  • \n
  • specs (Optional[Sequence[AssetSpec]]) \u2013 (Experimental) The specifications for the assets materialized\nby this function.

  • \n
  • check_specs (Optional[Sequence[AssetCheckSpec]]) \u2013 (Experimental) Specs for asset checks that\nexecute in the decorated function after materializing the assets.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0.0. use deps instead.) Deprecated, use deps instead. Set of asset keys that are upstream\ndependencies, but do not pass an input to the multi_asset.

  • \n
\n
\n
\n

Examples

\n
# Use IO managers to handle I/O:\n@multi_asset(\n    outs={\n        "my_string_asset": AssetOut(),\n        "my_int_asset": AssetOut(),\n    }\n)\ndef my_function(upstream_asset: int):\n    result = upstream_asset + 1\n    return str(result), result\n\n# Handle I/O on your own:\n@multi_asset(\n    outs={\n        "asset1": AssetOut(),\n        "asset2": AssetOut(),\n    },\n    deps=["asset0"],\n)\ndef my_function():\n    asset0_value = load(path="asset0")\n    asset1_result, asset2_result = do_some_transformation(asset0_value)\n    write(asset1_result, path="asset1")\n    write(asset2_result, path="asset2")\n    return None, None\n
\n
\n
\n\n
\n
\n@dagster.graph_asset(compose_fn=None, *, name=None, description=None, ins=None, config=None, key_prefix=None, group_name=None, partitions_def=None, metadata=None, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, resource_defs=None, check_specs=None, key=None)[source]\u00b6
\n

Creates a software-defined asset that\u2019s computed using a graph of ops.

\n

This decorator is meant to decorate a function that composes a set of ops or graphs to define\nthe dependencies between them.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the asset. If not provided, defaults to the name of the\ndecorated function. The asset\u2019s name must be a valid name in Dagster (ie only contains\nletters, numbers, and underscores) and may not contain Python reserved keywords.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the asset.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • config (Optional[Union[ConfigMapping, Mapping[str, Any]]]) \u2013

    Describes how the graph underlying the asset is configured at runtime.

    \n

    If a ConfigMapping object is provided, then the graph takes on the config\nschema of this object. The mapping will be applied at runtime to generate the config for\nthe graph\u2019s constituent nodes.

    \n

    If a dictionary is provided, then it will be used as the default run config for the\ngraph. This means it must conform to the config schema of the underlying nodes. Note\nthat the values provided will be viewable and editable in the Dagster UI, so be careful\nwith secrets.

    \n

    If no value is provided, then the config schema for the graph is the default (derived\nfrom the underlying nodes).

    \n

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name, which defaults to the name of\nthe decorated function. Each item in key_prefix must be a valid name in Dagster (ie only\ncontains letters, numbers, and underscores) and may not contain Python reserved keywords.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. If\nnot provided, the name \u201cdefault\u201d is used.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • metadata (Optional[MetadataUserInput]) \u2013 Dictionary of metadata to be associated with\nthe asset.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 A constraint telling Dagster how often this asset is\nintended to be updated with respect to its root data.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 The AutoMaterializePolicy to use\nfor this asset.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 The BackfillPolicy to use for this asset.

  • \n
  • key (Optional[CoercibleToAssetKey]) \u2013 The key for this asset. If provided, cannot specify key_prefix or name.

  • \n
\n
\n
\n

Examples

\n
@op\ndef fetch_files_from_slack(context) -> pd.DataFrame:\n    ...\n\n@op\ndef store_files_in_table(files) -> None:\n    files.to_sql(name="slack_files", con=create_db_connection())\n\n@graph_asset\ndef slack_files_table():\n    return store_files_in_table(fetch_files_from_slack())\n
\n
\n
\n\n
\n
\n@dagster.graph_multi_asset(*, outs, name=None, ins=None, partitions_def=None, backfill_policy=None, group_name=None, can_subset=False, resource_defs=None, check_specs=None)[source]\u00b6
\n

Create a combined definition of multiple assets that are computed using the same graph of\nops, and the same upstream assets.

\n

Each argument to the decorated function references an upstream asset that this asset depends on.\nThe name of the argument designates the name of the upstream asset.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the graph.

  • \n
  • outs (Optional[Dict[str, AssetOut]]) \u2013 The AssetOuts representing the produced assets.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 The backfill policy for the asset.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. This\ngroup name will be applied to all assets produced by this multi_asset.

  • \n
  • can_subset (bool) \u2013 Whether this asset\u2019s computation can emit a subset of the asset\nkeys based on the context.selected_assets argument. Defaults to False.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.AssetOut(key_prefix=None, key=None, dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, group_name=None, code_version=None, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None)[source]\u00b6
\n

Defines one of the assets produced by a @multi_asset.

\n
\n
\nkey_prefix\u00b6
\n

If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name. When using @multi_asset, the\nasset name defaults to the key of the \u201couts\u201d dictionary. Only one of the \u201ckey_prefix\u201d and\n\u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str]]]

\n
\n
\n
\n\n
\n
\nkey\u00b6
\n

The asset\u2019s key. Only one of the\n\u201ckey_prefix\u201d and \u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str], AssetKey]]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

The type of this output. Should only be set if the correct type can not\nbe inferred directly from the type signature of the decorated function.

\n
\n
Type:
\n

Optional[Union[Type, DagsterType]]

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

Human-readable description of the output.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nis_required\u00b6
\n

Whether the presence of this field is required. (default: True)

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nio_manager_key\u00b6
\n

The resource key of the IO manager used for this output.\n(default: \u201cio_manager\u201d).

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\ngroup_name\u00b6
\n

A string name used to organize multiple assets into groups. If\nnot provided, the name \u201cdefault\u201d is used.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ncode_version\u00b6
\n

The version of the code that generates this asset.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nfreshness_policy\u00b6
\n

A policy which indicates how up to date this\nasset is intended to be.

\n
\n
Type:
\n

Optional[FreshnessPolicy]

\n
\n
\n
\n\n
\n
\nauto_materialize_policy\u00b6
\n

AutoMaterializePolicy to apply to\nthe specified asset.

\n
\n
Type:
\n

Optional[AutoMaterializePolicy]

\n
\n
\n
\n\n
\n
\nbackfill_policy\u00b6
\n

BackfillPolicy to apply to the specified asset.

\n
\n
Type:
\n

Optional[BackfillPolicy]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetValueLoader(assets_defs_by_key, source_assets_by_key, instance=None)[source]\u00b6
\n

Caches resource definitions that are used to load asset values across multiple load\ninvocations.

\n

Should not be instantiated directly. Instead, use\nget_asset_value_loader().

\n
\n
\nload_asset_value(asset_key, *, python_type=None, partition_key=None, metadata=None, resource_config=None)[source]\u00b6
\n

Loads the contents of an asset as a Python object.

\n

Invokes load_input on the IOManager associated with the asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[AssetKey, Sequence[str], str]) \u2013 The key of the asset to load.

  • \n
  • python_type (Optional[Type]) \u2013 The python type to load the asset as. This is what will\nbe returned inside load_input by context.dagster_type.typing_type.

  • \n
  • partition_key (Optional[str]) \u2013 The partition of the asset to load.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 Input metadata to pass to the IOManager\n(is equivalent to setting the metadata argument in In or AssetIn).

  • \n
  • resource_config (Optional[Any]) \u2013 A dictionary of resource configurations to be passed\nto the IOManager.

  • \n
\n
\n
Returns:
\n

The contents of an asset as a Python object.

\n
\n
\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/assets", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../asset-checks/", "title": "Asset Checks (Experimental)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../../../", "title": "Home"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/asset-checks", "Asset Checks (Experimental)", "N", "next"], ["index", "Home", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/assets.rst.txt", "title": "Software-Defined Assets", "toc": "\n"}, "cli": {"alabaster_version": "0.7.13", "body": "
\n

Dagster CLI\u00b6

\n
\n

dagster asset\u00b6

\n

Commands for working with Dagster assets.

\n
dagster asset [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nlist
\n

List assets

\n
\n\n
\n
\nmaterialize
\n

Execute a run to materialize a selection\u2026

\n
\n\n
\n
\nwipe
\n

Eliminate asset key indexes from event logs.

\n
\n\n
\n
\nwipe-partitions-status-cache
\n

Clears the asset partitions status cache,\u2026

\n
\n\n
\n
\n

dagster debug\u00b6

\n

Commands for helping debug Dagster issues by dumping or loading artifacts from specific runs.

\n

This can be used to send a file to someone like the Dagster team who doesn\u2019t have direct access\nto your instance to allow them to view the events and details of a specific run.

\n

Debug files can be viewed using dagster-webserver-debug cli.\nDebug files can also be downloaded from the Dagster UI.

\n
dagster debug [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nexport
\n

Export the relevant artifacts for a job\u2026

\n
\n\n
\n
\nimport
\n

Import the relevant artifacts from debug\u2026

\n
\n\n
\n
\n

dagster dev\u00b6

\n

Start a local deployment of Dagster, including dagster-webserver running on localhost and the dagster-daemon running in the background

\n
dagster dev [OPTIONS]\n
\n
\n

Options

\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--code-server-log-level <code_server_log_level>\u00b6
\n

Set the log level for code servers spun up by dagster services.

\n
\n
Default:
\n

warning

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Set the log level for dagster services.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n-p, --port, --dagit-port <port>\u00b6
\n

Port to use for the Dagster webserver.

\n
\n\n
\n
\n-h, --host, --dagit-host <host>\u00b6
\n

Host to use for the Dagster webserver.

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\n

dagster instance\u00b6

\n

Commands for working with the current Dagster instance.

\n
dagster instance [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nconcurrency
\n

Commands for working with the\u2026

\n
\n\n
\n
\ninfo
\n

List the information about the current\u2026

\n
\n\n
\n
\nmigrate
\n

Automatically migrate an out of date\u2026

\n
\n\n
\n
\nreindex
\n

Rebuild index over historical runs for\u2026

\n
\n\n
\n
\n

dagster job\u00b6

\n

Commands for working with Dagster jobs.

\n
dagster job [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nbackfill
\n

Backfill a partitioned job.

\n
\n\n
\n
\nexecute
\n

Execute a job.

\n
\n\n
\n
\nlaunch
\n

Launch a job using the run launcher\u2026

\n
\n\n
\n
\nlist
\n

List the jobs in a repository.

\n
\n\n
\n
\nlist_versions
\n

Display the freshness of memoized results\u2026

\n
\n\n
\n
\nprint
\n

Print a job.

\n
\n\n
\n
\nscaffold_config
\n

Scaffold the config for a job.

\n
\n\n
\n
\n

dagster run\u00b6

\n

Commands for working with Dagster job runs.

\n
dagster run [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndelete
\n

Delete a run by id and its associated\u2026

\n
\n\n
\n
\nlist
\n

List the runs in the current Dagster\u2026

\n
\n\n
\n
\nmigrate-repository
\n

Migrate the run history for a job from a\u2026

\n
\n\n
\n
\nwipe
\n

Eliminate all run history and event logs.

\n
\n\n
\n
\n

dagster schedule\u00b6

\n

Commands for working with Dagster schedules.

\n
dagster schedule [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndebug
\n

Debug information about the scheduler.

\n
\n\n
\n
\nlist
\n

List all schedules that correspond to a\u2026

\n
\n\n
\n
\nlogs
\n

Get logs for a schedule.

\n
\n\n
\n
\npreview
\n

Preview changes that will be performed by\u2026

\n
\n\n
\n
\nrestart
\n

Restart a running schedule.

\n
\n\n
\n
\nstart
\n

Start an existing schedule.

\n
\n\n
\n
\nstop
\n

Stop an existing schedule.

\n
\n\n
\n
\nwipe
\n

Delete the schedule history and turn off\u2026

\n
\n\n
\n
\n

dagster sensor\u00b6

\n

Commands for working with Dagster sensors.

\n
dagster sensor [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ncursor
\n

Set the cursor value for an existing sensor.

\n
\n\n
\n
\nlist
\n

List all sensors that correspond to a\u2026

\n
\n\n
\n
\npreview
\n

Preview an existing sensor execution.

\n
\n\n
\n
\nstart
\n

Start an existing sensor.

\n
\n\n
\n
\nstop
\n

Stop an existing sensor.

\n
\n\n
\n
\n

dagster project\u00b6

\n

Commands for bootstrapping new Dagster projects and code locations.

\n
dagster project [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nfrom-example
\n

Download one of the official Dagster examples to the current directory. This CLI enables you to quickly bootstrap your project with an officially maintained example.

\n
\n\n
\n
\nlist-examples
\n

List the examples that are available to bootstrap with.

\n
\n\n
\n
\nscaffold
\n

Create a folder structure with a single Dagster code location and other files such as pyproject.toml. This CLI enables you to quickly start building a new Dagster project with everything set up.

\n
\n\n
\n
\nscaffold-code-location
\n

Create a folder structure with a single Dagster code location, in the current directory. This CLI helps you to scaffold a new Dagster code location within a folder structure that includes multiple Dagster code locations.

\n
\n\n
\n
\nscaffold-repository
\n

(DEPRECATED; Use dagster project scaffold-code-location instead) Create a folder structure with a single Dagster repository, in the current directory. This CLI helps you to scaffold a new Dagster repository within a folder structure that includes multiple Dagster repositories

\n
\n\n
\n
\n

dagster-graphql\u00b6

\n

Run a GraphQL query against the dagster interface to a specified repository or pipeline/job.

\n

Can only use ONE of --workspace/-w, --python-file/-f, --module-name/-m, --grpc-port, --grpc-socket.

\n

Examples:

\n
    \n
  1. dagster-graphql

  2. \n
  3. dagster-graphql -w path/to/workspace.yaml

  4. \n
  5. dagster-graphql -f path/to/file.py -a define_repo

  6. \n
  7. dagster-graphql -m some_module -a define_repo

  8. \n
  9. dagster-graphql -f path/to/file.py -a define_pipeline

  10. \n
  11. dagster-graphql -m some_module -a define_pipeline

  12. \n
\n
dagster-graphql [OPTIONS]\n
\n
\n

Options

\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n
\n
\n-t, --text <text>\u00b6
\n

GraphQL document to execute passed as a string

\n
\n\n
\n
\n-f, --file <file>\u00b6
\n

GraphQL document to execute passed as a file

\n
\n\n
\n
\n-p, --predefined <predefined>\u00b6
\n

GraphQL document to execute, from a predefined set provided by dagster-graphql.

\n
\n
Options:
\n

launchPipelineExecution

\n
\n
\n
\n\n
\n
\n-v, --variables <variables>\u00b6
\n

A JSON encoded string containing the variables for GraphQL execution.

\n
\n\n
\n
\n-r, --remote <remote>\u00b6
\n

A URL for a remote instance running dagster-webserver to send the GraphQL request to.

\n
\n\n
\n
\n-o, --output <output>\u00b6
\n

A file path to store the GraphQL response to. This flag is useful when making pipeline/job execution queries, since pipeline/job execution causes logs to print to stdout and stderr.

\n
\n\n
\n
\n--ephemeral-instance\u00b6
\n

Use an ephemeral DagsterInstance instead of resolving via DAGSTER_HOME

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\n

dagster-webserver\u00b6

\n

Run dagster-webserver. Loads a code location.

\n

Can only use ONE of --workspace/-w, --python-file/-f, --module-name/-m, --grpc-port, --grpc-socket.

\n

Examples:

\n
    \n
  1. dagster-webserver (works if ./workspace.yaml exists)

  2. \n
  3. dagster-webserver -w path/to/workspace.yaml

  4. \n
  5. dagster-webserver -f path/to/file.py

  6. \n
  7. dagster-webserver -f path/to/file.py -d path/to/working_directory

  8. \n
  9. dagster-webserver -m some_module

  10. \n
  11. dagster-webserver -f path/to/file.py -a define_repo

  12. \n
  13. dagster-webserver -m some_module -a define_repo

  14. \n
  15. dagster-webserver -p 3333

  16. \n
\n

Options can also provide arguments via environment variables prefixed with DAGSTER_WEBSERVER.

\n

For example, DAGSTER_WEBSERVER_PORT=3333 dagster-webserver

\n
dagster-webserver [OPTIONS]\n
\n
\n

Options

\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Host to run server on

\n
\n
Default:
\n

127.0.0.1

\n
\n
\n
\n\n
\n
\n-p, --port <port>\u00b6
\n

Port to run server on - defaults to 3000

\n
\n\n
\n
\n-l, --path-prefix <path_prefix>\u00b6
\n

The path prefix where server will be hosted (eg: /dagster-webserver)

\n
\n
Default:
\n

\n
\n
\n\n
\n
\n--db-statement-timeout <db_statement_timeout>\u00b6
\n

The timeout in milliseconds to set on database statements sent to the DagsterInstance. Not respected in all configurations.

\n
\n
Default:
\n

15000

\n
\n
\n
\n\n
\n
\n--db-pool-recycle <db_pool_recycle>\u00b6
\n

The maximum age of a connection to use from the sqlalchemy pool without connection recycling. Set to -1 to disable. Not respected in all configurations.

\n
\n
Default:
\n

3600

\n
\n
\n
\n\n
\n
\n--read-only\u00b6
\n

Start server in read-only mode, where all mutations such as launching runs and turning schedules on/off are turned off.

\n
\n\n
\n
\n--suppress-warnings\u00b6
\n

Filter all warnings when hosting server.

\n
\n\n
\n
\n--uvicorn-log-level, --log-level <uvicorn_log_level>\u00b6
\n

Set the log level for the uvicorn web server.

\n
\n
Default:
\n

warning

\n
\n
Options:
\n

critical | error | warning | info | debug | trace

\n
\n
\n
\n\n
\n
\n--dagster-log-level <dagster_log_level>\u00b6
\n

Set the log level for dagster log events.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--code-server-log-level <code_server_log_level>\u00b6
\n

Set the log level for any code servers spun up by the webserver.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_WEBSERVER_LOG_LEVEL
\n
\n

Provide a default for --dagster-log-level

\n
\n
\n\n
\n
\n

dagster-daemon run\u00b6

\n

Run any daemons configured on the DagsterInstance.

\n
dagster-daemon run [OPTIONS]\n
\n
\n

Options

\n
\n
\n--code-server-log-level <code_server_log_level>\u00b6
\n

Set the log level for any code servers spun up by the daemon.

\n
\n
Default:
\n

warning

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Set the log level for the daemon process.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_DAEMON_LOG_LEVEL
\n
\n

Provide a default for --log-level

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\n

dagster-daemon wipe\u00b6

\n

Wipe all heartbeats from storage.

\n
dagster-daemon wipe [OPTIONS]\n
\n
\n
\n
\n

dagster-daemon debug heartbeat-dump\u00b6

\n

Log all heartbeat statuses

\n
dagster-daemon debug heartbeat-dump [OPTIONS]\n
\n
\n
\n
\n

dagster api grpc\u00b6

\n

Serve the Dagster inter-process API over GRPC

\n
dagster api grpc [OPTIONS]\n
\n
\n

Options

\n
\n
\n-p, --port <port>\u00b6
\n

Port over which to serve. You must pass one and only one of --port/-p or --socket/-s.

\n
\n\n
\n
\n-s, --socket <socket>\u00b6
\n

Serve over a UDS socket. You must pass one and only one of --port/-p or --socket/-s.

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Hostname at which to serve. Default is localhost.

\n
\n\n
\n
\n-n, --max-workers, --max_workers <max_workers>\u00b6
\n

Maximum number of (threaded) workers to use in the GRPC server

\n
\n\n
\n
\n--heartbeat\u00b6
\n

If set, the GRPC server will shut itself down when it fails to receive a heartbeat after a timeout configurable with --heartbeat-timeout.

\n
\n\n
\n
\n--heartbeat-timeout <heartbeat_timeout>\u00b6
\n

Timeout after which to shutdown if --heartbeat is set and a heartbeat is not received

\n
\n\n
\n
\n--lazy-load-user-code\u00b6
\n

Wait until the first LoadRepositories call to actually load the repositories, instead of waiting to load them when the server is launched. Useful for surfacing errors when the server is managed directly from the Dagster UI.

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where dagster definitions reside as top-level symbols/variables and load the module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where dagster definitions reside as top-level symbols/variables and load the file as a code location in the current python environment.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n--use-python-environment-entry-point\u00b6
\n

If this flag is set, the server will signal to clients that they should launch dagster commands using <this server\u2019s python executable> -m dagster, instead of the default dagster entry point. This is useful when there are multiple Python environments running in the same machine, so a single dagster entry point is not enough to uniquely determine the environment.

\n
\n\n
\n
\n--empty-working-directory\u00b6
\n

Indicates that the working directory should be empty and should not be set to the current directory as a default

\n
\n\n
\n
\n--fixed-server-id <fixed_server_id>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Internal param used by dagster to spawn a gRPC server with the specified server id.

\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Level at which to log output from the code server process

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--container-image <container_image>\u00b6
\n

Container image to use to run code from this server.

\n
\n\n
\n
\n--container-context <container_context>\u00b6
\n

Serialized JSON with configuration for any containers created to run the code from this server.

\n
\n\n
\n
\n--inject-env-vars-from-instance\u00b6
\n

Whether to load env vars from the instance and inject them into the environment.

\n
\n\n
\n
\n--location-name <location_name>\u00b6
\n

Name of the code location this server corresponds to.

\n
\n\n
\n
\n--instance-ref <instance_ref>\u00b6
\n

[INTERNAL] Serialized InstanceRef to use for accessing the instance

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_GRPC_PORT
\n
\n

Provide a default for --port

\n
\n
\n\n
\n
\nDAGSTER_GRPC_SOCKET
\n
\n

Provide a default for --socket

\n
\n
\n\n
\n
\nDAGSTER_GRPC_HOST
\n
\n

Provide a default for --host

\n
\n
\n\n
\n
\nDAGSTER_LAZY_LOAD_USER_CODE
\n
\n

Provide a default for --lazy-load-user-code

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_USE_PYTHON_ENVIRONMENT_ENTRY_POINT
\n
\n

Provide a default for --use-python-environment-entry-point

\n
\n
\n\n
\n
\nDAGSTER_EMPTY_WORKING_DIRECTORY
\n
\n

Provide a default for --empty-working-directory

\n
\n
\n\n
\n
\nDAGSTER_CONTAINER_IMAGE
\n
\n

Provide a default for --container-image

\n
\n
\n\n
\n
\nDAGSTER_CONTAINER_CONTEXT
\n
\n

Provide a default for --container-context

\n
\n
\n\n
\n
\nDAGSTER_INJECT_ENV_VARS_FROM_INSTANCE
\n
\n

Provide a default for --inject-env-vars-from-instance

\n
\n
\n\n
\n
\nDAGSTER_LOCATION_NAME
\n
\n

Provide a default for --location-name

\n
\n
\n\n
\n
\nDAGSTER_INSTANCE_REF
\n
\n

Provide a default for --instance-ref

\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/cli", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../config/", "title": "Config"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../asset-checks/", "title": "Asset Checks (Experimental)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/config", "Config", "N", "next"], ["sections/api/apidocs/asset-checks", "Asset Checks (Experimental)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/cli.rst.txt", "title": "Dagster CLI", "toc": "\n"}, "config": {"alabaster_version": "0.7.13", "body": "
\n

Config\u00b6

\n
\n

Pythonic config system\u00b6

\n

The following classes are used as part of the new Pythonic config system. They are used in conjunction with builtin types.

\n
\n
\nclass dagster.Config[source]\u00b6
\n

Base class for Dagster configuration models, used to specify config schema for\nops and assets. Subclasses pydantic.BaseModel.

\n

Example definition:

\n
from pydantic import Field\n\nclass MyAssetConfig(Config):\n    my_str: str = "my_default_string"\n    my_int_list: List[int]\n    my_bool_with_metadata: bool = Field(default=False, description="A bool field")\n
\n
\n

Example usage:

\n
@asset\ndef asset_with_config(config: MyAssetConfig):\n    assert config.my_str == "my_default_string"\n    assert config.my_int_list == [1, 2, 3]\n    assert config.my_bool_with_metadata == False\n\nasset_with_config(MyAssetConfig(my_int_list=[1, 2, 3], my_bool_with_metadata=True))\n
\n
\n
\n\n
\n
\nclass dagster.PermissiveConfig(**config_dict)[source]\u00b6
\n

Subclass of Config that allows arbitrary extra fields. This is useful for\nconfig classes which may have open-ended inputs.

\n

Example definition:

\n
class MyPermissiveOpConfig(PermissiveConfig):\n    my_explicit_parameter: bool\n    my_other_explicit_parameter: str\n
\n
\n

Example usage:

\n
@op\ndef op_with_config(config: MyPermissiveOpConfig):\n    assert config.my_explicit_parameter == True\n    assert config.my_other_explicit_parameter == "foo"\n    assert config.dict().get("my_implicit_parameter") == "bar"\n\nop_with_config(\n    MyPermissiveOpConfig(\n        my_explicit_parameter=True,\n        my_other_explicit_parameter="foo",\n        my_implicit_parameter="bar"\n    )\n)\n
\n
\n
\n\n
\n
\nclass dagster.RunConfig(ops=None, resources=None, loggers=None, execution=None)[source]\u00b6
\n

Container for all the configuration that can be passed to a run. Accepts Pythonic definitions\nfor op and asset config and resources and converts them under the hood to the appropriate config dictionaries.

\n

Example usage:

\n
class MyAssetConfig(Config):\n    a_str: str\n\n@asset\ndef my_asset(config: MyAssetConfig):\n    assert config.a_str == "foo"\n\nmaterialize(\n    [my_asset],\n    run_config=RunConfig(\n        ops={"my_asset": MyAssetConfig(a_str="foo")}\n    )\n)\n
\n
\n
\n\n
\n
\n

Legacy Dagster config types\u00b6

\n

The following types are used as part of the legacy Dagster config system. They are used in conjunction with builtin types.

\n
\n
\nclass dagster.ConfigSchema[source]\u00b6
\n

Placeholder type for config schemas.

\n

Any time that it appears in documentation, it means that any of the following types are\nacceptable:

\n
    \n
  1. A Python scalar type that resolves to a Dagster config type\n(python:int, python:float, python:bool,\nor python:str). For example:

    \n
      \n
    • @op(config_schema=int)

    • \n
    • @op(config_schema=str)

    • \n
    \n
  2. \n
  3. A built-in python collection (python:list, or python:dict).\npython:list is exactly equivalent to Array [\nAny ] and python:dict is equivalent to\nPermissive. For example:

    \n
      \n
    • @op(config_schema=list)

    • \n
    • @op(config_schema=dict)

    • \n
    \n
  4. \n
  5. A Dagster config type:

    \n\n
  6. \n
  7. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules. For example:

    \n
      \n
    • {'some_config': str} is equivalent to Shape({'some_config': str}).

    • \n
    • \n
      {'some_config1': {'some_config2': str}} is equivalent to

      Shape({'some_config1': Shape({'some_config2': str})}).

      \n
      \n
      \n
    • \n
    \n
  8. \n
  9. A bare python list of length one, whose single element will be wrapped in an\nArray and resolved recursively according to the same\nrules. For example:

    \n
      \n
    • [str] is equivalent to Array[str].

    • \n
    • [[str]] is equivalent to Array[Array[str]].

    • \n
    • [{'some_config': str}] is equivalent to Array(Shape({'some_config': str})).

    • \n
    \n
  10. \n
  11. An instance of Field.

  12. \n
\n
\n\n
\n
\nclass dagster.Field(config, default_value=<class 'dagster._config.field_utils.__FieldValueSentinel'>, is_required=None, description=None)[source]\u00b6
\n

Defines the schema for a configuration field.

\n

Fields are used in config schema instead of bare types when one wants to add a description,\na default value, or to mark it as not required.

\n

Config fields are parsed according to their schemas in order to yield values available at\njob execution time through the config system. Config fields can be set on ops, on\nloaders for custom, and on other pluggable components of the system, such as resources, loggers,\nand executors.

\n
\n
Parameters:
\n
    \n
  • config (Any) \u2013

    The schema for the config. This value can be any of:

    \n
      \n
    1. A Python primitive type that resolves to a Dagster config type\n(python:int, python:float, python:bool,\npython:str, or python:list).

    2. \n
    3. A Dagster config type:

      \n\n
    4. \n
    5. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules.

    6. \n
    7. A bare python list of length one which itself is config type.\nBecomes Array with list element as an argument.

    8. \n
    \n

  • \n
  • default_value (Any) \u2013

    A default value for this field, conformant to the schema set by the config\nargument. If a default value is provided, is_required should be False.

    \n

    Note: for config types that do post processing such as Enum, this value must be\nthe pre processed version, ie use ExampleEnum.VALUE.name instead of\nExampleEnum.VALUE

    \n

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. Defaults to true. If is_required\nis True, no default value should be provided.

  • \n
  • description (str) \u2013 A human-readable description of this config field.

  • \n
\n
\n
\n

Examples

\n
@op(\n    config_schema={\n        'word': Field(str, description='I am a word.'),\n        'repeats': Field(Int, default_value=1, is_required=False),\n    }\n)\ndef repeat_word(context):\n    return context.op_config['word'] * context.op_config['repeats']\n
\n
\n
\n
\nproperty default_provided\u00b6
\n

Was a default value provided.

\n
\n
Returns:
\n

Yes or no

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty default_value\u00b6
\n

The default value for the field.

\n

Raises an exception if no default value was provided.

\n
\n\n
\n
\nproperty description\u00b6
\n

A human-readable description of this config field, if provided.

\n
\n\n
\n
\nproperty is_required\u00b6
\n

Whether a value for this field must be provided at runtime.

\n

Cannot be True if a default value is provided.

\n
\n\n
\n\n
\n
\nclass dagster.Selector(fields, description=None)[source]\u00b6
\n

Define a config field requiring the user to select one option.

\n

Selectors are used when you want to be able to present several different options in config but\nallow only one to be selected. For example, a single input might be read in from either a csv\nfile or a parquet file, but not both at once.

\n

Note that in some other type systems this might be called an \u2018input union\u2019.

\n

Functionally, a selector is like a Dict, except that only one key from the dict can\nbe specified in valid config.

\n
\n
Parameters:
\n

fields (Dict[str, Field]) \u2013 The fields from which the user must select.

\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Selector(\n            {\n                'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n                'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n                'en': {'whom': Field(String, default_value='world', is_required=False)},\n            }\n        ),\n        is_required=False,\n        default_value={'en': {'whom': 'world'}},\n    )\n)\ndef hello_world_with_default(context):\n    if 'haw' in context.op_config:\n        return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n    if 'cn' in context.op_config:\n        return '\u4f60\u597d, {whom}!'.format(whom=context.op_config['cn']['whom'])\n    if 'en' in context.op_config:\n        return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n
\n
\n
\n\n
\n
\nclass dagster.Permissive(fields=None, description=None)[source]\u00b6
\n

Defines a config dict with a partially specified schema.

\n

A permissive dict allows partial specification of the config schema. Any fields with a\nspecified schema will be type checked. Other fields will be allowed, but will be ignored by\nthe type checker.

\n
\n
Parameters:
\n

fields (Dict[str, Field]) \u2013 The partial specification of the config dict.

\n
\n
\n

Examples:

\n
@op(config_schema=Field(Permissive({'required': Field(String)})))\ndef map_config_op(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n\n
\n
\nclass dagster.Shape(fields, description=None, field_aliases=None)[source]\u00b6
\n

Schema for configuration data with string keys and typed values via Field.

\n

Unlike Permissive, unspecified fields are not allowed and will throw a\nDagsterInvalidConfigError.

\n
\n
Parameters:
\n
    \n
  • fields (Dict[str, Field]) \u2013 The specification of the config dict.

  • \n
  • field_aliases (Dict[str, str]) \u2013 Maps a string key to an alias that can be used instead of the original key. For example,\nan entry {\u201cfoo\u201d: \u201cbar\u201d} means that someone could use \u201cbar\u201d instead of \u201cfoo\u201d as a\ntop level string key.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Map(key_type, inner_type, key_label_name=None)[source]\u00b6
\n

Defines a config dict with arbitrary scalar keys and typed values.

\n

A map can contain arbitrary keys of the specified scalar type, each of which has\ntype checked values. Unlike Shape and Permissive, scalar\nkeys other than strings can be used, and unlike Permissive, all\nvalues are type checked.

\n
\n
Parameters:
\n
    \n
  • key_type (type) \u2013 The type of keys this map can contain. Must be a scalar type.

  • \n
  • inner_type (type) \u2013 The type of the values that this map type can contain.

  • \n
  • key_label_name (string) \u2013 Optional name which describes the role of keys in the map.

  • \n
\n
\n
\n

Examples:

\n
@op(config_schema=Field(Map({str: int})))\ndef partially_specified_config(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n
\nproperty key_label_name\u00b6
\n

Name which describes the role of keys in the map, if provided.

\n
\n\n
\n\n
\n
\nclass dagster.Array(inner_type)[source]\u00b6
\n

Defines an array (list) configuration type that contains values of type inner_type.

\n
\n
Parameters:
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n
\n
\nproperty description\u00b6
\n

A human-readable description of this Array type.

\n
\n\n
\n\n
\n
\nclass dagster.Noneable(inner_type)[source]\u00b6
\n

Defines a configuration type that is the union of NoneType and the type inner_type.

\n
\n
Parameters:
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n

Examples:

\n
config_schema={"name": Noneable(str)}\n\nconfig={"name": "Hello"}  # Ok\nconfig={"name": None}     # Ok\nconfig={}                 # Error\n
\n
\n
\n\n
\n
\nclass dagster.Enum(name, enum_values)[source]\u00b6
\n

Defines an enum configuration type that allows one of a defined set of possible values.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the enum configuration type.

  • \n
  • enum_values (List[EnumValue]) \u2013 The set of possible values for the enum configuration type.

  • \n
\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Enum(\n            'CowboyType',\n            [\n                EnumValue('good'),\n                EnumValue('bad'),\n                EnumValue('ugly'),\n            ]\n        )\n    )\n)\ndef resolve_standoff(context):\n    # ...\n
\n
\n
\n\n
\n
\nclass dagster.EnumValue(config_value, python_value=None, description=None)[source]\u00b6
\n

Define an entry in an Enum.

\n
\n
Parameters:
\n
    \n
  • config_value (str) \u2013 The string representation of the config to accept when passed.

  • \n
  • python_value (Optional[Any]) \u2013 The python value to convert the enum entry in to. Defaults to the config_value.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the enum entry.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ScalarUnion(scalar_type, non_scalar_schema, _key=None)[source]\u00b6
\n

Defines a configuration type that accepts a scalar value OR a non-scalar value like a\nList, Dict, or Selector.

\n

This allows runtime scalars to be configured without a dictionary with the key value and\ninstead just use the scalar value directly. However this still leaves the option to\nload scalars from a json or pickle file.

\n
\n
Parameters:
\n
    \n
  • scalar_type (type) \u2013 The scalar type of values that this configuration type can hold. For example,\npython:int, python:float, python:bool,\nor python:str.

  • \n
  • non_scalar_schema (ConfigSchema) \u2013 The schema of a non-scalar Dagster configuration type. For example, List,\nDict, or Selector.

  • \n
  • key (Optional[str]) \u2013 The configuration type\u2019s unique key. If not set, then the key will be set to\nScalarUnion.{scalar_type}-{non_scalar_schema}.

  • \n
\n
\n
\n

Examples:

\n
graph:\n  transform_word:\n    inputs:\n      word:\n        value: foobar\n
\n
\n

becomes, optionally,

\n
graph:\n  transform_word:\n    inputs:\n      word: foobar\n
\n
\n
\n\n
\n
\ndagster.StringSource\u00b6
\n

Use this type when you want to read a string config value from an environment variable. The value\npassed to a config field of this type may either be a string literal, or a selector describing\nhow to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, StringSource\n\n@op(config_schema=StringSource)\ndef secret_op(context) -> str:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': 'test_value'}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.IntSource\u00b6
\n

Use this type when you want to read an integer config value from an environment variable. The\nvalue passed to a config field of this type may either be an integer literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, IntSource\n\n@op(config_schema=IntSource)\ndef secret_int_op(context) -> int:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_int_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': 1234}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_INT'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.BoolSource\u00b6
\n

Use this type when you want to read a boolean config value from an environment variable. The\nvalue passed to a config field of this type may either be a boolean literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables. Set the\nvalue of the corresponding environment variable to "" to indicate False.

\n

Examples:

\n
from dagster import job, op, BoolSource\n\n@op(config_schema=BoolSource)\ndef secret_bool_op(context) -> bool:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_bool_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': False}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_BOOL'}}}\n    }\n)\n
\n
\n
\n\n
\n
\n

Config Utilities\u00b6

\n
\n
\nclass dagster.ConfigMapping(config_fn, config_schema=None, receive_processed_config_values=None)[source]\u00b6
\n

Defines a config mapping for a graph (or job).

\n

By specifying a config mapping function, you can override the configuration for the child\nops and graphs contained within a graph.

\n

Config mappings require the configuration schema to be specified as config_schema, which will\nbe exposed as the configuration schema for the graph, as well as a configuration mapping\nfunction, config_fn, which maps the config provided to the graph to the config\nthat will be provided to the child nodes.

\n
\n
Parameters:
\n
    \n
  • config_fn (Callable[[dict], dict]) \u2013 The function that will be called\nto map the graph config to a config appropriate for the child nodes.

  • \n
  • config_schema (ConfigSchema) \u2013 The schema of the graph config.

  • \n
  • receive_processed_config_values (Optional[bool]) \u2013 If true, config values provided to the config_fn\nwill be converted to their dagster types before being passed in. For example, if this\nvalue is true, enum config passed to config_fn will be actual enums, while if false,\nthen enum config passed to config_fn will be strings.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster.configured(configurable, config_schema=None, **kwargs)[source]\u00b6
\n

A decorator that makes it easy to create a function-configured version of an object.

\n

The following definition types can be configured using this function:

\n\n

Using configured may result in config values being displayed in the Dagster UI,\nso it is not recommended to use this API with sensitive values, such as\nsecrets.

\n

If the config that will be supplied to the object is constant, you may alternatively invoke this\nand call the result with a dict of config values to be curried. Examples of both strategies\nbelow.

\n
\n
Parameters:
\n
    \n
  • configurable (ConfigurableDefinition) \u2013 An object that can be configured.

  • \n
  • config_schema (ConfigSchema) \u2013 The config schema that the inputs to the decorated function\nmust satisfy. Alternatively, annotate the config parameter to the decorated function\nwith a subclass of Config and omit this argument.

  • \n
  • **kwargs \u2013 Arbitrary keyword arguments that will be passed to the initializer of the returned\nobject.

  • \n
\n
\n
Returns:
\n

(Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])

\n
\n
\n

Examples:

\n
class GreetingConfig(Config):\n    message: str\n\n@op\ndef greeting_op(config: GreetingConfig):\n    print(config.message)\n\nclass HelloConfig(Config):\n    name: str\n\n@configured(greeting_op)\ndef hello_op(config: HelloConfig):\n    return GreetingConfig(message=f"Hello, {config.name}!")\n
\n
\n
dev_s3 = configured(S3Resource, name="dev_s3")({'bucket': 'dev'})\n\n@configured(S3Resource)\ndef dev_s3(_):\n    return {'bucket': 'dev'}\n\n@configured(S3Resource, {'bucket_prefix', str})\ndef dev_s3(config):\n    return {'bucket': config['bucket_prefix'] + 'dev'}\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/config", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../errors/", "title": "Errors"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../cli/", "title": "Dagster CLI"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/errors", "Errors", "N", "next"], ["sections/api/apidocs/cli", "Dagster CLI", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/config.rst.txt", "title": "Config", "toc": "\n"}, "definitions": {"alabaster_version": "0.7.13", "body": "
\n

Definitions\u00b6

\n
\n
\nclass dagster.Definitions(assets=None, schedules=None, sensors=None, jobs=None, resources=None, executor=None, loggers=None, asset_checks=None)[source]\u00b6
\n

A set of definitions explicitly available and loadable by Dagster tools.

\n
\n
Parameters:
\n
    \n
  • assets (Optional[Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]]) \u2013 A list of assets. Assets can be created by annotating\na function with @asset or\n@observable_source_asset.\nOr they can be created by directly instantiating AssetsDefinition,\nSourceAsset, or CacheableAssetsDefinition.

  • \n
  • asset_checks (Optional[Iterable[AssetChecksDefinition]]) \u2013 A list of asset checks.

  • \n
  • schedules (Optional[Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]]) \u2013 List of schedules.

  • \n
  • sensors (Optional[Iterable[SensorDefinition]]) \u2013 List of sensors, typically created with @sensor.

  • \n
  • jobs (Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 List of jobs. Typically created with define_asset_job\nor with @job for jobs defined in terms of ops directly.\nJobs created with @job must already have resources bound\nat job creation time. They do not respect the resources argument here.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 Dictionary of resources to bind to assets.\nThe resources dictionary takes raw Python objects,\nnot just instances of ResourceDefinition. If that raw object inherits from\nIOManager, it gets coerced to an IOManagerDefinition.\nAny other object is coerced to a ResourceDefinition.\nThese resources will be automatically bound\nto any assets passed to this Definitions instance using\nwith_resources. Assets passed to Definitions with\nresources already bound using with_resources will\noverride this dictionary.

  • \n
  • executor (Optional[Union[ExecutorDefinition, Executor]]) \u2013 Default executor for jobs. Individual jobs can override this and define their own executors\nby setting the executor on @job or define_asset_job\nexplicitly. This executor will also be used for materializing assets directly\noutside of the context of jobs. If an Executor is passed, it is coerced into\nan ExecutorDefinition.

  • \n
  • loggers (Optional[Mapping[str, LoggerDefinition]]) \u2013 Default loggers for jobs. Individual jobs\ncan define their own loggers by setting them explicitly.

  • \n
\n
\n
\n

Example usage:

\n
defs = Definitions(\n    assets=[asset_one, asset_two],\n    schedules=[a_schedule],\n    sensors=[a_sensor],\n    jobs=[a_job],\n    resources={\n        "a_resource": some_resource,\n    },\n    asset_checks=[asset_one_check_one]\n)\n
\n
\n

Dagster separates user-defined code from system tools such as the web server and\nthe daemon. Rather than loading code directly into process, a tool such as the\nweb server interacts with user-defined code over a serialization boundary.

\n

These tools must be able to locate and load this code when they start. Via CLI\narguments or config, they specify a Python module to inspect.

\n

A Python module is loadable by Dagster tools if there is a top-level variable\nthat is an instance of Definitions.

\n

Before the introduction of Definitions,\n@repository was the API for organizing definitions.\nDefinitions provides a few conveniences for dealing with resources\nthat do not apply to old-style @repository declarations:

\n\n
\n
\nget_asset_value_loader(instance=None)[source]\u00b6
\n

Returns an object that can load the contents of assets as Python objects.

\n

Invokes load_input on the IOManager associated with the assets. Avoids\nspinning up resources separately for each asset.

\n

Usage:

\n
with defs.get_asset_value_loader() as loader:\n    asset1 = loader.load_asset_value("asset1")\n    asset2 = loader.load_asset_value("asset2")\n
\n
\n
\n\n
\n
\nget_job_def(name)[source]\u00b6
\n

Get a job definition by name. If you passed in an UnresolvedAssetJobDefinition\n(return value of define_asset_job()) it will be resolved to a JobDefinition when returned\nfrom this function.

\n
\n\n
\n
\nget_schedule_def(name)[source]\u00b6
\n

Get a schedule definition by name.

\n
\n\n
\n
\nget_sensor_def(name)[source]\u00b6
\n

Get a sensor definition by name.

\n
\n\n
\n
\nload_asset_value(asset_key, *, python_type=None, instance=None, partition_key=None, metadata=None)[source]\u00b6
\n

Load the contents of an asset as a Python object.

\n

Invokes load_input on the IOManager associated with the asset.

\n

If you want to load the values of multiple assets, it\u2019s more efficient to use\nget_asset_value_loader(), which avoids spinning up\nresources separately for each asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[AssetKey, Sequence[str], str]) \u2013 The key of the asset to load.

  • \n
  • python_type (Optional[Type]) \u2013 The python type to load the asset as. This is what will\nbe returned inside load_input by context.dagster_type.typing_type.

  • \n
  • partition_key (Optional[str]) \u2013 The partition of the asset to load.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 Input metadata to pass to the IOManager\n(is equivalent to setting the metadata argument in In or AssetIn).

  • \n
\n
\n
Returns:
\n

The contents of an asset as a Python object.

\n
\n
\n
\n\n
\n\n
\n
\ndagster.create_repository_using_definitions_args(name, assets=None, schedules=None, sensors=None, jobs=None, resources=None, executor=None, loggers=None, asset_checks=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a named repository using the same arguments as Definitions. In older\nversions of Dagster, repositories were the mechanism for organizing assets, schedules, sensors,\nand jobs. There could be many repositories per code location. This was a complicated ontology but\ngave users a way to organize code locations that contained large numbers of heterogeneous definitions.

\n

As a stopgap for those who want to both 1) use the new Definitions API and 2) still\nhave multiple logical groups of assets in the same code location, we have introduced this function.

\n

Example usage:

\n
named_repo = create_repository_using_definitions_args(\n    name="a_repo",\n    assets=[asset_one, asset_two],\n    schedules=[a_schedule],\n    sensors=[a_sensor],\n    jobs=[a_job],\n    resources={\n        "a_resource": some_resource,\n    }\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/definitions", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../repositories/", "title": "Repositories"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../partitions/", "title": "Partitions Definitions"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/repositories", "Repositories", "N", "next"], ["sections/api/apidocs/partitions", "Partitions Definitions", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/definitions.rst.txt", "title": "Definitions", "toc": "\n"}, "dynamic": {"alabaster_version": "0.7.13", "body": "
\n

Dynamic Mapping & Collect\u00b6

\n

These APIs provide the means for a simple kind of dynamic orchestration \u2014 where the work to be orchestrated is determined not at job definition time but at runtime, dependent on data that\u2019s observed as part of job execution.

\n
\n
\nclass dagster.DynamicOut(dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, code_version=None)[source]\u00b6
\n

Variant of Out for an output that will dynamically alter the graph at\nruntime.

\n

When using in a composition function such as @graph,\ndynamic outputs must be used with either

\n
    \n
  • map - clone downstream ops for each separate DynamicOut

  • \n
  • collect - gather across all DynamicOut in to a list

  • \n
\n

Uses the same constructor as Out

\n
\n
@op(\n    config_schema={\n        "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n    },\n    out=DynamicOut(str),\n)\ndef files_in_directory(context):\n    path = context.op_config["path"]\n    dirname, _, filenames = next(os.walk(path))\n    for file in filenames:\n        yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n@job\ndef process_directory():\n    files = files_in_directory()\n\n    # use map to invoke an op on each dynamic output\n    file_results = files.map(process_file)\n\n    # use collect to gather the results in to a list\n    summarize_directory(file_results.collect())\n
\n
\n
\n
\n\n
\n
\nclass dagster.DynamicOutput(value, mapping_key, output_name='result', metadata=None)[source]\u00b6
\n

Variant of Output used to support\ndynamic mapping & collect. Each DynamicOutput produced by an op represents\none item in a set that can be processed individually with map or gathered\nwith collect.

\n

Each DynamicOutput must have a unique mapping_key to distinguish it within its set.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • mapping_key (str) \u2013 The key that uniquely identifies this dynamic value relative to its peers.\nThis key will be used to identify the downstream ops when mapped, ie\nmapped_op[example_mapping_key]

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding DynamicOut defined on the op.\n(default: \u201cresult\u201d)

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, or one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n
\nproperty mapping_key\u00b6
\n

The mapping_key that was set for this DynamicOutput at instantiation.

\n
\n\n
\n
\nproperty output_name\u00b6
\n

Name of the DynamicOut defined on the op that this DynamicOutput is associated with.

\n
\n\n
\n
\nproperty value\u00b6
\n

The value that is returned by the compute function for this DynamicOutput.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/dynamic", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../types/", "title": "Types"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../schedules-sensors/", "title": "Run Requests"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/types", "Types", "N", "next"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/dynamic.rst.txt", "title": "Dynamic Mapping & Collect", "toc": "\n"}, "errors": {"alabaster_version": "0.7.13", "body": "
\n

Errors\u00b6

\n

Core Dagster error classes.

\n

All errors thrown by the Dagster framework inherit from DagsterError. Users\nshould not subclass this base class for their own exceptions.

\n

There is another exception base class, DagsterUserCodeExecutionError, which is\nused by the framework in concert with the user_code_error_boundary().

\n

Dagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\nDagsterUserCodeExecutionError.

\n

The wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.

\n
\n
\nexception dagster.DagsterError[source]\u00b6
\n

Base class for all errors thrown by the Dagster framework.

\n

Users should not subclass this base class for their own exceptions.

\n
\n
\nproperty is_user_code_error\u00b6
\n

Returns true if this error is attributable to user code.

\n
\n\n
\n\n
\n
\nexception dagster.DagsterConfigMappingFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates that an unexpected error occurred while executing the body of a config mapping\nfunction defined in a JobDefinition or ~dagster.GraphDefinition during\nconfig parsing.

\n
\n\n
\n
\nexception dagster.DagsterEventLogInvalidForRun(run_id)[source]\u00b6
\n

Raised when the event logs for a historical run are malformed or invalid.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepExecutionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of an execution step.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when the user specifies execution step keys that do not exist.

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigError(preamble, errors, config_value, *args, **kwargs)[source]\u00b6
\n

Thrown when provided config is invalid (does not type check against the relevant config\nschema).

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigDefinitionError(original_root, current_value, stack, reason=None, **kwargs)[source]\u00b6
\n

Indicates that you have attempted to construct a config with an invalid value.

\n
\n
Acceptable values for config types are any of:
    \n
  1. \n
    A Python primitive type that resolves to a Dagster config type

    (python:int, python:float, python:bool,\npython:str, or python:list).

    \n
    \n
    \n
  2. \n
  3. \n
    A Dagster config type: Int, Float,

    Bool, String,\nStringSource, Any,\nArray, Noneable, Enum,\nSelector, Shape, or\nPermissive.

    \n
    \n
    \n
  4. \n
  5. \n
    A bare python dictionary, which will be automatically wrapped in

    Shape. Values of the dictionary are resolved recursively\naccording to the same rules.

    \n
    \n
    \n
  6. \n
  7. \n
    A bare python list of length one which itself is config type.

    Becomes Array with list element as an argument.

    \n
    \n
    \n
  8. \n
  9. An instance of Field.

  10. \n
\n
\n
\n
\n\n
\n
\nexception dagster.DagsterInvalidDefinitionError[source]\u00b6
\n

Indicates that the rules for a definition have been violated by the user.

\n
\n\n
\n
\nexception dagster.DagsterInvalidSubsetError[source]\u00b6
\n

Indicates that a subset of a job is invalid because either:\n- One or more ops in the specified subset do not exist on the job.\n- The subset produces an invalid job.

\n
\n\n
\n
\nexception dagster.DagsterInvariantViolationError[source]\u00b6
\n

Indicates the user has violated a well-defined invariant that can only be enforced\nat runtime.

\n
\n\n
\n
\nexception dagster.DagsterResourceFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of the resource_fn in a\nResourceDefinition during resource initialization.

\n
\n\n
\n
\nexception dagster.DagsterRunNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when a run cannot be found in run storage.

\n
\n\n
\n
\nexception dagster.DagsterStepOutputNotFoundError(*args, **kwargs)[source]\u00b6
\n

Indicates that previous step outputs required for an execution step to proceed are not\navailable.

\n
\n\n
\n
\nexception dagster.DagsterSubprocessError(*args, **kwargs)[source]\u00b6
\n

An exception has occurred in one or more of the child processes dagster manages.\nThis error forwards the message and stack trace for all of the collected errors.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckDidNotPass(description=None, metadata=None, dagster_type=None)[source]\u00b6
\n

Indicates that a type check failed.

\n

This is raised when raise_on_error is True in calls to the synchronous job and\ngraph execution APIs (e.g. graph.execute_in_process(), job.execute_in_process() \u2013 typically\nwithin a test), and a DagsterType\u2019s type check fails by returning either\nFalse or an instance of TypeCheck whose success member is False.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckError(*args, **kwargs)[source]\u00b6
\n

Indicates an error in the op type system at runtime. E.g. an op receives an\nunexpected input, or produces an output that does not match the type of the output definition.

\n
\n\n
\n
\nexception dagster.DagsterUnknownResourceError(resource_name, *args, **kwargs)[source]\u00b6
\n

Indicates that an unknown resource was accessed in the body of an execution step. May often\nhappen by accessing a resource in the compute function of an op without first supplying the\nop with the correct required_resource_keys argument.

\n
\n\n
\n
\nexception dagster.DagsterUnmetExecutorRequirementsError[source]\u00b6
\n

Indicates the resolved executor is incompatible with the state of other systems\nsuch as the DagsterInstance or system storage configuration.

\n
\n\n
\n
\nexception dagster.DagsterUserCodeExecutionError(*args, **kwargs)[source]\u00b6
\n

This is the base class for any exception that is meant to wrap a\npython:Exception thrown by user code. It wraps that existing user code exception.\nThe original_exc_info argument to the constructor is meant to be a tuple of the type\nreturned by sys.exc_info at the call site of the constructor.

\n

Users should not subclass this base class for their own exceptions and should instead throw\nfreely from user code. User exceptions will be automatically wrapped and rethrown.

\n
\n
\nproperty is_user_code_error\u00b6
\n

Returns true if this error is attributable to user code.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/errors", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../execution/", "title": "Execution"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../config/", "title": "Config"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/execution", "Execution", "N", "next"], ["sections/api/apidocs/config", "Config", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/errors.rst.txt", "title": "Errors", "toc": "\n"}, "execution": {"alabaster_version": "0.7.13", "body": "
\n

Execution\u00b6

\n
\n

Materializing Assets\u00b6

\n
\n
\ndagster.materialize(assets, run_config=None, instance=None, resources=None, partition_key=None, raise_on_error=True, tags=None, selection=None)[source]\u00b6
\n

Executes a single-threaded, in-process run which materializes provided assets.

\n

By default, will materialize assets to the local filesystem.

\n
\n
Parameters:
\n
    \n
  • assets (Sequence[Union[AssetsDefinition, SourceAsset]]) \u2013

    The assets to materialize.

    \n

    Unless you\u2019re using deps or non_argument_deps, you must also include all assets that are\nupstream of the assets that you want to materialize. This is because those upstream\nasset definitions have information that is needed to load their contents while\nmaterializing the downstream assets.

    \n

    You can use the selection argument to distinguish between assets that you want to\nmaterialize and assets that are just present for loading.

    \n

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 The resources needed for execution. Can provide resource instances\ndirectly, or resource definitions. Note that if provided resources\nconflict with resources directly on assets, an error will be thrown.

  • \n
  • run_config (Optional[Any]) \u2013 The run config to use for the run that materializes the assets.

  • \n
  • partition_key (Optional[str]) \u2013 The string partition key that specifies the run config to execute. Can only be used\nto select run config for assets with partitioned config.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 Tags for the run.

  • \n
  • selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]) \u2013

    A sub-selection of assets to materialize.

    \n

    If not provided, then all assets will be materialized.

    \n

    If providing a string or sequence of strings,\nhttps://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\nsyntax.

    \n

  • \n
\n
\n
Returns:
\n

The result of the execution.

\n
\n
Return type:
\n

ExecuteInProcessResult

\n
\n
\n

Examples

\n
@asset\ndef asset1():\n    ...\n\n@asset\ndef asset2(asset1):\n    ...\n\n# executes a run that materializes asset1 and then asset2\nmaterialize([asset1, asset2])\n\n# executes a run that materializes just asset2, loading its input from asset1\nmaterialize([asset1, asset2], selection=[asset2])\n
\n
\n
\n\n
\n
\ndagster.materialize_to_memory(assets, run_config=None, instance=None, resources=None, partition_key=None, raise_on_error=True, tags=None, selection=None)[source]\u00b6
\n

Executes a single-threaded, in-process run which materializes provided assets in memory.

\n

Will explicitly use mem_io_manager() for all required io manager\nkeys. If any io managers are directly provided using the resources\nargument, a DagsterInvariantViolationError will be thrown.

\n
\n
Parameters:
\n
    \n
  • assets (Sequence[Union[AssetsDefinition, SourceAsset]]) \u2013 The assets to materialize. Can also provide SourceAsset objects to fill dependencies for asset defs.

  • \n
  • run_config (Optional[Any]) \u2013 The run config to use for the run that materializes the assets.

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 The resources needed for execution. Can provide resource instances\ndirectly, or resource definitions. If provided resources\nconflict with resources directly on assets, an error will be thrown.

  • \n
  • partition_key (Optional[str]) \u2013 The string partition key that specifies the run config to execute. Can only be used\nto select run config for assets with partitioned config.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 Tags for the run.

  • \n
  • selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]) \u2013

    A sub-selection of assets to materialize.

    \n

    If not provided, then all assets will be materialized.

    \n

    If providing a string or sequence of strings,\nhttps://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\nsyntax.

    \n

  • \n
\n
\n
Returns:
\n

The result of the execution.

\n
\n
Return type:
\n

ExecuteInProcessResult

\n
\n
\n

Examples

\n
@asset\ndef asset1():\n    ...\n\n@asset\ndef asset2(asset1):\n    ...\n\n# executes a run that materializes asset1 and then asset2\nmaterialize_to_memory([asset1, asset2])\n\n# executes a run that materializes just asset1\nmaterialize_to_memory([asset1, asset2], selection=[asset1])\n
\n
\n
\n\n
\n
\n

Executing Jobs\u00b6

\n
\n
\nclass dagster.JobDefinition(*, graph_def, resource_defs=None, executor_def=None, logger_defs=None, name=None, config=None, description=None, partitions_def=None, tags=None, metadata=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _subset_selection_data=None, asset_layer=None, input_values=None, _was_explicitly_provided_resources=None)[source]
\n

Defines a Dagster job.

\n
\n
\nproperty config_mapping
\n

The config mapping for the job, if it has one.

\n

A config mapping defines a way to map a top-level config schema to run config for the job.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, asset_selection=None, run_id=None, input_values=None, tags=None, resources=None)[source]
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
\n
Parameters:
\n
    \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 The configuration for the run

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key (Optional[str]) \u2013 The string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[Sequence[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the job. Input values provided here will override input values that have been provided to the job directly.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty executor_def
\n

Returns the default ExecutorDefinition for the job.

\n

If the user has not specified an executor definition, then this will default to the multi_or_in_process_executor(). If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty has_specified_executor
\n

Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty has_specified_loggers
\n

Returns true if the job explicitly set loggers, and False if loggers were inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty loggers
\n

Returns the set of LoggerDefinition objects specified on the job.

\n

If the user has not specified a mapping of LoggerDefinition objects, then this will default to the colored_console_logger() under the key console. If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty partitioned_config
\n

The partitioned config for the job, if it has one.

\n

A partitioned config defines a way to map partition keys to run config for the job.

\n
\n\n
\n
\nproperty partitions_def
\n

Returns the PartitionsDefinition for the job, if it has one.

\n

A partitions definition defines the set of partition keys the job operates on.

\n
\n\n
\n
\nproperty resource_defs
\n

Returns the set of ResourceDefinition objects specified on the job.

\n

This may not be the complete set of resources required by the job, since those can also be provided on the Definitions object the job may be provided to.

\n
\n\n
\n
\nrun_request_for_partition(partition_key, run_key=None, tags=None, asset_selection=None, run_config=None, current_time=None, dynamic_partitions_store=None)[source]
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0.0. Directly instantiate RunRequest(partition_key=...) instead.\n \n

\n

Creates a RunRequest object for a run that processes the given partition.

\n
\n
Parameters:
\n
    \n
  • partition_key \u2013 The key of the partition to request a run for.

  • \n
  • run_key (Optional[str]) \u2013 A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the launched run.

  • \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Configuration for the run. If the job has\na PartitionedConfig, this value will replace the config\nprovided by it.

  • \n
  • current_time (Optional[datetime]) \u2013 Used to determine which time-partitions exist.\nDefaults to now.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

an object that requests a run to process the given partition.

\n
\n
Return type:
\n

RunRequest

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]
\n

Apply a set of hooks to all op instances within the job.

\n
\n\n
\n
\nwith_top_level_resources(resource_defs)[source]
\n

Apply a set of resources to all op instances within the job.

\n
\n\n
\n\n
\n
\ndagster.execute_job(job, instance, run_config=None, tags=None, raise_on_error=False, op_selection=None, reexecution_options=None, asset_selection=None)[source]\u00b6
\n

Execute a job synchronously.

\n

This API represents dagster\u2019s python entrypoint for out-of-process\nexecution. For most testing purposes, \nexecute_in_process() will be more suitable, but when wanting to run\nexecution using an out-of-process executor (such as dagster.\nmultiprocess_executor), then execute_job is suitable.

\n

execute_job expects a persistent DagsterInstance for\nexecution, meaning the $DAGSTER_HOME environment variable must be set.\nIt also expects a reconstructable pointer to a JobDefinition so\nthat it can be reconstructed in separate processes. This can be done by\nwrapping the JobDefinition in a call to dagster.\nreconstructable().

\n
from dagster import DagsterInstance, execute_job, job, reconstructable\n\n@job\ndef the_job():\n    ...\n\ninstance = DagsterInstance.get()\nresult = execute_job(reconstructable(the_job), instance=instance)\nassert result.success\n
\n
\n

If using the to_job() method to\nconstruct the JobDefinition, then the invocation must be wrapped in a\nmodule-scope function, which can be passed to reconstructable.

\n
from dagster import graph, reconstructable\n\n@graph\ndef the_graph():\n    ...\n\ndef define_job():\n    return the_graph.to_job(...)\n\nresult = execute_job(reconstructable(define_job), ...)\n
\n
\n

Since execute_job is potentially executing outside of the current\nprocess, output objects need to be retrieved by use of the provided job\u2019s\nio managers. Output objects can be retrieved by opening the result of\nexecute_job as a context manager.

\n
from dagster import execute_job\n\nwith execute_job(...) as result:\n    output_obj = result.output_for_node("some_op")\n
\n
\n

execute_job can also be used to reexecute a run, by providing a ReexecutionOptions object.

\n
from dagster import DagsterInstance, ReexecutionOptions, execute_job, reconstructable\n\ninstance = DagsterInstance.get()\n\noptions = ReexecutionOptions.from_failure(run_id=failed_run_id, instance=instance)\nexecute_job(reconstructable(job), instance, reexecution_options=options)\n
\n
\n
\n
Parameters:
\n
    \n
  • job (ReconstructableJob) \u2013 A reconstructable pointer to a JobDefinition.

  • \n
  • instance (DagsterInstance) \u2013 The instance to execute against.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run, as a dict.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to run logs.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to False.

  • \n
  • op_selection (Optional[List[str]]) \u2013

    A list of op selection queries (including single\nop names) to execute. For example:

    \n
      \n
    • ['some_op']: selects some_op itself.

    • \n
    • ['*some_op']: select some_op and all its ancestors (upstream dependencies).

    • \n
    • ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

    • \n
    \n

  • \n
  • reexecution_options (Optional[ReexecutionOptions]) \u2013 Reexecution options to provide to the run, if this run is\nintended to be a reexecution of a previous run. Cannot be used in\ntandem with the op_selection argument.

  • \n
\n
\n
Returns:
\n

The result of job execution.

\n
\n
Return type:
\n

JobExecutionResult

\n
\n
\n
\n\n
\n
\nclass dagster.ReexecutionOptions(parent_run_id, step_selection=[])[source]\u00b6
\n

Reexecution options for python-based execution in Dagster.

\n
\n
Parameters:
\n
    \n
  • parent_run_id (str) \u2013 The run_id of the run to reexecute.

  • \n
  • step_selection (Sequence[str]) \u2013

    The list of step selections to reexecute. Must be a subset or match of the\nset of steps executed in the original run. For example:

    \n
      \n
    • ['some_op']: selects some_op itself.

    • \n
    • ['*some_op']: select some_op and all its ancestors (upstream dependencies).

    • \n
    • ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

    • \n
    \n

  • \n
\n
\n
\n
\n\n
\n
\ndagster.instance_for_test(overrides=None, set_dagster_home=True, temp_dir=None)[source]\u00b6
\n

Creates a persistent DagsterInstance available within a context manager.

\n

When a context manager is opened, if no temp_dir parameter is set, a new\ntemporary directory will be created for the duration of the context\nmanager\u2019s opening. If the set_dagster_home parameter is set to True\n(True by default), the $DAGSTER_HOME environment variable will be\noverridden to be this directory (or the directory passed in by temp_dir)\nfor the duration of the context manager being open.

\n
\n
Parameters:
\n
    \n
  • overrides (Optional[Mapping[str, Any]]) \u2013 Config to provide to instance (config format follows that typically found in an instance.yaml file).

  • \n
  • set_dagster_home (Optional[bool]) \u2013 If set to True, the $DAGSTER_HOME environment variable will be\noverridden to be the directory used by this instance for the\nduration that the context manager is open. Upon the context\nmanager closing, the $DAGSTER_HOME variable will be re-set to the original value. (Defaults to True).

  • \n
  • temp_dir (Optional[str]) \u2013 The directory to use for storing local artifacts produced by the\ninstance. If not set, a temporary directory will be created for\nthe duration of the context manager being open, and all artifacts\nwill be torn down afterward.

  • \n
\n
\n
\n
\n\n
\n
\n

Executing Graphs\u00b6

\n
\n
\nclass dagster.GraphDefinition(name, *, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, node_input_source_assets=None, **kwargs)[source]
\n

Defines a Dagster op graph.

\n

An op graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the job.

  • \n
  • node_defs (Optional[Sequence[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[Sequence[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[Sequence[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nalias(name)[source]
\n

Aliases the graph with a new name.

\n

Can only be used in the context of a @graph, @job, or @graph_asset decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.alias("my_graph_alias")\n
\n
\n
\n
\n
\n\n
\n
\nproperty config_mapping
\n

The config mapping for the graph, if present.

\n

By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None, input_values=None)[source]
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters:
\n
    \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the graph.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty input_mappings
\n

Input mappings for the graph.

\n

An input mapping is a mapping from an input of the graph to an input of a child node.

\n
\n\n
\n
\nproperty name
\n

The name of the graph.

\n
\n\n
\n
\nproperty output_mappings
\n

Output mappings for the graph.

\n

An output mapping is a mapping from an output of the graph to an output of a child node.

\n
\n\n
\n
\ntag(tags)[source]
\n

Attaches the provided tags to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @graph_asset decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.tag({"my_tag": "my_value"})\n
\n
\n
\n
\n
\n\n
\n
\nproperty tags
\n

The tags associated with the graph.

\n
\n\n
\n
\nto_job(name=None, description=None, resource_defs=None, config=None, tags=None, metadata=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, op_selection=None, partitions_def=None, asset_layer=None, input_values=None, _asset_selection_data=None)[source]
\n

Make this graph into an executable Job by providing remaining components required for execution.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Mapping [str, object]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, and the ConfigMapping, which should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagster UI, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Mapping[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Mapping[str, RawMetadataValue]]) \u2013 Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\nKeys must be strings, and values must be python primitive types or one of the provided\nMetadataValue types

  • \n
  • logger_defs (Optional[Mapping[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition\nkeys that can parameterize the job. If this argument is supplied, the config\nargument can\u2019t also be supplied.

  • \n
  • asset_layer (Optional[AssetLayer]) \u2013 Top level information about the assets this job\nwill produce. Generally should not be set manually.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
Returns:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]
\n

Attaches the provided hooks to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @graph_asset decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_hooks({my_hook})\n
\n
\n
\n
\n
\n\n
\n
\nwith_retry_policy(retry_policy)[source]
\n

Attaches the provided retry policy to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n
\n
\n
\n
\n
\n\n
\n\n
\n
\n

Execution results\u00b6

\n
\n
\nclass dagster.ExecuteInProcessResult(event_list, dagster_run, output_capture, job_def)[source]\u00b6
\n

Result object returned by in-process testing APIs.

\n

Users should not instantiate this object directly. Used for retrieving run success, events, and outputs from execution methods that return this object.

\n

This object is returned by:\n- dagster.GraphDefinition.execute_in_process()\n- dagster.JobDefinition.execute_in_process()\n- dagster.materialize_to_memory()\n- dagster.materialize()

\n
\n
\nproperty all_events\u00b6
\n

All dagster events emitted during execution.

\n
\n
Type:
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nasset_value(asset_key)[source]\u00b6
\n

Retrieves the value of an asset that was materialized during the execution of the job.

\n
\n
Parameters:
\n

asset_key (CoercibleToAssetKey) \u2013 The key of the asset to retrieve.

\n
\n
Returns:
\n

The value of the retrieved asset.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

The Dagster run that was executed.

\n
\n
Type:
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The job definition that was executed.

\n
\n
Type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\noutput_for_node(node_str, output_name='result')[source]\u00b6
\n

Retrieves output value with a particular name from the in-process run of the job.

\n
\n
Parameters:
\n
    \n
  • node_str (str) \u2013 Name of the op/graph whose output should be retrieved. If the intended\ngraph/op is nested within another graph, the syntax is outer_graph.inner_node.

  • \n
  • output_name (Optional[str]) \u2013 Name of the output on the op/graph to retrieve. Defaults to\nresult, the default output name in dagster.

  • \n
\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Retrieves output of top-level job, if an output is returned.

\n
\n
Parameters:
\n

output_name (Optional[str]) \u2013 The name of the output to retrieve. Defaults to result,\nthe default output name in dagster.

\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The run ID of the executed DagsterRun.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.JobExecutionResult(job_def, reconstruct_context, event_list, dagster_run)[source]\u00b6
\n

Result object returned by dagster.execute_job().

\n

Used for retrieving run success, events, and outputs from execute_job.\nUsers should not directly instantiate this class.

\n

Events and run information can be retrieved off of the object directly. In\norder to access outputs, the JobExecutionResult object needs to be opened\nas a context manager, which will re-initialize the resources from\nexecution.

\n
\n
\nproperty all_events\u00b6
\n

List of all events yielded by the job execution.

\n
\n
Type:
\n

Sequence[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

The Dagster run that was executed.

\n
\n
Type:
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The job definition that was executed.

\n
\n
Type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\noutput_for_node(node_str, output_name='result')[source]\u00b6
\n

Retrieves output value with a particular name from the run of the job.

\n

In order to use this method, the JobExecutionResult object must be opened as a context manager. If this method is used without opening the context manager, it will result in a DagsterInvariantViolationError.

\n
\n
Parameters:
\n
    \n
  • node_str (str) \u2013 Name of the op/graph whose output should be retrieved. If the intended\ngraph/op is nested within another graph, the syntax is outer_graph.inner_node.

  • \n
  • output_name (Optional[str]) \u2013 Name of the output on the op/graph to retrieve. Defaults to\nresult, the default output name in dagster.

  • \n
\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Retrieves output of top-level job, if an output is returned.

\n

In order to use this method, the JobExecutionResult object must be opened as a context manager. If this method is used without opening the context manager, it will result in a DagsterInvariantViolationError. If the top-level job has no output, calling this method will also result in a DagsterInvariantViolationError.

\n
\n
Parameters:
\n

output_name (Optional[str]) \u2013 The name of the output to retrieve. Defaults to result,\nthe default output name in dagster.

\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the Dagster run that was executed.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEvent(event_type_value, job_name, step_handle=None, node_handle=None, step_kind_value=None, logging_tags=None, event_specific_data=None, message=None, pid=None, step_key=None)[source]\u00b6
\n

Events yielded by op and job execution.

\n

Users should not instantiate this class.

\n
\n
\nevent_type_value\u00b6
\n

Value for a DagsterEventType.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nnode_handle\u00b6
\n
\n
Type:
\n

NodeHandle

\n
\n
\n
\n\n
\n
\nstep_kind_value\u00b6
\n

Value for a StepKind.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nlogging_tags\u00b6
\n
\n
Type:
\n

Dict[str, str]

\n
\n
\n
\n\n
\n
\nevent_specific_data\u00b6
\n

Type must correspond to event_type_value.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nmessage\u00b6
\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\npid\u00b6
\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nstep_key\u00b6
\n

DEPRECATED

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty asset_key\u00b6
\n

For events that correspond to a specific asset_key / partition\n(ASSET_MATERIALIZATION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\nasset key. Otherwise, returns None.

\n
\n
Type:
\n

Optional[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty event_type\u00b6
\n

The type of this event.

\n
\n
Type:
\n

DagsterEventType

\n
\n
\n
\n\n
\n
\nproperty is_asset_materialization_planned\u00b6
\n

If this event is of type ASSET_MATERIALIZATION_PLANNED.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_asset_observation\u00b6
\n

If this event is of type ASSET_OBSERVATION.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_engine_event\u00b6
\n

If this event is of type ENGINE_EVENT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_expectation_result\u00b6
\n

If this event is of type STEP_EXPECTATION_RESULT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_failure\u00b6
\n

If this event represents the failure of a run or step.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_handled_output\u00b6
\n

If this event is of type HANDLED_OUTPUT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_hook_event\u00b6
\n

If this event relates to the execution of a hook.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_loaded_input\u00b6
\n

If this event is of type LOADED_INPUT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_resource_init_failure\u00b6
\n

If this event is of type RESOURCE_INIT_FAILURE.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_event\u00b6
\n

If this event relates to a specific step.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_failure\u00b6
\n

If this event is of type STEP_FAILURE.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_materialization\u00b6
\n

If this event is of type ASSET_MATERIALIZATION.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_restarted\u00b6
\n

If this event is of type STEP_RESTARTED.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_skipped\u00b6
\n

If this event is of type STEP_SKIPPED.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_start\u00b6
\n

If this event is of type STEP_START.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_success\u00b6
\n

If this event is of type STEP_SUCCESS.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_up_for_retry\u00b6
\n

If this event is of type STEP_UP_FOR_RETRY.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_successful_output\u00b6
\n

If this event is of type STEP_OUTPUT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty partition\u00b6
\n

For events that correspond to a specific asset_key / partition\n(ASSET_MATERIALIZATION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\npartition. Otherwise, returns None.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEventType(value)[source]\u00b6
\n

The types of events that may be yielded by op and job execution.

\n
\n\n
\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]\u00b6
\n

Create a ReconstructableJob from a\nfunction that returns a JobDefinition,\nor a function decorated with @job.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job() to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs,\nsuch as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\n

Executors\u00b6

\n
\n
\ndagster.multi_or_in_process_executor ExecutorDefinition[source]\u00b6
\n

The default executor for a job.

\n

This is the executor available by default on a JobDefinition\nthat does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\nsingle-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\nmode and in-process mode can be achieved via config.

\n
execution:\n  config:\n    multiprocess:\n\n\nexecution:\n  config:\n    in_process:\n
\n
\n

When using the multiprocess mode, max_concurrent and retries can also be configured.

\n
execution:\n  config:\n    multiprocess:\n      max_concurrent: 4\n      retries:\n        enabled:\n
\n
\n

The max_concurrent arg is optional and tells the execution engine how many processes may run\nconcurrently. By default, or if you set max_concurrent to be 0, this is the return value of\nmultiprocessing.cpu_count().

\n

When using the in_process mode, then only retries can be configured.

\n

Execution priority can be configured using the dagster/priority tag via op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\ndagster.in_process_executor ExecutorDefinition[source]\u00b6
\n

The in-process executor executes all steps in a single process.

\n

To select it, include the following top-level fragment in config:

\n
execution:\n  in_process:\n
\n
\n

Execution priority can be configured using the dagster/priority tag via op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\ndagster.multiprocess_executor ExecutorDefinition[source]\u00b6
\n

The multiprocess executor executes each step in an individual process.

\n

Any job that does not specify custom executors will use the multiprocess_executor by default.\nTo configure the multiprocess executor, include a fragment such as the following in your run\nconfig:

\n
execution:\n  config:\n    multiprocess:\n      max_concurrent: 4\n
\n
\n

The max_concurrent arg is optional and tells the execution engine how many processes may run\nconcurrently. By default, or if you set max_concurrent to be None or 0, this is the return value of\nmultiprocessing.cpu_count().

\n

Execution priority can be configured using the dagster/priority tag via op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\n

Contexts\u00b6

\n
\n
\nclass dagster.AssetExecutionContext(step_execution_context)[source]\u00b6
\n
\n\n
\n
\nclass dagster.OpExecutionContext(step_execution_context)[source]\u00b6
\n

The context object that can be made available as the first argument to the function\nused for computing an op or asset.

\n

This context object provides system information such as resources, config, and logging.

\n

To construct an execution context for testing purposes, use dagster.build_op_context().

\n

Example

\n
from dagster import op, OpExecutionContext\n\n@op\ndef hello_world(context: OpExecutionContext):\n    context.log.info("Hello, world!")\n
\n
\n
\n
\nadd_output_metadata(metadata, output_name=None, mapping_key=None)[source]\u00b6
\n

Add metadata to one of the outputs of an op.

\n

This can be invoked multiple times per output in the body of an op. If the same key is\npassed multiple times, the value associated with the last call will be used.

\n
\n
Parameters:
\n
    \n
  • metadata (Mapping[str, Any]) \u2013 The metadata to attach to the output

  • \n
  • output_name (Optional[str]) \u2013 The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.

  • \n
  • mapping_key (Optional[str]) \u2013 The mapping key of the output to attach metadata to. If the\noutput is not dynamic, this argument does not need to be provided.

  • \n
\n
\n
\n

Examples:

\n
from dagster import Out, op\nfrom typing import Tuple\n\n@op\ndef add_metadata(context):\n    context.add_output_metadata({"foo": "bar"})\n    return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n@op(out={"a": Out(), "b": Out()})\ndef add_metadata_two_outputs(context) -> Tuple[str, int]:\n    context.add_output_metadata({"foo": "bar"}, output_name="b")\n    context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n    return ("dog", 5)\n
\n
\n
\n\n
\n
\nproperty asset_checks_def\u00b6
\n

The backing AssetChecksDefinition for what is currently executing, errors if not\navailable.

\n
\n
Returns:
\n

AssetChecksDefinition.

\n
\n
\n
\n\n
\n
\nproperty asset_key\u00b6
\n

The AssetKey for the current asset. In a multi_asset, use asset_key_for_output instead.

\n
\n\n
\n
\nasset_key_for_input(input_name)[source]\u00b6
\n

Return the AssetKey for the corresponding input.

\n
\n\n
\n
\nasset_key_for_output(output_name='result')[source]\u00b6
\n

Return the AssetKey for the corresponding output.

\n
\n\n
\n
\nasset_partition_key_for_input(input_name)[source]\u00b6
\n

Returns the partition key of the upstream asset corresponding to the given input.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the partition key for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_key_for_input("upstream_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_for_input("self_dependent_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-20"\n
\n
\n
\n\n
\n
\nasset_partition_key_for_output(output_name='result')[source]\u00b6
\n

Returns the asset partition key for the given output.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the partition key for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_for_output())\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_for_output("first_asset"))\n    context.log.info(context.asset_partition_key_for_output("second_asset"))\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n#   "2023-08-21"\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_for_output())\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n
\n
\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use partition_key_range instead.\n \n

\n

The range of partition keys for the current run.

\n

If run is for a single partition key, return a PartitionKeyRange with the same start and\nend. Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nasset_partition_key_range_for_input(input_name)[source]\u00b6
\n

Return the PartitionKeyRange for the corresponding input. Errors if the asset depends on a\nnon-contiguous chunk of the input.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_key_range_for_input to get the range of partitions keys of the input that\nare relevant to that backfill.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the partition key range for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n@asset(\n    ins={\n        "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    },\n    partitions_def=partitions_def,\n)\ndef another_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_range_for_input("self_dependent_asset"))\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n
\n
\n
\n\n
\n
\nasset_partition_key_range_for_output(output_name='result')[source]\u00b6
\n

Return the PartitionKeyRange for the corresponding output. Errors if the run is not partitioned.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_key_range_for_output to get all of the partitions being materialized\nby the backfill.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the partition key range for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_range_for_output())\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_range_for_output("first_asset"))\n    context.log.info(context.asset_partition_key_range_for_output("second_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_range_for_output())\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n
\n
\n
\n\n
\n
\nasset_partition_keys_for_input(input_name)[source]\u00b6
\n

Returns a list of the partition keys of the upstream asset corresponding to the\ngiven input.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_keys_for_input to get all of the partition keys of the input that\nare relevant to that backfill.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the partition keys for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n@asset(\n    ins={\n        "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    },\n    partitions_def=partitions_def,\n)\ndef another_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_keys_for_input("self_dependent_asset"))\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n
\n
\n
\n\n
\n
\nasset_partition_keys_for_output(output_name='result')[source]\u00b6
\n

Returns a list of the partition keys for the given output.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_keys_for_output to get all of the partitions being materialized\nby the backfill.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the partition keys for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_keys_for_output())\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_keys_for_output("first_asset"))\n    context.log.info(context.asset_partition_keys_for_output("second_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_keys_for_output())\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n
\n
\n
\n\n
\n
\nasset_partitions_def_for_input(input_name)[source]\u00b6
\n

The PartitionsDefinition on the upstream asset corresponding to this input.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the PartitionsDefinition for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partitions_def_for_input("upstream_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   DailyPartitionsDefinition("2023-08-20")\n
\n
\n
\n\n
\n
\nasset_partitions_def_for_output(output_name='result')[source]\u00b6
\n

The PartitionsDefinition on the asset corresponding to this output.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the PartitionsDefinition for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_def_for_output())\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   DailyPartitionsDefinition("2023-08-20")\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_def_for_output("first_asset"))\n    context.log.info(context.asset_partitions_def_for_output("second_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   DailyPartitionsDefinition("2023-08-20")\n#   DailyPartitionsDefinition("2023-08-20")\n
\n
\n
\n\n
\n
\nasset_partitions_time_window_for_input(input_name='result')[source]\u00b6
\n

The time window for the partitions of the input asset.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partitions_time_window_for_input to get the time window of the input that\nare relevant to that backfill.

\n

Raises an error if either of the following are true:\n- The input asset has no partitioning.\n- The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\nMultiPartitionsDefinition with one time-partitioned dimension.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the time window for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n\n\n@asset(\n    ins={\n        "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    },\n    partitions_def=partitions_def,\n)\ndef another_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-21")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-25")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partitions_time_window_for_input("self_dependent_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-21")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-25")\n
\n
\n
\n\n
\n
\nasset_partitions_time_window_for_output(output_name='result')[source]\u00b6
\n

The time window for the partitions of the output asset.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partitions_time_window_for_output to get the TimeWindow of all of the partitions\nbeing materialized by the backfill.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\nMultiPartitionsDefinition with one time-partitioned dimension.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the time window for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_time_window_for_output())\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_time_window_for_output("first_asset"))\n    context.log.info(context.asset_partitions_time_window_for_output("second_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n#   TimeWindow("2023-08-21", "2023-08-26")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partitions_time_window_for_output())\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n
\n
\n
\n\n
\n
\nproperty assets_def\u00b6
\n

The backing AssetsDefinition for what is currently executing, errors if not available.

\n
\n\n
\n
\nget_asset_provenance(asset_key)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Return the provenance information for the most recent materialization of an asset.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 Key of the asset for which to retrieve provenance.

\n
\n
Returns:
\n

\n
Provenance information for the most recent

materialization of the asset. Returns None if the asset was never materialized or\nthe materialization record is too old to contain provenance information.

\n
\n
\n

\n
\n
Return type:
\n

Optional[DataProvenance]

\n
\n
\n
\n\n
\n
\nget_mapping_key()[source]\u00b6
\n

Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.

\n
\n\n
\n
\nget_tag(key)[source]\u00b6
\n

Get a logging tag.

\n
\n
Parameters:
\n

key (str) \u2013 The tag to get.

\n
\n
Returns:
\n

The value of the tag, if present.

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty has_asset_checks_def\u00b6
\n

Return a boolean indicating the presence of a backing AssetChecksDefinition\nfor the current execution.

\n
\n
Returns:
\n

True if there is a backing AssetChecksDefinition for the current execution, otherwise False.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty has_assets_def\u00b6
\n

If there is a backing AssetsDefinition for what is currently executing.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run.

\n
\n\n
\n
\nhas_tag(key)[source]\u00b6
\n

Check if a logging tag is set.

\n
\n
Parameters:
\n

key (str) \u2013 The tag to check.

\n
\n
Returns:
\n

Whether the tag is set.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty instance\u00b6
\n

The current Dagster instance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The currently executing job.

\n
\n
Type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the currently executing job.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager available in the execution context.

\n
\n
Type:
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.

\n

Events logged with this method will appear in the list of DagsterEvents, as well as the event log.

\n
\n
Parameters:
\n

event (Union[AssetMaterialization, AssetObservation, ExpectationResult]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import op, AssetMaterialization\n\n@op\ndef log_materialization(context):\n    context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty op_config\u00b6
\n

The parsed config specific to this op.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty op_def\u00b6
\n

The current op definition.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\noutput_for_asset_key(asset_key)[source]\u00b6
\n

Return the output name for the corresponding asset key.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run, or if the current run is operating\nover a range of partitions (i.e. a backfill of several partitions executed in a single run).

\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef my_asset(context: AssetExecutionContext):\n    context.log.info(context.partition_key)\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n
\n
\n
\n\n
\n
\nproperty partition_key_range\u00b6
\n

The range of partition keys for the current run.

\n

If run is for a single partition key, returns a PartitionKeyRange with the same start and\nend. Raises an error if the current run is not a partitioned run.

\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef my_asset(context: AssetExecutionContext):\n    context.log.info(context.partition_key_range)\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n
\n
\n
\n\n
\n
\nproperty partition_time_window\u00b6
\n

The partition time window for the current run.

\n

Raises an error if the current run is not a partitioned run, or if the job\u2019s partition\ndefinition is not a TimeWindowPartitionsDefinition.

\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef my_asset(context: AssetExecutionContext):\n    context.log.info(context.partition_time_window)\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n
\n
\n
\n\n
\n
\nproperty pdb\u00b6
\n

Gives access to pdb debugging from within the op.

\n

Example

\n
@op\ndef debug(context):\n    context.pdb.set_trace()\n
\n
\n
\n
Type:
\n

dagster.utils.forked_pdb.ForkedPdb

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

The currently available resources.

\n
\n
Type:
\n

Resources

\n
\n
\n
\n\n
\n
\nproperty retry_number\u00b6
\n

Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc.

\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run config for the current execution.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the current execution\u2019s run.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty selected_asset_check_keys\u00b6
\n
\n\n
\n
\nproperty selected_asset_keys\u00b6
\n

Get the set of AssetKeys this execution is expected to materialize.

\n
\n\n
\n
\nproperty selected_output_names\u00b6
\n

Get the output names that correspond to the current selection of assets this execution is expected to materialize.

\n
\n\n
\n\n
\n
\ndagster.build_op_context(resources=None, op_config=None, resources_config=None, instance=None, config=None, partition_key=None, partition_key_range=None, mapping_key=None, _assets_def=None)[source]\u00b6
\n

Builds op execution context from provided parameters.

\n

build_op_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_op_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking an op.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • op_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the op.

  • \n
  • resources_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the resources.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
  • mapping_key (Optional[str]) \u2013 A key representing the mapping key from an upstream dynamic\noutput. Can be accessed using context.get_mapping_key().

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
  • partition_key_range (Optional[PartitionKeyRange]) \u2013 Partition key range to execute with.

  • \n
  • _assets_def (Optional[AssetsDefinition]) \u2013 Internal argument that populates the op\u2019s assets\ndefinition, not meant to be populated by users.

  • \n
\n
\n
\n

Examples

\n
context = build_op_context()\nop_to_invoke(context)\n\nwith build_op_context(resources={"foo": context_manager_resource}) as context:\n    op_to_invoke(context)\n
\n
\n
\n\n
\n
\ndagster.build_asset_context(resources=None, resources_config=None, asset_config=None, instance=None, partition_key=None, partition_key_range=None)[source]\u00b6
\n

Builds asset execution context from provided parameters.

\n

build_asset_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_asset_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking an asset.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • resources_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the resources.

  • \n
  • asset_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the asset.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
  • partition_key_range (Optional[PartitionKeyRange]) \u2013 Partition key range to execute with.

  • \n
\n
\n
\n

Examples

\n
context = build_asset_context()\nasset_to_invoke(context)\n\nwith build_asset_context(resources={"foo": context_manager_resource}) as context:\n    asset_to_invoke(context)\n
\n
\n
\n\n
\n
\nclass dagster.TypeCheckContext(run_id, log_manager, scoped_resources_builder, dagster_type)[source]\u00b6
\n

The context object available to a type check function on a DagsterType.

\n
\n
\nproperty log\u00b6
\n

Centralized log dispatch from user code.

\n
\n\n
\n
\nproperty resources\u00b6
\n

An object whose attributes contain the resources available to this op.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of this job run.

\n
\n\n
\n\n
\n
\n

Job configuration\u00b6

\n
\n
\ndagster.validate_run_config(job_def, run_config=None)[source]\u00b6
\n

Function to validate a provided run config blob against a given job.

\n

If validation is successful, this function will return a dictionary representation of the\nvalidated config actually used during execution.

\n
\n
Parameters:
\n
    \n
  • job_def (JobDefinition) \u2013 The job definition to validate run\nconfig against

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The run config to validate

  • \n
\n
\n
Returns:
\n

A dictionary representation of the validated config.

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n

Run Config Schema\u00b6

\n
\n

The run_config used for jobs has the following schema:

\n
{\n  # configuration for execution, required if executors require config\n  execution: {\n    # the name of one, and only one available executor, typically 'in_process' or 'multiprocess'\n    __executor_name__: {\n      # executor-specific config, if required or permitted\n      config: {\n        ...\n      }\n    }\n  },\n\n  # configuration for loggers, required if loggers require config\n  loggers: {\n    # the name of an available logger\n    __logger_name__: {\n      # logger-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for resources, required if resources require config\n  resources: {\n    # the name of a resource\n    __resource_name__: {\n      # resource-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for underlying ops, required if ops require config\n  ops: {\n\n    # these keys align with the names of the ops, or their alias in this job\n    __op_name__: {\n\n      # pass any data that was defined via config_field\n      config: ...,\n\n      # configurably specify input values, keyed by input name\n      inputs: {\n        __input_name__: {\n          # if an dagster_type_loader is specified, that schema must be satisfied here;\n          # scalar, built-in types will generally allow their values to be specified directly:\n          value: ...\n        }\n      },\n\n    }\n  },\n\n}\n
\n
\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/execution", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../graphs/", "title": "Graphs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../errors/", "title": "Errors"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/graphs", "Graphs", "N", "next"], ["sections/api/apidocs/errors", "Errors", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/execution.rst.txt", "title": "Execution", "toc": "\n"}, "graphs": {"alabaster_version": "0.7.13", "body": "
\n

Graphs\u00b6

\n

The core of a job is a _graph_ of ops - connected via data dependencies.

\n
\n
\n@dagster.graph(compose_fn=None, *, name=None, description=None, input_defs=None, output_defs=None, ins=None, out=None, tags=None, config=None)[source]\u00b6
\n

Create an op graph with the specified parameters from the decorated composition function.

\n

Using this decorator allows you to build up a dependency graph by writing a\nfunction that invokes ops (or other graphs) and passes the output to subsequent invocations.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the op graph. Must be unique within any RepositoryDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013

    Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit InputDefinitions taking precedence.

    \n

    Uses of inputs in the body of the decorated composition function will determine\nthe InputMappings passed to the underlying\nGraphDefinition.

    \n

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013

    Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.

    \n

    Uses of these outputs in the body of the decorated composition function, as well as the\nreturn value of the decorated function, will be used to infer the appropriate set of\nOutputMappings for the underlying\nGraphDefinition.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
  • ins (Optional[Dict[str, GraphIn]]) \u2013 Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit GraphIn taking precedence.

  • \n
  • out \u2013

    Information about the outputs that this graph maps. Information provided here will be\ncombined with what can be inferred from the return type signature if the function does\nnot use yield.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.GraphDefinition(name, *, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, node_input_source_assets=None, **kwargs)[source]\u00b6
\n

Defines a Dagster op graph.

\n

An op graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the job.

  • \n
  • node_defs (Optional[Sequence[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[Sequence[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[Sequence[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nalias(name)[source]\u00b6
\n

Aliases the graph with a new name.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.alias("my_graph_alias")\n
\n
\n
\n
\n
\n\n
\n
\nproperty config_mapping\u00b6
\n

The config mapping for the graph, if present.

\n

By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None, input_values=None)[source]\u00b6
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters:
\n
    \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the graph.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty input_mappings\u00b6
\n

Input mappings for the graph.

\n

An input mapping is a mapping from an input of the graph to an input of a child node.

\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the graph.

\n
\n\n
\n
\nproperty output_mappings\u00b6
\n

Output mappings for the graph.

\n

An output mapping is a mapping from an output of the graph to an output of a child node.

\n
\n\n
\n
\ntag(tags)[source]\u00b6
\n

Attaches the provided tags to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.tag({"my_tag": "my_value"})\n
\n
\n
\n
\n
\n\n
\n
\nproperty tags\u00b6
\n

The tags associated with the graph.

\n
\n\n
\n
\nto_job(name=None, description=None, resource_defs=None, config=None, tags=None, metadata=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, op_selection=None, partitions_def=None, asset_layer=None, input_values=None, _asset_selection_data=None)[source]\u00b6
\n

Make this graph into an executable Job by providing remaining components required for execution.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Mapping [str, object]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, and the ConfigMapping, which should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagster UI, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Mapping[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Mapping[str, RawMetadataValue]]) \u2013 Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\nKeys must be strings, and values must be python primitive types or one of the provided\nMetadataValue types

  • \n
  • logger_defs (Optional[Mapping[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition\nkeys that can parameterize the job. If this argument is supplied, the config\nargument can\u2019t also be supplied.

  • \n
  • asset_layer (Optional[AssetLayer]) \u2013 Top level information about the assets this job\nwill produce. Generally should not be set manually.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
Returns:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Attaches the provided hooks to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_hooks({my_hook})\n
\n
\n
\n
\n
\n\n
\n
\nwith_retry_policy(retry_policy)[source]\u00b6
\n

Attaches the provided retry policy to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.GraphIn(description=None)[source]\u00b6
\n

Represents information about an input that a graph maps.

\n
\n
Parameters:
\n

description (Optional[str]) \u2013 Human-readable description of the input.

\n
\n
\n
\n\n
\n
\nclass dagster.GraphOut(description=None)[source]\u00b6
\n

Represents information about the outputs that a graph maps.

\n
\n
Parameters:
\n

description (Optional[str]) \u2013 Human-readable description of the output.

\n
\n
\n
\n\n
\n

Explicit dependencies\u00b6

\n
\n
\nclass dagster.DependencyDefinition(node, output='result', description=None)[source]\u00b6
\n

Represents an edge in the DAG of nodes (ops or graphs) forming a job.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job whose keys represent the dependent node and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_b depends on the output named \u2018result\u2019 of\nop_a, and the output named \u2018other_result\u2019 of graph_a, the structure will look as follows:

\n
dependency_structure = {\n    'my_downstream_op': {\n        'input': DependencyDefinition('my_upstream_op', 'result')\n    }\n    'my_downstream_op': {\n        'input': DependencyDefinition('my_upstream_graph', 'result')\n    }\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    node_b(node_a())\n
\n
\n
\n
Parameters:
\n
    \n
  • node (str) \u2013 The name of the node (op or graph) that is depended on, that is, from which the value\npassed between the two nodes originates.

  • \n
  • output (Optional[str]) \u2013 The name of the output that is depended on. (default: \u201cresult\u201d)

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this dependency.

  • \n
\n
\n
\n
\n
\nis_fan_in()[source]\u00b6
\n

Return True if the dependency is fan-in (always False for DependencyDefinition).

\n
\n\n
\n\n
\n
\nclass dagster.MultiDependencyDefinition(dependencies)[source]\u00b6
\n

Represents a fan-in edge in the DAG of op instances forming a job.

\n

This object is used only when an input of type List[T] is assembled by fanning-in multiple\nupstream outputs of type T.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job whose keys represent the dependent ops or graphs and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_c depends on the outputs named \u2018result\u2019 of\nop_a and op_b, this structure will look as follows:

\n
dependency_structure = {\n    'op_c': {\n        'input': MultiDependencyDefinition(\n            [\n                DependencyDefinition('op_a', 'result'),\n                DependencyDefinition('op_b', 'result')\n            ]\n        )\n    }\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    op_c(op_a(), op_b())\n
\n
\n
\n
Parameters:
\n

dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]) \u2013 List of\nupstream dependencies fanned in to this input.

\n
\n
\n
\n
\nget_dependencies_and_mappings()[source]\u00b6
\n

Return the combined list of dependencies contained by this object, including both DependencyDefinition and MappedInputPlaceholder objects.

\n
\n\n
\n
\nget_node_dependencies()[source]\u00b6
\n

Return the list of DependencyDefinition contained by this object.

\n
\n\n
\n
\nis_fan_in()[source]\u00b6
\n

Return True if the dependency is fan-in (always True for MultiDependencyDefinition).

\n
\n\n
\n\n
\n
\nclass dagster.NodeInvocation(name, alias=None, tags=None, hook_defs=None, retry_policy=None)[source]\u00b6
\n

Identifies an instance of a node in a graph dependency structure.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the node of which this is an instance.

  • \n
  • alias (Optional[str]) \u2013 Name specific to this instance of the node. Necessary when there are\nmultiple instances of the same node.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Optional tags values to extend or override those\nset on the node definition.

  • \n
  • hook_defs (Optional[AbstractSet[HookDefinition]]) \u2013 A set of hook definitions applied to the\nnode instance.

  • \n
\n
\n
\n

Examples:\nIn general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
from dagster import job\n\n@job\ndef my_job():\n    other_name = some_op.alias('other_name')\n    some_graph(other_name(some_op))\n
\n
\n
\n\n
\n
\nclass dagster.OutputMapping(graph_output_name, mapped_node_name, mapped_node_output_name, graph_output_description=None, dagster_type=None, from_dynamic_mapping=False)[source]\u00b6
\n

Defines an output mapping for a graph.

\n
\n
Parameters:
\n
    \n
  • graph_output_name (str) \u2013 Name of the output in the graph being mapped to.

  • \n
  • mapped_node_name (str) \u2013 Name of the node (op/graph) that the output is being mapped from.

  • \n
  • mapped_node_output_name (str) \u2013 Name of the output in the node (op/graph) that is being mapped from.

  • \n
  • graph_output_description (Optional[str]) \u2013 A description of the output in the graph being mapped from.

  • \n
  • from_dynamic_mapping (bool) \u2013 Set to true if the node being mapped to is a mapped dynamic node.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Any defined dagster_type should come from the underlying op Output.) The dagster type of the graph\u2019s output being mapped to.

  • \n
\n
\n
\n

Examples

\n
from dagster import OutputMapping, GraphDefinition, op, graph, GraphOut\n\n@op\ndef emit_five(x):\n    return 5\n\n# The following two graph definitions are equivalent\nGraphDefinition(\n    name="the_graph",\n    node_defs=[emit_five],\n    output_mappings=[\n        OutputMapping(\n            graph_output_name="result", # Default output name\n            mapped_node_name="emit_five",\n            mapped_node_output_name="result"\n        )\n    ]\n)\n\n@graph(out=GraphOut())\ndef the_graph():\n    return emit_five()\n
\n
\n
\n\n
\n
\nclass dagster.InputMapping(graph_input_name, mapped_node_name, mapped_node_input_name, fan_in_index=None, graph_input_description=None, dagster_type=None)[source]\u00b6
\n

Defines an input mapping for a graph.

\n
\n
Parameters:
\n
    \n
  • graph_input_name (str) \u2013 Name of the input in the graph being mapped from.

  • \n
  • mapped_node_name (str) \u2013 Name of the node (op/graph) that the input is being mapped to.

  • \n
  • mapped_node_input_name (str) \u2013 Name of the input in the node (op/graph) that is being mapped to.

  • \n
  • fan_in_index (Optional[int]) \u2013 The index into a fanned-in input, otherwise None.

  • \n
  • graph_input_description (Optional[str]) \u2013 A description of the input in the graph being mapped from.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Any defined dagster_type should come from the upstream op Output.) The dagster type of the graph\u2019s input\nbeing mapped from.

  • \n
\n
\n
\n

Examples

\n
from dagster import InputMapping, GraphDefinition, op, graph\n\n@op\ndef needs_input(x):\n    return x + 1\n\n# The following two graph definitions are equivalent\nGraphDefinition(\n    name="the_graph",\n    node_defs=[needs_input],\n    input_mappings=[\n        InputMapping(\n            graph_input_name="maps_x", mapped_node_name="needs_input",\n            mapped_node_input_name="x"\n        )\n    ]\n)\n\n@graph\ndef the_graph(maps_x):\n    needs_input(maps_x)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/graphs", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../hooks/", "title": "Hooks"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../execution/", "title": "Execution"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/hooks", "Hooks", "N", "next"], ["sections/api/apidocs/execution", "Execution", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/graphs.rst.txt", "title": "Graphs", "toc": "\n"}, "hooks": {"alabaster_version": "0.7.13", "body": "
\n

Hooks\u00b6

\n
\n
\n@dagster.success_hook(hook_fn=None, *, name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step success events with the specified parameters from the decorated function.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@success_hook(required_resource_keys={'slack'})\ndef slack_message_on_success(context):\n    message = 'op {} succeeded'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@success_hook\ndef do_something_on_success(context):\n    do_something()\n
\n
\n
\n\n
\n
\n@dagster.failure_hook(name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step failure events with the specified parameters from the decorated function.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@failure_hook(required_resource_keys={'slack'})\ndef slack_message_on_failure(context):\n    message = 'op {} failed'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@failure_hook\ndef do_something_on_failure(context):\n    do_something()\n
\n
\n
\n\n
\n
\nclass dagster.HookDefinition(*, name, hook_fn, required_resource_keys=None, decorated_fn=None)[source]\u00b6
\n

Define a hook which can be triggered during an op execution (e.g. a callback on the step\nexecution failure event during an op execution).

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of this hook.

  • \n
  • hook_fn (Callable) \u2013 The callback function that will be triggered.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.HookContext(step_execution_context, hook_def)[source]\u00b6
\n

The context object available to a hook function on a DagsterEvent.

\n
\n
\nproperty hook_def\u00b6
\n

The hook that the context object belongs to.

\n
\n\n
\n
\nproperty instance\u00b6
\n

The instance configured to run the current job.

\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the job where this hook is being triggered.

\n
\n\n
\n
\nproperty log\u00b6
\n

Centralized log dispatch from user code.

\n
\n\n
\n
\nproperty op_config\u00b6
\n

The parsed config specific to this op.

\n
\n\n
\n
\nproperty op_exception\u00b6
\n

The thrown exception in a failed op.

\n
\n\n
\n
\nproperty op_output_values\u00b6
\n

Computed output values in an op.

\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

Resources required by this hook.

\n
\n\n
\n
\nproperty resources\u00b6
\n

Resources available in the hook context.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the run where this hook is being triggered.

\n
\n\n
\n
\nproperty step_key\u00b6
\n

The key for the step where this hook is being triggered.

\n
\n\n
\n\n
\n
\ndagster.build_hook_context(resources=None, op=None, run_id=None, job_name=None, op_exception=None, instance=None)[source]\u00b6
\n

Builds hook context from provided parameters.

\n

build_hook_context can be used as either a function or a context manager. If there is a\nprovided resource to build_hook_context that is a context manager, then it must be used as a\ncontext manager. This function can be used to provide the context argument to the invocation of\na hook definition.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can\neither be values or resource definitions.

  • \n
  • op (Optional[OpDefinition, PendingNodeInvocation]) \u2013 The op definition which the\nhook may be associated with.

  • \n
  • run_id (Optional[str]) \u2013 The id of the run in which the hook is invoked (provided for mocking purposes).

  • \n
  • job_name (Optional[str]) \u2013 The name of the job in which the hook is used (provided for mocking purposes).

  • \n
  • op_exception (Optional[Exception]) \u2013 The exception that caused the hook to be triggered.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The Dagster instance configured to run the hook.

  • \n
\n
\n
\n

Examples

\n
context = build_hook_context()\nhook_to_invoke(context)\n\nwith build_hook_context(resources={"foo": context_manager_resource}) as context:\n    hook_to_invoke(context)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/hooks", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../internals/", "title": "Internals"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../graphs/", "title": "Graphs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/internals", "Internals", "N", "next"], ["sections/api/apidocs/graphs", "Graphs", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/hooks.rst.txt", "title": "Hooks", "toc": "\n"}, "internals": {"alabaster_version": "0.7.13", "body": "
\n

Internals\u00b6

\n

Note that APIs imported from Dagster submodules are not considered stable, and are potentially subject to change in the future.

\n

If you find yourself consulting these docs because you are writing custom components and plug-ins,\nplease get in touch with the core team on our Slack.\nWe\u2019re curious what you\u2019re up to, happy to help, excited for new community contributions, and eager\nto make the system as easy to work with as possible \u2013 including for teams who are looking to\ncustomize it.

\n
\n

Executors (Experimental)\u00b6

\n

APIs for constructing custom executors. This is considered advanced experimental usage. Please note that using Dagster-provided executors is considered stable, common usage.

\n
\n
\n@dagster.executor(name=None, config_schema=None, requirements=None)[source]\u00b6
\n

Define an executor.

\n

The decorated function should accept an InitExecutorContext and return an instance\nof Executor.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.executor_config. If not set, Dagster will accept any config provided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular job execution.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ExecutorDefinition(name, config_schema=None, requirements=None, executor_creation_fn=None, description=None)[source]\u00b6
\n

An executor is responsible for executing the steps of a job.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable in init_context.executor_config. If not set, Dagster will accept any config\nprovided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular job execution.

  • \n
  • executor_creation_fn (Optional[Callable]) \u2013 Should accept an InitExecutorContext\nand return an instance of Executor

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the\nexecutor.

  • \n
  • description (Optional[str]) \u2013 A description of the executor.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, name=None, config_schema=None, description=None)[source]\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n

Using configured may result in config values being displayed in\nthe Dagster UI, so it is not recommended to use this API with sensitive values,\nsuch as secrets.

\n
\n
Parameters:
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (Optional[str]) \u2013 Name of the new definition. If not provided, the emitted\ndefinition will inherit the name of the ExecutorDefinition upon which this\nfunction is called.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 If config_or_config_fn is a function, the config\nschema that its input must satisfy. If not set, Dagster will accept any config\nprovided.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
\n
\nproperty description\u00b6
\n

Description of executor, if provided.

\n
\n\n
\n
\nproperty executor_creation_fn\u00b6
\n

Callable that takes an InitExecutorContext and returns an instance of\nExecutor.

\n
\n\n
\n
\nproperty name\u00b6
\n

Name of the executor.

\n
\n\n
\n\n
\n
\nclass dagster.InitExecutorContext(job, executor_def, executor_config, instance)[source]\u00b6
\n

Executor-specific initialization context.

\n
\n
\njob\u00b6
\n

The job to be executed.

\n
\n
Type:
\n

IJob

\n
\n
\n
\n\n
\n
\nexecutor_def\u00b6
\n

The definition of the executor currently being\nconstructed.

\n
\n
Type:
\n

ExecutorDefinition

\n
\n
\n
\n\n
\n
\nexecutor_config\u00b6
\n

The parsed config passed to the executor.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The current instance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.Executor[source]\u00b6
\n
\n
\nabstract execute(plan_context, execution_plan)[source]\u00b6
\n

For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.

\n
\n
Parameters:
\n
    \n
  • plan_context (PlanOrchestrationContext) \u2013 The plan\u2019s orchestration context.

  • \n
  • execution_plan (ExecutionPlan) \u2013 The plan to execute.

  • \n
\n
\n
Returns:
\n

A stream of dagster events.

\n
\n
\n
\n\n
\n
\nabstract property retries\u00b6
\n

Whether retries are enabled or disabled for this instance of the executor.

\n

Executors should allow this to be controlled via configuration if possible.

\n

Returns: RetryMode

\n
\n\n
\n\n
\n
\n
\n

File Manager (Experimental)\u00b6

\n
\n
\nclass dagster._core.storage.file_manager.FileManager[source]\u00b6
\n

Base class for all file managers in dagster.

\n

The file manager is an interface that can be implemented by resources to provide abstract\naccess to a file system such as local disk, S3, or other cloud storage.

\n

For examples of usage, see the documentation of the concrete file manager implementations.

\n
\n
\nabstract copy_handle_to_local_temp(file_handle)[source]\u00b6
\n

Copy a file represented by a file handle to a temp file.

\n

In an implementation built around an object store such as S3, this method would be expected\nto download the file from S3 to local filesystem in a location assigned by the standard\nlibrary\u2019s python:tempfile module.

\n

Temp files returned by this method are not guaranteed to be reusable across solid\nboundaries. For files that must be available across solid boundaries, use the\nread(),\nread_data(),\nwrite(), and\nwrite_data() methods.

\n
\n
Parameters:
\n

file_handle (FileHandle) \u2013 The handle to the file to make available as a local temp file.

\n
\n
Returns:
\n

Path to the local temp file.

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n
\nabstract delete_local_temp()[source]\u00b6
\n

Delete all local temporary files created by previous calls to\ncopy_handle_to_local_temp().

\n

Should typically only be called by framework implementors.

\n
\n\n
\n
\nabstract read(file_handle, mode='rb')[source]\u00b6
\n

Return a file-like stream for the file handle.

\n

This may incur an expensive network call for file managers backed by object stores\nsuch as S3.

\n
\n
Parameters:
\n
    \n
  • file_handle (FileHandle) \u2013 The file handle to make available as a stream.

  • \n
  • mode (str) \u2013 The mode in which to open the file. Default: "rb".

  • \n
\n
\n
Returns:
\n

A file-like stream.

\n
\n
Return type:
\n

Union[TextIO, BinaryIO]

\n
\n
\n
\n\n
\n
\nabstract read_data(file_handle)[source]\u00b6
\n

Return the bytes for a given file handle. This may incur an expensive network\ncall for file managers backed by object stores such as s3.

\n
\n
Parameters:
\n

file_handle (FileHandle) \u2013 The file handle for which to return bytes.

\n
\n
Returns:
\n

Bytes for a given file handle.

\n
\n
Return type:
\n

bytes

\n
\n
\n
\n\n
\n
\nabstract write(file_obj, mode='wb', ext=None)[source]\u00b6
\n

Write the bytes contained within the given file object into the file manager.

\n
\n
Parameters:
\n
    \n
  • file_obj (Union[TextIO, StringIO]) \u2013 A file-like object.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to write the file into the file manager.\nDefault: "wb".

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns:
\n

A handle to the newly created file.

\n
\n
Return type:
\n

FileHandle

\n
\n
\n
\n\n
\n
\nabstract write_data(data, ext=None)[source]\u00b6
\n

Write raw bytes into the file manager.

\n
\n
Parameters:
\n
    \n
  • data (bytes) \u2013 The bytes to write into the file manager.

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns:
\n

A handle to the newly created file.

\n
\n
Return type:
\n

FileHandle

\n
\n
\n
\n\n
\n\n
\n
\ndagster.local_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to a local filesystem.

\n

By default, files will be stored in <local_artifact_storage>/storage/file_manager where\n<local_artifact_storage> can be configured in the dagster.yaml file in $DAGSTER_HOME.

\n

Implements the FileManager API.

\n

Examples

\n
import tempfile\n\nfrom dagster import job, local_file_manager, op\n\n\n@op(required_resource_keys={"file_manager"})\ndef write_files(context):\n    fh_1 = context.resources.file_manager.write_data(b"foo")\n\n    with tempfile.NamedTemporaryFile("w+") as fd:\n        fd.write("bar")\n        fd.seek(0)\n        fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n    return (fh_1, fh_2)\n\n\n@op(required_resource_keys={"file_manager"})\ndef read_files(context, file_handles):\n    fh_1, fh_2 = file_handles\n    assert context.resources.file_manager.read_data(fh_2) == b"bar"\n    fd = context.resources.file_manager.read(fh_1, mode="r")\n    assert fd.read() == "foo"\n    fd.close()\n\n\n@job(resource_defs={"file_manager": local_file_manager})\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n

Or to specify the file directory:

\n
@job(\n    resource_defs={\n        "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n    }\n)\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n
\n\n
\n
\nclass dagster.FileHandle[source]\u00b6
\n

A reference to a file as manipulated by a FileManager.

\n

Subclasses may handle files that are resident on the local file system, in an object store, or\nin any arbitrary place where a file can be stored.

\n

This exists to handle the very common case where you wish to write a computation that reads,\ntransforms, and writes files, but where you also want the same code to work in local development\nas well as on a cluster where the files will be stored in a globally available object store\nsuch as S3.

\n
\n
\nabstract property path_desc\u00b6
\n

A representation of the file path for display purposes only.

\n
\n\n
\n\n
\n
\nclass dagster.LocalFileHandle(path)[source]\u00b6
\n

A reference to a file on a local filesystem.

\n
\n
\nproperty path\u00b6
\n

The file\u2019s path.

\n
\n\n
\n
\nproperty path_desc\u00b6
\n

A representation of the file path for display purposes only.

\n
\n\n
\n\n
\n
\n
\n

Instance\u00b6

\n
\n
\nclass dagster.DagsterInstance(instance_type, local_artifact_storage, run_storage, event_storage, run_coordinator, compute_log_manager, run_launcher, scheduler=None, schedule_storage=None, settings=None, secrets_loader=None, ref=None, **_kwargs)[source]\u00b6
\n

Core abstraction for managing Dagster\u2019s access to storage and other resources.

\n

Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\nthe values in the dagster.yaml file in $DAGSTER_HOME.

\n

Alternatively, DagsterInstance.ephemeral() can be used, which provides a set of\ntransient in-memory components.

\n

Configuration of this class should be done by setting values in $DAGSTER_HOME/dagster.yaml.\nFor example, to use Postgres for dagster storage, you can write a dagster.yaml such as the\nfollowing:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n
\n
Parameters:
\n
    \n
  • instance_type (InstanceType) \u2013 Indicates whether the instance is ephemeral or persistent.\nUsers should not attempt to set this value directly or in their dagster.yaml files.

  • \n
  • local_artifact_storage (LocalArtifactStorage) \u2013 The local artifact storage is used to\nconfigure storage for any artifacts that require a local disk, such as schedules, or\nwhen using the filesystem system storage to manage files and intermediates. By default,\nthis will be a dagster._core.storage.root.LocalArtifactStorage. Configurable\nin dagster.yaml using the ConfigurableClass\nmachinery.

  • \n
  • run_storage (RunStorage) \u2013 The run storage is used to store metadata about ongoing and past\npipeline runs. By default, this will be a\ndagster._core.storage.runs.SqliteRunStorage. Configurable in dagster.yaml\nusing the ConfigurableClass machinery.

  • \n
  • event_storage (EventLogStorage) \u2013 Used to store the structured event logs generated by\npipeline runs. By default, this will be a\ndagster._core.storage.event_log.SqliteEventLogStorage. Configurable in\ndagster.yaml using the ConfigurableClass machinery.

  • \n
  • compute_log_manager (Optional[ComputeLogManager]) \u2013 The compute log manager handles stdout\nand stderr logging for op compute functions. By default, this will be a\ndagster._core.storage.local_compute_log_manager.LocalComputeLogManager.\nConfigurable in dagster.yaml using the\nConfigurableClass machinery.

  • \n
  • run_coordinator (Optional[RunCoordinator]) \u2013 A run coordinator may be used to manage the execution\nof pipeline runs.

  • \n
  • run_launcher (Optional[RunLauncher]) \u2013 Optionally, a run launcher may be used to enable\na Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\naddition to running them locally.

  • \n
  • settings (Optional[Dict]) \u2013 Specifies certain per-instance settings,\nsuch as feature flags. These are set in the dagster.yaml under a set of whitelisted\nkeys.

  • \n
  • ref (Optional[InstanceRef]) \u2013 Used by internal machinery to pass instances across process\nboundaries.

  • \n
\n
\n
\n
\n
\nadd_dynamic_partitions(partitions_def_name, partition_keys)[source]\u00b6
\n

Add partitions to the specified DynamicPartitionsDefinition idempotently.\nDoes not add any partitions that already exist.

\n
\n
Parameters:
\n
    \n
  • partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

  • \n
  • partition_keys (Sequence[str]) \u2013 Partition keys to add.

  • \n
\n
\n
\n
\n\n
\n
\ndelete_dynamic_partition(partitions_def_name, partition_key)[source]\u00b6
\n

Delete a partition for the specified DynamicPartitionsDefinition.\nIf the partition does not exist, exits silently.

\n
\n
Parameters:
\n
    \n
  • partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

  • \n
  • partition_key (Sequence[str]) \u2013 Partition key to delete.

  • \n
\n
\n
\n
\n\n
\n
\ndelete_run(run_id)[source]\u00b6
\n

Delete a run and all events generated by that from storage.

\n
\n
Parameters:
\n

run_id (str) \u2013 The id of the run to delete.

\n
\n
\n
\n\n
\n
\nstatic ephemeral(tempdir=None, preload=None, settings=None)[source]\u00b6
\n

Create a DagsterInstance suitable for ephemeral execution, useful in test contexts. An\nephemeral instance uses mostly in-memory components. Use local_temp to create a test\ninstance that is fully persistent.

\n
\n
Parameters:
\n
    \n
  • tempdir (Optional[str]) \u2013 The path of a directory to be used for local artifact storage.

  • \n
  • preload (Optional[Sequence[DebugRunPayload]]) \u2013 A sequence of payloads to load into the\ninstance\u2019s run storage. Useful for debugging.

  • \n
  • settings (Optional[Dict]) \u2013 Settings for the instance.

  • \n
\n
\n
Returns:
\n

An ephemeral DagsterInstance.

\n
\n
Return type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nstatic get()[source]\u00b6
\n

Get the current DagsterInstance as specified by the DAGSTER_HOME environment variable.

\n
\n
Returns:
\n

The current DagsterInstance.

\n
\n
Return type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nget_asset_keys(prefix=None, limit=None, cursor=None)[source]\u00b6
\n

Return a filtered subset of asset keys managed by this instance.

\n
\n
Parameters:
\n
    \n
  • prefix (Optional[Sequence[str]]) \u2013 Return only assets having this key prefix.

  • \n
  • limit (Optional[int]) \u2013 Maximum number of keys to return.

  • \n
  • cursor (Optional[str]) \u2013 Cursor to use for pagination.

  • \n
\n
\n
Returns:
\n

List of asset keys.

\n
\n
Return type:
\n

Sequence[AssetKey]

\n
\n
\n
\n\n
\n
\nget_asset_records(asset_keys=None)[source]\u00b6
\n

Return an AssetRecord for each of the given asset keys.

\n
\n
Parameters:
\n

asset_keys (Optional[Sequence[AssetKey]]) \u2013 List of asset keys to retrieve records for.

\n
\n
Returns:
\n

List of asset records.

\n
\n
Return type:
\n

Sequence[AssetRecord]

\n
\n
\n
\n\n
\n
\nget_dynamic_partitions(partitions_def_name)[source]\u00b6
\n

Get the set of partition keys for the specified DynamicPartitionsDefinition.

\n
\n
Parameters:
\n

partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

\n
\n
\n
\n\n
\n
\nget_event_records(event_records_filter, limit=None, ascending=False)[source]\u00b6
\n

Return a list of event records stored in the event log storage.

\n
\n
Parameters:
\n
    \n
  • event_records_filter (Optional[EventRecordsFilter]) \u2013 the filter by which to filter event\nrecords.

  • \n
  • limit (Optional[int]) \u2013 Number of results to get. Defaults to infinite.

  • \n
  • ascending (Optional[bool]) \u2013 Sort the result in ascending order if True, descending\notherwise. Defaults to descending.

  • \n
\n
\n
Returns:
\n

List of event log records stored in the event log storage.

\n
\n
Return type:
\n

List[EventLogRecord]

\n
\n
\n
\n\n
\n
\nget_latest_materialization_code_versions(asset_keys)[source]\u00b6
\n

Returns the code version used for the latest materialization of each of the provided\nassets.

\n
\n
Parameters:
\n

asset_keys (Iterable[AssetKey]) \u2013 The asset keys to find latest materialization code\nversions for.

\n
\n
Returns:
\n

\n
A dictionary with a key for each of the provided asset

keys. The values will be None if the asset has no materializations. If an asset does\nnot have a code version explicitly assigned to its definitions, but was\nmaterialized, Dagster assigns the run ID as its code version.

\n
\n
\n

\n
\n
Return type:
\n

Mapping[AssetKey, Optional[str]]

\n
\n
\n
\n\n
\n
\nget_latest_materialization_event(asset_key)[source]\u00b6
\n

Fetch the latest materialization event for the given asset key.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 Asset key to return materialization for.

\n
\n
Returns:
\n

\n
The latest materialization event for the given asset

key, or None if the asset has not been materialized.

\n
\n
\n

\n
\n
Return type:
\n

Optional[AssetMaterialization]

\n
\n
\n
\n\n
\n
\nget_run_by_id(run_id)[source]\u00b6
\n

Get a DagsterRun matching the provided run_id.

\n
\n
Parameters:
\n

run_id (str) \u2013 The id of the run to retrieve.

\n
\n
Returns:
\n

\n
The run corresponding to the given id. If no run matching the id

is found, return None.

\n
\n
\n

\n
\n
Return type:
\n

Optional[DagsterRun]

\n
\n
\n
\n\n
\n
\nget_run_record_by_id(run_id)[source]\u00b6
\n

Get a RunRecord matching the provided run_id.

\n
\n
Parameters:
\n

run_id (str) \u2013 The id of the run record to retrieve.

\n
\n
Returns:
\n

\n
The run record corresponding to the given id. If no run matching

the id is found, return None.

\n
\n
\n

\n
\n
Return type:
\n

Optional[RunRecord]

\n
\n
\n
\n\n
\n
\nget_run_records(filters=None, limit=None, order_by=None, ascending=False, cursor=None, bucket_by=None)[source]\u00b6
\n

Return a list of run records stored in the run storage, sorted by the given column in given order.

\n
\n
Parameters:
\n
    \n
  • filters (Optional[RunsFilter]) \u2013 the filter by which to filter runs.

  • \n
  • limit (Optional[int]) \u2013 Number of results to get. Defaults to infinite.

  • \n
  • order_by (Optional[str]) \u2013 Name of the column to sort by. Defaults to id.

  • \n
  • ascending (Optional[bool]) \u2013 Sort the result in ascending order if True, descending\notherwise. Defaults to descending.

  • \n
\n
\n
Returns:
\n

List of run records stored in the run storage.

\n
\n
Return type:
\n

List[RunRecord]

\n
\n
\n
\n\n
\n
\nget_status_by_partition(asset_key, partition_keys, partitions_def)[source]\u00b6
\n

Get the current status of provided partition_keys for the provided asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to get per-partition status for.

  • \n
  • partition_keys (Sequence[str]) \u2013 The partitions to get status for.

  • \n
  • partitions_def (PartitionsDefinition) \u2013 The PartitionsDefinition of the asset to get\nper-partition status for.

  • \n
\n
\n
Returns:
\n

status for each partition key

\n
\n
Return type:
\n

Optional[Mapping[str, AssetPartitionStatus]]

\n
\n
\n
\n\n
\n
\nhas_asset_key(asset_key)[source]\u00b6
\n

Return true if this instance manages the given asset key.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 Asset key to check.

\n
\n
\n
\n\n
\n
\nhas_dynamic_partition(partitions_def_name, partition_key)[source]\u00b6
\n

Check if a partition key exists for the DynamicPartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

  • \n
  • partition_key (Sequence[str]) \u2013 Partition key to check.

  • \n
\n
\n
\n
\n\n
\n
\nstatic local_temp(tempdir=None, overrides=None)[source]\u00b6
\n

Create a DagsterInstance that uses a temporary directory for local storage. This is a\nregular, fully persistent instance. Use ephemeral to get an ephemeral instance with\nin-memory components.

\n
\n
Parameters:
\n
    \n
  • tempdir (Optional[str]) \u2013 The path of a directory to be used for local artifact storage.

  • \n
  • overrides (Optional[DagsterInstanceOverrides]) \u2013 Override settings for the instance.

  • \n
\n
\n
Returns:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nwipe_assets(asset_keys)[source]\u00b6
\n

Wipes asset event history from the event log for the given asset keys.

\n
\n
Parameters:
\n

asset_keys (Sequence[AssetKey]) \u2013 Asset keys to wipe.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster._core.instance.InstanceRef(local_artifact_storage_data, compute_logs_data, scheduler_data, run_coordinator_data, run_launcher_data, settings, run_storage_data, event_storage_data, schedule_storage_data, custom_instance_class_data=None, storage_data=None, secrets_loader_data=None)[source]\u00b6
\n

Serializable representation of a DagsterInstance.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster._serdes.ConfigurableClass[source]\u00b6
\n

Abstract mixin for classes that can be loaded from config.

\n

This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\nof conditional imports / optional extras_requires in dagster core and b) a magic directory or\nfile in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\nrun storage, pluggable with a config chunk like:

\n
run_storage:\n    module: very_cool_package.run_storage\n    class: SplendidRunStorage\n    config:\n        magic_word: "quux"\n
\n
\n

This same pattern should eventually be viable for other system components, e.g. engines.

\n

The ConfigurableClass mixin provides the necessary hooks for classes to be instantiated from\nan instance of ConfigurableClassData.

\n

Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\ntype such as:

\n
{'module': str, 'class': str, 'config': Field(Permissive())}\n
\n
\n
\n\n
\n
\nclass dagster._serdes.ConfigurableClassData(module_name, class_name, config_yaml)[source]\u00b6
\n

Serializable tuple describing where to find a class and the config fragment that should\nbe used to instantiate it.

\n

Users should not instantiate this class directly.

\n

Classes intended to be serialized in this way should implement the\ndagster.serdes.ConfigurableClass mixin.

\n
\n\n
\n
\nclass dagster._core.storage.root.LocalArtifactStorage(base_dir, inst_data=None)[source]\u00b6
\n
\n\n
\n
\n
\n

Storage\u00b6

\n
\n
\nclass dagster._core.storage.base_storage.DagsterStorage[source]\u00b6
\n

Abstract base class for Dagster persistent storage, for reading and writing data for runs,\nevents, and schedule/sensor state.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagster-webserver and dagster-daemon load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\n
\n

Run storage\u00b6

\n
\n
\nclass dagster.DagsterRun(job_name, run_id=None, run_config=None, asset_selection=None, asset_check_selection=None, op_selection=None, resolved_op_selection=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, job_snapshot_id=None, execution_plan_snapshot_id=None, external_job_origin=None, job_code_origin=None, has_repository_load_data=None)[source]\u00b6
\n

Serializable internal representation of a dagster run, as stored in a\nRunStorage.

\n
\n
\nproperty is_failure\u00b6
\n

If this run has failed.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_failure_or_canceled\u00b6
\n

If this run has either failed or was canceled.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_finished\u00b6
\n

If this run has completely finished execution.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_resume_retry\u00b6
\n

If this run was created from retrying another run from the point of failure.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_success\u00b6
\n

If this run has successfully finished executing.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterRunStatus(value)[source]\u00b6
\n

The status of run execution.

\n
\n\n
\n
\nclass dagster.RunsFilter(run_ids=None, job_name=None, statuses=None, tags=None, snapshot_id=None, updated_after=None, updated_before=None, created_after=None, created_before=None)[source]\u00b6
\n

Defines a filter across job runs, for use when querying storage directly.

\n

Each field of the RunsFilter represents a logical AND with each other. For\nexample, if you specify job_name and tags, then you will receive only runs\nwith the specified job_name AND the specified tags. If left blank, then\nall values will be permitted for that field.

\n
\n
Parameters:
\n
    \n
  • run_ids (Optional[List[str]]) \u2013 A list of job run_id values.

  • \n
  • job_name (Optional[str]) \u2013 Name of the job to query for. If blank, all job_names will be accepted.

  • \n
  • statuses (Optional[List[DagsterRunStatus]]) \u2013 A list of run statuses to filter by. If blank, all run statuses will be allowed.

  • \n
  • tags (Optional[Dict[str, Union[str, List[str]]]]) \u2013 A dictionary of run tags to query by. All tags specified here must be present for a given run to pass the filter.

  • \n
  • snapshot_id (Optional[str]) \u2013 The ID of the job snapshot to query for. Intended for internal use.

  • \n
  • updated_after (Optional[DateTime]) \u2013 Filter by runs that were last updated after this datetime.

  • \n
  • created_before (Optional[DateTime]) \u2013 Filter by runs that were created before this datetime.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster._core.storage.runs.RunStorage[source]\u00b6
\n

Abstract base class for storing pipeline run history.

\n

Note that run storages using SQL databases as backing stores should implement\nSqlRunStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagster-webserver and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster._core.storage.runs.SqlRunStorage[source]\u00b6
\n

Base class for SQL based run storages.

\n
\n\n
\n
\nclass dagster._core.storage.runs.SqliteRunStorage(conn_string, inst_data=None)[source]\u00b6
\n

SQLite-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default run storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for run storage, you can add a block such as the following to your\ndagster.yaml:

\n
run_storage:\n  module: dagster._core.storage.runs\n  class: SqliteRunStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the run storage where on disk to store the database.

\n
\n\n
\n
\nclass dagster._core.storage.dagster_run.RunRecord(storage_id, dagster_run, create_timestamp, update_timestamp, start_time=None, end_time=None)[source]\u00b6
\n

Internal representation of a run record, as stored in a\nRunStorage.

\n

Users should not invoke this class directly.

\n
\n\n

See also: dagster_postgres.PostgresRunStorage and dagster_mysql.MySQLRunStorage.

\n
\n
\n
\n

Event log storage\u00b6

\n
\n
\nclass dagster.EventLogEntry(error_info, level, user_message, run_id, timestamp, step_key=None, job_name=None, dagster_event=None)[source]\u00b6
\n

Entries in the event log.

\n

Users should not instantiate this object directly. These entries may originate from the logging machinery (DagsterLogManager/context.log), from\nframework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n(e.g. Output).

\n
\n
Parameters:
\n
    \n
  • error_info (Optional[SerializableErrorInfo]) \u2013 Error info for an associated exception, if\nany, as generated by serializable_error_info_from_exc_info and friends.

  • \n
  • level (Union[str, int]) \u2013 The Python log level at which to log this event. Note that\nframework and user code events are also logged to Python logging. This value may be an\ninteger or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.

  • \n
  • user_message (str) \u2013 For log messages, this is the user-generated message.

  • \n
  • run_id (str) \u2013 The id of the run which generated this event.

  • \n
  • timestamp (float) \u2013 The Unix timestamp of this event.

  • \n
  • step_key (Optional[str]) \u2013 The step key for the step which generated this event. Some events\nare generated outside of a step context.

  • \n
  • job_name (Optional[str]) \u2013 The job which generated this event. Some events are\ngenerated outside of a job context.

  • \n
  • dagster_event (Optional[DagsterEvent]) \u2013 For framework and user events, the associated\nstructured event.

  • \n
\n
\n
\n
\n
\nproperty dagster_event_type\u00b6
\n

The type of the DagsterEvent contained by this entry, if any.

\n
\n
Type:
\n

Optional[DagsterEventType]

\n
\n
\n
\n\n
\n
\nget_dagster_event()[source]\u00b6
\n

DagsterEvent: Returns the DagsterEvent contained within this entry. If this entry does not\ncontain a DagsterEvent, an error will be raised.

\n
\n\n
\n
\nproperty is_dagster_event\u00b6
\n

If this entry contains a DagsterEvent.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty message\u00b6
\n

Return the message from the structured DagsterEvent if present, fallback to user_message.

\n
\n\n
\n\n
\n
\nclass dagster.EventLogRecord(storage_id, event_log_entry)[source]\u00b6
\n

Internal representation of an event record, as stored in an\nEventLogStorage.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster.EventRecordsFilter(event_type, asset_key=None, asset_partitions=None, after_cursor=None, before_cursor=None, after_timestamp=None, before_timestamp=None, storage_ids=None, tags=None)[source]\u00b6
\n

Defines a set of filter fields for fetching a set of event log entries or event log records.

\n
\n
Parameters:
\n
    \n
  • event_type (DagsterEventType) \u2013 Filter argument for dagster event type

  • \n
  • asset_key (Optional[AssetKey]) \u2013 Asset key for which to get asset materialization event\nentries / records.

  • \n
  • asset_partitions (Optional[List[str]]) \u2013 Filter parameter such that only asset\nevents with a partition value matching one of the provided values are returned. Only\nvalid when the asset_key parameter is provided.

  • \n
  • after_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that only\nrecords with storage_id greater than the provided value are returned. Using a\nrun-sharded events cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • before_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that\nrecords with storage_id less than the provided value are returned. Using a run-sharded\nevents cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • after_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp greater than the provided value are returned.

  • \n
  • before_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp less than the provided value are returned.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RunShardedEventsCursor(id, run_updated_after)[source]\u00b6
\n

Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\nperformance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\nrun-sharded storages, the id field is ignored, since they may not be unique across shards.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.EventLogStorage[source]\u00b6
\n

Abstract base class for storing structured event logs from pipeline runs.

\n

Note that event log storages using SQL databases as backing stores should implement\nSqlEventLogStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagster-webserver and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.SqlEventLogStorage[source]\u00b6
\n

Base class for SQL backed event log storages.

\n

Distinguishes between run-based connections and index connections in order to support run-level\nsharding, while maintaining the ability to do cross-run queries

\n
\n\n
\n
\nclass dagster._core.storage.event_log.SqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default event log storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for event log storage, you can add a block such as the following\nto your dagster.yaml:

\n
event_log_storage:\n  module: dagster._core.storage.event_log\n  class: SqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the databases. To\nimprove concurrent performance, event logs are stored in a separate SQLite database for each\nrun.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.ConsolidatedSqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed consolidated event log storage intended for test cases only.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\nthe following to your dagster.yaml:

\n
event_log_storage:\n  module: dagster._core.storage.event_log\n  class: ConsolidatedSqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the database.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.AssetRecord(storage_id, asset_entry)[source]\u00b6
\n

Internal representation of an asset record, as stored in an EventLogStorage.

\n

Users should not invoke this class directly.

\n
\n\n

See also: dagster_postgres.PostgresEventLogStorage and dagster_mysql.MySQLEventLogStorage.

\n
\n
\n
\n

Compute log manager\u00b6

\n
\n
\nclass dagster._core.storage.captured_log_manager.CapturedLogManager[source]\u00b6
\n

Abstract base class for capturing the unstructured logs (stdout/stderr) in the current\nprocess, stored / retrieved with a provided log_key.

\n
\n\n
\n
\nclass dagster._core.storage.compute_log_manager.ComputeLogManager[source]\u00b6
\n

Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\nsteps of pipeline solids.

\n
\n\n
\n
\nclass dagster._core.storage.local_compute_log_manager.LocalComputeLogManager(base_dir, polling_timeout=None, inst_data=None)[source]\u00b6
\n

Stores copies of stdout & stderr for each compute step locally on disk.

\n
\n\n
\n
\nclass dagster._core.storage.noop_compute_log_manager.NoOpComputeLogManager(inst_data=None)[source]\u00b6
\n

When enabled for a Dagster instance, stdout and stderr will not be available for any step.

\n
\n\n

See also: dagster_aws.S3ComputeLogManager.

\n
\n
\n
\n

Run launcher\u00b6

\n
\n
\nclass dagster._core.launcher.RunLauncher[source]\u00b6
\n
\n\n
\n
\nclass dagster._core.launcher.DefaultRunLauncher(inst_data=None)[source]\u00b6
\n

Launches runs against running GRPC servers.

\n
\n\n
\n
\n
\n

Run coordinator\u00b6

\n
\n
\nclass dagster._core.run_coordinator.DefaultRunCoordinator(inst_data=None)[source]\u00b6
\n

Immediately send runs to the run launcher.

\n
\n\n
\n
\ndagster._core.run_coordinator.QueuedRunCoordinator RunCoordinator[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_concurrent_runs (dagster.IntSource, optional):
\n

The maximum number of runs that are allowed to be in progress at once. Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs from launching. Any other negative values are disallowed.

\n
\n
tag_concurrency_limits (Union[List[strict dict], None], optional):
\n

A set of limits that are applied to runs with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key.

\n
\n
dequeue_interval_seconds (dagster.IntSource, optional):
\n

The interval in seconds at which the Dagster Daemon should periodically check the run queue for new runs to launch.

\n
\n
dequeue_use_threads (Bool, optional):
\n

Whether or not to use threads for concurrency when launching dequeued runs.

\n
\n
dequeue_num_workers (dagster.IntSource, optional):
\n

If dequeue_use_threads is true, limit the number of concurrent worker threads.

\n
\n
max_user_code_failure_retries (dagster.IntSource, optional):
\n

If there is an error reaching a Dagster gRPC server while dequeuing the run, how many times to retry the dequeue before failing it. The only run launcher that requires the gRPC server to be running is the DefaultRunLauncher, so setting this will have no effect unless that run launcher is being used.

\n

Default Value: 0

\n
\n
user_code_failure_retry_delay (dagster.IntSource, optional):
\n

If there is an error reaching a Dagster gRPC server while dequeuing the run, how long to wait before retrying any runs from that same code location. The only run launcher that requires the gRPC server to be running is the DefaultRunLauncher, so setting this will have no effect unless that run launcher is being used.

\n

Default Value: 60

\n
\n
\n

Enqueues runs via the run storage, to be dequeued by the Dagster Daemon process. Requires\nthe Dagster Daemon process to be alive in order for runs to be launched.

\n
\n\n
\n
\n
\n

Scheduling\u00b6

\n
\n
\nclass dagster._core.scheduler.Scheduler[source]\u00b6
\n

Abstract base class for a scheduler. This component is responsible for interfacing with\nan external system such as cron to ensure repeated execution according to the schedule.

\n
\n\n
\n
\nclass dagster._core.storage.schedules.ScheduleStorage[source]\u00b6
\n

Abstract class for managing persistence of scheduler artifacts.

\n
\n\n
\n
\nclass dagster._core.storage.schedules.SqlScheduleStorage[source]\u00b6
\n

Base class for SQL backed schedule storage.

\n
\n\n
\n
\nclass dagster._core.storage.schedules.SqliteScheduleStorage(conn_string, inst_data=None)[source]\u00b6
\n

Local SQLite backed schedule storage.

\n
\n\n

See also: dagster_postgres.PostgresScheduleStorage and dagster_mysql.MySQLScheduleStorage.

\n
\n
\n
\n

Exception handling\u00b6

\n
\n
\ndagster._core.errors.user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs)[source]\u00b6
\n

Wraps the execution of user-space code in an error boundary. This places a uniform\npolicy around any user code invoked by the framework. This ensures that all user\nerrors are wrapped in an exception derived from DagsterUserCodeExecutionError,\nand that the original stack trace of the user error is preserved, so that it\ncan be reported without confusing framework code in the stack trace, if a\ntool author wishes to do so.

\n

Examples:\n.. code-block:: python

\n
\n
\n
with user_code_error_boundary(

# Pass a class that inherits from DagsterUserCodeExecutionError\nDagsterExecutionStepExecutionError,\n# Pass a function that produces a message\n\u201cError occurred during step execution\u201d

\n
\n
):

call_user_provided_function()

\n
\n
\n
\n
\n\n
\n
\n
\n

Step Launchers (Experimental)\u00b6

\n
\n
\nclass dagster.StepLauncher[source]\u00b6
\n

A StepLauncher is responsible for executing steps, either in-process or in an external process.

\n
\n\n
\n
\nclass dagster.StepRunRef(run_config, dagster_run, run_id, retry_mode, step_key, recon_job, known_state)[source]\u00b6
\n

A serializable object that specifies what\u2019s needed to hydrate a step so\nthat it can be executed in a process outside the plan process.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster.StepExecutionContext(plan_data, execution_data, log_manager, step, output_capture, known_state)[source]\u00b6
\n

Context for the execution of a step. Users should not instantiate this class directly.

\n

This context assumes that user code can be run directly, and thus includes resource and information.

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/internals", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../jobs/", "title": "Jobs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../hooks/", "title": "Hooks"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/jobs", "Jobs", "N", "next"], ["sections/api/apidocs/hooks", "Hooks", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/internals.rst.txt", "title": "Internals", "toc": "\n"}, "io-managers": {"alabaster_version": "0.7.13", "body": "
\n

IO Managers\u00b6

\n

IO managers are user-provided objects that store op outputs and load them as inputs to downstream\nops.

\n
\n
\nclass dagster.ConfigurableIOManager[source]\u00b6
\n

Base class for Dagster IO managers that utilize structured config.

\n

This class is a subclass of IOManagerDefinition, Config,\nand IOManager. Implementers must provide an implementation of the\nhandle_output() and load_input() methods.

\n

Example definition:

\n
class MyIOManager(ConfigurableIOManager):\n    path_prefix: List[str]\n\n    def _get_path(self, context) -> str:\n        return "/".join(context.asset_key.path)\n\n    def handle_output(self, context, obj):\n        write_csv(self._get_path(context), obj)\n\n    def load_input(self, context):\n        return read_csv(self._get_path(context))\n\ndefs = Definitions(\n    ...,\n    resources={\n        "io_manager": MyIOManager(path_prefix=["my", "prefix"])\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster.ConfigurableIOManagerFactory[source]\u00b6
\n

Base class for Dagster IO managers that utilize structured config. This base class\nis useful for cases in which the returned IO manager is not the same as the class itself\n(e.g. when it is a wrapper around the actual IO manager implementation).

\n

This class is a subclass of both IOManagerDefinition and Config.\nImplementers should provide an implementation of the create_io_manager() method,\nwhich should return an instance of IOManager.

\n

Example definition:

\n
class ExternalIOManager(IOManager):\n\n    def __init__(self, connection):\n        self._connection = connection\n\n    def handle_output(self, context, obj):\n        ...\n\n    def load_input(self, context):\n        ...\n\nclass ConfigurableExternalIOManager(ConfigurableIOManagerFactory):\n    username: str\n    password: str\n\n    def create_io_manager(self, context) -> IOManager:\n        with database.connect(self.username, self.password) as connection:\n            return ExternalIOManager(connection)\n\ndefs = Definitions(\n    ...,\n    resources={\n        "io_manager": ConfigurableExternalIOManager(\n            username="dagster",\n            password=EnvVar("DB_PASSWORD")\n        )\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster.IOManager[source]\u00b6
\n

Base class for user-provided IO managers.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

Extend this class to handle how objects are loaded and stored. Users should implement\nhandle_output to store an object and load_input to retrieve an object.

\n
\n
\nabstract handle_output(context, obj)[source]\u00b6
\n

User-defined method that stores an output of an op.

\n
\n
Parameters:
\n
    \n
  • context (OutputContext) \u2013 The context of the step output that produces this object.

  • \n
  • obj (Any) \u2013 The object, returned by the op, to be stored.

  • \n
\n
\n
\n
\n\n
\n
\nabstract load_input(context)[source]\u00b6
\n

User-defined method that loads an input to an op.

\n
\n
Parameters:
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
Returns:
\n

The data object.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.IOManagerDefinition(resource_fn, config_schema=None, description=None, required_resource_keys=None, version=None, input_config_schema=None, output_config_schema=None)[source]\u00b6
\n

Definition of an IO manager resource.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

An IOManagerDefinition is a ResourceDefinition whose resource_fn returns an\nIOManager.

\n

The easiest way to create an IOManagerDefinition is with the @io_manager\ndecorator.

\n
\n
\nstatic hardcoded_io_manager(value, description=None)[source]\u00b6
\n

A helper function that creates an IOManagerDefinition with a hardcoded IOManager.

\n
\n
Parameters:
\n
    \n
  • value (IOManager) \u2013 A hardcoded IO Manager which helps mock the definition.

  • \n
  • description ([Optional[str]]) \u2013 The description of the IO Manager. Defaults to None.

  • \n
\n
\n
Returns:
\n

A hardcoded resource.

\n
\n
Return type:
\n

[IOManagerDefinition]

\n
\n
\n
\n\n
\n\n
\n
\n@dagster.io_manager(config_schema=None, description=None, output_config_schema=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define an IO manager.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

The decorated function should accept an InitResourceContext and return an\nIOManager.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource config. Configuration\ndata available in init_context.resource_config. If not set, Dagster will accept any\nconfig provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • output_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-output config. If not set,\nno per-output configuration will be allowed.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-input config. If not set,\nDagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the object\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
\n
\n
\n

Examples:

\n
class MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        write_csv("some/path")\n\n    def load_input(self, context):\n        return read_csv("some/path")\n\n@io_manager\ndef my_io_manager(init_context):\n    return MyIOManager()\n\n@op(out=Out(io_manager_key="my_io_manager_key"))\ndef my_op(_):\n    return do_stuff()\n\n@job(resource_defs={"my_io_manager_key": my_io_manager})\ndef my_job():\n    my_op()\n
\n
\n
\n\n
\n

Input and Output Contexts\u00b6

\n
\n
\nclass dagster.InputContext(*, name=None, job_name=None, op_def=None, config=None, metadata=None, upstream_output=None, dagster_type=None, log_manager=None, resource_config=None, resources=None, step_context=None, asset_key=None, partition_key=None, asset_partitions_subset=None, asset_partitions_def=None, instance=None)[source]\u00b6
\n

The context object available to the load_input method of InputManager.

\n

Users should not instantiate this object directly. In order to construct\nan InputContext for testing an IO Manager\u2019s load_input method, use\ndagster.build_input_context().

\n

Example

\n
from dagster import IOManager, InputContext\n\nclass MyIOManager(IOManager):\n    def load_input(self, context: InputContext):\n        ...\n
\n
\n
\n
\nproperty asset_key\u00b6
\n

The AssetKey of the asset that is being loaded as an input.

\n
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for input asset.

\n

Raises an error if the input asset has no partitioning, or if the run covers a partition\nrange for the input asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for input asset.

\n

Raises an error if the input asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partition_keys\u00b6
\n

The partition keys for input asset.

\n

Raises an error if the input asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_def\u00b6
\n

The PartitionsDefinition on the upstream asset corresponding to this input.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the input asset.

\n

Raises an error if either of the following are true:\n- The input asset has no partitioning.\n- The input asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty config\u00b6
\n

The config attached to the input that we\u2019re loading.

\n
\n\n
\n
\nproperty dagster_type\u00b6
\n

The type of this input.\nDagster types do not propagate from an upstream output to downstream inputs,\nand this property only captures type information for the input that is either\npassed in explicitly with AssetIn or In, or can be\ninferred from type hints. For an asset input, the Dagster type from the upstream\nasset definition is ignored.

\n
\n\n
\n
\nget_asset_identifier()[source]\u00b6
\n

The sequence of strings making up the AssetKey for the asset being loaded as an input.\nIf the asset is partitioned, the identifier contains the partition key as the final element in the\nsequence. For example, for the asset key AssetKey(["foo", "bar", "baz"]), materialized with\npartition key \u201c2023-06-01\u201d, get_asset_identifier will return ["foo", "bar", "baz", "2023-06-01"].

\n
\n\n
\n
\nget_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep input.

\n

If not using memoization, the unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the input.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the input is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n

If using memoization, the version corresponding to the step output is used in place of\nthe run_id.

\n
\n
Returns:
\n

A list of identifiers, i.e. (run_id or version), step_key, and output_name

\n
\n
Return type:
\n

List[str, \u2026]

\n
\n
\n
\n\n
\n
\nproperty has_asset_key\u00b6
\n

Returns True if an asset is being loaded as input, otherwise returns False. A return value of False\nindicates that an output from an op is being loaded as the input.

\n
\n\n
\n
\nproperty has_asset_partitions\u00b6
\n

Returns True if the asset being loaded as input is partitioned.

\n
\n\n
\n
\nproperty has_input_name\u00b6
\n

If the InputContext is being used to load the result of a run from outside the run,\nthen it won\u2019t have an input name.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run.

\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager to use for this input.

\n
\n\n
\n
\nproperty metadata\u00b6
\n

A dict of metadata that is assigned to the InputDefinition that we\u2019re loading for.\nThis property only contains metadata passed in explicitly with AssetIn\nor In. To access metadata of an upstream asset or operation definition,\nuse the metadata in InputContext.upstream_output.

\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the input that we\u2019re loading.

\n
\n\n
\n
\nproperty op_def\u00b6
\n

The definition of the op that\u2019s loading the input.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nproperty resource_config\u00b6
\n

The config associated with the resource that initializes the InputManager.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources required by the resource that initializes the\ninput manager. If using the @input_manager() decorator, these resources\ncorrespond to those requested with the required_resource_keys parameter.

\n
\n\n
\n
\nproperty upstream_output\u00b6
\n

Info about the output that produced the object we\u2019re loading.

\n
\n\n
\n\n
\n
\nclass dagster.OutputContext(step_key=None, name=None, job_name=None, run_id=None, metadata=None, mapping_key=None, config=None, dagster_type=None, log_manager=None, version=None, resource_config=None, resources=None, step_context=None, op_def=None, asset_info=None, warn_on_step_context_use=False, partition_key=None)[source]\u00b6
\n

The context object that is available to the handle_output method of an IOManager.

\n

Users should not instantiate this object directly. To construct an\nOutputContext for testing an IO Manager\u2019s handle_output method, use\ndagster.build_output_context().

\n

Example

\n
from dagster import IOManager, OutputContext\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context: OutputContext, obj):\n        ...\n
\n
\n
\n
\nadd_output_metadata(metadata)[source]\u00b6
\n

Add a dictionary of metadata to the handled output.

\n

Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.

\n
\n
Parameters:
\n

metadata (Mapping[str, RawMetadataValue]) \u2013 A metadata dictionary to log

\n
\n
\n

Examples

\n
from dagster import IOManager\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.add_output_metadata({"foo": "bar"})\n
\n
\n
\n\n
\n
\nproperty asset_key\u00b6
\n

The AssetKey of the asset that is being stored as an output.

\n
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for output asset.

\n

Raises an error if the output asset has no partitioning, or if the run covers a partition\nrange for the output asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for output asset.

\n

Raises an error if the output asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partition_keys\u00b6
\n

The partition keys for the output asset.

\n

Raises an error if the output asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_def\u00b6
\n

The PartitionsDefinition on the asset corresponding to this output.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\nMultiPartitionsDefinition with one time-partitioned dimension.

\n
\n\n
\n
\nproperty config\u00b6
\n

The configuration for the output.

\n
\n\n
\n
\nproperty dagster_type\u00b6
\n

The type of this output.

\n
\n\n
\n
\nget_asset_identifier()[source]\u00b6
\n

The sequence of strings making up the AssetKey for the asset being stored as an output.\nIf the asset is partitioned, the identifier contains the partition key as the final element in the\nsequence. For example, for the asset key AssetKey(["foo", "bar", "baz"]) materialized with\npartition key \u201c2023-06-01\u201d, get_asset_identifier will return ["foo", "bar", "baz", "2023-06-01"].

\n
\n\n
\n
\nget_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep output.

\n

If not using memoization, the unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the output.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the output is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n

If using memoization, the version corresponding to the step output is used in place of\nthe run_id.

\n
\n
Returns:
\n

A list of identifiers, i.e. (run_id or version), step_key, and output_name

\n
\n
Return type:
\n

Sequence[str, \u2026]

\n
\n
\n
\n\n
\n
\nproperty has_asset_key\u00b6
\n

Returns True if an asset is being stored, otherwise returns False. A return value of False\nindicates that an output from an op is being stored.

\n
\n\n
\n
\nproperty has_asset_partitions\u00b6
\n

Returns True if the asset being stored is partitioned.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run.

\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager to use for this output.

\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization or AssetObservation from within the body of an io manager\u2019s handle_output method.

\n

Events logged with this method will appear in the event log.

\n
\n
Parameters:
\n

event (Union[AssetMaterialization, AssetObservation]) \u2013 The event to log.

\n
\n
\n

Examples

\n
from dagster import IOManager, AssetMaterialization\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty mapping_key\u00b6
\n

The key that identifies a unique mapped output. None for regular outputs.

\n
\n\n
\n
\nproperty metadata\u00b6
\n

A dict of the metadata that is assigned to the OutputDefinition that produced\nthe output.

\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the output that produced the output.

\n
\n\n
\n
\nproperty op_def\u00b6
\n

The definition of the op that produced the output.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nproperty resource_config\u00b6
\n

The config associated with the resource that initializes the IOManager.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources required by the output manager, specified by the required_resource_keys\nparameter.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the run that produced the output.

\n
\n\n
\n
\nproperty step_key\u00b6
\n

The step_key for the compute step that produced the output.

\n
\n\n
\n
\nproperty version\u00b6
\n

(Experimental) The version of the output.

\n
\n\n
\n\n
\n
\ndagster.build_input_context(name=None, config=None, metadata=None, upstream_output=None, dagster_type=None, resource_config=None, resources=None, op_def=None, step_context=None, asset_key=None, partition_key=None, asset_partition_key_range=None, asset_partitions_def=None, instance=None)[source]\u00b6
\n

Builds input context from provided parameters.

\n

build_input_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_input_context must be used as a\ncontext manager.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the input that we\u2019re loading.

  • \n
  • config (Optional[Any]) \u2013 The config attached to the input that we\u2019re loading.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata that is assigned to the\nInputDefinition that we\u2019re loading for.

  • \n
  • upstream_output (Optional[OutputContext]) \u2013 Info about the output that produced the object\nwe\u2019re loading.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this input.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 The resource config to make available from the\ninput context. This usually corresponds to the config provided to the resource that\nloads the input manager.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • asset_key (Optional[Union[AssetKey, Sequence[str], str]]) \u2013 The asset key attached to the InputDefinition.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that\u2019s loading the input.

  • \n
  • step_context (Optional[StepExecutionContext]) \u2013 For internal use.

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
  • asset_partition_key_range (Optional[str]) \u2013 The range of asset partition keys to load.

  • \n
  • asset_partitions_def (Optional[PartitionsDefinition]) \u2013 The PartitionsDefinition of the asset\nbeing loaded.

  • \n
\n
\n
\n

Examples

\n
build_input_context()\n\nwith build_input_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\ndagster.build_output_context(step_key=None, name=None, metadata=None, run_id=None, mapping_key=None, config=None, dagster_type=None, version=None, resource_config=None, resources=None, op_def=None, asset_key=None, partition_key=None)[source]\u00b6
\n

Builds output context from provided parameters.

\n

build_output_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_output_context must be used as a\ncontext manager.

\n
\n
Parameters:
\n
    \n
  • step_key (Optional[str]) \u2013 The step_key for the compute step that produced the output.

  • \n
  • name (Optional[str]) \u2013 The name of the output that produced the output.

  • \n
  • metadata (Optional[Mapping[str, Any]]) \u2013 A dict of the metadata that is assigned to the\nOutputDefinition that produced the output.

  • \n
  • mapping_key (Optional[str]) \u2013 The key that identifies a unique mapped output. None for regular outputs.

  • \n
  • config (Optional[Any]) \u2013 The configuration for the output.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this output.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the output.

  • \n
  • resource_config (Optional[Mapping[str, Any]]) \u2013 The resource config to make available from the\noutput context. This usually corresponds to the config provided to the resource that\nloads the output manager.

  • \n
  • resources (Optional[Resources]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that produced the output.

  • \n
  • asset_key (Optional[Union[AssetKey, Sequence[str], str]]) \u2013 The asset key corresponding to the\noutput.

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
\n
\n
\n

Examples

\n
build_output_context()\n\nwith build_output_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\n

Built-in IO Managers\u00b6

\n
\n
\ndagster.FilesystemIOManager IOManagerDefinition[source]\u00b6
\n

Built-in filesystem IO manager that stores and retrieves values using pickling.

\n

The base directory that the pickle files live inside is determined by:

\n
    \n
  • The IO manager\u2019s \u201cbase_dir\u201d configuration value, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the value for \u201clocal_artifact_storage\u201d in your dagster.yaml\nfile, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the directory that the DAGSTER_HOME environment variable\npoints to, if that environment variable is specified. Otherwise\u2026

  • \n
  • A temporary directory.

  • \n
\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nSo, with a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach an IO manager to a set of assets using the reserved resource key "io_manager".

  2. \n
\n
from dagster import Definitions, asset, FilesystemIOManager\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n    },\n)\n
\n
\n

2. Specify a job-level IO manager using the reserved resource key "io_manager",\nwhich will set the given IO manager on all ops in a job.

\n
from dagster import FilesystemIOManager, job, op\n\n@op\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(\n    resource_defs={\n        "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n    }\n)\ndef job():\n    op_b(op_a())\n
\n
\n

3. Specify IO manager on Out, which allows you to set different IO managers on\ndifferent step outputs.

\n
from dagster import FilesystemIOManager, job, op, Out\n\n@op(out=Out(io_manager_key="my_io_manager"))\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(resource_defs={"my_io_manager": FilesystemIOManager()})\ndef job():\n    op_b(op_a())\n
\n
\n
\n\n
\n
\ndagster.InMemoryIOManager IOManagerDefinition[source]\u00b6
\n

I/O manager that stores and retrieves values in memory. After execution is complete, the values will\nbe garbage-collected. Note that this means that each run will not have access to values from previous runs.

\n
\n\n

The UPathIOManager can be used to easily define filesystem-based IO Managers.

\n
\n
\nclass dagster.UPathIOManager(base_path=None)[source]\u00b6
\n

Abstract IOManager base class compatible with local and cloud storage via universal-pathlib and fsspec.

\n
\n
Features:
    \n
  • handles partitioned assets

  • \n
  • handles loading a single upstream partition

  • \n
  • handles loading multiple upstream partitions (with respect to PartitionMapping)

  • \n
  • supports loading multiple partitions concurrently with async load_from_path method

  • \n
  • the get_metadata method can be customized to add additional metadata to the output

  • \n
  • the allow_missing_partitions metadata value can be set to True to skip missing partitions\n(the default behavior is to raise an error)

  • \n
\n
\n
\n
\n\n
\n
\n

Input Managers (Experimental)\u00b6

\n

Input managers load inputs from either upstream outputs or from provided default values.

\n
\n
\n@dagster.input_manager(config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define an input manager.

\n

Input managers load op inputs, either from upstream outputs or by providing default values.

\n

The decorated function should accept an InputContext and resource config, and return\na loaded object that will be passed into one of the inputs of an op.

\n

The decorator produces an InputManagerDefinition.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource-level config. If not\nset, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 A schema for the input-level config. Each\ninput that uses this input manager can be configured separately using this config.\nIf not set, Dagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the input\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) the version of the input manager definition.

  • \n
\n
\n
\n

Examples:

\n
from dagster import input_manager, op, job, In\n\n@input_manager\ndef csv_loader(_):\n    return read_csv("some/path")\n\n@op(ins={"input1": In(input_manager_key="csv_loader_key")})\ndef my_op(_, input1):\n    do_stuff(input1)\n\n@job(resource_defs={"csv_loader_key": csv_loader})\ndef my_job():\n    my_op()\n\n@input_manager(config_schema={"base_dir": str})\ndef csv_loader(context):\n    return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n@input_manager(input_config_schema={"path": str})\ndef csv_loader(context):\n    return read_csv(context.config["path"])\n
\n
\n
\n\n
\n
\nclass dagster.InputManager[source]\u00b6
\n

Base interface for classes that are responsible for loading op inputs.

\n
\n\n
\n
\nclass dagster.InputManagerDefinition(resource_fn, config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Definition of an input manager resource.

\n

Input managers load op inputs.

\n

An InputManagerDefinition is a ResourceDefinition whose resource_fn returns an\nInputManager.

\n

The easiest way to create an InputManagerDefinition is with the\n@input_manager decorator.

\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster.fs_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in filesystem IO manager that stores and retrieves values using pickling.

\n

The base directory that the pickle files live inside is determined by:

\n
    \n
  • The IO manager\u2019s \u201cbase_dir\u201d configuration value, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the value for \u201clocal_artifact_storage\u201d in your dagster.yaml\nfile, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the directory that the DAGSTER_HOME environment variable\npoints to, if that environment variable is specified. Otherwise\u2026

  • \n
  • A temporary directory.

  • \n
\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nSo, with a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach an IO manager to a set of assets using the reserved resource key "io_manager".

  2. \n
\n
from dagster import Definitions, asset, fs_io_manager\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n    },\n)\n
\n
\n

2. Specify a job-level IO manager using the reserved resource key "io_manager",\nwhich will set the given IO manager on all ops in a job.

\n
from dagster import fs_io_manager, job, op\n\n@op\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(\n    resource_defs={\n        "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n    }\n)\ndef job():\n    op_b(op_a())\n
\n
\n

3. Specify IO manager on Out, which allows you to set different IO managers on\ndifferent step outputs.

\n
from dagster import fs_io_manager, job, op, Out\n\n@op(out=Out(io_manager_key="my_io_manager"))\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(resource_defs={"my_io_manager": fs_io_manager})\ndef job():\n    op_b(op_a())\n
\n
\n
\n\n
\n
\ndagster.mem_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in IO manager that stores and retrieves values in memory.

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/io-managers", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../partitions/", "title": "Partitions Definitions"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../ops/", "title": "Ops"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/partitions", "Partitions Definitions", "N", "next"], ["sections/api/apidocs/ops", "Ops", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/io-managers.rst.txt", "title": "IO Managers", "toc": "\n"}, "jobs": {"alabaster_version": "0.7.13", "body": "
\n

Jobs\u00b6

\n

A Job binds a Graph and the resources it needs to be executable.

\n

Jobs are created by calling GraphDefinition.to_job() on a graph instance, or using the job decorator.

\n
\n
\n@dagster.job(compose_fn=None, *, name=None, description=None, resource_defs=None, config=None, tags=None, metadata=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, partitions_def=None, input_values=None)[source]\u00b6
\n

Creates a job with the specified parameters from the decorated graph/op invocation function.

\n

Using this decorator allows you to build an executable job by writing a function that invokes\nops (or graphs).

\n
\n
Parameters:
\n
    \n
  • compose_fn (Callable[..., Any]) \u2013 The decorated function. The body should contain op or graph invocations. Unlike op\nfunctions, does not accept a context argument.

  • \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Mapping[str, object]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a RunConfig object is provided, then it will be used directly as the run config\nfor the job whenever the job is executed, similar to providing a dictionary.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, which should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagster UI, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\nKeys must be strings, and values must be python primitive types or one of the provided\nMetadataValue types

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multiprocess_executor.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use asset versioning instead.) Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition keys\nthat can parameterize the job. If this argument is supplied, the config argument\ncan\u2019t also be supplied.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(in1):\n    return in1 + 1\n\n@job\ndef job1():\n    add_one(return_one())\n
\n
\n
\n\n
\n
\nclass dagster.JobDefinition(*, graph_def, resource_defs=None, executor_def=None, logger_defs=None, name=None, config=None, description=None, partitions_def=None, tags=None, metadata=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _subset_selection_data=None, asset_layer=None, input_values=None, _was_explicitly_provided_resources=None)[source]\u00b6
\n

Defines a Dagster job.

\n
\n
\nproperty config_mapping\u00b6
\n

The config mapping for the job, if it has one.

\n

A config mapping defines a way to map a top-level config schema to run config for the job.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, asset_selection=None, run_id=None, input_values=None, tags=None, resources=None)[source]\u00b6
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
\n
Parameters:
\n
    \n
  • (Optional[Mapping[str (run_config) \u2013 The configuration for the run

  • \n
  • Any]] \u2013 The configuration for the run

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key (Optional[str]) \u2013 The string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[Sequence[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the job. Input values provided here will override input values that have been provided to the job directly.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty executor_def\u00b6
\n

Returns the default ExecutorDefinition for the job.

\n

If the user has not specified an executor definition, then this will default to the multi_or_in_process_executor(). If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty has_specified_executor\u00b6
\n

Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty has_specified_loggers\u00b6
\n

Returns True if the job explicitly set loggers, and False if loggers were inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty loggers\u00b6
\n

Returns the set of LoggerDefinition objects specified on the job.

\n

If the user has not specified a mapping of LoggerDefinition objects, then this will default to the colored_console_logger() under the key console. If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty partitioned_config\u00b6
\n

The partitioned config for the job, if it has one.

\n

A partitioned config defines a way to map partition keys to run config for the job.

\n
\n\n
\n
\nproperty partitions_def\u00b6
\n

Returns the PartitionsDefinition for the job, if it has one.

\n

A partitions definition defines the set of partition keys the job operates on.

\n
\n\n
\n
\nproperty resource_defs\u00b6
\n

Returns the set of ResourceDefinition objects specified on the job.

\n

This may not be the complete set of resources required by the job, since those can also be provided on the Definitions object the job may be provided to.

\n
\n\n
\n
\nrun_request_for_partition(partition_key, run_key=None, tags=None, asset_selection=None, run_config=None, current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0.0. Directly instantiate RunRequest(partition_key=...) instead.\n \n

\n

Creates a RunRequest object for a run that processes the given partition.

\n
\n
Parameters:
\n
    \n
  • partition_key \u2013 The key of the partition to request a run for.

  • \n
  • run_key (Optional[str]) \u2013 A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the launched run.

  • \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Configuration for the run. If the job has\na PartitionedConfig, this value will replace the config\nprovided by it.

  • \n
  • current_time (Optional[datetime]) \u2013 Used to determine which time-partitions exist.\nDefaults to now.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

an object that requests a run to process the given partition.

\n
\n
Return type:
\n

RunRequest

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Apply a set of hooks to all op instances within the job.

\n
\n\n
\n
\nwith_top_level_resources(resource_defs)[source]\u00b6
\n

Apply a set of resources to all op instances within the job.

\n
\n\n
\n\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]
\n

Create a ReconstructableJob from a\nfunction that returns a JobDefinition,\nor a function decorated with @job.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job() to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs,\nsuch as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples

\n
from dagster import graph, job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\ndagster.build_reconstructable_job(reconstructor_module_name, reconstructor_function_name, reconstructable_args=None, reconstructable_kwargs=None, reconstructor_working_directory=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a dagster._core.definitions.reconstructable.ReconstructableJob.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or in\ndifferent systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

This function allows you to use the strategy of your choice for reconstructing jobs, so\nthat you can reconstruct certain kinds of jobs that are not supported by\nreconstructable(), such as those defined by lambdas, in nested scopes (e.g.,\ndynamically within a method call), or in interactive environments such as the Python REPL or\nJupyter notebooks.

\n

If you need to reconstruct jobs constructed in these ways, use this function instead of\nreconstructable().

\n
\n
Parameters:
\n
    \n
  • reconstructor_module_name (str) \u2013 The name of the module containing the function to use to\nreconstruct the job.

  • \n
  • reconstructor_function_name (str) \u2013 The name of the function to use to reconstruct the\njob.

  • \n
  • reconstructable_args (Tuple) \u2013 Args to the function to use to reconstruct the job.\nValues of the tuple must be JSON serializable.

  • \n
  • reconstructable_kwargs (Dict[str, Any]) \u2013 Kwargs to the function to use to reconstruct the\njob. Values of the dict must be JSON serializable.

  • \n
\n
\n
\n

Examples

\n
# module: mymodule\n\nfrom dagster import JobDefinition, job, build_reconstructable_job\n\nclass JobFactory:\n    def make_job(*args, **kwargs):\n\n        @job\n        def _job(...):\n            ...\n\n        return _job\n\ndef reconstruct_job(*args):\n    factory = JobFactory()\n    return factory.make_job(*args)\n\nfactory = JobFactory()\n\nfoo_job_args = (...,...)\n\nfoo_job_kwargs = {...:...}\n\nfoo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\nreconstructable_foo_job = build_reconstructable_job(\n    'mymodule',\n    'reconstruct_job',\n    foo_job_args,\n    foo_job_kwargs,\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/jobs", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../loggers/", "title": "Loggers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../internals/", "title": "Internals"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/loggers", "Loggers", "N", "next"], ["sections/api/apidocs/internals", "Internals", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/jobs.rst.txt", "title": "Jobs", "toc": "\n"}, "libraries": {"dagster-airbyte": {"alabaster_version": "0.7.13", "body": "
\n

Airbyte (dagster-airbyte)\u00b6

\n

This library provides a Dagster integration with Airbyte.

\n

For more information on getting started, see the Airbyte integration guide.

\n
\n

Resources\u00b6

\n
\n
\ndagster_airbyte.AirbyteResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Airbyte API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
request_timeout (dagster.IntSource, optional):
\n

Time (in seconds) after which the requests to Airbyte are declared timed out.

\n

Default Value: 15

\n
\n
cancel_sync_on_run_termination (dagster.BoolSource, optional):
\n

Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may be useful to disable if using Airbyte sources that cannot be cancelled and resumed easily, or if your Dagster deployment may experience runner interruptions that do not impact your Airbyte deployment.

\n

Default Value: True

\n
\n
poll_interval (Float, optional):
\n

Time (in seconds) to wait between checking a sync\u2019s status.

\n

Default Value: 10

\n
\n
host (dagster.StringSource):
\n

The Airbyte server address.

\n
\n
port (dagster.StringSource):
\n

Port used for the Airbyte server.

\n
\n
username (Union[dagster.StringSource, None], optional):
\n

Username if using basic auth.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password if using basic auth.

\n
\n
use_https (dagster.BoolSource, optional):
\n

Whether to use HTTPS to connect to the Airbyte server.

\n

Default Value: False

\n
\n
forward_logs (dagster.BoolSource, optional):
\n

Whether to forward Airbyte logs to the compute log, can be expensive for long-running syncs.

\n

Default Value: True

\n
\n
request_additional_params (dict, optional):
\n

Any additional kwargs to pass to the requests library when making requests to Airbyte.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource allows users to programmatically interface with the Airbyte REST API to launch\nsyncs and monitor their progress.

\n

Examples:

\n
from dagster import Definitions, EnvVar\nfrom dagster_airbyte import AirbyteResource, build_airbyte_assets\n\nmy_airbyte_resource = AirbyteResource(\n    host=EnvVar("AIRBYTE_HOST"),\n    port=EnvVar("AIRBYTE_PORT"),\n    # If using basic auth\n    username=EnvVar("AIRBYTE_USERNAME"),\n    password=EnvVar("AIRBYTE_PASSWORD"),\n)\n\nairbyte_assets = build_airbyte_assets(\n    connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n    destination_tables=["releases", "tags", "teams"],\n)\n\ndefs = Definitions(\n    assets=[airbyte_assets],\n    resources={"airbyte": my_airbyte_resource},\n)\n
\n
\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_airbyte.load_assets_from_airbyte_instance(airbyte, workspace_id=None, key_prefix=None, create_assets_for_normalization_tables=True, connection_to_group_fn=<function _clean_name>, io_manager_key=None, connection_to_io_manager_key_fn=None, connection_filter=None, connection_to_asset_key_fn=None, connection_to_freshness_policy_fn=None, connection_to_auto_materialize_policy_fn=None)[source]\u00b6
\n

Loads Airbyte connection assets from a configured AirbyteResource instance. This fetches information\nabout defined connections at initialization time, and will error on workspace load if the Airbyte\ninstance is not reachable.

\n
\n
Parameters:
\n
    \n
  • airbyte (ResourceDefinition) \u2013 An AirbyteResource configured with the appropriate connection\ndetails.

  • \n
  • workspace_id (Optional[str]) \u2013 The ID of the Airbyte workspace to load connections from. Only\nrequired if multiple workspaces exist in your instance.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • create_assets_for_normalization_tables (bool) \u2013 If True, assets will be created for tables\ncreated by Airbyte\u2019s normalization feature. If False, only the destination tables\nwill be created. Defaults to True.

  • \n
  • connection_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Airbyte connection name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The I/O manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.

  • \n
  • connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nI/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]) \u2013 Optional function which takes\nin connection metadata and returns False if the connection should be excluded from the output assets.

  • \n
  • connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]) \u2013 Optional function which\ntakes in connection metadata and table name and returns an asset key for the table. If None, the default asset\nkey is based on the table name. Any asset key prefix will be applied to the output of this function.

  • \n
  • connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]) \u2013 Optional function\nwhich takes in connection metadata and returns a freshness policy for the connection\u2019s assets. If None, no freshness policies\nwill be applied to the assets.

  • \n
  • connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]) \u2013 Optional\nfunction which takes in connection metadata and returns an auto materialization policy for the connection\u2019s assets. If None, no\nauto materialization policies will be applied to the assets.

  • \n
\n
\n
\n

Examples:

\n

Loading all Airbyte connections as assets:

\n
from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\nairbyte_instance = airbyte_resource.configured(\n    {\n        "host": "localhost",\n        "port": "8000",\n    }\n)\nairbyte_assets = load_assets_from_airbyte_instance(airbyte_instance)\n
\n
\n

Filtering the set of loaded connections:

\n
from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\nairbyte_instance = airbyte_resource.configured(\n    {\n        "host": "localhost",\n        "port": "8000",\n    }\n)\nairbyte_assets = load_assets_from_airbyte_instance(\n    airbyte_instance,\n    connection_filter=lambda meta: "snowflake" in meta.name,\n)\n
\n
\n
\n\n
\n
\ndagster_airbyte.load_assets_from_airbyte_project(project_dir, workspace_id=None, key_prefix=None, create_assets_for_normalization_tables=True, connection_to_group_fn=<function _clean_name>, io_manager_key=None, connection_to_io_manager_key_fn=None, connection_filter=None, connection_directories=None, connection_to_asset_key_fn=None, connection_to_freshness_policy_fn=None, connection_to_auto_materialize_policy_fn=None)[source]\u00b6
\n

Loads an Airbyte project into a set of Dagster assets.

\n

Point to the root folder of an Airbyte project synced using the Octavia CLI. For\nmore information, see https://github.com/airbytehq/airbyte/tree/master/octavia-cli#octavia-import-all.

\n
\n
Parameters:
\n
    \n
  • project_dir (str) \u2013 The path to the root of your Airbyte project, containing sources, destinations,\nand connections folders.

  • \n
  • workspace_id (Optional[str]) \u2013 The ID of the Airbyte workspace to load connections from. Only\nrequired if multiple workspace state YAML files exist in the project.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • create_assets_for_normalization_tables (bool) \u2013 If True, assets will be created for tables\ncreated by Airbyte\u2019s normalization feature. If False, only the destination tables\nwill be created. Defaults to True.

  • \n
  • connection_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Airbyte connection name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The I/O manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.

  • \n
  • connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nI/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]) \u2013 Optional function which\ntakes in connection metadata and returns False if the connection should be excluded from the output assets.

  • \n
  • connection_directories (Optional[List[str]]) \u2013 Optional list of connection directories to load assets from.\nIf omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter\nif the project has many connections or if the connection yaml files are large.

  • \n
  • connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]) \u2013 Optional function which\ntakes in connection metadata and table name and returns an asset key for the table. If None, the default asset\nkey is based on the table name. Any asset key prefix will be applied to the output of this function.

  • \n
  • connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]) \u2013 Optional function which takes in connection metadata and returns a freshness policy for the connection\u2019s assets.\nIf None, no freshness policies will be applied to the assets.

  • \n
  • connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]) \u2013 Optional function which takes in connection metadata and returns an auto materialization policy for the connection\u2019s assets.\nIf None, no auto materialization policies will be applied to the assets.

  • \n
\n
\n
\n

Examples:

\n

Loading all Airbyte connections as assets:

\n
from dagster_airbyte import load_assets_from_airbyte_project\n\nairbyte_assets = load_assets_from_airbyte_project(\n    project_dir="path/to/airbyte/project",\n)\n
\n
\n

Filtering the set of loaded connections:

\n
from dagster_airbyte import load_assets_from_airbyte_project\n\nairbyte_assets = load_assets_from_airbyte_project(\n    project_dir="path/to/airbyte/project",\n    connection_filter=lambda meta: "snowflake" in meta.name,\n)\n
\n
\n
\n\n
\n
\ndagster_airbyte.build_airbyte_assets(connection_id, destination_tables, asset_key_prefix=None, group_name=None, normalization_tables=None, deps=None, upstream_assets=None, schema_by_table_name=None, freshness_policy=None)[source]\u00b6
\n

Builds a set of assets representing the tables created by an Airbyte sync operation.

\n
\n
Parameters:
\n
    \n
  • connection_id (str) \u2013 The Airbyte Connection ID that this op will sync. You can retrieve this\nvalue from the \u201cConnections\u201d tab of a given connector in the Airbyte UI.

  • \n
  • destination_tables (List[str]) \u2013 The names of the tables that you want to be represented\nin the Dagster asset graph for this sync. This will generally map to the name of the\nstream in Airbyte, unless a stream prefix has been specified in Airbyte.

  • \n
  • normalization_tables (Optional[Mapping[str, List[str]]]) \u2013 If you are using Airbyte\u2019s\nnormalization feature, you may specify a mapping of destination table to a list of\nderived tables that will be created by the normalization process.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([table_name]).

  • \n
  • deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, str, AssetKey]]]) \u2013 A list of assets to add as sources.

  • \n
  • upstream_assets (Optional[Set[AssetKey]]) \u2013 Deprecated, use deps instead. A list of assets to add as sources.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 A freshness policy to apply to the assets

  • \n
\n
\n
\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_airbyte.airbyte_sync_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection_id (dagster.StringSource):
\n

The Airbyte Connection ID that this op will sync. You can retrieve this value from the \u201cConnections\u201d tab of a given connector in the Airbyte UI.

\n
\n
poll_interval (Float, optional):
\n

The time (in seconds) to wait between successive checks of the sync\u2019s status.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time that will be waited before this operation is timed out. By default, this will never time out.

\n
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the Airbyte sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[dagster.StringSource], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018airbyte\u2019]

\n
\n
\n

Executes an Airbyte job sync for a given connection_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs an AirbyteOutput which contains\nthe job details for a given connection_id.

\n

It requires the use of the airbyte_resource, which allows it to\ncommunicate with the Airbyte API.

\n

Examples

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource, airbyte_sync_op\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n    }\n)\n\nsync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_simple_airbyte_job():\n    sync_foobar()\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_composed_airbyte_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Managed Config\u00b6

\n

The following APIs are used as part of the experimental ingestion-as-code functionality.\nFor more information, see the Airbyte ingestion as code guide.

\n
\n
\nclass dagster_airbyte.AirbyteManagedElementReconciler(airbyte, connections, delete_unmentioned_resources=False)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Reconciles Python-specified Airbyte connections with an Airbyte instance.

\n

Passing the module containing an AirbyteManagedElementReconciler to the dagster-airbyte\nCLI will allow you to check the state of your Python-code-specified Airbyte connections\nagainst an Airbyte instance, and reconcile them if necessary.

\n

This functionality is experimental and subject to change.

\n
\n
\n__init__(airbyte, connections, delete_unmentioned_resources=False)[source]\u00b6
\n

Reconciles Python-specified Airbyte connections with an Airbyte instance.

\n
\n
Parameters:
\n
    \n
  • airbyte (Union[AirbyteResource, ResourceDefinition]) \u2013 The Airbyte resource definition to reconcile against.

  • \n
  • connections (Iterable[AirbyteConnection]) \u2013 The Airbyte connection objects to reconcile.

  • \n
  • delete_unmentioned_resources (bool) \u2013 Whether to delete resources that are not mentioned in\nthe set of connections provided. When True, all Airbyte instance contents are effectively\nmanaged by the reconciler. Defaults to False.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\ndagster_airbyte.load_assets_from_connections(airbyte, connections, key_prefix=None, create_assets_for_normalization_tables=True, connection_to_group_fn=<function _clean_name>, io_manager_key=None, connection_to_io_manager_key_fn=None, connection_to_asset_key_fn=None, connection_to_freshness_policy_fn=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Loads Airbyte connection assets from a configured AirbyteResource instance, checking against a list of AirbyteConnection objects.\nThis method will raise an error on repo load if the passed AirbyteConnection objects are not in sync with the Airbyte instance.

\n
\n
Parameters:
\n
    \n
  • airbyte (Union[AirbyteResource, ResourceDefinition]) \u2013 An AirbyteResource configured with the appropriate connection\ndetails.

  • \n
  • connections (Iterable[AirbyteConnection]) \u2013 A list of AirbyteConnection objects to build assets for.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • create_assets_for_normalization_tables (bool) \u2013 If True, assets will be created for tables\ncreated by Airbyte\u2019s normalization feature. If False, only the destination tables\nwill be created. Defaults to True.

  • \n
  • connection_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Airbyte connection name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.

  • \n
  • connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nIO manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]) \u2013 Optional function which\ntakes in connection metadata and table name and returns an asset key for the table. If None, the default asset\nkey is based on the table name. Any asset key prefix will be applied to the output of this function.

  • \n
  • connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]) \u2013 Optional function which\ntakes in connection metadata and returns a freshness policy for the connection. If None, no freshness policy will be applied.

  • \n
\n
\n
\n

Examples:

\n
from dagster_airbyte import (\n    AirbyteConnection,\n    AirbyteResource,\n    load_assets_from_connections,\n)\n\nairbyte_instance = AirbyteResource(\n    host="localhost",\n    port="8000",\n)\nairbyte_connections = [\n    AirbyteConnection(...),\n    AirbyteConnection(...)\n]\nairbyte_assets = load_assets_from_connections(airbyte_instance, airbyte_connections)\n
\n
\n
\n\n
\n
\nclass dagster_airbyte.AirbyteConnection(name, source, destination, stream_config, normalize_data=None, destination_namespace=AirbyteDestinationNamespace.SAME_AS_SOURCE, prefix=None)[source]\u00b6
\n

A user-defined Airbyte connection, pairing an Airbyte source and destination and configuring\nwhich streams to sync.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The display name of the connection.

  • \n
  • source (AirbyteSource) \u2013 The source to sync from.

  • \n
  • destination (AirbyteDestination) \u2013 The destination to sync to.

  • \n
  • stream_config (Mapping[str, AirbyteSyncMode]) \u2013 A mapping from stream name to\nthe sync mode for that stream, including any additional configuration\nof primary key or cursor field.

  • \n
  • normalize_data (Optional[bool]) \u2013 Whether to normalize the data in the\ndestination.

  • \n
  • destination_namespace (Optional[Union[AirbyteDestinationNamespace, str]]) \u2013 The namespace to sync to in the destination. If set to\nAirbyteDestinationNamespace.SAME_AS_SOURCE, the namespace will be the\nsame as the source namespace. If set to\nAirbyteDestinationNamespace.DESTINATION_DEFAULT, the namespace will be\nthe default namespace for the destination. If set to a string, the\nnamespace will be that string.

  • \n
  • prefix (Optional[str]) \u2013 A prefix to add to the table names in the destination.

  • \n
\n
\n
\n

Example

\n
from dagster_airbyte.managed.generated.sources import FileSource\nfrom dagster_airbyte.managed.generated.destinations import LocalJsonDestination\nfrom dagster_airbyte import AirbyteConnection, AirbyteSyncMode\n\ncereals_csv_source = FileSource(...)\nlocal_json_destination = LocalJsonDestination(...)\n\ncereals_connection = AirbyteConnection(\n    name="download-cereals",\n    source=cereals_csv_source,\n    destination=local_json_destination,\n    stream_config={"cereals": AirbyteSyncMode.full_refresh_overwrite()},\n)\n
\n
\n
\n
\n__init__(name, source, destination, stream_config, normalize_data=None, destination_namespace=AirbyteDestinationNamespace.SAME_AS_SOURCE, prefix=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.AirbyteSource(name, source_type, source_configuration)[source]\u00b6
\n

Represents a user-defined Airbyte source.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The display name of the source.

  • \n
  • source_type (str) \u2013 The type of the source, from Airbyte\u2019s list\nof sources https://airbytehq.github.io/category/sources/.

  • \n
  • source_configuration (Mapping[str, Any]) \u2013 The configuration for the\nsource, as defined by Airbyte\u2019s API.

  • \n
\n
\n
\n
\n
\n__init__(name, source_type, source_configuration)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.AirbyteDestination(name, destination_type, destination_configuration)[source]\u00b6
\n

Represents a user-defined Airbyte destination.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The display name of the destination.

  • \n
  • destination_type (str) \u2013 The type of the destination, from Airbyte\u2019s list\nof destinations https://airbytehq.github.io/category/destinations/.

  • \n
  • destination_configuration (Mapping[str, Any]) \u2013 The configuration for the\ndestination, as defined by Airbyte\u2019s API.

  • \n
\n
\n
\n
\n
\n__init__(name, destination_type, destination_configuration)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.AirbyteSyncMode(json_repr)[source]\u00b6
\n

Represents the sync mode for a given Airbyte stream, which governs how Airbyte reads\nfrom a source and writes to a destination.

\n

For more information, see https://docs.airbyte.com/understanding-airbyte/connections/.

\n
\n
\nclassmethod full_refresh_append()[source]\u00b6
\n

Syncs the entire data stream from the source, appending rows to the destination.

\n

https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-append/

\n
\n\n
\n
\nclassmethod full_refresh_overwrite()[source]\u00b6
\n

Syncs the entire data stream from the source, replaces data in the destination by\noverwriting it.

\n

https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-overwrite

\n
\n\n
\n
\nclassmethod incremental_append(cursor_field=None)[source]\u00b6
\n

Syncs only new records from the source, appending rows to the destination.\nMay optionally specify the cursor field used to determine which records\nare new.

\n

https://docs.airbyte.com/understanding-airbyte/connections/incremental-append/

\n
\n\n
\n
\nclassmethod incremental_append_dedup(cursor_field=None, primary_key=None)[source]\u00b6
\n

Syncs new records from the source, appending to an append-only history\ntable in the destination. Also generates a deduplicated view mirroring the\nsource table. May optionally specify the cursor field used to determine\nwhich records are new, and the primary key used to determine which records\nare duplicates.

\n

https://docs.airbyte.com/understanding-airbyte/connections/incremental-append-dedup/

\n
\n\n
\n\n
\n
\n

Managed Config Generated Sources\u00b6

\n
\n
\nclass dagster_airbyte.managed.generated.sources.StravaSource(name, client_id, client_secret, refresh_token, athlete_id, start_date, auth_type=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, athlete_id, start_date, auth_type=None)[source]\u00b6
\n

Airbyte Source for Strava.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/strava

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • client_id (str) \u2013 The Client ID of your Strava developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Strava developer application.

  • \n
  • refresh_token (str) \u2013 The Refresh Token with the activity: read_all permissions.

  • \n
  • athlete_id (int) \u2013 The Athlete ID of your Strava developer application.

  • \n
  • start_date (str) \u2013 UTC date and time. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AppsflyerSource(name, app_id, api_token, start_date, timezone=None)[source]\u00b6
\n
\n
\n__init__(name, app_id, api_token, start_date, timezone=None)[source]\u00b6
\n

Airbyte Source for Appsflyer.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • app_id (str) \u2013 App identifier as found in AppsFlyer.

  • \n
  • api_token (str) \u2013 Pull API token for authentication. If you change the account admin, the token changes, and you must update scripts with the new token. Get the API token in the Dashboard.

  • \n
  • start_date (str) \u2013 The default value to use if no bookmark exists for an endpoint. Raw Reports historical lookback is limited to 90 days.

  • \n
  • timezone (Optional[str]) \u2013 Time zone in which date times are stored. The project timezone may be found in the App settings in the AppsFlyer console.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleWorkspaceAdminReportsSource(name, credentials_json, email, lookback=None)[source]\u00b6
\n
\n
\n__init__(name, credentials_json, email, lookback=None)[source]\u00b6
\n

Airbyte Source for Google Workspace Admin Reports.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-workspace-admin-reports

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • credentials_json (str) \u2013 The contents of the JSON service account key. See the docs for more information on how to generate this key.

  • \n
  • email (str) \u2013 The email of the user who has permission to access the Google Workspace Admin APIs.

  • \n
  • lookback (Optional[int]) \u2013 Sets the range of time shown in the report. The maximum value allowed by the Google API is 180 days.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CartSource(name, credentials, start_date)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date)[source]\u00b6
\n

Airbyte Source for Cart.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/cart

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate the data

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass CartSource.CentralAPIRouter(user_name, user_secret, site_id)[source]\u00b6
\n
\n
\n__init__(user_name, user_secret, site_id)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass CartSource.SingleStoreAccessToken(access_token, store_name)[source]\u00b6
\n
\n
\n__init__(access_token, store_name)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LinkedinAdsSource(name, credentials, start_date, account_ids=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, account_ids=None)[source]\u00b6
\n

Airbyte Source for Linkedin Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 UTC date in the format 2020-09-17. Any data before this date will not be replicated.

  • \n
  • account_ids (Optional[List[int]]) \u2013 Specify the account IDs, separated by a space, to pull data from. Leave empty if you want to pull data from all associated accounts. See the LinkedIn Ads docs for more info.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass LinkedinAdsSource.OAuth20(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass LinkedinAdsSource.AccessToken(access_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(access_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MongodbSource(name, host, port, database, user, password, auth_source, replica_set=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, user, password, auth_source, replica_set=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Mongodb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 Host of a Mongo database to be replicated.

  • \n
  • port (int) \u2013 Port of a Mongo database to be replicated.

  • \n
  • database (str) \u2013 Database to be replicated.

  • \n
  • user (str) \u2013 User

  • \n
  • password (str) \u2013 Password

  • \n
  • auth_source (str) \u2013 Authentication source where user information is stored. See the Mongo docs for more info.

  • \n
  • replica_set (Optional[str]) \u2013 The name of the set to filter servers by, when connecting to a replica set (Under this condition, the \u2018TLS connection\u2019 value automatically becomes \u2018true\u2019). See the Mongo docs for more info.

  • \n
  • ssl (Optional[bool]) \u2013 If this switch is enabled, TLS connections will be used to connect to MongoDB.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TimelySource(name, account_id, start_date, bearer_token)[source]\u00b6
\n
\n
\n__init__(name, account_id, start_date, bearer_token)[source]\u00b6
\n

Airbyte Source for Timely.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • account_id (str) \u2013 Timely account id

  • \n
  • start_date (str) \u2013 start date

  • \n
  • bearer_token (str) \u2013 Timely bearer token

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.StockTickerApiTutorialSource(name, stock_ticker, api_key)[source]\u00b6
\n
\n
\n__init__(name, stock_ticker, api_key)[source]\u00b6
\n

Airbyte Source for Stock Ticker Api Tutorial.

\n

Documentation can be found at https://polygon.io/docs/stocks/get_v2_aggs_grouped_locale_us_market_stocks__date

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • stock_ticker (str) \u2013 The stock ticker to track

  • \n
  • api_key (str) \u2013 The Polygon.io Stocks API key to use to hit the API.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WrikeSource(name, access_token, wrike_instance, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, wrike_instance, start_date=None)[source]\u00b6
\n

Airbyte Source for Wrike.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • access_token (str) \u2013 Permanent access token. You can find documentation on how to acquire a permanent access token here

  • \n
  • wrike_instance (str) \u2013 Wrike\u2019s instance such as app-us2.wrike.com

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Only comments after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CommercetoolsSource(name, region, host, start_date, project_key, client_id, client_secret)[source]\u00b6
\n
\n
\n__init__(name, region, host, start_date, project_key, client_id, client_secret)[source]\u00b6
\n

Airbyte Source for Commercetools.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/commercetools

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • region (str) \u2013 The region of the platform.

  • \n
  • host (str) \u2013 The cloud provider where your shop is hosted. See: https://docs.commercetools.com/api/authorization

  • \n
  • start_date (str) \u2013 The date from which you would like to replicate data. Format: YYYY-MM-DD.

  • \n
  • project_key (str) \u2013 The project key

  • \n
  • client_id (str) \u2013 Id of API Client.

  • \n
  • client_secret (str) \u2013 The password or secret of the API Client.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GutendexSource(name, author_year_start=None, author_year_end=None, copyright=None, languages=None, search=None, sort=None, topic=None)[source]\u00b6
\n
\n
\n__init__(name, author_year_start=None, author_year_end=None, copyright=None, languages=None, search=None, sort=None, topic=None)[source]\u00b6
\n

Airbyte Source for Gutendex.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/gutendex

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • author_year_start (Optional[str]) \u2013 (Optional) Defines the minimum birth year of the authors. Books by authors born prior to the start year will not be returned. Supports both positive (CE) and negative (BCE) integer values.

  • \n
  • author_year_end (Optional[str]) \u2013 (Optional) Defines the maximum birth year of the authors. Books by authors born after the end year will not be returned. Supports both positive (CE) and negative (BCE) integer values.

  • \n
  • copyright (Optional[str]) \u2013 (Optional) Use this to find books with a certain copyright status - true for books with existing copyrights, false for books in the public domain in the USA, or null for books with no available copyright information.

  • \n
  • languages (Optional[str]) \u2013 (Optional) Use this to find books in any of a list of languages. They must be comma-separated, two-character language codes.

  • \n
  • search (Optional[str]) \u2013 (Optional) Use this to search author names and book titles with given words. They must be separated by a space (i.e. %20 in URL-encoded format) and are case-insensitive.

  • \n
  • sort (Optional[str]) \u2013 (Optional) Use this to sort books - ascending for Project Gutenberg ID numbers from lowest to highest, descending for IDs highest to lowest, or popular (the default) for most popular to least popular by number of downloads.

  • \n
  • topic (Optional[str]) \u2013 (Optional) Use this to search for a case-insensitive key-phrase in books\u2019 bookshelves or subjects.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.IterableSource(name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Iterable.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/iterable

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Iterable API Key. See the docs for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Iterable, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.QuickbooksSingerSource(name, client_id, client_secret, refresh_token, realm_id, user_agent, start_date, sandbox)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, realm_id, user_agent, start_date, sandbox)[source]\u00b6
\n

Airbyte Source for Quickbooks Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/quickbooks

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • client_id (str) \u2013 Identifies which app is making the request. Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.

  • \n
  • client_secret (str) \u2013 Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.

  • \n
  • refresh_token (str) \u2013 A token used when refreshing the access token.

  • \n
  • realm_id (str) \u2013 Labeled Company ID. The Make API Calls panel is populated with the realm id and the current access token.

  • \n
  • user_agent (str) \u2013 Process and email for API logging purposes. Example: tap-quickbooks .

  • \n
  • start_date (str) \u2013 The default value to use if no bookmark exists for an endpoint (RFC 3339 date string), e.g. 2021-03-20T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • sandbox (bool) \u2013 Determines whether to use the sandbox or production environment.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BigcommerceSource(name, start_date, store_hash, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, store_hash, access_token)[source]\u00b6
\n

Airbyte Source for Bigcommerce.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bigcommerce

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you would like to replicate data. Format: YYYY-MM-DD.

  • \n
  • store_hash (str) \u2013 The hash code of the store. For https://api.bigcommerce.com/stores/HASH_CODE/v3/, the store\u2019s hash code is \u2018HASH_CODE\u2019.

  • \n
  • access_token (str) \u2013 Access Token for making authenticated requests.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ShopifySource(name, shop, credentials, start_date)[source]\u00b6
\n
\n
\n__init__(name, shop, credentials, start_date)[source]\u00b6
\n

Airbyte Source for Shopify.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/shopify

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • shop (str) \u2013 The name of your Shopify store found in the URL. For example, if your URL was https://NAME.myshopify.com, then the name would be \u2018NAME\u2019.

  • \n
  • credentials (Union[ShopifySource.APIPassword, ShopifySource.OAuth20]) \u2013 The authorization method to use to retrieve data from Shopify

  • \n
  • start_date (str) \u2013 The date you would like to replicate data from. Format: YYYY-MM-DD. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ShopifySource.APIPassword(api_password)[source]\u00b6
\n
\n
\n__init__(api_password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ShopifySource.OAuth20(client_id=None, client_secret=None, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id=None, client_secret=None, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AppstoreSingerSource(name, key_id, private_key, issuer_id, vendor, start_date)[source]\u00b6
\n
\n
\n__init__(name, key_id, private_key, issuer_id, vendor, start_date)[source]\u00b6
\n

Airbyte Source for Appstore Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/appstore

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • key_id (str) \u2013 Appstore Key ID. See the docs for more information on how to obtain this key.

  • \n
  • private_key (str) \u2013 Appstore Private Key. See the docs for more information on how to obtain this key.

  • \n
  • issuer_id (str) \u2013 Appstore Issuer ID. See the docs for more information on how to obtain this ID.

  • \n
  • vendor (str) \u2013 Appstore Vendor ID. See the docs for more information on how to obtain this ID.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GreenhouseSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Greenhouse.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/greenhouse

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Greenhouse API Key. See the docs for more information on how to generate this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZoomSingerSource(name, jwt)[source]\u00b6
\n
\n
\n__init__(name, jwt)[source]\u00b6
\n

Airbyte Source for Zoom Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zoom

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • jwt (str) \u2013 Zoom JWT Token. See the docs for more information on how to obtain this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TiktokMarketingSource(name, credentials, start_date=None, end_date=None, report_granularity=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date=None, end_date=None, report_granularity=None)[source]\u00b6
\n

Airbyte Source for Tiktok Marketing.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/tiktok-marketing

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • credentials (Union[TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken]) \u2013 Authentication method

  • \n
  • start_date (Optional[str]) \u2013 The Start Date in format: YYYY-MM-DD. Any data before this date will not be replicated. If this parameter is not set, all data will be replicated.

  • \n
  • end_date (Optional[str]) \u2013 The date until which you\u2019d like to replicate data for all incremental streams, in the format YYYY-MM-DD. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the data till the current date.

  • \n
  • report_granularity (Optional[str]) \u2013 The granularity used for aggregating performance data in reports. See the docs.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass TiktokMarketingSource.OAuth20(app_id, secret, access_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(app_id, secret, access_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass TiktokMarketingSource.SandboxAccessToken(advertiser_id, access_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(advertiser_id, access_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskChatSource(name, start_date, credentials, subdomain=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials, subdomain=None)[source]\u00b6
\n

Airbyte Source for Zendesk Chat.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-chat

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Chat API, in the format YYYY-MM-DDT00:00:00Z.

  • \n
  • subdomain (Optional[str]) \u2013 Required if you access Zendesk Chat from a Zendesk Support subdomain.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskChatSource.OAuth20(client_id=None, client_secret=None, access_token=None, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(client_id=None, client_secret=None, access_token=None, refresh_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskChatSource.AccessToken(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AwsCloudtrailSource(name, aws_key_id, aws_secret_key, aws_region_name, start_date)[source]\u00b6
\n
\n
\n__init__(name, aws_key_id, aws_secret_key, aws_region_name, start_date)[source]\u00b6
\n

Airbyte Source for Aws Cloudtrail.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/aws-cloudtrail

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • aws_key_id (str) \u2013 AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.

  • \n
  • aws_secret_key (str) \u2013 AWS CloudTrail Secret Access Key. See the docs for more information on how to obtain this key.

  • \n
  • aws_region_name (str) \u2013 The default AWS Region to use, for example, us-west-1 or us-west-2. When specifying a Region inline during client initialization, this property is named region_name.

  • \n
  • start_date (str) \u2013 The date from which you would like to replicate data. Data in AWS CloudTrail is available for the last 90 days only. Format: YYYY-MM-DD.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OktaSource(name, credentials, domain=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, domain=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Okta.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/okta

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • domain (Optional[str]) \u2013 The Okta domain. See the docs for instructions on how to find it.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format YYYY-MM-DDTHH:MM:SSZ. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass OktaSource.OAuth20(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OktaSource.APIToken(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.InsightlySource(name, token=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, token=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Insightly.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/insightly

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • token (Optional[str]) \u2013 Your Insightly API token.

  • \n
  • start_date (Optional[str]) \u2013 The date from which you\u2019d like to replicate data for Insightly in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only for incremental streams.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LinkedinPagesSource(name, org_id, credentials)[source]\u00b6
\n
\n
\n__init__(name, org_id, credentials)[source]\u00b6
\n

Airbyte Source for Linkedin Pages.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-pages/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • org_id (int) \u2013 Specify the Organization ID

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass LinkedinPagesSource.OAuth20(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass LinkedinPagesSource.AccessToken(access_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(access_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PersistiqSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Persistiq.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/persistiq

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 PersistIq API Key. See the docs for more information on where to find that key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshcallerSource(name, domain, api_key, start_date, requests_per_minute=None, sync_lag_minutes=None)[source]\u00b6
\n
\n
\n__init__(name, domain, api_key, start_date, requests_per_minute=None, sync_lag_minutes=None)[source]\u00b6
\n

Airbyte Source for Freshcaller.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshcaller

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • domain (str) \u2013 Used to construct Base URL for the Freshcaller APIs

  • \n
  • api_key (str) \u2013 Freshcaller API Key. See the docs for more information on how to obtain this key.

  • \n
  • requests_per_minute (Optional[int]) \u2013 The number of requests per minute that this source is allowed to use. There is a rate limit of 50 requests per minute per app per account.

  • \n
  • start_date (str) \u2013 UTC date and time. Any data created after this date will be replicated.

  • \n
  • sync_lag_minutes (Optional[int]) \u2013 Lag in minutes for each sync, i.e., at time T, data for the time range [prev_sync_time, T-30] will be fetched

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AppfollowSource(name, ext_id, cid, api_secret, country)[source]\u00b6
\n
\n
\n__init__(name, ext_id, cid, api_secret, country)[source]\u00b6
\n

Airbyte Source for Appfollow.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/appfollow

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • ext_id (str) \u2013 For the App Store, this is a 9-10 digit identification number; for Google Play, this is the bundle name.

  • \n
  • cid (str) \u2013 client id provided by Appfollow

  • \n
  • api_secret (str) \u2013 api secret provided by Appfollow

  • \n
  • country (str) \u2013 The country for which to get data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FacebookPagesSource(name, access_token, page_id)[source]\u00b6
\n
\n
\n__init__(name, access_token, page_id)[source]\u00b6
\n

Airbyte Source for Facebook Pages.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-pages

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • access_token (str) \u2013 Facebook Page Access Token

  • \n
  • page_id (str) \u2013 Page ID

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.JiraSource(name, api_token, domain, email, projects=None, start_date=None, additional_fields=None, expand_issue_changelog=None, render_fields=None, enable_experimental_streams=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, domain, email, projects=None, start_date=None, additional_fields=None, expand_issue_changelog=None, render_fields=None, enable_experimental_streams=None)[source]\u00b6
\n

Airbyte Source for Jira.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/jira

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_token (str) \u2013 Jira API Token. See the docs for more information on how to generate this key.

  • \n
  • domain (str) \u2013 The Domain for your Jira account, e.g. airbyteio.atlassian.net

  • \n
  • email (str) \u2013 The user email for your Jira account.

  • \n
  • projects (Optional[List[str]]) \u2013 List of Jira project keys to replicate data for.

  • \n
  • start_date (Optional[str]) \u2013 The date from which you\u2019d like to replicate data for Jira in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only in the following incremental streams: issues.

  • \n
  • additional_fields (Optional[List[str]]) \u2013 List of additional fields to include in replicating issues.

  • \n
  • expand_issue_changelog (Optional[bool]) \u2013 Expand the changelog when replicating issues.

  • \n
  • render_fields (Optional[bool]) \u2013 Render issue fields in HTML format in addition to Jira JSON-like format.

  • \n
  • enable_experimental_streams (Optional[bool]) \u2013 Allow the use of experimental streams which rely on undocumented Jira API endpoints. See https://docs.airbyte.com/integrations/sources/jira#experimental-tables for more info.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleSheetsSource(name, spreadsheet_id, credentials, row_batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, spreadsheet_id, credentials, row_batch_size=None)[source]\u00b6
\n

Airbyte Source for Google Sheets.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-sheets

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleSheetsSource.AuthenticateViaGoogleOAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleSheetsSource.ServiceAccountKeyAuthentication(service_account_info)[source]\u00b6
\n
\n
\n__init__(service_account_info)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DockerhubSource(name, docker_username)[source]\u00b6
\n
\n
\n__init__(name, docker_username)[source]\u00b6
\n

Airbyte Source for Dockerhub.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/dockerhub

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.UsCensusSource(name, query_path, api_key, query_params=None)[source]\u00b6
\n
\n
\n__init__(name, query_path, api_key, query_params=None)[source]\u00b6
\n

Airbyte Source for Us Census.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/us-census

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • query_params (Optional[str]) \u2013 The query parameters portion of the GET request, without the api key

  • \n
  • query_path (str) \u2013 The path portion of the GET request

  • \n
  • api_key (str) \u2013 Your API Key. Get your key here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KustomerSingerSource(name, api_token, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_token, start_date)[source]\u00b6
\n

Airbyte Source for Kustomer Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/kustomer

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_token (str) \u2013 Kustomer API Token. See the docs on how to obtain this

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate the data

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AzureTableSource(name, storage_account_name, storage_access_key, storage_endpoint_suffix=None)[source]\u00b6
\n
\n
\n__init__(name, storage_account_name, storage_access_key, storage_endpoint_suffix=None)[source]\u00b6
\n

Airbyte Source for Azure Table.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • storage_account_name (str) \u2013 The name of your storage account.

  • \n
  • storage_access_key (str) \u2013 Azure Table Storage Access Key. See the docs for more information on how to obtain this key.

  • \n
  • storage_endpoint_suffix (Optional[str]) \u2013 Azure Table Storage service account URL suffix. See the docs for more information on how to obtain the endpoint suffix.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ScaffoldJavaJdbcSource(name, host, port, database, username, replication_method, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, replication_method, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Scaffold Java Jdbc.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/scaffold_java_jdbc

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3)

  • \n
  • replication_method (str) \u2013 Replication method to use for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses the Binlog to detect inserts, updates, and deletes. This needs to be configured on the source database itself.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TidbSource(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Tidb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/tidb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3)

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.QualarooSource(name, token, key, start_date, survey_ids=None)[source]\u00b6
\n
\n
\n__init__(name, token, key, start_date, survey_ids=None)[source]\u00b6
\n

Airbyte Source for Qualaroo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/qualaroo

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • token (str) \u2013 A Qualaroo token. See the docs for instructions on how to generate it.

  • \n
  • key (str) \u2013 A Qualaroo token. See the docs for instructions on how to generate it.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • survey_ids (Optional[List[str]]) \u2013 IDs of the surveys from which you\u2019d like to replicate data. If left empty, data from all surveys to which you have access will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.YahooFinancePriceSource(name, tickers, interval=None, range=None)[source]\u00b6
\n
\n
\n__init__(name, tickers, interval=None, range=None)[source]\u00b6
\n

Airbyte Source for Yahoo Finance Price.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • tickers (str) \u2013 Comma-separated identifiers for the stocks to be queried. Whitespaces are allowed.

  • \n
  • interval (Optional[str]) \u2013 The interval between prices queried.

  • \n
  • range (Optional[str]) \u2013 The range of prices to be queried.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source(name, credentials, start_date, view_id, custom_reports=None, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, view_id, custom_reports=None, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Google Analytics V4.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-universal-analytics

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Union[GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth, GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication]) \u2013 Credentials for the service

  • \n
  • start_date (str) \u2013 The date in the format YYYY-MM-DD. Any data before this date will not be replicated.

  • \n
  • view_id (str) \u2013 The ID for the Google Analytics View you want to fetch data from. This can be found from the Google Analytics Account Explorer.

  • \n
  • custom_reports (Optional[str]) \u2013 A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.

  • \n
  • window_in_days (Optional[int]) \u2013 The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. The minimum allowed value for this field is 1, and the maximum is 364.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication(credentials_json, auth_type=None)[source]\u00b6
\n
\n
\n__init__(credentials_json, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.JdbcSource(name, username, jdbc_url, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, username, jdbc_url, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Jdbc.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url (str) \u2013 JDBC formatted URL. See the standard here.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FakerSource(name, count, seed=None, records_per_sync=None, records_per_slice=None)[source]\u00b6
\n
\n
\n__init__(name, count, seed=None, records_per_sync=None, records_per_slice=None)[source]\u00b6
\n

Airbyte Source for Faker.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/faker

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • count (int) \u2013 How many users should be generated in total. This setting does not apply to the purchases or products stream.

  • \n
  • seed (Optional[int]) \u2013 Manually control the faker random seed to return the same values on subsequent runs (leave -1 for random)

  • \n
  • records_per_sync (Optional[int]) \u2013 How many fake records will be returned for each sync, for each stream? By default, it will take 2 syncs to create the requested 1000 records.

  • \n
  • records_per_slice (Optional[int]) \u2013 How many fake records will be in each page (stream slice), before a state message is emitted?

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TplcentralSource(name, url_base, client_id, client_secret, user_login_id=None, user_login=None, tpl_key=None, customer_id=None, facility_id=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, url_base, client_id, client_secret, user_login_id=None, user_login=None, tpl_key=None, customer_id=None, facility_id=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Tplcentral.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/tplcentral

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • user_login_id (Optional[int]) \u2013 User login ID and/or name is required

  • \n
  • user_login (Optional[str]) \u2013 User login ID and/or name is required

  • \n
  • start_date (Optional[str]) \u2013 Date and time together in RFC 3339 format, for example, 2018-11-13T20:20:39+00:00.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ClickhouseSource(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Clickhouse.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The host endpoint of the Clickhouse cluster.

  • \n
  • port (int) \u2013 The port of the database.

  • \n
  • database (str) \u2013 The name of the database.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshserviceSource(name, domain_name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, domain_name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Freshservice.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshservice

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain_name (str) \u2013 The name of your Freshservice domain

  • \n
  • api_key (str) \u2013 Freshservice API Key. See here. The key is case sensitive.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZenloopSource(name, api_token, date_from=None, survey_id=None, survey_group_id=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, date_from=None, survey_id=None, survey_group_id=None)[source]\u00b6
\n

Airbyte Source for Zenloop.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zenloop

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Zenloop API Token. You can get the API token in settings page here

  • \n
  • date_from (Optional[str]) \u2013 Zenloop date_from. Format: 2021-10-24T03:30:30Z or 2021-10-24. Leave empty if only data from current data should be synced

  • \n
  • survey_id (Optional[str]) \u2013 Zenloop Survey ID. Can be found here. Leave empty to pull answers from all surveys

  • \n
  • survey_group_id (Optional[str]) \u2013 Zenloop Survey Group ID. Can be found by pulling All Survey Groups via SurveyGroups stream. Leave empty to pull answers from all survey groups

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OracleSource(name, host, port, connection_data, username, encryption, password=None, schemas=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, connection_data, username, encryption, password=None, schemas=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Oracle.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/oracle

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database. Oracle Corporation recommends the following port numbers: 1521 - Default listening port for client connections to the listener. 2484 - Recommended and officially registered listening port for client connections to the listener using TCP/IP with SSL

  • \n
  • connection_data (Union[OracleSource.ServiceName, OracleSource.SystemIDSID]) \u2013 Connect data that will be used for DB connection

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas to sync from. Defaults to user. Case sensitive.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • encryption (Union[OracleSource.Unencrypted, OracleSource.NativeNetworkEncryptionNNE, OracleSource.TLSEncryptedVerifyCertificate]) \u2013 The encryption method which is used when communicating with the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass OracleSource.ServiceName(service_name, connection_type=None)[source]\u00b6
\n
\n
\n__init__(service_name, connection_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.SystemIDSID(sid, connection_type=None)[source]\u00b6
\n
\n
\n__init__(sid, connection_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.NativeNetworkEncryptionNNE(encryption_algorithm=None)[source]\u00b6
\n
\n
\n__init__(encryption_algorithm=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.TLSEncryptedVerifyCertificate(ssl_certificate)[source]\u00b6
\n
\n
\n__init__(ssl_certificate)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KlaviyoSource(name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Klaviyo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/klaviyo

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Klaviyo API Key. See our docs if you need help finding this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleDirectorySource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Google Directory.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-directory

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleDirectorySource.SignInViaGoogleOAuth(client_id, client_secret, refresh_token, credentials_title=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, credentials_title=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleDirectorySource.ServiceAccountKey(credentials_json, email, credentials_title=None)[source]\u00b6
\n
\n
\n__init__(credentials_json, email, credentials_title=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.InstagramSource(name, start_date, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_token)[source]\u00b6
\n

Airbyte Source for Instagram.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/instagram

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for User Insights, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • access_token (str) \u2013 The value of the access token generated. See the docs for more information

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ShortioSource(name, domain_id, secret_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, domain_id, secret_key, start_date)[source]\u00b6
\n

Airbyte Source for Shortio.

\n

Documentation can be found at https://developers.short.io/reference

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • secret_key (str) \u2013 Short.io Secret Key

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SquareSource(name, is_sandbox, credentials, start_date=None, include_deleted_objects=None)[source]\u00b6
\n
\n
\n__init__(name, is_sandbox, credentials, start_date=None, include_deleted_objects=None)[source]\u00b6
\n

Airbyte Source for Square.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/square

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • is_sandbox (bool) \u2013 Determines whether to use the sandbox or production environment.

  • \n
  • start_date (Optional[str]) \u2013 UTC date in the format YYYY-MM-DD. Any data before this date will not be replicated. If not set, all data will be replicated.

  • \n
  • include_deleted_objects (Optional[bool]) \u2013 In some streams there is an option to include deleted objects (Items, Categories, Discounts, Taxes)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SquareSource.OauthAuthentication(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SquareSource.APIKey(api_key)[source]\u00b6
\n
\n
\n__init__(api_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DelightedSource(name, since, api_key)[source]\u00b6
\n
\n
\n__init__(name, since, api_key)[source]\u00b6
\n

Airbyte Source for Delighted.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • since (str) \u2013 The date from which you\u2019d like to replicate the data

  • \n
  • api_key (str) \u2013 A Delighted API key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmazonSqsSource(name, queue_url, region, delete_messages, max_batch_size=None, max_wait_time=None, attributes_to_return=None, visibility_timeout=None, access_key=None, secret_key=None)[source]\u00b6
\n
\n
\n__init__(name, queue_url, region, delete_messages, max_batch_size=None, max_wait_time=None, attributes_to_return=None, visibility_timeout=None, access_key=None, secret_key=None)[source]\u00b6
\n

Airbyte Source for Amazon Sqs.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-sqs

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • queue_url (str) \u2013 URL of the SQS Queue

  • \n
  • region (str) \u2013 AWS Region of the SQS Queue

  • \n
  • delete_messages (bool) \u2013 If Enabled, messages will be deleted from the SQS Queue after being read. If Disabled, messages are left in the queue and can be read more than once. WARNING: Enabling this option can result in data loss in cases of failure, use with caution, see documentation for more detail.

  • \n
  • max_batch_size (Optional[int]) \u2013 Max amount of messages to get in one batch (10 max)

  • \n
  • max_wait_time (Optional[int]) \u2013 Max amount of time in seconds to wait for messages in a single poll (20 max)

  • \n
  • attributes_to_return (Optional[str]) \u2013 Comma separated list of Message Attribute names to return

  • \n
  • visibility_timeout (Optional[int]) \u2013 Modify the Visibility Timeout of the individual message from the Queue\u2019s default (seconds).

  • \n
  • access_key (Optional[str]) \u2013 The Access Key ID of the AWS IAM Role to use for pulling messages

  • \n
  • secret_key (Optional[str]) \u2013 The Secret Key of the AWS IAM Role to use for pulling messages

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.YoutubeAnalyticsSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Youtube Analytics.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/youtube-analytics

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass YoutubeAnalyticsSource.AuthenticateViaOAuth20(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ScaffoldSourcePythonSource(name, fix_me=None)[source]\u00b6
\n
\n
\n__init__(name, fix_me=None)[source]\u00b6
\n

Airbyte Source for Scaffold Source Python.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • fix_me (Optional[str]) \u2013 describe me

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LookerSource(name, domain, client_id, client_secret, run_look_ids=None)[source]\u00b6
\n
\n
\n__init__(name, domain, client_id, client_secret, run_look_ids=None)[source]\u00b6
\n

Airbyte Source for Looker.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/looker

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Domain for your Looker account, e.g. airbyte.cloud.looker.com,looker.[clientname].com,IP address

  • \n
  • client_id (str) \u2013 The Client ID is the first part of an API3 key that is specific to each Looker user. See the docs for more information on how to generate this key.

  • \n
  • client_secret (str) \u2013 The Client Secret is the second part of an API3 key.

  • \n
  • run_look_ids (Optional[List[str]]) \u2013 The IDs of any Looks to run

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GitlabSource(name, api_url, private_token, start_date, groups=None, projects=None)[source]\u00b6
\n
\n
\n__init__(name, api_url, private_token, start_date, groups=None, projects=None)[source]\u00b6
\n

Airbyte Source for Gitlab.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/gitlab

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_url (str) \u2013 Please enter your basic URL from GitLab instance.

  • \n
  • private_token (str) \u2013 Log into your GitLab account and then generate a personal Access Token.

  • \n
  • groups (Optional[str]) \u2013 Space-delimited list of groups. e.g. airbyte.io.

  • \n
  • projects (Optional[str]) \u2013 Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ExchangeRatesSource(name, start_date, access_key, base=None, ignore_weekends=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_key, base=None, ignore_weekends=None)[source]\u00b6
\n

Airbyte Source for Exchange Rates.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 Start getting data from that date.

  • \n
  • access_key (str) \u2013 Your API Key. See here. The key is case sensitive.

  • \n
  • base (Optional[str]) \u2013 ISO reference currency. See here. Free plan doesn\u2019t support Source Currency Switching, default base currency is EUR

  • \n
  • ignore_weekends (Optional[bool]) \u2013 Ignore weekends? (Exchanges don\u2019t run on weekends)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmazonAdsSource(name, client_id, client_secret, refresh_token, auth_type=None, region=None, report_wait_timeout=None, report_generation_max_retries=None, start_date=None, profiles=None, state_filter=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, auth_type=None, region=None, report_wait_timeout=None, report_generation_max_retries=None, start_date=None, profiles=None, state_filter=None)[source]\u00b6
\n

Airbyte Source for Amazon Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 The client ID of your Amazon Ads developer application. See the docs for more information.

  • \n
  • client_secret (str) \u2013 The client secret of your Amazon Ads developer application. See the docs for more information.

  • \n
  • refresh_token (str) \u2013 Amazon Ads refresh token. See the docs for more information on how to obtain this token.

  • \n
  • region (Optional[str]) \u2013 Region to pull data from (EU/NA/FE). See docs for more details.

  • \n
  • report_wait_timeout (Optional[int]) \u2013 Timeout duration in minutes for Reports. Default is 60 minutes.

  • \n
  • report_generation_max_retries (Optional[int]) \u2013 Maximum retries Airbyte will attempt for fetching report data. Default is 5.

  • \n
  • start_date (Optional[str]) \u2013 The Start date for collecting reports, should not be more than 60 days in the past. In YYYY-MM-DD format

  • \n
  • profiles (Optional[List[int]]) \u2013 Profile IDs you want to fetch data for. See docs for more details.

  • \n
  • state_filter (Optional[List[str]]) \u2013 Reflects the state of the Display, Product, and Brand Campaign streams as enabled, paused, or archived. If you do not populate this field, it will be ignored completely.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MixpanelSource(name, credentials, project_id=None, attribution_window=None, project_timezone=None, select_properties_by_default=None, start_date=None, end_date=None, region=None, date_window_size=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, project_id=None, attribution_window=None, project_timezone=None, select_properties_by_default=None, start_date=None, end_date=None, region=None, date_window_size=None)[source]\u00b6
\n

Airbyte Source for Mixpanel.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mixpanel

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Union[MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret]) \u2013 Choose how to authenticate to Mixpanel

  • \n
  • project_id (Optional[int]) \u2013 Your project ID number. See the docs for more information on how to obtain this.

  • \n
  • attribution_window (Optional[int]) \u2013 A period of time for attributing results to ads and the lookback period after those actions occur during which ad results are counted. Default attribution window is 5 days.

  • \n
  • project_timezone (Optional[str]) \u2013 Time zone in which integer date times are stored. The project timezone may be found in the project settings in the Mixpanel console.

  • \n
  • select_properties_by_default (Optional[bool]) \u2013 Setting this config parameter to TRUE ensures that new properties on events and engage records are captured. Otherwise new properties will be ignored.

  • \n
  • start_date (Optional[str]) \u2013 The date in the format YYYY-MM-DD. Any data before this date will not be replicated. If this option is not set, the connector will replicate data from up to one year ago by default.

  • \n
  • end_date (Optional[str]) \u2013 The date in the format YYYY-MM-DD. Any data after this date will not be replicated. Leave empty to always sync to most recent date

  • \n
  • region (Optional[str]) \u2013 The region of mixpanel domain instance either US or EU.

  • \n
  • date_window_size (Optional[int]) \u2013 Defines the window size in days that is used to slice through data. You can reduce it if the amount of data in each window is too big for your environment.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MixpanelSource.ServiceAccount(username, secret)[source]\u00b6
\n
\n
\n__init__(username, secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MixpanelSource.ProjectSecret(api_secret)[source]\u00b6
\n
\n
\n__init__(api_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OrbitSource(name, api_token, workspace, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, workspace, start_date=None)[source]\u00b6
\n

Airbyte Source for Orbit.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/orbit

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Authorizes you to work with Orbit workspaces associated with the token.

  • \n
  • workspace (str) \u2013 The unique name of the workspace that your API token is associated with.

  • \n
  • start_date (Optional[str]) \u2013 Date in the format 2022-06-26. Only load members whose last activities are after this date.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmazonSellerPartnerSource(name, lwa_app_id, lwa_client_secret, refresh_token, aws_access_key, aws_secret_key, role_arn, replication_start_date, aws_environment, region, app_id=None, auth_type=None, replication_end_date=None, period_in_days=None, report_options=None, max_wait_seconds=None)[source]\u00b6
\n
\n
\n__init__(name, lwa_app_id, lwa_client_secret, refresh_token, aws_access_key, aws_secret_key, role_arn, replication_start_date, aws_environment, region, app_id=None, auth_type=None, replication_end_date=None, period_in_days=None, report_options=None, max_wait_seconds=None)[source]\u00b6
\n

Airbyte Source for Amazon Seller Partner.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-seller-partner

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • app_id (Optional[str]) \u2013 Your Amazon App ID

  • \n
  • lwa_app_id (str) \u2013 Your Login with Amazon Client ID.

  • \n
  • lwa_client_secret (str) \u2013 Your Login with Amazon Client Secret.

  • \n
  • refresh_token (str) \u2013 The Refresh Token obtained via OAuth flow authorization.

  • \n
  • aws_access_key (str) \u2013 Specifies the AWS access key used as part of the credentials to authenticate the user.

  • \n
  • aws_secret_key (str) \u2013 Specifies the AWS secret key used as part of the credentials to authenticate the user.

  • \n
  • role_arn (str) \u2013 Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. (Needs permission to \u2018Assume Role\u2019 STS).

  • \n
  • replication_start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • replication_end_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data after this date will not be replicated.

  • \n
  • period_in_days (Optional[int]) \u2013 Will be used for stream slicing for initial full_refresh sync when no updated state is present for reports that support sliced incremental sync.

  • \n
  • report_options (Optional[str]) \u2013 Additional information passed to reports. This varies by report type. Must be a valid json string.

  • \n
  • max_wait_seconds (Optional[int]) \u2013 Sometimes report can take up to 30 minutes to generate. This will set the limit for how long to wait for a successful report.

  • \n
  • aws_environment (str) \u2013 An enumeration.

  • \n
  • region (str) \u2013 An enumeration.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CourierSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Courier.

\n

Documentation can be found at https://docs.airbyte.io/integrations/sources/courier

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Courier API Key to retrieve your data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CloseComSource(name, api_key, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date=None)[source]\u00b6
\n

Airbyte Source for Close Com.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/close-com

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Close.com API key (usually starts with \u2018api_\u2019; find yours here).

  • \n
  • start_date (Optional[str]) \u2013 The start date to sync data. Leave blank for full sync. Format: YYYY-MM-DD.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BingAdsSource(name, client_id, refresh_token, developer_token, reports_start_date, auth_method=None, tenant_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, refresh_token, developer_token, reports_start_date, auth_method=None, tenant_id=None, client_secret=None)[source]\u00b6
\n

Airbyte Source for Bing Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bing-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • tenant_id (Optional[str]) \u2013 The Tenant ID of your Microsoft Advertising developer application. Set this to \u201ccommon\u201d unless you know you need a different value.

  • \n
  • client_id (str) \u2013 The Client ID of your Microsoft Advertising developer application.

  • \n
  • client_secret (Optional[str]) \u2013 The Client Secret of your Microsoft Advertising developer application.

  • \n
  • refresh_token (str) \u2013 Refresh Token to renew the expired Access Token.

  • \n
  • developer_token (str) \u2013 Developer token associated with user. See more info in the docs.

  • \n
  • reports_start_date (str) \u2013 The start date from which to begin replicating report data. Any data generated before this date will not be replicated in reports. This is a UTC date in YYYY-MM-DD format.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PrimetricSource(name, client_id, client_secret)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret)[source]\u00b6
\n

Airbyte Source for Primetric.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • client_id (str) \u2013 The Client ID of your Primetric developer application. The Client ID is visible here.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Primetric developer application. You can manage your client\u2019s credentials here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PivotalTrackerSource(name, api_token)[source]\u00b6
\n
\n
\n__init__(name, api_token)[source]\u00b6
\n

Airbyte Source for Pivotal Tracker.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_token (str) \u2013 Pivotal Tracker API token

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ElasticsearchSource(name, endpoint, authenticationMethod)[source]\u00b6
\n
\n
\n__init__(name, endpoint, authenticationMethod)[source]\u00b6
\n

Airbyte Source for Elasticsearch.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/elasticsearch

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass ElasticsearchSource.None_[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchSource.ApiKeySecret(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n
\n__init__(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchSource.UsernamePassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BigquerySource(name, project_id, credentials_json, dataset_id=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, credentials_json, dataset_id=None)[source]\u00b6
\n

Airbyte Source for Bigquery.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bigquery

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target BigQuery dataset.

  • \n
  • dataset_id (Optional[str]) \u2013 The dataset ID to search for tables and views. If you are only loading data from one dataset, setting this option could result in much faster schema discovery.

  • \n
  • credentials_json (str) \u2013 The contents of your Service Account Key JSON file. See the docs for more information on how to obtain this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WoocommerceSource(name, shop, start_date, api_key, api_secret, conversion_window_days=None)[source]\u00b6
\n
\n
\n__init__(name, shop, start_date, api_key, api_secret, conversion_window_days=None)[source]\u00b6
\n

Airbyte Source for Woocommerce.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/woocommerce

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • shop (str) \u2013 The name of the store. For https://EXAMPLE.com, the shop name is \u2018EXAMPLE.com\u2019.

  • \n
  • start_date (str) \u2013 The date you would like to replicate data. Format: YYYY-MM-DD.

  • \n
  • api_key (str) \u2013 The CUSTOMER KEY for API in WooCommerce shop.

  • \n
  • api_secret (str) \u2013 The CUSTOMER SECRET for API in WooCommerce shop.

  • \n
  • conversion_window_days (Optional[int]) \u2013 A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SearchMetricsSource(name, api_key, client_secret, country_code, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, client_secret, country_code, start_date)[source]\u00b6
\n

Airbyte Source for Search Metrics.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/search-metrics

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • country_code (str) \u2013 The region of the S3 staging bucket to use if utilising a copy strategy.

  • \n
  • start_date (str) \u2013 Data generated in SearchMetrics after this date will be replicated. This date must be specified in the format YYYY-MM-DDT00:00:00Z.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TypeformSource(name, start_date, token, form_ids=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, token, form_ids=None)[source]\u00b6
\n

Airbyte Source for Typeform.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/typeform

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 UTC date and time in the format: YYYY-MM-DDTHH:mm:ss[Z]. Any data before this date will not be replicated.

  • \n
  • token (str) \u2013 The API Token for a Typeform account.

  • \n
  • form_ids (Optional[List[str]]) \u2013 When this parameter is set, the connector will replicate data only from the input forms. Otherwise, all forms in your Typeform account will be replicated. You can find form IDs in your form URLs. For example, in the URL \u201chttps://mysite.typeform.com/to/u6nXL7\u201d the form_id is u6nXL7. You can find form URLs on the Share panel.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WebflowSource(name, site_id, api_key)[source]\u00b6
\n
\n
\n__init__(name, site_id, api_key)[source]\u00b6
\n

Airbyte Source for Webflow.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/webflow

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FireboltSource(name, username, password, database, account=None, host=None, engine=None)[source]\u00b6
\n
\n
\n__init__(name, username, password, database, account=None, host=None, engine=None)[source]\u00b6
\n

Airbyte Source for Firebolt.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/firebolt

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • username (str) \u2013 Firebolt email address you use to login.

  • \n
  • password (str) \u2013 Firebolt password.

  • \n
  • account (Optional[str]) \u2013 Firebolt account to login.

  • \n
  • host (Optional[str]) \u2013 The host name of your Firebolt database.

  • \n
  • database (str) \u2013 The database to connect to.

  • \n
  • engine (Optional[str]) \u2013 Engine name or url to connect to.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FaunaSource(name, domain, port, scheme, secret, collection)[source]\u00b6
\n
\n
\n__init__(name, domain, port, scheme, secret, collection)[source]\u00b6
\n

Airbyte Source for Fauna.

\n

Documentation can be found at https://github.com/fauna/airbyte/blob/source-fauna/docs/integrations/sources/fauna.md

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • domain (str) \u2013 Domain of Fauna to query. Defaults to db.fauna.com. See the docs.

  • \n
  • port (int) \u2013 Endpoint port.

  • \n
  • scheme (str) \u2013 URL scheme.

  • \n
  • secret (str) \u2013 Fauna secret, used when authenticating with the database.

  • \n
  • collection (FaunaSource.Collection) \u2013 Settings for the Fauna Collection.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass FaunaSource.Disabled[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FaunaSource.Enabled(column)[source]\u00b6
\n
\n
\n__init__(column)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FaunaSource.Collection(page_size, deletions)[source]\u00b6
\n
\n
\n__init__(page_size, deletions)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.IntercomSource(name, start_date, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_token)[source]\u00b6
\n

Airbyte Source for Intercom.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/intercom

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • access_token (str) \u2013 Access token for making authenticated requests. See the Intercom docs for more information.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshsalesSource(name, domain_name, api_key)[source]\u00b6
\n
\n
\n__init__(name, domain_name, api_key)[source]\u00b6
\n

Airbyte Source for Freshsales.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshsales

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • domain_name (str) \u2013 The Name of your Freshsales domain

  • \n
  • api_key (str) \u2013 Freshsales API Key. See here. The key is case sensitive.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AdjustSource(name, api_token, dimensions, ingest_start, metrics, additional_metrics=None, until_today=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, dimensions, ingest_start, metrics, additional_metrics=None, until_today=None)[source]\u00b6
\n

Airbyte Source for Adjust.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/adjust

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • additional_metrics (Optional[List[str]]) \u2013 Metrics names that are not pre-defined, such as cohort metrics or app specific metrics.

  • \n
  • api_token (str) \u2013 Adjust API key, see https://help.adjust.com/en/article/report-service-api-authentication

  • \n
  • dimensions (List[str]) \u2013 Dimensions allow a user to break down metrics into groups using one or several parameters. For example, the number of installs by date, country and network. See https://help.adjust.com/en/article/reports-endpoint#dimensions for more information about the dimensions.

  • \n
  • ingest_start (str) \u2013 Data ingest start date.

  • \n
  • metrics (List[str]) \u2013 Select at least one metric to query.

  • \n
  • until_today (Optional[bool]) \u2013 Syncs data up until today. Useful when running daily incremental syncs, and duplicates are not desired.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BambooHrSource(name, subdomain, api_key, custom_reports_fields=None, custom_reports_include_default_fields=None)[source]\u00b6
\n
\n
\n__init__(name, subdomain, api_key, custom_reports_fields=None, custom_reports_include_default_fields=None)[source]\u00b6
\n

Airbyte Source for Bamboo Hr.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bamboo-hr

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • subdomain (str) \u2013 Subdomain of Bamboo HR

  • \n
  • api_key (str) \u2013 API key of Bamboo HR

  • \n
  • custom_reports_fields (Optional[str]) \u2013 Comma-separated list of fields to include in custom reports.

  • \n
  • custom_reports_include_default_fields (Optional[bool]) \u2013 If true, the custom reports endpoint will include the default fields defined here: https://documentation.bamboohr.com/docs/list-of-field-names.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleAdsSource(name, credentials, customer_id, start_date, end_date=None, custom_queries=None, login_customer_id=None, conversion_window_days=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, customer_id, start_date, end_date=None, custom_queries=None, login_customer_id=None, conversion_window_days=None)[source]\u00b6
\n

Airbyte Source for Google Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • customer_id (str) \u2013 Comma separated list of (client) customer IDs. Each customer ID must be specified as a 10-digit number without dashes. More instructions on how to find this value are in our docs. Metrics streams like AdGroupAdReport cannot be requested for a manager account.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
  • end_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.

  • \n
  • login_customer_id (Optional[str]) \u2013 If your access to the customer account is through a manager account, this field is required and must be set to the customer ID of the manager account (10-digit number without dashes). You can find more information about this field here

  • \n
  • conversion_window_days (Optional[int]) \u2013 A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads. For more information, see Google\u2019s documentation.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleAdsSource.GoogleCredentials(developer_token, client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n
\n__init__(developer_token, client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleAdsSource.CustomGAQLQueriesEntry(query, table_name)[source]\u00b6
\n
\n
\n__init__(query, table_name)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HellobatonSource(name, api_key, company)[source]\u00b6
\n
\n
\n__init__(name, api_key, company)[source]\u00b6
\n

Airbyte Source for Hellobaton.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Authentication key required to access the API endpoints

  • \n
  • company (str) \u2013 Company name that generates your base API URL

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SendgridSource(name, apikey, start_time)[source]\u00b6
\n
\n
\n__init__(name, apikey, start_time)[source]\u00b6
\n

Airbyte Source for Sendgrid.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/sendgrid

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • apikey (str) \u2013 API Key, use admin to generate this key.

  • \n
  • start_time (Union[int, str]) \u2013 Start time in ISO8601 format. Any data before this time point will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MondaySource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Monday.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/monday

\n
\n
Parameters:
\n

name (str) \u2013 The name of the source.

\n
\n
\n
\n\n
\n\n
\n
\nclass MondaySource.OAuth20(client_id, client_secret, access_token, subdomain=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token, subdomain=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MondaySource.APIToken(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DixaSource(name, api_token, start_date, batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, start_date, batch_size=None)[source]\u00b6
\n

Airbyte Source for Dixa.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/dixa

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_token (str) \u2013 Dixa API token

  • \n
  • start_date (str) \u2013 The connector pulls records updated from this date onwards.

  • \n
  • batch_size (Optional[int]) \u2013 Number of days to batch into one request. Max 31.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SalesforceSource(name, client_id, client_secret, refresh_token, is_sandbox=None, auth_type=None, start_date=None, streams_criteria=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, is_sandbox=None, auth_type=None, start_date=None, streams_criteria=None)[source]\u00b6
\n

Airbyte Source for Salesforce.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/salesforce

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • is_sandbox (Optional[bool]) \u2013 Toggle if you\u2019re using a Salesforce Sandbox

  • \n
  • client_id (str) \u2013 Enter your Salesforce developer application\u2019s Client ID

  • \n
  • client_secret (str) \u2013 Enter your Salesforce developer application\u2019s Client secret

  • \n
  • refresh_token (str) \u2013 Enter your application\u2019s Salesforce Refresh Token used for Airbyte to access your Salesforce account.

  • \n
  • start_date (Optional[str]) \u2013 Enter the date in the YYYY-MM-DD format. Airbyte will replicate the data added on and after this date. If this field is blank, Airbyte will replicate all data.

  • \n
  • streams_criteria (Optional[List[SalesforceSource.FilterSalesforceObjectsEntry]]) \u2013 Filter streams relevant to you

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SalesforceSource.FilterSalesforceObjectsEntry(criteria, value)[source]\u00b6
\n
\n
\n__init__(criteria, value)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PipedriveSource(name, authorization, replication_start_date)[source]\u00b6
\n
\n
\n__init__(name, authorization, replication_start_date)[source]\u00b6
\n

Airbyte Source for Pipedrive.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/pipedrive

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • authorization (Union[PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication]) \u2013 Choose one of the possible authorization methods

  • \n
  • replication_start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. When specified and not None, then stream will behave as incremental

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PipedriveSource.SignInViaPipedriveOAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PipedriveSource.APIKeyAuthentication(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FileSource(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n
\n
\n__init__(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n

Airbyte Source for File.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/file

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass FileSource.HTTPSPublicWeb(user_agent=None)[source]\u00b6
\n
\n
\n__init__(user_agent=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.GCSGoogleCloudStorage(service_account_json=None)[source]\u00b6
\n
\n
\n__init__(service_account_json=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.S3AmazonWebServices(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n
\n__init__(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.AzBlobAzureBlobStorage(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n
\n__init__(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.SSHSecureShell(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.SCPSecureCopyProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.SFTPSecureFileTransferProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.LocalFilesystemLimited[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GlassfrogSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Glassfrog.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/glassfrog

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 API key provided by Glassfrog

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ChartmogulSource(name, api_key, start_date, interval)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date, interval)[source]\u00b6
\n

Airbyte Source for Chartmogul.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/chartmogul

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Chartmogul API key

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. When feasible, any data before this date will not be replicated.

  • \n
  • interval (str) \u2013 Some APIs such as Metrics require intervals to cluster data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OrbSource(name, api_key, start_date=None, lookback_window_days=None, string_event_properties_keys=None, numeric_event_properties_keys=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date=None, lookback_window_days=None, string_event_properties_keys=None, numeric_event_properties_keys=None)[source]\u00b6
\n

Airbyte Source for Orb.

\n

Documentation can be found at https://docs.withorb.com/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Orb API Key, issued from the Orb admin console.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2022-03-01T00:00:00Z. Any data with created_at before this date will not be synced.

  • \n
  • lookback_window_days (Optional[int]) \u2013 When set to N, the connector will always refresh resources created within the past N days. By default, updated objects that are not newly created are not incrementally synced.

  • \n
  • string_event_properties_keys (Optional[List[str]]) \u2013 Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.

  • \n
  • numeric_event_properties_keys (Optional[List[str]]) \u2013 Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CockroachdbSource(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Cockroachdb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/cockroachdb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt client/server communications for increased security.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ConfluenceSource(name, api_token, domain_name, email)[source]\u00b6
\n
\n
\n__init__(name, api_token, domain_name, email)[source]\u00b6
\n

Airbyte Source for Confluence.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PlaidSource(name, access_token, api_key, client_id, plaid_env, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, api_key, client_id, plaid_env, start_date=None)[source]\u00b6
\n

Airbyte Source for Plaid.

\n

Documentation can be found at https://plaid.com/docs/api/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • access_token (str) \u2013 The end-user\u2019s Link access token.

  • \n
  • api_key (str) \u2013 The Plaid API key to use to hit the API.

  • \n
  • client_id (str) \u2013 The Plaid client id

  • \n
  • plaid_env (str) \u2013 The Plaid environment

  • \n
  • start_date (Optional[str]) \u2013 The date from which you\u2019d like to replicate data for Plaid in the format YYYY-MM-DD. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SnapchatMarketingSource(name, client_id, client_secret, refresh_token, start_date=None, end_date=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, start_date=None, end_date=None)[source]\u00b6
\n

Airbyte Source for Snapchat Marketing.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/snapchat-marketing

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • client_id (str) \u2013 The Client ID of your Snapchat developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Snapchat developer application.

  • \n
  • refresh_token (str) \u2013 Refresh Token to renew the expired Access Token.

  • \n
  • start_date (Optional[str]) \u2013 Date in the format 2022-01-01. Any data before this date will not be replicated.

  • \n
  • end_date (Optional[str]) \u2013 Date in the format 2017-01-25. Any data after this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource(name, period, credentials)[source]\u00b6
\n
\n
\n__init__(name, period, credentials)[source]\u00b6
\n

Airbyte Source for Microsoft Teams.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/microsoft-teams

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20(tenant_id, client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(tenant_id, client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MicrosoftTeamsSource.AuthenticateViaMicrosoft(tenant_id, client_id, client_secret, auth_type=None)[source]\u00b6
\n
\n
\n__init__(tenant_id, client_id, client_secret, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LeverHiringSource(name, credentials, start_date, environment=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, environment=None)[source]\u00b6
\n

Airbyte Source for Lever Hiring.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/lever-hiring

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • credentials (LeverHiringSource.OAuthCredentials) \u2013 Choose how to authenticate to Lever Hiring.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Note that it will be used only in the following incremental streams: comments, commits, and issues.

  • \n
  • environment (Optional[str]) \u2013 The environment in which you\u2019d like to replicate data for Lever. This is used to determine which Lever API endpoint to use.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass LeverHiringSource.OAuthCredentials(refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TwilioSource(name, account_sid, auth_token, start_date, lookback_window=None)[source]\u00b6
\n
\n
\n__init__(name, account_sid, auth_token, start_date, lookback_window=None)[source]\u00b6
\n

Airbyte Source for Twilio.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/twilio

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • account_sid (str) \u2013 Twilio account SID

  • \n
  • auth_token (str) \u2013 Twilio Auth Token.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • lookback_window (Optional[int]) \u2013 How far into the past to look for records (in minutes).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.StripeSource(name, account_id, client_secret, start_date, lookback_window_days=None, slice_range=None)[source]\u00b6
\n
\n
\n__init__(name, account_id, client_secret, start_date, lookback_window_days=None, slice_range=None)[source]\u00b6
\n

Airbyte Source for Stripe.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/stripe

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • account_id (str) \u2013 Your Stripe account ID (starts with \u2018acct_\u2019, find yours here).

  • \n
  • client_secret (str) \u2013 Stripe API key (usually starts with \u2018sk_live_\u2019; find yours here).

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Only data generated after this date will be replicated.

  • \n
  • lookback_window_days (Optional[int]) \u2013 When set, the connector will always re-export data from the past N days, where N is the value set here. This is useful if your data is frequently updated after creation. More info here

  • \n
  • slice_range (Optional[int]) \u2013 The time increment used by the connector when requesting data from the Stripe API. The bigger the value is, the fewer requests will be made and the faster the sync will be. On the other hand, the state is persisted less often.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.Db2Source(name, host, port, db, username, password, encryption, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, db, username, password, encryption, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Db2.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/db2

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Host of the Db2.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • db (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • encryption (Union[Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate]) \u2013 Encryption method to use when communicating with the database

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass Db2Source.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass Db2Source.TLSEncryptedVerifyCertificate(ssl_certificate, key_store_password=None)[source]\u00b6
\n
\n
\n__init__(ssl_certificate, key_store_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SlackSource(name, start_date, lookback_window, join_channels, credentials, channel_filter=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, lookback_window, join_channels, credentials, channel_filter=None)[source]\u00b6
\n

Airbyte Source for Slack.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/slack

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • lookback_window (int) \u2013 How far into the past to look for messages in threads.

  • \n
  • join_channels (bool) \u2013 Whether to join all channels or to sync data only from channels the bot is already in. If false, you\u2019ll need to manually add the bot to all the channels from which you\u2019d like to sync messages.

  • \n
  • channel_filter (Optional[List[str]]) \u2013 A list of channel names (without the leading \u2018#\u2019 character) that limits the channels from which you\u2019d like to sync. An empty list means no filter.

  • \n
  • credentials (Union[SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials]) \u2013 Choose how to authenticate into Slack

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SlackSource.DefaultOAuth20Authorization(client_id, client_secret, access_token, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token, refresh_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SlackSource.APITokenCredentials(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RechargeSource(name, start_date, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_token)[source]\u00b6
\n

Airbyte Source for Recharge.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/recharge

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Recharge API, in the format YYYY-MM-DDT00:00:00Z. Any data before this date will not be replicated.

  • \n
  • access_token (str) \u2013 The value of the Access Token generated. See the docs for more information.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OpenweatherSource(name, lat, lon, appid, units=None, lang=None)[source]\u00b6
\n
\n
\n__init__(name, lat, lon, appid, units=None, lang=None)[source]\u00b6
\n

Airbyte Source for Openweather.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • lat (str) \u2013 Latitude of the location for which you want to get weather conditions. (min -90, max 90)

  • \n
  • lon (str) \u2013 Longitude of the location for which you want to get weather conditions. (min -180, max 180)

  • \n
  • appid (str) \u2013 Your OpenWeather API Key. See here. The key is case sensitive.

  • \n
  • units (Optional[str]) \u2013 Units of measurement. standard, metric and imperial units are available. If you do not use the units parameter, standard units will be applied by default.

  • \n
  • lang (Optional[str]) \u2013 You can use lang parameter to get the output in your language. The contents of the description field will be translated. See here for the list of supported languages.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RetentlySource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Retently.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass RetentlySource.AuthenticateViaRetentlyOAuth(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RetentlySource.AuthenticateWithAPIToken(api_key, auth_type=None)[source]\u00b6
\n
\n
\n__init__(api_key, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ScaffoldSourceHttpSource(name, TODO)[source]\u00b6
\n
\n
\n__init__(name, TODO)[source]\u00b6
\n

Airbyte Source for Scaffold Source Http.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • TODO (str) \u2013 describe me

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.YandexMetricaSource(name, auth_token, counter_id, start_date, end_date)[source]\u00b6
\n
\n
\n__init__(name, auth_token, counter_id, start_date, end_date)[source]\u00b6
\n

Airbyte Source for Yandex Metrica.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • auth_token (str) \u2013 Your Yandex Metrica API access token

  • \n
  • counter_id (str) \u2013 Counter ID

  • \n
  • start_date (str) \u2013 UTC date and time in the format YYYY-MM-DD.

  • \n
  • end_date (str) \u2013 UTC date and time in the format YYYY-MM-DD.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TalkdeskExploreSource(name, start_date, auth_url, api_key, timezone=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, auth_url, api_key, timezone=None)[source]\u00b6
\n

Airbyte Source for Talkdesk Explore.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Talkdesk Explore API, in the format YYYY-MM-DDT00:00:00. All data generated after this date will be replicated.

  • \n
  • timezone (Optional[str]) \u2013 Timezone to use when generating reports. Only IANA timezones are supported (https://nodatime.org/TimeZones)

  • \n
  • auth_url (str) \u2013 Talkdesk Auth URL. Only \u2018client_credentials\u2019 auth type supported at the moment.

  • \n
  • api_key (str) \u2013 Talkdesk API key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ChargifySource(name, api_key, domain)[source]\u00b6
\n
\n
\n__init__(name, api_key, domain)[source]\u00b6
\n

Airbyte Source for Chargify.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/chargify

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Chargify API Key.

  • \n
  • domain (str) \u2013 Chargify domain. Normally this domain has the format companyname.chargify.com

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RkiCovidSource(name, start_date)[source]\u00b6
\n
\n
\n__init__(name, start_date)[source]\u00b6
\n

Airbyte Source for Rki Covid.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/rki-covid

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PostgresSource(name, host, port, database, username, ssl_mode, replication_method, tunnel_method, schemas=None, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, ssl_mode, replication_method, tunnel_method, schemas=None, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Postgres.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas (case sensitive) to sync from. Defaults to public.

  • \n
  • username (str) \u2013 Username to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL. When activating SSL, please select one of the connection modes.

  • \n
  • ssl_mode (Union[PostgresSource.Disable, PostgresSource.Allow, PostgresSource.Prefer, PostgresSource.Require, PostgresSource.VerifyCa, PostgresSource.VerifyFull]) \u2013 SSL connection modes. disable - Disables encryption of communication between Airbyte and source database allow - Enables encryption only when required by the source database prefer - allows unencrypted connection only if the source database does not support encryption require - Always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Always require encryption and verifies that the source database server has a valid SSL certificate verify-full - This is the most secure mode. Always require encryption and verifies the identity of the source database server Read more in the docs.

  • \n
  • replication_method (Union[PostgresSource.Standard, PostgresSource.LogicalReplicationCDC]) \u2013 Replication method for extracting data from the database.

  • \n
  • tunnel_method (Union[PostgresSource.NoTunnel, PostgresSource.SSHKeyAuthentication, PostgresSource.PasswordAuthentication]) \u2013 Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PostgresSource.Disable[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Allow[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Prefer[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Require[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.VerifyCa(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.VerifyFull(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.LogicalReplicationCDC(replication_slot, publication, plugin=None, initial_waiting_seconds=None)[source]\u00b6
\n
\n
\n__init__(replication_slot, publication, plugin=None, initial_waiting_seconds=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.NoTunnel[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.SSHKeyAuthentication(tunnel_host, tunnel_port, tunnel_user, ssh_key)[source]\u00b6
\n
\n
\n__init__(tunnel_host, tunnel_port, tunnel_user, ssh_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.PasswordAuthentication(tunnel_host, tunnel_port, tunnel_user, tunnel_user_password)[source]\u00b6
\n
\n
\n__init__(tunnel_host, tunnel_port, tunnel_user, tunnel_user_password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TrelloSource(name, token, key, start_date, board_ids=None)[source]\u00b6
\n
\n
\n__init__(name, token, key, start_date, board_ids=None)[source]\u00b6
\n

Airbyte Source for Trello.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/trello

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • token (str) \u2013 Trello API token. See the docs for instructions on how to generate it.

  • \n
  • key (str) \u2013 Trello API key. See the docs for instructions on how to generate it.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • board_ids (Optional[List[str]]) \u2013 IDs of the boards to replicate data from. If left empty, data from all boards to which you have access will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PrestashopSource(name, url, access_key)[source]\u00b6
\n
\n
\n__init__(name, url, access_key)[source]\u00b6
\n

Airbyte Source for Prestashop.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • url (str) \u2013 Shop URL without trailing slash (domain name or IP address)

  • \n
  • access_key (str) \u2013 Your PrestaShop access key. See the docs for info on how to obtain this.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PaystackSource(name, secret_key, start_date, lookback_window_days=None)[source]\u00b6
\n
\n
\n__init__(name, secret_key, start_date, lookback_window_days=None)[source]\u00b6
\n

Airbyte Source for Paystack.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/paystack

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • secret_key (str) \u2013 The Paystack API key (usually starts with \u2018sk_live_\u2019; find yours here).

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • lookback_window_days (Optional[int]) \u2013 When set, the connector will always reload data from the past N days, where N is the value set here. This is useful if your data is updated after creation.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.S3Source(name, dataset, path_pattern, format, provider, schema=None)[source]\u00b6
\n
\n
\n__init__(name, dataset, path_pattern, format, provider, schema=None)[source]\u00b6
\n

Airbyte Source for S3.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/s3

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • dataset (str) \u2013 The name of the stream you would like this source to output. Can contain letters, numbers, or underscores.

  • \n
  • path_pattern (str) \u2013 A regular expression which tells the connector which files to replicate. All files which match this pattern will be replicated. Use | to separate multiple patterns. See this page to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern ** to pick up all files.

  • \n
  • format (Union[S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl]) \u2013 The format of the files you\u2019d like to replicate

  • \n
  • schema (Optional[str]) \u2013 Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of { \u201ccolumn\u201d : \u201ctype\u201d }, where types are valid JSON Schema datatypes. Leave as {} to auto-infer the schema.

  • \n
  • provider (S3Source.S3AmazonWebServices) \u2013 Use this to load files from S3 or S3-compatible services

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass S3Source.CSV(filetype=None, delimiter=None, infer_datatypes=None, quote_char=None, escape_char=None, encoding=None, double_quote=None, newlines_in_values=None, additional_reader_options=None, advanced_options=None, block_size=None)[source]\u00b6
\n
\n
\n__init__(filetype=None, delimiter=None, infer_datatypes=None, quote_char=None, escape_char=None, encoding=None, double_quote=None, newlines_in_values=None, additional_reader_options=None, advanced_options=None, block_size=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.Parquet(filetype=None, columns=None, batch_size=None, buffer_size=None)[source]\u00b6
\n
\n
\n__init__(filetype=None, columns=None, batch_size=None, buffer_size=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.Avro(filetype=None)[source]\u00b6
\n
\n
\n__init__(filetype=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.Jsonl(filetype=None, newlines_in_values=None, unexpected_field_behavior=None, block_size=None)[source]\u00b6
\n
\n
\n__init__(filetype=None, newlines_in_values=None, unexpected_field_behavior=None, block_size=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.S3AmazonWebServices(bucket, aws_access_key_id=None, aws_secret_access_key=None, path_prefix=None, endpoint=None)[source]\u00b6
\n
\n
\n__init__(bucket, aws_access_key_id=None, aws_secret_access_key=None, path_prefix=None, endpoint=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SnowflakeSource(name, credentials, host, role, warehouse, database, schema, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, host, role, warehouse, database, schema, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Snowflake.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/snowflake

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The host domain of the snowflake instance (must include the account, region, cloud environment, and end with snowflakecomputing.com).

  • \n
  • role (str) \u2013 The role you created for Airbyte to access Snowflake.

  • \n
  • warehouse (str) \u2013 The warehouse you created for Airbyte to access data.

  • \n
  • database (str) \u2013 The database you created for Airbyte to access data.

  • \n
  • schema (str) \u2013 The source Snowflake schema tables.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SnowflakeSource.OAuth20(client_id, client_secret, access_token=None, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token=None, refresh_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeSource.UsernameAndPassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmplitudeSource(name, api_key, secret_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, secret_key, start_date)[source]\u00b6
\n

Airbyte Source for Amplitude.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amplitude

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Amplitude API Key. See the setup guide for more information on how to obtain this key.

  • \n
  • secret_key (str) \u2013 Amplitude Secret Key. See the setup guide for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PosthogSource(name, start_date, api_key, base_url=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, api_key, base_url=None)[source]\u00b6
\n

Airbyte Source for Posthog.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/posthog

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate the data. Any data before this date will not be replicated.

  • \n
  • api_key (str) \u2013 API Key. See the docs for information on how to generate this key.

  • \n
  • base_url (Optional[str]) \u2013 Base PostHog url. Defaults to PostHog Cloud (https://app.posthog.com).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PaypalTransactionSource(name, start_date, is_sandbox, client_id=None, client_secret=None, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, is_sandbox, client_id=None, client_secret=None, refresh_token=None)[source]\u00b6
\n

Airbyte Source for Paypal Transaction.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/paypal-transactions

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (Optional[str]) \u2013 The Client ID of your Paypal developer application.

  • \n
  • client_secret (Optional[str]) \u2013 The Client Secret of your Paypal developer application.

  • \n
  • refresh_token (Optional[str]) \u2013 The key to refresh the expired access token.

  • \n
  • start_date (str) \u2013 Start date for data extraction, in ISO format. The date must be between 3 years and 12 hours before the present time.

  • \n
  • is_sandbox (bool) \u2013 Determines whether to use the sandbox or production environment.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MssqlSource(name, host, port, database, username, ssl_method, replication_method, schemas=None, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, ssl_method, replication_method, schemas=None, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Mssql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mssql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The hostname of the database.

  • \n
  • port (int) \u2013 The port of the database.

  • \n
  • database (str) \u2013 The name of the database.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas to sync from. Defaults to user. Case sensitive.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • ssl_method (Union[MssqlSource.Unencrypted, MssqlSource.EncryptedTrustServerCertificate, MssqlSource.EncryptedVerifyCertificate]) \u2013 The encryption method which is used when communicating with the database.

  • \n
  • replication_method (Union[MssqlSource.Standard, MssqlSource.LogicalReplicationCDC]) \u2013 The replication method used for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses {TBC} to detect inserts, updates, and deletes. This needs to be configured on the source database itself.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MssqlSource.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.EncryptedTrustServerCertificate[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.EncryptedVerifyCertificate(hostNameInCertificate=None)[source]\u00b6
\n
\n
\n__init__(hostNameInCertificate=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.LogicalReplicationCDC(data_to_sync=None, snapshot_isolation=None)[source]\u00b6
\n
\n
\n__init__(data_to_sync=None, snapshot_isolation=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZohoCrmSource(name, client_id, client_secret, refresh_token, dc_region, environment, edition, start_datetime=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, dc_region, environment, edition, start_datetime=None)[source]\u00b6
\n

Airbyte Source for Zoho Crm.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zoho-crm

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 OAuth2.0 Client ID

  • \n
  • client_secret (str) \u2013 OAuth2.0 Client Secret

  • \n
  • refresh_token (str) \u2013 OAuth2.0 Refresh Token

  • \n
  • dc_region (str) \u2013 Please choose the region of your Data Center location. More info by this Link

  • \n
  • environment (str) \u2013 Please choose the environment

  • \n
  • start_datetime (Optional[str]) \u2013 ISO 8601, for instance: YYYY-MM-DD, YYYY-MM-DD HH:MM:SS+HH:MM

  • \n
  • edition (str) \u2013 Choose your Edition of Zoho CRM to determine API Concurrency Limits

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RedshiftSource(name, host, port, database, username, password, schemas=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password, schemas=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Redshift.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/redshift

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com).

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas to sync from. Specify one or more explicitly or keep empty to process all schemas. Schema names are case sensitive.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AsanaSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Asana.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass AsanaSource.PATCredentials(personal_access_token)[source]\u00b6
\n
\n
\n__init__(personal_access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass AsanaSource.OAuthCredentials(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SmartsheetsSource(name, access_token, spreadsheet_id, start_datetime=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, spreadsheet_id, start_datetime=None)[source]\u00b6
\n

Airbyte Source for Smartsheets.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/smartsheets

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_token (str) \u2013 The access token to use for accessing your data from Smartsheets. This access token must be generated by a user with at least read access to the data you\u2019d like to replicate. Generate an access token in the Smartsheets main menu by clicking Account > Apps & Integrations > API Access. See the setup guide for information on how to obtain this token.

  • \n
  • spreadsheet_id (str) \u2013 The spreadsheet ID. Find it by opening the spreadsheet then navigating to File > Properties

  • \n
  • start_datetime (Optional[str]) \u2013 Only rows modified after this date/time will be replicated. This should be an ISO 8601 string, for instance: 2000-01-01T13:00:00

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MailchimpSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Mailchimp.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mailchimp

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass MailchimpSource.OAuth20(access_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(access_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MailchimpSource.APIKey(apikey)[source]\u00b6
\n
\n
\n__init__(apikey)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SentrySource(name, auth_token, organization, project, hostname=None, discover_fields=None)[source]\u00b6
\n
\n
\n__init__(name, auth_token, organization, project, hostname=None, discover_fields=None)[source]\u00b6
\n

Airbyte Source for Sentry.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/sentry

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • auth_token (str) \u2013 Log into Sentry and then create authentication tokens. For self-hosted, you can find or create authentication tokens by visiting \u201c{instance_url_prefix}/settings/account/api/auth-tokens/\u201d

  • \n
  • hostname (Optional[str]) \u2013 Host name of Sentry API server. For self-hosted, specify your host name here. Otherwise, leave it empty.

  • \n
  • organization (str) \u2013 The slug of the organization the groups belong to.

  • \n
  • project (str) \u2013 The name (slug) of the Project you want to sync.

  • \n
  • discover_fields (Optional[List[str]]) \u2013 Fields to retrieve when fetching discover events

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MailgunSource(name, private_key, domain_region=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, private_key, domain_region=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Mailgun.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mailgun

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • private_key (str) \u2013 Primary account API key to access your Mailgun data.

  • \n
  • domain_region (Optional[str]) \u2013 Domain region code. \u2018EU\u2019 or \u2018US\u2019 are possible values. The default is \u2018US\u2019.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2020-10-01 00:00:00. Any data before this date will not be replicated. If omitted, defaults to 3 days ago.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OnesignalSource(name, user_auth_key, start_date, outcome_names)[source]\u00b6
\n
\n
\n__init__(name, user_auth_key, start_date, outcome_names)[source]\u00b6
\n

Airbyte Source for Onesignal.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/onesignal

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • user_auth_key (str) \u2013 OneSignal User Auth Key, see the docs for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for OneSignal API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • outcome_names (str) \u2013 Comma-separated list of names and the value (sum/count) for the returned outcome data. See the docs for more details

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PythonHttpTutorialSource(name, start_date, base, access_key=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, base, access_key=None)[source]\u00b6
\n

Airbyte Source for Python Http Tutorial.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_key (Optional[str]) \u2013 API access key used to retrieve data from the Exchange Rates API.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
  • base (str) \u2013 ISO reference currency. See here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AirtableSource(name, api_key, base_id, tables)[source]\u00b6
\n
\n
\n__init__(name, api_key, base_id, tables)[source]\u00b6
\n

Airbyte Source for Airtable.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/airtable

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 The API Key for the Airtable account. See the Support Guide for more information on how to obtain this key.

  • \n
  • base_id (str) \u2013 The Base ID to integrate the data from. You can find the Base ID following the link Airtable API, log in to your account, select the base you need and find Base ID in the docs.

  • \n
  • tables (List[str]) \u2013 The list of Tables to integrate.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MongodbV2Source(name, instance_type, database, user=None, password=None, auth_source=None)[source]\u00b6
\n
\n
\n__init__(name, instance_type, database, user=None, password=None, auth_source=None)[source]\u00b6
\n

Airbyte Source for Mongodb V2.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb-v2

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • instance_type (Union[MongodbV2Source.StandaloneMongoDbInstance, MongodbV2Source.ReplicaSet, MongodbV2Source.MongoDBAtlas]) \u2013 The MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.

  • \n
  • database (str) \u2013 The database you want to replicate.

  • \n
  • user (Optional[str]) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • auth_source (Optional[str]) \u2013 The authentication source where the user information is stored.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MongodbV2Source.StandaloneMongoDbInstance(instance, host, port, tls=None)[source]\u00b6
\n
\n
\n__init__(instance, host, port, tls=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbV2Source.ReplicaSet(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n
\n__init__(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbV2Source.MongoDBAtlas(instance, cluster_url)[source]\u00b6
\n
\n
\n__init__(instance, cluster_url)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FileSecureSource(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n
\n
\n__init__(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n

Airbyte Source for File Secure.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/file

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass FileSecureSource.HTTPSPublicWeb(user_agent=None)[source]\u00b6
\n
\n
\n__init__(user_agent=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.GCSGoogleCloudStorage(service_account_json=None)[source]\u00b6
\n
\n
\n__init__(service_account_json=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.S3AmazonWebServices(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n
\n__init__(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.AzBlobAzureBlobStorage(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n
\n__init__(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.SSHSecureShell(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.SCPSecureCopyProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.SFTPSecureFileTransferProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskSupportSource(name, start_date, subdomain, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, subdomain, credentials)[source]\u00b6
\n

Airbyte Source for Zendesk Support.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-support

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Support API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • subdomain (str) \u2013 This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.

  • \n
  • credentials (Union[ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken]) \u2013 Zendesk service provides two authentication methods. Choose between: OAuth2.0 or API token.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskSupportSource.OAuth20(access_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(access_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskSupportSource.APIToken(email, api_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(email, api_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TempoSource(name, api_token)[source]\u00b6
\n
\n
\n__init__(name, api_token)[source]\u00b6
\n

Airbyte Source for Tempo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Tempo API Token. Go to Tempo>Settings, scroll down to Data Access and select API integration.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BraintreeSource(name, merchant_id, public_key, private_key, environment, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, merchant_id, public_key, private_key, environment, start_date=None)[source]\u00b6
\n

Airbyte Source for Braintree.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/braintree

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • merchant_id (str) \u2013 The unique identifier for your entire gateway account. See the docs for more information on how to obtain this ID.

  • \n
  • public_key (str) \u2013 Braintree Public Key. See the docs for more information on how to obtain this key.

  • \n
  • private_key (str) \u2013 Braintree Private Key. See the docs for more information on how to obtain this key.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • environment (str) \u2013 Environment specifies where the data will come from.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SalesloftSource(name, client_id, client_secret, refresh_token, start_date)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, start_date)[source]\u00b6
\n

Airbyte Source for Salesloft.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/salesloft

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 The Client ID of your Salesloft developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Salesloft developer application.

  • \n
  • refresh_token (str) \u2013 The token for obtaining a new access token.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Salesloft API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LinnworksSource(name, application_id, application_secret, token, start_date)[source]\u00b6
\n
\n
\n__init__(name, application_id, application_secret, token, start_date)[source]\u00b6
\n

Airbyte Source for Linnworks.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/linnworks

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • application_id (str) \u2013 Linnworks Application ID

  • \n
  • application_secret (str) \u2013 Linnworks Application Secret

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ChargebeeSource(name, site, site_api_key, start_date, product_catalog)[source]\u00b6
\n
\n
\n__init__(name, site, site_api_key, start_date, product_catalog)[source]\u00b6
\n

Airbyte Source for Chargebee.

\n

Documentation can be found at https://apidocs.chargebee.com/docs/api

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • site (str) \u2013 The site prefix for your Chargebee instance.

  • \n
  • site_api_key (str) \u2013 Chargebee API Key. See the docs for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • product_catalog (str) \u2013 Product Catalog version of your Chargebee site. Instructions on how to find your version you may find here under API Version section.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource(name, property_id, credentials, date_ranges_start_date, custom_reports=None, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, property_id, credentials, date_ranges_start_date, custom_reports=None, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Google Analytics Data Api.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-v4

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • property_id (str) \u2013 A Google Analytics GA4 property identifier whose events are tracked. Specified in the URL path and not the body

  • \n
  • credentials (Union[GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth, GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication]) \u2013 Credentials for the service

  • \n
  • date_ranges_start_date (str) \u2013 The start date. One of the values Ndaysago, yesterday, today or in the format YYYY-MM-DD

  • \n
  • custom_reports (Optional[str]) \u2013 A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.

  • \n
  • window_in_days (Optional[int]) \u2013 The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. The minimum allowed value for this field is 1, and the maximum is 364.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication(credentials_json, auth_type=None)[source]\u00b6
\n
\n
\n__init__(credentials_json, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OutreachSource(name, client_id, client_secret, refresh_token, redirect_uri, start_date)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, redirect_uri, start_date)[source]\u00b6
\n

Airbyte Source for Outreach.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/outreach

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 The Client ID of your Outreach developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Outreach developer application.

  • \n
  • refresh_token (str) \u2013 The token for obtaining the new access token.

  • \n
  • redirect_uri (str) \u2013 A Redirect URI is the location where the authorization server sends the user once the app has been successfully authorized and granted an authorization code or access token.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Outreach API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LemlistSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Lemlist.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/lemlist

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Lemlist API key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ApifyDatasetSource(name, datasetId, clean=None)[source]\u00b6
\n
\n
\n__init__(name, datasetId, clean=None)[source]\u00b6
\n

Airbyte Source for Apify Dataset.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/apify-dataset

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • datasetId (str) \u2013 ID of the dataset you would like to load to Airbyte.

  • \n
  • clean (Optional[bool]) \u2013 If set to true, only clean items will be downloaded from the dataset. See description of what clean means in Apify API docs. If not sure, set clean to false.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RecurlySource(name, api_key, begin_time=None, end_time=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, begin_time=None, end_time=None)[source]\u00b6
\n

Airbyte Source for Recurly.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/recurly

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Recurly API Key. See the docs for more information on how to generate this key.

  • \n
  • begin_time (Optional[str]) \u2013 ISO8601 timestamp from which the replication from Recurly API will start from.

  • \n
  • end_time (Optional[str]) \u2013 ISO8601 timestamp to which the replication from Recurly API will stop. Records after that date won\u2019t be imported.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskTalkSource(name, subdomain, credentials, start_date)[source]\u00b6
\n
\n
\n__init__(name, subdomain, credentials, start_date)[source]\u00b6
\n

Airbyte Source for Zendesk Talk.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-talk

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • subdomain (str) \u2013 This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.

  • \n
  • credentials (Union[ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20]) \u2013 Zendesk service provides two authentication methods. Choose between: OAuth2.0 or API token.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Talk API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskTalkSource.APIToken(email, api_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(email, api_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskTalkSource.OAuth20(access_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(access_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SftpSource(name, user, host, port, credentials, file_types=None, folder_path=None, file_pattern=None)[source]\u00b6
\n
\n
\n__init__(name, user, host, port, credentials, file_types=None, folder_path=None, file_pattern=None)[source]\u00b6
\n

Airbyte Source for Sftp.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/sftp

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • user (str) \u2013 The server user

  • \n
  • host (str) \u2013 The server host address

  • \n
  • port (int) \u2013 The server port

  • \n
  • credentials (Union[SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication]) \u2013 The server authentication method

  • \n
  • file_types (Optional[str]) \u2013 Comma-separated file types. Currently only \u2018csv\u2019 and \u2018json\u2019 types are supported.

  • \n
  • folder_path (Optional[str]) \u2013 The directory to search files for sync

  • \n
  • file_pattern (Optional[str]) \u2013 The regular expression to specify files for sync in a chosen Folder Path

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SftpSource.PasswordAuthentication(auth_user_password)[source]\u00b6
\n
\n
\n__init__(auth_user_password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SftpSource.SSHKeyAuthentication(auth_ssh_key)[source]\u00b6
\n
\n
\n__init__(auth_ssh_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WhiskyHunterSource(name)[source]\u00b6
\n
\n
\n__init__(name)[source]\u00b6
\n

Airbyte Source for Whisky Hunter.

\n

Documentation can be found at https://docs.airbyte.io/integrations/sources/whisky-hunter

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshdeskSource(name, domain, api_key, requests_per_minute=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, domain, api_key, requests_per_minute=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Freshdesk.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshdesk

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Freshdesk domain

  • \n
  • api_key (str) \u2013 Freshdesk API Key. See the docs for more information on how to obtain this key.

  • \n
  • requests_per_minute (Optional[int]) \u2013 The number of requests per minute that this source is allowed to use. There is a rate limit of 50 requests per minute per app per account.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time. Any data created after this date will be replicated. If this parameter is not set, all data will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GocardlessSource(name, access_token, gocardless_environment, gocardless_version, start_date)[source]\u00b6
\n
\n
\n__init__(name, access_token, gocardless_environment, gocardless_version, start_date)[source]\u00b6
\n

Airbyte Source for Gocardless.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/gocardless

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_token (str) \u2013 Gocardless API TOKEN

  • \n
  • gocardless_environment (str) \u2013 Environment you are trying to connect to.

  • \n
  • gocardless_version (str) \u2013 GoCardless version. This is a date. You can find the latest here: https://developer.gocardless.com/api-reference/#api-usage-making-requests

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZuoraSource(name, start_date, tenant_endpoint, data_query, client_id, client_secret, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, tenant_endpoint, data_query, client_id, client_secret, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Zuora.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zuora

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 Start Date in format: YYYY-MM-DD

  • \n
  • window_in_days (Optional[str]) \u2013 The amount of days for each data-chunk beginning from start_date. Bigger the value - faster the fetch. (0.1 - as for couple of hours, 1 - as for a Day; 364 - as for a Year).

  • \n
  • tenant_endpoint (str) \u2013 Please choose the right endpoint where your Tenant is located. More info by this Link

  • \n
  • data_query (str) \u2013 Choose between Live, or Unlimited - the optimized, replicated database at 12 hours freshness for high volume extraction Link

  • \n
  • client_id (str) \u2013 Your OAuth user Client ID

  • \n
  • client_secret (str) \u2013 Your OAuth user Client Secret

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MarketoSource(name, domain_url, client_id, client_secret, start_date)[source]\u00b6
\n
\n
\n__init__(name, domain_url, client_id, client_secret, start_date)[source]\u00b6
\n

Airbyte Source for Marketo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/marketo

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain_url (str) \u2013 Your Marketo Base URL. See the docs for info on how to obtain this.

  • \n
  • client_id (str) \u2013 The Client ID of your Marketo developer application. See the docs for info on how to obtain this.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Marketo developer application. See the docs for info on how to obtain this.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DriftSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Drift.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/drift

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass DriftSource.OAuth20(client_id, client_secret, access_token, refresh_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token, refresh_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass DriftSource.AccessToken(access_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(access_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PokeapiSource(name, pokemon_name)[source]\u00b6
\n
\n
\n__init__(name, pokemon_name)[source]\u00b6
\n

Airbyte Source for Pokeapi.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/pokeapi

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • pokemon_name (str) \u2013 Pokemon requested from the API.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.NetsuiteSource(name, realm, consumer_key, consumer_secret, token_key, token_secret, start_datetime, object_types=None, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, realm, consumer_key, consumer_secret, token_key, token_secret, start_datetime, object_types=None, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Netsuite.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • realm (str) \u2013 Netsuite realm e.g. 2344535, as for production or 2344535_SB1, as for the sandbox

  • \n
  • consumer_key (str) \u2013 Consumer key associated with your integration

  • \n
  • consumer_secret (str) \u2013 Consumer secret associated with your integration

  • \n
  • token_key (str) \u2013 Access token key

  • \n
  • token_secret (str) \u2013 Access token secret

  • \n
  • object_types (Optional[List[str]]) \u2013 The API names of the Netsuite objects you want to sync. Setting this speeds up the connection setup process by limiting the number of schemas that need to be retrieved from Netsuite.

  • \n
  • start_datetime (str) \u2013 Starting point for your data replication, in format of \u201cYYYY-MM-DDTHH:mm:ssZ\u201d

  • \n
  • window_in_days (Optional[int]) \u2013 The amount of days used to query the data with date chunks. Set smaller value, if you have lots of data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HubplannerSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Hubplanner.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/hubplanner

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.Dv360Source(name, credentials, partner_id, start_date, end_date=None, filters=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, partner_id, start_date, end_date=None, filters=None)[source]\u00b6
\n

Airbyte Source for Dv 360.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Dv360Source.Oauth2Credentials) \u2013 Oauth2 credentials

  • \n
  • partner_id (int) \u2013 Partner ID

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25. Any data before this date will not be replicated

  • \n
  • end_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.

  • \n
  • filters (Optional[List[str]]) \u2013 Filters for the dimensions. Each filter object has 2 keys: \u2018type\u2019 for the name of the dimension to be used, and \u2018value\u2019 for the value of the filter.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass Dv360Source.Oauth2Credentials(access_token, refresh_token, token_uri, client_id, client_secret)[source]\u00b6
\n
\n
\n__init__(access_token, refresh_token, token_uri, client_id, client_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.NotionSource(name, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Notion.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/notion

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00.000Z. Any data before this date will not be replicated.

  • \n
  • credentials (Union[NotionSource.OAuth20, NotionSource.AccessToken]) \u2013 Pick an authentication method.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass NotionSource.OAuth20(client_id, client_secret, access_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass NotionSource.AccessToken(token)[source]\u00b6
\n
\n
\n__init__(token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskSunshineSource(name, subdomain, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, subdomain, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Zendesk Sunshine.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk_sunshine

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • subdomain (str) \u2013 The subdomain for your Zendesk Account.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Sunshine API, in the format YYYY-MM-DDT00:00:00Z.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskSunshineSource.OAuth20(client_id, client_secret, access_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskSunshineSource.APIToken(api_token, email)[source]\u00b6
\n
\n
\n__init__(api_token, email)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PinterestSource(name, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Pinterest.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/pinterest

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 A date in the format YYYY-MM-DD. If you have not set a date, it would be defaulted to latest allowed date by api (914 days from today).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PinterestSource.OAuth20(refresh_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(refresh_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PinterestSource.AccessToken(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MetabaseSource(name, instance_api_url, username=None, password=None, session_token=None)[source]\u00b6
\n
\n
\n__init__(name, instance_api_url, username=None, password=None, session_token=None)[source]\u00b6
\n

Airbyte Source for Metabase.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/metabase

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • instance_api_url (str) \u2013 URL to your metabase instance API

  • \n
  • session_token (Optional[str]) \u2013 To generate your session token, you need to run the following command: ` curl -X POST \\\\   -H "Content-Type: application/json" \\\\   -d '{"username": "person@metabase.com", "password": "fakepassword"}' \\\\   http://localhost:3000/api/session ` Then copy the value of the id field returned by a successful call to that API. Note that by default, sessions are good for 14 days and need to be regenerated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HubspotSource(name, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Hubspot.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/hubspot

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • credentials (Union[HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP]) \u2013 Choose how to authenticate to HubSpot.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass HubspotSource.OAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass HubspotSource.APIKey(api_key)[source]\u00b6
\n
\n
\n__init__(api_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass HubspotSource.PrivateAPP(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HarvestSource(name, account_id, replication_start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, account_id, replication_start_date, credentials)[source]\u00b6
\n

Airbyte Source for Harvest.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/harvest

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • account_id (str) \u2013 Harvest account ID. Required for all Harvest requests in pair with Personal Access Token

  • \n
  • replication_start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • credentials (Union[HarvestSource.AuthenticateViaHarvestOAuth, HarvestSource.AuthenticateWithPersonalAccessToken]) \u2013 Choose how to authenticate to Harvest.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass HarvestSource.AuthenticateViaHarvestOAuth(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass HarvestSource.AuthenticateWithPersonalAccessToken(api_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(api_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GithubSource(name, credentials, start_date, repository, branch=None, page_size_for_large_streams=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, repository, branch=None, page_size_for_large_streams=None)[source]\u00b6
\n

Airbyte Source for Github.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/github

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Union[GithubSource.OAuthCredentials, GithubSource.PATCredentials]) \u2013 Choose how to authenticate to GitHub

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data from GitHub in the format YYYY-MM-DDT00:00:00Z. For the streams which support this configuration, only data generated on or after the start date will be replicated. This field doesn\u2019t apply to all streams, see the docs for more info

  • \n
  • repository (str) \u2013 Space-delimited list of GitHub organizations/repositories, e.g. airbytehq/airbyte for a single repository, airbytehq/* to get all repositories from an organization, and airbytehq/airbyte airbytehq/another-repo for multiple repositories.

  • \n
  • branch (Optional[str]) \u2013 Space-delimited list of GitHub repository branches to pull commits for, e.g. airbytehq/airbyte/master. If no branches are specified for a repository, the default branch will be pulled.

  • \n
  • page_size_for_large_streams (Optional[int]) \u2013 The Github connector contains several streams with a large amount of data. The page size of such streams depends on the size of your repository. We recommend that you specify values between 10 and 30.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GithubSource.OAuthCredentials(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GithubSource.PATCredentials(personal_access_token)[source]\u00b6
\n
\n
\n__init__(personal_access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.E2eTestSource(name, max_messages, mock_catalog, type=None, seed=None, message_interval_ms=None)[source]\u00b6
\n
\n
\n__init__(name, max_messages, mock_catalog, type=None, seed=None, message_interval_ms=None)[source]\u00b6
\n

Airbyte Source for E2e Test.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/e2e-test

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • max_messages (int) \u2013 Number of records to emit per stream. Min 1. Max 100 billion.

  • \n
  • seed (Optional[int]) \u2013 When the seed is unspecified, the current time millis will be used as the seed. Range: [0, 1000000].

  • \n
  • message_interval_ms (Optional[int]) \u2013 Interval between messages in ms. Min 0 ms. Max 60000 ms (1 minute).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass E2eTestSource.SingleSchema(stream_name, stream_schema, stream_duplication=None)[source]\u00b6
\n
\n
\n__init__(stream_name, stream_schema, stream_duplication=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass E2eTestSource.MultiSchema(stream_schemas)[source]\u00b6
\n
\n
\n__init__(stream_schemas)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MysqlSource(name, host, port, database, username, ssl_mode, replication_method, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, ssl_mode, replication_method, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Mysql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mysql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The host name of the database.

  • \n
  • port (int) \u2013 The port to connect to.

  • \n
  • database (str) \u2013 The database name.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
  • ssl_mode (Union[MysqlSource.Preferred, MysqlSource.Required, MysqlSource.VerifyCA, MysqlSource.VerifyIdentity]) \u2013 SSL connection modes. preferred - Automatically attempt SSL connection. If the MySQL server does not support SSL, continue with a regular connection. required - Always connect with SSL. If the MySQL server doesn\u2019t support SSL, the connection will not be established. Certificate Authority (CA) and Hostname are not verified. verify-ca - Always connect with SSL. Verifies CA, but allows connection even if Hostname does not match. Verify Identity - Always connect with SSL. Verify both CA and Hostname. Read more in the docs.

  • \n
  • replication_method (Union[MysqlSource.Standard, MysqlSource.LogicalReplicationCDC]) \u2013 Replication method to use for extracting data from the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MysqlSource.Preferred[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.Required[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.VerifyCA(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.VerifyIdentity(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.LogicalReplicationCDC(initial_waiting_seconds=None, server_time_zone=None)[source]\u00b6
\n
\n
\n__init__(initial_waiting_seconds=None, server_time_zone=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MyHoursSource(name, email, password, start_date, logs_batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, email, password, start_date, logs_batch_size=None)[source]\u00b6
\n

Airbyte Source for My Hours.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/my-hours

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • email (str) \u2013 Your My Hours username

  • \n
  • password (str) \u2013 The password associated with the username

  • \n
  • start_date (str) \u2013 Start date for collecting time logs

  • \n
  • logs_batch_size (Optional[int]) \u2013 Pagination size used for retrieving logs in days

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KyribaSource(name, domain, username, password, start_date, end_date=None)[source]\u00b6
\n
\n
\n__init__(name, domain, username, password, start_date, end_date=None)[source]\u00b6
\n

Airbyte Source for Kyriba.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Kyriba domain

  • \n
  • username (str) \u2013 Username to be used in basic auth

  • \n
  • password (str) \u2013 Password to be used in basic auth

  • \n
  • start_date (str) \u2013 The date the sync should start from.

  • \n
  • end_date (Optional[str]) \u2013 The date the sync should end. If left empty, the sync will run to the current date.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource(name, site_urls, start_date, authorization, end_date=None, custom_reports=None)[source]\u00b6
\n
\n
\n__init__(name, site_urls, start_date, authorization, end_date=None, custom_reports=None)[source]\u00b6
\n

Airbyte Source for Google Search Console.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-search-console

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • site_urls (List[str]) \u2013 The URLs of the website property attached to your GSC account. Read more here.

  • \n
  • start_date (str) \u2013 UTC date in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
  • end_date (Optional[str]) \u2013 UTC date in the format 2017-01-25. Any data after this date will not be replicated. Must be greater or equal to the start date field.

  • \n
  • custom_reports (Optional[str]) \u2013 A JSON array describing the custom reports you want to sync from Google Search Console. See the docs for more information about the exact format you can use to fill out this field.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleSearchConsoleSource.OAuth(client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleSearchConsoleSource.ServiceAccountKeyAuthentication(service_account_info, email)[source]\u00b6
\n
\n
\n__init__(service_account_info, email)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FacebookMarketingSource(name, account_id, start_date, access_token, end_date=None, include_deleted=None, fetch_thumbnail_images=None, custom_insights=None, page_size=None, insights_lookback_window=None, max_batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, account_id, start_date, access_token, end_date=None, include_deleted=None, fetch_thumbnail_images=None, custom_insights=None, page_size=None, insights_lookback_window=None, max_batch_size=None)[source]\u00b6
\n

Airbyte Source for Facebook Marketing.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-marketing

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • account_id (str) \u2013 The Facebook Ad account ID to use when pulling data from the Facebook Marketing API.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • end_date (Optional[str]) \u2013 The date until which you\u2019d like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the latest data.

  • \n
  • access_token (str) \u2013 The value of the access token generated. See the docs for more information

  • \n
  • include_deleted (Optional[bool]) \u2013 Include data from deleted Campaigns, Ads, and AdSets

  • \n
  • fetch_thumbnail_images (Optional[bool]) \u2013 In each Ad Creative, fetch the thumbnail_url and store the result in thumbnail_data_url

  • \n
  • custom_insights (Optional[List[FacebookMarketingSource.InsightConfig]]) \u2013 A list which contains insights entries. Each entry must have a name and can contain fields, breakdowns or action_breakdowns.

  • \n
  • page_size (Optional[int]) \u2013 Page size used when sending requests to Facebook API to specify number of records per page when response has pagination. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.

  • \n
  • insights_lookback_window (Optional[int]) \u2013 The attribution window

  • \n
  • max_batch_size (Optional[int]) \u2013 Maximum batch size used when sending batch requests to Facebook API. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass FacebookMarketingSource.InsightConfig(name, fields=None, breakdowns=None, action_breakdowns=None, time_increment=None, start_date=None, end_date=None, insights_lookback_window=None)[source]\u00b6
\n
\n
\n__init__(name, fields=None, breakdowns=None, action_breakdowns=None, time_increment=None, start_date=None, end_date=None, insights_lookback_window=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SurveymonkeySource(name, access_token, start_date, survey_ids=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, start_date, survey_ids=None)[source]\u00b6
\n

Airbyte Source for Surveymonkey.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/surveymonkey

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_token (str) \u2013 Access Token for making authenticated requests. See the docs for information on how to generate this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • survey_ids (Optional[List[str]]) \u2013 IDs of the surveys from which you\u2019d like to replicate data. If left empty, data from all surveys to which you have access will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PardotSource(name, pardot_business_unit_id, client_id, client_secret, refresh_token, start_date=None, is_sandbox=None)[source]\u00b6
\n
\n
\n__init__(name, pardot_business_unit_id, client_id, client_secret, refresh_token, start_date=None, is_sandbox=None)[source]\u00b6
\n

Airbyte Source for Pardot.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • pardot_business_unit_id (str) \u2013 Pardot Business ID, can be found at Setup > Pardot > Pardot Account Setup

  • \n
  • client_id (str) \u2013 The Consumer Key that can be found when viewing your app in Salesforce

  • \n
  • client_secret (str) \u2013 The Consumer Secret that can be found when viewing your app in Salesforce

  • \n
  • refresh_token (str) \u2013 Salesforce Refresh Token used for Airbyte to access your Salesforce account. If you don\u2019t know what this is, follow this guide to retrieve it.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Leave blank to skip this filter

  • \n
  • is_sandbox (Optional[bool]) \u2013 Whether or not the app is in a Salesforce sandbox. If you do not know what this is, assume it is false.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FlexportSource(name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Flexport.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/flexport

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZenefitsSource(name, token)[source]\u00b6
\n
\n
\n__init__(name, token)[source]\u00b6
\n

Airbyte Source for Zenefits.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • token (str) \u2013 Use the Sync with Zenefits button at the link given in the readme file to get the token used to access the API

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KafkaSource(name, MessageFormat, bootstrap_servers, subscription, protocol, test_topic=None, group_id=None, max_poll_records=None, polling_time=None, client_id=None, enable_auto_commit=None, auto_commit_interval_ms=None, client_dns_lookup=None, retry_backoff_ms=None, request_timeout_ms=None, receive_buffer_bytes=None, auto_offset_reset=None, repeated_calls=None, max_records_process=None)[source]\u00b6
\n
\n
\n__init__(name, MessageFormat, bootstrap_servers, subscription, protocol, test_topic=None, group_id=None, max_poll_records=None, polling_time=None, client_id=None, enable_auto_commit=None, auto_commit_interval_ms=None, client_dns_lookup=None, retry_backoff_ms=None, request_timeout_ms=None, receive_buffer_bytes=None, auto_offset_reset=None, repeated_calls=None, max_records_process=None)[source]\u00b6
\n

Airbyte Source for Kafka.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/kafka

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • MessageFormat (Union[KafkaSource.JSON, KafkaSource.AVRO]) \u2013 The serialization used based on this

  • \n
  • bootstrap_servers (str) \u2013 A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping\u2014this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,\u2026. Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).

  • \n
  • subscription (Union[KafkaSource.ManuallyAssignAListOfPartitions, KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern]) \u2013 You can choose to manually assign a list of partitions, or subscribe to all topics matching specified pattern to get dynamically assigned partitions.

  • \n
  • test_topic (Optional[str]) \u2013 The topic used to test whether Airbyte can consume messages.

  • \n
  • group_id (Optional[str]) \u2013 The Group ID is how you distinguish different consumer groups.

  • \n
  • max_poll_records (Optional[int]) \u2013 The maximum number of records returned in a single call to poll(). Note, that max_poll_records does not impact the underlying fetching behavior. The consumer will cache the records from each fetch request and returns them incrementally from each poll.

  • \n
  • polling_time (Optional[int]) \u2013 Amount of time Kafka connector should try to poll for messages.

  • \n
  • protocol (Union[KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL]) \u2013 The Protocol used to communicate with brokers.

  • \n
  • client_id (Optional[str]) \u2013 An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.

  • \n
  • enable_auto_commit (Optional[bool]) \u2013 If true, the consumer\u2019s offset will be periodically committed in the background.

  • \n
  • auto_commit_interval_ms (Optional[int]) \u2013 The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if enable.auto.commit is set to true.

  • \n
  • client_dns_lookup (Optional[str]) \u2013 Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.

  • \n
  • retry_backoff_ms (Optional[int]) \u2013 The amount of time to wait before attempting to retry a failed request to a given topic partition. This avoids repeatedly sending requests in a tight loop under some failure scenarios.

  • \n
  • request_timeout_ms (Optional[int]) \u2013 The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.

  • \n
  • receive_buffer_bytes (Optional[int]) \u2013 The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.

  • \n
  • auto_offset_reset (Optional[str]) \u2013 What to do when there is no initial offset in Kafka or if the current offset does not exist any more on the server - earliest: automatically reset the offset to the earliest offset, latest: automatically reset the offset to the latest offset, none: throw exception to the consumer if no previous offset is found for the consumer\u2019s group, anything else: throw exception to the consumer.

  • \n
  • repeated_calls (Optional[int]) \u2013 The number of repeated calls to poll() if no messages were received.

  • \n
  • max_records_process (Optional[int]) \u2013 The maximum number of records to be processed per execution

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass KafkaSource.JSON(deserialization_type=None)[source]\u00b6
\n
\n
\n__init__(deserialization_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.AVRO(deserialization_type=None, deserialization_strategy=None, schema_registry_url=None, schema_registry_username=None, schema_registry_password=None)[source]\u00b6
\n
\n
\n__init__(deserialization_type=None, deserialization_strategy=None, schema_registry_url=None, schema_registry_username=None, schema_registry_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.ManuallyAssignAListOfPartitions(topic_partitions)[source]\u00b6
\n
\n
\n__init__(topic_partitions)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern(topic_pattern)[source]\u00b6
\n
\n
\n__init__(topic_pattern)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.PLAINTEXT(security_protocol)[source]\u00b6
\n
\n
\n__init__(security_protocol)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.SASLPLAINTEXT(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.SASLSSL(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\n

Managed Config Generated Destinations\u00b6

\n
\n
\nclass dagster_airbyte.managed.generated.destinations.DynamodbDestination(name, dynamodb_table_name_prefix, dynamodb_region, access_key_id, secret_access_key, dynamodb_endpoint=None)[source]\u00b6
\n
\n
\n__init__(name, dynamodb_table_name_prefix, dynamodb_region, access_key_id, secret_access_key, dynamodb_endpoint=None)[source]\u00b6
\n

Airbyte Destination for Dynamodb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/dynamodb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • dynamodb_endpoint (Optional[str]) \u2013 This is your DynamoDB endpoint URL (if you are working with AWS DynamoDB, just leave empty).

  • \n
  • dynamodb_table_name_prefix (str) \u2013 The prefix to use when naming DynamoDB tables.

  • \n
  • dynamodb_region (str) \u2013 The region of the DynamoDB.

  • \n
  • access_key_id (str) \u2013 The access key id to access the DynamoDB. Airbyte requires Read and Write permissions to the DynamoDB.

  • \n
  • secret_access_key (str) \u2013 The corresponding secret to the access key id.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.BigqueryDestination(name, project_id, dataset_location, dataset_id, loading_method, credentials_json=None, transformation_priority=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, dataset_location, dataset_id, loading_method, credentials_json=None, transformation_priority=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n

Airbyte Destination for Bigquery.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target BigQuery dataset. Read more here.

  • \n
  • dataset_location (str) \u2013 The location of the dataset. Warning: Changes made after creation will not be applied. Read more here.

  • \n
  • dataset_id (str) \u2013 The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.

  • \n
  • loading_method (Union[BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging]) \u2013 Loading method used to select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.

  • \n
  • credentials_json (Optional[str]) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.

  • \n
  • transformation_priority (Optional[str]) \u2013 Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type here. Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don\u2019t count towards your concurrent rate limit. Read more about batch queries here. The default \u201cinteractive\u201d value is used if not set explicitly.

  • \n
  • big_query_client_buffer_size_mb (Optional[int]) \u2013 Google BigQuery client\u2019s chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass BigqueryDestination.StandardInserts[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDestination.HMACKey(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n
\n__init__(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDestination.GCSStaging(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n
\n__init__(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RabbitmqDestination(name, host, routing_key, ssl=None, port=None, virtual_host=None, username=None, password=None, exchange=None)[source]\u00b6
\n
\n
\n__init__(name, host, routing_key, ssl=None, port=None, virtual_host=None, username=None, password=None, exchange=None)[source]\u00b6
\n

Airbyte Destination for Rabbitmq.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/rabbitmq

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • ssl (Optional[bool]) \u2013 SSL enabled.

  • \n
  • host (str) \u2013 The RabbitMQ host name.

  • \n
  • port (Optional[int]) \u2013 The RabbitMQ port.

  • \n
  • virtual_host (Optional[str]) \u2013 The RabbitMQ virtual host name.

  • \n
  • username (Optional[str]) \u2013 The username to connect.

  • \n
  • password (Optional[str]) \u2013 The password to connect.

  • \n
  • exchange (Optional[str]) \u2013 The exchange name.

  • \n
  • routing_key (str) \u2013 The routing key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KvdbDestination(name, bucket_id, secret_key)[source]\u00b6
\n
\n
\n__init__(name, bucket_id, secret_key)[source]\u00b6
\n

Airbyte Destination for Kvdb.

\n

Documentation can be found at https://kvdb.io/docs/api/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • bucket_id (str) \u2013 The ID of your KVdb bucket.

  • \n
  • secret_key (str) \u2013 Your bucket Secret Key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ClickhouseDestination(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Destination for Clickhouse.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 HTTP port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.AmazonSqsDestination(name, queue_url, region, message_delay=None, access_key=None, secret_key=None, message_body_key=None, message_group_id=None)[source]\u00b6
\n
\n
\n__init__(name, queue_url, region, message_delay=None, access_key=None, secret_key=None, message_body_key=None, message_group_id=None)[source]\u00b6
\n

Airbyte Destination for Amazon Sqs.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/amazon-sqs

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • queue_url (str) \u2013 URL of the SQS Queue

  • \n
  • region (str) \u2013 AWS Region of the SQS Queue

  • \n
  • message_delay (Optional[int]) \u2013 Modify the Message Delay of the individual message from the Queue\u2019s default (seconds).

  • \n
  • access_key (Optional[str]) \u2013 The Access Key ID of the AWS IAM Role to use for sending messages

  • \n
  • secret_key (Optional[str]) \u2013 The Secret Key of the AWS IAM Role to use for sending messages

  • \n
  • message_body_key (Optional[str]) \u2013 Use this property to extract the contents of the named key in the input record to use as the SQS message body. If not set, the entire content of the input record data is used as the message body.

  • \n
  • message_group_id (Optional[str]) \u2013 The tag that specifies that a message belongs to a specific message group. This parameter applies only to, and is REQUIRED by, FIFO queues.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MariadbColumnstoreDestination(name, host, port, database, username, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Mariadb Columnstore.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mariadb-columnstore

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The Hostname of the database.

  • \n
  • port (int) \u2013 The Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 The Username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KinesisDestination(name, endpoint, region, shardCount, accessKey, privateKey, bufferSize)[source]\u00b6
\n
\n
\n__init__(name, endpoint, region, shardCount, accessKey, privateKey, bufferSize)[source]\u00b6
\n

Airbyte Destination for Kinesis.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/kinesis

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • endpoint (str) \u2013 AWS Kinesis endpoint.

  • \n
  • region (str) \u2013 AWS region. Your account determines the Regions that are available to you.

  • \n
  • shardCount (int) \u2013 Number of shards to which the data should be streamed.

  • \n
  • accessKey (str) \u2013 Generate the AWS Access Key for current user.

  • \n
  • privateKey (str) \u2013 The AWS Private Key - a string of numbers and letters that are unique for each account, also known as a \u201crecovery phrase\u201d.

  • \n
  • bufferSize (int) \u2013 Buffer size for storing kinesis records before being batch streamed.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination(name, azure_blob_storage_account_name, azure_blob_storage_account_key, format, azure_blob_storage_endpoint_domain_name=None, azure_blob_storage_container_name=None, azure_blob_storage_output_buffer_size=None)[source]\u00b6
\n
\n
\n__init__(name, azure_blob_storage_account_name, azure_blob_storage_account_key, format, azure_blob_storage_endpoint_domain_name=None, azure_blob_storage_container_name=None, azure_blob_storage_output_buffer_size=None)[source]\u00b6
\n

Airbyte Destination for Azure Blob Storage.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/azureblobstorage

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • azure_blob_storage_endpoint_domain_name (Optional[str]) \u2013 This is the Azure Blob Storage endpoint domain name. Leave the default value (or leave it empty if running the container from the command line) to use the native Microsoft endpoint from the example.

  • \n
  • azure_blob_storage_container_name (Optional[str]) \u2013 The name of the Azure blob storage container. If it does not exist, it will be created automatically. May be left empty, in which case a container named airbytecontainer+timestamp will be created automatically.

  • \n
  • azure_blob_storage_account_name (str) \u2013 The account\u2019s name of the Azure Blob Storage.

  • \n
  • azure_blob_storage_account_key (str) \u2013 The Azure blob storage account key.

  • \n
  • azure_blob_storage_output_buffer_size (Optional[int]) \u2013 The amount of megabytes to buffer for the output stream to Azure. This will impact memory footprint on workers, but may need adjustment for performance and appropriate block size in Azure.

  • \n
  • format (Union[AzureBlobStorageDestination.CSVCommaSeparatedValues, AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON]) \u2013 Output data format

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass AzureBlobStorageDestination.CSVCommaSeparatedValues(flattening)[source]\u00b6
\n
\n
\n__init__(flattening)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KafkaDestination(name, bootstrap_servers, topic_pattern, protocol, acks, enable_idempotence, compression_type, batch_size, linger_ms, max_in_flight_requests_per_connection, client_dns_lookup, buffer_memory, max_request_size, retries, socket_connection_setup_timeout_ms, socket_connection_setup_timeout_max_ms, max_block_ms, request_timeout_ms, delivery_timeout_ms, send_buffer_bytes, receive_buffer_bytes, test_topic=None, sync_producer=None, client_id=None)[source]\u00b6
\n
\n
\n__init__(name, bootstrap_servers, topic_pattern, protocol, acks, enable_idempotence, compression_type, batch_size, linger_ms, max_in_flight_requests_per_connection, client_dns_lookup, buffer_memory, max_request_size, retries, socket_connection_setup_timeout_ms, socket_connection_setup_timeout_max_ms, max_block_ms, request_timeout_ms, delivery_timeout_ms, send_buffer_bytes, receive_buffer_bytes, test_topic=None, sync_producer=None, client_id=None)[source]\u00b6
\n

Airbyte Destination for Kafka.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/kafka

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • bootstrap_servers (str) \u2013 A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping\u2014this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,\u2026. Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).

  • \n
  • topic_pattern (str) \u2013 Topic pattern in which the records will be sent. You can use patterns like \u2018{namespace}\u2019 and/or \u2018{stream}\u2019 to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.

  • \n
  • test_topic (Optional[str]) \u2013 Topic to test if Airbyte can produce messages.

  • \n
  • sync_producer (Optional[bool]) \u2013 Wait synchronously until the record has been sent to Kafka.

  • \n
  • protocol (Union[KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL]) \u2013 Protocol used to communicate with brokers.

  • \n
  • client_id (Optional[str]) \u2013 An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.

  • \n
  • acks (str) \u2013 The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the durability of records that are sent.

  • \n
  • enable_idempotence (bool) \u2013 When set to \u2018true\u2019, the producer will ensure that exactly one copy of each message is written in the stream. If \u2018false\u2019, producer retries due to broker failures, etc., may write duplicates of the retried message in the stream.

  • \n
  • compression_type (str) \u2013 The compression type for all data generated by the producer.

  • \n
  • batch_size (int) \u2013 The producer will attempt to batch records together into fewer requests whenever multiple records are being sent to the same partition.

  • \n
  • linger_ms (str) \u2013 The producer groups together any records that arrive in between request transmissions into a single batched request.

  • \n
  • max_in_flight_requests_per_connection (int) \u2013 The maximum number of unacknowledged requests the client will send on a single connection before blocking. Can be greater than 1, and the maximum value supported with idempotency is 5.

  • \n
  • client_dns_lookup (str) \u2013 Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.

  • \n
  • buffer_memory (str) \u2013 The total bytes of memory the producer can use to buffer records waiting to be sent to the server.

  • \n
  • max_request_size (int) \u2013 The maximum size of a request in bytes.

  • \n
  • retries (int) \u2013 Setting a value greater than zero will cause the client to resend any record whose send fails with a potentially transient error.

  • \n
  • socket_connection_setup_timeout_ms (str) \u2013 The amount of time the client will wait for the socket connection to be established.

  • \n
  • socket_connection_setup_timeout_max_ms (str) \u2013 The maximum amount of time the client will wait for the socket connection to be established. The connection setup timeout will increase exponentially for each consecutive connection failure up to this maximum.

  • \n
  • max_block_ms (str) \u2013 The configuration controls how long the KafkaProducer\u2019s send(), partitionsFor(), initTransactions(), sendOffsetsToTransaction(), commitTransaction() and abortTransaction() methods will block.

  • \n
  • request_timeout_ms (int) \u2013 The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.

  • \n
  • delivery_timeout_ms (int) \u2013 An upper bound on the time to report success or failure after a call to \u2018send()\u2019 returns.

  • \n
  • send_buffer_bytes (int) \u2013 The size of the TCP send buffer (SO_SNDBUF) to use when sending data. If the value is -1, the OS default will be used.

  • \n
  • receive_buffer_bytes (int) \u2013 The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass KafkaDestination.PLAINTEXT(security_protocol)[source]\u00b6
\n
\n
\n__init__(security_protocol)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaDestination.SASLPLAINTEXT(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaDestination.SASLSSL(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ElasticsearchDestination(name, endpoint, authenticationMethod, upsert=None)[source]\u00b6
\n
\n
\n__init__(name, endpoint, authenticationMethod, upsert=None)[source]\u00b6
\n

Airbyte Destination for Elasticsearch.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/elasticsearch

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • endpoint (str) \u2013 The full url of the Elasticsearch server

  • \n
  • upsert (Optional[bool]) \u2013 If a primary key identifier is defined in the source, an upsert will be performed using the primary key value as the elasticsearch doc id. Does not support composite primary keys.

  • \n
  • authenticationMethod (Union[ElasticsearchDestination.None_, ElasticsearchDestination.ApiKeySecret, ElasticsearchDestination.UsernamePassword]) \u2013 The type of authentication to be used

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ElasticsearchDestination.None_[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchDestination.ApiKeySecret(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n
\n__init__(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchDestination.UsernamePassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MysqlDestination(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Mysql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mysql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.SftpJsonDestination(name, host, username, password, destination_path, port=None)[source]\u00b6
\n
\n
\n__init__(name, host, username, password, destination_path, port=None)[source]\u00b6
\n

Airbyte Destination for Sftp Json.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/sftp-json

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the SFTP server.

  • \n
  • port (Optional[int]) \u2013 Port of the SFTP server.

  • \n
  • username (str) \u2013 Username to use to access the SFTP server.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • destination_path (str) \u2013 Path to the directory where json files will be written.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.GcsDestination(name, gcs_bucket_name, gcs_bucket_path, credential, format, gcs_bucket_region=None)[source]\u00b6
\n
\n
\n__init__(name, gcs_bucket_name, gcs_bucket_path, credential, format, gcs_bucket_region=None)[source]\u00b6
\n

Airbyte Destination for Gcs.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/gcs

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • gcs_bucket_name (str) \u2013 You can find the bucket name in the App Engine Admin console Application Settings page, under the label Google Cloud Storage Bucket. Read more here.

  • \n
  • gcs_bucket_path (str) \u2013 GCS Bucket Path string Subdirectory under the above bucket to sync the data into.

  • \n
  • gcs_bucket_region (Optional[str]) \u2013 Select a Region of the GCS Bucket. Read more here.

  • \n
  • credential (GcsDestination.HMACKey) \u2013 An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more here.

  • \n
  • format (Union[GcsDestination.AvroApacheAvro, GcsDestination.CSVCommaSeparatedValues, GcsDestination.JSONLinesNewlineDelimitedJSON, GcsDestination.ParquetColumnarStorage]) \u2013 Output data format. One of the following formats must be selected - AVRO format, PARQUET format, CSV format, or JSONL format.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GcsDestination.HMACKey(credential_type, hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n
\n__init__(credential_type, hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.NoCompression(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Deflate(codec, compression_level=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Bzip2(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Xz(codec, compression_level=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Zstandard(codec, compression_level=None, include_checksum=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level=None, include_checksum=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Snappy(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.AvroApacheAvro(format_type, compression_codec)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.GZIP(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.CSVCommaSeparatedValues(format_type, compression, flattening=None)[source]\u00b6
\n
\n
\n__init__(format_type, compression, flattening=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.JSONLinesNewlineDelimitedJSON(format_type, compression)[source]\u00b6
\n
\n
\n__init__(format_type, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.ParquetColumnarStorage(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.CassandraDestination(name, keyspace, username, password, address, port, datacenter=None, replication=None)[source]\u00b6
\n
\n
\n__init__(name, keyspace, username, password, address, port, datacenter=None, replication=None)[source]\u00b6
\n

Airbyte Destination for Cassandra.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/cassandra

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • keyspace (str) \u2013 Default Cassandra keyspace to create data in.

  • \n
  • username (str) \u2013 Username to use to access Cassandra.

  • \n
  • password (str) \u2013 Password associated with Cassandra.

  • \n
  • address (str) \u2013 Address to connect to.

  • \n
  • port (int) \u2013 Port of Cassandra.

  • \n
  • datacenter (Optional[str]) \u2013 Datacenter of the cassandra cluster.

  • \n
  • replication (Optional[int]) \u2013 Indicates how many nodes the data should be replicated to.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.FireboltDestination(name, username, password, database, loading_method, account=None, host=None, engine=None)[source]\u00b6
\n
\n
\n__init__(name, username, password, database, loading_method, account=None, host=None, engine=None)[source]\u00b6
\n

Airbyte Destination for Firebolt.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/firebolt

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • username (str) \u2013 Firebolt email address you use to login.

  • \n
  • password (str) \u2013 Firebolt password.

  • \n
  • account (Optional[str]) \u2013 Firebolt account to login.

  • \n
  • host (Optional[str]) \u2013 The host name of your Firebolt database.

  • \n
  • database (str) \u2013 The database to connect to.

  • \n
  • engine (Optional[str]) \u2013 Engine name or url to connect to.

  • \n
  • loading_method (Union[FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3]) \u2013 Loading method used to select the way data will be uploaded to Firebolt

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass FireboltDestination.SQLInserts[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FireboltDestination.ExternalTableViaS3(s3_bucket, s3_region, aws_key_id, aws_key_secret)[source]\u00b6
\n
\n
\n__init__(s3_bucket, s3_region, aws_key_id, aws_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.GoogleSheetsDestination(name, spreadsheet_id, credentials)[source]\u00b6
\n
\n
\n__init__(name, spreadsheet_id, credentials)[source]\u00b6
\n

Airbyte Destination for Google Sheets.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/google-sheets

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • spreadsheet_id (str) \u2013 The link to your spreadsheet. See this guide for more details.

  • \n
  • credentials (GoogleSheetsDestination.AuthenticationViaGoogleOAuth) \u2013 Google API Credentials for connecting to Google Sheets and Google Drive APIs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleSheetsDestination.AuthenticationViaGoogleOAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.DatabricksDestination(name, accept_terms, databricks_server_hostname, databricks_http_path, databricks_personal_access_token, data_source, databricks_port=None, database_schema=None, purge_staging_data=None)[source]\u00b6
\n
\n
\n__init__(name, accept_terms, databricks_server_hostname, databricks_http_path, databricks_personal_access_token, data_source, databricks_port=None, database_schema=None, purge_staging_data=None)[source]\u00b6
\n

Airbyte Destination for Databricks.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/databricks

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • accept_terms (bool) \u2013 You must agree to the Databricks JDBC Driver Terms & Conditions to use this connector.

  • \n
  • databricks_server_hostname (str) \u2013 Databricks Cluster Server Hostname.

  • \n
  • databricks_http_path (str) \u2013 Databricks Cluster HTTP Path.

  • \n
  • databricks_port (Optional[str]) \u2013 Databricks Cluster Port.

  • \n
  • databricks_personal_access_token (str) \u2013 Databricks Personal Access Token for making authenticated requests.

  • \n
  • database_schema (Optional[str]) \u2013 The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is \u201cpublic\u201d.

  • \n
  • data_source (Union[DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage]) \u2013 Storage on which the delta lake is built.

  • \n
  • purge_staging_data (Optional[bool]) \u2013 Defaults to \u2018true\u2019. Switch it to \u2018false\u2019 for debugging purposes.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass DatabricksDestination.AmazonS3(data_source_type, s3_bucket_name, s3_bucket_path, s3_bucket_region, s3_access_key_id, s3_secret_access_key, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(data_source_type, s3_bucket_name, s3_bucket_path, s3_bucket_region, s3_access_key_id, s3_secret_access_key, file_name_pattern=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass DatabricksDestination.AzureBlobStorage(data_source_type, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n
\n__init__(data_source_type, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination(name, project_id, dataset_id, loading_method, credentials_json=None, dataset_location=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, dataset_id, loading_method, credentials_json=None, dataset_location=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n

Airbyte Destination for Bigquery Denormalized.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target BigQuery dataset. Read more here.

  • \n
  • dataset_id (str) \u2013 The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.

  • \n
  • loading_method (Union[BigqueryDenormalizedDestination.StandardInserts, BigqueryDenormalizedDestination.GCSStaging]) \u2013 Loading method used to select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.

  • \n
  • credentials_json (Optional[str]) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.

  • \n
  • dataset_location (Optional[str]) \u2013 The location of the dataset. Warning: Changes made after creation will not be applied. The default \u201cUS\u201d value is used if not set explicitly. Read more here.

  • \n
  • big_query_client_buffer_size_mb (Optional[int]) \u2013 Google BigQuery client\u2019s chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass BigqueryDenormalizedDestination.StandardInserts[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDenormalizedDestination.HMACKey(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n
\n__init__(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDenormalizedDestination.GCSStaging(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n
\n__init__(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.SqliteDestination(name, destination_path)[source]\u00b6
\n
\n
\n__init__(name, destination_path)[source]\u00b6
\n

Airbyte Destination for Sqlite.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/sqlite

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • destination_path (str) \u2013 Path to the sqlite.db file. The file will be placed inside that local mount. For more information check out our docs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MongodbDestination(name, instance_type, database, auth_type)[source]\u00b6
\n
\n
\n__init__(name, instance_type, database, auth_type)[source]\u00b6
\n

Airbyte Destination for Mongodb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mongodb

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass MongodbDestination.StandaloneMongoDbInstance(instance, host, port, tls=None)[source]\u00b6
\n
\n
\n__init__(instance, host, port, tls=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.ReplicaSet(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n
\n__init__(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.MongoDBAtlas(instance, cluster_url)[source]\u00b6
\n
\n
\n__init__(instance, cluster_url)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.None_[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.LoginPassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RocksetDestination(name, api_key, workspace, api_server=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, workspace, api_server=None)[source]\u00b6
\n

Airbyte Destination for Rockset.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/rockset

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Rockset api key

  • \n
  • workspace (str) \u2013 The Rockset workspace in which collections will be created and written to.

  • \n
  • api_server (Optional[str]) \u2013 Rockset api URL

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.OracleDestination(name, host, port, sid, username, encryption, password=None, jdbc_url_params=None, schema=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, sid, username, encryption, password=None, jdbc_url_params=None, schema=None)[source]\u00b6
\n

Airbyte Destination for Oracle.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/oracle

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The hostname of the database.

  • \n
  • port (int) \u2013 The port of the database.

  • \n
  • sid (str) \u2013 The System Identifier uniquely distinguishes the instance from any other instance on the same computer.

  • \n
  • username (str) \u2013 The username to access the database. This user must have CREATE USER privileges in the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • schema (Optional[str]) \u2013 The default schema is used as the target schema for all statements issued from the connection that do not explicitly specify a schema name. The usual value for this field is \u201cairbyte\u201d. In Oracle, schemas and users are the same thing, so the \u201cuser\u201d parameter is used as the login credentials and this is used for the default Airbyte message schema.

  • \n
  • encryption (Union[OracleDestination.Unencrypted, OracleDestination.NativeNetworkEncryptionNNE, OracleDestination.TLSEncryptedVerifyCertificate]) \u2013 The encryption method which is used when communicating with the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass OracleDestination.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleDestination.NativeNetworkEncryptionNNE(encryption_algorithm=None)[source]\u00b6
\n
\n
\n__init__(encryption_algorithm=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleDestination.TLSEncryptedVerifyCertificate(ssl_certificate)[source]\u00b6
\n
\n
\n__init__(ssl_certificate)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.CsvDestination(name, destination_path)[source]\u00b6
\n
\n
\n__init__(name, destination_path)[source]\u00b6
\n

Airbyte Destination for Csv.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-csv

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • destination_path (str) \u2013 Path to the directory where csv files will be written. The destination uses the local mount \u201c/local\u201d and any data files will be placed inside that local mount. For more information check out our docs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.S3Destination(name, s3_bucket_name, s3_bucket_path, s3_bucket_region, format, access_key_id=None, secret_access_key=None, s3_endpoint=None, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(name, s3_bucket_name, s3_bucket_path, s3_bucket_region, format, access_key_id=None, secret_access_key=None, s3_endpoint=None, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n

Airbyte Destination for S3.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/s3

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_key_id (Optional[str]) \u2013 The access key ID to access the S3 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.

  • \n
  • secret_access_key (Optional[str]) \u2013 The corresponding secret to the access key ID. Read more here

  • \n
  • s3_bucket_name (str) \u2013 The name of the S3 bucket. Read more here.

  • \n
  • s3_bucket_path (str) \u2013 Directory under the S3 bucket where data will be written. Read more here

  • \n
  • s3_bucket_region (str) \u2013 The region of the S3 bucket. See here for all region codes.

  • \n
  • format (Union[S3Destination.AvroApacheAvro, S3Destination.CSVCommaSeparatedValues, S3Destination.JSONLinesNewlineDelimitedJSON, S3Destination.ParquetColumnarStorage]) \u2013 Format of the data output. See here for more details

  • \n
  • s3_endpoint (Optional[str]) \u2013 Your S3 endpoint url. Read more here

  • \n
  • s3_path_format (Optional[str]) \u2013 Format string on how data will be organized inside the S3 bucket directory. Read more here

  • \n
  • file_name_pattern (Optional[str]) \u2013 The pattern allows you to set the file-name format for the S3 staging file(s)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass S3Destination.NoCompression(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Deflate(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Bzip2(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Xz(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Zstandard(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Snappy(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.AvroApacheAvro(format_type, compression_codec)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.GZIP(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.CSVCommaSeparatedValues(format_type, flattening, compression)[source]\u00b6
\n
\n
\n__init__(format_type, flattening, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.JSONLinesNewlineDelimitedJSON(format_type, compression)[source]\u00b6
\n
\n
\n__init__(format_type, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.ParquetColumnarStorage(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination(name, region, credentials, bucket_name, bucket_prefix, aws_account_id=None, lakeformation_database_name=None)[source]\u00b6
\n
\n
\n__init__(name, region, credentials, bucket_name, bucket_prefix, aws_account_id=None, lakeformation_database_name=None)[source]\u00b6
\n

Airbyte Destination for Aws Datalake.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/aws-datalake

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • aws_account_id (Optional[str]) \u2013 target aws account id

  • \n
  • region (str) \u2013 Region name

  • \n
  • credentials (Union[AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser]) \u2013 Choose How to Authenticate to AWS.

  • \n
  • bucket_name (str) \u2013 Name of the bucket

  • \n
  • bucket_prefix (str) \u2013 S3 prefix

  • \n
  • lakeformation_database_name (Optional[str]) \u2013 Which database to use

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass AwsDatalakeDestination.IAMRole(role_arn)[source]\u00b6
\n
\n
\n__init__(role_arn)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass AwsDatalakeDestination.IAMUser(aws_access_key_id, aws_secret_access_key)[source]\u00b6
\n
\n
\n__init__(aws_access_key_id, aws_secret_access_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MssqlDestination(name, host, port, database, schema, username, ssl_method, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, schema, username, ssl_method, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Mssql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The host name of the MSSQL database.

  • \n
  • port (int) \u2013 The port of the MSSQL database.

  • \n
  • database (str) \u2013 The name of the MSSQL database.

  • \n
  • schema (str) \u2013 The default schema tables are written to if the source does not specify a namespace. The usual value for this field is \u201cpublic\u201d.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • ssl_method (Union[MssqlDestination.Unencrypted, MssqlDestination.EncryptedTrustServerCertificate, MssqlDestination.EncryptedVerifyCertificate]) \u2013 The encryption method which is used to communicate with the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MssqlDestination.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlDestination.EncryptedTrustServerCertificate[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlDestination.EncryptedVerifyCertificate(hostNameInCertificate=None)[source]\u00b6
\n
\n
\n__init__(hostNameInCertificate=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.PubsubDestination(name, project_id, topic_id, credentials_json)[source]\u00b6
\n
\n
\n__init__(name, project_id, topic_id, credentials_json)[source]\u00b6
\n

Airbyte Destination for Pubsub.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/pubsub

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target PubSub.

  • \n
  • topic_id (str) \u2013 The PubSub topic ID in the given GCP project ID.

  • \n
  • credentials_json (str) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.R2Destination(name, account_id, access_key_id, secret_access_key, s3_bucket_name, s3_bucket_path, format, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(name, account_id, access_key_id, secret_access_key, s3_bucket_name, s3_bucket_path, format, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n

Airbyte Destination for R2.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/r2

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • account_id (str) \u2013 Cloudflare account ID

  • \n
  • access_key_id (str) \u2013 The access key ID to access the R2 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.

  • \n
  • secret_access_key (str) \u2013 The corresponding secret to the access key ID. Read more here

  • \n
  • s3_bucket_name (str) \u2013 The name of the R2 bucket. Read more here.

  • \n
  • s3_bucket_path (str) \u2013 Directory under the R2 bucket where data will be written.

  • \n
  • format (Union[R2Destination.AvroApacheAvro, R2Destination.CSVCommaSeparatedValues, R2Destination.JSONLinesNewlineDelimitedJSON]) \u2013 Format of the data output. See here for more details

  • \n
  • s3_path_format (Optional[str]) \u2013 Format string on how data will be organized inside the R2 bucket directory. Read more here

  • \n
  • file_name_pattern (Optional[str]) \u2013 The pattern allows you to set the file-name format for the R2 staging file(s)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass R2Destination.NoCompression(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Deflate(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Bzip2(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Xz(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Zstandard(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Snappy(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.AvroApacheAvro(format_type, compression_codec)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.GZIP(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.CSVCommaSeparatedValues(format_type, flattening, compression)[source]\u00b6
\n
\n
\n__init__(format_type, flattening, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.JSONLinesNewlineDelimitedJSON(format_type, compression)[source]\u00b6
\n
\n
\n__init__(format_type, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.JdbcDestination(name, username, jdbc_url, password=None, schema=None)[source]\u00b6
\n
\n
\n__init__(name, username, jdbc_url, password=None, schema=None)[source]\u00b6
\n

Airbyte Destination for Jdbc.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url (str) \u2013 JDBC formatted url. See the standard here.

  • \n
  • schema (Optional[str]) \u2013 If you leave the schema unspecified, JDBC defaults to a schema named \u201cpublic\u201d.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KeenDestination(name, project_id, api_key, infer_timestamp=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, api_key, infer_timestamp=None)[source]\u00b6
\n

Airbyte Destination for Keen.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/keen

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 To get the Keen Project ID, navigate to the Access tab from the left-hand side panel and check the Project Details section.

  • \n
  • api_key (str) \u2013 To get the Keen Master API Key, navigate to the Access tab from the left-hand side panel and check the Project Details section.

  • \n
  • infer_timestamp (Optional[bool]) \u2013 Allow connector to guess keen.timestamp value based on the streamed data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.TidbDestination(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Tidb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/tidb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.FirestoreDestination(name, project_id, credentials_json=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, credentials_json=None)[source]\u00b6
\n

Airbyte Destination for Firestore.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/firestore

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target Firestore database.

  • \n
  • credentials_json (Optional[str]) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ScyllaDestination(name, keyspace, username, password, address, port, replication=None)[source]\u00b6
\n
\n
\n__init__(name, keyspace, username, password, address, port, replication=None)[source]\u00b6
\n

Airbyte Destination for Scylla.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/scylla

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • keyspace (str) \u2013 Default Scylla keyspace to create data in.

  • \n
  • username (str) \u2013 Username to use to access Scylla.

  • \n
  • password (str) \u2013 Password associated with Scylla.

  • \n
  • address (str) \u2013 Address to connect to.

  • \n
  • port (int) \u2013 Port of Scylla.

  • \n
  • replication (Optional[int]) \u2013 Indicates how many nodes the data should be replicated to.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RedisDestination(name, host, port, username, password, cache_type)[source]\u00b6
\n
\n
\n__init__(name, host, port, username, password, cache_type)[source]\u00b6
\n

Airbyte Destination for Redis.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/redis

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Redis host to connect to.

  • \n
  • port (int) \u2013 Port of Redis.

  • \n
  • username (str) \u2013 Username associated with Redis.

  • \n
  • password (str) \u2013 Password associated with Redis.

  • \n
  • cache_type (str) \u2013 Redis cache type to store data in.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MqttDestination(name, broker_host, broker_port, use_tls, topic_pattern, publisher_sync, connect_timeout, automatic_reconnect, clean_session, message_retained, message_qos, username=None, password=None, topic_test=None, client=None)[source]\u00b6
\n
\n
\n__init__(name, broker_host, broker_port, use_tls, topic_pattern, publisher_sync, connect_timeout, automatic_reconnect, clean_session, message_retained, message_qos, username=None, password=None, topic_test=None, client=None)[source]\u00b6
\n

Airbyte Destination for Mqtt.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mqtt

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • broker_host (str) \u2013 Host of the broker to connect to.

  • \n
  • broker_port (int) \u2013 Port of the broker.

  • \n
  • use_tls (bool) \u2013 Whether to use TLS encryption on the connection.

  • \n
  • username (Optional[str]) \u2013 User name to use for the connection.

  • \n
  • password (Optional[str]) \u2013 Password to use for the connection.

  • \n
  • topic_pattern (str) \u2013 Topic pattern in which the records will be sent. You can use patterns like \u2018{namespace}\u2019 and/or \u2018{stream}\u2019 to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.

  • \n
  • topic_test (Optional[str]) \u2013 Topic to test if Airbyte can produce messages.

  • \n
  • client (Optional[str]) \u2013 A client identifier that is unique on the server being connected to.

  • \n
  • publisher_sync (bool) \u2013 Wait synchronously until the record has been sent to the broker.

  • \n
  • connect_timeout (int) \u2013 Maximum time interval (in seconds) the client will wait for the network connection to the MQTT server to be established.

  • \n
  • automatic_reconnect (bool) \u2013 Whether the client will automatically attempt to reconnect to the server if the connection is lost.

  • \n
  • clean_session (bool) \u2013 Whether the client and server should remember state across restarts and reconnects.

  • \n
  • message_retained (bool) \u2013 Whether or not the publish message should be retained by the messaging engine.

  • \n
  • message_qos (str) \u2013 Quality of service used for each message to be delivered.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RedshiftDestination(name, host, port, username, password, database, schema, uploading_method, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, username, password, database, schema, uploading_method, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Redshift.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com)

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schema (str) \u2013 The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is \u201cpublic\u201d.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • uploading_method (Union[RedshiftDestination.Standard, RedshiftDestination.S3Staging]) \u2013 The method by which the data will be uploaded to the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.NoEncryption[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.AESCBCEnvelopeEncryption(key_encrypting_key=None)[source]\u00b6
\n
\n
\n__init__(key_encrypting_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.S3Staging(s3_bucket_name, s3_bucket_region, access_key_id, secret_access_key, encryption, s3_bucket_path=None, file_name_pattern=None, purge_staging_data=None)[source]\u00b6
\n
\n
\n__init__(s3_bucket_name, s3_bucket_region, access_key_id, secret_access_key, encryption, s3_bucket_path=None, file_name_pattern=None, purge_staging_data=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.PulsarDestination(name, brokers, use_tls, topic_type, topic_tenant, topic_namespace, topic_pattern, compression_type, send_timeout_ms, max_pending_messages, max_pending_messages_across_partitions, batching_enabled, batching_max_messages, batching_max_publish_delay, block_if_queue_full, topic_test=None, producer_name=None, producer_sync=None)[source]\u00b6
\n
\n
\n__init__(name, brokers, use_tls, topic_type, topic_tenant, topic_namespace, topic_pattern, compression_type, send_timeout_ms, max_pending_messages, max_pending_messages_across_partitions, batching_enabled, batching_max_messages, batching_max_publish_delay, block_if_queue_full, topic_test=None, producer_name=None, producer_sync=None)[source]\u00b6
\n

Airbyte Destination for Pulsar.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/pulsar

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • brokers (str) \u2013 A list of host/port pairs to use for establishing the initial connection to the Pulsar cluster.

  • \n
  • use_tls (bool) \u2013 Whether to use TLS encryption on the connection.

  • \n
  • topic_type (str) \u2013 Identifies the type of topic. Pulsar supports two kinds of topics: persistent and non-persistent. In a persistent topic, all messages are durably persisted on disk (that means on multiple disks unless the broker is standalone), whereas a non-persistent topic does not persist messages to storage disk.

  • \n
  • topic_tenant (str) \u2013 The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters.

  • \n
  • topic_namespace (str) \u2013 The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the namespace level. Each tenant has one or multiple namespaces.

  • \n
  • topic_pattern (str) \u2013 Topic pattern in which the records will be sent. You can use patterns like \u2018{namespace}\u2019 and/or \u2018{stream}\u2019 to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.

  • \n
  • topic_test (Optional[str]) \u2013 Topic to test if Airbyte can produce messages.

  • \n
  • producer_name (Optional[str]) \u2013 Name for the producer. If not filled, the system will generate a globally unique name which can be accessed with.

  • \n
  • producer_sync (Optional[bool]) \u2013 Wait synchronously until the record has been sent to Pulsar.

  • \n
  • compression_type (str) \u2013 Compression type for the producer.

  • \n
  • send_timeout_ms (int) \u2013 If a message is not acknowledged by a server before the send-timeout expires, an error occurs (in ms).

  • \n
  • max_pending_messages (int) \u2013 The maximum size of a queue holding pending messages.

  • \n
  • max_pending_messages_across_partitions (int) \u2013 The maximum number of pending messages across partitions.

  • \n
  • batching_enabled (bool) \u2013 Control whether automatic batching of messages is enabled for the producer.

  • \n
  • batching_max_messages (int) \u2013 Maximum number of messages permitted in a batch.

  • \n
  • batching_max_publish_delay (int) \u2013 Time period in milliseconds within which the messages sent will be batched.

  • \n
  • block_if_queue_full (bool) \u2013 If the send operation should block when the outgoing message queue is full.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.SnowflakeDestination(name, host, role, warehouse, database, schema, username, credentials, loading_method, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, role, warehouse, database, schema, username, credentials, loading_method, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Snowflake.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/snowflake

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Enter your Snowflake account\u2019s locator (in the format \u2026snowflakecomputing.com)

  • \n
  • role (str) \u2013 Enter the role that you want to use to access Snowflake

  • \n
  • warehouse (str) \u2013 Enter the name of the warehouse that you want to sync data into

  • \n
  • database (str) \u2013 Enter the name of the database you want to sync data into

  • \n
  • schema (str) \u2013 Enter the name of the default schema

  • \n
  • username (str) \u2013 Enter the name of the user you want to use to access the database

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Enter the additional properties to pass to the JDBC URL string when connecting to the database (formatted as key=value pairs separated by the symbol &). Example: key1=value1&key2=value2&key3=value3

  • \n
  • loading_method (Union[SnowflakeDestination.SelectAnotherOption, SnowflakeDestination.RecommendedInternalStaging, SnowflakeDestination.AWSS3Staging, SnowflakeDestination.GoogleCloudStorageStaging, SnowflakeDestination.AzureBlobStorageStaging]) \u2013 Select a data staging method

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.OAuth20(access_token, refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(access_token, refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.KeyPairAuthentication(private_key, auth_type=None, private_key_password=None)[source]\u00b6
\n
\n
\n__init__(private_key, auth_type=None, private_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.UsernameAndPassword(password)[source]\u00b6
\n
\n
\n__init__(password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.SelectAnotherOption(method)[source]\u00b6
\n
\n
\n__init__(method)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.RecommendedInternalStaging(method)[source]\u00b6
\n
\n
\n__init__(method)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.NoEncryption[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.AESCBCEnvelopeEncryption(key_encrypting_key=None)[source]\u00b6
\n
\n
\n__init__(key_encrypting_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.AWSS3Staging(method, s3_bucket_name, access_key_id, secret_access_key, encryption, s3_bucket_region=None, purge_staging_data=None, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(method, s3_bucket_name, access_key_id, secret_access_key, encryption, s3_bucket_region=None, purge_staging_data=None, file_name_pattern=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.GoogleCloudStorageStaging(method, project_id, bucket_name, credentials_json)[source]\u00b6
\n
\n
\n__init__(method, project_id, bucket_name, credentials_json)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.AzureBlobStorageStaging(method, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n
\n__init__(method, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.PostgresDestination(name, host, port, database, schema, username, ssl_mode, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, schema, username, ssl_mode, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Postgres.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schema (str) \u2013 The default schema tables are written to if the source does not specify a namespace. The usual value for this field is \u201cpublic\u201d.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL. When activating SSL, please select one of the connection modes.

  • \n
  • ssl_mode (Union[PostgresDestination.Disable, PostgresDestination.Allow, PostgresDestination.Prefer, PostgresDestination.Require, PostgresDestination.VerifyCa, PostgresDestination.VerifyFull]) \u2013 SSL connection modes. disable - Choose this mode to disable encryption of communication between Airbyte and destination database allow - Choose this mode to enable encryption only when required by the source database prefer - Choose this mode to allow unencrypted connection only if the source database does not support encryption require - Choose this mode to always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Choose this mode to always require encryption and to verify that the source database server has a valid SSL certificate verify-full - This is the most secure mode. Choose this mode to always require encryption and to verify the identity of the source database server See more information - in the docs.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Disable[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Allow[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Prefer[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Require[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.VerifyCa(ca_certificate, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.VerifyFull(ca_certificate, client_certificate, client_key, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate, client_key, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ScaffoldDestinationPythonDestination(name, TODO=None)[source]\u00b6
\n
\n
\n__init__(name, TODO=None)[source]\u00b6
\n

Airbyte Destination for Scaffold Destination Python.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/scaffold-destination-python

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • TODO (Optional[str]) \u2013 FIX ME

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.LocalJsonDestination(name, destination_path)[source]\u00b6
\n
\n
\n__init__(name, destination_path)[source]\u00b6
\n

Airbyte Destination for Local Json.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-json

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • destination_path (str) \u2013 Path to the directory where json files will be written. The files will be placed inside that local mount. For more information check out our docs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MeilisearchDestination(name, host, api_key=None)[source]\u00b6
\n
\n
\n__init__(name, host, api_key=None)[source]\u00b6
\n

Airbyte Destination for Meilisearch.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/meilisearch

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the MeiliSearch instance.

  • \n
  • api_key (Optional[str]) \u2013 MeiliSearch API Key. See the docs for more information on how to obtain this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_airbyte.airbyte_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Airbyte API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
request_timeout (dagster.IntSource, optional):
\n

Time (in seconds) after which the requests to Airbyte are declared timed out.

\n

Default Value: 15

\n
\n
cancel_sync_on_run_termination (dagster.BoolSource, optional):
\n

Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may be useful to disable if using Airbyte sources that cannot be cancelled and resumed easily, or if your Dagster deployment may experience runner interruptions that do not impact your Airbyte deployment.

\n

Default Value: True

\n
\n
poll_interval (Float, optional):
\n

Time (in seconds) to wait between checking a sync\u2019s status.

\n

Default Value: 10

\n
\n
host (dagster.StringSource):
\n

The Airbyte server address.

\n
\n
port (dagster.StringSource):
\n

Port used for the Airbyte server.

\n
\n
username (Union[dagster.StringSource, None], optional):
\n

Username if using basic auth.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password if using basic auth.

\n
\n
use_https (dagster.BoolSource, optional):
\n

Whether to use HTTPS to connect to the Airbyte server.

\n

Default Value: False

\n
\n
forward_logs (dagster.BoolSource, optional):
\n

Whether to forward Airbyte logs to the compute log, can be expensive for long-running syncs.

\n

Default Value: True

\n
\n
request_additional_params (dict, optional):
\n

Any additional kwargs to pass to the requests library when making requests to Airbyte.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource allows users to programmatically interface with the Airbyte REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Airbyte REST API, including expected response JSON\nschema, see the Airbyte API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n        # If using basic auth\n        "username": {"env": "AIRBYTE_USERNAME"},\n        "password": {"env": "AIRBYTE_PASSWORD"},\n    }\n)\n\n@job(resource_defs={"airbyte":my_airbyte_resource})\ndef my_airbyte_job():\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airbyte", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../memoization/", "title": "Job-Level Versioning and Memoization (Deprecated)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "N", "next"], ["sections/api/apidocs/memoization", "Job-Level Versioning and Memoization (Deprecated)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airbyte.rst.txt", "title": "Airbyte (dagster-airbyte)", "toc": "\n"}, "dagster-airflow": {"alabaster_version": "0.7.13", "body": "
\n

Airflow (dagster-airflow)\u00b6

\n

This library provides a Dagster integration with Airflow.

\n

For more information on getting started, see the Airflow integration guide.

\n
\n

Run Airflow on Dagster\u00b6

\n
\n
\ndagster_airflow.make_dagster_definitions_from_airflow_dags_path(dag_path, safe_mode=True, connections=None, resource_defs={})[source]\u00b6
\n

Construct a Dagster repository corresponding to Airflow DAGs in dag_path.

\n
\n
Usage:

Create make_dagster_definitions.py:

\n
from dagster_airflow import make_dagster_definitions_from_airflow_dags_path\n\ndef make_definitions_from_dir():\n    return make_dagster_definitions_from_airflow_dags_path(\n        '/path/to/dags/',\n    )\n
\n
\n

Use Definitions as usual, for example:\ndagster-webserver -f path/to/make_dagster_definitions.py

\n
\n
\n
\n
Parameters:
\n
    \n
  • dag_path (str) \u2013 Path to directory or file that contains Airflow Dags

  • \n
  • include_examples (bool) \u2013 True to include Airflow\u2019s example DAGs. (default: False)

  • \n
  • safe_mode (bool) \u2013 True to use Airflow\u2019s default heuristic to find files that contain DAGs\n(ie find files that contain both b\u2019DAG\u2019 and b\u2019airflow\u2019) (default: True)

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

Definitions

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_definitions_from_airflow_dag_bag(dag_bag, connections=None, resource_defs={})[source]\u00b6
\n

Construct a Dagster definition corresponding to Airflow DAGs in DagBag.

\n
\n
Usage:
\n
Create make_dagster_definition.py:

from dagster_airflow import make_dagster_definitions_from_airflow_dag_bag\nfrom airflow_home import my_dag_bag

\n
\n
def make_definition_from_dag_bag():

return make_dagster_definitions_from_airflow_dag_bag(my_dag_bag)

\n
\n
\n
\n
Use Definitions as usual, for example:

dagster-webserver -f path/to/make_dagster_definition.py

\n
\n
\n
\n
\n
\n
Parameters:
\n
    \n
  • dag_bag (DagBag) \u2013 Airflow DagBag Model

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

Definitions

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_schedules_and_jobs_from_airflow_dag_bag(dag_bag, connections=None, resource_defs={})[source]\u00b6
\n

Construct Dagster Schedules and Jobs corresponding to Airflow DagBag.

\n
\n
Parameters:
\n
    \n
  • dag_bag (DagBag) \u2013 Airflow DagBag Model

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

The generated Dagster Schedules\n- List[JobDefinition]: The generated Dagster Jobs

\n
\n
Return type:
\n

    \n
  • List[ScheduleDefinition]

  • \n
\n

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_job_from_airflow_dag(dag, tags=None, connections=None, resource_defs={})[source]\u00b6
\n

Construct a Dagster job corresponding to a given Airflow DAG.

\n

Tasks in the resulting job will execute the execute() method on the corresponding\nAirflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\ncontaining your DAG definition must be available in the Python environment within which your\nDagster solids execute.

\n

To set Airflow\u2019s execution_date for use with Airflow Operator\u2019s execute() methods,\neither:

\n
    \n
  1. \n
    (Best for ad hoc runs) Execute job directly. This will set execution_date to the

    time (in UTC) of the run.

    \n
    \n
    \n
  2. \n
  3. \n
    Add {'airflow_execution_date': utc_date_string} to the job tags. This will override

    behavior from (1).

    \n
    my_dagster_job = make_dagster_job_from_airflow_dag(\n        dag=dag,\n        tags={'airflow_execution_date': utc_execution_date_str}\n)\nmy_dagster_job.execute_in_process()\n
    \n
    \n
    \n
    \n
  4. \n
  5. \n
    (Recommended) Add {'airflow_execution_date': utc_date_string} to the run tags,

    such as in the Dagster UI. This will override behavior from (1) and (2)

    \n
    \n
    \n
  6. \n
\n

We apply normalized_name() to the dag id and task ids when generating job name and op\nnames to ensure that names conform to Dagster\u2019s naming conventions.

\n
\n
Parameters:
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster job

  • \n
  • tags (Dict[str, Field]) \u2013 Job tags. Optionally include\ntags={\u2018airflow_execution_date\u2019: utc_date_string} to specify execution_date used within\nexecution of Airflow Operators.

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Ephemeral\nAirflow DB. If use_ephemeral_airflow_db is False, this will be ignored.

  • \n
\n
\n
Returns:
\n

The generated Dagster job

\n
\n
Return type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.load_assets_from_airflow_dag(dag, task_ids_by_asset_key={}, upstream_dependencies_by_asset_key={}, connections=None)[source]\u00b6
\n

[Experimental] Construct Dagster Assets for a given Airflow DAG.

\n
\n
Parameters:
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster job

  • \n
  • task_ids_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[str]]]) \u2013 A mapping from asset\nkeys to task ids. Used to break up the Airflow Dag into multiple SDAs

  • \n
  • upstream_dependencies_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[AssetKey]]]) \u2013 A\nmapping from upstream asset keys to assets provided in task_ids_by_asset_key. Used to\ndeclare new upstream SDA dependencies.

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

List[AssetsDefinition]

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_ephemeral_airflow_db_resource(connections=[], dag_run_config=None)[source]\u00b6
\n

Creates a Dagster resource that provides an ephemeral Airflow database.

\n
\n
Parameters:
\n
    \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
  • dag_run_config (Optional[dict]) \u2013 dag_run configuration to be used when creating a DagRun

  • \n
\n
\n
Returns:
\n

The ephemeral Airflow DB resource

\n
\n
Return type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_persistent_airflow_db_resource(uri='', connections=[], dag_run_config={})[source]\u00b6
\n

Creates a Dagster resource that provides a persistent Airflow database.

\n
\n
Usage:
from dagster_airflow import (\n    make_dagster_definitions_from_airflow_dags_path,\n    make_persistent_airflow_db_resource,\n)\npostgres_airflow_db = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"\nairflow_db = make_persistent_airflow_db_resource(uri=postgres_airflow_db)\ndefinitions = make_dagster_definitions_from_airflow_dags_path(\n    '/path/to/dags/',\n    resource_defs={"airflow_db": airflow_db}\n)\n
\n
\n
\n
\n
\n
Parameters:
\n
    \n
  • uri \u2013 SQLAlchemy URI of the Airflow DB to be used

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
  • dag_run_config (Optional[dict]) \u2013 dag_run configuration to be used when creating a DagRun

  • \n
\n
\n
Returns:
\n

The persistent Airflow DB resource

\n
\n
Return type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\n

Orchestrate Dagster from Airflow\u00b6

\n
\n
\nclass dagster_airflow.DagsterCloudOperator(*args, **kwargs)[source]\u00b6
\n

DagsterCloudOperator.

\n

Uses the dagster cloud graphql api to run and monitor dagster jobs on dagster cloud

\n
\n
Parameters:
\n
    \n
  • repository_name (str) \u2013 the name of the repository to use

  • \n
  • repostitory_location_name (str) \u2013 the name of the repository location to use

  • \n
  • job_name (str) \u2013 the name of the job to run

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 the run config to use for the job run

  • \n
  • dagster_conn_id (Optional[str]) \u2013 the id of the dagster connection, airflow 2.0+ only

  • \n
  • organization_id (Optional[str]) \u2013 the id of the dagster cloud organization

  • \n
  • deployment_name (Optional[str]) \u2013 the name of the dagster cloud deployment

  • \n
  • user_token (Optional[str]) \u2013 the dagster cloud user token to use

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_airflow.DagsterOperator(*args, **kwargs)[source]\u00b6
\n

DagsterOperator.

\n

Uses the dagster graphql api to run and monitor dagster jobs on remote dagster infrastructure

\n
\n
Parameters:
\n
    \n
  • repository_name (str) \u2013 the name of the repository to use

  • \n
  • repostitory_location_name (str) \u2013 the name of the repository location to use

  • \n
  • job_name (str) \u2013 the name of the job to run

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 the run config to use for the job run

  • \n
  • dagster_conn_id (Optional[str]) \u2013 the id of the dagster connection, airflow 2.0+ only

  • \n
  • organization_id (Optional[str]) \u2013 the id of the dagster cloud organization

  • \n
  • deployment_name (Optional[str]) \u2013 the name of the dagster cloud deployment

  • \n
  • user_token (Optional[str]) \u2013 the dagster cloud user token to use

  • \n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airflow", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airflow.rst.txt", "title": "Airflow (dagster-airflow)", "toc": "\n"}, "dagster-aws": {"alabaster_version": "0.7.13", "body": "
\n

AWS (dagster-aws)\u00b6

\n

Utilities for interfacing with AWS with Dagster.

\n
\n

S3\u00b6

\n
\n
\ndagster_aws.s3.S3Resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
\n

Resource that gives access to S3.

\n

The underlying S3 session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is an S3 client, an instance of botocore.client.S3.

\n

Example

\n
from dagster import job, op, Definitions\nfrom dagster_aws.s3 import S3Resource\n\n@op\ndef example_s3_op(s3: S3Resource):\n    return s3.get_client().list_objects_v2(\n        Bucket='my-bucket',\n        Prefix='some-key'\n    )\n\n@job\ndef example_job():\n    example_s3_op()\n\ndefs = Definitions(\n    jobs=[example_job],\n    resources={'s3': S3Resource(region_name='us-west-1')}\n)\n
\n
\n
\n\n
\n
\ndagster_aws.s3.S3PickleIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_resource (Union[Any, None], optional):
\n

\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
from dagster import asset, Definitions\nfrom dagster_aws.s3 import S3PickleIOManager, S3Resource\n\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": S3PickleIOManager(\n            s3_resource=S3Resource(),\n            s3_bucket="my-cool-bucket",\n            s3_prefix="my-cool-prefix",\n        )\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster_aws.s3.S3ComputeLogManager(bucket, local_dir=None, inst_data=None, prefix='dagster', use_ssl=True, verify=True, verify_cert_path=None, endpoint_url=None, skip_empty_files=False, upload_interval=None, upload_extra_args=None, show_url_only=False, region=None)[source]\u00b6
\n

Logs compute function stdout and stderr to S3.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_aws.s3.compute_log_manager\n  class: S3ComputeLogManager\n  config:\n    bucket: "mycorp-dagster-compute-logs"\n    local_dir: "/tmp/cool"\n    prefix: "dagster-test-"\n    use_ssl: true\n    verify: true\n    verify_cert_path: "/path/to/cert/bundle.pem"\n    endpoint_url: "http://alternate-s3-host.io"\n    skip_empty_files: true\n    upload_interval: 30\n    upload_extra_args:\n      ServerSideEncryption: "AES256"\n    show_url_only: false\n    region: "us-west-1"\n
\n
\n
\n
Parameters:
\n
    \n
  • bucket (str) \u2013 The name of the s3 bucket to which to log.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster._seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • use_ssl (Optional[bool]) \u2013 Whether or not to use SSL. Default True.

  • \n
  • verify (Optional[bool]) \u2013 Whether or not to verify SSL certificates. Default True.

  • \n
  • verify_cert_path (Optional[str]) \u2013 A filename of the CA cert bundle to use. Only used if\nverify set to False.

  • \n
  • endpoint_url (Optional[str]) \u2013 Override for the S3 endpoint url.

  • \n
  • skip_empty_files \u2013 (Optional[bool]): Skip upload of empty log files.

  • \n
  • upload_interval \u2013 (Optional[int]): Interval in seconds to upload partial log files to S3. By default, will only upload when the capture is complete.

  • \n
  • upload_extra_args \u2013 (Optional[dict]): Extra args for S3 file upload

  • \n
  • show_url_only \u2013 (Optional[bool]): Only show the URL of the log file in the UI, instead of fetching and displaying the full content. Default False.

  • \n
  • region \u2013 (Optional[str]): The region of the S3 bucket. If not specified, will use the default region of the AWS session.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_aws.s3.S3Coordinate DagsterType\u00b6
\n

A dagster.DagsterType intended to make it easier to pass information about files on S3\nfrom op to op. Objects of this type should be dicts with 'bucket' and 'key' keys,\nand may be hydrated from config in the intuitive way, e.g., for an input with the name\ns3_file:

\n
inputs:\n  s3_file:\n    value:\n      bucket: my-bucket\n      key: my-key\n
\n
\n
\n\n
\n

File Manager (Experimental)\u00b6

\n
\n
\nclass dagster_aws.s3.S3FileHandle(s3_bucket, s3_key)[source]\u00b6
\n

A reference to a file on S3.

\n
\n\n
\n
\ndagster_aws.s3.S3FileManagerResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n
\n
\n

ECS\u00b6

\n
\n
\ndagster_aws.ecs.EcsRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
task_definition (Union[String, strict dict], optional):
\n

Either the short name of an existing task definition to use when launching new tasks, or a dictionary configuration to use when creating a task definition for the run. If neither is provided, the task definition will be created based on the current task\u2019s task definition.

\n
\n
container_name (dagster.StringSource, optional):
\n

The container name to use when launching new tasks. Defaults to \u2018run\u2019.

\n

Default Value: \u2018run\u2019

\n
\n
secrets (List[Union[String, strict dict]], optional):
\n

An array of AWS Secrets Manager secrets. These secrets will be mounted as environment variables in the container. See https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html.

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional):
\n

AWS Secrets Manager secrets with this tag will be mounted as environment variables in the container. Defaults to \u2018dagster\u2019.

\n

Default Value: \u2018dagster\u2019

\n
\n
include_sidecars (Bool, optional):
\n

Whether each run should use the same sidecars as the task that launches it. Defaults to False.

\n

Default Value: False

\n
\n
use_current_ecs_task_config (Bool, optional):
\n

Whether to use the run launcher\u2019s current ECS task in order to determine the cluster and networking configuration for the launched task. Defaults to True. Should only be called if the run launcher is running within an ECS task.

\n

Default Value: True

\n
\n
run_task_kwargs (permissive dict, optional):
\n

Additional arguments to include while running the task. See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs.html#ECS.Client.run_task for the available parameters. The overrides and taskDefinition arguments will always be set by the run launcher.

\n
\nConfig Schema:
\n
cluster (dagster.StringSource, optional):
\n

Name of the ECS cluster to launch ECS tasks in.

\n
\n
\n
\n
env_vars (List[dagster.StringSource], optional):
\n

List of environment variable names to include in the ECS task. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process)

\n
\n
run_resources (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
cpu (String, optional):
\n

The CPU override to use for the launched task.

\n
\n
memory (String, optional):
\n

The memory override to use for the launched task.

\n
\n
ephemeral_storage (Int, optional):
\n

The ephemeral storage, in GiB, to use for the launched task.

\n
\n
\n
\n
run_ecs_tags (List[strict dict], optional):
\n

Additional tags to apply to the launched ECS task.

\n
\n
\n

RunLauncher that starts a task in ECS for each Dagster job run.

\n
\n\n
\n
\n

Redshift\u00b6

\n
\n
\ndagster_aws.redshift.RedshiftClientResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import Definitions, asset, EnvVar\nfrom dagster_aws.redshift import RedshiftClientResource\n\n@asset\ndef example_redshift_asset(context, redshift: RedshiftClientResource):\n    redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = RedshiftClientResource(\n    host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    port=5439,\n    user='dagster',\n    password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n    database='dev',\n)\n\ndefs = Definitions(\n    assets=[example_redshift_asset],\n    resources={'redshift': redshift_configured},\n)\n
\n
\n
\n\n
\n

Testing\u00b6

\n
\n
\ndagster_aws.redshift.FakeRedshiftClientResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import Definitions, asset, EnvVar\nfrom dagster_aws.redshift import RedshiftClientResource\n\n@asset\ndef example_redshift_asset(context, redshift: RedshiftClientResource):\n    redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = RedshiftClientResource(\n    host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    port=5439,\n    user='dagster',\n    password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n    database='dev',\n)\n\ndefs = Definitions(\n    assets=[example_redshift_asset],\n    resources={'redshift': redshift_configured},\n)\n
\n
\n
\n\n
\n
\n
\n

EMR\u00b6

\n
\n
\ndagster_aws.emr.emr_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_config (permissive dict, optional):
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            
"referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional):
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": 
{\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional):
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional):
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional):
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional):
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the --driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional):
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional):
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional):
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
executor (permissive dict, optional):
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching executor JVM\u2019s.

\n
\n
logs (permissive dict, optional):
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional):
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional):
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional):
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional):
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional):
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional):
\n

Execution Behavior: The number of cores to use on each executor. In standalone and Mesos coarse-grained modes, for more detail, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional):
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
extraListeners (dagster.StringSource, optional):
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional):
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional):
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional):
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URL\u2019s.

\n
\n
submit (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional):
\n

Application Properties: The deploy mode of Spark driver program, either \u201cclient\u201d or \u201ccluster\u201d, which means to launch driver program locally (\u201cclient\u201d) or remotely (\u201ccluster\u201d) on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional):
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional):
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional):
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Runtime Environment: Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), or it will be displayed before the driver exits. It also can be dumped into disk by sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional):
\n

Runtime Environment: The directory which is used to dump the profile result before driver exiting. The results will be dumped as a separate file for each RDD. They can be loaded by pstats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional):
\n

Runtime Environment: Reuse Python worker or not. If yes, it will use a fixed number of Python workers and does not need to fork() a Python process for every task. It will be very useful if there is a large broadcast, because the broadcast will not need to be transferred from JVM to Python worker for every task.

\n
\n
\n
\n
\n
\n
files (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional):
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional):
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional):
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional):
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional):
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes that could be scanned at the same time. This is used when putting multiple files into a partition. It is better to overestimate, then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option --repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional):
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional):
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option --repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with --packages or spark.jars.packages.

\n
\n
\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional):
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increase, it might lead to very large number of inbound connections to one or more nodes, causing the workers to fail under load. By allowing it to limit the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional):
\n

Shuffle Behavior: The remote block will be fetched to disk when size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (e.g. 200m) to avoid using too much memory on smaller blocks as well. Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be used when external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional):
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional):
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional):
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional):
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional):
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional):
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional):
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional):
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional):
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait); if those limits are reached, the task will fail with a fetch failure.

\n
\n
sort (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetching shuffle blocks.

\n
\n
registration (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional):
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional):
\n

Shuffle Behavior: When we fail to register to the external shuffle service, we will retry up to maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills occur often, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
eventLog (permissive dict, optional):
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional):
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional):
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional):
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional):
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional):
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional):
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional):
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional):
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional):
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional):
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional):
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional):
\n

Spark UI: Enable running Spark Master as reverse proxy for worker and application UIs. In this mode, Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution: the worker and application UIs will not be accessible directly, and you will only be able to access them through the Spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional):
\n

Spark UI: This is the URL where your proxy is running. This URL is for a proxy which is running in front of the Spark Master. This is useful when running a proxy for authentication, e.g. an OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional):
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional):
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional):
\n

Spark UI: Comma separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value> For example: spark.ui.filters=com.test.filter1 spark.com.test.filter1.param.name1=foo spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional):
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional):
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional):
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional):
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional):
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional):
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance tuning section in the Spark Streaming programming guide for more details.

\n
\n
receiver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional):
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional):
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional):
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional):
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
broadcast (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional):
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional):
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional):
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional):
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional):
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional):
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
kryo (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option can enforce strictly that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to use unsafe based Kryo serializer. Can be substantially faster by using Unsafe Based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional):
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional):
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional):
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data, however that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer, and allow old objects to be collected. To turn off this periodic reset set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
memory (permissive dict, optional):
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional):
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional):
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional):
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional):
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction spark.storage.memoryFraction spark.storage.unrollFraction

\n
\n
\n
\n
storage (permissive dict, optional):
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional):
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional):
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional):
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional):
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional):
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional):
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional):
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional):
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional):
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional):
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional):
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional):
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
rpc (permissive dict, optional):
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional):
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional):
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional):
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional):
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional):
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: How long for the connection to wait for ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses such as GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional):
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum amount of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional):
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional):
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional):
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional):
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional):
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional):
\n

Scheduling: Capacity for event queue in Spark listener bus, must be greater than 0. Consider increasing value (e.g. 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
blacklist (permissive dict, optional):
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors are marked as blacklisted for a given stage, before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional):
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional):
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional):
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional):
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional):
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional):
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional):
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If --num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional):
\n

Dynamic Allocation: By default, the dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors w.r.t. full parallelism. Defaults to 1.0 to give maximum parallelism. 0.5 will divide the target number of executors by 2. The target number of executors computed by the dynamicAllocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
r (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional):
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing sparkR shell in client modes for driver. Ignored in cluster modes. It is the same as environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for sparkR shell while spark.r.driver.command is used for running R script.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional):
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional):
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional):
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional):
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid stackOverflowError due to long lineage chains after lots of iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional):
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional):
\n

Deploy: The recovery mode setting to recover submitted Spark jobs with cluster mode when it failed and relaunches. This is only applicable for cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cluster_id (dagster.StringSource):
\n

Name of the job flow (cluster) on which to execute.

\n
\n
region_name (dagster.StringSource):
\n

The AWS region that the cluster is in.

\n
\n
action_on_failure (String, optional):
\n

The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html

\n

Default Value: \u2018CANCEL_AND_WAIT\u2019

\n
\n
staging_bucket (dagster.StringSource):
\n

S3 bucket to use for passing files between the plan process and EMR process.

\n
\n
staging_prefix (dagster.StringSource, optional):
\n

S3 key prefix inside the staging_bucket to use for files passed between the plan process and EMR process.

\n

Default Value: \u2018emr_staging\u2019

\n
\n
wait_for_logs (Bool, optional):
\n

If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.

\n

Default Value: False

\n
\n
local_job_package_path (dagster.StringSource, optional):
\n

Absolute path to the package that contains the job definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the job. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_job_package option, referenced on s3 via the s3_job_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
local_pipeline_package_path (dagster.StringSource, optional):
\n

(legacy) Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
deploy_local_job_package (Bool, optional):
\n

If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

\n

Default Value: False

\n
\n
deploy_local_pipeline_package (Bool, optional):
\n

(legacy) If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

\n

Default Value: False

\n
\n
s3_job_package_path (dagster.StringSource, optional):
\n

If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_job_package should not be set to True.

\n
\n
s3_pipeline_package_path (dagster.StringSource, optional):
\n

If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.

\n
\n
\n
    \n
  • spark_config:

  • \n
  • cluster_id: Name of the job flow (cluster) on which to execute.

  • \n
  • region_name: The AWS region that the cluster is in.

  • \n
  • action_on_failure: The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html

  • \n
  • staging_bucket: S3 bucket to use for passing files between the plan process and EMR process.

  • \n
  • staging_prefix: S3 key prefix inside the staging_bucket to use for files passed between the plan process and EMR process.

  • \n
  • wait_for_logs: If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.

  • \n
  • local_job_package_path: Absolute path to the package that contains the job definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the job. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_job_package option, referenced on s3 via the s3_job_package_path option, or installed on the cluster via bootstrap actions.

  • \n
  • local_pipeline_package_path: (legacy) Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.

  • \n
  • deploy_local_job_package: If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

  • \n
  • deploy_local_pipeline_package: (legacy) If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

  • \n
  • s3_job_package_path: If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_job_package should not be set to True.

  • \n
  • s3_pipeline_package_path: If set, this path will be passed to the --py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.

  • \n
\n
\n\n
\n
\nclass dagster_aws.emr.EmrJobRunner(region, check_cluster_every=30, aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster_aws.emr.EmrError[source]\u00b6
\n
\n\n
\n
\ndagster_aws.emr.EmrClusterState = <enum 'EmrClusterState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\ndagster_aws.emr.EmrStepState = <enum 'EmrStepState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\n

CloudWatch\u00b6

\n
\n
\ndagster_aws.cloudwatch.cloudwatch_logger LoggerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
log_level (String, optional):
\n

Default Value: \u2018INFO\u2019

\n
\n
name (String, optional):
\n

Default Value: \u2018dagster\u2019

\n
\n
log_group_name (String):
\n

The name of the log group

\n
\n
log_stream_name (String):
\n

The name of the log stream

\n
\n
aws_region (dagster.StringSource, optional):
\n

Specifies a custom region for the CloudWatch session. Default is chosen through the ordinary boto3 credential chain.

\n
\n
aws_secret_access_key (dagster.StringSource, optional):
\n

\n
aws_access_key_id (dagster.StringSource, optional):
\n

\n
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\n

SecretsManager\u00b6

\n

Resources which surface SecretsManager secrets for use in Dagster resources and jobs.

\n
\n
\ndagster_aws.secretsmanager.SecretsManagerResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
\n

Resource that gives access to AWS SecretsManager.

\n

The underlying SecretsManager session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is a SecretsManager client, an instance of botocore.client.SecretsManager.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import SecretsManagerResource\n\n@op\ndef example_secretsmanager_op(secretsmanager: SecretsManagerResource):\n    return secretsmanager.get_client().get_secret_value(\n        SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n    )\n\n@job\ndef example_job():\n    example_secretsmanager_op()\n\ndefs = Definitions(\n    jobs=[example_job],\n    resources={\n        'secretsmanager': SecretsManagerResource(\n            region_name='us-west-1'\n        )\n    }\n)\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.SecretsManagerSecretsResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
secrets (List[dagster.StringSource], optional):
\n

An array of AWS Secrets Manager secrets arns to fetch.

\n

Default Value: []

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional):
\n

AWS Secrets Manager secrets with this tag will be fetched and made available.

\n
\n
\n

Resource that provides a dict which maps selected SecretsManager secrets to\ntheir string values. Also optionally sets chosen secrets as environment variables.

\n

Example

\n
import os\nfrom dagster import build_op_context, job, op, ResourceParam\nfrom dagster_aws.secretsmanager import SecretsManagerSecretsResource\n\n@op\ndef example_secretsmanager_secrets_op(secrets: SecretsManagerSecretsResource):\n    return secrets.fetch_secrets().get("my-secret-name")\n\n@op\ndef example_secretsmanager_secrets_op_2(secrets: SecretsManagerSecretsResource):\n    with secrets.secrets_in_environment():\n        return os.getenv("my-other-secret-name")\n\n@job\ndef example_job():\n    example_secretsmanager_secrets_op()\n    example_secretsmanager_secrets_op_2()\n\ndefs = Definitions(\n    jobs=[example_job],\n    resources={\n        'secrets': SecretsManagerSecretsResource(\n            region_name='us-west-1',\n            secrets_tag="dagster",\n            add_to_environment=True,\n        )\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource, or it will not be initialized\nfor the execution of their compute functions.

\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_aws.s3.ConfigurablePickledObjectS3IOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_resource (Union[Any, None], optional):
\n

\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use S3PickleIOManager instead.\n \n

\n

Renamed to S3PickleIOManager. See S3PickleIOManager for documentation.

\n
\n\n
\n
\ndagster_aws.s3.s3_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
\n

Resource that gives access to S3.

\n

The underlying S3 session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is an S3 client, an instance of botocore.client.S3.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.s3 import s3_resource\n\n@op(required_resource_keys={'s3'})\ndef example_s3_op(context):\n    return context.resources.s3.list_objects_v2(\n        Bucket='my-bucket',\n        Prefix='some-key'\n    )\n\n@job(resource_defs={'s3': s3_resource})\ndef example_job():\n    example_s3_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            's3': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  s3:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n      # through the ordinary boto credential chain.\n      use_unsigned_session: false\n      # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: False\n      endpoint_url: "http://localhost"\n      # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for S3 session. Default is default\n      # profile as specified in ~/.aws/credentials file\n      use_ssl: true\n      # Optional[bool]: Whether or not to use SSL. By default, SSL is used.\n      verify: None\n      # Optional[str]: Whether or not to verify SSL certificates. By default SSL certificates are verified.\n      # You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.\n      aws_access_key_id: None\n      # Optional[str]: The access key to use when creating the client.\n      aws_secret_access_key: None\n      # Optional[str]: The secret key to use when creating the client.\n      aws_session_token: None\n      # Optional[str]:  The session token to use when creating the client.\n
\n
\n
\n\n
\n
\ndagster_aws.s3.s3_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_resource (Union[Any, None], optional):
\n

\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": s3_pickle_io_manager.configured(\n            {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n        ),\n        "s3": s3_resource,\n    },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n@job(\n    resource_defs={\n        "io_manager": s3_pickle_io_manager.configured(\n            {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n        ),\n        "s3": s3_resource,\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\ndagster_aws.s3.s3_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to S3.

\n

Implements the FileManager API.

\n
\n\n
\n
\ndagster_aws.redshift.redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import build_op_context, op\nfrom dagster_aws.redshift import redshift_resource\n\n@op(required_resource_keys={'redshift'})\ndef example_redshift_op(context):\n    return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = redshift_resource.configured({\n    'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    'port': 5439,\n    'user': 'dagster',\n    'password': 'dagster',\n    'database': 'dev',\n})\ncontext = build_op_context(resources={'redshift': redshift_configured})\nassert example_redshift_op(context) == [(1,)]\n
\n
\n
\n\n
\n
\ndagster_aws.redshift.fake_redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.secretsmanager_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
\n

Resource that gives access to AWS SecretsManager.

\n

The underlying SecretsManager session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is a SecretsManager client, an instance of botocore.client.SecretsManager.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_resource\n\n@op(required_resource_keys={'secretsmanager'})\ndef example_secretsmanager_op(context):\n    return context.resources.secretsmanager.get_secret_value(\n        SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n    )\n\n@job(resource_defs={'secretsmanager': secretsmanager_resource})\ndef example_job():\n    example_secretsmanager_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secretsmanager': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.secretsmanager_secrets_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
secrets (List[dagster.StringSource], optional):
\n

An array of AWS Secrets Manager secrets arns to fetch.

\n

Default Value: []

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional):
\n

AWS Secrets Manager secrets with this tag will be fetched and made available.

\n
\n
add_to_environment (Bool, optional):
\n

Whether to add the secrets to the environment. Defaults to False.

\n

Default Value: False

\n
\n
\n

Resource that provides a dict which maps selected SecretsManager secrets to\ntheir string values. Also optionally sets chosen secrets as environment variables.

\n

Example

\n
import os\nfrom dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op(context):\n    return context.resources.secrets.get("my-secret-name")\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op_2(context):\n    return os.getenv("my-other-secret-name")\n\n@job(resource_defs={'secrets': secretsmanager_secrets_resource})\ndef example_job():\n    example_secretsmanager_secrets_op()\n    example_secretsmanager_secrets_op_2()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secrets': {\n                'config': {\n                    'region_name': 'us-west-1',\n                    'secrets_tag': 'dagster',\n                    'add_to_environment': True,\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n      secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n      # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n      secrets_tag: "dagster"\n      # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n      # from SecretsManager.\n      add_to_environment: true\n      # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n      # to false.\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-aws", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-aws.rst.txt", "title": "AWS (dagster-aws)", "toc": "\n"}, "dagster-azure": {"alabaster_version": "0.7.13", "body": "
\n

Azure (dagster-azure)\u00b6

\n

Utilities for using Azure Storage Accounts with Dagster. This is mostly aimed at Azure Data Lake\nStorage Gen 2 (ADLS2) but also contains some utilities for Azure Blob Storage.

\n
\n

Resources\u00b6

\n
\n
\ndagster_azure.adls2.ADLS2Resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource):
\n

The storage account name.

\n
\n
credential (selector):
\n
\nConfig Schema:
\n
sas (strict dict):
\n
\nConfig Schema:
\n
token (dagster.StringSource):
\n

\n
\n
\n
key (strict dict):
\n
\nConfig Schema:
\n
key (dagster.StringSource):
\n

\n
\n
\n
default_azure_credential (strict dict):
\n
\nConfig Schema:
\n
kwargs (dict):
\n

\n
\n
\n
\n
\n
\n

Resource containing clients to access Azure Data Lake Storage Gen2.

\n

Contains a client for both the Data Lake and Blob APIs, to work around the limitations\nof each.

\n
\n\n
\n
\ndagster_azure.adls2.FakeADLS2Resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_name (dagster.StringSource):
\n

\n
storage_account (Union[dagster.StringSource, None], optional):
\n

\n
\n

Stateful mock of an ADLS2Resource for testing.

\n

Wraps a mock.MagicMock. Containers are implemented using an in-memory dict.

\n
\n\n
\n
\nclass dagster_azure.blob.AzureBlobComputeLogManager(storage_account, container, secret_key=None, local_dir=None, inst_data=None, prefix='dagster', upload_interval=None, default_azure_credential=None)[source]\u00b6
\n

Logs op compute function stdout and stderr to Azure Blob Storage.

\n

This is also compatible with Azure Data Lake Storage.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_azure.blob.compute_log_manager\n  class: AzureBlobComputeLogManager\n  config:\n    storage_account: my-storage-account\n    container: my-container\n    credential: sas-token-or-secret-key\n    default_azure_credential:\n      exclude_environment_credential: true\n    prefix: "dagster-test-"\n    local_dir: "/tmp/cool"\n    upload_interval: 30\n
\n
\n
\n
Parameters:
\n
    \n
  • storage_account (str) \u2013 The storage account name to which to log.

  • \n
  • container (str) \u2013 The container (or ADLS2 filesystem) to which to log.

  • \n
  • secret_key (Optional[str]) \u2013 Secret key for the storage account. SAS tokens are not\nsupported because we need a secret key to generate a SAS token for a download URL.

  • \n
  • default_azure_credential (Optional[dict]) \u2013 Use and configure DefaultAzureCredential.\nCannot be used with sas token or secret key config.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster._seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • upload_interval (Optional[int]) \u2013 Interval in seconds to upload partial log files to blob storage. By default, will only upload when the capture is complete.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\n

I/O Manager\u00b6

\n
\n
\ndagster_azure.adls2.ADLS2PickleIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2 (Union[Any, None], optional):
\n

\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name.

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

ADLS Gen2 file system prefix to write to.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": ADLS2PickleIOManager(\n            adls2_file_system="my-cool-fs",\n            adls2_prefix="my-cool-prefix"\n        ),\n        "adls2": adls2_resource,\n    },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n@job(\n    resource_defs={\n        "io_manager": ADLS2PickleIOManager(\n            adls2_file_system="my-cool-fs",\n            adls2_prefix="my-cool-prefix"\n        ),\n        "adls2": adls2_resource,\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\n

File Manager (Experimental)\u00b6

\n
\n
\ndagster_azure.adls2.adls2_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource):
\n

The storage account name.

\n
\n
credential (selector):
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource):
\n

SAS token for the account.

\n
\n
key (dagster.StringSource):
\n

Shared Access Key for the account.

\n
\n
DefaultAzureCredential (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to ADLS2.

\n

Implements the FileManager API.

\n
\n\n
\n
\nclass dagster_azure.adls2.ADLS2FileHandle(account, file_system, key)[source]\u00b6
\n

A reference to a file on ADLS2.

\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_azure.adls2.ConfigurablePickledObjectADLS2IOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2 (Union[Any, None], optional):
\n

\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name.

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

ADLS Gen2 file system prefix to write to.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use ADLS2PickleIOManager instead.\n \n

\n

Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation.

\n
\n\n
\n
\ndagster_azure.adls2.adls2_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource):
\n

The storage account name.

\n
\n
credential (selector):
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource):
\n

SAS token for the account.

\n
\n
key (dagster.StringSource):
\n

Shared Access Key for the account.

\n
\n
DefaultAzureCredential (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Resource that gives ops access to Azure Data Lake Storage Gen2.

\n

The underlying client is a DataLakeServiceClient.

\n

Attach this resource definition to a JobDefinition in order to make it\navailable to your ops.

\n

Example

\n
from dagster import job, op\nfrom dagster_azure.adls2 import adls2_resource\n\n@op(required_resource_keys={'adls2'})\ndef example_adls2_op(context):\n    return list(context.resources.adls2.adls2_client.list_file_systems())\n\n@job(resource_defs={"adls2": adls2_resource})\ndef my_job():\n    example_adls2_op()\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may pass credentials to this resource using either a SAS token, a key or by passing the\nDefaultAzureCredential object.

\n
resources:\n  adls2:\n    config:\n      storage_account: my_storage_account\n      # str: The storage account name.\n      credential:\n        sas: my_sas_token\n        # str: the SAS token for the account.\n        key:\n          env: AZURE_DATA_LAKE_STORAGE_KEY\n        # str: The shared access key for the account.\n        DefaultAzureCredential: {}\n        # dict: The keyword arguments used for DefaultAzureCredential\n        # or leave the object empty for no arguments\n        DefaultAzureCredential:\n            exclude_environment_credential: true\n
\n
\n
\n\n
\n
\ndagster_azure.adls2.adls2_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2 (Union[Any, None], optional):
\n

\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name.

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

ADLS Gen2 file system prefix to write to.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": adls2_pickle_io_manager.configured(\n            {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n        ),\n        "adls2": adls2_resource,\n    },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n@job(\n    resource_defs={\n        "io_manager": adls2_pickle_io_manager.configured(\n            {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n        ),\n        "adls2": adls2_resource,\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-azure", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "N", "next"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-azure.rst.txt", "title": "Azure (dagster-azure)", "toc": "\n"}, "dagster-celery": {"alabaster_version": "0.7.13", "body": "
\n

Celery (dagster-celery)\u00b6

\n
\n

Quickstart\u00b6

\n

To get a local rabbitmq broker started and available via the default\npyamqp://guest@localhost:5672, in the dagster/python_modules/libraries/dagster-celery/\ndirectory run:

\n
docker-compose up\n
\n
\n

To run a celery worker:

\n
celery -A dagster_celery.app worker -l info\n
\n
\n

To start multiple workers in the background, run:

\n
celery multi start w2 -A dagster_celery.app -l info\n
\n
\n

To execute a job using the celery-backed executor, you\u2019ll need to set the job\u2019s executor_def to\nthe celery_executor.

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef my_job():\n    pass\n
\n
\n
\n

Monitoring your Celery tasks\u00b6

\n

We advise using Flower (https://celery.readthedocs.io/en/latest/userguide/monitoring.html#flower-real-time-celery-web-monitor):

\n
celery -A dagster_celery.app flower\n
\n
\n
\n
\n

Customizing the Celery broker, backend, and other app configuration\u00b6

\n

By default this will use amqp://guest:**@localhost:5672// as the Celery broker URL and\nrpc:// as the results backend. In production, you will want to change these values. Pending the\nintroduction of a dagster_celery CLI, that would entail writing a Python module my_module as\nfollows:

\n
from celery import Celery\n\nfrom dagster_celery.tasks import create_task\n\napp = Celery('dagster', broker_url='some://custom@value', ...)\n\nexecute_plan = create_task(app)\n\nif __name__ == '__main__':\n    app.worker_main()\n
\n
\n

You can then run the celery worker using:

\n
celery -A my_module worker --loglevel=info\n
\n
\n

This customization mechanism is used to implement dagster_celery_k8s and dagster_celery_docker, which delegate the execution of steps to ephemeral kubernetes pods and docker containers, respectively.

\n
\n
\n

Celery best practices\u00b6

\n

Celery is a rich and full-featured system. We\u2019ve found the following resources helpful:

\n\n
\n
\n
\n

API\u00b6

\n
\n
\ndagster_celery.celery_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Celery-based executor.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n
\n\n
\n
\n

CLI\u00b6

\n

The dagster-celery CLI lets you start, monitor, and terminate workers.

\n
\n

dagster-celery worker start\u00b6

\n

Start a dagster celery worker.

\n
dagster-celery worker start [OPTIONS] [ADDITIONAL_ARGS]...\n
\n
\n

Options

\n
\n
\n-n, --name <name>\u00b6
\n

The name of the worker. Defaults to a unique name prefixed with \u201cdagster-\u201d and ending with the hostname.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the worker. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use.

\n
\n\n
\n
\n-q, --queue <queue>\u00b6
\n

Names of the queues on which this worker should listen for tasks. Provide multiple -q arguments to specify multiple queues. Note that each celery worker may listen on no more than four queues.

\n
\n\n
\n
\n-d, --background\u00b6
\n

Set this flag to run the worker in the background.

\n
\n\n
\n
\n-i, --includes <includes>\u00b6
\n

Python modules the worker should import. Provide multiple -i arguments to specify multiple modules.

\n
\n\n
\n
\n-l, --loglevel <loglevel>\u00b6
\n

Log level for the worker.

\n
\n\n
\n
\n-A, --app <app>\u00b6
\n
\n\n

Arguments

\n
\n
\nADDITIONAL_ARGS\u00b6
\n

Optional argument(s)

\n
\n\n
\n
\n

dagster-celery worker list\u00b6

\n

List running dagster-celery workers. Note that we use the broker to contact the workers.

\n
dagster-celery worker list [OPTIONS]\n
\n
\n

Options

\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to find your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n
\n
\n

dagster-celery worker terminate\u00b6

\n

Shut down dagster-celery workers. Note that we use the broker to send signals to the workers to terminate \u2013 if the broker is not running, this command is a no-op. Provide the argument NAME to terminate a specific worker by name.

\n
dagster-celery worker terminate [OPTIONS] [NAME]\n
\n
\n

Options

\n
\n
\n-a, --all\u00b6
\n

Set this flag to terminate all running workers.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to terminate your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n

Arguments

\n
\n
\nNAME\u00b6
\n

Optional argument

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery.rst.txt", "title": "Celery (dagster-celery)", "toc": "\n"}, "dagster-celery-docker": {"alabaster_version": "0.7.13", "body": "
\n

Orchestration on Celery + Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_docker.celery_docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
docker (strict dict):
\n

The configuration for interacting with docker in the celery worker.

\n
\nConfig Schema:
\n
image (dagster.StringSource, optional):
\n

The docker image to be used for step execution.

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variable names to forward from the celery worker into the docker container

\n
\n
network (String, optional):
\n

Name of the network this container will be connected to at creation time

\n
\n
container_kwargs (permissive dict, optional):
\n

Additional keyword args for the docker container

\n
\n
\n
\n
\n

Celery-based executor which launches tasks in docker containers.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_docker_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery_docker.executor import celery_docker_executor\n\n@job(executor_def=celery_docker_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    docker:\n      image: 'my_repo.com/image_name:latest'\n      registry:\n        url: 'my_repo.com'\n        username: 'my_user'\n        password: {env: 'DOCKER_PASSWORD'}\n      env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n      container_kwargs: # keyword args to be passed to the container. example:\n        volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_docker_executor is used, all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_docker.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-docker", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-docker.rst.txt", "title": "Orchestration on Celery + Docker", "toc": "\n"}, "dagster-celery-k8s": {"alabaster_version": "0.7.13", "body": "
\n

Orchestration on Celery + Kubernetes\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_k8s.CeleryK8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See: https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
instance_config_map (dagster.StringSource):
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional):
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. Will be mounted and supplied as an environment variable to the Job Pod. Secret must contain the key "postgresql-password" which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional):
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional):
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
run_k8s_config (strict dict, optional):
\n

Raw Kubernetes configuration for launched runs.

\n
\nConfig Schema:
\n
container_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_namespace (dagster.StringSource, optional):
\n

Default Value: \u2018default\u2019

\n
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

In contrast to the K8sRunLauncher, which launches dagster runs as single K8s\nJobs, this run launcher is intended for use in concert with\ndagster_celery_k8s.celery_k8s_job_executor().

\n

With this run launcher, execution is delegated to:

\n
\n
    \n
  1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\nsubmits steps to Celery queues for execution;

  2. \n
  3. The step executions which are submitted to Celery queues are picked up by Celery workers,\nand each step execution spawns a step execution Kubernetes Job. See the implementation\ndefined in dagster_celery_k8s.executor.create_k8s_job_task().

  4. \n
\n
\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_celery_k8s.launcher\n  class: CeleryK8sRunLauncher\n  config:\n    instance_config_map: "dagster-k8s-instance-config-map"\n    dagster_home: "/some/path"\n    postgres_password_secret: "dagster-k8s-pg-password"\n    broker: "some_celery_broker_url"\n    backend: "some_celery_backend_url"\n
\n
\n
\n\n
\n
\ndagster_celery_k8s.celery_k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See: https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher within a k8s cluster. If\nTrue, we assume the launcher is running within the target cluster and load config\nusing kubernetes.config.load_incluster_config. Otherwise, we will use the k8s config\nspecified in kubeconfig_file (using kubernetes.config.load_kube_config) or fall\nback to the default kubeconfig. Default: True.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

Path to a kubeconfig file to use, if not using default kubeconfig.

\n
\n
job_namespace (dagster.StringSource, optional):
\n

The namespace into which to launch new jobs. Note that any other Kubernetes resources the Job requires (such as the service account) must be present in this namespace. Default: "default"

\n
\n
repo_location_name (dagster.StringSource, optional):
\n

The repository location name to use for execution.

\n

Default Value: \u2018<<in_process>>\u2019

\n
\n
job_wait_timeout (Float, optional):
\n

Wait this many seconds for a job to complete before marking the run as failed. Defaults to 86400.0 seconds.

\n

Default Value: 86400.0

\n
\n
\n

Celery-based executor which launches tasks as Kubernetes Jobs.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute dagster jobs\nwith variations on these settings.

\n

To use the celery_k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery_k8s.executor import celery_k8s_job_executor\n\n\n@job(executor_def=celery_k8s_job_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    job_image: 'my_repo.com/image_name:latest'\n    job_namespace: 'some-namespace'\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_k8s.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-k8s", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-census/", "title": "Census (dagster-census)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-census", "Census (dagster-census)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-k8s.rst.txt", "title": "Orchestration on Celery + Kubernetes", "toc": "\n"}, "dagster-census": {"alabaster_version": "0.7.13", "body": "
\n

Census (dagster-census)\u00b6

\n

This library provides an integration with Census.

\n
\n
\ndagster_census.census_trigger_sync_op OpDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
sync_id (Int):
\n

Id of the parent sync.

\n
\n
force_full_sync (Bool, optional):
\n

If this trigger request should be a Full Sync. Note that some sync configurations such as Append do not support full syncs.

\n

Default Value: False

\n
\n
poll_interval (Float, optional):
\n

The time (in seconds) to wait between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time to wait before this operation is timed out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional):
\n

If True, materializations corresponding to the results of the Census sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018census\u2019]

\n
\n
\n

Executes a Census sync for a given sync_id and polls until that sync completes, raising\nan error if it is unsuccessful.

\n

It outputs a CensusOutput which contains the details of the Census\nsync after it successfully completes.

\n

It requires the use of the census_resource, which allows it to\ncommunicate with the Census API.

\n

Examples:

\n
from dagster import job\nfrom dagster_census import census_resource, census_trigger_sync_op\n\nmy_census_resource = census_resource.configured(\n    {\n        "api_key": {"env": "CENSUS_API_KEY"},\n    }\n)\n\nsync_foobar = census_trigger_sync_op.configured({"sync_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"census": my_census_resource})\ndef my_simple_census_job():\n    sync_foobar()\n
\n
\n
\n\n
\n
\ndagster_census.census_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

Census API Key.

\n
\n
request_max_retries (Int, optional):
\n

The maximum number of times requests to the Census API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Census REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

Examples:

\n
from dagster import job\nfrom dagster_census import census_resource\n\nmy_census_resource = census_resource.configured(\n    {\n        "api_key": {"env": "CENSUS_API_KEY"},\n    }\n)\n\n@job(resource_defs={"census":my_census_resource})\ndef my_census_job():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_census.CensusResource(api_key, request_max_retries=3, request_retry_delay=0.25, log=<Logger dagster.builtin (DEBUG)>)[source]\u00b6
\n

This class exposes methods on top of the Census REST API.

\n
\n\n
\n
\nclass dagster_census.CensusOutput(sync_run, source, destination)[source]\u00b6
\n

Contains recorded information about the state of a Census sync after a sync completes.

\n
\n
\nsync_run\u00b6
\n

The details of the specific sync run.

\n
\n
Type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nsource\u00b6
\n

Information about the source for the Census sync.

\n
\n
Type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\ndestination\u00b6
\n

Information about the destination for the Census sync.

\n
\n
Type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-census", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-census.rst.txt", "title": "Census (dagster-census)", "toc": "\n"}, "dagster-dask": {"alabaster_version": "0.7.13", "body": "
\n

Dask (dagster-dask)\u00b6

\n

See also the Dask deployment guide.

\n
\n
\ndagster_dask.dask_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
cluster (selector):
\n
\nConfig Schema:
\n
existing (strict dict):
\n

Connect to an existing scheduler.

\n
\nConfig Schema:
\n
address (dagster.StringSource):
\n

\n
\n
\n
local (permissive dict, optional):
\n

Local cluster configuration.

\n
\n
yarn (permissive dict, optional):
\n

YARN cluster configuration.

\n
\n
ssh (permissive dict, optional):
\n

SSH cluster configuration.

\n
\n
pbs (permissive dict, optional):
\n

PBS cluster configuration.

\n
\n
moab (permissive dict, optional):
\n

Moab cluster configuration.

\n
\n
sge (permissive dict, optional):
\n

SGE cluster configuration.

\n
\n
lsf (permissive dict, optional):
\n

LSF cluster configuration.

\n
\n
slurm (permissive dict, optional):
\n

SLURM cluster configuration.

\n
\n
oar (permissive dict, optional):
\n

OAR cluster configuration.

\n
\n
kube (permissive dict, optional):
\n

Kubernetes cluster configuration.

\n
\n
\n
\n
\n

Dask-based executor.

\n

The \u2018cluster\u2019 can be one of the following:\n(\u2018existing\u2019, \u2018local\u2019, \u2018yarn\u2019, \u2018ssh\u2019, \u2018pbs\u2019, \u2018moab\u2019, \u2018sge\u2019, \u2018lsf\u2019, \u2018slurm\u2019, \u2018oar\u2019, \u2018kube\u2019).

\n

If the Dask executor is used without providing executor-specific config, a local Dask cluster\nwill be created (as when calling dask.distributed.Client()\nwith dask.distributed.LocalCluster()).

\n

The Dask executor optionally takes the following config:

\n
cluster:\n    {\n        local?: # takes distributed.LocalCluster parameters\n            {\n                timeout?: 5,  # Timeout duration for initial connection to the scheduler\n                n_workers?: 4  # Number of workers to start\n                threads_per_worker?: 1 # Number of threads per each worker\n            }\n    }\n
\n
\n

To use the dask_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_dask import dask_executor\n\n@job(executor_def=dask_executor)\ndef dask_enabled_job():\n    pass\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dask", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-census/", "title": "Census (dagster-census)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "N", "next"], ["sections/api/apidocs/libraries/dagster-census", "Census (dagster-census)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dask.rst.txt", "title": "Dask (dagster-dask)", "toc": "\n"}, "dagster-databricks": {"alabaster_version": "0.7.13", "body": "
\n

Databricks (dagster-databricks)\u00b6

\n

The dagster_databricks package provides these main pieces of functionality:

\n\n

Note that, for the databricks_pyspark_step_launcher, either S3 or Azure Data Lake Storage config\nmust be specified for ops to succeed, and the credentials for this storage must also be\nstored as a Databricks Secret and stored in the resource config so that the Databricks cluster can\naccess storage.

\n
\n
\n

APIs\u00b6

\n
\n

Resources\u00b6

\n
\n
\ndagster_databricks.DatabricksClientResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Databricks host, e.g. https://uksouth.azuredatabricks.com

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Databricks access token

\n
\n
oauth_credentials (Union[strict dict, None], optional):
\n

Databricks OAuth credentials for using a service principal. See https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0

\n
\n
workspace_id (Union[dagster.StringSource, None], optional):
\n

DEPRECATED: The Databricks workspace ID, as described in https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids. This is no longer used and will be removed in 0.21.

\n
\n
\n

Resource which provides a Python client for interacting with Databricks within an\nop or asset.

\n
\n\n
\n
\nclass dagster_databricks.DatabricksClient(host, token=None, oauth_client_id=None, oauth_client_secret=None, workspace_id=None)[source]\u00b6
\n

A thin wrapper over the Databricks REST API.

\n
\n
\nproperty api_client\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 0.21.0. Use workspace_client property instead.\n \n

\n

Retrieve a reference to the underlying Databricks API client. For more information,\nsee the Databricks Python API.\nNote: accessing this property will throw an exception if oauth credentials are used to initialize the\nDatabricksClient, because oauth credentials are not supported by the legacy Databricks API client.\nExamples:

\n
from dagster import op\nfrom databricks_cli.jobs.api import JobsApi\nfrom databricks_cli.runs.api import RunsApi\nfrom databricks.sdk import WorkspaceClient\n\n@op(required_resource_keys={"databricks_client"})\ndef op1(context):\n    # Initialize the Databricks Jobs API\n    jobs_client = JobsApi(context.resources.databricks_client.api_client)\n    runs_client = RunsApi(context.resources.databricks_client.api_client)\n    client = context.resources.databricks_client.api_client\n\n    # Example 1: Run a Databricks job with some parameters.\n    jobs_client.run_now(...)\n    client.jobs.run_now(...)\n\n    # Example 2: Trigger a one-time run of a Databricks workload.\n    runs_client.submit_run(...)\n    client.jobs.submit(...)\n\n    # Example 3: Get an existing run.\n    runs_client.get_run(...)\n    client.jobs.get_run(...)\n\n    # Example 4: Cancel a run.\n    runs_client.cancel_run(...)\n    client.jobs.cancel_run(...)\n
\n
\n
\n
Returns:
\n

The authenticated Databricks API client.

\n
\n
Return type:
\n

ApiClient

\n
\n
\n
\n\n
\n
\nproperty client\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 0.21.0. Use workspace_client property instead.\n \n

\n

Retrieve the legacy Databricks API client. Note: accessing this property will throw an exception if oauth\ncredentials are used to initialize the DatabricksClient, because oauth credentials are not supported by the\nlegacy Databricks API client.

\n
\n
Type:
\n

ApiClient

\n
\n
\n
\n\n
\n
\nproperty workspace_client\u00b6
\n

Retrieve a reference to the underlying Databricks Workspace client. For more information,\nsee the Databricks SDK for Python.

\n

Examples:

\n
from dagster import op\nfrom databricks.sdk import WorkspaceClient\n\n@op(required_resource_keys={"databricks_client"})\ndef op1(context):\n    # Retrieve the Databricks Workspace client\n    client = context.resources.databricks_client.workspace_client\n\n    # Example 1: Run a Databricks job with some parameters.\n    client.jobs.run_now(...)\n\n    # Example 2: Trigger a one-time run of a Databricks workload.\n    client.jobs.submit(...)\n\n    # Example 3: Get an existing run.\n    client.jobs.get_run(...)\n\n    # Example 4: Cancel a run.\n    client.jobs.cancel_run(...)\n
\n
\n
\n
Returns:
\n

The authenticated Databricks SDK Workspace Client.

\n
\n
Return type:
\n

WorkspaceClient

\n
\n
\n
\n\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_databricks.create_databricks_run_now_op(databricks_job_id, databricks_job_configuration=None, poll_interval_seconds=10, max_wait_time_seconds=86400, name=None, databricks_resource_key='databricks')[source]\u00b6
\n

Creates an op that launches an existing databricks job.

\n

As config, the op accepts a blob of the form described in Databricks\u2019 Job API:\nhttps://docs.databricks.com/api-explorer/workspace/jobs/runnow. The only required field is\njob_id, which is the ID of the job to be executed. Additional fields can be used to specify\noverride parameters for the Databricks Job.

\n
\n
Parameters:
\n
    \n
  • databricks_job_id (int) \u2013 The ID of the Databricks Job to be executed.

  • \n
  • databricks_job_configuration (dict) \u2013 Configuration for triggering a new job run of a\nDatabricks Job. See https://docs.databricks.com/api-explorer/workspace/jobs/runnow\nfor the full configuration.

  • \n
  • poll_interval_seconds (float) \u2013 How often to poll the Databricks API to check whether the\nDatabricks job has finished running.

  • \n
  • max_wait_time_seconds (float) \u2013 How long to wait for the Databricks job to finish running\nbefore raising an error.

  • \n
  • name (Optional[str]) \u2013 The name of the op. If not provided, the name will be\n_databricks_run_now_op.

  • \n
  • databricks_resource_key (str) \u2013 The name of the resource key used by this op. If not\nprovided, the resource key will be \u201cdatabricks\u201d.

  • \n
\n
\n
Returns:
\n

An op definition to run the Databricks Job.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n

Example

\n
from dagster import EnvVar, job\nfrom dagster_databricks import create_databricks_run_now_op, DatabricksClientResource\n\nDATABRICKS_JOB_ID = 1234\n\n\nrun_now_op = create_databricks_run_now_op(\n    databricks_job_id=DATABRICKS_JOB_ID,\n    databricks_job_configuration={\n        "python_params": [\n            "--input",\n            "schema.db.input_table",\n            "--output",\n            "schema.db.output_table",\n        ],\n    },\n)\n\n@job(\n    resource_defs={\n        "databricks": DatabricksClientResource(\n            host=EnvVar("DATABRICKS_HOST"),\n            token=EnvVar("DATABRICKS_TOKEN")\n        )\n    }\n)\ndef do_stuff():\n    run_now_op()\n
\n
\n
\n\n
\n
\ndagster_databricks.create_databricks_submit_run_op(databricks_job_configuration, poll_interval_seconds=10, max_wait_time_seconds=86400, name=None, databricks_resource_key='databricks')[source]\u00b6
\n

Creates an op that submits a one-time run of a set of tasks on Databricks.

\n

As config, the op accepts a blob of the form described in Databricks\u2019 Job API:\nhttps://docs.databricks.com/api-explorer/workspace/jobs/submit.

\n
\n
Parameters:
\n
    \n
  • databricks_job_configuration (dict) \u2013 Configuration for submitting a one-time run of a set\nof tasks on Databricks. See https://docs.databricks.com/api-explorer/workspace/jobs/submit\nfor the full configuration.

  • \n
  • poll_interval_seconds (float) \u2013 How often to poll the Databricks API to check whether the\nDatabricks job has finished running.

  • \n
  • max_wait_time_seconds (float) \u2013 How long to wait for the Databricks job to finish running\nbefore raising an error.

  • \n
  • name (Optional[str]) \u2013 The name of the op. If not provided, the name will be\n_databricks_submit_run_op.

  • \n
  • databricks_resource_key (str) \u2013 The name of the resource key used by this op. If not\nprovided, the resource key will be \u201cdatabricks\u201d.

  • \n
\n
\n
Returns:
\n

An op definition to submit a one-time run of a set of tasks on Databricks.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n

Example

\n
from dagster import EnvVar, job\nfrom dagster_databricks import create_databricks_submit_run_op, DatabricksClientResource\n\n\nsubmit_run_op = create_databricks_submit_run_op(\n    databricks_job_configuration={\n        "new_cluster": {\n            "spark_version": '2.1.0-db3-scala2.11',\n            "num_workers": 2\n        },\n        "notebook_task": {\n            "notebook_path": "/Users/dagster@example.com/PrepareData",\n        },\n    }\n)\n\n@job(\n    resource_defs={\n        "databricks": DatabricksClientResource(\n            host=EnvVar("DATABRICKS_HOST"),\n            token=EnvVar("DATABRICKS_TOKEN")\n        )\n    }\n)\ndef do_stuff():\n    submit_run_op()\n
\n
\n
\n\n
\n
\n

Step Launcher\u00b6

\n
\n
\ndagster_databricks.databricks_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
run_config (strict dict):
\n

Databricks job run configuration

\n
\nConfig Schema:
\n
cluster (selector):
\n
\nConfig Schema:
\n
new (strict dict):
\n
\nConfig Schema:
\n
size (selector):
\n
\nConfig Schema:
\n
autoscale (strict dict):
\n
\nConfig Schema:
\n
min_workers (Int):
\n

The minimum number of workers to which the cluster can scale down when underutilized. It is also the initial number of workers the cluster will have after creation.

\n
\n
max_workers (Int):
\n

The maximum number of workers to which the cluster can scale up when overloaded. max_workers must be strictly greater than min_workers.

\n
\n
\n
\n
num_workers (Int):
\n

If num_workers, number of worker nodes that this cluster should have. A cluster has one Spark Driver and num_workers Executors for a total of num_workers + 1 Spark nodes.

\n
\n
\n
\n
spark_version (String):
\n

The Spark version of the cluster. A list of available Spark versions can be retrieved by using the Runtime versions API call. This field is required.

\n
\n
spark_conf (permissive dict, optional):
\n

An object containing a set of optional, user-specified Spark configuration key-value pairs. You can also pass in a string of extra JVM options to the driver and the executors via spark.driver.extraJavaOptions and spark.executor.extraJavaOptions respectively. Example Spark confs: {\u201cspark.speculation\u201d: true, \u201cspark.streaming.ui.retainedBatches\u201d: 5} or {\u201cspark.driver.extraJavaOptions\u201d: \u201c-verbose:gc -XX:+PrintGCDetails\u201d}

\n
\n
nodes (selector):
\n

The nodes used in the cluster. Either the node types or an instance pool can be specified.

\n
\nConfig Schema:
\n
node_types (strict dict):
\n
\nConfig Schema:
\n
node_type_id (String):
\n

This field encodes, through a single value, the resources available to each of the Spark nodes in this cluster. For example, the Spark nodes can be provisioned and optimized for memory or compute intensive workloads. A list of available node types can be retrieved by using the List node types API call. This field is required.

\n
\n
driver_node_type_id (String, optional):
\n

The node type of the Spark driver. This field is optional; if unset, the driver node type is set as the same value as node_type_id defined above.

\n
\n
\n
\n
instance_pool_id (String, optional):
\n

The optional ID of the instance pool to which the cluster belongs. Refer to the Instance Pools API for details.

\n
\n
\n
\n
aws_attributes (permissive dict, optional):
\n

Attributes related to clusters running on Amazon Web Services. If not specified at cluster creation, a set of default values is used. See aws_attributes at https://docs.databricks.com/dev-tools/api/latest/clusters.html.

\n
\nConfig Schema:
\n
first_on_demand (Int, optional):
\n

The first first_on_demand nodes of the cluster will be placed on on-demand instances. If this value is greater than 0, the cluster driver node will be placed on an on-demand instance. If this value is greater than or equal to the current cluster size, all nodes will be placed on on-demand instances. If this value is less than the current cluster size, first_on_demand nodes will be placed on on-demand instances and the remainder will be placed on availability instances. This value does not affect cluster size and cannot be mutated over the lifetime of a cluster.

\n
\n
availability (AWSAvailability, optional):
\n

Availability type used for all subsequent nodes past the first_on_demand ones. Note: If first_on_demand is zero, this availability type will be used for the entire cluster.

\n
\n
zone_id (String, optional):
\n

Identifier for the availability zone/datacenter in which the cluster resides.

\n
\n
instance_profile_arn (String, optional):
\n

Nodes for this cluster will only be placed on AWS instances with this instance profile.

\n
\n
spot_bid_price_percent (Int, optional):
\n

The max price for AWS spot instances, as a percentage of the corresponding instance type\u2019s on-demand price.

\n
\n
ebs_volume_type (EBSVolumeType, optional):
\n

The type of EBS volumes that will be launched with this cluster.

\n
\n
ebs_volume_count (Int, optional):
\n

The number of volumes launched for each instance. You can choose up to 10 volumes.

\n
\n
ebs_volume_size (Int, optional):
\n

The size of each EBS volume (in GiB) launched for each instance.

\n
\n
ebs_volume_iops (Int, optional):
\n

The number of IOPS per EBS gp3 volume.

\n
\n
ebs_volume_throughput (Int, optional):
\n

The throughput per EBS gp3 volume, in MiB per second.

\n
\n
\n
\n
ssh_public_keys (List[String], optional):
\n

SSH public key contents that will be added to each Spark node in this cluster. The corresponding private keys can be used to login with the user name ubuntu on port 2200. Up to 10 keys can be specified.

\n
\n
custom_tags (List[strict dict], optional):
\n

Additional tags for cluster resources. Databricks tags all cluster resources (e.g., AWS instances and EBS volumes) with these tags in addition to default_tags. Note: - Tags are not supported on legacy node types such as compute-optimized and memory-optimized - Databricks allows at most 45 custom tags. More restrictions may apply if using Azure Databricks; refer to the official docs for further details.

\n
\n
cluster_log_conf (selector, optional):
\n

Recommended! The configuration for delivering Spark logs to a long-term storage destination. Only one destination can be specified for one cluster. If the conf is given, the logs will be delivered to the destination every 5 mins. The destination of driver logs is <destination>/<cluster-id>/driver, while the destination of executor logs is <destination>/<cluster-id>/executor.

\n
\nConfig Schema:
\n
dbfs (strict dict):
\n

DBFS storage information

\n
\nConfig Schema:
\n
destination (String):
\n

DBFS destination, e.g. dbfs:/my/path

\n
\n
\n
\n
s3 (strict dict):
\n

S3 storage information

\n
\nConfig Schema:
\n
destination (String):
\n

S3 destination, e.g. s3://my-bucket/some-prefix. You must configure the cluster with an instance profile and the instance profile must have write access to the destination. You cannot use AWS keys.

\n
\n
region (String):
\n

S3 region, e.g. us-west-2. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
endpoint (String):
\n

S3 endpoint, e.g. https://s3-us-west-2.amazonaws.com. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
enable_encryption (Bool, optional):
\n

(Optional) Enable server side encryption, false by default.

\n
\n
encryption_type (String, optional):
\n

(Optional) The encryption type, it could be sse-s3 or sse-kms. It is used only when encryption is enabled and the default type is sse-s3.

\n
\n
kms_key (String, optional):
\n

(Optional) KMS key used if encryption is enabled and encryption type is set to sse-kms.

\n
\n
canned_acl (String, optional):
\n

(Optional) Set canned access control list, e.g. bucket-owner-full-control. If canned_acl is set, the cluster instance profile must have s3:PutObjectAcl permission on the destination bucket and prefix. The full list of possible canned ACLs can be found at https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl. By default only the object owner gets full control. If you are using cross account role for writing data, you may want to set bucket-owner-full-control to make bucket owner able to read the logs.

\n
\n
\n
\n
\n
\n
init_scripts (List[selector], optional):
\n

The configuration for storing init scripts. Any number of scripts can be specified. The scripts are executed sequentially in the order provided. If cluster_log_conf is specified, init script logs are sent to <destination>/<cluster-id>/init_scripts.

\n
\n
spark_env_vars (permissive dict, optional):
\n

An object containing a set of optional, user-specified environment variable key-value pairs. Key-value pair of the form (X,Y) are exported as is (i.e., export X=\u201dY\u201d) while launching the driver and workers. To specify an additional set of SPARK_DAEMON_JAVA_OPTS, we recommend appending them to $SPARK_DAEMON_JAVA_OPTS as shown in the example below. This ensures that all default Databricks managed environmental variables are included as well. Example Spark environment variables: {\u201cSPARK_WORKER_MEMORY\u201d: \u201c28000m\u201d, \u201cSPARK_LOCAL_DIRS\u201d: \u201c/local_disk0\u201d} or {\u201cSPARK_DAEMON_JAVA_OPTS\u201d: \u201c$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\u201d}

\n
\n
enable_elastic_disk (Bool, optional):
\n

Autoscaling Local Storage: when enabled, this cluster dynamically acquires additional disk space when its Spark workers are running low on disk space. This feature requires specific AWS permissions to function correctly - refer to https://docs.databricks.com/clusters/configure.html#autoscaling-local-storage for details.

\n
\n
policy_id (String, optional):
\n

The ID of the cluster policy used to create the cluster if applicable

\n
\n
\n
\n
existing (String):
\n

The ID of an existing cluster that will be used for all runs of this job. When running jobs on an existing cluster, you may need to manually restart the cluster if it stops responding. Databricks suggests running jobs on new clusters for greater reliability.

\n
\n
\n
\n
run_name (String, optional):
\n

An optional name for the run. The default value is Untitled

\n
\n
libraries (List[selector], optional):
\n

An optional list of libraries to be installed on the cluster that will execute the job. By default dagster, dagster-databricks and dagster-pyspark libraries will be included.

\n
\n
install_default_libraries (Bool, optional):
\n

By default, Dagster installs a version of dagster, dagster-databricks, and dagster-pyspark matching the locally-installed versions of those libraries. If you would like to disable this behavior, this value can be set to False.

\n
\n
timeout_seconds (Int, optional):
\n

An optional timeout applied to each run of this job. The default behavior is to have no timeout.

\n
\n
idempotency_token (String, optional):
\n

An optional token that can be used to guarantee the idempotency of job run requests. If an active run with the provided token already exists, the request will not create a new run, but will return the ID of the existing run instead. If you specify the idempotency token, upon failure you can retry until the request succeeds. Databricks guarantees that exactly one run will be launched with that idempotency token. This token should have at most 64 characters.

\n
\n
\n
\n
permissions (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
job_permissions (strict dict, optional):
\n

job permission spec; ref: https://docs.databricks.com/security/access-control/jobs-acl.html#job-permissions

\n
\nConfig Schema:
\n
NO_PERMISSIONS (List[selector], optional):
\n

\n
CAN_VIEW (List[selector], optional):
\n

\n
CAN_MANAGE_RUN (List[selector], optional):
\n

\n
IS_OWNER (List[selector], optional):
\n

\n
CAN_MANAGE (List[selector], optional):
\n

\n
\n
\n
cluster_permissions (strict dict, optional):
\n

cluster permission spec; ref: https://docs.databricks.com/security/access-control/cluster-acl.html#cluster-level-permissions

\n
\nConfig Schema:
\n
NO_PERMISSIONS (List[selector], optional):
\n

\n
CAN_ATTACH_TO (List[selector], optional):
\n

\n
CAN_RESTART (List[selector], optional):
\n

\n
CAN_MANAGE (List[selector], optional):
\n

\n
\n
\n
\n
\n
databricks_host (dagster.StringSource):
\n

Databricks host, e.g. uksouth.azuredatabricks.com

\n
\n
databricks_token (Union[dagster.StringSource, None], optional):
\n

Databricks access token

\n

Default Value: None

\n
\n
oauth_credentials (Union[strict dict, None], optional):
\n

Oauth credentials for interacting with the Databricks REST API via a service principal. See https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0

\n

Default Value: None

\n
\n
env_variables (permissive dict, optional):
\n

Dictionary of arbitrary environment variables to be set on the databricks cluster.

\n
\n
secrets_to_env_variables (List[strict dict], optional):
\n

Databricks secrets to be exported as environment variables. Since runs will execute in the Databricks runtime environment, environment variables (such as those required for a StringSource config variable) will not be accessible to Dagster. These variables must be stored as Databricks secrets and specified here, which will ensure they are re-exported as environment variables accessible to Dagster upon execution.

\n
\n
storage (selector, optional):
\n

Databricks storage configuration for either S3 or ADLS2. If access credentials for your Databricks storage are stored in Databricks secrets, this config indicates the secret scope and the secret keys used to access either S3 or ADLS2.

\n
\nConfig Schema:
\n
s3 (strict dict):
\n

S3 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String):
\n

The Databricks secret scope containing the storage secrets.

\n
\n
access_key_key (String):
\n

The key of a Databricks secret containing the S3 access key ID.

\n
\n
secret_key_key (String):
\n

The key of a Databricks secret containing the S3 secret access key.

\n
\n
\n
\n
adls2 (strict dict):
\n

ADLS2 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String):
\n

The Databricks secret scope containing the storage secrets.

\n
\n
storage_account_name (String):
\n

The name of the storage account used to access data.

\n
\n
storage_account_key_key (String):
\n

The key of a Databricks secret containing the storage account secret key.

\n
\n
\n
\n
\n
\n
local_pipeline_package_path (dagster.StringSource, optional):
\n

Absolute path to root python package containing your Dagster code. If you set this value to a directory lower than the root package, and use relative imports in your code (e.g. from .foo import bar), it\u2019s likely you\u2019ll encounter an import error on the remote step. Before every step run, the launcher will zip up the code in this local path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
local_dagster_job_package_path (dagster.StringSource, optional):
\n

Absolute path to root python package containing your Dagster code. If you set this value to a directory lower than the root package, and use relative imports in your code (e.g. from .foo import bar), it\u2019s likely you\u2019ll encounter an import error on the remote step. Before every step run, the launcher will zip up the code in this local path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
staging_prefix (dagster.StringSource, optional):
\n

Directory in DBFS to use for uploaded job code. Must be absolute.

\n

Default Value: \u2018/dagster_staging\u2019

\n
\n
wait_for_logs (Bool, optional):
\n

If set, and if the specified cluster is configured to export logs, the system will wait after job completion for the logs to appear in the configured location. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime. NOTE: this integration will export stdout/stderr from the remote Databricks process automatically, so this option is not generally necessary.

\n

Default Value: False

\n
\n
max_completion_wait_time_seconds (dagster.IntSource, optional):
\n

If the Databricks job run takes more than this many seconds, then consider it failed and terminate the step.

\n

Default Value: 86400

\n
\n
poll_interval_sec (Float, optional):
\n

How frequently Dagster will poll Databricks to determine the state of the job.

\n

Default Value: 5.0

\n
\n
verbose_logs (Bool, optional):
\n

Determines whether to display debug logs emitted while job is being polled. It can be helpful for Dagster UI performance to set to False when running long-running or fan-out Databricks jobs, to avoid forcing the UI to fetch large amounts of debug logs.

\n

Default Value: True

\n
\n
add_dagster_env_variables (Bool, optional):
\n

Automatically add Dagster system environment variables. This option is only applicable when the code being executed is deployed on Dagster Cloud. It will be ignored when the environment variables provided by Dagster Cloud are not present.

\n

Default Value: True

\n
\n
\n

Resource for running ops as a Databricks Job.

\n

When this resource is used, the op will be executed in Databricks using the \u2018Run Submit\u2019\nAPI. Pipeline code will be zipped up and copied to a directory in DBFS along with the op\u2019s\nexecution context.

\n

Use the \u2018run_config\u2019 configuration to specify the details of the Databricks cluster used, and\nthe \u2018storage\u2019 key to configure persistent storage on that cluster. Storage is accessed by\nsetting the credentials in the Spark context, as documented here for S3 and here for ADLS.

\n
\n\n
\n
\n

Other\u00b6

\n
\n
\nclass dagster_databricks.DatabricksError[source]\u00b6
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_databricks.databricks_client ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Databricks host, e.g. https://uksouth.azuredatabricks.com

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Databricks access token

\n
\n
oauth_credentials (Union[strict dict, None], optional):
\n

Databricks OAuth credentials for using a service principal. See https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0

\n
\n
workspace_id (Union[dagster.StringSource, None], optional):
\n

DEPRECATED: The Databricks workspace ID, as described in https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids. This is no longer used and will be removed in 0.21.

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-databricks", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "N", "next"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-databricks.rst.txt", "title": "Databricks (dagster-databricks)", "toc": "\n"}, "dagster-datadog": {"alabaster_version": "0.7.13", "body": "
\n

Datadog (dagster-datadog)\u00b6

\n

This library provides an integration with Datadog, to support publishing metrics to Datadog from\nwithin Dagster ops.

\n

We use the Python datadogpy library. To use it, you\u2019ll\nfirst need to create a DataDog account and get both API and Application keys.

\n

The integration uses DogStatsD, so you\u2019ll need\nto ensure the datadog agent is running on the host you\u2019re sending metrics from.

\n
\n
\ndagster_datadog.DatadogResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/

\n
\n
app_key (dagster.StringSource):
\n

Datadog application key. See https://docs.datadoghq.com/account_management/api-app-keys/.

\n
\n
\n

This resource is a thin wrapper over the\ndogstatsd library.

\n

As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\nDataDog documentation for how to use this\nresource.

\n

Examples

\n
@op\ndef datadog_op(datadog_client: ResourceParam[DatadogClient]):\n    datadog_client.event('Man down!', 'This server needs assistance.')\n    datadog_client.gauge('users.online', 1001, tags=["protocol:http"])\n    datadog_client.increment('page.views')\n    datadog_client.decrement('page.views')\n    datadog_client.histogram('album.photo.count', 26, tags=["gender:female"])\n    datadog_client.distribution('album.photo.count', 26, tags=["color:blue"])\n    datadog_client.set('visitors.uniques', 999, tags=["browser:ie"])\n    datadog_client.service_check('svc.check_name', datadog_client.WARNING)\n    datadog_client.timing("query.response.time", 1234)\n\n    # Use timed decorator\n    @datadog_client.timed('run_fn')\n    def run_fn():\n        pass\n\n    run_fn()\n\n@job\ndef job_for_datadog_op() -> None:\n    datadog_op()\n\njob_for_datadog_op.execute_in_process(\n    resources={"datadog_client": DatadogResource(api_key="FOO", app_key="BAR")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_datadog.datadog_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/

\n
\n
app_key (dagster.StringSource):
\n

Datadog application key. See https://docs.datadoghq.com/account_management/api-app-keys/.

\n
\n
\n

This legacy resource is a thin wrapper over the\ndogstatsd library.

\n

Prefer using DatadogResource.

\n

As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\nDataDog documentation for how to use this\nresource.

\n

Examples

\n
@op(required_resource_keys={'datadog'})\ndef datadog_op(context):\n    dd = context.resources.datadog\n\n    dd.event('Man down!', 'This server needs assistance.')\n    dd.gauge('users.online', 1001, tags=["protocol:http"])\n    dd.increment('page.views')\n    dd.decrement('page.views')\n    dd.histogram('album.photo.count', 26, tags=["gender:female"])\n    dd.distribution('album.photo.count', 26, tags=["color:blue"])\n    dd.set('visitors.uniques', 999, tags=["browser:ie"])\n    dd.service_check('svc.check_name', dd.WARNING)\n    dd.timing("query.response.time", 1234)\n\n    # Use timed decorator\n    @dd.timed('run_fn')\n    def run_fn():\n        pass\n\n    run_fn()\n\n@job(resource_defs={'datadog': datadog_resource})\ndef dd_job():\n    datadog_op()\n\nresult = dd_job.execute_in_process(\n    run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-datadog", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-datahub/", "title": "Datahub (dagster-datahub)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-datahub", "Datahub (dagster-datahub)", "N", "next"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-datadog.rst.txt", "title": "Datadog (dagster-datadog)", "toc": "\n"}, "dagster-datahub": {"alabaster_version": "0.7.13", "body": "
\n

Datahub (dagster-datahub)\u00b6

\n

This library provides an integration with Datahub, to support pushing metadata to Datahub from\nwithin Dagster ops.

\n
\n

\n
\n

We use the Datahub Python Library. To use it, you\u2019ll\nfirst need to start up a Datahub Instance. Datahub Quickstart Guide.

\n
\n

\n
\n
\n
\ndagster_datahub.DatahubRESTEmitterResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (dagster.StringSource):
\n

Datahub GMS Server

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Personal Access Token

\n
\n
connect_timeout_sec (Union[Float, None], optional):
\n

\n
read_timeout_sec (Union[Float, None], optional):
\n

\n
retry_status_codes (Union[List[dagster.IntSource], None], optional):
\n

\n
retry_methods (Union[List[dagster.StringSource], None], optional):
\n

\n
retry_max_times (Union[dagster.IntSource, None], optional):
\n

\n
extra_headers (Union[dict, None], optional):
\n

\n
ca_certificate_path (Union[dagster.StringSource, None], optional):
\n

\n
server_telemetry_id (Union[dagster.StringSource, None], optional):
\n

\n
disable_ssl_verification (dagster.BoolSource, optional):
\n

Default Value: False

\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n
\ndagster_datahub.DatahubKafkaEmitterResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (strict dict):
\n
\nConfig Schema:
\n
bootstrap (dagster.StringSource):
\n

Kafka Bootstrap Servers. Comma delimited

\n
\n
schema_registry_url (dagster.StringSource):
\n

Schema Registry Location.

\n
\n
schema_registry_config (dict, optional):
\n

Extra Schema Registry Config.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
topic (Union[dagster.StringSource, None], optional):
\n

\n
topic_routes (dict, optional):
\n
\nDefault Value:
{\n    "MetadataChangeEvent": "MetadataChangeEvent_v4",\n    "MetadataChangeProposal": "MetadataChangeProposal_v1"\n}\n
\n
\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_datahub.datahub_rest_emitter ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (dagster.StringSource):
\n

Datahub GMS Server

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Personal Access Token

\n
\n
connect_timeout_sec (Union[Float, None], optional):
\n

\n
read_timeout_sec (Union[Float, None], optional):
\n

\n
retry_status_codes (Union[List[dagster.IntSource], None], optional):
\n

\n
retry_methods (Union[List[dagster.StringSource], None], optional):
\n

\n
retry_max_times (Union[dagster.IntSource, None], optional):
\n

\n
extra_headers (Union[dict, None], optional):
\n

\n
ca_certificate_path (Union[dagster.StringSource, None], optional):
\n

\n
server_telemetry_id (Union[dagster.StringSource, None], optional):
\n

\n
disable_ssl_verification (dagster.BoolSource, optional):
\n

Default Value: False

\n
\n
\n
\n\n
\n
\ndagster_datahub.datahub_kafka_emitter ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (strict dict):
\n
\nConfig Schema:
\n
bootstrap (dagster.StringSource):
\n

Kafka Bootstrap Servers. Comma delimited

\n
\n
schema_registry_url (dagster.StringSource):
\n

Schema Registry Location.

\n
\n
schema_registry_config (dict, optional):
\n

Extra Schema Registry Config.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
topic (Union[dagster.StringSource, None], optional):
\n

\n
topic_routes (dict, optional):
\n
\nDefault Value:
{\n    "MetadataChangeEvent": "MetadataChangeEvent_v4",\n    "MetadataChangeProposal": "MetadataChangeProposal_v1"\n}\n
\n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-datahub", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "N", "next"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-datahub.rst.txt", "title": "Datahub (dagster-datahub)", "toc": "\n"}, "dagster-dbt": {"alabaster_version": "0.7.13", "body": "
\n

dbt (dagster-dbt)\u00b6

\n

Dagster orchestrates dbt alongside other technologies, so you can combine dbt with Spark, Python,\netc. in a single workflow. Dagster\u2019s software-defined asset abstractions make it simple to define\ndata assets that depend on specific dbt models, or to define the computation required to compute\nthe sources that your dbt models depend on.

\n

Related documentation pages: dbt and\ndbt Cloud.

\n
\n

dagster-dbt\u00b6

\n
\n

dagster-dbt project scaffold\u00b6

\n

This command will initialize a new Dagster project and create directories and files that\nload assets from an existing dbt project.

\n
dagster-dbt project scaffold [OPTIONS]\n
\n
\n

Options

\n
\n
\n--project-name <project_name>\u00b6
\n

Required The name of the Dagster project to initialize for your dbt project.

\n
\n\n
\n
\n--dbt-project-dir <dbt_project_dir>\u00b6
\n

The path of your dbt project directory. This path must contain a dbt_project.yml file. By default, this command will assume that the current working directory contains a dbt project, but you can set a different directory by setting this option.

\n
\n\n
\n
\n
\n

dbt Core\u00b6

\n

Here, we provide interfaces to manage dbt projects invoked by the local dbt command line interface\n(dbt CLI).

\n
\n

Assets (dbt Core)\u00b6

\n
\n
\ndagster_dbt.load_assets_from_dbt_project(project_dir, profiles_dir=None, *, select=None, exclude=None, dagster_dbt_translator=None, io_manager_key=None, target_dir=None, key_prefix=None, source_key_prefix=None, op_name=None, runtime_metadata_fn=None, node_info_to_asset_key=<function default_asset_key_fn>, use_build_command=True, partitions_def=None, partition_key_to_vars_fn=None, node_info_to_group_fn=<function default_group_from_dbt_resource_props>, node_info_to_freshness_policy_fn=<function default_freshness_policy_fn>, node_info_to_auto_materialize_policy_fn=<function default_auto_materialize_policy_fn>, node_info_to_definition_metadata_fn=<function default_metadata_from_dbt_resource_props>, display_raw_sql=None, dbt_resource_key='dbt')[source]\u00b6
\n

Loads a set of dbt models from a dbt project into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run or dbt build command.

\n

When searching for more flexibility in defining the computations that materialize your\ndbt assets, we recommend that you use dbt_assets.

\n
\n
Parameters:
\n
    \n
  • project_dir (Optional[str]) \u2013 The directory containing the dbt project to load.

  • \n
  • profiles_dir (Optional[str]) \u2013 The profiles directory to use for loading the DBT project.\nDefaults to a directory called \u201cconfig\u201d inside the project_dir.

  • \n
  • target_dir (Optional[str]) \u2013 The target directory where dbt will place compiled artifacts.\nDefaults to \u201ctarget\u201d underneath the project_dir.

  • \n
  • select (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto include. Defaults to \u201cfqn:*\u201d.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto exclude. Defaults to \u201c\u201d.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 Allows customizing how to map\ndbt models, seeds, etc. to asset keys and asset metadata.

  • \n
  • key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all assets loaded\nfrom the dbt project. Does not apply to input assets. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=\u2026) instead.

  • \n
  • source_key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all input\nassets for the set of assets loaded from the dbt project. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=\u2026) instead.

  • \n
  • op_name (Optional[str]) \u2013 [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\nDeprecated: use the @dbt_assets decorator if you need to customize the op name.

  • \n
  • dbt_resource_key (Optional[str]) \u2013 [Deprecated] The resource key that the dbt resource will be specified at.\nDefaults to \u201cdbt\u201d. Deprecated: use the @dbt_assets decorator if you need to customize\nthe resource key.

  • \n
  • runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]) \u2013 [Deprecated]\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.\nDeprecated: use the @dbt_assets decorator if you need to customize runtime metadata.

  • \n
  • manifest_json (Optional[Mapping[str, Any]]) \u2013 [Deprecated] Use the manifest argument instead.

  • \n
  • selected_unique_ids (Optional[Set[str]]) \u2013 [Deprecated] The set of dbt unique_ids that you want to load\nas assets. Deprecated: use the select argument instead.

  • \n
  • node_info_to_asset_key (Mapping[str, Any] -> AssetKey) \u2013 [Deprecated] A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model. Deprecated: instead,\nprovide a custom DagsterDbtTranslator that overrides node_info_to_asset_key.

  • \n
  • use_build_command (bool) \u2013 Flag indicating if you want to use dbt build as the core computation\nfor this asset. Defaults to True. If set to False, then dbt run will be used, and\nseeds and snapshots won\u2019t be loaded as assets.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 [Deprecated] Defines the set of partition keys that\ncompose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\ndbt assets.

  • \n
  • partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]) \u2013 [Deprecated] A function to translate a given\npartition key (e.g. \u20182022-01-01\u2019) to a dictionary of vars to be passed into the dbt\ninvocation (e.g. {\u201crun_date\u201d: \u201c2022-01-01\u201d}). Deprecated: use the @dbt_assets decorator\nto define partitioned dbt assets.

  • \n
  • node_info_to_group_fn (Dict[str, Any] -> Optional[str]) \u2013 [Deprecated] A function that takes a\ndictionary of dbt node info and returns the group that this node should be assigned to.\nDeprecated: instead, configure dagster groups on a dbt resource\u2019s meta field or assign\ndbt groups.

  • \n
  • node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]) \u2013 [Deprecated] A function\nthat takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\nshould be applied to this node. By default, freshness policies will be created from\nconfig applied to dbt models, i.e.:\ndagster_freshness_policy={\u201cmaximum_lag_minutes\u201d: 60, \u201ccron_schedule\u201d: \u201c0 9 * * *\u201d}\nwill result in that model being assigned\nFreshnessPolicy(maximum_lag_minutes=60, cron_schedule=\u201d0 9 * * *\u201d). Deprecated:\ninstead, configure freshness policies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\nthat should be applied to this node. By default, AutoMaterializePolicies will be created from\nconfig applied to dbt models, i.e.:\ndagster_auto_materialize_policy={\u201ctype\u201d: \u201clazy\u201d} will result in that model being assigned\nAutoMaterializePolicy.lazy(). Deprecated: instead, configure auto-materialize\npolicies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns a dictionary\nof metadata to be attached to the corresponding definition. This is added to the default\nmetadata assigned to the node, which consists of the node\u2019s schema (if present).\nDeprecated: instead, provide a custom DagsterDbtTranslator that overrides\nnode_info_to_metadata.

  • \n
  • display_raw_sql (Optional[bool]) \u2013 [Deprecated] A flag to indicate if the raw sql associated\nwith each model should be included in the asset description. For large projects, setting\nthis flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\ninstead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_dbt.load_assets_from_dbt_manifest(manifest=None, *, select=None, exclude=None, io_manager_key=None, dagster_dbt_translator=None, key_prefix=None, source_key_prefix=None, selected_unique_ids=None, display_raw_sql=None, dbt_resource_key='dbt', op_name=None, manifest_json=None, use_build_command=True, partitions_def=None, partition_key_to_vars_fn=None, runtime_metadata_fn=None, node_info_to_asset_key=<function default_asset_key_fn>, node_info_to_group_fn=<function default_group_from_dbt_resource_props>, node_info_to_freshness_policy_fn=<function default_freshness_policy_fn>, node_info_to_auto_materialize_policy_fn=<function default_auto_materialize_policy_fn>, node_info_to_definition_metadata_fn=<function default_metadata_from_dbt_resource_props>)[source]\u00b6
\n

Loads a set of dbt models, described in a manifest.json, into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run command.

\n

When searching for more flexibility in defining the computations that materialize your\ndbt assets, we recommend that you use dbt_assets.

\n
\n
Parameters:
\n
    \n
  • manifest (Optional[Mapping[str, Any]]) \u2013 The contents of a DBT manifest.json, which contains\na set of models to load into assets.

  • \n
  • select (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto include. Defaults to \u201cfqn:*\u201d.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto exclude. Defaults to \u201c\u201d.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 Allows customizing how to map\ndbt models, seeds, etc. to asset keys and asset metadata.

  • \n
  • key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all assets loaded\nfrom the dbt project. Does not apply to input assets. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=\u2026) instead.

  • \n
  • source_key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all input\nassets for the set of assets loaded from the dbt project. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=\u2026) instead.

  • \n
  • op_name (Optional[str]) \u2013 [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\nDeprecated: use the @dbt_assets decorator if you need to customize the op name.

  • \n
  • dbt_resource_key (Optional[str]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator if you need to customize your resource key.) [Deprecated] The resource key that the dbt resource will be specified at.\nDefaults to \u201cdbt\u201d. Deprecated: use the @dbt_assets decorator if you need to customize\nthe resource key.

  • \n
  • runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator if you need to customize runtime metadata.) [Deprecated]\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.\nDeprecated: use the @dbt_assets decorator if you need to customize runtime metadata.

  • \n
  • selected_unique_ids (Optional[Set[str]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the select parameter instead.) [Deprecated] The set of dbt unique_ids that you want to load\nas assets. Deprecated: use the select argument instead.

  • \n
  • node_info_to_asset_key (Mapping[str, Any] -> AssetKey) \u2013 [Deprecated] A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model.

  • \n
  • use_build_command (bool) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator if you need to customize the underlying dbt commands.) Flag indicating if you want to use dbt build as the core computation\nfor this asset. Defaults to True. If set to False, then dbt run will be used, and\nseeds and snapshots won\u2019t be loaded as assets.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator to define partitioned dbt assets.) [Deprecated] Defines the set of partition keys that\ncompose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\ndbt assets.

  • \n
  • partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator to define partitioned dbt assets.) [Deprecated] A function to translate a given\npartition key (e.g. \u20182022-01-01\u2019) to a dictionary of vars to be passed into the dbt\ninvocation (e.g. {\u201crun_date\u201d: \u201c2022-01-01\u201d}). Deprecated: use the @dbt_assets decorator\nto define partitioned dbt assets.

  • \n
  • node_info_to_group_fn (Dict[str, Any] -> Optional[str]) \u2013 [Deprecated] A function that takes a\ndictionary of dbt node info and returns the group that this node should be assigned to.\nDeprecated: instead, configure dagster groups on a dbt resource\u2019s meta field or assign\ndbt groups.

  • \n
  • node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]) \u2013 [Deprecated] A function\nthat takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\nshould be applied to this node. By default, freshness policies will be created from\nconfig applied to dbt models, i.e.:\ndagster_freshness_policy={\u201cmaximum_lag_minutes\u201d: 60, \u201ccron_schedule\u201d: \u201c0 9 * * *\u201d}\nwill result in that model being assigned\nFreshnessPolicy(maximum_lag_minutes=60, cron_schedule=\u201d0 9 * * *\u201d). Deprecated:\ninstead, configure freshness policies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns a AutoMaterializePolicy\nthat should be applied to this node. By default, AutoMaterializePolicies will be created from\nconfig applied to dbt models, i.e.:\ndagster_auto_materialize_policy={\u201ctype\u201d: \u201clazy\u201d} will result in that model being assigned\nAutoMaterializePolicy.lazy(). Deprecated: instead, configure auto-materialize\npolicies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns a dictionary\nof metadata to be attached to the corresponding definition. This is added to the default\nmetadata assigned to the node, which consists of the node\u2019s schema (if present).\nDeprecated: instead, provide a custom DagsterDbtTranslator that overrides\nnode_info_to_metadata.

  • \n
  • display_raw_sql (Optional[bool]) \u2013 [Deprecated] A flag to indicate if the raw sql associated\nwith each model should be included in the asset description. For large projects, setting\nthis flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\ninstead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster_dbt.dbt_assets(*, manifest, select='fqn:*', exclude=None, io_manager_key=None, partitions_def=None, dagster_dbt_translator=<dagster_dbt.dagster_dbt_translator.DagsterDbtTranslator object>)[source]\u00b6
\n

Create a definition for how to compute a set of dbt resources, described by a manifest.json.\nWhen invoking dbt commands using DbtCliResource\u2019s\ncli() method, Dagster events are emitted by calling\nyield from on the event stream returned by stream().

\n
\n
Parameters:
\n
    \n
  • manifest (Union[Mapping[str, Any], str, Path]) \u2013 The contents of a manifest.json file\nor the path to a manifest.json file. A manifest.json contains a representation of a\ndbt project (models, tests, macros, etc). We use this representation to create\ncorresponding Dagster assets.

  • \n
  • select (str) \u2013 A dbt selection string for the models in a project that you want\nto include. Defaults to fqn:*.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto exclude. Defaults to \u201c\u201d.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the dbt assets.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 Allows customizing how to map\ndbt models, seeds, etc. to asset keys and asset metadata.

  • \n
\n
\n
\n

Examples

\n

Running dbt build for a dbt project:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["build"], context=context).stream()\n
\n
\n

Running dbt commands with flags:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["build", "--full-refresh"], context=context).stream()\n
\n
\n

Running dbt commands with --vars:

\n
import json\nfrom pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_vars = {"key": "value"}\n\n    yield from dbt.cli(["build", "--vars", json.dumps(dbt_vars)], context=context).stream()\n
\n
\n

Retrieving dbt artifacts after running a dbt command:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_build_invocation = dbt.cli(["build"], context=context)\n\n    yield from dbt_build_invocation.stream()\n\n    run_results_json = dbt_build_invocation.get_artifact("run_results.json")\n
\n
\n

Running multiple dbt commands for a dbt project:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["run"], context=context).stream()\n    yield from dbt.cli(["test"], context=context).stream()\n
\n
\n

Customizing the Dagster asset metadata inferred from a dbt project using DagsterDbtTranslator:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    ...\n\n\n@dbt_assets(\n    manifest=Path("target", "manifest.json"),\n    dagster_dbt_translator=CustomDagsterDbtTranslator(),\n)\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["build"], context=context).stream()\n
\n
\n

Invoking another Dagster ResourceDefinition alongside dbt:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\nfrom dagster_slack import SlackResource\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, slack: SlackResource):\n    yield from dbt.cli(["build"], context=context).stream()\n\n    slack_client = slack.get_client()\n    slack_client.chat_postMessage(channel="#my-channel", text="dbt build succeeded!")\n
\n
\n

Defining and accessing Dagster Config alongside dbt:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext, Config\nfrom dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\nclass MyDbtConfig(Config):\n    full_refresh: bool\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, config: MyDbtConfig):\n    dbt_build_args = ["build"]\n    if config.full_refresh:\n        dbt_build_args += ["--full-refresh"]\n\n    yield from dbt.cli(dbt_build_args, context=context).stream()\n
\n
\n

Defining Dagster PartitionsDefinition alongside dbt:

\n
import json\nfrom pathlib import Path\n\nfrom dagster import AssetExecutionContext, DailyPartitionsDefinition\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(\n    manifest=Path("target", "manifest.json"),\n    partitions_def=DailyPartitionsDefinition(start_date="2023-01-01")\n)\ndef partitionshop_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    time_window = context.asset_partitions_time_window_for_output(\n        list(context.selected_output_names)[0]\n    )\n\n    dbt_vars = {\n        "min_date": time_window.start.isoformat(),\n        "max_date": time_window.end.isoformat()\n    }\n    dbt_build_args = ["build", "--vars", json.dumps(dbt_vars)]\n\n    yield from dbt.cli(dbt_build_args, context=context).stream()\n
\n
\n
\n\n
\n
\nclass dagster_dbt.DagsterDbtTranslator[source]\u00b6
\n

Holds a set of methods that derive Dagster asset definition metadata given a representation\nof a dbt resource (models, tests, sources, etc).

\n

This class is exposed so that methods can be overridden to customize how Dagster asset metadata\nis derived.

\n
\n
\nclassmethod get_asset_key(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster asset key that represents that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom asset key for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

The Dagster asset key for the dbt resource.

\n
\n
Return type:
\n

AssetKey

\n
\n
\n

Examples

\n

Adding a prefix to the default asset key generated for each dbt resource:

\n
from typing import Any, Mapping\n\nfrom dagster import AssetKey\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n        return super().get_asset_key(dbt_resource_props).with_prefix("prefix")\n
\n
\n

Adding a prefix to the default asset key generated for each dbt resource, but only for dbt sources:

\n
from typing import Any, Mapping\n\nfrom dagster import AssetKey\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n        asset_key = super().get_asset_key(dbt_resource_props)\n\n        if dbt_resource_props["resource_type"] == "source":\n            asset_key = asset_key.with_prefix("my_prefix")\n\n        return asset_key\n
\n
\n
\n\n
\n
\nclassmethod get_auto_materialize_policy(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster dagster.AutoMaterializePolicy for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom auto-materialize policy for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A Dagster auto-materialize policy.

\n
\n
Return type:
\n

Optional[AutoMaterializePolicy]

\n
\n
\n

Examples

\n

Set a custom auto-materialize policy for all dbt resources:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import AutoMaterializePolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n        return AutoMaterializePolicy.eager()\n
\n
\n

Set a custom auto-materialize policy for dbt resources with a specific tag:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import AutoMaterializePolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n        auto_materialize_policy = None\n        if "my_custom_tag" in dbt_resource_props.get("tags", []):\n            auto_materialize_policy = AutoMaterializePolicy.eager()\n\n        return auto_materialize_policy\n
\n
\n
\n\n
\n
\nclassmethod get_description(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster description for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom description for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

The description for the dbt resource.

\n
\n
Return type:
\n

str

\n
\n
\n

Examples

\n
from typing import Any, Mapping\n\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n        return "custom description"\n
\n
\n
\n\n
\n
\nclassmethod get_freshness_policy(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster dagster.FreshnessPolicy for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom freshness policy for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A Dagster freshness policy.

\n
\n
Return type:
\n

Optional[FreshnessPolicy]

\n
\n
\n

Examples

\n

Set a custom freshness policy for all dbt resources:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import FreshnessPolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n        return FreshnessPolicy(maximum_lag_minutes=60)\n
\n
\n

Set a custom freshness policy for dbt resources with a specific tag:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import FreshnessPolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n        freshness_policy = None\n        if "my_custom_tag" in dbt_resource_props.get("tags", []):\n            freshness_policy = FreshnessPolicy(maximum_lag_minutes=60)\n\n        return freshness_policy\n
\n
\n
\n\n
\n
\nclassmethod get_group_name(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster group name for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom group name for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A Dagster group name.

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n

Examples

\n
from typing import Any, Mapping, Optional\n\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n        return "custom_group_prefix" + dbt_resource_props.get("config", {}).get("group")\n
\n
\n
\n\n
\n
\nclassmethod get_metadata(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster metadata for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide custom metadata for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A dictionary representing the Dagster metadata for the dbt resource.

\n
\n
Return type:
\n

Mapping[str, Any]

\n
\n
\n

Examples

\n
from typing import Any, Mapping\n\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n        return {"custom": "metadata"}\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtManifestAssetSelection(manifest, select='fqn:*', *, dagster_dbt_translator=None, exclude=None)[source]\u00b6
\n

Defines a selection of assets from a dbt manifest wrapper and a dbt selection string.

\n
\n
Parameters:
\n
    \n
  • manifest (Mapping[str, Any]) \u2013 The dbt manifest blob.

  • \n
  • select (str) \u2013 A dbt selection string to specify a set of dbt resources.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string to exclude a set of dbt resources.

  • \n
\n
\n
\n

Examples

\n
import json\nfrom pathlib import Path\n\nfrom dagster_dbt import DbtManifestAssetSelection\n\nmanifest = json.loads(Path("path/to/manifest.json").read_text())\n\n# select the dbt assets that have the tag "foo".\nmy_selection = DbtManifestAssetSelection(manifest=manifest, select="tag:foo")\n
\n
\n
\n\n
\n
\ndagster_dbt.build_dbt_asset_selection(dbt_assets, dbt_select='fqn:*', dbt_exclude=None)[source]\u00b6
\n

Build an asset selection for a dbt selection string.

\n

See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\nmore information.

\n
\n
Parameters:
\n
    \n
  • dbt_select (str) \u2013 A dbt selection string to specify a set of dbt resources.

  • \n
  • dbt_exclude (Optional[str]) \u2013 A dbt selection string to exclude a set of dbt resources.

  • \n
\n
\n
Returns:
\n

An asset selection for the selected dbt nodes.

\n
\n
Return type:
\n

AssetSelection

\n
\n
\n

Examples

\n
from dagster_dbt import dbt_assets, build_dbt_asset_selection\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n# Select the dbt assets that have the tag "foo".\nfoo_selection = build_dbt_asset_selection([all_dbt_assets], dbt_select="tag:foo")\n\n# Select the dbt assets that have the tag "foo" and all Dagster assets downstream\n# of them (dbt-related or otherwise)\nfoo_and_downstream_selection = foo_selection.downstream()\n
\n
\n
\n\n
\n
\ndagster_dbt.build_schedule_from_dbt_selection(dbt_assets, job_name, cron_schedule, dbt_select='fqn:*', dbt_exclude=None, tags=None, config=None, execution_timezone=None)[source]\u00b6
\n

Build a schedule to materialize a specified set of dbt resources from a dbt selection string.

\n

See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\nmore information.

\n
\n
Parameters:
\n
    \n
  • job_name (str) \u2013 The name of the job to materialize the dbt resources.

  • \n
  • cron_schedule (str) \u2013 The cron schedule to define the schedule.

  • \n
  • dbt_select (str) \u2013 A dbt selection string to specify a set of dbt resources.

  • \n
  • dbt_exclude (Optional[str]) \u2013 A dbt selection string to exclude a set of dbt resources.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • config (Optional[RunConfig]) \u2013 The config that parameterizes the execution of this schedule.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
\n
\n
Returns:
\n

A definition to materialize the selected dbt resources on a cron schedule.

\n
\n
Return type:
\n

ScheduleDefinition

\n
\n
\n

Examples

\n
from dagster_dbt import dbt_assets, build_schedule_from_dbt_selection\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\ndaily_dbt_assets_schedule = build_schedule_from_dbt_selection(\n    [all_dbt_assets],\n    job_name="all_dbt_assets",\n    cron_schedule="0 0 * * *",\n    dbt_select="fqn:*",\n)\n
\n
\n
\n\n
\n
\ndagster_dbt.get_asset_key_for_model(dbt_assets, model_name)[source]\u00b6
\n

Return the corresponding Dagster asset key for a dbt model.

\n
\n
Parameters:
\n
    \n
  • dbt_assets (AssetsDefinition) \u2013 An AssetsDefinition object produced by\nload_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets.

  • \n
  • model_name (str) \u2013 The name of the dbt model.

  • \n
\n
\n
Returns:
\n

The corresponding Dagster asset key.

\n
\n
Return type:
\n

AssetKey

\n
\n
\n

Examples

\n
from dagster import asset\nfrom dagster_dbt import dbt_assets, get_asset_key_for_model\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n\n@asset(deps={get_asset_key_for_model([all_dbt_assets], "customers")})\ndef cleaned_customers():\n    ...\n
\n
\n
\n\n
\n
\ndagster_dbt.get_asset_key_for_source(dbt_assets, source_name)[source]\u00b6
\n

Returns the corresponding Dagster asset key for a dbt source with a singular table.

\n
\n
Parameters:
\n

source_name (str) \u2013 The name of the dbt source.

\n
\n
Raises:
\n

DagsterInvalidInvocationError \u2013 If the source has more than one table.

\n
\n
Returns:
\n

The corresponding Dagster asset key.

\n
\n
Return type:
\n

AssetKey

\n
\n
\n

Examples

\n
from dagster import asset\nfrom dagster_dbt import dbt_assets, get_asset_key_for_source\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n@asset(key=get_asset_key_for_source([all_dbt_assets], "my_source"))\ndef upstream_python_asset():\n    ...\n
\n
\n
\n\n
\n
\ndagster_dbt.get_asset_keys_by_output_name_for_source(dbt_assets, source_name)[source]\u00b6
\n

Returns the corresponding Dagster asset keys for all tables in a dbt source.

\n

This is a convenience method that makes it easy to define a multi-asset that generates\nall the tables for a given dbt source.

\n
\n
Parameters:
\n

source_name (str) \u2013 The name of the dbt source.

\n
\n
Returns:
\n

\n
A mapping of the table name to corresponding Dagster asset key

for all tables in the given dbt source.

\n
\n
\n

\n
\n
Return type:
\n

Mapping[str, AssetKey]

\n
\n
\n

Examples

\n
from dagster import AssetOut, multi_asset\nfrom dagster_dbt import dbt_assets, get_asset_keys_by_output_name_for_source\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n@multi_asset(\n    outs={\n        name: AssetOut(key=asset_key)\n        for name, asset_key in get_asset_keys_by_output_name_for_source(\n            [all_dbt_assets], "raw_data"\n        ).items()\n    },\n)\ndef upstream_python_asset():\n    ...\n
\n
\n
\n\n
\n
\n

Resources (dbt Core)\u00b6

\n
\n

CLI Resource\u00b6

\n
\n
\nclass dagster_dbt.DbtCliResource(*, project_dir, global_config_flags=[], profiles_dir=None, profile=None, target=None)[source]\u00b6
\n

A resource used to execute dbt CLI commands.

\n
\n
\nproject_dir\u00b6
\n

The path to the dbt project directory. This directory should contain a\ndbt_project.yml. See https://docs.getdbt.com/reference/dbt_project.yml for more\ninformation.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nglobal_config_flags\u00b6
\n

A list of global flags configuration to pass to the dbt CLI\ninvocation. See https://docs.getdbt.com/reference/global-configs for a full list of\nconfiguration.

\n
\n
Type:
\n

List[str]

\n
\n
\n
\n\n
\n
\nprofiles_dir\u00b6
\n

The path to the directory containing your dbt profiles.yml.\nBy default, the current working directory is used, which is the dbt project directory.\nSee https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\ninformation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nprofile\u00b6
\n

The profile from your dbt profiles.yml to use for execution. See\nhttps://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\ninformation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ntarget\u00b6
\n

The target from your dbt profiles.yml to use for execution. See\nhttps://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\ninformation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n

Examples

\n

Creating a dbt resource with only a reference to project_dir:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n
\n
\n

Creating a dbt resource with a custom profiles_dir:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(\n    project_dir="/path/to/dbt/project",\n    profiles_dir="/path/to/dbt/project/profiles",\n)\n
\n
\n

Creating a dbt resource with a custom profile and target:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(\n    project_dir="/path/to/dbt/project",\n    profiles_dir="/path/to/dbt/project/profiles",\n    profile="jaffle_shop",\n    target="dev",\n)\n
\n
\n

Creating a dbt resource with global configs, e.g. disabling colored logs with --no-use-color:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(\n    project_dir="/path/to/dbt/project",\n    global_config_flags=["--no-use-color"],\n)\n
\n
\n
\n
\ncli(args, *, raise_on_error=True, manifest=None, dagster_dbt_translator=None, context=None)[source]\u00b6
\n

Create a subprocess to execute a dbt CLI command.

\n
\n
Parameters:
\n
    \n
  • args (List[str]) \u2013 The dbt CLI command to execute.

  • \n
  • raise_on_error (bool) \u2013 Whether to raise an exception if the dbt CLI command fails.

  • \n
  • manifest (Optional[Union[Mapping[str, Any], str, Path]]) \u2013 The dbt manifest blob. If an\nexecution context from within @dbt_assets is provided to the context argument,\nthen the manifest provided to @dbt_assets will be used.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 The translator to link dbt\nnodes to Dagster assets. If an execution context from within @dbt_assets is\nprovided to the context argument, then the dagster_dbt_translator provided to\n@dbt_assets will be used.

  • \n
  • context (Optional[OpExecutionContext]) \u2013 The execution context from within @dbt_assets.

  • \n
\n
\n
Returns:
\n

\n
An invocation instance that can be used to retrieve the output of the

dbt CLI command.

\n
\n
\n

\n
\n
Return type:
\n

DbtCliInvocation

\n
\n
\n

Examples

\n

Streaming Dagster events for dbt asset materializations and observations:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["run"], context=context).stream()\n
\n
\n

Retrieving a dbt artifact after streaming the Dagster events:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_run_invocation = dbt.cli(["run"], context=context)\n\n    yield from dbt_run_invocation.stream()\n\n    # Retrieve the `run_results.json` dbt artifact as a dictionary:\n    run_results_json = dbt_run_invocation.get_artifact("run_results.json")\n\n    # Retrieve the `run_results.json` dbt artifact as a file path:\n    run_results_path = dbt_run_invocation.target_path.joinpath("run_results.json")\n
\n
\n

Customizing the asset materialization metadata when streaming the Dagster events:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext, Output\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_cli_invocation = dbt.cli(["run"], context=context)\n\n    for dbt_event in dbt_cli_invocation.stream_raw_events():\n        for dagster_event in dbt_event.to_default_asset_events(manifest=dbt_cli_invocation.manifest):\n            if isinstance(dagster_event, Output):\n                context.add_output_metadata(\n                    metadata={\n                        "my_custom_metadata": "my_custom_metadata_value",\n                    },\n                    output_name=dagster_event.output_name,\n                )\n\n            yield dagster_event\n
\n
\n

Suppressing exceptions from a dbt CLI command when a non-zero exit code is returned:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_run_invocation = dbt.cli(["run"], context=context, raise_on_error=False)\n\n    if dbt_run_invocation.is_successful():\n        yield from dbt_run_invocation.stream()\n    else:\n        ...\n
\n
\n

Invoking a dbt CLI command in a custom asset or op:

\n
import json\n\nfrom dagster import asset, op\nfrom dagster_dbt import DbtCliResource\n\n\n@asset\ndef my_dbt_asset(dbt: DbtCliResource):\n    dbt_macro_args = {"key": "value"}\n    dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n\n\n@op\ndef my_dbt_op(dbt: DbtCliResource):\n    dbt_macro_args = {"key": "value"}\n    dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtCliInvocation(process, manifest, dagster_dbt_translator, project_dir, target_path, raise_on_error)[source]\u00b6
\n

The representation of an invoked dbt command.

\n
\n
Parameters:
\n
    \n
  • process (subprocess.Popen) \u2013 The process running the dbt command.

  • \n
  • manifest (Mapping[str, Any]) \u2013 The dbt manifest blob.

  • \n
  • project_dir (Path) \u2013 The path to the dbt project.

  • \n
  • target_path (Path) \u2013 The path to the dbt target folder.

  • \n
  • raise_on_error (bool) \u2013 Whether to raise an exception if the dbt command fails.

  • \n
\n
\n
\n
\n
\nget_artifact(artifact)[source]\u00b6
\n

Retrieve a dbt artifact from the target path.

\n

See https://docs.getdbt.com/reference/artifacts/dbt-artifacts for more information.

\n
\n
Parameters:
\n

artifact (Union[Literal["manifest.json"], Literal["catalog.json"], Literal["run_results.json"], Literal["sources.json"]]) \u2013 The name of the artifact to retrieve.

\n
\n
Returns:
\n

The artifact as a dictionary.

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
\n

Examples

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\ndbt_cli_invocation = dbt.cli(["run"]).wait()\n\n# Retrieve the run_results.json artifact.\nrun_results = dbt_cli_invocation.get_artifact("run_results.json")\n
\n
\n
\n\n
\n
\nis_successful()[source]\u00b6
\n

Return whether the dbt CLI process completed successfully.

\n
\n
Returns:
\n

True, if the dbt CLI process returns with a zero exit code, and False otherwise.

\n
\n
Return type:
\n

bool

\n
\n
\n

Examples

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\ndbt_cli_invocation = dbt.cli(["run"], raise_on_error=False)\n\nif dbt_cli_invocation.is_successful():\n    ...\n
\n
\n
\n\n
\n
\nstream()[source]\u00b6
\n

Stream the events from the dbt CLI process and convert them to Dagster events.

\n
\n
Returns:
\n

\n
A set of corresponding Dagster events.
    \n
  • Output for refables (e.g. models, seeds, snapshots.)

  • \n
  • AssetObservation for dbt test results that are not enabled as asset checks.

  • \n
  • AssetCheckResult for dbt test results that are enabled as asset checks.

  • \n
\n
\n
\n

\n
\n
Return type:
\n

Iterator[Union[Output, AssetObservation, AssetCheckResult]]

\n
\n
\n

Examples

\n
from pathlib import Path\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context, dbt: DbtCliResource):\n    yield from dbt.cli(["run"], context=context).stream()\n
\n
\n
\n\n
\n
\nstream_raw_events()[source]\u00b6
\n

Stream the events from the dbt CLI process.

\n
\n
Returns:
\n

An iterator of events from the dbt CLI process.

\n
\n
Return type:
\n

Iterator[DbtCliEventMessage]

\n
\n
\n
\n\n
\n
\nwait()[source]\u00b6
\n

Wait for the dbt CLI process to complete.

\n
\n
Returns:
\n

The current representation of the dbt CLI invocation.

\n
\n
Return type:
\n

DbtCliInvocation

\n
\n
\n

Examples

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\ndbt_cli_invocation = dbt.cli(["run"]).wait()\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtCliEventMessage(raw_event)[source]\u00b6
\n

The representation of a dbt CLI event.

\n
\n
Parameters:
\n

raw_event (Dict[str, Any]) \u2013 The raw event dictionary.\nSee https://docs.getdbt.com/reference/events-logging#structured-logging for more\ninformation.

\n
\n
\n
\n
\nto_default_asset_events(manifest, dagster_dbt_translator=<dagster_dbt.dagster_dbt_translator.DagsterDbtTranslator object>)[source]\u00b6
\n

Convert a dbt CLI event to a set of corresponding Dagster events.

\n
\n
Parameters:
\n
    \n
  • manifest (Union[Mapping[str, Any], str, Path]) \u2013 The dbt manifest blob.

  • \n
  • dagster_dbt_translator (DagsterDbtTranslator) \u2013 Optionally, a custom translator for\nlinking dbt nodes to Dagster assets.

  • \n
\n
\n
Returns:
\n

\n
A set of corresponding Dagster events.
    \n
  • Output for refables (e.g. models, seeds, snapshots.)

  • \n
  • AssetObservation for dbt test results that are not enabled as asset checks.

  • \n
  • AssetCheckResult for dbt test results that are enabled as asset checks.

  • \n
\n
\n
\n

\n
\n
Return type:
\n

Iterator[Union[Output, AssetObservation, AssetCheckResult]]

\n
\n
\n
\n\n
\n\n
\n
\n

Deprecated (dbt Core)\u00b6

\n
\n
\nclass dagster_dbt.DbtCliOutput(command, return_code, raw_output, logs, result, docs_url=None)[source]\u00b6
\n

The results of executing a dbt command, along with additional metadata about the dbt CLI\nprocess that was run.

\n

This class is deprecated, because it\u2019s only produced by methods of the DbtCliClientResource class,\nwhich is deprecated in favor of DbtCliResource.

\n

Note that users should not construct instances of this class directly. This class is intended\nto be constructed from the JSON output of dbt commands.

\n
\n
\ncommand\u00b6
\n

The full shell command that was executed.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nreturn_code\u00b6
\n

The return code of the dbt CLI process.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nraw_output\u00b6
\n

The raw output (stdout) of the dbt CLI process.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nlogs\u00b6
\n

List of parsed JSON logs produced by the dbt command.

\n
\n
Type:
\n

List[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresult\u00b6
\n

Dictionary containing dbt-reported result information\ncontained in run_results.json. Some dbt commands do not produce results, and will\ntherefore have result = None.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\ndocs_url\u00b6
\n

Hostname where dbt docs are being served for this project.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\ndagster_dbt.dbt_cli_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project_dir (dagster.StringSource, optional):
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles_dir (Union[dagster.StringSource, None], optional):
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (Union[dagster.StringSource, None], optional):
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (Union[dagster.StringSource, None], optional):
\n

Which target to load for the given profile.

\n
\n
vars (Union[dict, None], optional):
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass_cache (dagster.BoolSource, optional):
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn_error (dagster.BoolSource, optional):
\n

If dbt would normally warn, instead raise an exception. Examples include --models that select nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional):
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (dagster.BoolSource, optional):
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target_path (dagster.StringSource, optional):
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (Union[dagster.StringSource, None], optional):
\n

The url for where dbt docs are being served for this project.

\n
\n
json_log_format (dagster.BoolSource, optional):
\n

When True, dbt will be invoked with the --log-format json flag, allowing Dagster to parse the log messages and emit simpler log messages to the event log.

\n

Default Value: True

\n
\n
capture_logs (dagster.BoolSource, optional):
\n

When True, dbt will be invoked with the --capture-output flag, allowing Dagster to capture the logs and emit them to the event log.

\n

Default Value: True

\n
\n
debug (dagster.BoolSource, optional):
\n

When True, dbt will be invoked with the --debug flag, which will print additional debug information to the console.

\n

Default Value: False

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 0.21. Use DbtCliResource instead.\n \n

\n

This resource issues dbt CLI commands against a configured dbt project. It is deprecated\nin favor of DbtCliResource.

\n
\n\n
\n
\n
\n

Ops (dbt Core)\u00b6

\n

If you\u2019re using asset-based dbt APIs like load_assets_from_dbt_project, you usually will not also use the below op-based APIs.

\n

dagster_dbt provides a set of pre-built ops that work with the CLI. For more advanced use cases,\nwe suggest building your own ops which directly interact with these resources.

\n
\n
\ndagster_dbt.dbt_run_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes. Default: True

\n

Default Value: True

\n
\n
asset_key_prefix (Union[List[dagster.StringSource], None], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

This op executes a dbt run command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_run_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_run_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_compile_op(context)[source]\u00b6
\n

This op executes a dbt compile command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_compile_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_compile_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_ls_op(context)[source]\u00b6
\n

This op executes a dbt ls command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_ls_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_ls_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_test_op(context)[source]\u00b6
\n

This op executes a dbt test command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_test_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_test_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_snapshot_op(context)[source]\u00b6
\n

This op executes a dbt snapshot command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_snapshot_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_snapshot_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_seed_op(context)[source]\u00b6
\n

This op executes a dbt seed command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_seed_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_seed_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_docs_generate_op(context)[source]\u00b6
\n

This op executes a dbt docs generate command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_docs_generate_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_docs_generate_op()\n
\n
\n
\n\n
\n
\n
\n

dbt Cloud\u00b6

\n

Here, we provide interfaces to manage dbt projects invoked by the hosted dbt Cloud service.

\n
\n

Assets (dbt Cloud)\u00b6

\n
\n
\ndagster_dbt.load_assets_from_dbt_cloud_job(dbt_cloud, job_id, node_info_to_asset_key=<function default_asset_key_fn>, node_info_to_group_fn=<function default_group_from_dbt_resource_props>, node_info_to_freshness_policy_fn=<function default_freshness_policy_fn>, node_info_to_auto_materialize_policy_fn=<function default_auto_materialize_policy_fn>, partitions_def=None, partition_key_to_vars_fn=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Loads a set of dbt models, managed by a dbt Cloud job, into Dagster assets. In order to\ndetermine the set of dbt models, the project is compiled to generate the necessary artifacts\nthat define the dbt models and their dependencies.

\n

One Dagster asset is created for each dbt model.

\n
\n
Parameters:
\n
    \n
  • dbt_cloud (ResourceDefinition) \u2013 The dbt Cloud resource to use to connect to the dbt Cloud API.

  • \n
  • job_id (int) \u2013 The ID of the dbt Cloud job to load assets from.

  • \n
  • node_info_to_asset_key \u2013 (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\nof dbt metadata and returns the AssetKey that you want to represent a given model or\nsource. By default: dbt model -> AssetKey([model_name]) and\ndbt source -> AssetKey([source_name, table_name])

  • \n
  • node_info_to_group_fn (Dict[str, Any] -> Optional[str]) \u2013 A function that takes a\ndictionary of dbt node info and returns the group that this node should be assigned to.

  • \n
  • node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]) \u2013 A function\nthat takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\nshould be applied to this node. By default, freshness policies will be created from\nconfig applied to dbt models, i.e.:\ndagster_freshness_policy={\u201cmaximum_lag_minutes\u201d: 60, \u201ccron_schedule\u201d: \u201c0 9 * * *\u201d}\nwill result in that model being assigned\nFreshnessPolicy(maximum_lag_minutes=60, cron_schedule=\u201d0 9 * * *\u201d)

  • \n
  • node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]) \u2013 A function that takes a dictionary of dbt node info and optionally returns an AutoMaterializePolicy\nthat should be applied to this node. By default, AutoMaterializePolicies will be created from\nconfig applied to dbt models, i.e.:\ndagster_auto_materialize_policy={\u201ctype\u201d: \u201clazy\u201d} will result in that model being assigned\nAutoMaterializePolicy.lazy()

  • \n
  • node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]) \u2013 A function that takes a dictionary of dbt node info and optionally returns a dictionary\nof metadata to be attached to the corresponding definition. This is added to the default\nmetadata assigned to the node, which consists of the node\u2019s schema (if present).

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) Defines the set of partition keys that\ncompose the dbt assets.

  • \n
  • partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) A function to translate a given\npartition key (e.g. \u20182022-01-01\u2019) to a dictionary of vars to be passed into the dbt\ninvocation (e.g. {\u201crun_date\u201d: \u201c2022-01-01\u201d})

  • \n
\n
\n
Returns:
\n

A definition for the loaded assets.

\n
\n
Return type:
\n

CacheableAssetsDefinition

\n
\n
\n

Examples

\n
from dagster import repository\nfrom dagster_dbt import dbt_cloud_resource, load_assets_from_dbt_cloud_job\n\nDBT_CLOUD_JOB_ID = 1234\n\ndbt_cloud = dbt_cloud_resource.configured(\n    {\n        "auth_token": {"env": "DBT_CLOUD_API_TOKEN"},\n        "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n    }\n)\n\ndbt_cloud_assets = load_assets_from_dbt_cloud_job(\n    dbt_cloud=dbt_cloud, job_id=DBT_CLOUD_JOB_ID\n)\n\n\n@repository\ndef dbt_cloud_sandbox():\n    return [dbt_cloud_assets]\n
\n
\n
\n\n
\n
\n

Ops (dbt Cloud)\u00b6

\n
\n
\ndagster_dbt.dbt_cloud_run_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_id (dagster.IntSource):
\n

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (Float, optional):
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time (in seconds) that will be waited before this operation is timed out. By default, this will never time out.

\n
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[dagster.StringSource], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\nfails or is otherwise stopped before succeeding, a dagster.Failure exception will be raised,\nand this op will fail.

\n

It requires the use of a \u2018dbt_cloud\u2019 resource, which is used to connect to the dbt Cloud API.

\n

Config Options:

\n
\n
job_id (int)

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\npage of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\nhttps://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (float)

The time (in seconds) that will be waited between successive polls. Defaults to 10.

\n
\n
poll_timeout (float)

The maximum time (in seconds) that will be waited before this operation is timed out. By\ndefault, this will never time out.

\n
\n
yield_materializations (bool)

If True, materializations corresponding to the results of the dbt operation will be\nyielded when the op executes. Defaults to True.

\n
\n
asset_key_prefix (List[str])

If provided and yield_materializations is True, these components will be used to\nprefix the generated asset keys. Defaults to [\u201cdbt\u201d].

\n
\n
\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n)\nrun_dbt_nightly_sync = dbt_cloud_run_op.configured(\n    {"job_id": 54321}, name="run_dbt_nightly_sync"\n)\n\n@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\ndef dbt_cloud():\n    run_dbt_nightly_sync()\n
\n
\n
\n\n
\n
\n

Resources (dbt Cloud)\u00b6

\n
\n
\nclass dagster_dbt.DbtCloudClientResource(*, auth_token, account_id, disable_schedule_on_trigger=True, request_max_retries=3, request_retry_delay=0.25, dbt_cloud_host='https://cloud.getdbt.com/')[source]\u00b6
\n

This resource helps interact with dbt Cloud connectors.

\n
\n\n
\n

Deprecated (dbt Cloud)\u00b6

\n
\n
\ndagster_dbt.dbt_cloud_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
auth_token (dagster.StringSource):
\n

dbt Cloud API Token. User tokens can be found in the [dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for instructions on creating a Service Account token.

\n
\n
account_id (dagster.IntSource):
\n

dbt Cloud Account ID. This value can be found in the url of a variety of views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.

\n
\n
disable_schedule_on_trigger (dagster.BoolSource, optional):
\n

Specifies if you would like any job that is triggered using this resource to automatically disable its schedule.

\n

Default Value: True

\n
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the dbt Cloud API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
dbt_cloud_host (dagster.StringSource, optional):
\n

The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/).

\n

Default Value: \u2018https://cloud.getdbt.com/\u2019

\n
\n
\n

This resource allows users to programmatically interface with the dbt Cloud Administrative REST\nAPI (v2) to launch jobs and monitor their progress. This currently implements only a subset of\nthe functionality exposed by the API.

\n

For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\nresponse JSON schemas, see the dbt Cloud API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {\n        "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n        "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n    }\n)\n\n@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\ndef my_dbt_cloud_job():\n    ...\n
\n
\n
\n\n
\n
\n
\n
\n

Types\u00b6

\n
\n
\nclass dagster_dbt.DbtOutput(result)[source]\u00b6
\n

Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, result, which\nrepresents the dbt-formatted result of the command that was run (if any).

\n

Used internally, should not be instantiated directly by the user.

\n
\n\n
\n
\nclass dagster_dbt.DbtResource(logger=None)[source]\u00b6
\n
\n\n
\n
\n

Errors\u00b6

\n
\n
\nexception dagster_dbt.DagsterDbtError(description=None, metadata=None, allow_retries=None)[source]\u00b6
\n

The base exception of the dagster-dbt library.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliRuntimeError(description, logs=None, raw_output=None, messages=None)[source]\u00b6
\n

Represents an error while executing a dbt CLI command.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliFatalRuntimeError(logs=None, raw_output=None, messages=None)[source]\u00b6
\n

Represents a fatal error in the dbt CLI (return code 2).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliHandledRuntimeError(logs=None, raw_output=None, messages=None)[source]\u00b6
\n

Represents a model error reported by the dbt CLI at runtime (return code 1).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliOutputsNotFoundError(path)[source]\u00b6
\n

Represents a problem in finding the target/run_results.json artifact when executing a dbt\nCLI command.

\n

For more details on target/run_results.json, see\nhttps://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliUnexpectedOutputError(invalid_line_nos)[source]\u00b6
\n

Represents an error when parsing the output of a dbt CLI command.

\n
\n
\ninvalid_line_nos\u00b6
\n
\n\n
\n\n
\n
\n

Utils\u00b6

\n
\n
\ndagster_dbt.default_group_from_dbt_resource_props(dbt_resource_props)[source]\u00b6
\n

Get the group name for a dbt node.

\n

If a Dagster group is configured in the metadata for the node, use that.

\n

Otherwise, if a dbt group is configured for the node, use that.

\n
\n\n
\n
\ndagster_dbt.group_from_dbt_resource_props_fallback_to_directory(dbt_resource_props)[source]\u00b6
\n

Get the group name for a dbt node.

\n

Has the same behavior as the default_group_from_dbt_resource_props, except that, if no group can be determined\nfrom config or metadata, it falls back to using the subdirectory of the models directory that the\nsource file is in.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
\n

Examples

\n
from dagster_dbt import group_from_dbt_resource_props_fallback_to_directory\n\ndbt_assets = load_assets_from_dbt_manifest(\n    manifest=manifest,\n    node_info_to_group_fn=group_from_dbt_resource_props_fallback_to_directory,\n)\n
\n
\n
\n\n
\n
\ndagster_dbt.default_metadata_from_dbt_resource_props(dbt_resource_props)[source]\u00b6
\n
\n\n
\n
\ndagster_dbt.utils.generate_materializations(dbt_output, asset_key_prefix=None)[source]\u00b6
\n

This function yields dagster.AssetMaterialization events for each model updated by\na dbt command.

\n

Information parsed from a DbtOutput object.

\n

Examples

\n
from dagster import job, op, Output\nfrom dagster_dbt.utils import generate_materializations\nfrom dagster_dbt import dbt_cli_resource\n\n@op(required_resource_keys={"dbt"})\ndef my_custom_dbt_run(context):\n    dbt_output = context.resources.dbt.run()\n    for materialization in generate_materializations(dbt_output):\n        # you can modify the materialization object to add extra metadata, if desired\n        yield materialization\n    yield Output(dbt_output)\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    my_custom_dbt_run()\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dbt", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb/", "title": "DuckDB (dagster-duckdb)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-datahub/", "title": "Datahub (dagster-datahub)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb", "DuckDB (dagster-duckdb)", "N", "next"], ["sections/api/apidocs/libraries/dagster-datahub", "Datahub (dagster-datahub)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dbt.rst.txt", "title": "dbt (dagster-dbt)", "toc": "\n"}, "dagster-docker": {"alabaster_version": "0.7.13", "body": "
\n

Orchestration on Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_docker.DockerRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional):
\n

The docker image to be used if the repository does not specify one.

\n
\n
network (dagster.StringSource, optional):
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional):
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional):
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
\n

Launches runs in a Docker container.

\n
\n\n
\n
\ndagster_docker.docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional):
\n

The docker image to be used if the repository does not specify one.

\n
\n
network (dagster.StringSource, optional):
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional):
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional):
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
max_concurrent (dagster.IntSource, optional):
\n

Limit on the number of containers that will run concurrently within the scope of a Dagster run. Note that this limit is per run, not global.

\n
\n
tag_concurrency_limits (List[strict dict], optional):
\n

A set of limits that are applied to steps with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key. Note that these limits are per run, not global.

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Executor which launches steps as Docker containers.

\n

To use the docker_executor, set it as the executor_def when defining a job:

\n
from dagster_docker import docker_executor\n\nfrom dagster import job\n\n@job(executor_def=docker_executor)\ndef docker_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    registry: ...\n    network: ...\n    networks: ...\n    container_kwargs: ...\n
\n
\n

If you\u2019re using the DockerRunLauncher, configuration set on the containers created by the run\nlauncher will also be set on the containers that are created for each step.

\n
\n\n
\n

Ops\u00b6

\n
\n
\ndagster_docker.docker_container_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource):
\n

The image in which to run the Docker container.

\n
\n
network (dagster.StringSource, optional):
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional):
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional):
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
entrypoint (List[String], optional):
\n

The ENTRYPOINT for the Docker container

\n
\n
command (List[String], optional):
\n

The command to run within the launched Docker container.

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An op that runs a Docker container using the docker Python API.

\n

Contrast with the docker_executor, which runs each Dagster op in a Dagster job in its\nown Docker container.

\n
\n
This op may be useful when:
    \n
  • You need to orchestrate a command that isn\u2019t a Dagster op (or isn\u2019t written in Python)

  • \n
  • You want to run the rest of a Dagster job using a specific executor, and only a single\nop in docker.

  • \n
\n
\n
\n

For example:

\n
from dagster_docker import docker_container_op\n\nfrom dagster import job\n\nfirst_op = docker_container_op.configured(\n    {\n        "image": "busybox",\n        "command": ["echo HELLO"],\n    },\n    name="first_op",\n)\nsecond_op = docker_container_op.configured(\n    {\n        "image": "busybox",\n        "command": ["echo GOODBYE"],\n    },\n    name="second_op",\n)\n\n@job\ndef full_job():\n    second_op(first_op())\n
\n
\n

You can create your own op with the same implementation by calling the execute_docker_container function\ninside your own op.

\n
\n\n
\n
\ndagster_docker.execute_docker_container(context, image, entrypoint=None, command=None, networks=None, registry=None, env_vars=None, container_kwargs=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

This function is a utility for executing a Docker container from within a Dagster op.

\n
\n
Parameters:
\n
    \n
  • image (str) \u2013 The image to use for the launched Docker container.

  • \n
  • entrypoint (Optional[Sequence[str]]) \u2013 The ENTRYPOINT to run in the launched Docker\ncontainer. Default: None.

  • \n
  • command (Optional[Sequence[str]]) \u2013 The CMD to run in the launched Docker container.\nDefault: None.

  • \n
  • networks (Optional[Sequence[str]]) \u2013 Names of the Docker networks to which to connect the\nlaunched container. Default: None.

  • \n
  • registry \u2013 (Optional[Mapping[str, str]]): Information for using a non local/public Docker\nregistry. Can have \u201curl\u201d, \u201cusername\u201d, or \u201cpassword\u201d keys.

  • \n
  • env_vars (Optional[Sequence[str]]) \u2013 List of environment variables to include in the launched\ncontainer. Each can be of the form KEY=VALUE or just KEY (in which case the value will be\npulled from the calling environment).

  • \n
  • container_kwargs (Optional[Dict[str, Any]]) \u2013 key-value pairs that can be passed into\ncontainers.create in the Docker Python API. See\nhttps://docker-py.readthedocs.io/en/stable/containers.html for the full list\nof available options.

  • \n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-docker", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "N", "next"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-docker.rst.txt", "title": "Orchestration on Docker", "toc": "\n"}, "dagster-duckdb": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB (dagster-duckdb)\u00b6

\n

This library provides an integration with the DuckDB database.

\n

Related Guides:

\n\n
\n
\ndagster_duckdb.DuckDBIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

Base class for an IO manager definition that reads inputs from and writes outputs to DuckDB.

\n

Examples

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If none\nof these is provided, the schema will default to \u201cpublic\u201d.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame):\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\ndagster_duckdb.DuckDBResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database. Setting database=\u2019:memory:\u2019 will use an in-memory database

\n
\n
\n

Resource for interacting with a DuckDB database.

\n

Examples

\n
from dagster import Definitions, asset\nfrom dagster_duckdb import DuckDBResource\n\n@asset\ndef my_table(duckdb: DuckDBResource):\n    with duckdb.get_connection() as conn:\n        conn.execute("SELECT * from MY_SCHEMA.MY_TABLE")\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"duckdb": DuckDBResource(database="path/to/db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb.build_duckdb_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

Builds an IO manager definition that reads inputs from and writes outputs to DuckDB.

\n
\n
Parameters:
\n
    \n
  • type_handlers (Sequence[DbTypeHandler]) \u2013 Each handler defines how to translate between\nDuckDB tables and an in-memory type - e.g. a Pandas DataFrame. If only\none DbTypeHandler is provided, it will be used as the default_load_type.

  • \n
  • default_load_type (Type) \u2013 When an input has no type annotation, load it as this type.

  • \n
\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb import build_duckdb_io_manager\nfrom dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\nduckdb_io_manager = build_duckdb_io_manager([DuckDBPandasTypeHandler()])\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key. For ops, the schema can be\nspecified by including a \u201cschema\u201d entry in output metadata. If none of these is provided, the schema will\ndefault to \u201cpublic\u201d.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame):\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb-pandas/", "title": "DuckDB + Pandas (dagster-duckdb-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb-pandas", "DuckDB + Pandas (dagster-duckdb-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb.rst.txt", "title": "DuckDB (dagster-duckdb)", "toc": "\n"}, "dagster-duckdb-pandas": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB + Pandas (dagster-duckdb-pandas)\u00b6

\n

This library provides an integration with the DuckDB database and Pandas data processing library.

\n

Related guides:

\n\n
\n
\ndagster_duckdb_pandas.DuckDBPandasIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the DuckDBPandasIOManager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pandas import DuckDBPandasIOManager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": DuckDBPandasIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_duckdb_pandas.DuckDBPandasTypeHandler[source]\u00b6
\n

Stores and loads Pandas DataFrames in DuckDB.

\n

To use this type handler, return it from the type_handlers method of an I/O manager that inherits from DuckDBIOManager.

\n

Example

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb_pandas.duckdb_pandas_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the duckdb_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pandas import duckdb_pandas_io_manager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_pandas_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb-pandas", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb-pyspark/", "title": "DuckDB + PySpark (dagster-duckdb-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb/", "title": "DuckDB (dagster-duckdb)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb-pyspark", "DuckDB + PySpark (dagster-duckdb-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb", "DuckDB (dagster-duckdb)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb-pandas.rst.txt", "title": "DuckDB + Pandas (dagster-duckdb-pandas)", "toc": "\n"}, "dagster-duckdb-polars": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB + Polars (dagster-duckdb-polars)\u00b6

\n

This library provides an integration with the DuckDB database and Polars data processing library.

\n

Related guides:

\n\n
\n
\ndagster_duckdb_polars.DuckDBPolarsIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\nusing the DuckDBPolarsIOManager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_polars import DuckDBPolarsIOManager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pl.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": DuckDBPolarsIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pl.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_duckdb_polars.DuckDBPolarsTypeHandler[source]\u00b6
\n

Stores and loads Polars DataFrames in DuckDB.

\n

To use this type handler, return it from the type_handlers method of an I/O manager that inherits from DuckDBIOManager.

\n

Example

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_polars import DuckDBPolarsTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPolarsTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pl.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb_polars.duckdb_polars_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\nusing the duckdb_polars_io_manager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_polars import duckdb_polars_io_manager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pl.DataFrame:  # the name of the asset will be the table name\n    ...\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_polars_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pl.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb-polars", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-embedded-elt/", "title": "embedded-elt (dagster-embedded-elt)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb-pyspark/", "title": "DuckDB + PySpark (dagster-duckdb-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-embedded-elt", "embedded-elt (dagster-embedded-elt)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb-pyspark", "DuckDB + PySpark (dagster-duckdb-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb-polars.rst.txt", "title": "DuckDB + Polars (dagster-duckdb-polars)", "toc": "\n"}, "dagster-duckdb-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB + PySpark (dagster-duckdb-pyspark)\u00b6

\n

This library provides an integration with the DuckDB database and PySpark data processing library.

\n

Related guides:

\n\n
\n
\ndagster_duckdb_pyspark.DuckDBPySparkIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pyspark import DuckDBPySparkIOManager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pyspark.sql.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pyspark.sql.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_duckdb_pyspark.DuckDBPySparkTypeHandler[source]\u00b6
\n

Stores PySpark DataFrames in DuckDB.

\n

To use this type handler, return it from the type_handlers method of an I/O manager that inherits from DuckDBIOManager.

\n

Example

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_pyspark import DuckDBPySparkTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pyspark.sql.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb_pyspark.duckdb_pyspark_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pyspark import duckdb_pyspark_io_manager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pyspark.sql.DataFrame:  # the name of the asset will be the table name\n    ...\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pyspark.sql.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb-polars/", "title": "DuckDB + Polars (dagster-duckdb-polars)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb-pandas/", "title": "DuckDB + Pandas (dagster-duckdb-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb-polars", "DuckDB + Polars (dagster-duckdb-polars)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb-pandas", "DuckDB + Pandas (dagster-duckdb-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb-pyspark.rst.txt", "title": "DuckDB + PySpark (dagster-duckdb-pyspark)", "toc": "\n"}, "dagster-embedded-elt": {"alabaster_version": "0.7.13", "body": "
\n

embedded-elt (dagster-embedded-elt)\u00b6

\n

This package provides a framework for building ELT pipelines with Dagster through\nhelpful pre-built assets and resources.

\n

This package currently includes a Sling (https://slingdata.io) integration which\nprovides a simple way to sync data between databases and file systems.

\n

Related documentation pages: embedded-elt.

\n
\n

Sling\u00b6

\n
\n

Assets\u00b6

\n
\n
\ndagster_embedded_elt.sling.build_sling_asset(asset_spec, source_stream, target_object, mode=SlingMode.FULL_REFRESH, primary_key=None, update_key=None, source_options=None, target_options=None, sling_resource_key='sling')[source]\u00b6
\n

Asset Factory for using Sling to sync data from a source stream to a target object.

\n
\n
Parameters:
\n
    \n
  • asset_spec (AssetSpec) \u2013 The AssetSpec to use to materialize this asset.

  • \n
  • source_stream (str) \u2013 The source stream to sync from. This can be a table, a query, or a path.

  • \n
  • target_object (str) \u2013 The target object to sync to. This can be a table, or a path.

  • \n
  • mode (SlingMode, optional) \u2013 The sync mode to use when syncing. Defaults to SlingMode.FULL_REFRESH.

  • \n
  • primary_key (Optional[Union[str, List[str]]], optional) \u2013 The optional primary key to use when syncing.

  • \n
  • update_key (Optional[Union[str, List[str]]], optional) \u2013 The optional update key to use when syncing.

  • \n
  • source_options (Optional[Dict[str, Any]], optional) \u2013 Any optional Sling source options to use when syncing.

  • \n
  • target_options (Optional[Dict[str, Any]], optional) \u2013 Any optional target options to use when syncing.

  • \n
  • sling_resource_key (str, optional) \u2013 The resource key for the SlingResource. Defaults to \u201csling\u201d.

  • \n
\n
\n
\n

Examples

\n

Creating a Sling asset that syncs from a file to a table:

\n
asset_spec = AssetSpec(key=["main", "dest_tbl"])\nasset_def = build_sling_asset(\n        asset_spec=asset_spec,\n        source_stream="file:///tmp/test.csv",\n        target_object="main.dest_table",\n        mode=SlingMode.INCREMENTAL,\n        primary_key="id"\n)\n
\n
\n

Creating a Sling asset that syncs from a table to a file with a full refresh:

\n
asset_spec = AssetSpec(key="test.csv")\nasset_def = build_sling_asset(\n        asset_spec=asset_spec,\n        source_stream="main.dest_table",\n        target_object="file:///tmp/test.csv",\n        mode=SlingMode.FULL_REFRESH,\n        primary_key="id"\n)\n
\n
\n
\n\n
\n
\n

Resources\u00b6

\n
\n
\nclass dagster_embedded_elt.sling.SlingResource(*, source_connection, target_connection)[source]\u00b6
\n

Resource for interacting with the Sling package.

\n

Examples

\n
from dagster_embedded_elt.sling import SlingResource\nsling_resource = SlingResource(\n    source_connection=SlingSourceConnection(\n        type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING")\n    ),\n    target_connection=SlingTargetConnection(\n        type="snowflake",\n        host="host",\n        user="user",\n        database="database",\n        password="password",\n        role="role",\n    ),\n)\n
\n
\n
\n\n
\n
\nclass dagster_embedded_elt.sling.resources.SlingSourceConnection(*, type, connection_string=None, **config_dict)[source]\u00b6
\n

A Sling Source Connection defines the source connection used by SlingResource.

\n

Examples

\n

Creating a Sling Source for a file, such as CSV or JSON:

\n
source = SlingSourceConnection(type="file")\n
\n
\n

Create a Sling Source for a Postgres database, using a connection string:

\n
source = SlingSourceConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\nsource = SlingSourceConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema")\n
\n
\n

Create a Sling Source for a Postgres database, using keyword arguments, as described here:\nhttps://docs.slingdata.io/connections/database-connections/postgres

\n
source = SlingSourceConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n
\n
\n
\n\n
\n
\nclass dagster_embedded_elt.sling.resources.SlingTargetConnection(*, type, connection_string=None, **config_dict)[source]\u00b6
\n

A Sling Target Connection defines the target connection used by SlingResource.

\n

Examples

\n

Creating a Sling Target for a file, such as CSV or JSON:

\n
source = SlingTargetConnection(type="file")\n
\n
\n

Create a Sling Target for a Postgres database, using a connection string:

\n
source = SlingTargetConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema")\nsource = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n
\n
\n

Create a Sling Target for a Postgres database, using keyword arguments, as described here:\nhttps://docs.slingdata.io/connections/database-connections/postgres

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-embedded-elt", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb-polars/", "title": "DuckDB + Polars (dagster-duckdb-polars)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb-polars", "DuckDB + Polars (dagster-duckdb-polars)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-embedded-elt.rst.txt", "title": "embedded-elt (dagster-embedded-elt)", "toc": "\n"}, "dagster-fivetran": {"alabaster_version": "0.7.13", "body": "
\n

Fivetran (dagster-fivetran)\u00b6

\n

This library provides a Dagster integration with Fivetran.

\n
\n

Resources\u00b6

\n
\n
\ndagster_fivetran.FivetranResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

The Fivetran API key to use for this resource.

\n
\n
api_secret (dagster.StringSource):
\n

The Fivetran API secret to use for this resource.

\n
\n
disable_schedule_on_trigger (dagster.BoolSource, optional):
\n

Specifies if you would like any connector that is sync\u2019d using this resource to be automatically taken off its Fivetran schedule.

\n

Default Value: True

\n
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Fivetran API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This class exposes methods on top of the Fivetran REST API.

\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_fivetran.load_assets_from_fivetran_instance(fivetran, key_prefix=None, connector_to_group_fn=<function _clean_name>, io_manager_key=None, connector_to_io_manager_key_fn=None, connector_filter=None, connector_to_asset_key_fn=None, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Loads Fivetran connector assets from a configured FivetranResource instance. This fetches information\nabout defined connectors at initialization time, and will error on workspace load if the Fivetran\ninstance is not reachable.

\n
\n
Parameters:
\n
    \n
  • fivetran (ResourceDefinition) \u2013 A FivetranResource configured with the appropriate connection\ndetails.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • connector_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Fivetran connector name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connector_to_io_manager_key_fn.

  • \n
  • connector_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nIO manager key for a given Fivetran connector name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connector_filter (Optional[Callable[[FivetranConnectorMetadata], bool]]) \u2013 Optional function which takes\nin connector metadata and returns False if the connector should be excluded from the output assets.

  • \n
  • connector_to_asset_key_fn (Optional[Callable[[FivetranConnectorMetadata, str], AssetKey]]) \u2013 Optional function\nwhich takes in connector metadata and a table name and returns an AssetKey for that table. Defaults to\na function that generates an AssetKey matching the table name, split by \u201c.\u201d.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (Optional[float]) \u2013 The maximum time that will be waited before this operation is\ntimed out. By default, this will never time out.

  • \n
\n
\n
\n

Examples:

\n

Loading all Fivetran connectors as assets:

\n
from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\nfivetran_instance = fivetran_resource.configured(\n    {\n        "api_key": "some_key",\n        "api_secret": "some_secret",\n    }\n)\nfivetran_assets = load_assets_from_fivetran_instance(fivetran_instance)\n
\n
\n

Filtering the set of loaded connectors:

\n
from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\nfivetran_instance = fivetran_resource.configured(\n    {\n        "api_key": "some_key",\n        "api_secret": "some_secret",\n    }\n)\nfivetran_assets = load_assets_from_fivetran_instance(\n    fivetran_instance,\n    connector_filter=lambda meta: "snowflake" in meta.name,\n)\n
\n
\n
\n\n
\n
\ndagster_fivetran.build_fivetran_assets(connector_id, destination_tables, poll_interval=10, poll_timeout=None, io_manager_key=None, asset_key_prefix=None, metadata_by_table_name=None, group_name=None, infer_missing_tables=False, op_tags=None)[source]\u00b6
\n

Build a set of assets for a given Fivetran connector.

\n

Returns an AssetsDefinition which connects the specified asset_keys to the computation that\nwill update them. Internally, executes a Fivetran sync for a given connector_id, and\npolls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\nfivetran_resource, which allows it to communicate with the\nFivetran API.

\n
\n
Parameters:
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID that this op will sync. You can retrieve this\nvalue from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • destination_tables (List[str]) \u2013 schema_name.table_name for each table that you want to be\nrepresented in the Dagster asset graph for this connection.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (Optional[float]) \u2013 The maximum time that will be waited before this operation is\ntimed out. By default, this will never time out.

  • \n
  • io_manager_key (Optional[str]) \u2013 The io_manager to be used to handle each of these assets.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([schema_name, table_name]).

  • \n
  • metadata_by_table_name (Optional[Mapping[str, MetadataUserInput]]) \u2013 A mapping from destination\ntable name to user-supplied metadata that should be associated with the asset for that table.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. This\ngroup name will be applied to all assets produced by this multi_asset.

  • \n
  • infer_missing_tables (bool) \u2013 If True, will create asset materializations for tables specified\nin destination_tables even if they are not present in the Fivetran sync output. This is useful\nin cases where Fivetran does not sync any data for a table and therefore does not include it\nin the sync output API response.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset. Frameworks may expect and\nrequire certain metadata to be attached to an op. Values that are not strings will be\njson encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
\n

Examples:

\n

Basic example:

\n
\n
from dagster import AssetKey, repository, with_resources\n\nfrom dagster_fivetran import fivetran_resource\nfrom dagster_fivetran.assets import build_fivetran_assets\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n
\n
\n
\n

Attaching metadata:

\n
\n
fivetran_assets = build_fivetran_assets(\n    connector_id="foobar",\n    destination_tables=["schema1.table1", "schema2.table2"],\n    metadata_by_table_name={\n        "schema1.table1": {\n            "description": "This is a table that contains foo and bar",\n        },\n        "schema2.table2": {\n            "description": "This is a table that contains baz and quux",\n        },\n    },\n)\n
\n
\n
\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_fivetran.fivetran_sync_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connector_id (dagster.StringSource):
\n

The Fivetran Connector ID that this op will sync. You can retrieve this value from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
poll_interval (Float, optional):
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time that will be waited before this operation is timed out. By default, this will never time out.

\n
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the Fivetran sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[dagster.StringSource], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018fivetran\u2019]

\n
\n
\n

Executes a Fivetran sync for a given connector_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\nthe details of the Fivetran connector after the sync successfully completes, as well as details\nabout which tables the sync updates.

\n

It requires the use of the fivetran_resource, which allows it to\ncommunicate with the Fivetran API.

\n

Examples

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource, fivetran_sync_op\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\nsync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_simple_fivetran_job():\n    sync_foobar()\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_composed_fivetran_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_fivetran.fivetran_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

The Fivetran API key to use for this resource.

\n
\n
api_secret (dagster.StringSource):
\n

The Fivetran API secret to use for this resource.

\n
\n
disable_schedule_on_trigger (dagster.BoolSource, optional):
\n

Specifies if you would like any connector that is sync\u2019d using this resource to be automatically taken off its Fivetran schedule.

\n

Default Value: True

\n
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Fivetran API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Fivetran REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Fivetran REST API, including expected response JSON\nschemas, see the Fivetran API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\n@job(resource_defs={"fivetran":my_fivetran_resource})\ndef my_fivetran_job():\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-fivetran", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-embedded-elt/", "title": "embedded-elt (dagster-embedded-elt)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-embedded-elt", "embedded-elt (dagster-embedded-elt)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-fivetran.rst.txt", "title": "Fivetran (dagster-fivetran)", "toc": "\n"}, "dagster-gcp": {"alabaster_version": "0.7.13", "body": "
\n

GCP (dagster-gcp)\u00b6

\n
\n

BigQuery\u00b6

\n

Related Guides:

\n\n
\n

BigQuery Resource\u00b6

\n
\n
\ndagster_gcp.BigQueryResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project ID for the project which the client acts on behalf of. Will be passed when creating a dataset / job. If not passed, falls back to the default inferred from the environment.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

Default location for jobs / datasets / tables.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
\n

Resource for interacting with Google BigQuery.

\n

Examples

\n
from dagster import Definitions, asset\nfrom dagster_gcp import BigQueryResource\n\n@asset\ndef my_table(bigquery: BigQueryResource):\n    with bigquery.get_client() as client:\n        client.query("SELECT * FROM my_dataset.my_table")\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "bigquery": BigQueryResource(project="my-project")\n    }\n)\n
\n
\n
\n\n
\n
\n

BigQuery I/O Manager\u00b6

\n
\n
\ndagster_gcp.BigQueryIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

Base class for an I/O manager definition that reads inputs from and writes outputs to BigQuery.

\n

Examples

\n
from dagster_gcp import BigQueryIOManager\nfrom dagster_bigquery_pandas import BigQueryPandasTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MyBigQueryIOManager(BigQueryIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [BigQueryPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the dataset configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset my_table had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset my_dataset will be\nused. For ops, the dataset can be specified by including a schema entry in output metadata. If schema is\nnot provided via config or on the asset/op, public will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the gcp_credentials configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\n

BigQuery Ops\u00b6

\n
\n
\ndagster_gcp.bq_create_dataset(context)[source]\u00b6
\n

BigQuery Create Dataset.

\n

This op encapsulates creating a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_delete_dataset(context)[source]\u00b6
\n

BigQuery Delete Dataset.

\n

This op encapsulates deleting a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_op_for_queries(sql_queries)[source]\u00b6
\n

Executes BigQuery SQL queries.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.import_df_to_bq(context, df)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_file_to_bq(context, path)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_gcs_paths_to_bq(context, paths)[source]\u00b6
\n
\n\n
\n
\n

Other\u00b6

\n
\n
\nclass dagster_gcp.BigQueryError[source]\u00b6
\n
\n\n
\n
\n
\n

GCS\u00b6

\n
\n

GCS Resource\u00b6

\n
\n
\ndagster_gcp.GCSResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project name

\n
\n
\n

Resource for interacting with Google Cloud Storage.

\n

Example

\n
@asset\ndef my_asset(gcs: GCSResource):\n    with gcs.get_client() as client:\n        # client is a google.cloud.storage.Client\n        ...\n
\n
\n
\n\n
\n
\n

GCS I/O Manager\u00b6

\n
\n
\ndagster_gcp.GCSPickleIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs (Union[Any, None], optional):
\n

\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at <base_dir>/<asset_key>. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of /my/base/path, an asset with key\nAssetKey(["one", "two", "three"]) would be stored in a file called three in a directory\nwith path /my/base/path/one/two/.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import asset, Definitions\nfrom dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": GCSPickleIOManager(\n            gcs_bucket="my-cool-bucket",\n            gcs_prefix="my-cool-prefix"\n        ),\n        "gcs": GCSResource(project="my-cool-project")\n    }\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n@job(\n    resource_defs={\n        "io_manager": GCSPickleIOManager(\n            gcs=GCSResource(project="my-cool-project"),\n            gcs_bucket="my-cool-bucket",\n            gcs_prefix="my-cool-prefix"\n        ),\n    }\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\n

File Manager (Experimental)\u00b6

\n
\n
\nclass dagster_gcp.GCSFileHandle(gcs_bucket, gcs_key)[source]\u00b6
\n

A reference to a file on GCS.

\n
\n\n
\n
\ndagster_gcp.GCSFileManagerResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project name

\n
\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to GCS.

\n
\n\n
\n
\n

GCS Compute Log Manager\u00b6

\n
\n
\nclass dagster_gcp.gcs.GCSComputeLogManager(bucket, local_dir=None, inst_data=None, prefix='dagster', json_credentials_envvar=None, upload_interval=None)[source]\u00b6
\n

Logs op compute function stdout and stderr to GCS.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_gcp.gcs.compute_log_manager\n  class: GCSComputeLogManager\n  config:\n    bucket: "mycorp-dagster-compute-logs"\n    local_dir: "/tmp/cool"\n    prefix: "dagster-test-"\n    upload_interval: 30\n
\n
\n

There are more configuration examples in the instance documentation guide: https://docs.dagster.io/deployment/dagster-instance#compute-log-storage

\n
\n
Parameters:
\n
    \n
  • bucket (str) \u2013 The name of the GCS bucket to which to log.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster._seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • json_credentials_envvar (Optional[str]) \u2013 Environment variable that contains the JSON with a private key\nand other credentials information. If this is set, GOOGLE_APPLICATION_CREDENTIALS will be ignored.\nCan be used when the private key cannot be used as a file.

  • \n
  • upload_interval (Optional[int]) \u2013 Interval in seconds to upload partial log files to GCS. By default, will only upload when the capture is complete.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when instantiated from config.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Dataproc\u00b6

\n
\n

Dataproc Resource\u00b6

\n
\n
\ndagster_gcp.DataprocResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project_id (dagster.StringSource):
\n

Required. Project ID for the project which the client acts on behalf of. Will be passed when creating a dataset/job.

\n
\n
region (dagster.StringSource):
\n

The GCP region.

\n
\n
cluster_name (dagster.StringSource):
\n

Required. The cluster name. Cluster names within a project must be unique. Names of deleted clusters can be reused.

\n
\n
cluster_config_yaml_path (Union[dagster.StringSource, None], optional):
\n

Full path to a YAML file containing cluster configuration. See https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for configuration options. Only one of cluster_config_yaml_path, cluster_config_json_path, or cluster_config_dict may be provided.

\n
\n
cluster_config_json_path (Union[dagster.StringSource, None], optional):
\n

Full path to a JSON file containing cluster configuration. See https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for configuration options. Only one of cluster_config_yaml_path, cluster_config_json_path, or cluster_config_dict may be provided.

\n
\n
cluster_config_dict (Union[dict, None], optional):
\n

Python dictionary containing cluster configuration. See https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for configuration options. Only one of cluster_config_yaml_path, cluster_config_json_path, or cluster_config_dict may be provided.

\n
\n
\n

Resource for connecting to a Dataproc cluster.

\n

Example

\n
@asset\ndef my_asset(dataproc: DataprocResource):\n    with dataproc.get_client() as client:\n        # client is a dagster_gcp.DataprocClient\n        ...\n
\n
\n
\n\n
\n
\n

Dataproc Ops\u00b6

\n
\n
\ndagster_gcp.dataproc_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_timeout_in_seconds (Int, optional):
\n

Optional. Maximum time in seconds to wait for the job being\ncompleted. Default is set to 1200 seconds (20 minutes).

\n

Default Value: 1200

\n
\n
job_config (strict dict):
\n
\nConfig Schema:
\n
job (strict dict, optional):
\n

A Cloud Dataproc job resource.

\n
\nConfig Schema:
\n
status (strict dict, optional):
\n

Cloud Dataproc job status.

\n
\n
placement (strict dict, optional):
\n

Cloud Dataproc job config.

\n
\nConfig Schema:
\n
clusterName (String, optional):
\n

Required. The name of the cluster where the job will\nbe submitted.

\n
\n
\n
\n
scheduling (strict dict, optional):
\n

Job scheduling options.

\n
\nConfig Schema:
\n
maxFailuresPerHour (Int, optional):
\n

Optional. Maximum number of times per hour a driver\nmay be restarted as a result of driver terminating with non-zero\ncode before job is reported failed. A job may be reported as\nthrashing if driver exits with non-zero code 4 times within 10\nminute window. Maximum value is 10.

\n
\n
\n
\n
pigJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Pig\n(https://pig.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
queryFileUri (String, optional):
\n

The HCFS URI of the script that contains the Pig\nqueries.

\n
\n
queryList (strict dict, optional):
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional):
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Pig Client and Hadoop MapReduce (MR) tasks. Can\ncontain Pig UDFs.

\n
\n
scriptVariables (permissive dict, optional):
\n

Optional. Mapping of query variable names to values\n(equivalent to the Pig command: name=[value]).

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Pig. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site.xml, /etc/pig/conf/pig.properties, and\nclasses in user code.

\n
\n
continueOnFailure (Bool, optional):
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
\n
\n
hiveJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Hive\n(https://hive.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
continueOnFailure (Bool, optional):
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
queryFileUri (String, optional):
\n

The HCFS URI of the script that contains Hive\nqueries.

\n
\n
queryList (strict dict, optional):
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional):
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Hive server and Hadoop MapReduce (MR) tasks. Can\ncontain Hive SerDes and UDFs.

\n
\n
scriptVariables (permissive dict, optional):
\n

Optional. Mapping of query variable names to values\n(equivalent to the Hive command: SET name=\u201dvalue\u201d;).

\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names and values,\nused to configure Hive. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/hadoop/conf/*-site.xml, /etc/hive/conf/hive-site.xml,\nand classes in user code.

\n
\n
\n
\n
labels (permissive dict, optional):
\n

Optional. The labels to associate with this job. Label keys must\ncontain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). Label values may be empty, but, if\npresent, must contain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). No more than 32 labels can be associated\nwith a job.

\n
\n
sparkJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Spark\n(http://spark.apache.org/) applications on YARN.

\n
\nConfig Schema:
\n
archiveUris (List[String], optional):
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Spark drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
mainJarFileUri (String, optional):
\n

The HCFS URI of the jar file that contains the main\nclass.

\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Spark driver and tasks.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Spark. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/spark/conf/spark-defaults.conf and classes in user code.

\n
\n
args (List[String], optional):
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as --conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional):
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Spark drivers and distributed tasks. Useful for\nnaively parallel tasks.

\n
\n
mainClass (String, optional):
\n

The name of the driver\u2019s main class. The jar file\nthat contains the class must be in the default CLASSPATH or\nspecified in jar_file_uris.

\n
\n
\n
\n
sparkSqlJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Spark SQL\n(http://spark.apache.org/sql/) queries.

\n
\nConfig Schema:
\n
queryList (strict dict, optional):
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional):
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
queryFileUri (String, optional):
\n

The HCFS URI of the script that contains SQL\nqueries.

\n
\n
scriptVariables (permissive dict, optional):
\n

Optional. Mapping of query variable names to values\n(equivalent to the Spark SQL command: SET name=\u201dvalue\u201d;).

\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to be added to the\nSpark CLASSPATH.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Spark SQL\u2019s SparkConf. Properties that conflict with\nvalues set by the Cloud Dataproc API may be overwritten.

\n
\n
\n
\n
pysparkJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache PySpark\n(https://spark.apache.org/docs/0.9.0/python-programming-guide.html) applications\non YARN.

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Python driver and tasks.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure PySpark. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/spark/conf/spark-defaults.conf and classes in user\ncode.

\n
\n
args (List[String], optional):
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as --conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional):
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Python drivers and distributed tasks. Useful\nfor naively parallel tasks.

\n
\n
pythonFileUris (List[String], optional):
\n

Optional. HCFS file URIs of Python files to pass to\nthe PySpark framework. Supported file types: .py, .egg, and\n.zip.

\n
\n
mainPythonFileUri (String, optional):
\n

Required. The HCFS URI of the main Python file to use\nas the driver. Must be a .py file.

\n
\n
archiveUris (List[String], optional):
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Python drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
\n
\n
reference (strict dict, optional):
\n

Encapsulates the full scoping used to reference a job.

\n
\nConfig Schema:
\n
projectId (String, optional):
\n

Required. The ID of the Google Cloud Platform project\nthat the job belongs to.

\n
\n
jobId (String, optional):
\n

Optional. The job ID, which must be unique within the\nproject. The ID must contain only letters (a-z, A-Z), numbers (0-9),\nunderscores (_), or hyphens (-). The maximum length is 100\ncharacters. If not specified by the caller, the job ID will be\nprovided by the server.

\n
\n
\n
\n
hadoopJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Hadoop MapReduce\n(https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html)\njobs on Apache Hadoop YARN\n(https://hadoop.apache.org/docs/r2.7.1/hadoop-yarn/hadoop-yarn-site/YARN.html).

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional):
\n

Optional. Jar file URIs to add to the CLASSPATHs of\nthe Hadoop driver and tasks.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Hadoop. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site and classes in user code.

\n
\n
args (List[String], optional):
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as -libjars or -Dfoo=bar, that can be set as\njob properties, since a collision may occur that causes an incorrect\njob submission.

\n
\n
fileUris (List[String], optional):
\n

Optional. HCFS (Hadoop Compatible Filesystem) URIs of\nfiles to be copied to the working directory of Hadoop drivers and\ndistributed tasks. Useful for naively parallel tasks.

\n
\n
mainClass (String, optional):
\n

The name of the driver\u2019s main class. The jar file\ncontaining the class must be in the default CLASSPATH or specified\nin jar_file_uris.

\n
\n
archiveUris (List[String], optional):
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Hadoop drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, or .zip.

\n
\n
mainJarFileUri (String, optional):
\n

The HCFS URI of the jar file containing the main\nclass. Examples:\n\u2018gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar\u2019\n\u2018hdfs:/tmp/test-samples/custom-wordcount.jar\u2019\n\u2018file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar\u2019

\n
\n
\n
\n
\n
\n
projectId (dagster.StringSource):
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource):
\n

\n
\n
\n
job_scoped_cluster (Bool, optional):
\n

whether to create a cluster or use an existing cluster

\n

Default Value: True

\n
\n
\n
\n\n
\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_gcp.ConfigurablePickledObjectGCSIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs (Union[Any, None], optional):
\n

\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use GCSPickleIOManager instead.\n \n

\n

Renamed to GCSPickleIOManager. See GCSPickleIOManager for documentation.

\n
\n\n
\n
\ndagster_gcp.bigquery_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project ID for the project which the client acts on behalf of. Will be passed when creating a dataset / job. If not passed, falls back to the default inferred from the environment.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

Default location for jobs / datasets / tables.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
\n
\n\n
\n
\ndagster_gcp.build_bigquery_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Builds an I/O manager definition that reads inputs from and writes outputs to BigQuery.

\n
\n
Parameters:
\n
    \n
  • type_handlers (Sequence[DbTypeHandler]) \u2013 Each handler defines how to translate between\nslices of BigQuery tables and an in-memory type - e.g. a Pandas DataFrame.\nIf only one DbTypeHandler is provided, it will be used as the default_load_type.

  • \n
  • default_load_type (Type) \u2013 When an input has no type annotation, load it as this type.

  • \n
\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp import build_bigquery_io_manager\nfrom dagster_gcp_pandas import BigQueryPandasTypeHandler\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\nbigquery_io_manager = build_bigquery_io_manager([BigQueryPandasTypeHandler()])\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": bigquery_io_manager.configured({\n            "project" : {"env": "GCP_PROJECT"}\n        })\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the dataset configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset my_table had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset my_dataset will be\nused. For ops, the dataset can be specified by including a schema entry in output metadata. If schema is\nnot provided via config or on the asset/op, public will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the gcp_credentials configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\ndagster_gcp.gcs_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project name

\n
\n
\n
\n\n
\n
\ndagster_gcp.gcs_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs (Union[Any, None], optional):
\n

\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at <base_dir>/<asset_key>. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of /my/base/path, an asset with key\nAssetKey(["one", "two", "three"]) would be stored in a file called three in a directory\nwith path /my/base/path/one/two/.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n            "io_manager": gcs_pickle_io_manager.configured(\n                {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n            ),\n            "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n        },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n@job(\n    resource_defs={\n        "io_manager": gcs_pickle_io_manager.configured(\n            {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n        ),\n        "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\ndagster_gcp.gcs_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to GCS.

\n

Implements the FileManager API.

\n
\n\n
\n
\ndagster_gcp.dataproc_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
projectId (dagster.StringSource):
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource):
\n

\n
clusterName (dagster.StringSource):
\n

Required. The cluster name. Cluster names within a project must be unique.\nNames of deleted clusters can be reused.

\n
\n
cluster_config (strict dict, optional):
\n

The cluster config.

\n
\nConfig Schema:
\n
masterConfig (strict dict, optional):
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional):
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional):
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional):
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional):
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional):
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional):
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional):
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional):
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional):
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional):
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2\nAuto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
secondaryWorkerConfig (strict dict, optional):
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional):
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional):
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional):
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional):
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional):
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional):
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional):
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional):
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional):
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional):
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2\nAuto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
encryptionConfig (strict dict, optional):
\n

Encryption settings for the cluster.

\n
\nConfig Schema:
\n
gcePdKmsKeyName (String, optional):
\n

Optional. The Cloud KMS key name to use for PD disk\nencryption for all instances in the cluster.

\n
\n
\n
\n
securityConfig (strict dict, optional):
\n

Security related configuration, including Kerberos.

\n
\nConfig Schema:
\n
kerberosConfig (strict dict, optional):
\n

Specifies Kerberos related configuration.

\n
\nConfig Schema:
\n
truststorePasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided truststore. For the self-signed certificate,\nthis password is generated by Dataproc.

\n
\n
enableKerberos (Bool, optional):
\n

Optional. Flag to indicate whether to\nKerberize the cluster.

\n
\n
truststoreUri (String, optional):
\n

Optional. The Cloud Storage URI of the\ntruststore file used for SSL encryption. If not\nprovided, Dataproc will provide a self-signed\ncertificate.

\n
\n
crossRealmTrustRealm (String, optional):
\n

Optional. The remote realm the Dataproc\non-cluster KDC will trust, should the user enable cross\nrealm trust.

\n
\n
rootPrincipalPasswordUri (String, optional):
\n

Required. The Cloud Storage URI of a KMS\nencrypted file containing the root principal\npassword.

\n
\n
kmsKeyUri (String, optional):
\n

Required. The uri of the KMS key used to\nencrypt various sensitive files.

\n
\n
crossRealmTrustKdc (String, optional):
\n

Optional. The KDC (IP or hostname) for\nthe remote trusted realm in a cross realm trust\nrelationship.

\n
\n
crossRealmTrustSharedPasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the shared password between\nthe on-cluster Kerberos realm and the remote trusted\nrealm, in a cross realm trust relationship.

\n
\n
tgtLifetimeHours (Int, optional):
\n

Optional. The lifetime of the ticket\ngranting ticket, in hours. If not specified, or user\nspecifies 0, then default value 10 will be used.

\n
\n
keystoreUri (String, optional):
\n

Optional. The Cloud Storage URI of the\nkeystore file used for SSL encryption. If not provided,\nDataproc will provide a self-signed certificate.

\n
\n
keyPasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided key. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
keystorePasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided keystore. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
crossRealmTrustAdminServer (String, optional):
\n

Optional. The admin server (IP or\nhostname) for the remote trusted realm in a cross realm\ntrust relationship.

\n
\n
kdcDbKeyUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the master key of the KDC\ndatabase.

\n
\n
\n
\n
\n
\n
initializationActions (List[strict dict], optional):
\n

Optional. Commands to execute on each node after config is\ncompleted. By default, executables are run on master and all worker nodes. You\ncan test a node\u2019s role metadata to run an executable on a master or worker\nnode, as shown below using curl (you can also use wget): ROLE=$(curl -H\nMetadata-Flavor:Google\nhttp://metadata/computeMetadata/v1/instance/attributes/dataproc-role) if [[\n\u201c${ROLE}\u201d == \u2018Master\u2019 ]]; then \u2026 master specific actions \u2026 else \u2026\nworker specific actions \u2026 fi

\n
\n
configBucket (String, optional):
\n

Optional. A Google Cloud Storage bucket used to stage job\ndependencies, config files, and job driver console output. If you do not specify\na staging bucket, Cloud Dataproc will determine a Cloud Storage location (US,\nASIA, or EU) for your cluster\u2019s staging bucket according to the Google Compute\nEngine zone where your cluster is deployed, and then create and manage this\nproject-level, per-location bucket (see Cloud Dataproc staging bucket).

\n
\n
workerConfig (strict dict, optional):
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional):
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional):
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional):
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional):
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional):
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional):
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional):
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional):
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional):
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional):
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2\nAuto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
gceClusterConfig (strict dict, optional):
\n

Common config settings for resources of Compute Engine cluster\ninstances, applicable to all instances in the cluster.

\n
\nConfig Schema:
\n
networkUri (String, optional):
\n

Optional. The Compute Engine network to be used for\nmachine communications. Cannot be specified with subnetwork_uri. If\nneither network_uri nor subnetwork_uri is specified, the \u201cdefault\u201d\nnetwork of the project is used, if it exists. Cannot be a \u201cCustom\nSubnet Network\u201d (see Using Subnetworks for more information). A full\nURL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/global/default\nprojects/[project_id]/regions/global/default default

\n
\n
zoneUri (String, optional):
\n

Optional. The zone where the Compute Engine cluster\nwill be located. On a create request, it is required in the \u201cglobal\u201d\nregion. If omitted in a non-global Cloud Dataproc region, the\nservice will pick a zone in the corresponding Compute Engine region.\nOn a get request, zone will always be present. A full URL, partial\nURI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/[zone]\nprojects/[project_id]/zones/[zone] us-central1-f

\n
\n
metadata (permissive dict, optional):
\n

The Compute Engine metadata entries to add to all\ninstances (see Project and instance metadata\n(https://cloud.google.com/compute/docs/storing-retrieving-metadata#project_and_instance_metadata)).

\n
\n
internalIpOnly (Bool, optional):
\n

Optional. If true, all instances in the cluster will\nonly have internal IP addresses. By default, clusters are not\nrestricted to internal IP addresses, and will have ephemeral\nexternal IP addresses assigned to each instance. This\ninternal_ip_only restriction can only be enabled for subnetwork\nenabled networks, and all off-cluster dependencies must be\nconfigured to be accessible without external IP addresses.

\n
\n
serviceAccountScopes (List[String], optional):
\n

Optional. The URIs of service account scopes to be\nincluded in Compute Engine instances. The following base set of\nscopes is always included:\nhttps://www.googleapis.com/auth/cloud.useraccounts.readonly\nhttps://www.googleapis.com/auth/devstorage.read_write\nhttps://www.googleapis.com/auth/logging.write\nIf no scopes are\nspecified, the following defaults are also provided:\nhttps://www.googleapis.com/auth/bigquery\nhttps://www.googleapis.com/auth/bigtable.admin.table\nhttps://www.googleapis.com/auth/bigtable.data\nhttps://www.googleapis.com/auth/devstorage.full_control

\n
\n
tags (List[String], optional):
\n

The Compute Engine tags to add to all instances (see\nTagging instances).

\n
\n
serviceAccount (String, optional):
\n

Optional. The service account of the instances.\nDefaults to the default Compute Engine service account. Custom\nservice accounts need permissions equivalent to the following IAM\nroles: roles/logging.logWriter roles/storage.objectAdmin (see\nhttps://cloud.google.com/compute/docs/access/service-accounts#custom_service_accounts\nfor more information). Example:\n[account_id]@[project_id].iam.gserviceaccount.com

\n
\n
subnetworkUri (String, optional):
\n

Optional. The Compute Engine subnetwork to be used\nfor machine communications. Cannot be specified with network_uri. A\nfull URL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/us-east1/subnetworks/sub0\nprojects/[project_id]/regions/us-east1/subnetworks/sub0 sub0

\n
\n
\n
\n
softwareConfig (strict dict, optional):
\n

Specifies the selection and config of software inside the\ncluster.

\n
\nConfig Schema:
\n
properties (permissive dict, optional):
\n

Optional. The properties to set on daemon config\nfiles. Property keys are specified in prefix:property format, for\nexample core:hadoop.tmp.dir. The following are supported prefixes\nand their mappings: capacity-scheduler: capacity-scheduler.xml core:\ncore-site.xml distcp: distcp-default.xml hdfs: hdfs-site.xml hive:\nhive-site.xml mapred: mapred-site.xml pig: pig.properties spark:\nspark-defaults.conf yarn: yarn-site.xml\nFor more information, see\nCluster properties.

\n
\n
optionalComponents (List[Component], optional):
\n

The set of optional components to activate on the\ncluster.

\n
\n
imageVersion (String, optional):
\n

Optional. The version of software inside the cluster.\nIt must be one of the supported Cloud Dataproc Versions, such as\n\u201c1.2\u201d (including a subminor version, such as \u201c1.2.29\u201d), or the\n\u201cpreview\u201d version. If unspecified, it defaults to the latest Debian\nversion.

\n
\n
\n
\n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-gcp-pandas/", "title": "GCP + Pandas (dagster-gcp-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp-pandas", "GCP + Pandas (dagster-gcp-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp.rst.txt", "title": "GCP (dagster-gcp)", "toc": "\n"}, "dagster-gcp-pandas": {"alabaster_version": "0.7.13", "body": "
\n

GCP + Pandas (dagster-gcp-pandas)\u00b6

\n
\n

Google BigQuery\u00b6

\n

This library provides an integration with the BigQuery database and Pandas data processing library.

\n

Related Guides:

\n\n
\n
\ndagster_gcp_pandas.BigQueryPandasIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pandas import BigQueryPandasIOManager\nfrom dagster import Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": BigQueryPandasIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\nclass dagster_gcp_pandas.BigQueryPandasTypeHandler[source]\u00b6
\n

Plugin for the BigQuery I/O Manager that can store and load Pandas DataFrames as BigQuery tables.

\n

Examples

\n
from dagster_gcp import BigQueryIOManager\nfrom dagster_gcp_pandas import BigQueryPandasTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MyBigQueryIOManager(BigQueryIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [BigQueryPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n
\n\n
\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_gcp_pandas.bigquery_pandas_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pandas import bigquery_pandas_io_manager\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": bigquery_pandas_io_manager.configured({\n            "project" : {"env": "GCP_PROJECT"}\n        })\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp-pandas", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-gcp-pyspark/", "title": "GCP + PySpark (dagster-gcp-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp-pyspark", "GCP + PySpark (dagster-gcp-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp-pandas.rst.txt", "title": "GCP + Pandas (dagster-gcp-pandas)", "toc": "\n"}, "dagster-gcp-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

GCP + PySpark (dagster-gcp-pyspark)\u00b6

\n
\n

Google BigQuery\u00b6

\n

This library provides an integration with the BigQuery database and PySpark data processing library.

\n

Related Guides:

\n\n
\n
\ndagster_gcp_pyspark.BigQueryPySparkIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pyspark import BigQueryPySparkIOManager\nfrom dagster import Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": BigQueryPySparkIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\nclass dagster_gcp_pyspark.BigQueryPySparkTypeHandler[source]\u00b6
\n

Plugin for the BigQuery I/O Manager that can store and load PySpark DataFrames as BigQuery tables.

\n

Examples

\n
from dagster_gcp import BigQueryIOManager\nfrom dagster_gcp_pyspark import BigQueryPySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MyBigQueryIOManager(BigQueryIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [BigQueryPySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n
\n\n
\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_gcp_pyspark.bigquery_pyspark_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pyspark import bigquery_pyspark_io_manager\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": bigquery_pyspark_io_manager.configured({\n            "project" : {"env": "GCP_PROJECT"}\n        })\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp-pandas/", "title": "GCP + Pandas (dagster-gcp-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp-pandas", "GCP + Pandas (dagster-gcp-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp-pyspark.rst.txt", "title": "GCP + PySpark (dagster-gcp-pyspark)", "toc": "\n"}, "dagster-ge": {"alabaster_version": "0.7.13", "body": "
\n

Great Expectations (dagster-ge)\u00b6

\n
\n
\ndagster_ge.ge_validation_op_factory(name, datasource_name, suite_name, validation_operator_name=None, input_dagster_type=<dagster._core.types.dagster_type.DagsterType object>, batch_kwargs=None)[source]\u00b6
\n

Generates ops for interacting with GE.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 the name of the op

  • \n
  • datasource_name (str) \u2013 the name of your DataSource, see your great_expectations.yml

  • \n
  • suite_name (str) \u2013 the name of your expectation suite, see your great_expectations.yml

  • \n
  • validation_operator_name (Optional[str]) \u2013 what validation operator to run \u2013 defaults to\nNone, which generates an ephemeral validator. If you want to save data docs, use\n\u2018action_list_operator\u2019.\nSee https://legacy.docs.greatexpectations.io/en/0.12.1/reference/core_concepts/validation_operators_and_actions.html#

  • \n
  • input_dagster_type (DagsterType) \u2013 the Dagster type used to type check the input to the op.\nDefaults to dagster_pandas.DataFrame.

  • \n
  • batch_kwargs (Optional[dict]) \u2013 overrides the batch_kwargs parameter when calling the\nge_data_context\u2019s get_batch method. Defaults to {\u201cdataset\u201d: dataset}, where\ndataset is the input to the generated op.

  • \n
\n
\n
Returns:
\n

An op that takes in a set of data and yields both an expectation with relevant metadata\nand an output with all the metadata (for user processing)

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ge", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp-pyspark/", "title": "GCP + PySpark (dagster-gcp-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp-pyspark", "GCP + PySpark (dagster-gcp-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ge.rst.txt", "title": "Great Expectations (dagster-ge)", "toc": "\n"}, "dagster-github": {"alabaster_version": "0.7.13", "body": "
\n

GitHub (dagster-github)\u00b6

\n

This library provides an integration with GitHub Apps, to support performing various automation\noperations within your github repositories and with the tighter permissions scopes that github apps\nallow for vs using a personal token.

\n

Presently, it provides a thin wrapper on the github v4 graphql API.

\n

To use this integration, you\u2019ll first need to create a GitHub App for it.

\n
    \n
  1. Create App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/. You will end up with a private key and App ID, which will be used when configuring the\ndagster-github resource. Note you will need to grant your app the relevant permissions\nfor the API requests you want to make, for example to post issues it will need read/write access\nfor the issues repository permission, more info on GitHub application permissions can be found\nhere

  2. \n
  3. Install App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/#step-7-install-the-app-on-your-account

  4. \n
  5. Find your installation_id: You can pull this from the GitHub app administration page,\nhttps://github.com/apps/<app-name>/installations/<installation_id>. Note if your app is\ninstalled more than once you can also programmatically retrieve these IDs.

  6. \n
\n

Sharing your App ID and Installation ID is fine, but make sure that the Private Key for your app is\nstored securely.

\n
\n
\n

Posting Issues\u00b6

\n

Now, you can create issues in GitHub from Dagster with the GitHub resource:

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import GithubResource\n\n\n@op\ndef github_op(github: GithubResource):\n    github.get_client().create_issue(\n        repo_name='dagster',\n        repo_owner='dagster-io',\n        title='Dagster\\'s first github issue',\n        body='this open source thing seems like a pretty good idea',\n    )\n\n@job(resource_defs={\n     'github': GithubResource(\n         github_app_id=os.getenv('GITHUB_APP_ID'),\n         github_app_private_rsa_key=os.getenv('GITHUB_PRIVATE_KEY'),\n         github_installation_id=os.getenv('GITHUB_INSTALLATION_ID')\n )})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process()\n
\n
\n

Run the above code, and you\u2019ll see the issue appear in GitHub:\n

\n

GitHub enterprise users can provide their hostname in the run config. Provide github_hostname\nas part of your github config like below.

\n
GithubResource(\n    github_app_id=os.getenv('GITHUB_APP_ID'),\n    github_app_private_rsa_key=os.getenv('GITHUB_PRIVATE_KEY'),\n    github_installation_id=os.getenv('GITHUB_INSTALLATION_ID'),\n    github_hostname=os.getenv('GITHUB_HOSTNAME'),\n)\n
\n
\n

By provisioning GithubResource as a Dagster resource, you can post to GitHub from\nwithin any asset or op execution.

\n
\n
\n

Executing GraphQL queries\u00b6

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import GithubResource\n\n\n@op\ndef github_op(github: GithubResource):\n    github.get_client().execute(\n        query="""\n        query get_repo_id($repo_name: String!, $repo_owner: String!) {\n            repository(name: $repo_name, owner: $repo_owner) {\n                id\n            }\n        }\n        """,\n        variables={"repo_name": repo_name, "repo_owner": repo_owner},\n    )\n\n@job(resource_defs={\n     'github': GithubResource(\n         github_app_id=os.getenv('GITHUB_APP_ID'),\n         github_app_private_rsa_key=os.getenv('GITHUB_PRIVATE_KEY'),\n         github_installation_id=os.getenv('GITHUB_INSTALLATION_ID')\n )})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process()\n
\n
\n
\n
\ndagster_github.GithubResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
github_app_id (dagster.IntSource):
\n

Github Application ID, for more info see https://developer.github.com/apps/

\n
\n
github_app_private_rsa_key (dagster.StringSource):
\n

Github Application Private RSA key text, for more info see https://developer.github.com/apps/

\n
\n
github_installation_id (Union[dagster.IntSource, None], optional):
\n

Github Application Installation ID, for more info see https://developer.github.com/apps/

\n
\n
github_hostname (Union[dagster.StringSource, None], optional):
\n

Github hostname. Defaults to api.github.com, for more info see https://developer.github.com/apps/

\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_github.github_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
github_app_id (dagster.IntSource):
\n

Github Application ID, for more info see https://developer.github.com/apps/

\n
\n
github_app_private_rsa_key (dagster.StringSource):
\n

Github Application Private RSA key text, for more info see https://developer.github.com/apps/

\n
\n
github_installation_id (Union[dagster.IntSource, None], optional):
\n

Github Application Installation ID, for more info see https://developer.github.com/apps/

\n
\n
github_hostname (Union[dagster.StringSource, None], optional):
\n

Github hostname. Defaults to api.github.com, for more info see https://developer.github.com/apps/

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-github", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "N", "next"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-github.rst.txt", "title": "GitHub (dagster-github)", "toc": "\n"}, "dagster-graphql": {"alabaster_version": "0.7.13", "body": "
\n

GraphQL (dagster-graphql)\u00b6

\n
\n

Python Client\u00b6

\n
\n
\nclass dagster_graphql.DagsterGraphQLClient(hostname, port_number=None, transport=None, use_https=False, timeout=300, headers=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Official Dagster Python Client for GraphQL.

\n

Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server

\n

As of now, all operations on this client are synchronous.

\n

Intended usage:

\n
client = DagsterGraphQLClient("localhost", port_number=3000)\nstatus = client.get_run_status(**SOME_RUN_ID**)\n
\n
\n
\n
Parameters:
\n
    \n
  • hostname (str) \u2013 Hostname for the Dagster GraphQL API, like localhost or\ndagster.YOUR_ORG_HERE.

  • \n
  • port_number (Optional[int]) \u2013 Port number to connect to on the host.\nDefaults to None.

  • \n
  • transport (Optional[Transport], optional) \u2013 A custom transport to use to connect to the\nGraphQL API with (e.g. for custom auth). Defaults to None.

  • \n
  • use_https (bool, optional) \u2013 Whether to use https in the URL connection string for the\nGraphQL API. Defaults to False.

  • \n
  • timeout (int) \u2013 Number of seconds before requests should time out. Defaults to 300.

  • \n
  • headers (Optional[Dict[str, str]]) \u2013 Additional headers to include in the request. To use\nthis client in Dagster Cloud, set the \u201cDagster-Cloud-Api-Token\u201d header to a user token\ngenerated in the Dagster Cloud UI.

  • \n
\n
\n
Raises:
\n

ConnectionError \u2013 if the client cannot connect to the host.

\n
\n
\n
\n
\nget_run_status(run_id)[source]\u00b6
\n

Get the status of a given Pipeline Run.

\n
\n
Parameters:
\n

run_id (str) \u2013 run id of the requested pipeline run.

\n
\n
Raises:
\n
\n
\n
Returns:
\n

returns a status Enum describing the state of the requested pipeline run

\n
\n
Return type:
\n

DagsterRunStatus

\n
\n
\n
\n\n
\n
\nreload_repository_location(repository_location_name)[source]\u00b6
\n

Reloads a Dagster Repository Location, which reloads all repositories in that repository location.

\n

This is useful in a variety of contexts, including refreshing the Dagster UI without restarting\nthe server.

\n
\n
Parameters:
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns:
\n

Object with information about the result of the reload request

\n
\n
Return type:
\n

ReloadRepositoryLocationInfo

\n
\n
\n
\n\n
\n
\nshutdown_repository_location(repository_location_name)[source]\u00b6
\n

Shuts down the server that is serving metadata for the provided repository location.

\n

This is primarily useful when you want the server to be restarted by the compute environment\nin which it is running (for example, in Kubernetes, the pod in which the server is running\nwill automatically restart when the server is shut down, and the repository metadata will\nbe reloaded)

\n
\n
Parameters:
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns:
\n

Object with information about the result of the shutdown request

\n
\n
Return type:
\n

ShutdownRepositoryLocationInfo

\n
\n
\n
\n\n
\n
\nsubmit_job_execution(job_name, repository_location_name=None, repository_name=None, run_config=None, tags=None, op_selection=None)[source]\u00b6
\n

Submits a job with attached configuration for execution.

\n
\n
Parameters:
\n
    \n
  • job_name (str) \u2013 The job\u2019s name

  • \n
  • repository_location_name (Optional[str]) \u2013 The name of the repository location where\nthe job is located. If omitted, the client will try to infer the repository location\nfrom the available options on the Dagster deployment. Defaults to None.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository where the job is located.\nIf omitted, the client will try to infer the repository from the available options\non the Dagster deployment. Defaults to None.

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 This is the run config to execute the job with.\nNote that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\nan arbitrary object for run config. However, it must conform to the constraints of the config\nschema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\nJobConfigValidationInvalid. Defaults to None.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 A set of tags to add to the job execution.

  • \n
\n
\n
Raises:
\n
    \n
  • DagsterGraphQLClientError("InvalidStepError", invalid_step_key) \u2013 the job has an invalid step

  • \n
  • DagsterGraphQLClientError("InvalidOutputError", body=error_object) \u2013 some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.

  • \n
  • DagsterGraphQLClientError("RunConflict", message) \u2013 a DagsterRunConflict occurred during execution.\n This indicates that a conflicting job run already exists in run storage.

  • \n
  • DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key) \u2013 the run_config is not in the expected format\n for the job

  • \n
  • DagsterGraphQLClientError("JobNotFoundError", message) \u2013 the requested job does not exist

  • \n
  • DagsterGraphQLClientError("PythonError", message) \u2013 an internal framework error occurred

  • \n
\n
\n
Returns:
\n

run id of the submitted pipeline run

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nexception dagster_graphql.DagsterGraphQLClientError(*args, body=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster_graphql.InvalidOutputErrorInfo(step_key, invalid_output_name)[source]\u00b6
\n

This class gives information about an InvalidOutputError from submitting a pipeline for execution\nfrom GraphQL.

\n
\n
Parameters:
\n
    \n
  • step_key (str) \u2013 key of the step that failed

  • \n
  • invalid_output_name (str) \u2013 the name of the invalid output from the given step

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationInfo(status, failure_type=None, message=None)[source]\u00b6
\n

This class gives information about the result of reloading\na Dagster repository location with a GraphQL mutation.

\n
\n
Parameters:
\n
    \n
  • status (ReloadRepositoryLocationStatus) \u2013 The status of the reload repository location mutation

  • \n
  • failure_type \u2013 (Optional[str], optional): the failure type if status == ReloadRepositoryLocationStatus.FAILURE.\nCan be one of ReloadNotSupported, RepositoryLocationNotFound, or RepositoryLocationLoadFailure. Defaults to None.

  • \n
  • message (Optional[str], optional) \u2013 the failure message/reason if\nstatus == ReloadRepositoryLocationStatus.FAILURE. Defaults to None.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationStatus(value)[source]\u00b6
\n

This enum describes the status of a GraphQL mutation to reload a Dagster repository location.

\n
\n
Parameters:
\n

Enum (str) \u2013 can be either ReloadRepositoryLocationStatus.SUCCESS\nor ReloadRepositoryLocationStatus.FAILURE.

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-graphql", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-wandb/", "title": "Weights & Biases (dagster-wandb)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagstermill/", "title": "Dagstermill"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-wandb", "Weights & Biases (dagster-wandb)", "N", "next"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-graphql.rst.txt", "title": "GraphQL (dagster-graphql)", "toc": "\n"}, "dagster-k8s": {"alabaster_version": "0.7.13", "body": "
\n

Kubernetes (dagster-k8s)\u00b6

\n

See also the Kubernetes deployment guide.

\n

This library contains utilities for running Dagster with Kubernetes. This includes a Python API\nallowing the webserver to launch runs as Kubernetes Jobs, as well as a Helm chart you can use as the basis\nfor a Dagster deployment on a Kubernetes cluster.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_k8s.K8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See: https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
instance_config_map (dagster.StringSource):
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional):
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. Will be mounted and supplied as an environment variable to the Job Pod. Secret must contain the key "postgresql-password" which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional):
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional):
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
run_k8s_config (strict dict, optional):
\n

Raw Kubernetes configuration for launched runs.

\n
\nConfig Schema:
\n
container_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_namespace (dagster.StringSource, optional):
\n

Default Value: \u2018default\u2019

\n
\n
\n

RunLauncher that starts a Kubernetes Job for each Dagster job run.

\n

Encapsulates each run in a separate, isolated invocation of dagster-graphql.

\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    service_account_name: your_service_account\n    job_image: my_project/dagster_image:latest\n    instance_config_map: dagster-instance\n    postgres_password_secret: dagster-postgresql-secret\n
\n
\n
\n\n
\n
\ndagster_k8s.k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See: https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
load_incluster_config (Bool, optional):
\n

Whether or not the executor is running within a k8s cluster already. If\nthe job is using the K8sRunLauncher, the default value of this parameter will be\nthe same as the corresponding value on the run launcher.\nIf True, we assume the executor is running within the target cluster and load config\nusing kubernetes.config.load_incluster_config. Otherwise, we will use the k8s config\nspecified in kubeconfig_file (using kubernetes.config.load_kube_config) or fall\nback to the default kubeconfig.

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

Path to a kubeconfig file to use, if not using default kubeconfig. If\nthe job is using the K8sRunLauncher, the default value of this parameter will be\nthe same as the corresponding value on the run launcher.

\n
\n
job_namespace (dagster.StringSource, optional):
\n

\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
max_concurrent (dagster.IntSource, optional):
\n

Limit on the number of pods that will run concurrently within the scope of a Dagster run. Note that this limit is per run, not global.

\n
\n
tag_concurrency_limits (List[strict dict], optional):
\n

A set of limits that are applied to steps with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key. Note that these limits are per run, not global.

\n
\n
step_k8s_config (strict dict, optional):
\n

Raw Kubernetes configuration for each step launched by the executor.

\n
\nConfig Schema:
\n
container_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Executor which launches steps as Kubernetes Jobs.

\n

To use the k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster_k8s import k8s_job_executor\n\nfrom dagster import job\n\n@job(executor_def=k8s_job_executor)\ndef k8s_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    job_namespace: 'some-namespace'\n    image_pull_policy: ...\n    image_pull_secrets: ...\n    service_account_name: ...\n    env_config_maps: ...\n    env_secrets: ...\n    env_vars: ...\n    job_image: ... # leave out if using userDeployments\n    max_concurrent: ...\n
\n
\n

max_concurrent limits the number of pods that will execute concurrently for one run. By default\nthere is no limit; it will run with the maximum parallelism allowed by the DAG. Note that this is not a\nglobal limit.

\n

Configuration set on the Kubernetes Jobs and Pods created by the K8sRunLauncher will also be\nset on Kubernetes Jobs and Pods created by the k8s_job_executor.

\n

Configuration set using tags on a @job will only apply to the run level. For configuration\nto apply at each step it must be set using tags for each @op.

\n
\n\n
\n

Ops\u00b6

\n
\n
\ndagster_k8s.k8s_job_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See: https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
image (dagster.StringSource):
\n

The image in which to launch the k8s job.

\n
\n
command (List[String], optional):
\n

The command to run in the container within the launched k8s job.

\n
\n
args (List[String], optional):
\n

The args for the command for the container.

\n
\n
namespace (dagster.StringSource, optional):
\n

\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
timeout (Int, optional):
\n

How long to wait for the job to succeed before raising an exception

\n
\n
container_config (permissive dict, optional):
\n

Raw k8s config for the k8s pod\u2019s main container (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core). Keys can be either snake_case or camelCase.

\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n

Raw k8s config for the k8s pod\u2019s metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta). Keys can be either snake_case or camelCase.

\n
\n
pod_spec_config (permissive dict, optional):
\n

Raw k8s config for the k8s pod\u2019s pod spec (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec). Keys can be either snake_case or camelCase.

\n
\n
job_metadata (permissive dict, optional):
\n

Raw k8s config for the k8s job\u2019s metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta). Keys can be either snake_case or camelCase.

\n
\n
job_spec_config (permissive dict, optional):
\n

Raw k8s config for the k8s job\u2019s job spec (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch). Keys can be either snake_case or camelCase.

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An op that runs a Kubernetes job using the k8s API.

\n

Contrast with the k8s_job_executor, which runs each Dagster op in a Dagster job in its\nown k8s job.

\n
\n
This op may be useful when:
    \n
  • You need to orchestrate a command that isn\u2019t a Dagster op (or isn\u2019t written in Python)

  • \n
  • You want to run the rest of a Dagster job using a specific executor, and only a single\nop in k8s.

  • \n
\n
\n
\n

For example:

\n
from dagster_k8s import k8s_job_op\n\nfrom dagster import job\n\nfirst_op = k8s_job_op.configured(\n    {\n        "image": "busybox",\n        "command": ["/bin/sh", "-c"],\n        "args": ["echo HELLO"],\n    },\n    name="first_op",\n)\nsecond_op = k8s_job_op.configured(\n    {\n        "image": "busybox",\n        "command": ["/bin/sh", "-c"],\n        "args": ["echo GOODBYE"],\n    },\n    name="second_op",\n)\n\n@job\ndef full_job():\n    second_op(first_op())\n
\n
\n

You can create your own op with the same implementation by calling the execute_k8s_job function\ninside your own op.

\n

The service account that is used to run this job should have the following RBAC permissions:

\n
rules:\n  - apiGroups: ["batch"]\n    resources: ["jobs", "jobs/status"]\n    verbs: ["*"]\n  # The empty arg "" corresponds to the core API group\n  - apiGroups: [""]\n    resources: ["pods", "pods/log", "pods/status"]\n    verbs: ["*"]\n
\n
\n
\n\n
\n
\ndagster_k8s.execute_k8s_job(context, image, command=None, args=None, namespace=None, image_pull_policy=None, image_pull_secrets=None, service_account_name=None, env_config_maps=None, env_secrets=None, env_vars=None, volume_mounts=None, volumes=None, labels=None, resources=None, scheduler_name=None, load_incluster_config=True, kubeconfig_file=None, timeout=None, container_config=None, pod_template_spec_metadata=None, pod_spec_config=None, job_metadata=None, job_spec_config=None, k8s_job_name=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

This function is a utility for executing a Kubernetes job from within a Dagster op.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n

Python API\u00b6

\n

The K8sRunLauncher allows webserver instances to be configured to launch new runs by starting\nper-run Kubernetes Jobs. To configure the K8sRunLauncher, your dagster.yaml should\ninclude a section like:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    image_pull_secrets:\n    service_account_name: dagster\n    job_image: "my-company.com/image:latest"\n    dagster_home: "/opt/dagster/dagster_home"\n    postgres_password_secret: "dagster-postgresql-secret"\n    image_pull_policy: "IfNotPresent"\n    job_namespace: "dagster"\n    instance_config_map: "dagster-instance"\n    env_config_maps:\n      - "dagster-k8s-job-runner-env"\n    env_secrets:\n      - "dagster-k8s-some-secret"\n
\n
\n
\n
\n

Helm chart\u00b6

\n

For local dev (e.g., on kind or minikube):

\n
helm install \\\n    --set dagsterWebserver.image.repository="dagster.io/buildkite-test-image" \\\n    --set dagsterWebserver.image.tag="py310-latest" \\\n    --set job_runner.image.repository="dagster.io/buildkite-test-image" \\\n    --set job_runner.image.tag="py310-latest" \\\n    --set imagePullPolicy="IfNotPresent" \\\n    dagster \\\n    helm/dagster/\n
\n
\n

Upon installation, the Helm chart will provide instructions for port forwarding\nthe Dagster webserver and Flower (if configured).

\n
\n
\n

Running tests\u00b6

\n

To run the unit tests:

\n
pytest -m "not integration"\n
\n
\n

To run the integration tests, you must have Docker,\nkind,\nand helm installed.

\n

On macOS:

\n
brew install kind\nbrew install helm\n
\n
\n

Docker must be running.

\n

You may experience slow first test runs thanks to image pulls (run pytest -svv --fulltrace for\nvisibility). Building images and loading them to the kind cluster is slow, and there is\nno visibility into the progress of the load.

\n

NOTE: This process is quite slow, as it requires bootstrapping a local kind cluster with\nDocker images and the dagster-k8s Helm chart. For faster development, you can either:

\n
    \n
  1. Keep a warm kind cluster

  2. \n
  3. Use a remote K8s cluster, e.g. via AWS EKS or GCP GKE

  4. \n
\n

Instructions are below.

\n
\n

Faster local development (with kind)\u00b6

\n

You may find that the kind cluster creation, image loading, and Helm chart installation loop\nis too slow for effective local dev.

\n

You may bypass cluster creation and image loading in the following way. First add the --no-cleanup\nflag to your pytest invocation:

\n
pytest --no-cleanup -s -vvv -m "not integration"\n
\n
\n

The tests will run as before, but the kind cluster will be left running after the tests are completed.

\n

For subsequent test runs, you can run:

\n
pytest --kind-cluster="cluster-d9971c84d44d47f382a2928c8c161faa" --existing-helm-namespace="dagster-test-95590a" -s -vvv -m "not integration"\n
\n
\n

This will bypass cluster creation, image loading, and Helm chart installation, for much faster tests.

\n

The kind cluster name and Helm namespace for this command can be found in the logs, or retrieved\nvia the respective CLIs, using kind get clusters and kubectl get namespaces. Note that\nfor kubectl and helm to work correctly with a kind cluster, you should override your\nkubeconfig file location with:

\n
kind get kubeconfig --name kind-test > /tmp/kubeconfig\nexport KUBECONFIG=/tmp/kubeconfig\n
\n
\n
\n
\n

Manual kind cluster setup\u00b6

\n

The test fixtures provided by dagster-k8s automate the process described below, but sometimes\nit\u2019s useful to manually configure a kind cluster and load images onto it.

\n

First, ensure you have a Docker image appropriate for your Python version. Run, from the root of\nthe repo:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6 \\\n    dagster.io.priv/buildkite-test-image:py310-latest\n
\n
\n

In the above invocation, the Python majmin version should be appropriate for your desired tests.

\n

Then run the following commands to create the cluster and load the image. Note that there is no\nfeedback from the loading process.

\n
kind create cluster --name kind-test\nkind load docker-image --name kind-test dagster.io/dagster-docker-buildkite:py310-latest\n
\n
\n

If you are deploying the Helm chart with an in-cluster Postgres (rather than an external database),\nand/or with dagster-celery workers (and a RabbitMQ), you\u2019ll also want to have images present for\nrabbitmq and postgresql:

\n
docker pull docker.io/bitnami/rabbitmq\ndocker pull docker.io/bitnami/postgresql\n\nkind load docker-image --name kind-test docker.io/bitnami/rabbitmq:latest\nkind load docker-image --name kind-test docker.io/bitnami/postgresql:latest\n
\n
\n

Then you can run pytest as follows:

\n
pytest --kind-cluster=kind-test\n
\n
\n
\n
\n
\n

Faster local development (with an existing K8s cluster)\u00b6

\n

If you already have a development K8s cluster available, you can run tests on that cluster vs.\nrunning locally in kind.

\n

For this to work, first build and deploy the test image to a registry available to your cluster.\nFor example, with a private ECR repository:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6\ndocker tag dagster-docker-buildkite:latest $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n\naws ecr get-login --no-include-email --region us-west-2 | sh\ndocker push $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n
\n
\n

Then, you can run tests on EKS with:

\n
export DAGSTER_DOCKER_IMAGE_TAG="2020-04-21T21-04-06"\nexport DAGSTER_DOCKER_REPOSITORY="$AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com"\nexport DAGSTER_DOCKER_IMAGE="dagster-k8s-tests"\n\n# First run with --no-cleanup to leave Helm chart in place\npytest --cluster-provider="kubeconfig" --no-cleanup -s -vvv\n\n# Subsequent runs against existing Helm chart\npytest --cluster-provider="kubeconfig" --existing-helm-namespace="dagster-test-<some id>" -s -vvv\n
\n
\n
\n
\n

Validating Helm charts\u00b6

\n

To test / validate Helm charts, you can run:

\n
helm install dagster --dry-run --debug helm/dagster\nhelm lint\n
\n
\n
\n
\n

Enabling GCR access from Minikube\u00b6

\n

To enable GCR access from Minikube:

\n
kubectl create secret docker-registry element-dev-key \\\n    --docker-server=https://gcr.io \\\n    --docker-username=oauth2accesstoken \\\n    --docker-password="$(gcloud auth print-access-token)" \\\n    --docker-email=my@email.com\n
\n
\n
\n
\n

A note about PVCs\u00b6

\n

Both the Postgres and the RabbitMQ Helm charts will store credentials using Persistent Volume\nClaims, which will outlive test invocations and calls to helm uninstall. These must be deleted if\nyou want to change credentials. To view your pvcs, run:

\n
kubectl get pvc\n
\n
\n
\n
\n

Testing Redis\u00b6

\n

The Redis Helm chart installs w/ a randomly-generated password by default; turn this off:

\n
helm install dagredis stable/redis --set usePassword=false\n
\n
\n

Then, to connect to your database from outside the cluster execute the following commands:

\n
kubectl port-forward --namespace default svc/dagredis-master 6379:6379\nredis-cli -h 127.0.0.1 -p 6379\n
\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-k8s", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "N", "next"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-k8s.rst.txt", "title": "Kubernetes (dagster-k8s)", "toc": "\n"}, "dagster-mlflow": {"alabaster_version": "0.7.13", "body": "
\n

MLflow (dagster-mlflow)\u00b6

\n
\n
\ndagster_mlflow.mlflow_tracking ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
experiment_name (dagster.StringSource):
\n

MLflow experiment name.

\n
\n
mlflow_tracking_uri (Union[dagster.StringSource, None], optional):
\n

MLflow tracking server URI.

\n

Default Value: None

\n
\n
parent_run_id (Union[String, None], optional):
\n

MLflow run ID of the parent run, if this is a nested run.

\n

Default Value: None

\n
\n
env (permissive dict, optional):
\n

Environment variables for mlflow setup.

\n
\nDefault Value:
{}\n
\n
\n
\n
env_to_tag (Union[List[Any], None], optional):
\n

List of environment variables to log as tags in mlflow.

\n

Default Value: None

\n
\n
extra_tags (permissive dict, optional):
\n

Any extra key-value tags to log to mlflow.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource initializes an MLflow run that\u2019s used for all steps within a Dagster run.

\n

This resource provides access to all of mlflow\u2019s methods as well as the mlflow tracking client\u2019s\nmethods.

\n

Usage:

\n
    \n
  1. Add the mlflow resource to any ops in which you want to invoke mlflow tracking APIs.

  2. \n
  3. Add the end_mlflow_on_run_finished hook to your job to end the MLflow run\nwhen the Dagster run is finished.

  4. \n
\n

Examples

\n
from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n@op(required_resource_keys={"mlflow"})\ndef mlflow_op(context):\n    mlflow.log_params(some_params)\n    mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n@end_mlflow_on_run_finished\n@job(resource_defs={"mlflow": mlflow_tracking})\ndef mlf_example():\n    mlflow_op()\n\n# example using an mlflow instance with s3 storage\nmlf_example.execute_in_process(run_config={\n    "resources": {\n        "mlflow": {\n            "config": {\n                "experiment_name": my_experiment,\n                "mlflow_tracking_uri": "http://localhost:5000",\n\n                # if want to run a nested run, provide parent_run_id\n                "parent_run_id": an_existing_mlflow_run_id,\n\n                # env variables to pass to mlflow\n                "env": {\n                    "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n                    "AWS_ACCESS_KEY_ID": my_aws_key_id,\n                    "AWS_SECRET_ACCESS_KEY": my_secret,\n                },\n\n                # env variables you want to log as mlflow tags\n                "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n                # key-value tags to add to your experiment\n                "extra_tags": {"super": "experiment"},\n            }\n        }\n    }\n})\n
\n
\n
\n\n
\n
\ndagster_mlflow.end_mlflow_on_run_finished HookDefinition\u00b6
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mlflow", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "N", "next"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mlflow.rst.txt", "title": "MLflow (dagster-mlflow)", "toc": "\n"}, "dagster-msteams": {"alabaster_version": "0.7.13", "body": "
\n

Microsoft Teams (dagster-msteams)\u00b6

\n
\n

Resource\u00b6

\n
\n
\ndagster_msteams.MSTeamsResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
hook_url (Union[dagster.StringSource, None], optional):
\n

To send messages to MS Teams channel, an incoming webhook has to be created. The incoming webhook url must be given as a part of the resource config to the MSTeamsResource in Dagster. For more information on how to create an incoming webhook, see https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook

\n
\n
http_proxy (Union[dagster.StringSource, None], optional):
\n

HTTP proxy URL

\n
\n
https_proxy (Union[dagster.StringSource, None], optional):
\n

HTTPS proxy URL

\n
\n
timeout (Float, optional):
\n

Timeout for requests to MS Teams

\n

Default Value: 60

\n
\n
verify (dagster.BoolSource, optional):
\n

Whether to verify SSL certificates, defaults to True

\n

Default Value: True

\n
\n
\n

This resource is for connecting to Microsoft Teams.

\n

Provides a dagster_msteams.TeamsClient which can be used to\ninterface with the MS Teams API.

\n

By configuring this resource, you can post messages to MS Teams from any Dagster op,\nasset, schedule, or sensor:

\n

Examples

\n
import os\n\nfrom dagster import op, job, Definitions, EnvVar\nfrom dagster_msteams import Card, MSTeamsResource\n\n\n@op\ndef teams_op(msteams: MSTeamsResource):\n    card = Card()\n    card.add_attachment(text_message="Hello There !!")\n    msteams.get_client().post_message(payload=card.payload)\n\n\n@job\ndef teams_job():\n    teams_op()\n\ndefs = Definitions(\n    jobs=[teams_job],\n    resources={\n        "msteams": MSTeamsResource(\n            hook_url=EnvVar("TEAMS_WEBHOOK_URL")\n        )\n    }\n)\n
\n
\n
\n\n
\n
\n

Sensors\u00b6

\n
\n
\ndagster_msteams.teams_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given MS Teams webhook URL.

\n
\n
Parameters:
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this\nto allow messages to include deeplinks to the specific run that triggered\nthe hook.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this\nto allow messages to include deeplinks to the specific run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_failure(webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op.name} failed!"\n\n@op\ndef a_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    a_op.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.teams_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given MS Teams webhook URL.

\n
\n
Parameters:
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this\nto allow messages to include deeplinks to the specific run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_success(webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op.name} failed!"\n\n@op\ndef a_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    a_op.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.make_teams_on_run_failure_sensor(hook_url, message_fn=<function _default_failure_message>, http_proxy=None, https_proxy=None, timeout=60, verify=None, name=None, dagit_base_url=None, default_status=DefaultSensorStatus.STOPPED, monitored_jobs=None, monitor_all_repositories=False, webserver_base_url=None)[source]\u00b6
\n

Create a sensor on run failures that will message the given MS Teams webhook URL.

\n
\n
Parameters:
\n
    \n
  • hook_url (str) \u2013 MS Teams incoming webhook URL.

  • \n
  • message_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, job name, and run ID.

  • \n
  • http_proxy \u2013 (Optional[str]): Proxy for requests using http protocol.

  • \n
  • https_proxy \u2013 (Optional[str]): Proxy for requests using https protocol.

  • \n
  • timeout \u2013 (Optional[float]): Connection timeout in seconds. Defaults to 60.

  • \n
  • verify \u2013 (Optional[bool]): Whether to verify the server's TLS certificate.

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cteams_on_run_failure\u201d.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the failed run.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector]]]) \u2013 Jobs in the current repository that will be monitored by this sensor. Defaults to None,\nwhich means the alert will be sent when any job in the repository matches the requested\nrun_status. To monitor jobs in external repositories, use RepositorySelector and JobSelector.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the failed run.

  • \n
\n
\n
\n

Examples

\n
teams_on_run_failure = make_teams_on_run_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n)\n\n@repository\ndef my_repo():\n    return [my_job + teams_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return "Job {job_name} failed! Error: {error}".format(\n        job_name=context.dagster_run.job_name,\n        error=context.failure_event.message,\n    )\n\nteams_on_run_failure = make_teams_on_run_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n    message_fn=my_message_fn,\n    webserver_base_url="http://localhost:3000",\n)\n
\n
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_msteams.msteams_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
hook_url (Union[dagster.StringSource, None], optional):
\n

To send messages to MS Teams channel, an incoming webhook has to be created. The incoming webhook url must be given as a part of the resource config to the MSTeamsResource in Dagster. For more information on how to create an incoming webhook, see https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook

\n
\n
http_proxy (Union[dagster.StringSource, None], optional):
\n

HTTP proxy URL

\n
\n
https_proxy (Union[dagster.StringSource, None], optional):
\n

HTTPS proxy URL

\n
\n
timeout (Float, optional):
\n

Timeout for requests to MS Teams

\n

Default Value: 60

\n
\n
verify (dagster.BoolSource, optional):
\n

Whether to verify SSL certificates, defaults to True

\n

Default Value: True

\n
\n
\n

This resource is for connecting to Microsoft Teams.

\n

The resource object is a dagster_msteams.TeamsClient.

\n

By configuring this resource, you can post messages to MS Teams from any Dagster solid:

\n

Examples

\n
import os\n\nfrom dagster import op, job\nfrom dagster_msteams import Card, msteams_resource\n\n\n@op(required_resource_keys={"msteams"})\ndef teams_op(context):\n    card = Card()\n    card.add_attachment(text_message="Hello There !!")\n    context.resources.msteams.post_message(payload=card.payload)\n\n\n@job(resource_defs={"msteams": msteams_resource})\ndef teams_job():\n    teams_op()\n\n\nteams_job.execute_in_process(\n    {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-msteams", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-msteams.rst.txt", "title": "Microsoft Teams (dagster-msteams)", "toc": "\n"}, "dagster-mysql": {"alabaster_version": "0.7.13", "body": "
\n

MySQL (dagster-mysql)\u00b6

\n
\n
\nclass dagster_mysql.MySQLEventLogStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n module: dagster_mysql.event_log\n class: MySQLEventLogStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { db_name }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLRunStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n module: dagster_mysql.run_storage\n class: MySQLRunStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { database }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLScheduleStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n module: dagster_mysql.schedule_storage\n class: MySQLScheduleStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { db_name }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mysql", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "N", "next"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mysql.rst.txt", "title": "MySQL (dagster-mysql)", "toc": "\n"}, "dagster-pagerduty": {"alabaster_version": "0.7.13", "body": "
\n

PagerDuty (dagster-pagerduty)\u00b6

\n

This library provides an integration with PagerDuty, to support creating alerts from your Dagster\ncode.

\n

Presently, it provides a thin wrapper on the Events API V2.

\n
\n
\n

Getting Started\u00b6

\n

You can install this library with:

\n
pip install dagster_pagerduty\n
\n
\n

To use this integration, you\u2019ll first need to create an Events API V2 PagerDuty integration on a PagerDuty service. There are instructions\nhere for\ncreating a new PagerDuty service & integration.

\n

Once your Events API V2 integration is set up, you\u2019ll find an Integration Key (also referred to as a\n\u201cRouting Key\u201d) on the Integrations tab for your service. This key is used to authorize events\ncreated from the PagerDuty events API.

\n

Once your service/integration is created, you can provision a PagerDuty resource and issue PagerDuty\nalerts from within your ops.

\n
\n
\ndagster_pagerduty.PagerDutyService ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
routing_key (dagster.StringSource):
\n

The routing key provisions access to your PagerDuty service. You will need to include the integration key for your new integration, as a routing_key in the event payload.

\n
\n
\n

This resource is for posting events to PagerDuty.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_pagerduty.pagerduty_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
routing_key (dagster.StringSource):
\n

The routing key provisions access to your PagerDuty service. You will need to include the integration key for your new integration, as a routing_key in the event payload.

\n
\n
\n

A resource for posting events (alerts) to PagerDuty.

\n

Example

\n
@op\ndef pagerduty_op(pagerduty: PagerDutyService):\n    pagerduty.EventV2_create(\n        summary='alert from dagster'\n        source='localhost',\n        severity='error',\n        event_action='trigger',\n    )\n\n@job(resource_defs={ 'pagerduty': pagerduty_resource })\ndef pagerduty_test():\n    pagerduty_op()\n\npagerduty_test.execute_in_process(\n    run_config={\n        "resources": {\n            'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n        }\n    }\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pagerduty", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pagerduty.rst.txt", "title": "PagerDuty (dagster-pagerduty)", "toc": "\n"}, "dagster-pandas": {"alabaster_version": "0.7.13", "body": "
\n

Pandas (dagster-pandas)\u00b6

\n

The dagster_pandas library provides utilities for using pandas with Dagster and for implementing\nvalidation on pandas DataFrames. A good place to start with dagster_pandas is the validation\nguide.

\n
\n
\ndagster_pandas.create_dagster_pandas_dataframe_type(name, description=None, columns=None, metadata_fn=None, dataframe_constraints=None, loader=None)[source]\u00b6
\n

Constructs a custom pandas dataframe dagster type.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the dagster pandas type.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • columns (Optional[List[PandasColumn]]) \u2013 A list of PandasColumn objects\nwhich express dataframe column schemas and constraints.

  • \n
  • metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]]) \u2013 A callable which takes your dataframe and returns a dict with string label keys and\nMetadataValue values.

  • \n
  • dataframe_constraints (Optional[List[DataFrameConstraint]]) \u2013 A list of objects that inherit from\nDataFrameConstraint. This allows you to express dataframe-level constraints.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader. If None, we will default\nto using dataframe_loader.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.RowCountConstraint(num_allowed_rows, error_tolerance=0)[source]\u00b6
\n

A dataframe constraint that validates the expected count of rows.

\n
\n
Parameters:
\n
    \n
  • num_allowed_rows (int) \u2013 The number of allowed rows in your dataframe.

  • \n
  • error_tolerance (Optional[int]) \u2013 The acceptable threshold if you are not completely certain. Defaults to 0.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.StrictColumnsConstraint(strict_column_list, enforce_ordering=False)[source]\u00b6
\n

A dataframe constraint that validates column existence and ordering.

\n
\n
Parameters:
\n
    \n
  • strict_column_list (List[str]) \u2013 The exact list of columns that your dataframe must have.

  • \n
  • enforce_ordering (Optional[bool]) \u2013 If true, will enforce that the ordering of column names must match.\nDefault is False.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.PandasColumn(name, constraints=None, is_required=None)[source]\u00b6
\n

The main API for expressing column level schemas and constraints for your custom dataframe\ntypes.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
  • constraints (Optional[List[Constraint]]) \u2013 List of constraint objects that indicate the\nvalidation rules for the pandas column.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_pandas.DataFrame = <dagster._core.types.dagster_type.DagsterType object>\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters:
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

    The unique key to identify types programmatically.\nThe key property always has a value. If you omit key to the argument\nto the init function, it instead receives the value of name. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to None. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pandas", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pandera/", "title": "Pandera (dagster-pandera)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pandera", "Pandera (dagster-pandera)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pandas.rst.txt", "title": "Pandas (dagster-pandas)", "toc": "\n"}, "dagster-pandera": {"alabaster_version": "0.7.13", "body": "
\n

Pandera (dagster-pandera)\u00b6

\n

The dagster_pandera library allows Dagster users to use dataframe validation library Pandera for the validation of Pandas dataframes. See the guide for details.

\n
\n
\ndagster_pandera.pandera_schema_to_dagster_type(schema)[source]\u00b6
\n

Convert a Pandera dataframe schema to a DagsterType.

\n

The generated Dagster type will be given an automatically generated name. The schema\u2019s title\nproperty, name property, or class name (in that order) will be used. If neither title nor\nname is defined, a name of the form DagsterPanderaDataframe<n> is generated.

\n

Additional metadata is also extracted from the Pandera schema and attached to the returned\nDagsterType as a metadata dictionary. The extracted metadata includes:

\n
    \n
  • Descriptions on the schema and constituent columns and checks.

  • \n
  • Data types for each column.

  • \n
  • String representations of all column-wise checks.

  • \n
  • String representations of all row-wise (i.e. \u201cwide\u201d) checks.

  • \n
\n

The returned DagsterType type will call the Pandera schema\u2019s validate() method in its type\ncheck function. Validation is done in lazy mode, i.e. pandera will attempt to validate all\nvalues in the dataframe, rather than stopping on the first error.

\n

If validation fails, the returned TypeCheck object will contain two pieces of metadata:

\n
    \n
  • num_failures total number of validation errors.

  • \n
  • failure_sample a table containing up to the first 10 validation errors.

  • \n
\n
\n
Parameters:
\n

schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]) \u2013

\n
\n
Returns:
\n

Dagster Type constructed from the Pandera schema.

\n
\n
Return type:
\n

DagsterType

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pandera", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pandera.rst.txt", "title": "Pandera (dagster-pandera)", "toc": "\n"}, "dagster-papertrail": {"alabaster_version": "0.7.13", "body": "
\n

Papertrail (dagster-papertrail)\u00b6

\n

This library provides an integration with Papertrail for logging.

\n

You can easily set up your Dagster job to log to Papertrail. You\u2019ll need an active Papertrail\naccount, and have your papertrail URL and port handy.

\n
\n
\ndagster_papertrail.papertrail_logger LoggerDefinition\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-papertrail", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pandera/", "title": "Pandera (dagster-pandera)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pandera", "Pandera (dagster-pandera)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-papertrail.rst.txt", "title": "Papertrail (dagster-papertrail)", "toc": "\n"}, "dagster-postgres": {"alabaster_version": "0.7.13", "body": "
\n

PostgreSQL (dagster-postgres)\u00b6

\n
\n
\ndagster_postgres.PostgresEventLogStorage = <class 'dagster_postgres.event_log.event_log.PostgresEventLogStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional):
\n

\n
postgres_db (strict dict, optional):
\n
\nConfig Schema:
\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
hostname (dagster.StringSource):
\n

\n
db_name (dagster.StringSource):
\n

\n
port (dagster.IntSource, optional):
\n

Default Value: 5432

\n
\n
params (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
scheme (dagster.StringSource, optional):
\n

Default Value: \u2018postgresql\u2019

\n
\n
\n
\n
should_autocreate_tables (Bool, optional):
\n

Default Value: True

\n
\n
\n

Postgres-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for all of the components of your instance storage, you can add the following\nblock to your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n

If you are configuring the different storage components separately and are specifically\nconfiguring your event log storage to use Postgres, you can add a block such as the following\nto your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n  module: dagster_postgres.event_log\n  class: PostgresEventLogStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresRunStorage = <class 'dagster_postgres.run_storage.run_storage.PostgresRunStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional):
\n

\n
postgres_db (strict dict, optional):
\n
\nConfig Schema:
\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
hostname (dagster.StringSource):
\n

\n
db_name (dagster.StringSource):
\n

\n
port (dagster.IntSource, optional):
\n

Default Value: 5432

\n
\n
params (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
scheme (dagster.StringSource, optional):
\n

Default Value: \u2018postgresql\u2019

\n
\n
\n
\n
should_autocreate_tables (Bool, optional):
\n

Default Value: True

\n
\n
\n

Postgres-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for all of the components of your instance storage, you can add the following\nblock to your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n

If you are configuring the different storage components separately and are specifically\nconfiguring your run storage to use Postgres, you can add a block such as the following\nto your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n  module: dagster_postgres.run_storage\n  class: PostgresRunStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresScheduleStorage = <class 'dagster_postgres.schedule_storage.schedule_storage.PostgresScheduleStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional):
\n

\n
postgres_db (strict dict, optional):
\n
\nConfig Schema:
\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
hostname (dagster.StringSource):
\n

\n
db_name (dagster.StringSource):
\n

\n
port (dagster.IntSource, optional):
\n

Default Value: 5432

\n
\n
params (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
scheme (dagster.StringSource, optional):
\n

Default Value: \u2018postgresql\u2019

\n
\n
\n
\n
should_autocreate_tables (Bool, optional):
\n

Default Value: True

\n
\n
\n

Postgres-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for all of the components of your instance storage, you can add the following\nblock to your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n

If you are configuring the different storage components separately and are specifically\nconfiguring your schedule storage to use Postgres, you can add a block such as the following\nto your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n  module: dagster_postgres.schedule_storage\n  class: PostgresScheduleStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-postgres", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "N", "next"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-postgres.rst.txt", "title": "PostgreSQL (dagster-postgres)", "toc": "\n"}, "dagster-prometheus": {"alabaster_version": "0.7.13", "body": "
\n

Prometheus (dagster-prometheus)\u00b6

\n
\n
\ndagster_prometheus.PrometheusResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gateway (dagster.StringSource):
\n

The url for your push gateway. Either of the form \u2018http://pushgateway.local\u2019, or \u2018pushgateway.local\u2019. Scheme defaults to \u2018http\u2019 if none is provided

\n
\n
timeout (dagster.IntSource, optional):
\n

How long to attempt to connect to the gateway before giving up. Defaults to 30s.

\n

Default Value: 30

\n
\n
\n

This resource is used to send metrics to a Prometheus Pushgateway.

\n

Example:

\n
from dagster_prometheus import PrometheusResource\nfrom dagster import Definitions, job, op\n\n@op\ndef example_prometheus_op(prometheus: PrometheusResource):\n    prometheus.push_to_gateway(job="my_job")\n\n@job\ndef my_job():\n    example_prometheus_op()\n\ndefs = Definitions(\n    jobs=[my_job],\n    resources={"prometheus": PrometheusResource(gateway="http://pushgateway.local")},\n)\n
\n
\n
\n\n
\n
\nclass dagster_prometheus.resources.PrometheusClient[source]\u00b6
\n

Integrates with Prometheus via the prometheus_client library.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_prometheus.prometheus_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gateway (dagster.StringSource):
\n

The url for your push gateway. Either of the form \u2018http://pushgateway.local\u2019, or \u2018pushgateway.local\u2019. Scheme defaults to \u2018http\u2019 if none is provided

\n
\n
timeout (dagster.IntSource, optional):
\n

How long to attempt to connect to the gateway before giving up. Defaults to 30s.

\n

Default Value: 30

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-prometheus", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-prometheus.rst.txt", "title": "Prometheus (dagster-prometheus)", "toc": "\n"}, "dagster-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

Pyspark (dagster-pyspark)\u00b6

\n
\n
\ndagster_pyspark.PySparkResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_config (dict):
\n

\n
\n

This resource provides access to a PySpark Session for executing PySpark code within Dagster.

\n

Example

\n
@op\ndef my_op(pyspark: PySparkResource):\n    spark_session = pyspark.spark_session\n    dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n\n@job(\n    resource_defs={\n        "pyspark": PySparkResource(\n            spark_config={\n                "spark.executor.memory": "2g"\n            }\n        )\n    }\n)\ndef my_spark_job():\n    my_op()\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_pyspark.pyspark_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_conf (permissive dict, optional):
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            
"referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional):
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": 
{\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional):
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional):
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional):
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional):
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the \u2013driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the \u2013driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional):
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional):
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional):
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
executor (permissive dict, optional):
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching executor JVM\u2019s.

\n
\n
logs (permissive dict, optional):
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional):
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional):
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional):
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional):
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional):
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional):
\n

Execution Behavior: The number of cores to use on each executor. In standalone and Mesos coarse-grained modes, for more detail, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional):
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
extraListeners (dagster.StringSource, optional):
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional):
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional):
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional):
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URL\u2019s.

\n
\n
submit (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional):
\n

Application Properties: The deploy mode of the Spark driver program, either \u201cclient\u201d or \u201ccluster\u201d, which means to launch the driver program locally (\u201cclient\u201d) or remotely (\u201ccluster\u201d) on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional):
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional):
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional):
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Runtime Environment: Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), or it will be displayed before the driver exits. It also can be dumped into disk by sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional):
\n

Runtime Environment: The directory which is used to dump the profile result before the driver exits. The results will be dumped as a separate file for each RDD. They can be loaded by pstats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional):
\n

Runtime Environment: Reuse Python worker or not. If yes, it will use a fixed number of Python workers and does not need to fork() a Python process for every task. It will be very useful if there is a large broadcast, since the broadcast will then not need to be transferred from JVM to Python worker for every task.

\n
\n
\n
\n
\n
\n
files (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional):
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional):
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional):
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional):
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional):
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes that could be scanned at the same time. This is used when putting multiple files into a partition. It is better to overestimate; then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given, artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option --repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional):
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional):
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option --repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with --packages or spark.jars.packages.

\n
\n
\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional):
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increases, it might lead to a very large number of inbound connections to one or more nodes, causing the workers to fail under load. By allowing it to limit the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional):
\n

Shuffle Behavior: The remote block will be fetched to disk when size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (e.g. 200m) to avoid using too much memory on smaller blocks as well. Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be used when external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional):
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional):
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional):
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional):
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional):
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional):
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional):
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional):
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional):
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait); if those limits are reached, the task will fail with a fetch failure.

\n
\n
sort (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetching shuffle blocks.

\n
\n
registration (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional):
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional):
\n

Shuffle Behavior: When we fail to register to the external shuffle service, we will retry for maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills occur often, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
eventLog (permissive dict, optional):
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional):
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional):
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional):
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional):
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional):
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional):
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional):
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional):
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional):
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional):
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional):
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional):
\n

Spark UI: Enable running Spark Master as reverse proxy for worker and application UIs. In this mode, Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution, as worker and application UI will not be accessible directly, you will only be able to access them through spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional):
\n

Spark UI: This is the URL where your proxy is running. This URL is for proxy which is running in front of Spark Master. This is useful when running proxy for authentication e.g. OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional):
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional):
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional):
\n

Spark UI: Comma separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value> For example: spark.ui.filters=com.test.filter1 spark.com.test.filter1.param.name1=foo spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional):
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional):
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional):
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional):
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional):
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional):
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance tuning section in the Spark Streaming programming guide for more details.

\n
\n
receiver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional):
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional):
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional):
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional):
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
broadcast (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional):
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional):
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional):
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional):
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional):
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional):
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
kryo (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option can enforce strictly that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to use unsafe based Kryo serializer. Can be substantially faster by using Unsafe Based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional):
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional):
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional):
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data, however that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer, and allow old objects to be collected. To turn off this periodic reset set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
memory (permissive dict, optional):
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional):
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional):
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional):
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional):
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction, spark.storage.memoryFraction, spark.storage.unrollFraction.

\n
\n
\n
\n
storage (permissive dict, optional):
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional):
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional):
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional):
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional):
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by the spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional):
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional):
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional):
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional):
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional):
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional):
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional):
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional):
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
rpc (permissive dict, optional):
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional):
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional):
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional):
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional):
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional):
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: How long the connection waits for an ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses such as GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional):
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum amount of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional):
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional):
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional):
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional):
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional):
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional):
\n

Scheduling: Capacity for event queue in Spark listener bus, must be greater than 0. Consider increasing value (e.g. 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
blacklist (permissive dict, optional):
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors are marked as blacklisted for a given stage, before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional):
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional):
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional):
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional):
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional):
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional):
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional):
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If --num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional):
\n

Dynamic Allocation: By default, the dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors w.r.t. full parallelism. Defaults to 1.0 to give maximum parallelism. 0.5 will divide the target number of executors by 2. The target number of executors computed by the dynamicAllocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
r (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional):
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing sparkR shell in client modes for driver. Ignored in cluster modes. It is the same as environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for sparkR shell while spark.r.driver.command is used for running R script.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional):
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional):
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional):
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional):
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid stackOverflowError due to long lineage chains after lots of iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional):
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional):
\n

Deploy: The recovery mode setting used to recover submitted Spark jobs in cluster mode when they fail and are relaunched. This is only applicable for cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n

This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.

\n

Example

\n
@op(required_resource_keys={"pyspark"})\ndef my_op(context):\n    spark_session = context.resources.pyspark.spark_session\n    dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\nmy_pyspark_resource = pyspark_resource.configured(\n    {"spark_conf": {"spark.executor.memory": "2g"}}\n)\n\n@job(resource_defs={"pyspark": my_pyspark_resource})\ndef my_spark_job():\n    my_op()\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "N", "next"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pyspark.rst.txt", "title": "Pyspark (dagster-pyspark)", "toc": "\n"}, "dagster-shell": {"alabaster_version": "0.7.13", "body": "
\n

Shell (dagster-shell)\u00b6

\n

The Dagster shell library provides utilities and op factories for executing inline shell scripts or script files.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_shell.create_shell_command_op(shell_command, name, description=None, required_resource_keys=None, tags=None)[source]\u00b6
\n

This function is a factory that constructs ops to execute a shell command.

\n

Note that you can only use shell_command_op if you know the command you\u2019d like to execute\nat job construction time. If you\u2019d like to construct shell commands dynamically during\njob execution and pass them between ops, you should use shell_op instead.

\n

The resulting op can take a single start argument that is a\nNothing dependency\nto allow you to run ops before the shell op.

\n

Examples

\n
from dagster import graph\nfrom dagster_shell import create_shell_command_op\n\n\n@graph\ndef my_graph():\n    a = create_shell_command_op('echo "hello, world!"', name="a")\n    a()\n
\n
\n
@op\ndef run_before_shell_op():\n    do_some_work()\n\n@graph\ndef my_graph():\n    my_echo_op = create_shell_command_op("echo hello world!", name="echo_op")\n    my_echo_op(start=run_before_shell_op())\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_command (str) \u2013 The shell command that the constructed op will execute.

  • \n
  • name (str) \u2013 The name of the constructed op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.\nSetting this ensures that resource spin up for the required resources will occur before\nthe shell command is executed.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
Raises:
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns:
\n

Returns the constructed op definition.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.create_shell_script_op(shell_script_path, name='create_shell_script_op', ins=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs an op that will execute a shell command read\nfrom a script file.

\n

Any kwargs passed to this function will be passed along to the underlying @op decorator. However, note that overriding config or output_defs is not\nsupported.

\n

You might consider using @graph to wrap this op\nin the cases where you\u2019d like to configure the shell op with different config fields.

\n

If no ins are passed then the resulting op can take a single start argument that is a\nNothing dependency\nto allow you to run ops before the shell op.

\n

Examples

\n
from dagster import file_relative_path, graph\nfrom dagster_shell import create_shell_script_op\n\n\n@graph\ndef my_graph():\n    a = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="a")\n    a()\n
\n
\n
@op\ndef run_before_shell_op():\n    do_some_work()\n\n@graph\ndef my_graph():\n    my_echo_op = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="echo_op")\n    my_echo_op(start=run_before_shell_op())\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_script_path (str) \u2013 The script file to execute.

  • \n
  • name (Optional[str]) \u2013 The name of this op. Defaults to \u201ccreate_shell_script_op\u201d.

  • \n
  • ins (Optional[Mapping[str, In]]) \u2013 Ins for the op. Defaults to\na single Nothing input.

  • \n
\n
\n
Raises:
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns:
\n

Returns the constructed op definition.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.shell_op(context, shell_command, config)[source]\u00b6
\n

This op executes a shell command it receives as input.\nThis op is suitable for uses where the command to execute is generated dynamically by\nupstream ops. If you know the command to execute at job construction time,\nconsider shell_command_op instead.

\n
\n
Parameters:
\n
    \n
  • shell_command \u2013 The shell command to be executed

  • \n
  • config (ShellOpConfig) \u2013 A ShellOpConfig object specifying configuration options

  • \n
\n
\n
\n

Examples

\n
@op\ndef create_shell_command():\n    return "echo hello world!"\n\n@graph\ndef echo_graph():\n    shell_op(create_shell_command())\n
\n
\n
\n\n
\n
\ndagster_shell.execute_shell_command(shell_command, output_logging, log, cwd=None, env=None)\u00b6
\n

This function is a utility for executing shell commands from within a Dagster op (or from Python in general).\nIt can be used to execute shell commands on either op input data, or any data generated within a generic python op.

\n

Internally, it executes a shell script specified by the argument shell_command. The script will be written\nto a temporary file first and invoked via subprocess.Popen(['bash', shell_script_path], ...).

\n

In the Popen invocation, stdout=PIPE, stderr=STDOUT is used, and the combined stdout/stderr\noutput is retrieved.

\n

Examples

\n
from dagster import OpExecutionContext, op\nfrom dagster_shell import execute_shell_command\n\n\n@op\ndef my_shell_op(context: OpExecutionContext, data: str):\n    temp_file = "/tmp/data.txt"\n    with open(temp_file, "w", encoding="utf-8") as temp_file_writer:\n        temp_file_writer.write(data)\n        execute_shell_command(f"cat {temp_file}", output_logging="STREAM", log=context.log)\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_command (str) \u2013 The shell command to execute

  • \n
  • output_logging (str) \u2013 The logging mode to use. Supports STREAM, BUFFER, and NONE.

  • \n
  • log (Union[logging.Logger, DagsterLogManager]) \u2013 Any logger which responds to .info()

  • \n
  • cwd (str, optional) \u2013 Working directory for the shell command to use. Defaults to the\ntemporary path where we store the shell command in a script file.

  • \n
  • env (Dict[str, str], optional) \u2013 Environment dictionary to pass to subprocess.Popen.\nUnused by default.

  • \n
\n
\n
Returns:
\n

A tuple where the first element is the combined stdout/stderr output of running the shell\ncommand and the second element is the return code.

\n
\n
Return type:
\n

Tuple[str, int]

\n
\n
\n
\n\n
\n
\ndagster_shell.execute_shell_script(shell_script_path, output_logging, log, cwd=None, env=None)\u00b6
\n

Execute a shell script file specified by the argument shell_script_path. The script will be\ninvoked via subprocess.Popen(['bash', shell_script_path], ...).

\n

In the Popen invocation, stdout=PIPE, stderr=STDOUT is used, and the combined stdout/stderr\noutput is retrieved.

\n

Examples

\n
from dagster import OpExecutionContext, op\nfrom dagster_shell import execute_shell_script\n\n\n@op\ndef my_shell_op(context: OpExecutionContext, data: str):\n    temp_file = "/tmp/echo_data.sh"\n    with open(temp_file, "w", encoding="utf-8") as temp_file_writer:\n        temp_file_writer.write(f"echo {data}")\n        execute_shell_script(temp_file, output_logging="STREAM", log=context.log)\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_script_path (str) \u2013 The shell script to execute.

  • \n
  • output_logging (str) \u2013 The logging mode to use. Supports STREAM, BUFFER, and NONE.

  • \n
  • log (Union[logging.Logger, DagsterLogManager]) \u2013 Any logger which responds to .info()

  • \n
  • cwd (str, optional) \u2013 Working directory for the shell command to use. Defaults to the\ntemporary path where we store the shell command in a script file.

  • \n
  • env (Dict[str, str], optional) \u2013 Environment dictionary to pass to subprocess.Popen.\nUnused by default.

  • \n
\n
\n
Raises:
\n

Exception \u2013 When an invalid output_logging is selected. Unreachable from op-based\n invocation since the config system will check output_logging against the config\n enum.

\n
\n
Returns:
\n

A tuple where the first element is the combined stdout/stderr output of running the shell\ncommand and the second element is the return code.

\n
\n
Return type:
\n

Tuple[str, int]

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-shell", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-shell.rst.txt", "title": "Shell (dagster-shell)", "toc": "\n"}, "dagster-slack": {"alabaster_version": "0.7.13", "body": "
\n

Slack (dagster-slack)\u00b6

\n

\n
\n

\n
\n

This library provides an integration with Slack, to support posting messages in your company\u2019s Slack workspace.

\n
\n

\n
\n

Presently, it provides a thin wrapper on the Slack client API chat.postMessage.

\n
\n

\n
\n

To use this integration, you\u2019ll first need to create a Slack App for it.

\n
    \n
  1. Create App: Go to https://api.slack.com/apps and click \u201cCreate New App\u201d:

    \n

    \n
  2. \n
  3. Install App: After creating an app, on the left-hand side of the app configuration, click \u201cBot Users\u201d, and then create a bot user. Then, click \u201cInstall App\u201d on the left hand side, and finally \u201cInstall App to Workspace\u201d.

  4. \n
  5. Bot Token: Once finished, this will create a new bot token for your bot/workspace:

    \n

    \n
  6. \n
\n

Copy this bot token and put it somewhere safe; see Safely Storing Credentials for more on this topic.

\n
\n
\ndagster_slack.SlackResource ResourceDefinition[source]\u00b6
\n

This resource is for connecting to Slack.

\n

By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.

\n

Examples

\n
import os\n\nfrom dagster import Definitions, EnvVar, job, op\nfrom dagster_slack import SlackResource\n\n\n@op\ndef slack_op(slack: SlackResource):\n    slack.get_client().chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n@job\ndef slack_job():\n    slack_op()\n\ndefs = Definitions(\n    jobs=[slack_job],\n    resources={\n        "slack": SlackResource(token=EnvVar("MY_SLACK_TOKEN")),\n    },\n)\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_run_failure_sensor(channel, slack_token, text_fn=<function _default_failure_message_text_fn>, blocks_fn=None, name=None, dagit_base_url=None, minimum_interval_seconds=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, webserver_base_url=None)[source]\u00b6
\n

Create a sensor on job failures that will message the given Slack channel.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • text_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, job name, and run ID.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with markdown.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]) \u2013 Function which takes in\nthe RunFailureSensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_run_failure\u201d.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed job run.

  • \n
  • minimum_interval_seconds \u2013 (Optional[int]): The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]) \u2013 The jobs in the\ncurrent repository that will be monitored by this failure sensor. Defaults to None, which\nmeans the alert will be sent when any job in the repository fails. To monitor jobs in external repositories, use RepositorySelector and JobSelector

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs)\nThe jobs in the current repository that will be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the failed job run.

  • \n
\n
\n
\n

Examples

\n
slack_on_run_failure = make_slack_on_run_failure_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN")\n)\n\n@repository\ndef my_repo():\n    return [my_job, slack_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return (\n        f"Job {context.dagster_run.job_name} failed!"\n        f"Error: {context.failure_event.message}"\n    )\n\nslack_on_run_failure = make_slack_on_run_failure_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    text_fn=my_message_fn,\n    webserver_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_freshness_policy_status_change_sensor(channel, slack_token, asset_selection, warn_after_minutes_overdue=0, notify_when_back_on_time=False, text_fn=<function _default_freshness_message_text_fn>, blocks_fn=None, name=None, dagit_base_url=None, default_status=DefaultSensorStatus.STOPPED, webserver_base_url=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a sensor that will message the given Slack channel whenever an asset in the provided\nAssetSelection becomes out of date. Messages are only fired when the state changes, meaning\nonly a single slack message will be sent (when the asset begins to be out of date). If\nnotify_when_back_on_time is set to True, a second slack message will be sent once the asset\nis on time again.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • asset_selection (AssetSelection) \u2013 The selection of assets which this sensor will monitor.\nAlerts will only be fired for assets that have a FreshnessPolicy defined.

  • \n
  • warn_after_minutes_overdue (float) \u2013 How many minutes past the specified FreshnessPolicy this\nsensor will wait before firing an alert (by default, an alert will be fired as soon as\nthe policy is violated).

  • \n
  • notify_when_back_on_time (bool) \u2013 If a success message should be sent when the asset becomes on\ntime again.

  • \n
  • text_fn (Optional(Callable[[FreshnessPolicySensorContext], str])) \u2013 Function which\ntakes in the FreshnessPolicySensorContext and outputs the message you want to send.\nDefaults to a text message that contains the relevant asset key, and the number of\nminutes past its defined freshness policy it currently is.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with markdown.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[FreshnessPolicySensorContext], List[Dict]]) \u2013 Function which takes in\nthe FreshnessPolicySensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_freshness_policy\u201d.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the relevant asset page.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the relevant asset page.

  • \n
\n
\n
\n

Examples

\n
slack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN"),\n    AssetSelection.all(),\n)\n
\n
\n
def my_message_fn(context: FreshnessPolicySensorContext) -> str:\n    if context.minutes_overdue == 0:\n        return f"Asset {context.asset_key} is currently on time :)"\n    return (\n        f"Asset {context.asset_key} is currently {context.minutes_overdue} minutes late!!"\n    )\n\nslack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    asset_selection=AssetSelection.all(),\n    text_fn=my_message_fn,\n    webserver_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given Slack channel.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the HookContext and\noutputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_failure("#foo", webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} failed!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given Slack channel.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the HookContext and\noutputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_success("#foo", webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} worked!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_slack.slack_resource ResourceDefinition[source]\u00b6
\n

This resource is for connecting to Slack.

\n

The resource object is a slack_sdk.WebClient.

\n

By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.

\n

Examples

\n
import os\n\nfrom dagster import job, op\nfrom dagster_slack import slack_resource\n\n\n@op(required_resource_keys={'slack'})\ndef slack_op(context):\n    context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n@job(resource_defs={'slack': slack_resource})\ndef slack_job():\n    slack_op()\n\nslack_job.execute_in_process(\n    run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-slack", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "N", "next"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-slack.rst.txt", "title": "Slack (dagster-slack)", "toc": "\n"}, "dagster-snowflake": {"alabaster_version": "0.7.13", "body": "
\n

Snowflake (dagster-snowflake)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n

Related Guides:

\n\n
\n

I/O Manager\u00b6

\n
\n
\ndagster_snowflake.SnowflakeIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

Base class for an IO manager definition that reads inputs from and writes outputs to Snowflake.

\n

Examples

\n
from dagster_snowflake import SnowflakeIOManager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MySnowflakeIOManager(SnowflakeIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the schema. For example,\nif the asset my_table had the key prefix ["snowflake", "my_schema"], the schema my_schema will be\nused. For ops, the schema can be specified by including a schema entry in output metadata. If schema is not provided\nvia config or on the asset/op, public will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n

Resource\u00b6

\n
\n
\ndagster_snowflake.SnowflakeResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account (Union[dagster.StringSource, None], optional):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the default role to use. After login, you can use USE ROLE to change the role.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the default warehouse to use. After login, you can use USE WAREHOUSE to change the warehouse.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. Alternately, set private_key_path and private_key_password. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

Raw private key password to use. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Raw private key path to use. See the Snowflake documentation for details. Alternately, set the raw private key as private_key.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
client_prefetch_threads (Union[dagster.IntSource, None], optional):
\n

Number of threads used to download the results sets (4 by default). Increasing the value improves fetch performance but requires more memory.

\n
\n
client_session_keep_alive (Union[dagster.BoolSource, None], optional):
\n

False by default. Set this to True to keep the session active indefinitely, even if there is no activity from the user. Make certain to call the close method to terminate the thread properly or the process may hang.

\n
\n
login_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for login. By default, 60 seconds. The login request gives up after the timeout length if the HTTP response is \u201csuccess\u201d.

\n
\n
network_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for all other operations. By default, none/infinite. A general request gives up after the timeout length if the HTTP response is not \u2018success\u2019.

\n
\n
ocsp_response_cache_filename (Union[dagster.StringSource, None], optional):
\n

URI for the OCSP response cache file. By default, the OCSP response cache file is created in the cache directory.

\n
\n
validate_default_parameters (Union[dagster.BoolSource, None], optional):
\n

If True, raise an exception if the warehouse, database, or schema doesn\u2019t exist. Defaults to False.

\n
\n
paramstyle (Union[dagster.StringSource, None], optional):
\n

pyformat by default for client side binding. Specify qmark or numeric to change bind variable formats for server side binding.

\n
\n
timezone (Union[dagster.StringSource, None], optional):
\n

None by default, which honors the Snowflake parameter TIMEZONE. Set to a valid time zone (e.g. America/Los_Angeles) to set the session time zone.

\n
\n
connector (Union[dagster.StringSource, None], optional):
\n

Indicate alternative database connection engine. Permissible option is \u2018sqlalchemy\u2019 otherwise defaults to use the Snowflake Connector for Python.

\n
\n
cache_column_metadata (Union[dagster.StringSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a flag cache_column_metadata=True so that the column metadata for all tables is \u201ccached\u201d.

\n
\n
numpy (Union[dagster.BoolSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. To enable fetching NumPy data types, add numpy=True to the connection parameters.

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

A resource for connecting to the Snowflake data warehouse.

\n

If connector configuration is not set, SnowflakeResource.get_connection() will return a\nsnowflake.connector.Connection\nobject. If connector=\u201dsqlalchemy\u201d configuration is set, then SnowflakeResource.get_connection() will\nreturn a SQLAlchemy Connection\nor a SQLAlchemy raw connection.

\n

A simple example of loading data into Snowflake and subsequently querying that data is shown below:

\n

Examples

\n
from dagster import EnvVar, job, op\nfrom dagster_snowflake import SnowflakeResource\n\n@op\ndef get_one(snowflake_resource: SnowflakeResource):\n    with snowflake_resource.get_connection() as conn:\n        # conn is a snowflake.connector.Connection object\n        conn.cursor().execute("SELECT 1")\n\n@job\ndef my_snowflake_job():\n    get_one()\n\nmy_snowflake_job.execute_in_process(\n    resources={\n        'snowflake_resource': SnowflakeResource(\n            account=EnvVar("SNOWFLAKE_ACCOUNT"),\n            user=EnvVar("SNOWFLAKE_USER"),\n            password=EnvVar("SNOWFLAKE_PASSWORD"),\n            database="MY_DATABASE",\n            schema="MY_SCHEMA",\n            warehouse="MY_WAREHOUSE"\n        )\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster_snowflake.SnowflakeConnection(config, log, snowflake_connection_resource)[source]\u00b6
\n

A connection to Snowflake that can execute queries. In general this class should not be\ndirectly instantiated, but rather used as a resource in an op or asset via the\nsnowflake_resource().

\n

Note that the SnowflakeConnection is only used by the snowflake_resource. The Pythonic SnowflakeResource does\nnot use this SnowflakeConnection class.

\n
\n
\nexecute_queries(sql_queries, parameters=None, fetch_results=False, use_pandas_result=False)[source]\u00b6
\n

Execute multiple queries in Snowflake.

\n
\n
Parameters:
\n
    \n
  • sql_queries (Sequence[str]) \u2013 List of queries to be executed in series

  • \n
  • parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]) \u2013 Parameters to be passed to every query. See the\nSnowflake documentation\nfor more information.

  • \n
  • fetch_results (bool) \u2013 If True, will return the results of the queries as a list. Defaults to False. If True\nand use_pandas_result is also True, results will be returned as Pandas DataFrames.

  • \n
  • use_pandas_result (bool) \u2013 If True, will return the results of the queries as a list of Pandas DataFrames.\nDefaults to False. If fetch_results is False and use_pandas_result is True, an error will be\nraised.

  • \n
\n
\n
Returns:
\n

The results of the queries as a list if fetch_results or use_pandas_result is True,\notherwise returns None

\n
\n
\n

Examples

\n
@op\ndef create_fresh_database(snowflake: SnowflakeResource):\n    queries = ["DROP DATABASE IF EXISTS MY_DATABASE", "CREATE DATABASE MY_DATABASE"]\n    snowflake.execute_queries(\n        sql_queries=queries\n    )\n
\n
\n
\n\n
\n
\nexecute_query(sql, parameters=None, fetch_results=False, use_pandas_result=False)[source]\u00b6
\n

Execute a query in Snowflake.

\n
\n
Parameters:
\n
    \n
  • sql (str) \u2013 the query to be executed

  • \n
  • parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]) \u2013 Parameters to be passed to the query. See the\nSnowflake documentation\nfor more information.

  • \n
  • fetch_results (bool) \u2013 If True, will return the result of the query. Defaults to False. If True\nand use_pandas_result is also True, results will be returned as a Pandas DataFrame.

  • \n
  • use_pandas_result (bool) \u2013 If True, will return the result of the query as a Pandas DataFrame.\nDefaults to False. If fetch_results is False and use_pandas_result is True, an error will be\nraised.

  • \n
\n
\n
Returns:
\n

The result of the query if fetch_results or use_pandas_result is True, otherwise returns None

\n
\n
\n

Examples

\n
@op\ndef drop_database(snowflake: SnowflakeResource):\n    snowflake.execute_query(\n        "DROP DATABASE IF EXISTS MY_DATABASE"\n    )\n
\n
\n
\n\n
\n
\nget_connection(raw_conn=True)[source]\u00b6
\n

Gets a connection to Snowflake as a context manager.

\n

If using the execute_query, execute_queries, or load_table_from_local_parquet methods,\nyou do not need to create a connection using this context manager.

\n
\n
Parameters:
\n

raw_conn (bool) \u2013 If using the sqlalchemy connector, you can set raw_conn to True to create a raw\nconnection. Defaults to True.

\n
\n
\n

Examples

\n
@op(\n    required_resource_keys={"snowflake"}\n)\ndef get_query_status(context, query_id):\n    with context.resources.snowflake.get_connection() as conn:\n        # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n        # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n        return conn.get_query_status(query_id)\n
\n
\n
\n\n
\n
\nload_table_from_local_parquet(src, table)[source]\u00b6
\n

Stores the content of a parquet file to a Snowflake table.

\n
\n
Parameters:
\n
    \n
  • src (str) \u2013 the name of the file to store in Snowflake

  • \n
  • table (str) \u2013 the name of the table to store the data. If the table does not exist, it will\nbe created. Otherwise the contents of the table will be replaced with the data in src

  • \n
\n
\n
\n

Examples

\n
import pandas as pd\nimport pyarrow as pa\nimport pyarrow.parquet as pq\n\n@op\ndef write_parquet_file(snowflake: SnowflakeResource):\n    df = pd.DataFrame({"one": [1, 2, 3], "ten": [11, 12, 13]})\n    table = pa.Table.from_pandas(df)\n    pq.write_table(table, "example.parquet")\n    snowflake.load_table_from_local_parquet(\n        src="example.parquet",\n        table="MY_TABLE"\n    )\n
\n
\n
\n\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_snowflake.snowflake_op_for_query(sql, parameters=None)[source]\u00b6
\n

This function is an op factory that constructs an op to execute a snowflake query.

\n

Note that you can only use snowflake_op_for_query if you know the query you\u2019d like to\nexecute at graph construction time. If you\u2019d like to execute queries dynamically during\njob execution, you should manually execute those queries in your custom op using the\nsnowflake resource.

\n
\n
Parameters:
\n
    \n
  • sql (str) \u2013 The sql query that will execute against the provided snowflake resource.

  • \n
  • parameters (dict) \u2013 The parameters for the sql query.

  • \n
\n
\n
Returns:
\n

Returns the constructed op definition.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_snowflake.build_snowflake_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.

\n
\n
Parameters:
\n
    \n
  • type_handlers (Sequence[DbTypeHandler]) \u2013 Each handler defines how to translate between\nslices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame. If only\none DbTypeHandler is provided, it will be used as the default_load_type.

  • \n
  • default_load_type (Type) \u2013 When an input has no type annotation, load it as this type.

  • \n
\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\nsnowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()])\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": snowflake_io_manager.configured({\n            "database": "my_database",\n            "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n            ...\n        })\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the schema. For example,\nif the asset my_table had the key prefix ["snowflake", "my_schema"], the schema my_schema will be\nused. For ops, the schema can be specified by including a schema entry in output metadata. If schema is not provided\nvia config or on the asset/op, public will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\ndagster_snowflake.snowflake_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account (Union[dagster.StringSource, None], optional):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the default role to use. After login, you can use USE ROLE to change the role.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the default warehouse to use. After login, you can use USE WAREHOUSE to change the warehouse.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. Alternately, set private_key_path and private_key_password. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

Raw private key password to use. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Raw private key path to use. See the Snowflake documentation for details. Alternately, set the raw private key as private_key.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
client_prefetch_threads (Union[dagster.IntSource, None], optional):
\n

Number of threads used to download the result sets (4 by default). Increasing the value improves fetch performance but requires more memory.

\n
\n
client_session_keep_alive (Union[dagster.BoolSource, None], optional):
\n

False by default. Set this to True to keep the session active indefinitely, even if there is no activity from the user. Make certain to call the close method to terminate the thread properly or the process may hang.

\n
\n
login_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for login. By default, 60 seconds. The login request gives up after the timeout length if the HTTP response is \u201csuccess\u201d.

\n
\n
network_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for all other operations. By default, none/infinite. A general request gives up after the timeout length if the HTTP response is not \u2018success\u2019.

\n
\n
ocsp_response_cache_filename (Union[dagster.StringSource, None], optional):
\n

URI for the OCSP response cache file. By default, the OCSP response cache file is created in the cache directory.

\n
\n
validate_default_parameters (Union[dagster.BoolSource, None], optional):
\n

If True, raise an exception if the warehouse, database, or schema doesn\u2019t exist. Defaults to False.

\n
\n
paramstyle (Union[dagster.StringSource, None], optional):
\n

pyformat by default for client side binding. Specify qmark or numeric to change bind variable formats for server side binding.

\n
\n
timezone (Union[dagster.StringSource, None], optional):
\n

None by default, which honors the Snowflake parameter TIMEZONE. Set to a valid time zone (e.g. America/Los_Angeles) to set the session time zone.

\n
\n
connector (Union[dagster.StringSource, None], optional):
\n

Indicates an alternative database connection engine. The only permissible option is \u2018sqlalchemy\u2019; otherwise, defaults to the Snowflake Connector for Python.

\n
\n
cache_column_metadata (Union[dagster.StringSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a flag cache_column_metadata=True so that the column metadata for all tables is \u201ccached\u201d.

\n
\n
numpy (Union[dagster.BoolSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. To enable fetching NumPy data types, add numpy=True to the connection parameters.

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

A resource for connecting to the Snowflake data warehouse. The returned resource object is an\ninstance of SnowflakeConnection.

\n

A simple example of loading data into Snowflake and subsequently querying that data is shown below:

\n

Examples

\n
from dagster import job, op\nfrom dagster_snowflake import snowflake_resource\n\n@op(required_resource_keys={'snowflake'})\ndef get_one(context):\n    context.resources.snowflake.execute_query('SELECT 1')\n\n@job(resource_defs={'snowflake': snowflake_resource})\ndef my_snowflake_job():\n    get_one()\n\nmy_snowflake_job.execute_in_process(\n    run_config={\n        'resources': {\n            'snowflake': {\n                'config': {\n                    'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n                    'user': {'env': 'SNOWFLAKE_USER'},\n                    'password': {'env': 'SNOWFLAKE_PASSWORD'},\n                    'database': {'env': 'SNOWFLAKE_DATABASE'},\n                    'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n                    'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n                }\n            }\n        }\n    }\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-snowflake-pandas/", "title": "Snowflake with Pandas (dagster-snowflake-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake-pandas", "Snowflake with Pandas (dagster-snowflake-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake.rst.txt", "title": "Snowflake (dagster-snowflake)", "toc": "\n"}, "dagster-snowflake-pandas": {"alabaster_version": "0.7.13", "body": "
\n

Snowflake with Pandas (dagster-snowflake-pandas)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse and Pandas data processing library.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n

Related Guides:

\n\n
\n
\ndagster_snowflake_pandas.SnowflakePandasIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the SnowflakePandasIOManager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pandas import SnowflakePandasIOManager\nfrom dagster import asset, Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": SnowflakePandasIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_snowflake_pandas.SnowflakePandasTypeHandler[source]\u00b6
\n

Plugin for the Snowflake I/O Manager that can store and load Pandas DataFrames as Snowflake tables.

\n

Examples

\n
from dagster_snowflake import SnowflakeIOManager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MySnowflakeIOManager(SnowflakeIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n    }\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_snowflake_pandas.snowflake_pandas_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the snowflake_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pandas import snowflake_pandas_io_manager\nfrom dagster import asset, Definitions\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": snowflake_pandas_io_manager.configured({\n            "database": "my_database",\n            "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n            ...\n        })\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake-pandas", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-snowflake-pyspark/", "title": "Snowflake with PySpark (dagster-snowflake-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake-pyspark", "Snowflake with PySpark (dagster-snowflake-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake-pandas.rst.txt", "title": "Snowflake with Pandas (dagster-snowflake-pandas)", "toc": "\n"}, "dagster-snowflake-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

Snowflake with PySpark (dagster-snowflake-pyspark)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse and PySpark data processing library.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n

Related Guides:

\n\n
\n
\ndagster_snowflake_pyspark.SnowflakePySparkIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pyspark import SnowflakePySparkIOManager\nfrom pyspark.sql import DataFrame\nfrom dagster import Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": SnowflakePySparkIOManager(\n            database="my_database",\n            warehouse="my_warehouse", # required for SnowflakePySparkIOManager\n            account=EnvVar("SNOWFLAKE_ACCOUNT"),\n            password=EnvVar("SNOWFLAKE_PASSWORD"),\n            ...\n        )\n    }\n)\n
\n
\n

Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager

\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: DataFrame) -> DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_snowflake_pyspark.SnowflakePySparkTypeHandler[source]\u00b6
\n

Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.

\n

Examples

\n
from dagster_snowflake import SnowflakeIOManager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MySnowflakeIOManager(SnowflakeIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)\n    }\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_snowflake_pyspark.snowflake_pyspark_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pyspark import snowflake_pyspark_io_manager\nfrom pyspark.sql import DataFrame\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": snowflake_pyspark_io_manager.configured({\n            "database": "my_database",\n            "warehouse": "my_warehouse", # required for snowflake_pyspark_io_manager\n            "account" : {"env": "SNOWFLAKE_ACCOUNT"},\n            "password": {"env": "SNOWFLAKE_PASSWORD"},\n            ...\n        })\n    }\n)\n
\n
\n

Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager

\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: DataFrame) -> DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake-pandas/", "title": "Snowflake with Pandas (dagster-snowflake-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake-pandas", "Snowflake with Pandas (dagster-snowflake-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake-pyspark.rst.txt", "title": "Snowflake with PySpark (dagster-snowflake-pyspark)", "toc": "\n"}, "dagster-spark": {"alabaster_version": "0.7.13", "body": "
\n

Spark (dagster-spark)\u00b6

\n
\n
\nclass dagster_spark.SparkOpError[source]\u00b6
\n
\n\n
\n
\ndagster_spark.define_spark_config()[source]\u00b6
\n

Spark configuration.

\n
\n
See the Spark documentation for reference:

https://spark.apache.org/docs/latest/submitting-applications.html

\n
\n
\n
\n\n
\n
\ndagster_spark.create_spark_op(name, main_class, description=None, required_resource_keys=frozenset({'spark'}))[source]\u00b6
\n
\n\n
\n
\ndagster_spark.construct_spark_shell_command(application_jar, main_class, master_url=None, spark_conf=None, deploy_mode=None, application_arguments=None, spark_home=None)[source]\u00b6
\n

Constructs the spark-submit command for a Spark job.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_spark.spark_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-spark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake-pyspark/", "title": "Snowflake with PySpark (dagster-snowflake-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake-pyspark", "Snowflake with PySpark (dagster-snowflake-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-spark.rst.txt", "title": "Spark (dagster-spark)", "toc": "\n"}, "dagster-ssh": {"alabaster_version": "0.7.13", "body": "
\n

SSH / SFTP (dagster-ssh)\u00b6

\n

This library provides an integration with SSH and SFTP.

\n
\n
\ndagster_ssh.ssh_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
remote_host (dagster.StringSource):
\n

remote host to connect to

\n
\n
remote_port (dagster.IntSource, optional):
\n

port of remote host to connect (Default is paramiko SSH_PORT)

\n

Default Value: 22

\n
\n
username (dagster.StringSource, optional):
\n

username to connect to the remote_host

\n
\n
password (dagster.StringSource, optional):
\n

password of the username to connect to the remote_host

\n
\n
key_file (dagster.StringSource, optional):
\n

key file to use to connect to the remote_host.

\n
\n
key_string (dagster.StringSource, optional):
\n

key string to use to connect to remote_host

\n
\n
timeout (dagster.IntSource, optional):
\n

timeout for the attempt to connect to the remote_host.

\n

Default Value: 10

\n
\n
keepalive_interval (dagster.IntSource, optional):
\n

send a keepalive packet to remote host every keepalive_interval seconds

\n

Default Value: 30

\n
\n
compress (dagster.BoolSource, optional):
\n

Default Value: True

\n
\n
no_host_key_check (dagster.BoolSource, optional):
\n

Default Value: True

\n
\n
allow_host_key_change (dagster.BoolSource, optional):
\n

[Deprecated]

\n

Default Value: False

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ssh", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "N", "next"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ssh.rst.txt", "title": "SSH / SFTP (dagster-ssh)", "toc": "\n"}, "dagster-twilio": {"alabaster_version": "0.7.13", "body": "
\n

Twilio (dagster-twilio)\u00b6

\n

This library provides an integration with Twilio.

\n
\n
\ndagster_twilio.TwilioResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_sid (dagster.StringSource):
\n

Twilio Account SID, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
auth_token (dagster.StringSource):
\n

Twilio Authentication Token, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
\n

This resource is for connecting to Twilio.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_twilio.twilio_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_sid (dagster.StringSource):
\n

Twilio Account SID, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
auth_token (dagster.StringSource):
\n

Twilio Authentication Token, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-twilio", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagstermill/", "title": "Dagstermill"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "N", "next"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-twilio.rst.txt", "title": "Twilio (dagster-twilio)", "toc": "\n"}, "dagster-wandb": {"alabaster_version": "0.7.13", "body": "
\n

Weights & Biases (dagster-wandb)\u00b6

\n

This library provides a Dagster integration with Weights & Biases.

\n

Use Dagster and Weights & Biases (W&B) to orchestrate your MLOps pipelines and maintain ML assets.

\n
\n

The integration with W&B makes it easy within Dagster to:

\n\n
\n

Useful links\u00b6

\n

For a complete set of documentation, see Dagster integration on the W&B website.

\n

For full-code examples, see examples/with_wandb in Dagster\u2019s GitHub repo.

\n
\n
\n

Resource\u00b6

\n
\n
\ndagster_wandb.wandb_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

W&B API key necessary to communicate with the W&B API.

\n
\n
host (String, optional):
\n

API host server you wish to use. Only required if you are using W&B Server.

\n

Default Value: \u2018https://api.wandb.ai\u2019

\n
\n
\n

Dagster resource used to communicate with the W&B API. It\u2019s useful when you want to use the\nwandb client within your ops and assets. It\u2019s a required resource if you are using the W&B IO\nManager.

\n

It automatically authenticates using the provided API key.

\n

For a complete set of documentation, see Dagster integration.

\n

To configure this resource, we recommend using the configured method.

\n

Example:

\n
from dagster import job\nfrom dagster_wandb import wandb_resource\n\nmy_wandb_resource = wandb_resource.configured({"api_key": {"env": "WANDB_API_KEY"}})\n\n@job(resource_defs={"wandb_resource": my_wandb_resource})\ndef my_wandb_job():\n    ...\n
\n
\n
\n\n
\n
\n

I/O Manager\u00b6

\n
\n
\ndagster_wandb.wandb_artifacts_io_manager IOManager[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
run_name (String, optional):
\n

Short display name for this run, which is how you\u2019ll identify this run in the UI. By default, it\u2019s set to a string with the following format dagster-run-[8 first characters of the Dagster Run ID] e.g. dagster-run-7e4df022.

\n
\n
run_id (String, optional):
\n

Unique ID for this run, used for resuming. It must be unique in the project, and if you delete a run you can\u2019t reuse the ID. Use the name field for a short descriptive name, or config for saving hyperparameters to compare across runs. The ID cannot contain the following special characters: /#?%:.. You need to set the Run ID when you are doing experiment tracking inside Dagster to allow the IO Manager to resume the run. By default it\u2019s set to the Dagster Run ID, e.g. 7e4df022-1bf2-44b5-a383-bb852df4077e.

\n
\n
run_tags (List[String], optional):
\n

A list of strings, which will populate the list of tags on this run in the UI. Tags are useful for organizing runs together, or applying temporary labels like \u2018baseline\u2019 or \u2018production\u2019. It\u2019s easy to add and remove tags in the UI, or filter down to just runs with a specific tag. Any W&B Run used by the integration will have the dagster_wandb tag.

\n
\n
base_dir (String, optional):
\n

Base directory used for local storage and caching. W&B Artifacts and W&B Run logs will be written and read from that directory. By default, it\u2019s using the DAGSTER_HOME directory.

\n
\n
cache_duration_in_minutes (Int, optional):
\n

Defines the amount of time W&B Artifacts and W&B Run logs should be kept in the local storage. Only files and directories that were not opened for that amount of time are removed from the cache. Cache purging happens at the end of an IO Manager execution. You can set it to 0, if you want to disable caching completely. Caching improves speed when an Artifact is reused between jobs running on the same machine. It defaults to 30 days.

\n
\n
\n

Dagster IO Manager to create and consume W&B Artifacts.

\n

It allows any Dagster @op or @asset to create and consume W&B Artifacts natively.

\n

For a complete set of documentation, see Dagster integration.

\n

Example:

\n
@repository\ndef my_repository():\n    return [\n        *with_resources(\n            load_assets_from_current_module(),\n            resource_defs={\n                "wandb_config": make_values_resource(\n                    entity=str,\n                    project=str,\n                ),\n                "wandb_resource": wandb_resource.configured(\n                    {"api_key": {"env": "WANDB_API_KEY"}}\n                ),\n                "wandb_artifacts_manager": wandb_artifacts_io_manager.configured(\n                    {"cache_duration_in_minutes": 60} # only cache files for one hour\n                ),\n            },\n            resource_config_by_key={\n                "wandb_config": {\n                    "config": {\n                        "entity": "my_entity",\n                        "project": "my_project"\n                    }\n                }\n            },\n        ),\n    ]\n\n\n@asset(\n    name="my_artifact",\n    metadata={\n        "wandb_artifact_configuration": {\n            "type": "dataset",\n        }\n    },\n    io_manager_key="wandb_artifacts_manager",\n)\ndef create_dataset():\n    return [1, 2, 3]\n
\n
\n
\n\n
\n

Config\u00b6

\n
\n
\nclass dagster_wandb.WandbArtifactConfiguration[source]\u00b6
\n

W&B Artifacts IO Manager configuration. Useful for type checking.

\n
\n\n
\n
\nclass dagster_wandb.SerializationModule[source]\u00b6
\n

W&B Artifacts IO Manager configuration of the serialization module. Useful for type checking.

\n
\n\n
\n
\n

Errors\u00b6

\n
\n
\nexception dagster_wandb.WandbArtifactsIOManagerError(message='A W&B Artifacts IO Manager error occurred.')[source]\u00b6
\n

Represents an execution error of the W&B Artifacts IO Manager.

\n
\n\n
\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_wandb.run_launch_agent(context)[source]\u00b6
\n

It starts a Launch Agent and runs it as a long running process until stopped manually.

\n

Agents are processes that poll launch queues and execute the jobs (or dispatch them to external\nservices to be executed) in order.

\n

Example:

\n
# config.yaml\n\nresources:\n  wandb_config:\n    config:\n      entity: my_entity\n      project: my_project\nops:\n  run_launch_agent:\n    config:\n      max_jobs: -1\n      queues:\n        - my_dagster_queue\n
\n
\n
from dagster_wandb.launch.ops import run_launch_agent\nfrom dagster_wandb.resources import wandb_resource\n\nfrom dagster import job, make_values_resource\n\n\n@job(\n    resource_defs={\n        "wandb_config": make_values_resource(\n            entity=str,\n            project=str,\n        ),\n        "wandb_resource": wandb_resource.configured(\n            {"api_key": {"env": "WANDB_API_KEY"}}\n        ),\n    },\n)\ndef run_launch_agent_example():\n    run_launch_agent()\n
\n
\n
\n\n
\n
\ndagster_wandb.run_launch_job(context)[source]\u00b6
\n

Executes a Launch job.

\n

A Launch job is assigned to a queue in order to be executed. You can create a queue or use the\ndefault one. Make sure you have an active agent listening to that queue. You can run an agent\ninside your Dagster instance but can also consider using a deployable agent in Kubernetes.

\n

Example:

\n
# config.yaml\n\nresources:\n  wandb_config:\n    config:\n      entity: my_entity\n      project: my_project\nops:\n  my_launched_job:\n    config:\n      entry_point:\n        - python\n        - train.py\n      queue: my_dagster_queue\n      uri: https://github.com/wandb/example-dagster-integration-with-launch\n
\n
\n
from dagster_wandb.launch.ops import run_launch_job\nfrom dagster_wandb.resources import wandb_resource\n\nfrom dagster import job, make_values_resource\n\n\n@job(\n    resource_defs={\n        "wandb_config": make_values_resource(\n            entity=str,\n            project=str,\n        ),\n        "wandb_resource": wandb_resource.configured(\n            {"api_key": {"env": "WANDB_API_KEY"}}\n        ),\n    },\n)\ndef run_launch_job_example():\n    run_launch_job.alias("my_launched_job")() # we rename the job with an alias\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-wandb", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": null, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-graphql/", "title": "GraphQL (dagster-graphql)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-graphql", "GraphQL (dagster-graphql)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-wandb.rst.txt", "title": "Weights & Biases (dagster-wandb)", "toc": "\n"}, "dagstermill": {"alabaster_version": "0.7.13", "body": "
\n

Dagstermill\u00b6

\n

This library provides an integration with papermill to allow you to run Jupyter notebooks with Dagster.

\n

Related Guides:

\n\n
\n
\ndagstermill.define_dagstermill_asset(name, notebook_path, key_prefix=None, ins=None, deps=None, metadata=None, config_schema=None, required_resource_keys=None, resource_defs=None, description=None, partitions_def=None, op_tags=None, group_name=None, io_manager_key=None, retry_policy=None, save_notebook_on_failure=False, non_argument_deps=None)[source]\u00b6
\n

Creates a Dagster asset for a Jupyter notebook.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name for the asset

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name, which defaults to the name of\nthe decorated function. Each item in key_prefix must be a valid name in dagster (ie only\ncontains letters, numbers, and _) and may not contain python reserved keywords.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]) \u2013 The assets\nthat are upstream dependencies, but do not pass an input value to the notebook.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the asset\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata entries for the asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the notebook.

  • \n
  • description (Optional[str]) \u2013 Description of the asset to display in the Dagster UI.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. If not provided,\nthe name \u201cdefault\u201d is used.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 (Experimental) A mapping of resource keys to resource definitions. These resources\nwill be initialized during execution, and can be accessed from the\ncontext within the notebook.

  • \n
  • io_manager_key (Optional[str]) \u2013 A string key for the IO manager used to store the output notebook.\nIf not provided, the default key output_notebook_io_manager will be used.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that computes the asset.

  • \n
  • save_notebook_on_failure (bool) \u2013 If True and the notebook fails during execution, the failed notebook will be\nwritten to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\nDefaults to False.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 Deprecated, use deps instead. Set of asset keys that are\nupstream dependencies, but do not pass an input to the asset.

  • \n
\n
\n
\n

Examples

\n
from dagstermill import define_dagstermill_asset\nfrom dagster import asset, AssetIn, AssetKey\nfrom sklearn import datasets\nimport pandas as pd\nimport numpy as np\n\n@asset\ndef iris_dataset():\n    sk_iris = datasets.load_iris()\n    return pd.DataFrame(\n        data=np.c_[sk_iris["data"], sk_iris["target"]],\n        columns=sk_iris["feature_names"] + ["target"],\n    )\n\niris_kmeans_notebook = define_dagstermill_asset(\n    name="iris_kmeans_notebook",\n    notebook_path="/path/to/iris_kmeans.ipynb",\n    ins={\n        "iris": AssetIn(key=AssetKey("iris_dataset"))\n    }\n)\n
\n
\n
\n\n
\n
\ndagstermill.define_dagstermill_op(name, notebook_path, ins=None, outs=None, config_schema=None, required_resource_keys=None, output_notebook_name=None, asset_key_prefix=None, description=None, tags=None, io_manager_key=None, save_notebook_on_failure=False)[source]\u00b6
\n

Wrap a Jupyter notebook in an op.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the op.

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook.

  • \n
  • ins (Optional[Mapping[str, In]]) \u2013 The op\u2019s inputs.

  • \n
  • outs (Optional[Mapping[str, Out]]) \u2013 The op\u2019s outputs. Your notebook should\ncall yield_result() to yield each of these outputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The string names of any required resources.

  • \n
  • output_notebook_name (Optional[str]) \u2013 If set, will be used as the name of an injected output\nof type BufferedIOBase that is the file object of the executed\nnotebook (in addition to the AssetMaterialization that is always\ncreated). It allows the downstream ops to access the executed notebook via a file\nobject.

  • \n
  • asset_key_prefix (Optional[Union[List[str], str]]) \u2013 If set, will be used to prefix the\nasset keys for materialized notebooks.

  • \n
  • description (Optional[str]) \u2013 If set, description used for op.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 If set, additional tags used to annotate op.\nDagster uses the tag keys notebook_path and kind, which cannot be\noverwritten by the user.

  • \n
  • io_manager_key (Optional[str]) \u2013 If using output_notebook_name, you can additionally provide\na string key for the IO manager used to store the output notebook.\nIf not provided, the default key output_notebook_io_manager will be used.

  • \n
  • save_notebook_on_failure (bool) \u2013 If True and the notebook fails during execution, the failed notebook will be\nwritten to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\nDefaults to False.

  • \n
\n
\n
Returns:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\nclass dagstermill.ConfigurableLocalOutputNotebookIOManager(*, base_dir=None, asset_key_prefix=[])[source]\u00b6
\n

Built-in IO Manager for handling output notebook.

\n
\n\n
\n
\ndagstermill.get_context(op_config=None, resource_defs=None, logger_defs=None, run_config=None)\u00b6
\n

Get a dagstermill execution context for interactive exploration and development.

\n
\n
Parameters:
\n
    \n
  • op_config (Optional[Any]) \u2013 If specified, this value will be made available on the\ncontext as its op_config property.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 Specifies resources to provide to context.

  • \n
  • logger_defs (Optional[Mapping[str, LoggerDefinition]]) \u2013 Specifies loggers to provide to context.

  • \n
  • run_config (Optional[dict]) \u2013 The config dict with which to construct\nthe context.

  • \n
\n
\n
Returns:
\n

DagstermillExecutionContext

\n
\n
\n
\n\n
\n
\ndagstermill.yield_event(dagster_event)\u00b6
\n

Yield a dagster event directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
\n
Parameters:
\n

dagster_event (Union[dagster.AssetMaterialization, dagster.ExpectationResult, dagster.TypeCheck, dagster.Failure, dagster.RetryRequested]) \u2013 An event to yield back to Dagster.

\n
\n
\n
\n\n
\n
\ndagstermill.yield_result(value, output_name='result')\u00b6
\n

Yield a result directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value to yield.

  • \n
  • output_name (Optional[str]) \u2013 The name of the result to yield (default: 'result').

  • \n
\n
\n
\n
\n\n
\n
\nclass dagstermill.DagstermillExecutionContext(job_context, job_def, resource_keys_to_init, op_name, node_handle, op_config=None)[source]\u00b6
\n

Dagstermill-specific execution context.

\n

Do not initialize directly: use dagstermill.get_context().

\n
\n
\nproperty job_def\u00b6
\n

The job definition for the context.

\n

This will be a dagstermill-specific shim.

\n
\n
Type:
\n

dagster.JobDefinition

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the executing job.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty logging_tags\u00b6
\n

The logging tags for the context.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\nproperty op_config\u00b6
\n

A dynamically-created type whose properties allow access to\nop-specific config.

\n
\n
Type:
\n

collections.namedtuple

\n
\n
\n
\n\n
\n
\nproperty op_def\u00b6
\n

The op definition for the context.

\n

In interactive contexts, this may be a dagstermill-specific shim, depending on whether an\nop definition was passed to dagstermill.get_context.

\n
\n
Type:
\n

dagster.OpDefinition

\n
\n
\n
\n\n
\n
\nproperty run\u00b6
\n

The job run for the context.

\n
\n
Type:
\n

dagster.DagsterRun

\n
\n
\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run_config for the context.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The run_id for the context.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagstermill.DagstermillError[source]\u00b6
\n

Base class for errors raised by dagstermill.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagstermill", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-graphql/", "title": "GraphQL (dagster-graphql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-graphql", "GraphQL (dagster-graphql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagstermill.rst.txt", "title": "Dagstermill", "toc": "\n"}}, "loggers": {"alabaster_version": "0.7.13", "body": "
\n

Loggers\u00b6

\n
\n

Built-in loggers\u00b6

\n
\n
\ndagster._loggers.colored_console_logger(*args, **kwargs)\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\ndagster._loggers.json_console_logger(*args, **kwargs)\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\n

Logging from an @op\u00b6

\n
\n
\nclass dagster.DagsterLogManager(dagster_handler, level=0, managed_loggers=None)[source]\u00b6
\n

Centralized dispatch for logging from user code.

\n

Handles the construction of uniform structured log messages and passes them through to the\nunderlying loggers/handlers.

\n

An instance of the log manager is made available to ops as context.log. Users should not\ninitialize instances of the log manager directly. To configure custom loggers, set the\nlogger_defs argument in an @job decorator or when calling the to_job() method on a\nGraphDefinition.

\n

The log manager inherits standard convenience methods like those exposed by the Python standard\nlibrary python:logging module (i.e., within the body of an op,\ncontext.log.{debug, info, warning, warn, error, critical, fatal}).

\n

The underlying integer API can also be called directly using, e.g.\ncontext.log.log(5, msg), and the log manager will delegate to the log method\ndefined on each of the loggers it manages.

\n

User-defined custom log levels are not supported, and calls to, e.g.,\ncontext.log.trace or context.log.notice will result in hard exceptions at runtime.

\n
\n\n
\n
\n

Defining custom loggers\u00b6

\n
\n
\n@dagster.logger(config_schema=None, description=None)[source]\u00b6
\n

Define a logger.

\n

The decorated function should accept an InitLoggerContext and return an instance of\npython:logging.Logger. This function will become the logger_fn of an underlying\nLoggerDefinition.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the logger.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.LoggerDefinition(logger_fn, config_schema=None, description=None)[source]\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n
\nproperty config_schema\u00b6
\n

The schema for the logger\u2019s config. Configuration data available in init_context.logger_config.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty description\u00b6
\n

A human-readable description of the logger.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty logger_fn\u00b6
\n

The function that will be invoked to\ninstantiate the logger.

\n
\n
Type:
\n

Callable[[InitLoggerContext], logging.Logger]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.InitLoggerContext(logger_config, logger_def=None, job_def=None, run_id=None)[source]\u00b6
\n

The context object available as the argument to the initialization function of a dagster.LoggerDefinition.

\n

Users should not instantiate this object directly. To construct an\nInitLoggerContext for testing purposes, use dagster.\nbuild_init_logger_context().

\n

Example

\n
from dagster import logger, InitLoggerContext\n\n@logger\ndef hello_world(init_context: InitLoggerContext):\n    ...\n
\n
\n
\n
\nproperty logger_config\u00b6
\n

The configuration data provided by the run config. The\nschema for this data is defined by config_schema on the LoggerDefinition.

\n
\n\n
\n
\nproperty logger_def\u00b6
\n

The logger definition for the logger being constructed.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The ID for this run of the job.

\n
\n\n
\n\n
\n
\ndagster.build_init_logger_context(logger_config=None, job_def=None)[source]\u00b6
\n

Builds logger initialization context from provided parameters.

\n

This function can be used to provide the context argument to the invocation of a logger\ndefinition.

\n

Note that you may only specify one of pipeline_def and job_def.

\n
\n
Parameters:
\n
    \n
  • logger_config (Any) \u2013 The config to provide during initialization of logger.

  • \n
  • job_def (Optional[JobDefinition]) \u2013 The job definition that the logger will be used with.

  • \n
\n
\n
\n

Examples

\n
context = build_init_logger_context()\nlogger_to_init(context)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/loggers", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../ops/", "title": "Ops"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../jobs/", "title": "Jobs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/ops", "Ops", "N", "next"], ["sections/api/apidocs/jobs", "Jobs", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/loggers.rst.txt", "title": "Loggers", "toc": "\n"}, "memoization": {"alabaster_version": "0.7.13", "body": "
\n

Job-Level Versioning and Memoization (Deprecated)\u00b6

\n

Dagster has deprecated functionality that allows for job-level code versioning and memoization of previous op outputs based upon that versioning.

\n

This is currently deprecated in favor of asset versioning.

\n
\n

Versioning\u00b6

\n
\n
\nclass dagster.VersionStrategy[source]\u00b6
\n

Abstract class for defining a strategy to version ops and resources.

\n

When subclassing, get_op_version must be implemented, and\nget_resource_version can be optionally implemented.

\n

get_op_version should ingest an OpVersionContext, and get_resource_version should ingest a\nResourceVersionContext. From that, each should synthesize a unique string called\na version, which will\nbe tagged to outputs of that op in the job. Providing a\nVersionStrategy instance to a\njob will enable memoization on that job, such that only steps whose\noutputs do not have an up-to-date version will run.

\n
\n
\nabstract get_op_version(context)[source]\u00b6
\n

Computes a version for an op.

\n
\n
Parameters:
\n

context (OpVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

The version for the op.

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n
\nget_resource_version(context)[source]\u00b6
\n

Computes a version for a resource.

\n
\n
Parameters:
\n

context (ResourceVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

\n
The version for the resource. If None, the resource will not be

memoized.

\n
\n
\n

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SourceHashVersionStrategy[source]\u00b6
\n

VersionStrategy that checks for changes to the source code of ops and resources.

\n

Only checks for changes within the immediate body of the op/resource\u2019s\ndecorated function (or compute function, if the op/resource was\nconstructed directly from a definition).

\n
\n
\nget_op_version(context)[source]\u00b6
\n

Computes a version for an op by hashing its source code.

\n
\n
Parameters:
\n

context (OpVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

The version for the op.

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n
\nget_resource_version(context)[source]\u00b6
\n

Computes a version for a resource by hashing its source code.

\n
\n
Parameters:
\n

context (ResourceVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

\n
The version for the resource. If None, the resource will not be

memoized.

\n
\n
\n

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.OpVersionContext(op_def, op_config)[source]\u00b6
\n

Provides execution-time information for computing the version for an op.

\n
\n
\nop_def\u00b6
\n

The definition of the op to compute a version for.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\nop_config\u00b6
\n

The parsed config to be passed to the op during execution.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.ResourceVersionContext(resource_def, resource_config)[source]\u00b6
\n

Provides execution-time information for computing the version for a resource.

\n
\n
\nresource_def\u00b6
\n

The definition of the resource whose version will be computed.

\n
\n
Type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\nresource_config\u00b6
\n

The parsed config to be passed to the resource during execution.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\n

Memoization\u00b6

\n
\n
\nclass dagster.MemoizableIOManager[source]\u00b6
\n

Base class for IO managers enabled to work with memoized execution. Users should implement\nthe load_input and handle_output methods described in the IOManager API, and the\nhas_output method, which returns a boolean representing whether a data object can be found.

\n
\n
\nabstract has_output(context)[source]\u00b6
\n

The user-defined method that returns whether data exists given the metadata.

\n
\n
Parameters:
\n

context (OutputContext) \u2013 The context of the step performing this check.

\n
\n
Returns:
\n

True if there is data present that matches the provided context. False otherwise.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n\n

See also: dagster.IOManager.

\n
\n
\ndagster.MEMOIZED_RUN_TAG\u00b6
\n

Provide this tag to a run to toggle memoization on or off. {MEMOIZED_RUN_TAG: "true"} toggles memoization on, while {MEMOIZED_RUN_TAG: "false"} toggles memoization off.

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/memoization", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../libraries/dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../utilities/", "title": "Utilities"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "N", "next"], ["sections/api/apidocs/utilities", "Utilities", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/memoization.rst.txt", "title": "Job-Level Versioning and Memoization (Deprecated)", "toc": "\n"}, "ops": {"alabaster_version": "0.7.13", "body": "
\n

Ops\u00b6

\n

The foundational unit of computation in Dagster.

\n
\n
\n

Defining ops\u00b6

\n
\n
\n@dagster.op(compute_fn=None, *, name=None, description=None, ins=None, out=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None, code_version=None)[source]\u00b6
\n

Create an op with the specified parameters from the decorated function.

\n

Ins and outs will be inferred from the type signature of the decorated function\nif not explicitly provided.

\n

The decorated function will be used as the op\u2019s compute function. The signature of the\ndecorated function is more flexible than that of the compute_fn in the core API; it may:

\n
    \n
  1. Return a value. This value will be wrapped in an Output and yielded by the compute function.

  2. \n
  3. Return an Output. This output will be yielded by the compute function.

  4. \n
  5. Yield Output or other event objects. Same as default compute behavior.

  6. \n
\n

Note that options 1) and 2) are incompatible with yielding other events \u2013 if you would like\nto decorate a function that yields events, it must also wrap its eventual output in an\nOutput and yield it.

\n

@op supports async def functions as well, including async generators when yielding multiple\nevents or outputs. Note that async ops will generally be run on their own unless using a custom\nExecutor implementation that supports running them together.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 Name of op. Must be unique within any GraphDefinition\nusing the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op. If not provided, and\nthe decorated function has a docstring, that docstring will be used as the description.

  • \n
  • ins (Optional[Dict[str, In]]) \u2013 Information about the inputs to the op. Information provided here will be combined\nwith what can be inferred from the function signature.

  • \n
  • out (Optional[Union[Out, Dict[str, Out]]]) \u2013 Information about the op outputs. Information provided here will be combined with\nwhat can be inferred from the return type signature if the function does not use yield.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the op matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Values that are not strings\nwill be json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the logic encapsulated by the op. If set,\nthis is used as a default version for all outputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
\n
\n
\n

Examples

\n
@op\ndef hello_world():\n    print('hello')\n\n@op\ndef echo(msg: str) -> str:\n    return msg\n\n@op(\n    ins={'msg': In(str)},\n    out=Out(str)\n)\ndef echo_2(msg): # same as above\n    return msg\n\n@op(\n    out={'word': Out(), 'num': Out()}\n)\ndef multi_out() -> Tuple[str, int]:\n    return 'cool', 4\n
\n
\n
\n\n
\n
\nclass dagster.OpDefinition(compute_fn, name, ins=None, outs=None, description=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None, code_version=None)[source]\u00b6
\n

Defines an op, the functional unit of user-defined computation.

\n

For more details on what an op is, refer to the\nOps Overview.

\n

End users should prefer the @op decorator. OpDefinition is generally intended to be\nused by framework authors or for programmatically generated ops.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the op. Must be unique within any GraphDefinition or\nJobDefinition that contains the op.

  • \n
  • input_defs (List[InputDefinition]) \u2013 Inputs of the op.

  • \n
  • compute_fn (Callable) \u2013

    The core of the op, the function that performs the actual\ncomputation. The signature of this function is determined by input_defs, and\noptionally, an injected first argument, context, a collection of information\nprovided by the system.

    \n

    This function will be coerced into a generator or an async generator, which must yield\none Output for each of the op\u2019s output_defs, and additionally may\nyield other types of Dagster events, including AssetMaterialization and\nExpectationResult.

    \n

  • \n
  • output_defs (List[OutputDefinition]) \u2013 Outputs of the op.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat the config provided for the op matches this schema and will fail if it does not. If\nnot set, Dagster will accept any config provided for the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code encapsulated by the op. If set,\nthis is used as a default code version for all outputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
\n
\n
\n

Examples

\n
def _add_one(_context, inputs):\n    yield Output(inputs["num"] + 1)\n\nOpDefinition(\n    name="add_one",\n    ins={"num": In(int)},\n    outs={"result": Out(int)},\n    compute_fn=_add_one,\n)\n
\n
\n
\n
\nalias(name)[source]\u00b6
\n

Creates a copy of this op with the given name.

\n
\n\n
\n
\nproperty config_schema\u00b6
\n

The config schema for this op.

\n
\n
Type:
\n

IDefinitionConfigSchema

\n
\n
\n
\n\n
\n
\nproperty ins\u00b6
\n

A mapping from input name to the In object that represents that input.

\n
\n
Type:
\n

Mapping[str, In]

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of this op.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty outs\u00b6
\n

A mapping from output name to the Out object that represents that output.

\n
\n
Type:
\n

Mapping[str, Out]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

A set of keys for resources that must be provided to this OpDefinition.

\n
\n
Type:
\n

AbstractSet[str]

\n
\n
\n
\n\n
\n
\nproperty retry_policy\u00b6
\n

The RetryPolicy for this op.

\n
\n
Type:
\n

Optional[RetryPolicy]

\n
\n
\n
\n\n
\n
\ntag(tags)[source]\u00b6
\n

Creates a copy of this op with the given tags.

\n
\n\n
\n
\nproperty tags\u00b6
\n

The tags for this op.

\n
\n
Type:
\n

Mapping[str, str]

\n
\n
\n
\n\n
\n
\nproperty version\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use code_version instead.\n \n

\n

Version of the code encapsulated by the op. If set, this is used as a\ndefault code version for all outputs.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Creates a copy of this op with the given hook definitions.

\n
\n\n
\n
\nwith_retry_policy(retry_policy)[source]\u00b6
\n

Creates a copy of this op with the given retry policy.

\n
\n\n
\n\n
\n
\n
\n

Ins & outs\u00b6

\n
\n
\nclass dagster.In(dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, default_value=<class 'dagster._core.definitions.utils.NoValueSentinel'>, metadata=None, asset_key=None, asset_partitions=None, input_manager_key=None)[source]\u00b6
\n

Defines an argument to an op\u2019s compute function.

\n

Inputs may flow from previous op\u2019s outputs, or be stubbed using config. They may optionally\nbe typed using the Dagster type system.

\n
\n
Parameters:
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]) \u2013 The type of this input. Should only be set if the correct type cannot\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the input.

  • \n
  • default_value (Optional[Any]) \u2013 The default value to use if no input is provided.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 A dict of metadata for the input.

  • \n
  • asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]) \u2013 (Experimental) An AssetKey\n(or function that produces an AssetKey from the InputContext) which should be associated\nwith this In. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this list of\npartitions from the InputContext) which should be associated with this In.

  • \n
  • input_manager_key (Optional[str]) \u2013 (Experimental) The resource key for the\nInputManager used for loading this input when it is not connected to an\nupstream output.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Out(dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, code_version=None)[source]\u00b6
\n

Defines an output from an op\u2019s compute function.

\n

Ops can have multiple outputs, in which case outputs cannot be anonymous.

\n

Many ops have only one output, in which case the user can provide a single output definition\nthat will be given the default name, \u201cresult\u201d.

\n

Outs may be typed using the Dagster type system.

\n
\n
Parameters:
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]) \u2013 The type of this output. Should only be set if the correct type cannot\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the output.

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. (default: True)

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the output manager used for this output.\n(default: \u201cio_manager\u201d).

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code that generates this output. In\ngeneral, versions should be set only for code that deterministically produces the same\noutput when given the same inputs.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Execution\u00b6

\n
\n
\nclass dagster.RetryPolicy(max_retries=1, delay=None, backoff=None, jitter=None)[source]\u00b6
\n

A declarative policy for when to request retries when an exception occurs during op execution.

\n
\n
Parameters:
\n
    \n
  • max_retries (int) \u2013 The maximum number of retries to attempt. Defaults to 1.

  • \n
  • delay (Optional[Union[int,float]]) \u2013 The time in seconds to wait between the retry being requested and the next attempt\nbeing started. This unit of time can be modulated as a function of attempt number\nwith backoff and randomly with jitter.

  • \n
  • backoff (Optional[Backoff]) \u2013 A modifier for delay as a function of retry attempt number.

  • \n
  • jitter (Optional[Jitter]) \u2013 A randomizing modifier for delay, applied after backoff calculation.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Backoff(value)[source]\u00b6
\n

A modifier for delay as a function of attempt number.

\n

LINEAR: attempt_num * delay\nEXPONENTIAL: ((2 ^ attempt_num) - 1) * delay

\n
\n\n
\n
\nclass dagster.Jitter(value)[source]\u00b6
\n

A randomizing modifier for delay, applied after backoff calculation.

\n

FULL: between 0 and the calculated delay based on backoff: random() * backoff_delay\nPLUS_MINUS: +/- the delay: backoff_delay + ((2 * (random() * delay)) - delay)

\n
\n\n
\n
\n
\n

Events\u00b6

\n

The objects that can be yielded by the body of ops\u2019 compute functions to communicate with the\nDagster framework.

\n

(Note that Failure and RetryRequested are intended to be raised from ops rather than yielded.)

\n
\n

Event types\u00b6

\n
\n
\nclass dagster.Output(value, output_name='result', metadata=None, data_version=None)[source]\u00b6
\n

Event corresponding to one of an op\u2019s outputs.

\n

Op compute functions must explicitly yield events of this type when they have more than\none output, or when they also yield events of other types, or when defining an op using the\nOpDefinition API directly.

\n

Outputs are values produced by ops that will be consumed by downstream ops in a job.\nThey are type-checked at op boundaries when their corresponding Out\nor the downstream In is typed.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding out. (default:\n\u201cresult\u201d)

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
  • data_version (Optional[DataVersion]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A data version to manually set\nfor the asset.

  • \n
\n
\n
\n
\n
\nproperty data_version\u00b6
\n

A data version that was manually set on the Output.

\n
\n
Type:
\n

Optional[DataVersion]

\n
\n
\n
\n\n
\n
\nproperty output_name\u00b6
\n

Name of the corresponding Out.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty value\u00b6
\n

The value returned by the compute function.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetMaterialization(asset_key, description=None, metadata=None, partition=None, tags=None)[source]\u00b6
\n

Event indicating that an op has materialized an asset.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that they have produced a materialized value as a\nside effect of computation. Unlike outputs, asset materializations can not be passed to other\nops, and their persistence is controlled by op logic, rather than by the Dagster\nframework.

\n

Op authors should use these events to organize metadata about the side effects of their\ncomputations, enabling tooling like the Assets dashboard in the Dagster UI.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[str, List[str], AssetKey]) \u2013 A key to identify the materialized asset across\njob runs

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the materialized value.

  • \n
  • partition (Optional[str]) \u2013 The name of the partition\nthat was materialized.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 A mapping containing system-populated tags for the\nmaterialization. Users should not pass values into this argument.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n
\nstatic file(path, description=None, asset_key=None)[source]\u00b6
\n

Static constructor for standard materializations corresponding to files on disk.

\n
\n
Parameters:
\n
    \n
  • path (str) \u2013 The path to the file.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the materialization.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.ExpectationResult(success, label=None, description=None, metadata=None)[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 1.7. Please use AssetCheckResult and @asset_check instead.\n \n

\n

Event corresponding to a data quality test.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that a data quality test has produced a (positive or\nnegative) result.

\n
\n
Parameters:
\n
    \n
  • success (bool) \u2013 Whether the expectation passed or not.

  • \n
  • label (Optional[str]) \u2013 Short display name for expectation. Defaults to \u201cresult\u201d.

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the expectation.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the expectation. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TypeCheck(success, description=None, metadata=None)[source]\u00b6
\n

Event corresponding to a successful typecheck.

\n

Events of this type should be returned by user-defined type checks when they need to encapsulate\nadditional metadata about a type check\u2019s success or failure. (i.e., when using\nas_dagster_type(), @usable_as_dagster_type, or the underlying\nPythonObjectDagsterType() API.)

\n

Op compute functions should generally avoid yielding events of this type to avoid confusion.

\n
\n
Parameters:
\n
    \n
  • success (bool) \u2013 True if the type check succeeded, False otherwise.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the type check.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the type check. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Failure(description=None, metadata=None, allow_retries=None)[source]\u00b6
\n

Event indicating op failure.

\n

Raise events of this type from within op compute functions or custom type checks in order to\nindicate an unrecoverable failure in user code to the Dagster machinery and return\nstructured metadata about the failure.

\n
\n
Parameters:
\n
    \n
  • description (Optional[str]) \u2013 A human-readable description of the failure.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the failure. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
  • allow_retries (Optional[bool]) \u2013 Whether this Failure should respect the retry policy or bypass it and immediately fail.\nDefaults to True, respecting the retry policy and allowing retries.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RetryRequested(max_retries=1, seconds_to_wait=None)[source]\u00b6
\n

An exception to raise from an op to indicate that it should be retried.

\n
\n
Parameters:
\n
    \n
  • max_retries (Optional[int]) \u2013 The max number of retries this step should attempt before failing

  • \n
  • seconds_to_wait (Optional[Union[float,int]]) \u2013 Seconds to wait before restarting the step after putting the step\ninto the up_for_retry state.

  • \n
\n
\n
\n

Example

\n
@op\ndef flakes():\n    try:\n        flakey_operation()\n    except Exception as e:\n        raise RetryRequested(max_retries=3) from e\n
\n
\n
\n\n
\n
\n
\n

Event metadata\u00b6

\n

Dagster uses metadata to communicate arbitrary user-specified metadata about structured\nevents.

\n
\n
\nclass dagster.MetadataValue[source]\u00b6
\n

Utility class to wrap metadata values passed into Dagster events so that they can be\ndisplayed in the Dagster UI and other tooling.

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": "hello",\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n            "num_rows": 0,\n        },\n    )\n
\n
\n
\n
\nstatic asset(asset_key)[source]\u00b6
\n

Static constructor for a metadata value referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata={\n            "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n        },\n    )\n
\n
\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 The asset key referencing the asset.

\n
\n
\n
\n\n
\n
\nstatic bool(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping a bool as\nBoolMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n        },\n    )\n
\n
\n
\n
Parameters:
\n

value (bool) \u2013 The bool value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic dagster_run(run_id)[source]\u00b6
\n

Static constructor for a metadata value wrapping a reference to a Dagster run.

\n
\n
Parameters:
\n

run_id (str) \u2013 The ID of the run.

\n
\n
\n
\n\n
\n
\nstatic float(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping a float as\nFloatMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

value (float) \u2013 The float value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic int(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping an int as\nIntMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "number of rows": MetadataValue.int(len(df)),\n        },\n    )\n
\n
\n
\n
Parameters:
\n

value (int) \u2013 The int value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic json(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping a json-serializable list or dict\nas JsonMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata={\n            "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n        },\n    )\n
\n
\n
\n
Parameters:
\n

data (Union[Sequence[Any], Mapping[str, Any]]) \u2013 The JSON data for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic md(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping markdown data as\nMarkdownMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata={\n            'Details': MetadataValue.md(md_str)\n        },\n    )\n
\n
\n
\n
Parameters:
\n

data (str) \u2013 The markdown string for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic notebook(path)[source]\u00b6
\n

Static constructor for a metadata value wrapping a notebook path as\nNotebookMetadataValue.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "notebook_path": MetadataValue.notebook("path/to/notebook.ipynb"),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

path (str) \u2013 The path to a notebook for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic null()[source]\u00b6
\n

Static constructor for a metadata value representing null. Can be used as the value type\nfor the metadata parameter for supported events.

\n
\n\n
\n
\nstatic path(path)[source]\u00b6
\n

Static constructor for a metadata value wrapping a path as\nPathMetadataValue.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "filepath": MetadataValue.path("path/to/file"),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

path (str) \u2013 The path for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic python_artifact(python_artifact)[source]\u00b6
\n

Static constructor for a metadata value wrapping a python artifact as\nPythonArtifactMetadataValue. Can be used as the value type for the\nmetadata parameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "class": MetadataValue.python_artifact(MyClass),\n            "function": MetadataValue.python_artifact(my_function),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

python_artifact (Callable) \u2013 The python class or function for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic table(records, schema=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Static constructor for a metadata value wrapping arbitrary tabular data as\nTableMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata={\n            "errors": MetadataValue.table(\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        },\n    )\n
\n
\n
\n\n
\n
\nstatic table_schema(schema)[source]\u00b6
\n

Static constructor for a metadata value wrapping a table schema as\nTableSchemaMetadataValue. Can be used as the value type\nfor the metadata parameter for supported events.

\n

Example

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata={\n        'my_table_schema': MetadataValue.table_schema(schema),\n    }\n)\n
\n
\n
\n
Parameters:
\n

schema (TableSchema) \u2013 The table schema for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic text(text)[source]\u00b6
\n

Static constructor for a metadata value wrapping text as\nTextMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": MetadataValue.text("hello")\n        },\n    )\n
\n
\n
\n
Parameters:
\n

text (str) \u2013 The text string for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic url(url)[source]\u00b6
\n

Static constructor for a metadata value wrapping a URL as\nUrlMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata={\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

url (str) \u2013 The URL for a metadata entry.

\n
\n
\n
\n\n
\n
\nabstract property value\u00b6
\n

The wrapped value.

\n
\n\n
\n\n
\n
\nclass dagster.MetadataEntry(label, description=None, entry_data=None, value=None)[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use a dict with MetadataValue values instead.\n \n

\n

A structure for describing metadata for Dagster events.

\n
\n

Note

\n

This class is no longer usable in any Dagster API, and will be completely removed in 2.0.

\n
\n

Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\nin the Dagster UI and other tooling.

\n

Should be yielded from within an IO manager to append metadata for a given input/output event.\nFor other event types, passing a dict with MetadataValue values to the metadata argument\nis preferred.

\n
\n
Parameters:
\n
    \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • value (MetadataValue) \u2013 Typed metadata entry data. The different types allow\nfor customized display in tools like the Dagster UI.

  • \n
\n
\n
\n
\n\n
\n
\n

Metadata types\u00b6

\n

All metadata types inherit from MetadataValue. The following types are defined:

\n
\n
\nclass dagster.DagsterAssetMetadataValue(asset_key)[source]\u00b6
\n

Representation of a dagster asset.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 The dagster asset key

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped AssetKey.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterRunMetadataValue(run_id)[source]\u00b6
\n

Representation of a dagster run.

\n
\n
Parameters:
\n

run_id (str) \u2013 The run id

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped run id.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.FloatMetadataValue(value)[source]\u00b6
\n

Container class for float metadata entry data.

\n
\n
Parameters:
\n

value (Optional[float]) \u2013 The float value.

\n
\n
\n
\n\n
\n
\nclass dagster.IntMetadataValue(value)[source]\u00b6
\n

Container class for int metadata entry data.

\n
\n
Parameters:
\n

value (Optional[int]) \u2013 The int value.

\n
\n
\n
\n\n
\n
\nclass dagster.JsonMetadataValue(data)[source]\u00b6
\n

Container class for JSON metadata entry data.

\n
\n
Parameters:
\n

data (Union[Sequence[Any], Dict[str, Any]]) \u2013 The JSON data.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped JSON data.

\n
\n
Type:
\n

Optional[Union[Sequence[Any], Dict[str, Any]]]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MarkdownMetadataValue(md_str)[source]\u00b6
\n

Container class for markdown metadata entry data.

\n
\n
Parameters:
\n

md_str (Optional[str]) \u2013 The markdown as a string.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped markdown as a string.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.PathMetadataValue(path)[source]\u00b6
\n

Container class for path metadata entry data.

\n
\n
Parameters:
\n

path (Optional[str]) \u2013 The path as a string or conforming to os.PathLike.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped path.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.NotebookMetadataValue(path)[source]\u00b6
\n

Container class for notebook metadata entry data.

\n
\n
Parameters:
\n

path (Optional[str]) \u2013 The path to the notebook as a string or conforming to os.PathLike.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped path to the notebook as a string.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.PythonArtifactMetadataValue(module, name)[source]\u00b6
\n

Container class for python artifact metadata entry data.

\n
\n
Parameters:
\n
    \n
  • module (str) \u2013 The module where the python artifact can be found

  • \n
  • name (str) \u2013 The name of the python artifact

  • \n
\n
\n
\n
\n
\nproperty value\u00b6
\n

Identity function.

\n
\n
Type:
\n

PythonArtifactMetadataValue

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TableMetadataValue(records, schema)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Container class for table metadata entry data.

\n
\n
Parameters:
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n
\nstatic infer_column_type(value)[source]\u00b6
\n

str: Infer the TableSchema column type that will be used for a value.

\n
\n\n
\n
\nproperty value\u00b6
\n

Identity function.

\n
\n
Type:
\n

TableMetadataValue

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TableSchemaMetadataValue(schema)[source]\u00b6
\n

Representation of a schema for arbitrary tabular data.

\n
\n
Parameters:
\n

schema (TableSchema) \u2013 The dictionary containing the schema representation.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped TableSchema.

\n
\n
Type:
\n

TableSchema

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TextMetadataValue(text)[source]\u00b6
\n

Container class for text metadata entry data.

\n
\n
Parameters:
\n

text (Optional[str]) \u2013 The text data.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped text data.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.UrlMetadataValue(url)[source]\u00b6
\n

Container class for URL metadata entry data.

\n
\n
Parameters:
\n

url (Optional[str]) \u2013 The URL as a string.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped URL.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\n

Tables\u00b6

\n

These APIs provide the ability to express table schemas (TableSchema) and table rows/records (TableRecord) in Dagster. Currently the only use case for TableSchemas and TableRecords is to wrap them in their corresponding metadata classes TableMetadataValue and TableSchemaMetadataValue for attachment to events or Dagster types.

\n
\n
\nclass dagster.TableRecord(data)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Represents one record in a table. Field keys are arbitrary strings; field values must be\nstrings, integers, floats, or bools.

\n
\n\n
\n
\nclass dagster.TableSchema(columns, constraints=None)[source]\u00b6
\n

Representation of a schema for tabular data.

\n

Schema is composed of two parts:

\n
    \n
  • A required list of columns (TableColumn). Each column specifies a\nname, type, set of constraints, and (optional) description. type\ndefaults to string if unspecified. Column constraints\n(TableColumnConstraints) consist of boolean properties unique and\nnullable, as well as a list of strings other containing string\ndescriptions of all additional constraints (e.g. \u201c<= 5\u201d).

  • \n
  • An optional list of table-level constraints (TableConstraints). A\ntable-level constraint cannot be expressed in terms of a single column,\ne.g. col a > col b. Presently, all table-level constraints must be\nexpressed as strings under the other attribute of a TableConstraints\nobject.

  • \n
\n
# example schema\nTableSchema(\n    constraints = TableConstraints(\n        other = [\n            "foo > bar",\n        ],\n    ),\n    columns = [\n        TableColumn(\n            name = "foo",\n            type = "string",\n            description = "Foo description",\n            constraints = TableColumnConstraints(\n                nullable = False,\n                other = [\n                    "starts with the letter 'a'",\n                ],\n            ),\n        ),\n        TableColumn(\n            name = "bar",\n            type = "string",\n        ),\n        TableColumn(\n            name = "baz",\n            type = "custom_type",\n            constraints = TableColumnConstraints(\n                unique = True,\n            )\n        ),\n    ],\n)\n
\n
\n
\n
Parameters:
\n
    \n
  • columns (List[TableColumn]) \u2013 The columns of the table.

  • \n
  • constraints (Optional[TableConstraints]) \u2013 The constraints of the table.

  • \n
\n
\n
\n
\n
\nstatic from_name_type_dict(name_type_dict)[source]\u00b6
\n

Constructs a TableSchema from a dictionary whose keys are column names and values are the\nnames of data types of those columns.

\n
\n\n
\n\n
\n
\nclass dagster.TableConstraints(other)[source]\u00b6
\n

Descriptor for \u201ctable-level\u201d constraints. Presently only one property,\nother is supported. This contains strings describing arbitrary\ntable-level constraints. A table-level constraint is a constraint defined\nin terms of multiple columns (e.g. col_A > col_B) or in terms of rows.

\n
\n
Parameters:
\n

other (List[str]) \u2013 Descriptions of arbitrary table-level constraints.

\n
\n
\n
\n\n
\n
\nclass dagster.TableColumn(name, type='string', description=None, constraints=None)[source]\u00b6
\n

Descriptor for a table column. The only property that must be specified\nby the user is name. If no type is specified, string is assumed. If\nno constraints are specified, the column is assumed to be nullable\n(i.e. nullable = True) and have no other constraints beyond the data type.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the column.

  • \n
  • type (Optional[str]) \u2013 The type of the column. Can be an arbitrary\nstring. Defaults to \u201cstring\u201d.

  • \n
  • description (Optional[str]) \u2013 Description of this column. Defaults to None.

  • \n
  • constraints (Optional[TableColumnConstraints]) \u2013 Column-level constraints.\nIf unspecified, column is nullable with no constraints.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableColumnConstraints(nullable=True, unique=False, other=None)[source]\u00b6
\n

Descriptor for a table column\u2019s constraints. Nullability and uniqueness are specified with\nboolean properties. All other constraints are described using arbitrary strings under the\nother property.

\n
\n
Parameters:
\n
    \n
  • nullable (Optional[bool]) \u2013 If true, this column can hold null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, all values in this column must be unique.

  • \n
  • other (List[str]) \u2013 Descriptions of arbitrary column-level constraints\nnot expressible by the predefined properties.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Asset key\u00b6

\n

Dagster uses AssetKey to build an index on Materialization events.\nAssets materialized with an AssetKey are highlighted in the Dagster UI on the Assets\ndashboard.

\n
\n
\nclass dagster.AssetKey(path)[source]\u00b6
\n

Object representing the structure of an asset key. Takes in a sanitized string, list of\nstrings, or tuple of strings.

\n

Example usage:

\n
from dagster import AssetKey, AssetMaterialization, op\n\n@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey('flat_asset_key'),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(['parent', 'child', 'grandchild']),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key_2(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(('parent', 'child', 'grandchild')),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n
\n
\n
\n
Parameters:
\n

path (Sequence[str]) \u2013 String, list of strings, or tuple of strings. A list of strings\nrepresent the hierarchical structure of the asset_key.

\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/ops", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../io-managers/", "title": "IO Managers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../loggers/", "title": "Loggers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/io-managers", "IO Managers", "N", "next"], ["sections/api/apidocs/loggers", "Loggers", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/ops.rst.txt", "title": "Ops", "toc": "\n"}, "partitions": {"alabaster_version": "0.7.13", "body": "
\n

Partitions Definitions\u00b6

\n
\n
\nclass dagster.PartitionsDefinition[source]\u00b6
\n

Defines a set of partitions, which can be attached to a software-defined asset or job.

\n

Abstract class with implementations for different kinds of partitions.

\n
\n
\nabstract get_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of strings representing the partition keys of the PartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partitions definitions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.HourlyPartitionsDefinition(start_date, end_date=None, minute_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

A set of hourly partitions.

\n

The first partition in the set will start on the start_date at midnight. The last partition\nin the set will end before the current time, unless the end_offset argument is set to a\npositive number. If minute_offset is provided, the start and end times of each partition\nwill be minute_offset past the hour.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
HourlyPartitionsDefinition(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\nHourlyPartitionsDefinition(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.DailyPartitionsDefinition(start_date, end_date=None, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

A set of daily partitions.

\n

The first partition in the set will start at the start_date at midnight. The last partition\nin the set will end before the current time, unless the end_offset argument is set to a\npositive number. If minute_offset and/or hour_offset are used, the start and end times of\neach partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
DailyPartitionsDefinition(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\nDailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.WeeklyPartitionsDefinition(start_date, end_date=None, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

Defines a set of weekly partitions.

\n

The first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
WeeklyPartitionsDefinition(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\nWeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.MonthlyPartitionsDefinition(start_date, end_date=None, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

A set of monthly partitions.

\n

The first partition in the set will start at the soonest first of the month after start_date\nat midnight. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and\nend date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\nthe start and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
MonthlyPartitionsDefinition(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\nMonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.TimeWindowPartitionsDefinition(start, fmt, end=None, schedule_type=None, timezone=None, end_offset=0, minute_offset=None, hour_offset=None, day_offset=None, cron_schedule=None)[source]\u00b6
\n

A set of partitions where each partition corresponds to a time window.

\n

The provided cron_schedule determines the bounds of the time windows. E.g. a cron_schedule of\n\u201c0 0 \\* \\* \\*\u201d will result in daily partitions that start at midnight and end at midnight of the\nfollowing day.

\n

The string partition_key associated with each partition corresponds to the start of the\npartition\u2019s time window.

\n

The first partition in the set will start at the first cron_schedule tick that is equal to\nor after the given start datetime. The last partition in the set will end before the current\ntime, unless the end_offset argument is set to a positive number.

\n
\n
Parameters:
\n
    \n
  • cron_schedule (str) \u2013 Determines the bounds of the time windows.

  • \n
  • start (datetime) \u2013 The first partition in the set will start at the first cron_schedule\ntick that is equal to or after this value.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each time should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end (datetime) \u2013 The last partition (exclusive) in the set.

  • \n
  • fmt (str) \u2013 The date format to use for partition_keys.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
\n
\nproperty day_offset\u00b6
\n

For a weekly or monthly partitions definition, returns the day to \u201csplit\u201d partitions\nby. Each partition will start on this day, and end before this day in the following\nweek/month. Returns 0 if the day_offset parameter is unset in the\nWeeklyPartitionsDefinition, MonthlyPartitionsDefinition, or the provided cron schedule.

\n

For weekly partitions, returns a value between 0 (representing Sunday) and 6 (representing\nSaturday). Providing a value of 1 means that a partition will exist weekly from Monday to\nthe following Sunday.

\n

For monthly partitions, returns a value between 0 (the first day of the month) and 31 (the\nlast possible day of the month).

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nget_cron_schedule(minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None)[source]\u00b6
\n

The schedule executes at the cadence specified by the partitioning, but may overwrite\nthe minute/hour/day offset of the partitioning.

\n

This is useful e.g. if you have partitions that span midnight to midnight but you want to\nschedule a job that runs at 2 am.

\n
\n\n
\n
\nproperty hour_offset\u00b6
\n

Number of hours past 00:00 to \u201csplit\u201d partitions. Defaults to 0.

\n

For example, returns 1 if each partition starts at 01:00.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nproperty minute_offset\u00b6
\n

Number of minutes past the hour to \u201csplit\u201d partitions. Defaults to 0.

\n

For example, returns 15 if each partition starts at 15 minutes past the hour.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nproperty schedule_type\u00b6
\n

An enum representing the partition cadence (hourly, daily,\nweekly, or monthly).

\n
\n
Type:
\n

Optional[ScheduleType]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TimeWindow(start, end)[source]\u00b6
\n

An interval that is closed at the start and open at the end.

\n
\n
\nstart\u00b6
\n

A pendulum datetime that marks the start of the window.

\n
\n
Type:
\n

datetime

\n
\n
\n
\n\n
\n
\nend\u00b6
\n

A pendulum datetime that marks the end of the window.

\n
\n
Type:
\n

datetime

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.StaticPartitionsDefinition(partition_keys)[source]\u00b6
\n

A statically-defined set of partitions.

\n

Example

\n
from dagster import StaticPartitionsDefinition, asset\n\noceans_partitions_def = StaticPartitionsDefinition(\n    ["arctic", "atlantic", "indian", "pacific", "southern"]\n)\n\n@asset(partitions_def=oceans_partitions_def)\ndef ml_model_for_each_ocean():\n    ...\n
\n
\n
\n
\nget_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of strings representing the partition keys of the PartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partitions definitions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Only applicable to\nDynamicPartitionsDefinitions.

  • \n
\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MultiPartitionsDefinition(partitions_defs)[source]\u00b6
\n

Takes the cross-product of partitions from two partitions definitions.

\n

For example, with a static partitions definition where the partitions are [\u201ca\u201d, \u201cb\u201d, \u201cc\u201d]\nand a daily partitions definition, this partitions definition will have the following\npartitions:

\n

2020-01-01|a\n2020-01-01|b\n2020-01-01|c\n2020-01-02|a\n2020-01-02|b\n\u2026

\n
\n
Parameters:
\n

partitions_defs (Mapping[str, PartitionsDefinition]) \u2013 A mapping of dimension name to partitions definition. The total set of partitions will\nbe the cross-product of the partitions from each PartitionsDefinition.

\n
\n
\n
\n
\npartitions_defs\u00b6
\n

A sequence of PartitionDimensionDefinition objects, each of which contains a dimension\nname and a PartitionsDefinition. The total set of partitions will be the cross-product\nof the partitions from each PartitionsDefinition. This sequence is ordered by\ndimension name, to ensure consistent ordering of the partitions.

\n
\n
Type:
\n

Sequence[PartitionDimensionDefinition]

\n
\n
\n
\n\n
\n
\nget_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of MultiPartitionKeys representing the partition keys of the\nPartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partition dimensions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when a\ndimension is a DynamicPartitionsDefinition with a name defined. Users can pass the\nDagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

Sequence[MultiPartitionKey]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MultiPartitionKey(keys_by_dimension)[source]\u00b6
\n

A multi-dimensional partition key stores the partition key for each dimension.\nSubclasses the string class to keep partition key type as a string.

\n

Contains additional methods to access the partition key for each dimension.\nCreates a string representation of the partition key for each dimension, separated by a pipe (|).\nOrders the dimensions by name, to ensure consistent string representation.

\n
\n\n
\n
\nclass dagster.DynamicPartitionsDefinition(partition_fn=None, name=None)[source]\u00b6
\n

A partitions definition whose partition keys can be dynamically added and removed.

\n

This is useful for cases where the set of partitions is not known at definition time,\nbut is instead determined at runtime.

\n

Partitions can be added and removed using instance.add_dynamic_partitions and\ninstance.delete_dynamic_partition methods.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the partitions definition.

  • \n
  • partition_fn (Optional[Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Provide partition definition name instead.) A function that returns the current set of partitions. This argument is deprecated and\nwill be removed in 2.0.0.

  • \n
\n
\n
\n

Examples

\n
fruits = DynamicPartitionsDefinition(name="fruits")\n\n@sensor(job=my_job)\ndef my_sensor(context):\n    return SensorResult(\n        run_requests=[RunRequest(partition_key="apple")],\n        dynamic_partitions_requests=[fruits.build_add_request(["apple"])]\n    )\n
\n
\n
\n
\nget_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of strings representing the partition keys of the\nPartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partitions definitions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.PartitionKeyRange(start, end)[source]\u00b6
\n

Defines a range of partitions.

\n
\n
\nstart\u00b6
\n

The starting partition key in the range (inclusive).

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nend\u00b6
\n

The ending partition key in the range (inclusive).

\n
\n
Type:
\n

str

\n
\n
\n
\n\n

Examples

\n
partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])\npartition_key_range = PartitionKeyRange(start="a", end="c") # Represents ["a", "b", "c"]\n
\n
\n
\n\n
\n
\n

Partitioned Schedules\u00b6

\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=DefaultScheduleStatus.STOPPED, tags=None)[source]
\n

Creates a schedule from a time window-partitioned job or a job that targets\ntime window-partitioned assets. The job can also be multipartitioned, as long as one\nof the partitions dimensions is time-partitioned.

\n

The schedule executes at the cadence specified by the time partitioning of the job or assets.

\n

Examples

\n
######################################\n# Job that targets partitioned assets\n######################################\n\nfrom dagster import (\n    DailyPartitionsDefinition,\n    Definitions,\n    asset,\n    build_schedule_from_partitioned_job,\n    define_asset_job,\n)\n\n@asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef asset1():\n    ...\n\nasset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n# The created schedule will fire daily\nasset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\ndefs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n################\n# Non-asset job\n################\n\nfrom dagster import DailyPartitionsDefinition, Definitions, build_schedule_from_partitioned_job, job\n\n\n@job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef do_stuff_partitioned():\n    ...\n\n# The created schedule will fire daily\ndo_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n    do_stuff_partitioned,\n)\n\ndefs = Definitions(schedules=[do_stuff_partitioned_schedule])\n
\n
\n
\n\n
\n
\n

Partition Mapping\u00b6

\n
\n
\nclass dagster.PartitionMapping[source]\u00b6
\n

Defines a correspondence between the partitions in an asset and the partitions in an asset\nthat it depends on.

\n

Overriding PartitionMapping outside of Dagster is not supported. The abstract methods of this\nclass may change at any time.

\n
\n
\nabstract get_downstream_partitions_for_partitions(upstream_partitions_subset, downstream_partitions_def, current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns the subset of partition keys in the downstream asset that use the data in the given\npartition key subset of the upstream asset.

\n
\n
Parameters:
\n
    \n
  • upstream_partitions_subset (Union[PartitionKeyRange, PartitionsSubset]) \u2013 The\nsubset of partition keys in the upstream asset.

  • \n
  • downstream_partitions_def (PartitionsDefinition) \u2013 The partitions definition for the\ndownstream asset.

  • \n
\n
\n
\n
\n\n
\n
\nabstract get_upstream_mapped_partitions_result_for_partitions(downstream_partitions_subset, upstream_partitions_def, current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns an UpstreamPartitionsResult object containing the partition keys the downstream\npartitions subset was mapped to in the upstream partitions definition.

\n

Valid upstream partitions will be included in UpstreamPartitionsResult.partitions_subset.\nInvalid upstream partitions will be included in UpstreamPartitionsResult.required_but_nonexistent_partition_keys.

\n

For example, if an upstream asset is time-partitioned and starts in June 2023, and the\ndownstream asset is time-partitioned and starts in May 2023, this function would return an\nUpstreamPartitionsResult(PartitionsSubset(\u201c2023-06-01\u201d), required_but_nonexistent_partition_keys=[\u201c2023-05-01\u201d])\nwhen downstream_partitions_subset contains 2023-05-01 and 2023-06-01.

\n
\n\n
\n\n
\n
\nclass dagster.TimeWindowPartitionMapping(start_offset=0, end_offset=0, allow_nonexistent_upstream_partitions=False)[source]\u00b6
\n

The default mapping between two TimeWindowPartitionsDefinitions.

\n

A partition in the downstream partitions definition is mapped to all partitions in the upstream\nasset whose time windows overlap it.

\n

This means that, if the upstream and downstream partitions definitions share the same time\nperiod, then this mapping is essentially the identity partition mapping - plus conversion of\ndatetime formats.

\n

If the upstream time period is coarser than the downstream time period, then each partition in\nthe downstream asset will map to a single (larger) upstream partition. E.g. if the downstream is\nhourly and the upstream is daily, then each hourly partition in the downstream will map to the\ndaily partition in the upstream that contains that hour.

\n

If the upstream time period is finer than the downstream time period, then each partition in the\ndownstream asset will map to multiple upstream partitions. E.g. if the downstream is daily and\nthe upstream is hourly, then each daily partition in the downstream asset will map to the 24\nhourly partitions in the upstream that occur on that day.

\n
\n
\nstart_offset\u00b6
\n

If not 0, then the starts of the upstream windows are shifted by this\noffset relative to the starts of the downstream windows. For example, if start_offset=-1\nand end_offset=0, then the downstream partition \u201c2022-07-04\u201d would map to the upstream\npartitions \u201c2022-07-03\u201d and \u201c2022-07-04\u201d. Only permitted to be non-zero when the\nupstream and downstream PartitionsDefinitions are the same. Defaults to 0.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nend_offset\u00b6
\n

If not 0, then the ends of the upstream windows are shifted by this\noffset relative to the ends of the downstream windows. For example, if start_offset=0\nand end_offset=1, then the downstream partition \u201c2022-07-04\u201d would map to the upstream\npartitions \u201c2022-07-04\u201d and \u201c2022-07-05\u201d. Only permitted to be non-zero when the\nupstream and downstream PartitionsDefinitions are the same. Defaults to 0.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nallow_nonexistent_upstream_partitions\u00b6
\n

Defaults to false. If true, does not\nraise an error when mapped upstream partitions fall outside the start-end time window of the\npartitions def. For example, if the upstream partitions def starts on \u201c2023-01-01\u201d but\nthe downstream starts on \u201c2022-01-01\u201d, setting this bool to true would return no\npartition keys when get_upstream_partitions_for_partitions is called with \u201c2022-06-01\u201d.\nWhen set to false, would raise an error.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n

Examples

\n
from dagster import DailyPartitionsDefinition, TimeWindowPartitionMapping, AssetIn, asset\n\npartitions_def = DailyPartitionsDefinition(start_date="2020-01-01")\n\n@asset(partitions_def=partitions_def)\ndef asset1():\n    ...\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "asset1": AssetIn(\n            partition_mapping=TimeWindowPartitionMapping(start_offset=-1)\n        )\n    }\n)\ndef asset2(asset1):\n    ...\n
\n
\n
\n\n
\n
\nclass dagster.IdentityPartitionMapping[source]\u00b6
\n

Expects that the upstream and downstream assets are partitioned in the same way, and maps\npartitions in the downstream asset to the same partition in the upstream asset.

\n
\n\n
\n
\nclass dagster.AllPartitionMapping[source]\u00b6
\n

Maps every partition in the downstream asset to every partition in the upstream asset.

\n

Commonly used in the case when the downstream asset is not partitioned, in which the entire\ndownstream asset depends on all partitions of the upstream asset.

\n
\n\n
\n
\nclass dagster.LastPartitionMapping[source]\u00b6
\n

Maps all dependencies to the last partition in the upstream asset.

\n

Commonly used in the case when the downstream asset is not partitioned, in which the entire\ndownstream asset depends on the last partition of the upstream asset.

\n
\n\n
\n
\nclass dagster.StaticPartitionMapping(downstream_partition_keys_by_upstream_partition_key)[source]\u00b6
\n

Define an explicit correspondence between two StaticPartitionsDefinitions.

\n
\n
Parameters:
\n

downstream_partition_keys_by_upstream_partition_key (Dict[str, str | Collection[str]]) \u2013 The single or multi-valued correspondence from upstream keys to downstream keys.

\n
\n
\n
\n\n
\n
\nclass dagster.SpecificPartitionsPartitionMapping(partition_keys)[source]\u00b6
\n

Maps to a specific subset of partitions in the upstream asset.

\n

Example

\n
from dagster import SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset\n\n@asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c"]))\ndef upstream():\n    ...\n\n@asset(\n    ins={\n        "upstream": AssetIn(partition_mapping=SpecificPartitionsPartitionMapping(["a"]))\n    }\n)\ndef a_downstream(upstream):\n    ...\n
\n
\n
\n\n
\n
\nclass dagster.MultiToSingleDimensionPartitionMapping(partition_dimension_name=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Defines a correspondence between a single-dimensional partitions definition\nand a MultiPartitionsDefinition. The single-dimensional partitions definition must be\na dimension of the MultiPartitionsDefinition.

\n

This class handles the case where the upstream asset is multipartitioned and the\ndownstream asset is single dimensional, and vice versa.

\n

For a partition key X, this partition mapping assumes that any multi-partition key with\nX in the selected dimension is a dependency.

\n
\n
Parameters:
\n

partition_dimension_name (Optional[str]) \u2013 The name of the partition dimension in the\nMultiPartitionsDefinition that matches the single-dimension partitions definition.

\n
\n
\n
\n\n
\n
\nclass dagster.MultiPartitionMapping(downstream_mappings_by_upstream_dimension)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Defines a correspondence between two MultiPartitionsDefinitions.

\n

Accepts a mapping of upstream dimension name to downstream DimensionPartitionMapping, representing\nthe explicit correspondence between the upstream and downstream MultiPartitions dimensions\nand the partition mapping used to calculate the downstream partitions.

\n

Examples

\n
weekly_abc = MultiPartitionsDefinition(\n    {\n        "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n        "weekly": WeeklyPartitionsDefinition("2023-01-01"),\n    }\n)\ndaily_123 = MultiPartitionsDefinition(\n    {\n        "123": StaticPartitionsDefinition(["1", "2", "3"]),\n        "daily": DailyPartitionsDefinition("2023-01-01"),\n    }\n)\n\nMultiPartitionMapping(\n    {\n        "abc": DimensionPartitionMapping(\n            dimension_name="123",\n            partition_mapping=StaticPartitionMapping({"a": "1", "b": "2", "c": "3"}),\n        ),\n        "weekly": DimensionPartitionMapping(\n            dimension_name="daily",\n            partition_mapping=TimeWindowPartitionMapping(),\n        )\n    }\n)\n
\n
\n

For upstream or downstream dimensions not explicitly defined in the mapping, Dagster will\nassume an AllPartitionMapping, meaning that all upstream partitions in those dimensions\nwill be mapped to all downstream partitions in those dimensions.

\n

Examples

\n
weekly_abc = MultiPartitionsDefinition(\n    {\n        "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n        "daily": DailyPartitionsDefinition("2023-01-01"),\n    }\n)\ndaily_123 = MultiPartitionsDefinition(\n    {\n        "123": StaticPartitionsDefinition(["1", "2", "3"]),\n        "daily": DailyPartitionsDefinition("2023-01-01"),\n    }\n)\n\nMultiPartitionMapping(\n    {\n        "daily": DimensionPartitionMapping(\n            dimension_name="daily",\n            partition_mapping=IdentityPartitionMapping(),\n        )\n    }\n)\n\n# Will map `daily_123` partition key {"123": "1", "daily": "2023-01-01"} to the upstream:\n# {"abc": "a", "daily": "2023-01-01"}\n# {"abc": "b", "daily": "2023-01-01"}\n# {"abc": "c", "daily": "2023-01-01"}\n
\n
\n
\n
Parameters:
\n

downstream_mappings_by_upstream_dimension (Mapping[str, DimensionPartitionMapping]) \u2013 A\nmapping that defines an explicit correspondence between one dimension of the upstream\nMultiPartitionsDefinition and one dimension of the downstream MultiPartitionsDefinition.\nMaps a string representing upstream dimension name to downstream DimensionPartitionMapping,\ncontaining the downstream dimension name and partition mapping.

\n
\n
\n
\n\n
\n
\n

Backfill Policy (Experimental)\u00b6

\n
\n
\nclass dagster.BackfillPolicy(max_partitions_per_run=1)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

A BackfillPolicy specifies how Dagster should attempt to backfill a partitioned asset.

\n

There are two main kinds of backfill policies: single-run and multi-run.

\n

An asset with a single-run backfill policy will take a single run to backfill all of its\npartitions at once.

\n

An asset with a multi-run backfill policy will take multiple runs to backfill all of its\npartitions. Each run will backfill a subset of the partitions. The number of partitions to\nbackfill in each run is controlled by the max_partitions_per_run parameter.

\n

For example:

\n
    \n
  • If an asset has 100 partitions, and the max_partitions_per_run is set to 10, then it will\nbe backfilled in 10 runs; each run will backfill 10 partitions.

  • \n
  • If an asset has 100 partitions, and the max_partitions_per_run is set to 11, then it will\nbe backfilled in 10 runs; the first 9 runs will backfill 11 partitions each, and the last run\nwill backfill the remaining 1 partition.

  • \n
\n

Warning:

\n

Constructing a BackfillPolicy directly is not recommended as the API is subject to change.\nBackfillPolicy.single_run() and BackfillPolicy.multi_run(max_partitions_per_run=x) are the\nrecommended APIs.

\n
\n
\nstatic multi_run(max_partitions_per_run=1)[source]\u00b6
\n

Creates a BackfillPolicy that executes the entire backfill in multiple runs.\nEach run will backfill [max_partitions_per_run] number of partitions.

\n
\n
Parameters:
\n

max_partitions_per_run (Optional[int]) \u2013 The maximum number of partitions in each run of\nthe multiple runs. Defaults to 1.

\n
\n
\n
\n\n
\n
\nstatic single_run()[source]\u00b6
\n

Creates a BackfillPolicy that executes the entire backfill in a single run.

\n
\n\n
\n\n
\n
\n

Partitioned Config\u00b6

\n
\n
\nclass dagster.PartitionedConfig(partitions_def, run_config_for_partition_fn=None, decorated_fn=None, tags_for_partition_fn=None, run_config_for_partition_key_fn=None, tags_for_partition_key_fn=None)[source]\u00b6
\n

Defines a way of configuring a job where the job can be run on one of a discrete set of\npartitions, and each partition corresponds to run configuration for the job.

\n

Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\nand view the run history across partitions.

\n
\n
\nget_partition_keys(current_time=None)[source]\u00b6
\n

Returns a list of partition keys, representing the full set of partitions that\nconfig can be applied to.

\n
\n
Parameters:
\n

current_time (Optional[datetime]) \u2013 A datetime object representing the current time. Only\napplicable to time-based partitions definitions.

\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n
\nproperty partitions_def\u00b6
\n

The partitions definition associated with this PartitionedConfig.

\n
\n
Type:
\n

T_PartitionsDefinition

\n
\n
\n
\n\n
\n
\nproperty run_config_for_partition_fn\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use run_config_for_partition_key_fn instead.\n \n

\n

A function that accepts a partition\nand returns a dictionary representing the config to attach to runs for that partition.\nDeprecated as of 1.3.3.

\n
\n
Type:
\n

Optional[Callable[[Partition], Mapping[str, Any]]]

\n
\n
\n
\n\n
\n
\nproperty run_config_for_partition_key_fn\u00b6
\n

A function that accepts a partition key\nand returns a dictionary representing the config to attach to runs for that partition.

\n
\n
Type:
\n

Optional[Callable[[str], Mapping[str, Any]]]

\n
\n
\n
\n\n
\n
\nproperty tags_for_partition_fn\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use tags_for_partition_key_fn instead.\n \n

\n

A function that\naccepts a partition and returns a dictionary of tags to attach to runs for\nthat partition. Deprecated as of 1.3.3.

\n
\n
Type:
\n

Optional[Callable[[Partition], Mapping[str, str]]]

\n
\n
\n
\n\n
\n
\nproperty tags_for_partition_key_fn\u00b6
\n

A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for\nthat partition.

\n
\n
Type:
\n

Optional[Callable[[str], Mapping[str, str]]]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.static_partitioned_config(partition_keys, tags_for_partition_fn=None, tags_for_partition_key_fn=None)[source]\u00b6
\n

Creates a static partitioned config for a job.

\n

The provided partition_keys is a static list of strings identifying the set of partitions. The\nlist of partitions is static, so while the run config returned by the decorated function may\nchange over time, the list of valid partition keys does not.

\n

This has performance advantages over dynamic_partitioned_config in terms of loading different\npartition views in the Dagster UI.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters:
\n
    \n
  • partition_keys (Sequence[str]) \u2013 A list of valid partition keys, which serve as the range of\nvalues that can be provided to the decorated run config function.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use tags_for_partition_key_fn instead.) A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for that\npartition.

  • \n
  • tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for that\npartition.

  • \n
\n
\n
Returns:
\n

PartitionedConfig

\n
\n
\n
\n\n
\n
\ndagster.dynamic_partitioned_config(partition_fn, tags_for_partition_fn=None, tags_for_partition_key_fn=None)[source]\u00b6
\n

Creates a dynamic partitioned config for a job.

\n

The provided partition_fn returns a list of strings identifying the set of partitions, given\nan optional datetime argument (representing the current time). The list of partitions returned\nmay change over time.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters:
\n
    \n
  • partition_fn (Callable[[datetime.datetime], Sequence[str]]) \u2013 A function that generates a\nlist of valid partition keys, which serve as the range of values that can be provided\nto the decorated run config function.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use tags_for_partition_key_fn instead.) A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for that\npartition.

  • \n
\n
\n
Returns:
\n

PartitionedConfig

\n
\n
\n
\n\n
\n
\ndagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\ndagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\ndagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be the day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\ndagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at midnight on the soonest first of the month after\nstart_date. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and end\ndate of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\nstart and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/partitions", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../definitions/", "title": "Definitions"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../io-managers/", "title": "IO Managers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/definitions", "Definitions", "N", "next"], ["sections/api/apidocs/io-managers", "IO Managers", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/partitions.rst.txt", "title": "Partitions Definitions", "toc": "\n"}, "repositories": {"alabaster_version": "0.7.13", "body": "
\n

Repositories\u00b6

\n
\n
\ndagster.repository RepositoryDefinition[source]\u00b6
\n

Create a repository from the decorated function.

\n

The decorated function should take no arguments and its return value should be one of:

\n

1. List[Union[JobDefinition, ScheduleDefinition, SensorDefinition]].\nUse this form when you have no need to lazy load jobs or other definitions. This is the\ntypical use case.

\n
    \n
  1. A dict of the form:

  2. \n
\n
{\n    'jobs': Dict[str, Callable[[], JobDefinition]],\n    'schedules': Dict[str, Callable[[], ScheduleDefinition]],\n    'sensors': Dict[str, Callable[[], SensorDefinition]]\n}\n
\n
\n

This form is intended to allow definitions to be created lazily when accessed by name,\nwhich can be helpful for performance when there are many definitions in a repository, or\nwhen constructing the definitions is costly.

\n

3. A RepositoryData. Return this object if you need fine-grained\ncontrol over the construction and indexing of definitions within the repository, e.g., to\ncreate definitions dynamically from .yaml files in a directory.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the repository. Defaults to the name of the decorated\nfunction.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata for the repository.

  • \n
  • top_level_resources (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dict of top-level\nresource keys to definitions, for resources which should be displayed in the UI.

  • \n
\n
\n
\n

Example

\n
######################################################################\n# A simple repository using the first form of the decorated function\n######################################################################\n\n@op(config_schema={n: Field(Int)})\ndef return_n(context):\n    return context.op_config['n']\n\n@job\ndef simple_job():\n    return_n()\n\n@job\ndef some_job():\n    ...\n\n@sensor(job=some_job)\ndef some_sensor():\n    if foo():\n        yield RunRequest(\n            run_key= ...,\n            run_config={\n                'ops': {'return_n': {'config': {'n': bar()}}}\n            }\n        )\n\n@job\ndef my_job():\n    ...\n\nmy_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n@repository\ndef simple_repository():\n    return [simple_job, some_sensor, my_schedule]\n\n######################################################################\n# A simple repository using the first form of the decorated function\n# and custom metadata that will be displayed in the UI\n######################################################################\n\n...\n\n@repository(\n    name='my_repo',\n    metadata={\n        'team': 'Team A',\n        'repository_version': '1.2.3',\n        'environment': 'production',\n })\ndef simple_repository():\n    return [simple_job, some_sensor, my_schedule]\n\n######################################################################\n# A lazy-loaded repository\n######################################################################\n\ndef make_expensive_job():\n    @job\n    def expensive_job():\n        for i in range(10000):\n            return_n.alias(f'return_n_{i}')()\n\n    return expensive_job\n\ndef make_expensive_schedule():\n    @job\n    def other_expensive_job():\n        for i in range(11000):\n            return_n.alias(f'my_return_n_{i}')()\n\n    return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n@repository\ndef lazy_loaded_repository():\n    return {\n        'jobs': 
{'expensive_job': make_expensive_job},\n        'schedules': {'expensive_schedule': make_expensive_schedule}\n    }\n\n\n######################################################################\n# A complex repository that lazily constructs jobs from a directory\n# of files in a bespoke YAML format\n######################################################################\n\nclass ComplexRepositoryData(RepositoryData):\n    def __init__(self, yaml_directory):\n        self._yaml_directory = yaml_directory\n\n    def get_all_jobs(self):\n        return [\n            self._construct_job_def_from_yaml_file(\n              self._yaml_file_for_job_name(file_name)\n            )\n            for file_name in os.listdir(self._yaml_directory)\n        ]\n\n    ...\n\n@repository\ndef complex_repository():\n    return ComplexRepositoryData('some_directory')\n
\n
\n
\n\n
\n
\nclass dagster.RepositoryDefinition(name, *, repository_data, description=None, metadata=None, repository_load_data=None)[source]\u00b6
\n

Define a repository that contains a group of definitions.

\n

Users should typically not create objects of this class directly. Instead, use the\n@repository() decorator.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the repository.

  • \n
  • repository_data (RepositoryData) \u2013 Contains the definitions making up the repository.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
  • metadata (Optional[MetadataMapping]) \u2013 A map of arbitrary metadata for the repository.

  • \n
\n
\n
\n
\n
\nproperty description\u00b6
\n

A human-readable description of the repository.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nget_all_jobs()[source]\u00b6
\n

Return all jobs in the repository as a list.

\n

Note that this will construct any job in the lazily evaluated dictionary that has\nnot yet been constructed.

\n
\n
Returns:
\n

All jobs in the repository.

\n
\n
Return type:
\n

List[JobDefinition]

\n
\n
\n
\n\n
\n
\nget_asset_value_loader(instance=None)[source]\u00b6
\n

Returns an object that can load the contents of assets as Python objects.

\n

Invokes load_input on the IOManager associated with the assets. Avoids\nspinning up resources separately for each asset.

\n

Usage:

\n
with my_repo.get_asset_value_loader() as loader:\n    asset1 = loader.load_asset_value("asset1")\n    asset2 = loader.load_asset_value("asset2")\n
\n
\n
\n\n
\n
\nget_job(name)[source]\u00b6
\n

Get a job by name.

\n

If this job is present in the lazily evaluated dictionary passed to the\nconstructor, but has not yet been constructed, only this job is constructed, and\nwill be cached for future calls.

\n
\n
Parameters:
\n

name (str) \u2013 Name of the job to retrieve.

\n
\n
Returns:
\n

The job definition corresponding to\nthe given name.

\n
\n
Return type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nget_schedule_def(name)[source]\u00b6
\n

Get a schedule definition by name.

\n
\n
Parameters:
\n

name (str) \u2013 The name of the schedule.

\n
\n
Returns:
\n

The schedule definition.

\n
\n
Return type:
\n

ScheduleDefinition

\n
\n
\n
\n\n
\n
\nget_sensor_def(name)[source]\u00b6
\n

Get a sensor definition by name.

\n
\n
Parameters:
\n

name (str) \u2013 The name of the sensor.

\n
\n
Returns:
\n

The sensor definition.

\n
\n
Return type:
\n

SensorDefinition

\n
\n
\n
\n\n
\n
\nhas_job(name)[source]\u00b6
\n

Check if a job with a given name is present in the repository.

\n
\n
Parameters:
\n

name (str) \u2013 The name of the job.

\n
\n
Returns:
\n

bool

\n
\n
\n
\n\n
\n
\nhas_schedule_def(name)[source]\u00b6
\n

bool: Check if a schedule with a given name is present in the repository.

\n
\n\n
\n
\nhas_sensor_def(name)[source]\u00b6
\n

bool: Check if a sensor with a given name is present in the repository.

\n
\n\n
\n
\nproperty job_names\u00b6
\n

Names of all jobs in the repository.

\n
\n
Type:
\n

List[str]

\n
\n
\n
\n\n
\n
\nload_asset_value(asset_key, *, python_type=None, instance=None, partition_key=None, metadata=None, resource_config=None)[source]\u00b6
\n

Load the contents of an asset as a Python object.

\n

Invokes load_input on the IOManager associated with the asset.

\n

If you want to load the values of multiple assets, it\u2019s more efficient to use\nget_asset_value_loader(), which avoids spinning up\nresources separately for each asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[AssetKey, Sequence[str], str]) \u2013 The key of the asset to load.

  • \n
  • python_type (Optional[Type]) \u2013 The python type to load the asset as. This is what will\nbe returned inside load_input by context.dagster_type.typing_type.

  • \n
  • partition_key (Optional[str]) \u2013 The partition of the asset to load.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 Input metadata to pass to the IOManager\n(is equivalent to setting the metadata argument in In or AssetIn).

  • \n
  • resource_config (Optional[Any]) \u2013 A dictionary of resource configurations to be passed\nto the IOManager.

  • \n
\n
\n
Returns:
\n

The contents of an asset as a Python object.

\n
\n
\n
\n\n
\n
\nproperty metadata\u00b6
\n

Arbitrary metadata for the repository.

\n
\n
Type:
\n

Optional[MetadataMapping]

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the repository.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty schedule_defs\u00b6
\n

All schedules in the repository.

\n
\n
Type:
\n

List[ScheduleDefinition]

\n
\n
\n
\n\n
\n
\nproperty sensor_defs\u00b6
\n

All sensors in the repository.

\n
\n
Type:
\n

Sequence[SensorDefinition]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.RepositoryData[source]\u00b6
\n

Users should usually rely on the @repository decorator to create new\nrepositories, which will in turn call the static constructors on this class. However, users may\nsubclass RepositoryData for fine-grained control over access to and lazy creation\nof repository members.

\n
\n
\nabstract get_all_jobs()[source]\u00b6
\n

Return all jobs in the repository as a list.

\n
\n
Returns:
\n

All jobs in the repository.

\n
\n
Return type:
\n

List[JobDefinition]

\n
\n
\n
\n\n
\n
\nget_all_schedules()[source]\u00b6
\n

Return all schedules in the repository as a list.

\n
\n
Returns:
\n

All schedules in the repository.

\n
\n
Return type:
\n

List[ScheduleDefinition]

\n
\n
\n
\n\n
\n
\nget_all_sensors()[source]\u00b6
\n

Sequence[SensorDefinition]: Return all sensors in the repository as a list.

\n
\n\n
\n
\nget_assets_defs_by_key()[source]\u00b6
\n

Mapping[AssetKey, AssetsDefinition]: Get the asset definitions for the repository.

\n
\n\n
\n
\nget_job(job_name)[source]\u00b6
\n

Get a job by name.

\n
\n
Parameters:
\n

job_name (str) \u2013 Name of the job to retrieve.

\n
\n
Returns:
\n

The job definition corresponding to the given name.

\n
\n
Return type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nget_job_names()[source]\u00b6
\n

Get the names of all jobs in the repository.

\n
\n
Returns:
\n

List[str]

\n
\n
\n
\n\n
\n
\nget_schedule(schedule_name)[source]\u00b6
\n

Get a schedule by name.

\n
\n
Parameters:
\n

schedule_name (str) \u2013 name of the schedule to retrieve.

\n
\n
Returns:
\n

The schedule definition corresponding to the given name.

\n
\n
Return type:
\n

ScheduleDefinition

\n
\n
\n
\n\n
\n
\nget_schedule_names()[source]\u00b6
\n

Get the names of all schedules in the repository.

\n
\n
Returns:
\n

List[str]

\n
\n
\n
\n\n
\n
\nget_sensor(sensor_name)[source]\u00b6
\n

Get a sensor by name.

\n
\n
Parameters:
\n

sensor_name (str) \u2013 name of the sensor to retrieve.

\n
\n
Returns:
\n

The sensor definition corresponding to the given name.

\n
\n
Return type:
\n

SensorDefinition

\n
\n
\n
\n\n
\n
\nget_sensor_names()[source]\u00b6
\n

Sequence[str]: Get the names of all sensors in the repository.

\n
\n\n
\n
\nget_source_assets_by_key()[source]\u00b6
\n

Mapping[AssetKey, SourceAsset]: Get the source assets for the repository.

\n
\n\n
\n
\nhas_job(job_name)[source]\u00b6
\n

Check if a job with a given name is present in the repository.

\n
\n
Parameters:
\n

job_name (str) \u2013 The name of the job.

\n
\n
Returns:
\n

bool

\n
\n
\n
\n\n
\n
\nhas_schedule(schedule_name)[source]\u00b6
\n

Check if a schedule with a given name is present in the repository.

\n
\n\n
\n
\nhas_sensor(sensor_name)[source]\u00b6
\n

Check if a sensor with a given name is present in the repository.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/repositories", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../resources/", "title": "Resources"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../definitions/", "title": "Definitions"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/resources", "Resources", "N", "next"], ["sections/api/apidocs/definitions", "Definitions", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/repositories.rst.txt", "title": "Repositories", "toc": "\n"}, "resources": {"alabaster_version": "0.7.13", "body": "
\n

Resources\u00b6

\n
\n

Pythonic resource system\u00b6

\n

The following classes are used as part of the new Pythonic resources system.

\n
\n
\nclass dagster.ConfigurableResource[source]\u00b6
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n
\nclass dagster.ResourceDefinition(resource_fn, config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Core class for defining resources.

\n

Resources are scoped ways to make external resources (like database connections) available to\nops and assets during job execution and to clean up after execution resolves.

\n

If resource_fn yields once rather than returning (in the manner of functions decorable with\n@contextlib.contextmanager) then the body of the\nfunction after the yield will be run after execution resolves, allowing users to write their\nown teardown/cleanup logic.

\n

Depending on your executor, resources may be instantiated and cleaned up more than once in a\njob execution.

\n
\n
Parameters:
\n
    \n
  • resource_fn (Callable[[InitResourceContext], Any]) \u2013 User-provided function to instantiate\nthe resource, which will be made available to executions keyed on the\ncontext.resources object.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the resource matches this schema and fail if it does not. If\nnot set, Dagster will accept any config provided for the resource.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this\nresource. A DagsterInvariantViolationError will be raised during initialization if\ndependencies are cyclic.

  • \n
  • version (Optional[str]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) The version of the resource\u2019s definition fn. Two\nwrapped resource functions should only have the same version if they produce the same\nresource definition when provided with the same inputs.

  • \n
\n
\n
\n
\n
\nproperty description\u00b6
\n

A human-readable description of the resource.

\n
\n\n
\n
\nstatic hardcoded_resource(value, description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition with a hardcoded object.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value that will be accessible via context.resources.resource_name.

  • \n
  • description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

  • \n
\n
\n
Returns:
\n

A hardcoded resource.

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nstatic mock_resource(description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition which wraps a mock.MagicMock.

\n
\n
Parameters:
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns:
\n

\n
A resource that creates the magic methods automatically and helps

you mock existing resources.

\n
\n
\n

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nstatic none_resource(description=None)[source]\u00b6
\n

A helper function that returns a none resource.

\n
\n
Parameters:
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns:
\n

A resource that does nothing.

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

A set of the resource keys that this resource depends on. These keys will be made available\nto the resource\u2019s init context during execution, and the resource will not be instantiated\nuntil all required resources are available.

\n
\n\n
\n
\nstatic string_resource(description=None)[source]\u00b6
\n

Creates a ResourceDefinition which takes in a single string as configuration\nand returns this configured string to any ops or assets which depend on it.

\n
\n
Parameters:
\n

description ([Optional[str]]) \u2013 The description of the string resource. Defaults to None.

\n
\n
Returns:
\n

\n
A resource that takes in a single string as configuration and

returns that string.

\n
\n
\n

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nproperty version\u00b6
\n

A string which can be used to identify a particular code version of a resource definition.

\n
\n\n
\n\n
\n
\nclass dagster.InitResourceContext(resource_config, resources, resource_def=None, instance=None, dagster_run=None, log_manager=None)[source]\u00b6
\n

The context object available as the argument to the initialization function of a dagster.ResourceDefinition.

\n

Users should not instantiate this object directly. To construct an InitResourceContext for testing purposes, use dagster.build_init_resource_context().

\n

Example

\n
from dagster import resource, InitResourceContext\n\n@resource\ndef the_resource(init_context: InitResourceContext):\n    init_context.log.info("Hello, world!")\n
\n
\n
\n
\nproperty instance\u00b6
\n

The Dagster instance configured for the current execution context.

\n
\n\n
\n
\nproperty log\u00b6
\n

The Dagster log manager configured for the current execution context.

\n
\n\n
\n
\nproperty log_manager\u00b6
\n

The log manager for this run of the job.

\n
\n\n
\n
\nproperty resource_config\u00b6
\n

The configuration data provided by the run config. The schema\nfor this data is defined by the config_schema argument to\nResourceDefinition.

\n
\n\n
\n
\nproperty resource_def\u00b6
\n

The definition of the resource currently being constructed.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources that are available to the resource that we are initializing.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id for this run of the job or pipeline. When initializing resources outside of\nexecution context, this will be None.

\n
\n\n
\n\n
\n
\ndagster.make_values_resource(**kwargs)[source]\u00b6
\n

A helper function that creates a ResourceDefinition to take in user-defined values.

\n
\n

This is useful for sharing values between ops.

\n
\n
\n
Parameters:
\n

**kwargs \u2013 Arbitrary keyword arguments that will be passed to the config schema of the\nreturned resource definition. If not set, Dagster will accept any config provided for\nthe resource.

\n
\n
\n

For example:

\n
@op(required_resource_keys={"globals"})\ndef my_op(context):\n    print(context.resources.globals["my_str_var"])\n\n@job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\ndef my_job():\n    my_op()\n
\n
\n
\n
Returns:
\n

A resource that passes in user-defined values.

\n
\n
Return type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\ndagster.build_init_resource_context(config=None, resources=None, instance=None)[source]\u00b6
\n

Builds resource initialization context from provided parameters.

\n

build_init_resource_context can be used as either a function or context manager. If there is a\nprovided resource to build_init_resource_context that is a context manager, then it must be\nused as a context manager. This function can be used to provide the context argument to the\ninvocation of a resource.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • config (Optional[Any]) \u2013 The resource config to provide to the context.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
\n
\n
\n

Examples

\n
context = build_init_resource_context()\nresource_to_init(context)\n\nwith build_init_resource_context(\n    resources={"foo": context_manager_resource}\n) as context:\n    resource_to_init(context)\n
\n
\n
\n\n
\n
\ndagster.build_resources(resources, instance=None, resource_config=None, dagster_run=None, log_manager=None)[source]\u00b6
\n

Context manager that yields resources using provided resource definitions and run config.

\n

This API allows for using resources in an independent context. Resources will be initialized\nwith the provided run config, and optionally, dagster_run. The resulting resources will be\nyielded on a dictionary keyed identically to that provided for resources. Upon exiting the\ncontext, resources will also be torn down safely.

\n
\n
Parameters:
\n
    \n
  • resources (Mapping[str, Any]) \u2013 Resource instances or definitions to build. All\nrequired resource dependencies to a given resource must be contained within this\ndictionary, or the resource build will fail.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to instantiate\nresources on.

  • \n
  • resource_config (Optional[Mapping[str, Any]]) \u2013 A dict representing the config to be\nprovided to each resource during initialization and teardown.

  • \n
  • dagster_run (Optional[PipelineRun]) \u2013 The pipeline run to provide during resource\ninitialization and teardown. If the provided resources require either the dagster_run\nor run_id attributes of the provided context during resource initialization and/or\nteardown, this must be provided, or initialization will fail.

  • \n
  • log_manager (Optional[DagsterLogManager]) \u2013 Log Manager to use during resource\ninitialization. Defaults to system log manager.

  • \n
\n
\n
\n

Examples

\n
from dagster import resource, build_resources\n\n@resource\ndef the_resource():\n    return "foo"\n\nwith build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n    assert resources.from_def == "foo"\n    assert resources.from_val == "bar"\n
\n
\n
\n\n
\n
\ndagster.with_resources(definitions, resource_defs, resource_config_by_key=None)[source]\u00b6
\n

Adds dagster resources to copies of resource-requiring dagster definitions.

\n

An error will be thrown if any provided definitions have a conflicting\nresource definition provided for a key provided to resource_defs. Resource\nconfig can be provided, with keys in the config dictionary corresponding to\nthe keys for each resource definition. If any definition has unsatisfied\nresource keys after applying with_resources, an error will be thrown.

\n
\n
Parameters:
\n
    \n
  • definitions (Iterable[ResourceAddable]) \u2013 Dagster definitions to provide resources to.

  • \n
  • resource_defs (Mapping[str, object]) \u2013 Mapping of resource keys to objects to satisfy\nresource requirements of provided dagster definitions.

  • \n
  • resource_config_by_key (Optional[Mapping[str, Any]]) \u2013 Specifies config for provided resources. The key in this dictionary\ncorresponds to configuring the same key in the resource_defs\ndictionary.

  • \n
\n
\n
\n

Examples

\n
from dagster import asset, resource, with_resources\n\n@resource(config_schema={"bar": str})\ndef foo_resource():\n    ...\n\n@asset(required_resource_keys={"foo"})\ndef asset1(context):\n    foo = context.resources.foo\n    ...\n\n@asset(required_resource_keys={"foo"})\ndef asset2(context):\n    foo = context.resources.foo\n    ...\n\nasset1_with_foo, asset2_with_foo = with_resources(\n    [asset1, asset2],\n    resource_defs={"foo": foo_resource},\n    resource_config_by_key={\n        "foo": {\n            "config": {"bar": ...}\n        }\n    }\n)\n
\n
\n
\n\n
\n
\n

Legacy resource system\u00b6

\n

The following classes are used as part of the legacy resource system.

\n
\n
\n@dagster.resource(config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define a resource.

\n

The decorated function should accept an InitResourceContext and return an instance of\nthe resource. This function will become the resource_fn of an underlying\nResourceDefinition.

\n

If the decorated function yields once rather than returning (in the manner of functions\ndecorable with @contextlib.contextmanager) then\nthe body of the function after the yield will be run after execution resolves, allowing users\nto write their own teardown/cleanup logic.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.resource_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this resource.

  • \n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/resources", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../schedules-sensors/", "title": "Run Requests"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../repositories/", "title": "Repositories"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "N", "next"], ["sections/api/apidocs/repositories", "Repositories", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/resources.rst.txt", "title": "Resources", "toc": "\n"}, "schedules-sensors": {"alabaster_version": "0.7.13", "body": "
\n

Run Requests\u00b6

\n
\n
\nclass dagster.RunRequest(run_key=None, run_config=None, tags=None, job_name=None, asset_selection=None, stale_assets_only=False, partition_key=None)[source]\u00b6
\n

Represents all the information required to launch a single run. Must be returned by a\nSensorDefinition or ScheduleDefinition\u2019s evaluation function for a run to be launched.

\n
\n
\nrun_key\u00b6
\n

A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nrun_config (Optional[Mapping[str, Any]])
\n

Configuration for the run. If the job has\na PartitionedConfig, this value will replace the config\nprovided by it.

\n
\n\n
\n
\ntags\u00b6
\n

A dictionary of tags (string key-value pairs) to attach\nto the launched run.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n

(Experimental) The name of the job this run request will launch.\nRequired for sensors that target multiple jobs.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nasset_selection\u00b6
\n

A sequence of AssetKeys that should be\nlaunched with this run.

\n
\n
Type:
\n

Optional[Sequence[AssetKey]]

\n
\n
\n
\n\n
\n
\nstale_assets_only\u00b6
\n

Set to true to further narrow the asset\nselection to stale assets. If passed without an asset selection, all stale assets in the\njob will be materialized. If the job does not materialize assets, this flag is ignored.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\npartition_key\u00b6
\n

The partition key for this run request.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SkipReason(skip_message=None)[source]\u00b6
\n

Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\nwhy no runs were requested.

\n
\n
\nskip_message\u00b6
\n

A message displayed in the Dagster UI for why this evaluation resulted\nin no requested runs.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\n

Schedules\u00b6

\n
\n
\n@dagster.schedule(cron_schedule, *, job_name=None, name=None, tags=None, tags_fn=None, should_execute=None, environment_vars=None, execution_timezone=None, description=None, job=None, default_status=DefaultScheduleStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Creates a schedule following the provided cron schedule and requests runs for the provided job.

\n

The decorated function takes in a ScheduleEvaluationContext as its only\nargument, and does one of the following:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Return a run config dictionary.

  10. \n
  11. Yield a SkipReason or yield one or more RunRequest objects.

  12. \n
\n

Returns a ScheduleDefinition.

\n
\n
Parameters:
\n
    \n
  • cron_schedule (Union[str, Sequence[str]]) \u2013 A valid cron string or sequence of cron strings\nspecifying when the schedule will run, e.g., '45 23 * * 6' for a schedule that runs\nat 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\nthe union of all execution times for the provided cron strings, e.g.,\n['45 23 * * 6', '30 9 * * 0'] for a schedule that runs at 11:45 PM every Saturday and\n9:30 AM every Sunday.

  • \n
  • name (Optional[str]) \u2013 The name of the schedule to create.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]) \u2013 A function\nthat generates tags to attach to the scheduled runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags and tags_fn.

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job\nthat should execute when this schedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The set of resource keys required by the schedule.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ScheduleDefinition(name=None, *, cron_schedule=None, job_name=None, run_config=None, run_config_fn=None, tags=None, tags_fn=None, should_execute=None, environment_vars=None, execution_timezone=None, execution_fn=None, description=None, job=None, default_status=DefaultScheduleStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Define a schedule that targets a job.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the schedule to create. Defaults to the job name plus\n\u201c_schedule\u201d.

  • \n
  • cron_schedule (Union[str, Sequence[str]]) \u2013 A valid cron string or sequence of cron strings\nspecifying when the schedule will run, e.g., '45 23 * * 6' for a schedule that runs\nat 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\nthe union of all execution times for the provided cron strings, e.g.,\n['45 23 * * 6', '30 9 * * 0'] for a schedule that runs at 11:45 PM every Saturday and\n9:30 AM every Sunday.

  • \n
  • execution_fn (Callable[ScheduleEvaluationContext]) \u2013

    The core evaluation function for the\nschedule, which is run at an interval to determine whether a run should be launched or\nnot. Takes a ScheduleEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • run_config (Optional[Mapping]) \u2013 The config that parameterizes this execution,\nas a dict.

  • \n
  • run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Mapping]]]) \u2013 A function that\ntakes a ScheduleEvaluationContext object and returns the run configuration that\nparameterizes this execution, as a dict. You may set only one of run_config,\nrun_config_fn, and execution_fn.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the scheduled runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags, tags_fn, and execution_fn.

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs\nat schedule execution time to determine whether a schedule should execute or skip. Takes\na ScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job that should execute when this\nschedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The set of resource keys required by the schedule.

  • \n
\n
\n
\n
\n
\nproperty cron_schedule\u00b6
\n

The cron schedule representing when this schedule will be evaluated.

\n
\n
Type:
\n

Union[str, Sequence[str]]

\n
\n
\n
\n\n
\n
\nproperty default_status\u00b6
\n

The default status for this schedule when it is first loaded in\na code location.

\n
\n
Type:
\n

DefaultScheduleStatus

\n
\n
\n
\n\n
\n
\nproperty description\u00b6
\n

A description for this schedule.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty environment_vars\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Setting this property no longer has any effect.\n \n

\n

Environment variables to export to the cron schedule.

\n
\n
Type:
\n

Mapping[str, str]

\n
\n
\n
\n\n
\n
\nproperty execution_timezone\u00b6
\n

The timezone in which this schedule will be evaluated.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty job\u00b6
\n

The job that is\ntargeted by this schedule.

\n
\n
Type:
\n

Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the job targeted by this schedule.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the schedule.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

The set of keys for resources that must be provided to this schedule.

\n
\n
Type:
\n

Set[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.ScheduleEvaluationContext(instance_ref, scheduled_execution_time, repository_name=None, schedule_name=None, resources=None, repository_def=None)[source]\u00b6
\n

The context object available as the first argument to various functions defined on a dagster.ScheduleDefinition.

\n

A ScheduleEvaluationContext object is passed as the first argument to run_config_fn, tags_fn,\nand should_execute.

\n

Users should not instantiate this object directly. To construct a ScheduleEvaluationContext for testing purposes, use dagster.build_schedule_context().

\n

Example

\n
from dagster import schedule, ScheduleEvaluationContext\n\n@schedule\ndef the_schedule(context: ScheduleEvaluationContext):\n    ...\n
\n
\n
\n
\nproperty instance\u00b6
\n

The current DagsterInstance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

Mapping of resource key to resource definition to be made available\nduring schedule execution.

\n
\n\n
\n
\nproperty scheduled_execution_time\u00b6
\n

The time in which the execution was scheduled to happen. May differ slightly\nfrom both the actual execution time and the time at which the run config is computed.

\n
\n\n
\n\n
\n
\ndagster.build_schedule_context(instance=None, scheduled_execution_time=None, resources=None, repository_def=None, instance_ref=None)[source]\u00b6
\n

Builds schedule execution context using the provided parameters.

\n

The instance provided to build_schedule_context must be persistent;\nDagsterInstance.ephemeral() will result in an error.

\n
\n
Parameters:
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the schedule.

  • \n
  • scheduled_execution_time (datetime) \u2013 The time in which the execution was scheduled to\nhappen. May differ slightly from both the actual execution time and the time at which\nthe run config is computed.

  • \n
\n
\n
\n

Examples

\n
context = build_schedule_context(instance)\n
\n
\n
\n\n
\n
\ndagster._core.scheduler.DagsterDaemonScheduler Scheduler[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_catchup_runs (dagster.IntSource, optional):
\n

For partitioned schedules, controls the maximum number of past\npartitions for each schedule that will be considered when looking for missing\nruns. Generally this parameter will only come into play if the scheduler\nfalls behind or launches after experiencing downtime. This parameter will not be checked for\nschedules without partition sets (for example, schedules created using the @schedule\ndecorator) - only the most recent execution time will be considered for those schedules.

\n

Note that no matter what this value is, the scheduler will never launch a run from a time\nbefore the schedule was turned on (even if the start_date on the schedule is earlier) - if\nyou want to launch runs for earlier partitions, launch a backfill.

\n

Default Value: 5

\n
\n
max_tick_retries (dagster.IntSource, optional):
\n

For each schedule tick that raises an error, how many times to retry that tick

\n

Default Value: 0

\n
\n
\n

Default scheduler implementation that submits runs from the dagster-daemon\nlong-lived process. Periodically checks each running schedule for execution times that don\u2019t\nhave runs yet and launches them.

\n
\n\n
\n
\n

Partitioned Schedules\u00b6

\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=DefaultScheduleStatus.STOPPED, tags=None)[source]\u00b6
\n

Creates a schedule from a time window-partitioned job or a job that targets\ntime window-partitioned assets. The job can also be multipartitioned, as long as one\nof the partitions dimensions is time-partitioned.

\n

The schedule executes at the cadence specified by the time partitioning of the job or assets.

\n

Examples

\n
######################################\n# Job that targets partitioned assets\n######################################\n\nfrom dagster import (\n    DailyPartitionsDefinition,\n    Definitions,\n    asset,\n    build_schedule_from_partitioned_job,\n    define_asset_job,\n)\n\n@asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef asset1():\n    ...\n\nasset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n# The created schedule will fire daily\nasset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\ndefs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n################\n# Non-asset job\n################\n\nfrom dagster import DailyPartitionsDefinition, build_schedule_from_partitioned_job, job\n\n\n@job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef do_stuff_partitioned():\n    ...\n\n# The created schedule will fire daily\ndo_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n    do_stuff_partitioned,\n)\n\ndefs = Definitions(schedules=[do_stuff_partitioned_schedule])\n
\n
\n
\n\n
\n
\n@dagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\n@dagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\n@dagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be the day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\n@dagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at midnight on the soonest first of the month after\nstart_date. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and end\ndate of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\nstart and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\n

Sensors\u00b6

\n
\n
\n@dagster.sensor(job_name=None, *, name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, asset_selection=None, required_resource_keys=None)[source]\u00b6
\n

Creates a sensor where the decorated function is used as the sensor\u2019s evaluation function.

\n

The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Yield a SkipReason or yield one or more RunRequest objects.

  10. \n
\n

Takes a SensorEvaluationContext.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • asset_selection (AssetSelection) \u2013 (Experimental) an asset selection to launch a run for if\nthe sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.SensorDefinition(name=None, *, evaluation_fn=None, job_name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, asset_selection=None, required_resource_keys=None)[source]\u00b6
\n

Define a sensor that initiates a set of runs based on some external state.

\n
\n
Parameters:
\n
    \n
  • evaluation_fn (Callable[[SensorEvaluationContext]]) \u2013

    The core evaluation function for the\nsensor, which is run at an interval to determine whether a run should be launched or\nnot. Takes a SensorEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • name (Optional[str]) \u2013 The name of the sensor to create. Defaults to name of evaluation_fn

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[GraphDefinition, JobDefinition, UnresolvedAssetJob]) \u2013 The job to execute when this sensor fires.

  • \n
  • jobs (Optional[Sequence[GraphDefinition, JobDefinition, UnresolvedAssetJob]]) \u2013 (experimental) A list of jobs to execute when this sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • asset_selection (AssetSelection) \u2013 (Experimental) an asset selection to launch a run for if\nthe sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n
\nproperty default_status\u00b6
\n

The default status for this sensor when it is first loaded in\na code location.

\n
\n
Type:
\n

DefaultSensorStatus

\n
\n
\n
\n\n
\n
\nproperty description\u00b6
\n

A description for this sensor.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty job\u00b6
\n

The job that is\ntargeted by this sensor.

\n
\n
Type:
\n

Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the job that is targeted by this sensor.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty jobs\u00b6
\n

A list of jobs\nthat are targeted by this sensor.

\n
\n
Type:
\n

List[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]

\n
\n
\n
\n\n
\n
\nproperty minimum_interval_seconds\u00b6
\n

The minimum number of seconds between sequential evaluations of this sensor.

\n
\n
Type:
\n

Optional[int]

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of this sensor.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

The set of keys for resources that must be provided to this sensor.

\n
\n
Type:
\n

Set[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SensorEvaluationContext(instance_ref, last_completion_time, last_run_key, cursor, repository_name, repository_def=None, instance=None, sensor_name=None, resources=None, definitions=None)[source]
\n

The context object available as the argument to the evaluation function of a dagster.SensorDefinition.

\n

Users should not instantiate this object directly. To construct a\nSensorEvaluationContext for testing purposes, use dagster.\nbuild_sensor_context().

\n
\n
\ninstance_ref
\n

The serialized instance configured to run the sensor

\n
\n
Type:
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\ncursor
\n

The cursor, passed back from the last sensor evaluation via\nthe cursor attribute of SkipReason and RunRequest

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nlast_completion_time
\n

DEPRECATED The last time that the sensor was evaluated (UTC).

\n
\n
Type:
\n

float

\n
\n
\n
\n\n
\n
\nlast_run_key
\n

DEPRECATED The run key of the RunRequest most recently created by this\nsensor. Use the preferred cursor attribute instead.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nrepository_name
\n

The name of the repository that the sensor belongs to.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nrepository_def
\n

The repository that\nthe sensor belongs to. If needed by the sensor, top-level resource definitions will be\npulled from this repository. You can provide either this or definitions.

\n
\n
Type:
\n

Optional[RepositoryDefinition]

\n
\n
\n
\n\n
\n
\ninstance
\n

The deserialized instance can also be passed in\ndirectly (primarily useful in testing contexts).

\n
\n
Type:
\n

Optional[DagsterInstance]

\n
\n
\n
\n\n
\n
\ndefinitions
\n

Definitions object that the sensor is defined in.\nIf needed by the sensor, top-level resource definitions will be pulled from these\ndefinitions. You can provide either this or repository_def.

\n
\n
Type:
\n

Optional[Definitions]

\n
\n
\n
\n\n
\n
\nresources
\n

A dict of resource keys to resource\ndefinitions to be made available during sensor execution.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n

Example

\n
from dagster import sensor, SensorEvaluationContext\n\n@sensor\ndef the_sensor(context: SensorEvaluationContext):\n    ...\n
\n
\n
\n
\nproperty cursor
\n

The cursor value for this sensor, which was set in an earlier sensor evaluation.

\n
\n\n
\n
\nproperty instance
\n

The current DagsterInstance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty last_completion_time
\n

Timestamp representing the last time this sensor completed an evaluation.

\n
\n
Type:
\n

Optional[float]

\n
\n
\n
\n\n
\n
\nproperty last_run_key
\n

The run key supplied to the most recent RunRequest produced by this sensor.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty repository_def
\n

The RepositoryDefinition that this sensor resides in.

\n
\n
Type:
\n

Optional[RepositoryDefinition]

\n
\n
\n
\n\n
\n
\nproperty repository_name
\n

The name of the repository that this sensor resides in.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty resources
\n

A mapping from resource key to instantiated resources for this sensor.

\n
\n
Type:
\n

Resources

\n
\n
\n
\n\n
\n
\nupdate_cursor(cursor)[source]
\n

Updates the cursor value for this sensor, which will be provided on the context for the\nnext sensor evaluation.

\n

This can be used to keep track of progress and avoid duplicate work across sensor\nevaluations.

\n
\n
Parameters:
\n

cursor (Optional[str]) \u2013

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_sensor_context(instance=None, cursor=None, repository_name=None, repository_def=None, sensor_name=None, resources=None, definitions=None, instance_ref=None)[source]\u00b6
\n

Builds sensor execution context using the provided parameters.

\n

This function can be used to provide a context to the invocation of a sensor definition. If\nprovided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\nerror.

\n
\n
Parameters:
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the sensor.

  • \n
  • cursor (Optional[str]) \u2013 A cursor value to provide to the evaluation of the sensor.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository that the sensor belongs to.

  • \n
  • repository_def (Optional[RepositoryDefinition]) \u2013 The repository that the sensor belongs to.\nIf needed by the sensor top-level resource definitions will be pulled from this repository.\nYou can provide either this or definitions.

  • \n
  • resources (Optional[Mapping[str, ResourceDefinition]]) \u2013 A set of resource definitions\nto provide to the sensor. If passed, these will override any resource definitions\nprovided by the repository.

  • \n
  • definitions (Optional[Definitions]) \u2013 Definitions object that the sensor is defined in.\nIf needed by the sensor, top-level resource definitions will be pulled from these\ndefinitions. You can provide either this or repository_def.

  • \n
\n
\n
\n

Examples

\n
context = build_sensor_context()\nmy_sensor(context)\n
\n
\n
\n\n
\n
\n@dagster.asset_sensor(asset_key, *, job_name=None, name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Creates an asset sensor where the decorated function is used as the asset sensor\u2019s evaluation\nfunction.

\n

If the asset has been materialized multiple times since the last sensor tick, the\nevaluation function will only be invoked once, with the latest materialization.

\n

The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Yield a SkipReason or yield one or more RunRequest objects.

  10. \n
\n

Takes a SensorEvaluationContext and an EventLogEntry corresponding to an\nAssetMaterialization event.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The\njob to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n

Example

\n
from dagster import AssetKey, EventLogEntry, SensorEvaluationContext, asset_sensor\n\n\n@asset_sensor(asset_key=AssetKey("my_table"), job=my_job)\ndef my_asset_sensor(context: SensorEvaluationContext, asset_event: EventLogEntry):\n    return RunRequest(\n        run_key=context.cursor,\n        run_config={\n            "ops": {\n                "read_materialization": {\n                    "config": {\n                        "asset_key": asset_event.dagster_event.asset_key.path,\n                    }\n                }\n            }\n        },\n    )\n
\n
\n
\n\n
\n
\nclass dagster.AssetSensorDefinition(name, asset_key, job_name, asset_materialization_fn, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Define an asset sensor that initiates a set of runs based on the materialization of a given\nasset.

\n

If the asset has been materialized multiple times since the last sensor tick, the\nevaluation function will only be invoked once, with the latest materialization.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor to create.

  • \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]) \u2013

    The core\nevaluation function for the sensor, which is run at an interval to determine whether a\nrun should be launched or not. Takes a SensorEvaluationContext and\nan EventLogEntry corresponding to an AssetMaterialization event.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job\nobject to target with this sensor.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n
\n
\nproperty asset_key\u00b6
\n

The key of the asset targeted by this sensor.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n\n
\n
\n@dagster.freshness_policy_sensor(asset_selection, *, name=None, minimum_interval_seconds=None, description=None, default_status=DefaultSensorStatus.STOPPED)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Define a sensor that reacts to the status of a given set of asset freshness policies, where the\ndecorated function will be evaluated on every tick for each asset in the selection that has a\nFreshnessPolicy defined.

\n

Note: returning or yielding a value from the annotated function will result in an error.

\n

Takes a FreshnessPolicySensorContext.

\n
\n
Parameters:
\n
    \n
  • asset_selection (AssetSelection) \u2013 The asset selection monitored by the sensor.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]) \u2013 The core\nevaluation function for the sensor. Takes a FreshnessPolicySensorContext.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.FreshnessPolicySensorDefinition(name, asset_selection, freshness_policy_sensor_fn, minimum_interval_seconds=None, description=None, default_status=DefaultSensorStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Define a sensor that reacts to the status of a given set of asset freshness policies,\nwhere the decorated function will be evaluated on every sensor tick.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]) \u2013 The core\nevaluation function for the sensor. Takes a FreshnessPolicySensorContext.

  • \n
  • asset_selection (AssetSelection) \u2013 The asset selection monitored by the sensor.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.FreshnessPolicySensorContext(sensor_name, asset_key, freshness_policy, minutes_overdue, previous_minutes_overdue, instance, resources=None)[source]
\n

The context object available to a decorated function of freshness_policy_sensor.

\n
\n
\nsensor_name
\n

the name of the sensor.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nasset_key
\n

the key of the asset being monitored

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n
\nfreshness_policy
\n

the freshness policy of the asset being monitored

\n
\n
Type:
\n

FreshnessPolicy

\n
\n
\n
\n\n
\n
\nminutes_overdue
\n
\n
Type:
\n

Optional[float]

\n
\n
\n
\n\n
\n
\nprevious_minutes_overdue
\n

the minutes_overdue value for this asset on the\nprevious sensor tick.

\n
\n
Type:
\n

Optional[float]

\n
\n
\n
\n\n
\n
\ninstance
\n

the current instance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_freshness_policy_sensor_context(sensor_name, asset_key, freshness_policy, minutes_overdue, previous_minutes_overdue=None, instance=None, resources=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Builds freshness policy sensor context from provided parameters.

\n

This function can be used to provide the context argument when directly invoking a function\ndecorated with @freshness_policy_sensor, such as when writing unit tests.

\n
\n
Parameters:
\n
    \n
  • sensor_name (str) \u2013 The name of the sensor the context is being constructed for.

  • \n
  • asset_key (AssetKey) \u2013 The AssetKey for the monitored asset

  • \n
  • freshness_policy (FreshnessPolicy) \u2013 The FreshnessPolicy for the monitored asset

  • \n
  • minutes_overdue (Optional[float]) \u2013 How overdue the monitored asset currently is

  • \n
  • previous_minutes_overdue (Optional[float]) \u2013 How overdue the monitored asset was on the\nprevious tick.

  • \n
  • instance (DagsterInstance) \u2013 The dagster instance configured for the context.

  • \n
\n
\n
\n

Examples

\n
context = build_freshness_policy_sensor_context(\n    sensor_name="freshness_policy_sensor_to_invoke",\n    asset_key=AssetKey("some_asset"),\n    freshness_policy=FreshnessPolicy(maximum_lag_minutes=30),\n    minutes_overdue=10.0,\n)\nfreshness_policy_sensor_to_invoke(context)\n
\n
\n
\n\n
\n
\n@dagster.multi_asset_sensor(monitored_assets, *, job_name=None, name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, request_assets=None, required_resource_keys=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Creates an asset sensor that can monitor multiple assets.

\n

The decorated function is used as the asset sensor\u2019s evaluation\nfunction. The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Yield a SkipReason or yield one or more RunRequest objects.

  10. \n
\n

Takes a MultiAssetSensorEvaluationContext.

\n
\n
Parameters:
\n
    \n
  • monitored_assets (Union[Sequence[AssetKey], AssetSelection]) \u2013 The assets this\nsensor monitors. If an AssetSelection object is provided, it will only apply to assets\nwithin the Definitions that this sensor is part of.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The\njob to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_assets (Optional[AssetSelection]) \u2013 (Experimental) an asset selection to launch a run\nfor if the sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.MultiAssetSensorDefinition(name, monitored_assets, job_name, asset_materialization_fn, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, request_assets=None, required_resource_keys=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Define an asset sensor that initiates a set of runs based on the materialization of a list of\nassets.

\n

Users should not instantiate this object directly. To construct a\nMultiAssetSensorDefinition, use dagster.\nmulti_asset_sensor().

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor to create.

  • \n
  • asset_keys (Sequence[AssetKey]) \u2013 The asset_keys this sensor monitors.

  • \n
  • asset_materialization_fn (Callable[[MultiAssetSensorEvaluationContext], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]) \u2013

    The core\nevaluation function for the sensor, which is run at an interval to determine whether a\nrun should be launched or not. Takes a MultiAssetSensorEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job\nobject to target with this sensor.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_assets (Optional[AssetSelection]) \u2013 (Experimental) an asset selection to launch a run\nfor if the sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.MultiAssetSensorEvaluationContext(instance_ref, last_completion_time, last_run_key, cursor, repository_name, repository_def, monitored_assets, instance=None, resource_defs=None, definitions=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

The context object available as the argument to the evaluation function of a\ndagster.MultiAssetSensorDefinition.

\n

Users should not instantiate this object directly. To construct a\nMultiAssetSensorEvaluationContext for testing purposes, use dagster.\nbuild_multi_asset_sensor_context().

\n

The MultiAssetSensorEvaluationContext contains a cursor object that tracks the state of\nconsumed event logs for each monitored asset. For each asset, the cursor stores the storage ID\nof the latest materialization that has been marked as \u201cconsumed\u201d (via a call to advance_cursor)\nin a latest_consumed_event_id field.

\n

For each monitored asset, the cursor will store the latest unconsumed event ID for up to 25\npartitions. Each event ID must be before the latest_consumed_event_id field for the asset.

\n

Events not marked as consumed via advance_cursor will be returned in future ticks until they\nare marked as consumed.

\n

To update the cursor to the latest materialization and clear the unconsumed events, call\nadvance_all_cursors.

\n
\n
\nmonitored_assets\u00b6
\n

The assets monitored\nby the sensor. If an AssetSelection object is provided, it will only apply to assets\nwithin the Definitions that this sensor is part of.

\n
\n
Type:
\n

Union[Sequence[AssetKey], AssetSelection]

\n
\n
\n
\n\n
\n
\nrepository_def\u00b6
\n

The repository that the sensor belongs to.\nIf needed by the sensor top-level resource definitions will be pulled from this repository.\nYou can provide either this or definitions.

\n
\n
Type:
\n

Optional[RepositoryDefinition]

\n
\n
\n
\n\n
\n
\ninstance_ref\u00b6
\n

The serialized instance configured to run the sensor

\n
\n
Type:
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\ncursor\u00b6
\n

The cursor, passed back from the last sensor evaluation via\nthe cursor attribute of SkipReason and RunRequest. Must be a dictionary of asset key\nstrings to a stringified tuple of (latest_event_partition, latest_event_storage_id,\ntrailing_unconsumed_partitioned_event_ids).

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nlast_completion_time\u00b6
\n

DEPRECATED The last time that the sensor was evaluated (UTC).

\n
\n
Type:
\n

float

\n
\n
\n
\n\n
\n
\nlast_run_key\u00b6
\n

DEPRECATED The run key of the RunRequest most recently created by this\nsensor. Use the preferred cursor attribute instead.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nrepository_name\u00b6
\n

The name of the repository that the sensor belongs to.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The deserialized instance can also be passed in\ndirectly (primarily useful in testing contexts).

\n
\n
Type:
\n

Optional[DagsterInstance]

\n
\n
\n
\n\n
\n
\ndefinitions\u00b6
\n

Definitions object that the sensor is defined in.\nIf needed by the sensor, top-level resource definitions will be pulled from these\ndefinitions. You can provide either this or repository_def.

\n
\n
Type:
\n

Optional[Definitions]

\n
\n
\n
\n\n

Example

\n
from dagster import multi_asset_sensor, MultiAssetSensorEvaluationContext\n\n@multi_asset_sensor(monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")])\ndef the_sensor(context: MultiAssetSensorEvaluationContext):\n    ...\n
\n
\n
\n
\nadvance_all_cursors()[source]\u00b6
\n

Updates the cursor to the most recent materialization event for all assets monitored by\nthe multi_asset_sensor.

\n

Marks all materialization events as consumed by the sensor, including unconsumed events.

\n
\n\n
\n
\nadvance_cursor(materialization_records_by_key)[source]\u00b6
\n

Marks the provided materialization records as having been consumed by the sensor.

\n

At the end of the tick, the cursor will be updated to advance past all materializations\nrecords provided via advance_cursor. In the next tick, records that have been consumed\nwill no longer be returned.

\n

Passing a partitioned materialization record into this function will mark prior materializations\nwith the same asset key and partition as having been consumed.

\n
\n
Parameters:
\n

materialization_records_by_key (Mapping[AssetKey, Optional[EventLogRecord]]) \u2013 Mapping of\nAssetKeys to EventLogRecord or None. If an EventLogRecord is provided, the cursor\nfor the AssetKey will be updated and future calls to fetch asset materialization events\nwill not fetch this event again. If None is provided, the cursor for the AssetKey\nwill not be updated.

\n
\n
\n
\n\n
\n
\nall_partitions_materialized(asset_key, partitions=None)[source]\u00b6
\n

A utility method to check if a provided list of partitions have been materialized\nfor a particular asset. This method ignores the cursor and checks all materializations\nfor the asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to check partitions for.

  • \n
  • partitions (Optional[Sequence[str]]) \u2013 A list of partitions to check. If not provided,\nall partitions for the asset will be checked.

  • \n
\n
\n
Returns:
\n

True if all selected partitions have been materialized, False otherwise.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty asset_keys\u00b6
\n

The asset keys which are monitored by this sensor.

\n
\n
Type:
\n

Sequence[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty assets_defs_by_key\u00b6
\n

A mapping from AssetKey to the\nAssetsDefinition object which produces it. If a given asset is monitored by this sensor, but\nis not produced within the same code location as this sensor, then the value will be None.

\n
\n
Type:
\n

Mapping[AssetKey, Optional[AssetsDefinition]]

\n
\n
\n
\n\n
\n
\nget_cursor_partition(asset_key)[source]\u00b6
\n

A utility method to get the current partition the cursor is on.

\n
\n\n
\n
\nget_downstream_partition_keys(partition_key, from_asset_key, to_asset_key)[source]\u00b6
\n

Converts a partition key from one asset to the corresponding partition key in a downstream\nasset. Uses the existing partition mapping between the upstream asset and the downstream\nasset if it exists, otherwise, uses the default partition mapping.

\n
\n
Parameters:
\n
    \n
  • partition_key (str) \u2013 The partition key to convert.

  • \n
  • from_asset_key (AssetKey) \u2013 The asset key of the upstream asset, which the provided\npartition key belongs to.

  • \n
  • to_asset_key (AssetKey) \u2013 The asset key of the downstream asset. The provided partition\nkey will be mapped to partitions within this asset.

  • \n
\n
\n
Returns:
\n

\n
A list of the corresponding downstream partitions in to_asset_key that

partition_key maps to.

\n
\n
\n

\n
\n
Return type:
\n

Sequence[str]

\n
\n
\n
\n\n
\n
\nget_trailing_unconsumed_events(asset_key)[source]\u00b6
\n

Fetches the unconsumed events for a given asset key. Returns only events\nbefore the latest consumed event ID for the given asset. To mark an event as consumed,\npass the event to advance_cursor. Returns events in ascending order by storage ID.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 The asset key to get unconsumed events for.

\n
\n
Returns:
\n

The unconsumed events for the given asset key.

\n
\n
Return type:
\n

Sequence[EventLogRecord]

\n
\n
\n
\n\n
\n
\nlatest_materialization_records_by_key(asset_keys=None)[source]\u00b6
\n

Fetches the most recent materialization event record for each asset in asset_keys.\nOnly fetches events after the latest consumed event ID for the given asset key.

\n
\n
Parameters:
\n

asset_keys (Optional[Sequence[AssetKey]]) \u2013 list of asset keys to fetch events for. If\nnot specified, the latest materialization will be fetched for all assets the\nmulti_asset_sensor monitors.

\n
\n
\n
\n
Returns: Mapping of AssetKey to EventLogRecord where the EventLogRecord is the latest

materialization event for the asset. If there is no materialization event for the asset,\nthe value in the mapping will be None.

\n
\n
\n
\n\n
\n
\nlatest_materialization_records_by_partition(asset_key, after_cursor_partition=False)[source]\u00b6
\n

Given an asset, returns a mapping of partition key to the latest materialization event\nfor that partition. Fetches only materializations that have not been marked as \u201cconsumed\u201d\nvia a call to advance_cursor.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to fetch events for.

  • \n
  • after_cursor_partition (Optional[bool]) \u2013 If True, only materializations with partitions\nafter the cursor\u2019s current partition will be returned. By default, set to False.

  • \n
\n
\n
Returns:
\n

Mapping of AssetKey to a mapping of partitions to EventLogRecords where the\nEventLogRecord is the most recent materialization event for the partition.\nThe mapping preserves the order that the materializations occurred.

\n
\n
Return type:
\n

Mapping[str, EventLogRecord]

\n
\n
\n

Example

\n
@asset(partitions_def=DailyPartitionsDefinition("2022-07-01"))\ndef july_asset():\n    return 1\n\n@multi_asset_sensor(monitored_assets=[july_asset.key])\ndef my_sensor(context):\n    context.latest_materialization_records_by_partition(july_asset.key)\n\n# After materializing july_asset for 2022-07-05, latest_materialization_records_by_partition\n# returns {"2022-07-05": EventLogRecord(...)}\n
\n
\n
\n\n
\n
\nlatest_materialization_records_by_partition_and_asset()[source]\u00b6
\n

Finds the most recent unconsumed materialization for each partition for each asset\nmonitored by the sensor. Aggregates all materializations into a mapping of partition key\nto a mapping of asset key to the materialization event for that partition.

\n

For example, if the sensor monitors two partitioned assets A and B that are materialized\nfor partition_x after the cursor, this function returns:

\n
\n
{\n    "partition_x": {asset_a.key: EventLogRecord(...), asset_b.key: EventLogRecord(...)}\n}\n
\n
\n
\n

This method can only be called when all monitored assets are partitioned and share\nthe same partition definition.

\n
\n\n
\n
\nmaterialization_records_for_key(asset_key, limit=None)[source]\u00b6
\n

Fetches asset materialization event records for asset_key, with the earliest event first.

\n

Only fetches events after the latest consumed event ID for the given asset key.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to fetch materialization events for

  • \n
  • limit (Optional[int]) \u2013 The number of events to fetch

  • \n
\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_multi_asset_sensor_context(*, monitored_assets, repository_def=None, instance=None, cursor=None, repository_name=None, cursor_from_latest_materializations=False, resources=None, definitions=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Builds multi asset sensor execution context for testing purposes using the provided parameters.

\n

This function can be used to provide a context to the invocation of a multi asset sensor definition. If\nprovided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\nerror.

\n
\n
Parameters:
\n
    \n
  • monitored_assets (Union[Sequence[AssetKey], AssetSelection]) \u2013 The assets monitored\nby the sensor. If an AssetSelection object is provided, it will only apply to assets\nwithin the Definitions that this sensor is part of.

  • \n
  • repository_def (RepositoryDefinition) \u2013 RepositoryDefinition object that\nthe sensor is defined in. Must provide definitions if this is not provided.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the sensor.

  • \n
  • cursor (Optional[str]) \u2013 A string cursor to provide to the evaluation of the sensor. Must be\na dictionary of asset key strings to ints that has been converted to a json string

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository that the sensor belongs to.

  • \n
  • cursor_from_latest_materializations (bool) \u2013 If True, the cursor will be set to the latest\nmaterialization for each monitored asset. By default, set to False.

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 The resource definitions\nto provide to the sensor.

  • \n
  • definitions (Optional[Definitions]) \u2013 Definitions object that the sensor is defined in.\nMust provide repository_def if this is not provided.

  • \n
\n
\n
\n

Examples

\n
with instance_for_test() as instance:\n    context = build_multi_asset_sensor_context(\n        monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")],\n        instance=instance,\n    )\n    my_asset_sensor(context)\n
\n
\n
\n\n
\n
\nclass dagster.RunStatusSensorDefinition(name, run_status, run_status_sensor_fn, minimum_interval_seconds=None, description=None, monitored_jobs=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, request_job=None, request_jobs=None, required_resource_keys=None)[source]\u00b6
\n

Define a sensor that reacts to a given status of job execution, where the decorated\nfunction will be evaluated when a run is at the given status.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • run_status (DagsterRunStatus) \u2013 The status of a run which will be\nmonitored by the sensor.

  • \n
  • run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, DagsterRunReaction]]) \u2013 The core\nevaluation function for the sensor. Takes a RunStatusSensorContext.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, JobSelector, RepositorySelector, CodeLocationSelector]]]) \u2013 The jobs in the current repository that will be monitored by this sensor. Defaults to\nNone, which means the alert will be sent when any job in the repository fails.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job a RunRequest should\nexecute if yielded from the sensor.

  • \n
  • request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental)\nA list of jobs to be executed if RunRequests are yielded from the sensor.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RunStatusSensorContext(sensor_name, dagster_run, dagster_event, instance, context=None, resource_defs=None, logger=None, partition_key=None, _resources=None, _cm_scope_entered=False)[source]\u00b6
\n

The context object available to a decorated function of run_status_sensor.

\n
\n
\nproperty dagster_event\u00b6
\n

The event associated with the job run status.

\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

The run of the job.

\n
\n\n
\n
\nproperty instance\u00b6
\n

The current instance.

\n
\n\n
\n
\nproperty log\u00b6
\n

The logger for the current sensor evaluation.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key of the relevant run.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty sensor_name\u00b6
\n

The name of the sensor.

\n
\n\n
\n\n
\n
\nclass dagster.RunFailureSensorContext(sensor_name, dagster_run, dagster_event, instance, context=None, resource_defs=None, logger=None, partition_key=None, _resources=None, _cm_scope_entered=False)[source]\u00b6
\n

The context object available to a decorated function of run_failure_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\ndagster_run\u00b6
\n

the failed run.

\n
\n
Type:
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty failure_event\u00b6
\n

The run failure event.

\n

If the run failed because of an error inside a step, get_step_failure_events will have more\ndetails on the step failure.

\n
\n\n
\n
\nget_step_failure_events()[source]\u00b6
\n

The step failure event for each step in the run that failed.

\n

Examples

\n
error_strings_by_step_key = {\n    # includes the stack trace\n    event.step_key: event.event_specific_data.error.to_string()\n    for event in context.get_step_failure_events()\n}\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.JobSelector(location_name, repository_name=None, job_name=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster.RepositorySelector(location_name, repository_name)[source]\u00b6
\n
\n\n
\n
\ndagster.build_run_status_sensor_context(sensor_name, dagster_event, dagster_instance, dagster_run, context=None, resources=None, partition_key=None)[source]\u00b6
\n

Builds run status sensor context from provided parameters.

\n

This function can be used to provide the context argument when directly invoking a function\ndecorated with @run_status_sensor or @run_failure_sensor, such as when writing unit tests.

\n
\n
Parameters:
\n
    \n
  • sensor_name (str) \u2013 The name of the sensor the context is being constructed for.

  • \n
  • dagster_event (DagsterEvent) \u2013 A DagsterEvent with the same event type as the one that\ntriggers the run_status_sensor

  • \n
  • dagster_instance (DagsterInstance) \u2013 The dagster instance configured for the context.

  • \n
  • dagster_run (DagsterRun) \u2013 DagsterRun object from running a job

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 A dictionary of resources to be made available\nto the sensor.

  • \n
\n
\n
\n

Examples

\n
instance = DagsterInstance.ephemeral()\nresult = my_job.execute_in_process(instance=instance)\n\ndagster_run = result.dagster_run\ndagster_event = result.get_job_success_event() # or get_job_failure_event()\n\ncontext = build_run_status_sensor_context(\n    sensor_name="run_status_sensor_to_invoke",\n    dagster_instance=instance,\n    dagster_run=dagster_run,\n    dagster_event=dagster_event,\n)\nrun_status_sensor_to_invoke(context)\n
\n
\n
\n\n
\n
\n@dagster.run_status_sensor(run_status, name=None, minimum_interval_seconds=None, description=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, request_job=None, request_jobs=None)[source]\u00b6
\n

Creates a sensor that reacts to a given status of job execution, where the decorated\nfunction will be run when a job is at the given status.

\n

Takes a RunStatusSensorContext.

\n
\n
Parameters:
\n
    \n
  • run_status (DagsterRunStatus) \u2013 The status of run execution which will be\nmonitored by the sensor.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 Jobs in the current repository that will be monitored by this sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository matches the requested run_status. Jobs in external repositories can be monitored by using\nRepositorySelector or JobSelector.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the Dagster instance.\nIf set to True, an error will be raised if you also specify monitored_jobs or job_selection.\nDefaults to False.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs) Jobs in the current repository that will be\nmonitored by this sensor. Defaults to None, which means the alert will be sent when\nany job in the repository matches the requested run_status.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job that should be\nexecuted if a RunRequest is yielded from the sensor.

  • \n
  • request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental)\nA list of jobs to be executed if RunRequests are yielded from the sensor.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster.run_failure_sensor(name=None, minimum_interval_seconds=None, description=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, request_job=None, request_jobs=None)[source]\u00b6
\n

Creates a sensor that reacts to job failure events, where the decorated function will be\nrun when a run fails.

\n

Takes a RunFailureSensorContext.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the job failure sensor. Defaults to the name of the\ndecorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 The jobs in the current repository that will be monitored by this failure sensor.\nDefaults to None, which means the alert will be sent when any job in the current\nrepository fails.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs) The jobs in the current repository that will be\nmonitored by this failure sensor. Defaults to None, which means the alert will be sent\nwhen any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job a RunRequest should\nexecute if yielded from the sensor.

  • \n
  • request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental)\nA list of jobs to be executed if RunRequests are yielded from the sensor.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.SensorResult(run_requests=None, skip_reason=None, cursor=None, dynamic_partitions_requests=None, asset_events=None)[source]\u00b6
\n

The result of a sensor evaluation.

\n
\n
\nrun_requests\u00b6
\n

A list\nof run requests to be executed.

\n
\n
Type:
\n

Optional[Sequence[RunRequest]]

\n
\n
\n
\n\n
\n
\nskip_reason\u00b6
\n

A skip message indicating why sensor\nevaluation was skipped.

\n
\n
Type:
\n

Optional[Union[str, SkipReason]]

\n
\n
\n
\n\n
\n
\ncursor\u00b6
\n

The cursor value for this sensor, which will be provided on the\ncontext for the next sensor evaluation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ndynamic_partitions_requests\u00b6
\n

A list of dynamic partition requests to request dynamic\npartition addition and deletion. Run requests will be evaluated using the state of the\npartitions with these changes applied. Type: Optional[Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]].

\n
\n\n
\n
\nasset_events\u00b6
\n

(Experimental) A\nlist of materializations, observations, and asset check evaluations that the system\nwill persist on your behalf at the end of sensor evaluation. These events will not\nbe associated with any particular run, but will be queryable and viewable in the asset catalog.

\n
\n
Type:
\n

Optional[Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AddDynamicPartitionsRequest(partitions_def_name, partition_keys)[source]\u00b6
\n

A request to add partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule.

\n
\n\n
\n
\nclass dagster.DeleteDynamicPartitionsRequest(partitions_def_name, partition_keys)[source]\u00b6
\n

A request to delete partitions from a dynamic partitions definition, to be evaluated by a sensor or schedule.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/schedules-sensors", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../resources/", "title": "Resources"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "N", "next"], ["sections/api/apidocs/resources", "Resources", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/schedules-sensors.rst.txt", "title": "Run Requests", "toc": "\n"}, "types": {"alabaster_version": "0.7.13", "body": "
\n

Types\u00b6

\n

Dagster includes facilities for typing the input and output values of ops (\u201cruntime\u201d types).

\n
\n

Built-in types\u00b6

\n
\n
\ndagster.Nothing\u00b6
\n

Use this type only for inputs and outputs, in order to establish an execution dependency without\ncommunicating a value. Inputs of this type will not be passed to the op compute function, so\nit is necessary to use the explicit In API to define them rather than\nthe Python 3 type hint syntax.

\n

All values are considered to be instances of Nothing.

\n

Examples:

\n
@op\ndef wait(_) -> Nothing:\n    time.sleep(1)\n    return\n\n@op(\n    ins={"ready": In(dagster_type=Nothing)},\n)\ndef done(_) -> str:\n    return 'done'\n\n@job\ndef nothing_job():\n    done(wait())\n\n# Any value will pass the type check for Nothing\n@op\ndef wait_int(_) -> Int:\n    time.sleep(1)\n    return 1\n\n@job\ndef nothing_int_job():\n    done(wait_int())\n
\n
\n
\n\n
\n
\n

Making New Types\u00b6

\n
\n
\nclass dagster.DagsterType(type_check_fn, key=None, name=None, is_builtin=False, description=None, loader=None, required_resource_keys=None, kind=DagsterTypeKind.REGULAR, typing_type=typing.Any, metadata=None)[source]\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters:
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

    The unique key to identify types programmatically.\nThe key property always has a value. If you omit the key argument\nto the init function, it instead receives the value of name. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to DagsterTypeKind.REGULAR. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to typing.Any. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
\n
\nproperty description\u00b6
\n

Description of the type, or None if not provided.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty display_name\u00b6
\n

Either the name or key (if name is None) of the type, overridden in many subclasses.

\n
\n\n
\n
\nproperty has_unique_name\u00b6
\n

Whether the type has a unique name.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty loader\u00b6
\n

Loader for this type, if any.

\n
\n
Type:
\n

Optional[DagsterTypeLoader]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

Set of resource keys required by the type check function.

\n
\n
Type:
\n

AbstractSet[str]

\n
\n
\n
\n\n
\n
\ntype_check(context, value)[source]\u00b6
\n

Type check the value against the type.

\n
\n
Parameters:
\n
    \n
  • context (TypeCheckContext) \u2013 The context of the type check.

  • \n
  • value (Any) \u2013 The value to check.

  • \n
\n
\n
Returns:
\n

The result of the type check.

\n
\n
Return type:
\n

TypeCheck

\n
\n
\n
\n\n
\n
\nproperty typing_type\u00b6
\n

The python typing type for this type.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty unique_name\u00b6
\n

The unique name of this type. Can be None if the type is not unique, such as container types.

\n
\n\n
\n\n
\n
\ndagster.PythonObjectDagsterType(python_type, key=None, name=None, **kwargs)[source]\u00b6
\n

Define a type in dagster whose typecheck is an isinstance check.

\n

Specifically, the type can either be a single python type (e.g. int),\nor a tuple of types (e.g. (int, float)) which is treated as a union.

\n

Examples

\n
ntype = PythonObjectDagsterType(python_type=int)\nassert ntype.name == 'int'\nassert_success(ntype, 1)\nassert_failure(ntype, 'a')\n
\n
\n
ntype = PythonObjectDagsterType(python_type=(int, float))\nassert ntype.name == 'Union[int, float]'\nassert_success(ntype, 1)\nassert_success(ntype, 1.5)\nassert_failure(ntype, 'a')\n
\n
\n
\n
Parameters:
\n
    \n
  • python_type (Union[Type, Tuple[Type, ...]]) \u2013 The dagster typecheck function calls isinstance on\nthis type.

  • \n
  • name (Optional[str]) \u2013 Name the type. Defaults to the name of python_type.

  • \n
  • key (Optional[str]) \u2013 Key of the type. Defaults to name.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.dagster_type_loader(config_schema, required_resource_keys=None, loader_version=None, external_version_fn=None)[source]\u00b6
\n

Create a dagster type loader that maps config data to a runtime value.

\n

The decorated function should take the execution context and parsed config value and return the\nappropriate runtime value.

\n
\n
Parameters:
\n
    \n
  • config_schema (ConfigSchema) \u2013 The schema for the config that\u2019s passed to the decorated\nfunction.

  • \n
  • loader_version (str) \u2013 (Experimental) The version of the decorated compute function. Two\nloading functions should have the same version if and only if they deterministically\nproduce the same outputs when provided the same inputs.

  • \n
  • external_version_fn (Callable) \u2013 (Experimental) A function that takes in the same parameters as the loader\nfunction (config_value) and returns a representation of the version of the external\nasset (str). Two external assets with identical versions are treated as identical to one\nanother.

  • \n
\n
\n
\n

Examples

\n
@dagster_type_loader(Permissive())\ndef load_dict(_context, value):\n    return value\n
\n
\n
\n\n
\n
\nclass dagster.DagsterTypeLoader[source]\u00b6
\n

Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\nto.

\n

The recommended way to define a type loader is with the\n@dagster_type_loader decorator.

\n
\n\n
\n
\nclass dagster.DagsterTypeLoaderContext(plan_data, execution_data, log_manager, step, output_capture, known_state)[source]\u00b6
\n

The context object provided to a @dagster_type_loader-decorated function during execution.

\n

Users should not construct this object directly.

\n
\n
\nproperty job_def\u00b6
\n

The underlying job definition being executed.

\n
\n\n
\n
\nproperty op_def\u00b6
\n

The op for which type loading is occurring.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources available to the type loader, specified by the required_resource_keys argument of the decorator.

\n
\n\n
\n\n
\n
\ndagster.usable_as_dagster_type(name=None, description=None, loader=None)[source]\u00b6
\n

Decorate a Python class to make it usable as a Dagster Type.

\n

This is intended to make it straightforward to annotate existing business logic classes to\nmake them dagster types whose typecheck is an isinstance check against that python class.

\n
\n
Parameters:
\n
    \n
  • python_type (cls) \u2013 The python type to make usable as a Dagster type.

  • \n
  • name (Optional[str]) \u2013 Name of the new Dagster type. If None, the name (__name__) of\nthe python_type will be used.

  • \n
  • description (Optional[str]) \u2013 A user-readable description of the type.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
\n
\n
\n

Examples

\n
# dagster_aws.s3.file_manager.S3FileHandle\n@usable_as_dagster_type\nclass S3FileHandle(FileHandle):\n    def __init__(self, s3_bucket, s3_key):\n        self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n        self._s3_key = check.str_param(s3_key, 's3_key')\n\n    @property\n    def s3_bucket(self):\n        return self._s3_bucket\n\n    @property\n    def s3_key(self):\n        return self._s3_key\n\n    @property\n    def path_desc(self):\n        return self.s3_path\n\n    @property\n    def s3_path(self):\n        return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n
\n
\n
\n\n
\n
\ndagster.make_python_type_usable_as_dagster_type(python_type, dagster_type)[source]\u00b6
\n

Take any existing python type and map it to a dagster type (generally created with\nDagsterType). This can only be called once\non a given python type.

\n
\n\n
\n

Testing Types\u00b6

\n
\n
\ndagster.check_dagster_type(dagster_type, value)[source]\u00b6
\n

Test a custom Dagster type.

\n
\n
Parameters:
\n
    \n
  • dagster_type (Any) \u2013 The Dagster type to test. Should be one of the\nbuilt-in types, a dagster type explicitly constructed with\nas_dagster_type(), @usable_as_dagster_type, or\nPythonObjectDagsterType(), or a Python type.

  • \n
  • value (Any) \u2013 The runtime value to test.

  • \n
\n
\n
Returns:
\n

The result of the type check.

\n
\n
Return type:
\n

TypeCheck

\n
\n
\n

Examples

\n
assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/types", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../utilities/", "title": "Utilities"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/utilities", "Utilities", "N", "next"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/types.rst.txt", "title": "Types", "toc": "\n"}, "utilities": {"alabaster_version": "0.7.13", "body": "
\n

Utilities\u00b6

\n
\n
\ndagster.file_relative_path(dunderfile, relative_path)[source]\u00b6
\n

Get a path relative to the currently executing Python file.

\n

This function is useful when one needs to load a file that is relative to the position of\nthe current file. (Such as when you encode a configuration file path in a source file and want\nit runnable from any current working directory.)

\n
\n
Parameters:
\n
    \n
  • dunderfile (str) \u2013 Should always be __file__.

  • \n
  • relative_path (str) \u2013 Path to get relative to the currently executing file.

  • \n
\n
\n
\n

Examples:

\n
file_relative_path(__file__, 'path/relative/to/file')\n
\n
\n
\n\n
\n
\ndagster.config_from_files(config_files)[source]\u00b6
\n

Constructs run config from YAML files.

\n
\n
Parameters:
\n

config_files (List[str]) \u2013 List of paths or glob patterns for yaml files\nto load and parse as the run config.

\n
\n
Returns:
\n

A run config dictionary constructed from provided YAML files.

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
Raises:
\n
    \n
  • FileNotFoundError \u2013 When a config file produces no results

  • \n
  • DagsterInvariantViolationError \u2013 When one of the YAML files is invalid and has a parse\n error.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.config_from_pkg_resources(pkg_resource_defs)[source]\u00b6
\n

Load a run config from a package resource, using pkg_resources.resource_string().

\n

Example

\n
config_from_pkg_resources(\n    pkg_resource_defs=[\n        ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n        ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n    ],\n)\n
\n
\n
\n
Parameters:
\n

pkg_resource_defs (List[(str, str)]) \u2013 List of pkg_resource modules/files to\nload as the run config.

\n
\n
Returns:
\n

A run config dictionary constructed from the provided yaml strings

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
Raises:
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\ndagster.config_from_yaml_strings(yaml_strings)[source]\u00b6
\n

Static constructor for run configs from YAML strings.

\n
\n
Parameters:
\n

yaml_strings (List[str]) \u2013 List of yaml strings to parse as the run config.

\n
\n
Returns:
\n

A run config dictionary constructed from the provided yaml strings

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
Raises:
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\ndagster.get_dagster_logger(name=None)[source]\u00b6
\n

Creates a python logger whose output messages will be captured and converted into Dagster log\nmessages. This means they will have structured information such as the step_key, run_id, etc.\nembedded into them, and will show up in the Dagster event log.

\n

This can be used as a more convenient alternative to context.log in most cases. If log level\nis not set explicitly, defaults to DEBUG.

\n
\n
Parameters:
\n

name (Optional[str]) \u2013 If supplied, will create a logger with the name \u201cdagster.builtin.{name}\u201d,\nwith properties inherited from the base Dagster logger. If omitted, the returned logger\nwill be named \u201cdagster.builtin\u201d.

\n
\n
Returns:
\n

A logger whose output will be captured by Dagster.

\n
\n
Return type:
\n

logging.Logger

\n
\n
\n

Example

\n
from dagster import get_dagster_logger, op\n\n@op\ndef hello_op():\n    log = get_dagster_logger()\n    for i in range(5):\n        # do something\n        log.info(f"Did {i+1} things!")\n
\n
\n
\n\n
\n
\nclass dagster.ExperimentalWarning[source]\u00b6
\n
\n\n
\n
\ndagster.make_email_on_run_failure_sensor(email_from, email_password, email_to, email_body_fn=<function _default_failure_email_body>, email_subject_fn=<function _default_failure_email_subject>, smtp_host='smtp.gmail.com', smtp_type='SSL', smtp_port=None, name=None, webserver_base_url=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED)[source]\u00b6
\n

Create a job failure sensor that sends email via the SMTP protocol.

\n
\n
Parameters:
\n
    \n
  • email_from (str) \u2013 The sender email address to send the message from.

  • \n
  • email_password (str) \u2013 The password of the sender.

  • \n
  • email_to (List[str]) \u2013 The recipient email addresses to send the message to.

  • \n
  • email_body_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the email body you want to send.\nDefaults to the plain text that contains error message, job name, and run ID.

  • \n
  • email_subject_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the email subject you want to send.\nDefaults to \u201cDagster Run Failed: <job_name>\u201d.

  • \n
  • smtp_host (str) \u2013 The hostname of the SMTP server. Defaults to \u201csmtp.gmail.com\u201d.

  • \n
  • smtp_type (str) \u2013 The protocol; either \u201cSSL\u201d or \u201cSTARTTLS\u201d. Defaults to SSL.

  • \n
  • smtp_port (Optional[int]) \u2013 The SMTP port. Defaults to 465 for SSL, 587 for STARTTLS.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to \u201cemail_on_job_failure\u201d.

  • \n
  • webserver_base_url (Optional[str]) \u2013 The base url of your dagster-webserver instance. Specify this to allow\nmessages to include deeplinks to the failed run.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector]]]) \u2013 The jobs that will be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails. To monitor jobs in external repositories,\nuse RepositorySelector and JobSelector.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs) The jobs that will be monitored by this failure\nsensor. Defaults to None, which means the alert will be sent when any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
email_on_run_failure = make_email_on_run_failure_sensor(\n    email_from="no-reply@example.com",\n    email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n    email_to=["xxx@example.com"],\n)\n\n@repository\ndef my_repo():\n    return [my_job, email_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return (\n        f"Job {context.dagster_run.job_name} failed!"\n        f"Error: {context.failure_event.message}"\n    )\n\nemail_on_run_failure = make_email_on_run_failure_sensor(\n    email_from="no-reply@example.com",\n    email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n    email_to=["xxx@example.com"],\n    email_body_fn=my_message_fn,\n    email_subject_fn=lambda _: "Dagster Alert",\n    webserver_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\nclass dagster._utils.forked_pdb.ForkedPdb(completekey='tab', stdin=None, stdout=None, skip=None, nosigint=False, readrc=True)[source]\u00b6
\n

A pdb subclass that may be used from a forked multiprocessing child.

\n

Examples:

\n
from dagster._utils.forked_pdb import ForkedPdb\n\n@solid\ndef complex_solid(_):\n    # some complicated stuff\n\n    ForkedPdb().set_trace()\n\n    # some other complicated stuff\n
\n
\n

You can initiate pipeline execution via the webserver and use the pdb debugger to examine/step through\nexecution at the breakpoint.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/utilities", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../memoization/", "title": "Job-Level Versioning and Memoization (Deprecated)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../types/", "title": "Types"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/memoization", "Job-Level Versioning and Memoization (Deprecated)", "N", "next"], ["sections/api/apidocs/types", "Types", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/utilities.rst.txt", "title": "Utilities", "toc": "\n"}}}} \ No newline at end of file +{"api": {"apidocs": {"asset-checks": {"alabaster_version": "0.7.13", "body": "
\n

Asset Checks (Experimental)\u00b6

\n

Dagster allows you to define and execute checks on your software-defined assets. Each asset check verifies some property of a data asset, e.g. that it has no null values in a particular column.

\n
\n
\n@dagster.asset_check(*, asset, name=None, description=None, required_resource_keys=None, resource_defs=None, config_schema=None, compute_kind=None, op_tags=None, retry_policy=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a definition for how to execute an asset check.

\n
\n
Parameters:
\n
    \n
  • asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]) \u2013 The\nasset that the check applies to.

  • \n
  • name (Optional[str]) \u2013 The name of the check. If not specified, the name of the decorated\nfunction will be used. Checks for the same asset must have unique names.

  • \n
  • description (Optional[str]) \u2013 The description of the check.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 A set of keys for resources that are required\nby the function that executes the check. These can alternatively be specified by\nincluding resource-typed parameters in the function signature.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the check\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that executes the check.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that executes\nthe check, e.g. \u201cdbt\u201d or \u201cspark\u201d.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that executes the check.

  • \n
\n
\n
\n

Produces an AssetChecksDefinition object.

\n

Example

\n
from dagster import asset, asset_check, AssetCheckResult\n\n@asset\ndef my_asset() -> None:\n    ...\n\n@asset_check(asset=my_asset, description="Check that my asset has enough rows")\ndef my_asset_has_enough_rows() -> AssetCheckResult:\n    num_rows = ...\n    return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n
\n
\n
\n
Example with a DataFrame Output:
from dagster import asset, asset_check, AssetCheckResult\nfrom pandas import DataFrame\n\n@asset\ndef my_asset() -> DataFrame:\n    ...\n\n@asset_check(asset=my_asset, description="Check that my asset has enough rows")\ndef my_asset_has_enough_rows(my_asset: DataFrame) -> AssetCheckResult:\n    num_rows = my_asset.shape[0]\n    return AssetCheckResult(passed=num_rows > 5, metadata={"num_rows": num_rows})\n
\n
\n
\n
\n
\n\n
\n
\nclass dagster.AssetCheckResult(*, passed, asset_key=None, check_name=None, metadata=None, severity=AssetCheckSeverity.ERROR)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

The result of an asset check.

\n
\n
\nasset_key\u00b6
\n

The asset key that was checked.

\n
\n
Type:
\n

Optional[AssetKey]

\n
\n
\n
\n\n
\n
\ncheck_name\u00b6
\n

The name of the check.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npassed\u00b6
\n

The pass/fail result of the check.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

\n
\n
Type:
\n

Optional[Dict[str, RawMetadataValue]]

\n
\n
\n
\n\n
\n
\nseverity\u00b6
\n

Severity of the check. Defaults to ERROR.

\n
\n
Type:
\n

AssetCheckSeverity

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetCheckSpec(name, *, asset, description=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Defines information about a check, except how to execute it.

\n

AssetCheckSpec is often used as an argument to decorators that decorate a function that can\nexecute multiple checks - e.g. @asset, and @multi_asset. It defines one of the checks that\nwill be executed inside that function.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the check.

  • \n
  • asset (Union[AssetKey, Sequence[str], str, AssetsDefinition, SourceAsset]) \u2013 The asset that\nthe check applies to.

  • \n
  • description (Optional[str]) \u2013 Description for the check.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.AssetCheckSeverity(value)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Severity level for an asset check.

\n

Severities:

\n
    \n
  • WARN: If the check fails, don\u2019t fail the step.

  • \n
  • ERROR: If the check fails, fail the step and, within the run, skip materialization of any\nassets that are downstream of the asset being checked.

  • \n
\n
\n\n
\n
\nclass dagster.AssetCheckKey(asset_key, name)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Check names are expected to be unique per-asset. Thus, this combination of asset key and\ncheck name uniquely identifies an asset check within a deployment.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/asset-checks", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../cli/", "title": "Dagster CLI"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../assets/", "title": "Software-Defined Assets"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/cli", "Dagster CLI", "N", "next"], ["sections/api/apidocs/assets", "Software-Defined Assets", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/asset-checks.rst.txt", "title": "Asset Checks (Experimental)", "toc": "\n"}, "assets": {"alabaster_version": "0.7.13", "body": "
\n

Software-Defined Assets\u00b6

\n

An asset is an object in persistent storage, such as a table, file, or persisted machine learning model. A software-defined asset is a Dagster object that couples an asset to the function and upstream assets that are used to produce its contents.

\n
\n
\n@dagster.asset(compute_fn=None, *, name=None, key_prefix=None, ins=None, deps=None, metadata=None, description=None, config_schema=None, required_resource_keys=None, resource_defs=None, io_manager_def=None, io_manager_key=None, compute_kind=None, dagster_type=None, partitions_def=None, op_tags=None, group_name=None, output_required=True, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, retry_policy=None, code_version=None, key=None, non_argument_deps=None, check_specs=None)[source]\u00b6
\n

Create a definition for how to compute an asset.

\n
\n
A software-defined asset is the combination of:
    \n
  1. An asset key, e.g. the name of a table.

  2. \n
  3. A function, which can be run to compute the contents of the asset.

  4. \n
  5. A set of upstream assets that are provided as inputs to the function when computing the asset.

  6. \n
\n
\n
\n

Unlike an op, whose dependencies are determined by the graph it lives inside, an asset knows\nabout the upstream assets it depends on. The upstream assets are inferred from the arguments\nto the decorated function. The name of the argument designates the name of the upstream asset.

\n

An asset has an op inside it to represent the function that computes it. The name of the op\nwill be the segments of the asset key, separated by double-underscores.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the asset. If not provided, defaults to the name of the\ndecorated function. The asset\u2019s name must be a valid name in dagster (ie only contains\nletters, numbers, and _) and may not contain python reserved keywords.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name, which defaults to the name of\nthe decorated function. Each item in key_prefix must be a valid name in dagster (ie only\ncontains letters, numbers, and _) and may not contain python reserved keywords.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • deps (Optional[Sequence[Union[AssetDep, AssetsDefinition, SourceAsset, AssetKey, str]]]) \u2013 The assets that are upstream dependencies, but do not correspond to a parameter of the\ndecorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\nall assets created by the multi_asset will be created.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the asset\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata entries for the asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the op.

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the IOManager used\nfor storing the output of the op as an asset, and for loading it in downstream ops\n(default: \u201cio_manager\u201d). Only one of io_manager_key and io_manager_def can be provided.

  • \n
  • io_manager_def (Optional[object]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) The IOManager used for\nstoring the output of the op as an asset, and for loading it in\ndownstream ops. Only one of io_manager_def and io_manager_key can be provided.

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in the Dagster UI as a badge on the asset.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 Allows specifying type validation functions that\nwill be executed on the output of the decorated function after it runs.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. If not provided,\nthe name \u201cdefault\u201d is used.

  • \n
  • resource_defs (Optional[Mapping[str, object]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A mapping of resource keys to resources. These resources\nwill be initialized during execution, and can be accessed from the\ncontext within the body of the function.

  • \n
  • output_required (bool) \u2013 Whether the decorated function will always materialize an asset.\nDefaults to True. If False, the function can return None, which will not be materialized to\nstorage and will halt execution of downstream assets.

  • \n
  • freshness_policy (FreshnessPolicy) \u2013 A constraint telling Dagster how often this asset is intended to be updated\nwith respect to its root data.

  • \n
  • auto_materialize_policy (AutoMaterializePolicy) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) Configure Dagster to automatically materialize\nthis asset according to its FreshnessPolicy and when upstream dependencies change.

  • \n
  • backfill_policy (BackfillPolicy) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) Configure Dagster to backfill this asset according to its\nBackfillPolicy.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that computes the asset.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code that generates this asset. In\ngeneral, versions should be set only for code that deterministically produces the same\noutput when given the same inputs.

  • \n
  • check_specs (Optional[Sequence[AssetCheckSpec]]) \u2013 (Experimental) Specs for asset checks that\nexecute in the decorated function after materializing the asset.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0.0. use deps instead.) Deprecated, use deps instead.\nSet of asset keys that are upstream dependencies, but do not pass an input to the asset.

  • \n
  • key (Optional[CoercibleToAssetKey]) \u2013 The key for this asset. If provided, cannot specify key_prefix or name.

  • \n
\n
\n
\n

Examples

\n
@asset\ndef my_asset(my_upstream_asset: int) -> int:\n    return my_upstream_asset + 1\n
\n
\n
\n\n
\n
\nclass dagster.MaterializeResult(*, asset_key=None, metadata=None, check_results=None, data_version=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An object representing a successful materialization of an asset. These can be returned from\n@asset and @multi_asset decorated functions to pass metadata or specify specific assets were\nmaterialized.

\n
\n
\nasset_key\u00b6
\n

Optional in @asset, required in @multi_asset to discern which asset this refers to.

\n
\n
Type:
\n

Optional[AssetKey]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

Metadata to record with the corresponding AssetMaterialization event.

\n
\n
Type:
\n

Optional[MetadataUserInput]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetSpec(key, *, deps=None, description=None, metadata=None, skippable=False, group_name=None, code_version=None, freshness_policy=None, auto_materialize_policy=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Specifies the core attributes of an asset. This object is attached to the decorated\nfunction that defines how it is materialized.

\n
\n
\nkey\u00b6
\n

The unique identifier for this asset.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n
\ndeps\u00b6
\n

The asset keys for the upstream assets that\nmaterializing this asset depends on.

\n
\n
Type:
\n

Optional[AbstractSet[AssetKey]]

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

Human-readable description of this asset.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of static metadata for this asset.\nFor example, users can provide information about the database table this\nasset corresponds to.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nskippable\u00b6
\n

Whether this asset can be omitted during materialization, causing downstream\ndependencies to skip.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\ngroup_name\u00b6
\n

A string name used to organize multiple assets into groups. If\nnot provided, the name \u201cdefault\u201d is used.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ncode_version\u00b6
\n

The version of the code for this specific asset,\noverriding the code version of the materialization function

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nfreshness_policy\u00b6
\n

A policy which indicates how up to date this\nasset is intended to be.

\n
\n
Type:
\n

Optional[FreshnessPolicy]

\n
\n
\n
\n\n
\n
\nauto_materialize_policy\u00b6
\n

AutoMaterializePolicy to apply to\nthe specified asset.

\n
\n
Type:
\n

Optional[AutoMaterializePolicy]

\n
\n
\n
\n\n
\n
\nbackfill_policy\u00b6
\n

BackfillPolicy to apply to the specified asset.

\n
\n
Type:
\n

Optional[BackfillPolicy]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetDep(asset, *, partition_mapping=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Specifies a dependency on an upstream asset.

\n
\n
\nasset\u00b6
\n

The upstream asset to depend on.

\n
\n
Type:
\n

Union[AssetKey, str, AssetSpec, AssetsDefinition, SourceAsset]

\n
\n
\n
\n\n
\n
\npartition_mapping\u00b6
\n

Defines what partitions to depend on in\nthe upstream asset. If not provided and the upstream asset is partitioned, defaults to\nthe default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

\n
\n
Type:
\n

Optional[PartitionMapping]

\n
\n
\n
\n\n

Examples

\n
upstream_asset = AssetSpec("upstream_asset")\ndownstream_asset = AssetSpec(\n    "downstream_asset",\n    deps=[\n        AssetDep(\n            upstream_asset,\n            partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1)\n        )\n    ]\n)\n
\n
\n
\n\n
\n
\nclass dagster.AssetIn(key=None, metadata=None, key_prefix=None, input_manager_key=None, partition_mapping=None, dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>)[source]\u00b6
\n

Defines an asset dependency.

\n
\n
\nkey_prefix\u00b6
\n

If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the input name. Only one of the \u201ckey_prefix\u201d and\n\u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str]]]

\n
\n
\n
\n\n
\n
\nkey\u00b6
\n

The asset\u2019s key. Only one of the\n\u201ckey_prefix\u201d and \u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str], AssetKey]]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of the metadata for the input.\nFor example, if you only need a subset of columns from an upstream table, you could\ninclude that in metadata and the IO manager that loads the upstream table could use the\nmetadata to determine which columns to load.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\npartition_mapping\u00b6
\n

Defines what partitions to depend on in\nthe upstream asset. If not provided, defaults to the default partition mapping for the\npartitions definition, which typically maps partition keys to the same partition keys\nin upstream assets.

\n
\n
Type:
\n

Optional[PartitionMapping]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

Allows specifying type validation functions that\nwill be executed on the input of the decorated function before it runs.

\n
\n
Type:
\n

DagsterType

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SourceAsset(key, metadata=None, io_manager_key=None, io_manager_def=None, description=None, partitions_def=None, group_name=None, resource_defs=None, observe_fn=None, *, auto_observe_interval_minutes=None, _required_resource_keys=None)[source]\u00b6
\n

A SourceAsset represents an asset that will be loaded by (but not updated by) Dagster.

\n
\n
\nkey\u00b6
\n

The key of the asset.

\n
\n
Type:
\n

Union[AssetKey, Sequence[str], str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

Metadata associated with the asset.

\n
\n
Type:
\n

Mapping[str, MetadataValue]

\n
\n
\n
\n\n
\n
\nio_manager_key\u00b6
\n

The key for the IOManager that will be used to load the contents of\nthe asset when it\u2019s used as an input to other assets inside a job.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nio_manager_def\u00b6
\n

(Experimental) The definition of the IOManager that will be used to load the contents of\nthe asset when it\u2019s used as an input to other assets inside a job.

\n
\n
Type:
\n

Optional[IOManagerDefinition]

\n
\n
\n
\n\n
\n
\nresource_defs\u00b6
\n

(Experimental) resource definitions that may be required by the dagster.IOManagerDefinition provided in the io_manager_def argument.

\n
\n
Type:
\n

Optional[Mapping[str, ResourceDefinition]]

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

The description of the asset.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\npartitions_def\u00b6
\n

Defines the set of partition keys that\ncompose the asset.

\n
\n
Type:
\n

Optional[PartitionsDefinition]

\n
\n
\n
\n\n
\n
\nobserve_fn\u00b6
\n
\n
Type:
\n

Optional[SourceAssetObserveFunction]

\n
\n
\n
\n\n
\n
\nproperty is_observable\u00b6
\n

Whether the asset is observable.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty op\u00b6
\n

The OpDefinition associated with the observation function of an observable\nsource asset.

\n

Throws an error if the asset is not observable.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n\n
\n
\ndagster.define_asset_job(name, selection=None, config=None, description=None, tags=None, metadata=None, partitions_def=None, executor_def=None, hooks=None)[source]\u00b6
\n

Creates a definition of a job which will either materialize a selection of assets or observe\na selection of source assets. This will only be resolved to a JobDefinition once placed in a\ncode location.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name for the job.

  • \n
  • selection (Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]) \u2013

    The assets that will be materialized or observed when the job is run.

    \n

    The selected assets must all be included in the assets that are passed to the assets\nargument of the Definitions object that this job is included on.

    \n

    The string \u201cmy_asset*\u201d selects my_asset and all downstream assets within the code\nlocation. A list of strings represents the union of all assets selected by strings\nwithin the list.

    \n

    The selection will be resolved to a set of assets when the location is loaded. If the\nselection resolves to all source assets, the created job will perform source asset\nobservations. If the selection resolves to all regular assets, the created job will\nmaterialize assets. If the selection resolves to a mixed set of source assets and\nregular assets, an error will be thrown.

    \n

  • \n
  • config \u2013

    Describes how the Job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, and the ConfigMapping should return\nconfiguration in the standard format to configure the job.

    \n

  • \n
  • tags (Optional[Mapping[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Mapping[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the job.\nKeys are displayed string labels, and values are one of the following: string, float,\nint, JSON-serializable dict, JSON-serializable list, and one of the data classes\nreturned by a MetadataValue static method.

  • \n
  • description (Optional[str]) \u2013 A description for the Job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partitions for this job. All AssetDefinitions selected for this job\nmust have a matching PartitionsDefinition. If no PartitionsDefinition is provided, the\nPartitionsDefinition will be inferred from the selected AssetDefinitions.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
\n
\n
Returns:
\n

The job, which can be placed inside a code location.

\n
\n
Return type:
\n

UnresolvedAssetJobDefinition

\n
\n
\n

Examples

\n
# A job that targets all assets in the code location:\n@asset\ndef asset1():\n    ...\n\ndefs = Definitions(\n    assets=[asset1],\n    jobs=[define_asset_job("all_assets")],\n)\n\n# A job that targets a single asset\n@asset\ndef asset1():\n    ...\n\ndefs = Definitions(\n    assets=[asset1],\n    jobs=[define_asset_job("all_assets", selection=[asset1])],\n)\n\n# A job that targets all the assets in a group:\ndefs = Definitions(\n    assets=assets,\n    jobs=[define_asset_job("marketing_job", selection=AssetSelection.groups("marketing"))],\n)\n\n@observable_source_asset\ndef source_asset():\n    ...\n\n# A job that observes a source asset:\ndefs = Definitions(\n    assets=assets,\n    jobs=[define_asset_job("observation_job", selection=[source_asset])],\n)\n\n# Resources are supplied to the assets, not the job:\n@asset(required_resource_keys={"slack_client"})\ndef asset1():\n    ...\n\ndefs = Definitions(\n    assets=[asset1],\n    jobs=[define_asset_job("all_assets")],\n    resources={"slack_client": prod_slack_client},\n)\n
\n
\n
\n\n
\n
\nclass dagster.AssetSelection[source]\u00b6
\n

An AssetSelection defines a query over a set of assets and asset checks, normally all that are defined in a code location.

\n

You can use the \u201c|\u201d, \u201c&\u201d, and \u201c-\u201d operators to create unions, intersections, and differences of selections, respectively.

\n

AssetSelections are typically used with define_asset_job().

\n

By default, selecting assets will also select all of the asset checks that target those assets.

\n

Examples

\n
# Select all assets in group "marketing":\nAssetSelection.groups("marketing")\n\n# Select all assets in group "marketing", as well as the asset with key "promotion":\nAssetSelection.groups("marketing") | AssetSelection.keys("promotion")\n\n# Select all assets in group "marketing" that are downstream of asset "leads":\nAssetSelection.groups("marketing") & AssetSelection.keys("leads").downstream()\n\n# Select a list of assets:\nAssetSelection.assets(*my_assets_list)\n\n# Select all assets except for those in group "marketing"\nAssetSelection.all() - AssetSelection.groups("marketing")\n\n# Select all assets which are materialized by the same op as "projections":\nAssetSelection.keys("projections").required_multi_asset_neighbors()\n\n# Select all assets in group "marketing" and exclude their asset checks:\nAssetSelection.groups("marketing") - AssetSelection.all_asset_checks()\n\n# Select all asset checks that target a list of assets:\nAssetSelection.checks_for_assets(*my_assets_list)\n\n# Select a specific asset check:\nAssetSelection.checks(my_asset_check)\n
\n
\n
\n
\nstatic all()[source]\u00b6
\n

Returns a selection that includes all assets and asset checks.

\n
\n\n
\n
\nstatic all_asset_checks()[source]\u00b6
\n

Returns a selection that includes all asset checks.

\n
\n\n
\n
\nstatic assets(*assets_defs)[source]\u00b6
\n

Returns a selection that includes all of the provided assets and asset checks that target them.

\n
\n\n
\n
\nstatic checks(*asset_checks)[source]\u00b6
\n

Returns a selection that includes all of the provided asset checks.

\n
\n\n
\n
\nstatic checks_for_assets(*assets_defs)[source]\u00b6
\n

Returns a selection with the asset checks that target the provided assets.

\n
\n\n
\n
\ndownstream(depth=None, include_self=True)[source]\u00b6
\n

Returns a selection that includes all assets that are downstream of any of the assets in\nthis selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates through each\nasset in this selection and returns the union of all downstream assets.

\n
\n
depth (Optional[int]): If provided, then only include assets to the given depth. A depth

of 2 means all assets that are children or grandchildren of the assets in this\nselection.

\n
\n
include_self (bool): If True, then include the assets in this selection in the result.

If the include_self flag is False, return each downstream asset that is not part of the\noriginal selection. By default, set to True.

\n
\n
\n
\n\n
\n
\nstatic groups(*group_strs, include_sources=False)[source]\u00b6
\n

Returns a selection that includes materializable assets that belong to any of the\nprovided groups and all the asset checks that target them.

\n
\n
Parameters:
\n

include_sources (bool) \u2013 If True, then include source assets matching the group in the\nselection.

\n
\n
\n
\n\n
\n
\nstatic key_prefixes(*key_prefixes, include_sources=False)[source]\u00b6
\n

Returns a selection that includes assets that match any of the provided key prefixes and all the asset checks that target them.

\n
\n
Parameters:
\n

include_sources (bool) \u2013 If True, then include source assets matching the key prefix(es)\nin the selection.

\n
\n
\n

Examples

\n
# match any asset key where the first segment is equal to "a" or "b"\n# e.g. AssetKey(["a", "b", "c"]) would match, but AssetKey(["abc"]) would not.\nAssetSelection.key_prefixes("a", "b")\n\n# match any asset key where the first two segments are ["a", "b"] or ["a", "c"]\nAssetSelection.key_prefixes(["a", "b"], ["a", "c"])\n
\n
\n
\n\n
\n
\nstatic keys(*asset_keys)[source]\u00b6
\n

Returns a selection that includes assets with any of the provided keys and all asset checks that target them.

\n

Examples

\n
AssetSelection.keys(AssetKey(["a"]))\n\nAssetSelection.keys("a")\n\nAssetSelection.keys(AssetKey(["a"]), AssetKey(["b"]))\n\nAssetSelection.keys("a", "b")\n\nasset_key_list = [AssetKey(["a"]), AssetKey(["b"])]\nAssetSelection.keys(*asset_key_list)\n
\n
\n
\n\n
\n
\nrequired_multi_asset_neighbors()[source]\u00b6
\n

Given an asset selection in which some assets are output from a multi-asset compute op\nwhich cannot be subset, returns a new asset selection that contains all of the assets\nrequired to execute the original asset selection. Includes the asset checks targeting the returned assets.

\n
\n\n
\n
\nroots()[source]\u00b6
\n

Given an asset selection, returns a new asset selection that contains all of the root\nassets within the original asset selection. Includes the asset checks targeting the returned assets.

\n

A root asset is an asset that has no upstream dependencies within the asset selection.\nThe root asset can have downstream dependencies outside of the asset selection.

\n

Because mixed selections of source and materializable assets are currently not supported,\nkeys corresponding to SourceAssets will not be included as roots. To select source assets,\nuse the upstream_source_assets method.

\n
\n\n
\n
\nsinks()[source]\u00b6
\n

Given an asset selection, returns a new asset selection that contains all of the sink\nassets within the original asset selection. Includes the asset checks targeting the returned assets.

\n

A sink asset is an asset that has no downstream dependencies within the asset selection.\nThe sink asset can have downstream dependencies outside of the asset selection.

\n
\n\n
\n
\nsources()[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use AssetSelection.roots instead.\n \n

\n

Given an asset selection, returns a new asset selection that contains all of the root\nassets within the original asset selection. Includes the asset checks targeting the returned assets.

\n

A root asset is a materializable asset that has no upstream dependencies within the asset\nselection. The root asset can have downstream dependencies outside of the asset selection.

\n

Because mixed selections of source and materializable assets are currently not supported,\nkeys corresponding to SourceAssets will not be included as roots. To select source assets,\nuse the upstream_source_assets method.

\n
\n\n
\n
\nupstream(depth=None, include_self=True)[source]\u00b6
\n

Returns a selection that includes all materializable assets that are upstream of any of\nthe assets in this selection, selecting the assets in this selection by default. Includes the asset checks targeting the returned assets. Iterates\nthrough each asset in this selection and returns the union of all upstream assets.

\n

Because mixed selections of source and materializable assets are currently not supported,\nkeys corresponding to SourceAssets will not be included as upstream of regular assets.

\n
\n
Parameters:
\n
    \n
  • depth (Optional[int]) \u2013 If provided, then only include assets to the given depth. A depth\nof 2 means all assets that are parents or grandparents of the assets in this\nselection.

  • \n
  • include_self (bool) \u2013 If True, then include the assets in this selection in the result.\nIf the include_self flag is False, return each upstream asset that is not part of the\noriginal selection. By default, set to True.

  • \n
\n
\n
\n
\n\n
\n
\nupstream_source_assets()[source]\u00b6
\n

Given an asset selection, returns a new asset selection that contains all of the source\nassets upstream of assets in the original selection. Includes the asset checks targeting the returned assets.

\n
\n\n
\n
\nwithout_checks()[source]\u00b6
\n

Removes all asset checks in the selection.

\n
\n\n
\n\n
\n
\nclass dagster.FreshnessPolicy(*, maximum_lag_minutes, cron_schedule=None, cron_schedule_timezone=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

A FreshnessPolicy specifies how up-to-date you want a given asset to be.

\n

Attaching a FreshnessPolicy to an asset definition encodes an expectation on the upstream data\nthat you expect to be incorporated into the current state of that asset at certain points in time.\nHow this is calculated differs depending on if the asset is unpartitioned or time-partitioned\n(other partitioning schemes are not supported).

\n

For time-partitioned assets, the current data time for the asset is simple to calculate. The\nupstream data that is incorporated into the asset is exactly the set of materialized partitions\nfor that asset. Thus, the current data time for the asset is simply the time up to which all\npartitions have been materialized.

\n

For unpartitioned assets, the current data time is based on the upstream materialization records\nthat were read to generate the current state of the asset. More specifically,\nimagine you have two assets, where B depends on A. If B has a FreshnessPolicy defined, this\nmeans that at time T, the most recent materialization of B should have come after a\nmaterialization of A which was no more than maximum_lag_minutes ago. This calculation is\nrecursive: any given asset is expected to incorporate up-to-date data from all of its upstream\nassets.

\n

It is assumed that all asset definitions with no upstream asset definitions consume from some\nalways-updating source. That is, if you materialize that asset at time T, it will incorporate\nall data up to time T.

\n

If cron_schedule is not defined, the given asset will be expected to incorporate upstream\ndata from no more than maximum_lag_minutes ago at all points in time. For example, \u201cThe events\ntable should always have data from at most 1 hour ago\u201d.

\n

If cron_schedule is defined, the given asset will be expected to incorporate upstream data\nfrom no more than maximum_lag_minutes ago at each cron schedule tick. For example, \u201cBy 9AM,\nthe signups table should contain all of yesterday\u2019s data\u201d.

\n

The freshness status of assets with policies defined will be visible in the UI. If you are using\nan asset reconciliation sensor, this sensor will kick off runs to help keep your assets up to\ndate with respect to their FreshnessPolicy.

\n
\n
Parameters:
\n
    \n
  • maximum_lag_minutes (float) \u2013 An upper bound for how old the data contained within this\nasset may be.

  • \n
  • cron_schedule (Optional[str]) \u2013 A cron schedule string (e.g. "0 1 * * *") specifying a\nseries of times by which the maximum_lag_minutes constraint must be satisfied. If\nno cron schedule is provided, then this constraint must be satisfied at all times.

  • \n
  • cron_schedule_timezone (Optional[str]) \u2013 Timezone in which the cron schedule should be evaluated.\nIf not specified, defaults to UTC. Supported strings for timezones are the ones provided\nby the IANA time zone database <https://www.iana.org/time-zones> - e.g.\n\u201cAmerica/Los_Angeles\u201d.

  • \n
\n
\n
\n
# At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n@asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\ndef fresh_asset():\n    ...\n\n# At any point in time, this asset must incorporate all upstream data from at least 30 minutes ago.\n@asset(freshness_policy=FreshnessPolicy(maximum_lag_minutes=30))\ndef cron_up_to_date_asset():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster.AutoMaterializePolicy(rules, max_materializations_per_minute=1)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An AutoMaterializePolicy specifies how Dagster should attempt to keep an asset up-to-date.

\n

Each policy consists of a set of AutoMaterializeRules, which are used to determine whether an\nasset or a partition of an asset should or should not be auto-materialized.

\n

The most common policy is AutoMaterializePolicy.eager(), which consists of the following rules:

\n
    \n
  • \n
    AutoMaterializeRule.materialize_on_missing()

    Materialize an asset or a partition if it has never been materialized.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.materialize_on_parent_updated()

    Materialize an asset or a partition if one of its parents has been updated more recently\nthan it has.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.materialize_on_required_for_freshness()

    Materialize an asset or a partition if it is required to satisfy a freshness policy.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.skip_on_parent_outdated()

    Skip materializing an asset or partition if any of its parents have ancestors that have\nbeen materialized more recently.

    \n
    \n
    \n
  • \n
  • \n
    AutoMaterializeRule.skip_on_parent_missing()

    Skip materializing an asset or a partition if any parent has never been materialized or\nobserved.

    \n
    \n
    \n
  • \n
\n

Policies can be customized by adding or removing rules. For example, if you\u2019d like to allow\nan asset to be materialized even if some of its parent partitions are missing:

\n
from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\nmy_policy = AutoMaterializePolicy.eager().without_rules(\n    AutoMaterializeRule.skip_on_parent_missing(),\n)\n
\n
\n

If you\u2019d like an asset to wait for all of its parents to be updated before materializing:

\n
from dagster import AutoMaterializePolicy, AutoMaterializeRule\n\nmy_policy = AutoMaterializePolicy.eager().with_rules(\n    AutoMaterializeRule.skip_on_not_all_parents_updated(),\n)\n
\n
\n

Lastly, the max_materializations_per_minute parameter, which is set to 1 by default,\nrate-limits the number of auto-materializations that can occur for a particular asset within\na short time interval. This mainly matters for partitioned assets. Its purpose is to provide a\nsafeguard against \u201csurprise backfills\u201d, where user-error causes auto-materialize to be\naccidentally triggered for large numbers of partitions at once.

\n

Warning:

\n

Constructing an AutoMaterializePolicy directly is not recommended as the API is subject to change.\nAutoMaterializePolicy.eager() and AutoMaterializePolicy.lazy() are the recommended API.

\n
\n
\nstatic eager(max_materializations_per_minute=1)[source]\u00b6
\n

Constructs an eager AutoMaterializePolicy.

\n
\n
Parameters:
\n

max_materializations_per_minute (Optional[int]) \u2013 The maximum number of\nauto-materializations for this asset that may be initiated per minute. If this limit\nis exceeded, the partitions which would have been materialized will be discarded,\nand will require manual materialization in order to be updated. Defaults to 1.

\n
\n
\n
\n\n
\n
\nstatic lazy(max_materializations_per_minute=1)[source]\u00b6
\n

Constructs a lazy AutoMaterializePolicy.

\n
\n
Parameters:
\n

max_materializations_per_minute (Optional[int]) \u2013 The maximum number of\nauto-materializations for this asset that may be initiated per minute. If this limit\nis exceeded, the partitions which would have been materialized will be discarded,\nand will require manual materialization in order to be updated. Defaults to 1.

\n
\n
\n
\n\n
\n
\nwith_rules(*rules_to_add)[source]\u00b6
\n

Constructs a copy of this policy with the specified rules added.

\n
\n\n
\n
\nwithout_rules(*rules_to_remove)[source]\u00b6
\n

Constructs a copy of this policy with the specified rules removed. Raises an error\nif any of the arguments are not rules in this policy.

\n
\n\n
\n\n
\n
\nclass dagster.AutoMaterializeRule[source]\u00b6
\n

An AutoMaterializeRule defines a bit of logic which helps determine if a materialization\nshould be kicked off for a given asset partition.

\n

Each rule can have one of two decision types, MATERIALIZE (indicating that an asset partition\nshould be materialized) or SKIP (indicating that the asset partition should not be\nmaterialized).

\n

Materialize rules are evaluated first, and skip rules operate over the set of candidates that\nare produced by the materialize rules. Other than that, there is no ordering between rules.

\n
\n
\nstatic materialize_on_missing()[source]\u00b6
\n

Materialize an asset partition if it has never been materialized before. This rule will\nnot fire for non-root assets unless that asset\u2019s parents have been updated.

\n
\n\n
\n
\nstatic materialize_on_parent_updated()[source]\u00b6
\n

Materialize an asset partition if one of its parents has been updated more recently\nthan it has.

\n

Note: For time-partitioned or dynamic-partitioned assets downstream of an unpartitioned\nasset, this rule will only fire for the most recent partition of the downstream.

\n
\n\n
\n
\nstatic materialize_on_required_for_freshness()[source]\u00b6
\n

Materialize an asset partition if it is required to satisfy a freshness policy of this\nasset or one of its downstream assets.

\n

Note: This rule has no effect on partitioned assets.

\n
\n\n
\n
\nstatic skip_on_not_all_parents_updated(require_update_for_all_parent_partitions=False)[source]\u00b6
\n

Skip materializing an asset partition if any of its parents have not been updated since\nthe asset\u2019s last materialization.

\n
\n
\nrequire_update_for_all_parent_partitions\u00b6
\n

Applies only to an unpartitioned\nasset or an asset partition that depends on more than one partition in any upstream asset.\nIf true, requires all upstream partitions in each upstream asset to be materialized since\nthe downstream asset\u2019s last materialization in order to update it. If false, requires at\nleast one upstream partition in each upstream asset to be materialized since the downstream\nasset\u2019s last materialization in order to update it. Defaults to false.

\n
\n
Type:
\n

Optional[bool]

\n
\n
\n
\n\n
\n\n
\n
\nstatic skip_on_parent_missing()[source]\u00b6
\n

Skip materializing an asset partition if one of its parent asset partitions has never\nbeen materialized (for regular assets) or observed (for observable source assets).

\n
\n\n
\n
\nstatic skip_on_parent_outdated()[source]\u00b6
\n

Skip materializing an asset partition if any of its parents has not incorporated the\nlatest data from its ancestors.

\n
\n\n
\n\n
\n
\ndagster.load_assets_from_modules(modules, group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets and source assets from the given modules.

\n
\n
Parameters:
\n
    \n
  • modules (Iterable[ModuleType]) \u2013 The Python modules to look for assets inside.

  • \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets and source assets defined in the given modules.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset]]

\n
\n
\n
\n\n
\n
\ndagster.load_assets_from_current_module(group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets, source assets, and cacheable assets from the module where\nthis function is called.

\n
\n
Parameters:
\n
    \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets, source assets, and cacheable assets defined in the module.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]

\n
\n
\n
\n\n
\n
\ndagster.load_assets_from_package_module(package_module, group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets and source assets that includes all asset\ndefinitions, source assets, and cacheable assets in all sub-modules of the given package module.

\n

A package module is the result of importing a package.

\n
\n
Parameters:
\n
    \n
  • package_module (ModuleType) \u2013 The package module to look for assets inside.

  • \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets, source assets, and cacheable assets defined in the module.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]

\n
\n
\n
\n\n
\n
\ndagster.load_assets_from_package_name(package_name, group_name=None, key_prefix=None, *, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, source_key_prefix=None)[source]\u00b6
\n

Constructs a list of assets, source assets, and cacheable assets that includes all asset\ndefinitions and source assets in all sub-modules of the given package.

\n
\n
Parameters:
\n
    \n
  • package_name (str) \u2013 The name of a Python package to look for assets inside.

  • \n
  • group_name (Optional[str]) \u2013 Group name to apply to the loaded assets. The returned assets will be copies of the\nloaded objects, with the group name added.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of the loaded assets. The returned assets will be copies\nof the loaded objects, with the prefix prepended.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 FreshnessPolicy to apply to all the loaded\nassets.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 AutoMaterializePolicy to apply\nto all the loaded assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 BackfillPolicy to apply to all the loaded assets.

  • \n
  • source_key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 Prefix to prepend to the keys of loaded SourceAssets. The returned\nassets will be copies of the loaded objects, with the prefix prepended.

  • \n
\n
\n
Returns:
\n

A list containing assets, source assets, and cacheable assets defined in the module.

\n
\n
Return type:
\n

Sequence[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]

\n
\n
\n
\n\n
\n
\nclass dagster.AssetsDefinition(*, keys_by_input_name, keys_by_output_name, node_def, partitions_def=None, partition_mappings=None, asset_deps=None, selected_asset_keys=None, can_subset=False, resource_defs=None, group_names_by_key=None, metadata_by_key=None, freshness_policies_by_key=None, auto_materialize_policies_by_key=None, backfill_policy=None, descriptions_by_key=None, check_specs_by_output_name=None, selected_asset_check_keys=None)[source]\u00b6
\n

Defines a set of assets that are produced by the same op or graph.

\n

AssetsDefinitions are typically not instantiated directly, but rather produced using the\n@asset or @multi_asset decorators.

\n
\n
\nproperty asset_deps\u00b6
\n

Maps assets that are produced by this definition to assets that they depend on. The\ndependencies can be either \u201cinternal\u201d, meaning that they refer to other assets that are\nproduced by this definition, or \u201cexternal\u201d, meaning that they refer to assets that aren\u2019t\nproduced by this definition.

\n
\n\n
\n
\nproperty can_subset\u00b6
\n

If True, indicates that this AssetsDefinition may materialize any subset of its\nasset keys in a given computation (as opposed to being required to materialize all asset\nkeys).

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty check_specs\u00b6
\n

Returns the asset check specs defined on this AssetsDefinition, i.e. the checks that can\nbe executed while materializing the assets.

\n
\n
Return type:
\n

Iterable[AssetCheckSpec]

\n
\n
\n
\n\n
\n
\nproperty dependency_keys\u00b6
\n

The asset keys which are upstream of any asset included in this\nAssetsDefinition.

\n
\n
Type:
\n

Iterable[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty descriptions_by_key\u00b6
\n

Returns a mapping from the asset keys in this AssetsDefinition\nto the descriptions assigned to them. If there is no assigned description for a given AssetKey,\nit will not be present in this dictionary.

\n
\n
Type:
\n

Mapping[AssetKey, str]

\n
\n
\n
\n\n
\n
\nstatic from_graph(graph_def, *, keys_by_input_name=None, keys_by_output_name=None, key_prefix=None, internal_asset_deps=None, partitions_def=None, partition_mappings=None, resource_defs=None, group_name=None, group_names_by_output_name=None, descriptions_by_output_name=None, metadata_by_output_name=None, freshness_policies_by_output_name=None, auto_materialize_policies_by_output_name=None, backfill_policy=None, can_subset=False, check_specs=None)[source]\u00b6
\n

Constructs an AssetsDefinition from a GraphDefinition.

\n
\n
Parameters:
\n
    \n
  • graph_def (GraphDefinition) \u2013 The GraphDefinition that is an asset.

  • \n
  • keys_by_input_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the input\nnames of the decorated graph to their corresponding asset keys. If not provided,\nthe input asset keys will be created from the graph input names.

  • \n
  • keys_by_output_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the output\nnames of the decorated graph to their corresponding asset keys. If not provided,\nthe output asset keys will be created from the graph output names.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, key_prefix will be prepended\nto each key in keys_by_output_name. Each item in key_prefix must be a valid name in\ndagster (ie only contains letters, numbers, and _) and may not contain python\nreserved keywords.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by the graph depend on all assets that are consumed by that\ngraph. If this default is not correct, you pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\neither used as input to the asset or produced within the graph.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • partition_mappings (Optional[Mapping[str, PartitionMapping]]) \u2013 Defines how to map partition\nkeys for this asset to partition keys of upstream assets. Each key in the dictionary\ncorresponds to one of the input assets, and each value is a PartitionMapping.\nIf no entry is provided for a particular asset dependency, the partition mapping defaults\nto the default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A mapping of resource keys to resource definitions. These resources\nwill be initialized during execution, and can be accessed from the\nbody of ops in the graph during execution.

  • \n
  • group_name (Optional[str]) \u2013 A group name for the constructed asset. Assets without a\ngroup name are assigned to a group called \u201cdefault\u201d.

  • \n
  • group_names_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a group name to be\nassociated with some or all of the output assets for this node. Keys are names of the\noutputs, and values are the group name. Cannot be used with the group_name argument.

  • \n
  • descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a description to be\nassociated with each of the output assets for this graph.

  • \n
  • metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]) \u2013 Defines metadata to\nbe associated with each of the output assets for this node. Keys are names of the\noutputs, and values are dictionaries of metadata to be associated with the related\nasset.

  • \n
  • freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]) \u2013 Defines a\nFreshnessPolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the FreshnessPolicies to be attached\nto the associated asset.

  • \n
  • auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]) \u2013 Defines an\nAutoMaterializePolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\nto the associated asset.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 Defines this asset\u2019s BackfillPolicy

  • \n
\n
\n
\n
\n\n
\n
\nstatic from_op(op_def, *, keys_by_input_name=None, keys_by_output_name=None, key_prefix=None, internal_asset_deps=None, partitions_def=None, partition_mappings=None, group_name=None, group_names_by_output_name=None, descriptions_by_output_name=None, metadata_by_output_name=None, freshness_policies_by_output_name=None, auto_materialize_policies_by_output_name=None, backfill_policy=None, can_subset=False)[source]\u00b6
\n

Constructs an AssetsDefinition from an OpDefinition.

\n
\n
Parameters:
\n
    \n
  • op_def (OpDefinition) \u2013 The OpDefinition that is an asset.

  • \n
  • keys_by_input_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the input\nnames of the decorated op to their corresponding asset keys. If not provided,\nthe input asset keys will be created from the op input names.

  • \n
  • keys_by_output_name (Optional[Mapping[str, AssetKey]]) \u2013 A mapping of the output\nnames of the decorated op to their corresponding asset keys. If not provided,\nthe output asset keys will be created from the op output names.

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, key_prefix will be prepended\nto each key in keys_by_output_name. Each item in key_prefix must be a valid name in\ndagster (ie only contains letters, numbers, and _) and may not contain python\nreserved keywords.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by the op depend on all assets that are consumed by that\nop. If this default is not correct, you pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be\neither used as input to the asset or produced within the op.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • partition_mappings (Optional[Mapping[str, PartitionMapping]]) \u2013 Defines how to map partition\nkeys for this asset to partition keys of upstream assets. Each key in the dictionary\ncorresponds to one of the input assets, and each value is a PartitionMapping.\nIf no entry is provided for a particular asset dependency, the partition mapping defaults\nto the default partition mapping for the partitions definition, which typically maps\npartition keys to the same partition keys in upstream assets.

  • \n
  • group_name (Optional[str]) \u2013 A group name for the constructed asset. Assets without a\ngroup name are assigned to a group called \u201cdefault\u201d.

  • \n
  • group_names_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a group name to be\nassociated with some or all of the output assets for this node. Keys are names of the\noutputs, and values are the group name. Cannot be used with the group_name argument.

  • \n
  • descriptions_by_output_name (Optional[Mapping[str, Optional[str]]]) \u2013 Defines a description to be\nassociated with each of the output assets for this op.

  • \n
  • metadata_by_output_name (Optional[Mapping[str, Optional[MetadataUserInput]]]) \u2013 Defines metadata to\nbe associated with each of the output assets for this node. Keys are names of the\noutputs, and values are dictionaries of metadata to be associated with the related\nasset.

  • \n
  • freshness_policies_by_output_name (Optional[Mapping[str, Optional[FreshnessPolicy]]]) \u2013 Defines a\nFreshnessPolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the FreshnessPolicies to be attached\nto the associated asset.

  • \n
  • auto_materialize_policies_by_output_name (Optional[Mapping[str, Optional[AutoMaterializePolicy]]]) \u2013 Defines an\nAutoMaterializePolicy to be associated with some or all of the output assets for this node.\nKeys are the names of the outputs, and values are the AutoMaterializePolicies to be attached\nto the associated asset.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 Defines this asset\u2019s BackfillPolicy

  • \n
\n
\n
\n
\n\n
\n
\nget_partition_mapping(in_asset_key)[source]\u00b6
\n

Returns the partition mapping between keys in this AssetsDefinition and a given input\nasset key (if any).

\n
\n\n
\n
\nproperty group_names_by_key\u00b6
\n

Returns a mapping from the asset keys in this AssetsDefinition\nto the group names assigned to them. If there is no assigned group name for a given AssetKey,\nit will not be present in this dictionary.

\n
\n
Type:
\n

Mapping[AssetKey, str]

\n
\n
\n
\n\n
\n
\nproperty key\u00b6
\n

The asset key associated with this AssetsDefinition. If this AssetsDefinition\nhas more than one asset key, this will produce an error.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n
\nproperty keys\u00b6
\n

The asset keys associated with this AssetsDefinition.

\n
\n
Type:
\n

AbstractSet[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty node_def\u00b6
\n

Returns the OpDefinition or GraphDefinition that is used to materialize\nthe assets in this AssetsDefinition.

\n
\n
Type:
\n

NodeDefinition

\n
\n
\n
\n\n
\n
\nproperty op\u00b6
\n

Returns the OpDefinition that is used to materialize the assets in this\nAssetsDefinition.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\nproperty partitions_def\u00b6
\n

The PartitionsDefinition for this AssetsDefinition (if any).

\n
\n
Type:
\n

Optional[PartitionsDefinition]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

The set of keys for resources that must be provided to this AssetsDefinition.

\n
\n
Type:
\n

Set[str]

\n
\n
\n
\n\n
\n
\nproperty resource_defs\u00b6
\n

A mapping from resource name to ResourceDefinition for\nthe resources bound to this AssetsDefinition.

\n
\n
Type:
\n

Mapping[str, ResourceDefinition]

\n
\n
\n
\n\n
\n
\nto_source_asset(key=None)[source]\u00b6
\n

Returns a representation of this asset as a SourceAsset.

\n

If this is a multi-asset, the \u201ckey\u201d argument allows selecting which asset to return a\nSourceAsset representation of.

\n
\n
Parameters:
\n

key (Optional[Union[str, Sequence[str], AssetKey]]) \u2013 If this is a multi-asset, select\nwhich asset to return a SourceAsset representation of. If not a multi-asset, this\ncan be left as None.

\n
\n
Returns:
\n

SourceAsset

\n
\n
\n
\n\n
\n
\nto_source_assets()[source]\u00b6
\n

Returns a SourceAsset for each asset in this definition.

\n

Each produced SourceAsset will have the same key, metadata, io_manager_key, etc. as the\ncorresponding asset

\n
\n\n
\n\n
\n
\n@dagster.multi_asset(*, outs=None, name=None, ins=None, deps=None, description=None, config_schema=None, required_resource_keys=None, compute_kind=None, internal_asset_deps=None, partitions_def=None, backfill_policy=None, op_tags=None, can_subset=False, resource_defs=None, group_name=None, retry_policy=None, code_version=None, specs=None, check_specs=None, non_argument_deps=None)[source]\u00b6
\n

Create a combined definition of multiple assets that are computed using the same op and same\nupstream assets.

\n

Each argument to the decorated function references an upstream asset that this asset depends on.\nThe name of the argument designates the name of the upstream asset.

\n

You can set I/O managers keys, auto-materialize policies, freshness policies, group names, etc.\non an individual asset within the multi-asset by attaching them to the AssetOut\ncorresponding to that asset in the outs parameter.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the op.

  • \n
  • outs (Optional[Dict[str, AssetOut]]) \u2013 The AssetOuts representing the assets materialized by\nthis function. AssetOuts detail the output, IO management, and core asset properties.\nThis argument is required except when AssetSpecs are used.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]) \u2013 The assets that are upstream dependencies, but do not correspond to a parameter of the\ndecorated function. If the AssetsDefinition for a multi_asset is provided, dependencies on\nall assets created by the multi_asset will be created.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the asset\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the underlying op.

  • \n
  • compute_kind (Optional[str]) \u2013 A string to represent the kind of computation that produces\nthe asset, e.g. \u201cdbt\u201d or \u201cspark\u201d. It will be displayed in the Dagster UI as a badge on the asset.

  • \n
  • internal_asset_deps (Optional[Mapping[str, Set[AssetKey]]]) \u2013 By default, it is assumed\nthat all assets produced by a multi_asset depend on all assets that are consumed by that\nmulti asset. If this default is not correct, you pass in a map of output names to a\ncorrected set of AssetKeys that they depend on. Any AssetKeys in this list must be either\nused as input to the asset or produced within the op.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 The backfill policy for the op that computes the asset.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • can_subset (bool) \u2013 Whether this asset\u2019s computation can emit a subset of the asset\nkeys based on the context.selected_assets argument. Defaults to False.

  • \n
  • resource_defs (Optional[Mapping[str, object]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A mapping of resource keys to resources. These resources\nwill be initialized during execution, and can be accessed from the\ncontext within the body of the function.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. This\ngroup name will be applied to all assets produced by this multi_asset.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that computes the asset.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code encapsulated by the multi-asset. If set,\nthis is used as a default code version for all defined assets.

  • \n
  • specs (Optional[Sequence[AssetSpec]]) \u2013 (Experimental) The specifications for the assets materialized\nby this function.

  • \n
  • check_specs (Optional[Sequence[AssetCheckSpec]]) \u2013 (Experimental) Specs for asset checks that\nexecute in the decorated function after materializing the assets.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0.0. use deps instead.) Deprecated, use deps instead. Set of asset keys that are upstream\ndependencies, but do not pass an input to the multi_asset.

  • \n
\n
\n
\n

Examples

\n
# Use IO managers to handle I/O:\n@multi_asset(\n    outs={\n        "my_string_asset": AssetOut(),\n        "my_int_asset": AssetOut(),\n    }\n)\ndef my_function(upstream_asset: int):\n    result = upstream_asset + 1\n    return str(result), result\n\n# Handle I/O on your own:\n@multi_asset(\n    outs={\n        "asset1": AssetOut(),\n        "asset2": AssetOut(),\n    },\n    deps=["asset0"],\n)\ndef my_function():\n    asset0_value = load(path="asset0")\n    asset1_result, asset2_result = do_some_transformation(asset0_value)\n    write(asset1_result, path="asset1")\n    write(asset2_result, path="asset2")\n    return None, None\n
\n
\n
\n\n
\n
\n@dagster.graph_asset(compose_fn=None, *, name=None, description=None, ins=None, config=None, key_prefix=None, group_name=None, partitions_def=None, metadata=None, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None, resource_defs=None, check_specs=None, key=None)[source]\u00b6
\n

Creates a software-defined asset that\u2019s computed using a graph of ops.

\n

This decorator is meant to decorate a function that composes a set of ops or graphs to define\nthe dependencies between them.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the asset. If not provided, defaults to the name of the\ndecorated function. The asset\u2019s name must be a valid name in Dagster (ie only contains\nletters, numbers, and underscores) and may not contain Python reserved keywords.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the asset.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • config (Optional[Union[ConfigMapping, Mapping[str, Any]]]) \u2013

    Describes how the graph underlying the asset is configured at runtime.

    \n

    If a ConfigMapping object is provided, then the graph takes on the config\nschema of this object. The mapping will be applied at runtime to generate the config for\nthe graph\u2019s constituent nodes.

    \n

    If a dictionary is provided, then it will be used as the default run config for the\ngraph. This means it must conform to the config schema of the underlying nodes. Note\nthat the values provided will be viewable and editable in the Dagster UI, so be careful\nwith secrets.

    \n

    If no value is provided, then the config schema for the graph is the default (derived\nfrom the underlying nodes).

    \n

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name, which defaults to the name of\nthe decorated function. Each item in key_prefix must be a valid name in Dagster (ie only\ncontains letters, numbers, and underscores) and may not contain Python reserved keywords.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. If\nnot provided, the name \u201cdefault\u201d is used.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • metadata (Optional[MetadataUserInput]) \u2013 Dictionary of metadata to be associated with\nthe asset.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 A constraint telling Dagster how often this asset is\nintended to be updated with respect to its root data.

  • \n
  • auto_materialize_policy (Optional[AutoMaterializePolicy]) \u2013 The AutoMaterializePolicy to use\nfor this asset.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 The BackfillPolicy to use for this asset.

  • \n
  • key (Optional[CoercibleToAssetKey]) \u2013 The key for this asset. If provided, cannot specify key_prefix or name.

  • \n
\n
\n
\n

Examples

\n
@op\ndef fetch_files_from_slack(context) -> pd.DataFrame:\n    ...\n\n@op\ndef store_files_in_table(files) -> None:\n    files.to_sql(name="slack_files", con=create_db_connection())\n\n@graph_asset\ndef slack_files_table():\n    return store_files_in_table(fetch_files_from_slack())\n
\n
\n
\n\n
\n
\n@dagster.graph_multi_asset(*, outs, name=None, ins=None, partitions_def=None, backfill_policy=None, group_name=None, can_subset=False, resource_defs=None, check_specs=None)[source]\u00b6
\n

Create a combined definition of multiple assets that are computed using the same graph of\nops, and the same upstream assets.

\n

Each argument to the decorated function references an upstream asset that this asset depends on.\nThe name of the argument designates the name of the upstream asset.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the graph.

  • \n
  • outs (Optional[Dict[str, AssetOut]]) \u2013 The AssetOuts representing the produced assets.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the assets.

  • \n
  • backfill_policy (Optional[BackfillPolicy]) \u2013 The backfill policy for the asset.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. This\ngroup name will be applied to all assets produced by this multi_asset.

  • \n
  • can_subset (bool) \u2013 Whether this asset\u2019s computation can emit a subset of the asset\nkeys based on the context.selected_assets argument. Defaults to False.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.AssetOut(key_prefix=None, key=None, dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, group_name=None, code_version=None, freshness_policy=None, auto_materialize_policy=None, backfill_policy=None)[source]\u00b6
\n

Defines one of the assets produced by a @multi_asset.

\n
\n
\nkey_prefix\u00b6
\n

If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name. When using @multi_asset, the\nasset name defaults to the key of the \u201couts\u201d dictionary. Only one of the \u201ckey_prefix\u201d and\n\u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str]]]

\n
\n
\n
\n\n
\n
\nkey\u00b6
\n

The asset\u2019s key. Only one of the\n\u201ckey_prefix\u201d and \u201ckey\u201d arguments should be provided.

\n
\n
Type:
\n

Optional[Union[str, Sequence[str], AssetKey]]

\n
\n
\n
\n\n
\n
\ndagster_type\u00b6
\n

The type of this output. Should only be set if the correct type can not\nbe inferred directly from the type signature of the decorated function.

\n
\n
Type:
\n

Optional[Union[Type, DagsterType]]

\n
\n
\n
\n\n
\n
\ndescription\u00b6
\n

Human-readable description of the output.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nis_required\u00b6
\n

Whether the presence of this field is required. (default: True)

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nio_manager_key\u00b6
\n

The resource key of the IO manager used for this output.\n(default: \u201cio_manager\u201d).

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nmetadata\u00b6
\n

A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\ngroup_name\u00b6
\n

A string name used to organize multiple assets into groups. If\nnot provided, the name \u201cdefault\u201d is used.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ncode_version\u00b6
\n

The version of the code that generates this asset.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nfreshness_policy\u00b6
\n

A policy which indicates how up to date this\nasset is intended to be.

\n
\n
Type:
\n

Optional[FreshnessPolicy]

\n
\n
\n
\n\n
\n
\nauto_materialize_policy\u00b6
\n

AutoMaterializePolicy to apply to\nthe specified asset.

\n
\n
Type:
\n

Optional[AutoMaterializePolicy]

\n
\n
\n
\n\n
\n
\nbackfill_policy\u00b6
\n

BackfillPolicy to apply to the specified asset.

\n
\n
Type:
\n

Optional[BackfillPolicy]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetValueLoader(assets_defs_by_key, source_assets_by_key, instance=None)[source]\u00b6
\n

Caches resource definitions that are used to load asset values across multiple load\ninvocations.

\n

Should not be instantiated directly. Instead, use\nget_asset_value_loader().

\n
\n
\nload_asset_value(asset_key, *, python_type=None, partition_key=None, metadata=None, resource_config=None)[source]\u00b6
\n

Loads the contents of an asset as a Python object.

\n

Invokes load_input on the IOManager associated with the asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[AssetKey, Sequence[str], str]) \u2013 The key of the asset to load.

  • \n
  • python_type (Optional[Type]) \u2013 The python type to load the asset as. This is what will\nbe returned inside load_input by context.dagster_type.typing_type.

  • \n
  • partition_key (Optional[str]) \u2013 The partition of the asset to load.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 Input metadata to pass to the IOManager\n(is equivalent to setting the metadata argument in In or AssetIn).

  • \n
  • resource_config (Optional[Any]) \u2013 A dictionary of resource configurations to be passed\nto the IOManager.

  • \n
\n
\n
Returns:
\n

The contents of an asset as a Python object.

\n
\n
\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/assets", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../asset-checks/", "title": "Asset Checks (Experimental)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../../../", "title": "Home"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/asset-checks", "Asset Checks (Experimental)", "N", "next"], ["index", "Home", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/assets.rst.txt", "title": "Software-Defined Assets", "toc": "\n"}, "cli": {"alabaster_version": "0.7.13", "body": "
\n

Dagster CLI\u00b6

\n
\n

dagster asset\u00b6

\n

Commands for working with Dagster assets.

\n
dagster asset [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nlist
\n

List assets

\n
\n\n
\n
\nmaterialize
\n

Execute a run to materialize a selection\u2026

\n
\n\n
\n
\nwipe
\n

Eliminate asset key indexes from event logs.

\n
\n\n
\n
\nwipe-partitions-status-cache
\n

Clears the asset partitions status cache,\u2026

\n
\n\n
\n
\n

dagster debug\u00b6

\n

Commands for helping debug Dagster issues by dumping or loading artifacts from specific runs.

\n

This can be used to send a file to someone like the Dagster team who doesn\u2019t have direct access\nto your instance to allow them to view the events and details of a specific run.

\n

Debug files can be viewed using dagster-webserver-debug cli.\nDebug files can also be downloaded from the Dagster UI.

\n
dagster debug [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nexport
\n

Export the relevant artifacts for a job\u2026

\n
\n\n
\n
\nimport
\n

Import the relevant artifacts from debug\u2026

\n
\n\n
\n
\n

dagster dev\u00b6

\n

Start a local deployment of Dagster, including dagster-webserver running on localhost and the dagster-daemon running in the background

\n
dagster dev [OPTIONS]\n
\n
\n

Options

\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--code-server-log-level <code_server_log_level>\u00b6
\n

Set the log level for code servers spun up by dagster services.

\n
\n
Default:
\n

warning

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Set the log level for dagster services.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n-p, --port, --dagit-port <port>\u00b6
\n

Port to use for the Dagster webserver.

\n
\n\n
\n
\n-h, --host, --dagit-host <host>\u00b6
\n

Host to use for the Dagster webserver.

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\n

dagster instance\u00b6

\n

Commands for working with the current Dagster instance.

\n
dagster instance [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nconcurrency
\n

Commands for working with the\u2026

\n
\n\n
\n
\ninfo
\n

List the information about the current\u2026

\n
\n\n
\n
\nmigrate
\n

Automatically migrate an out of date\u2026

\n
\n\n
\n
\nreindex
\n

Rebuild index over historical runs for\u2026

\n
\n\n
\n
\n

dagster job\u00b6

\n

Commands for working with Dagster jobs.

\n
dagster job [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nbackfill
\n

Backfill a partitioned job.

\n
\n\n
\n
\nexecute
\n

Execute a job.

\n
\n\n
\n
\nlaunch
\n

Launch a job using the run launcher\u2026

\n
\n\n
\n
\nlist
\n

List the jobs in a repository.

\n
\n\n
\n
\nlist_versions
\n

Display the freshness of memoized results\u2026

\n
\n\n
\n
\nprint
\n

Print a job.

\n
\n\n
\n
\nscaffold_config
\n

Scaffold the config for a job.

\n
\n\n
\n
\n

dagster run\u00b6

\n

Commands for working with Dagster job runs.

\n
dagster run [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndelete
\n

Delete a run by id and its associated\u2026

\n
\n\n
\n
\nlist
\n

List the runs in the current Dagster\u2026

\n
\n\n
\n
\nmigrate-repository
\n

Migrate the run history for a job from a\u2026

\n
\n\n
\n
\nwipe
\n

Eliminate all run history and event logs.

\n
\n\n
\n
\n

dagster schedule\u00b6

\n

Commands for working with Dagster schedules.

\n
dagster schedule [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ndebug
\n

Debug information about the scheduler.

\n
\n\n
\n
\nlist
\n

List all schedules that correspond to a\u2026

\n
\n\n
\n
\nlogs
\n

Get logs for a schedule.

\n
\n\n
\n
\npreview
\n

Preview changes that will be performed by\u2026

\n
\n\n
\n
\nrestart
\n

Restart a running schedule.

\n
\n\n
\n
\nstart
\n

Start an existing schedule.

\n
\n\n
\n
\nstop
\n

Stop an existing schedule.

\n
\n\n
\n
\nwipe
\n

Delete the schedule history and turn off\u2026

\n
\n\n
\n
\n

dagster sensor\u00b6

\n

Commands for working with Dagster sensors.

\n
dagster sensor [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\ncursor
\n

Set the cursor value for an existing sensor.

\n
\n\n
\n
\nlist
\n

List all sensors that correspond to a\u2026

\n
\n\n
\n
\npreview
\n

Preview an existing sensor execution.

\n
\n\n
\n
\nstart
\n

Start an existing sensor.

\n
\n\n
\n
\nstop
\n

Stop an existing sensor.

\n
\n\n
\n
\n

dagster project\u00b6

\n

Commands for bootstrapping new Dagster projects and code locations.

\n
dagster project [OPTIONS] COMMAND [ARGS]...\n
\n
\n

Commands

\n
\n
\nfrom-example
\n

Download one of the official Dagster examples to the current directory. This CLI enables you to quickly bootstrap your project with an officially maintained example.

\n
\n\n
\n
\nlist-examples
\n

List the examples that are available to bootstrap with.

\n
\n\n
\n
\nscaffold
\n

Create a folder structure with a single Dagster code location and other files such as pyproject.toml. This CLI enables you to quickly start building a new Dagster project with everything set up.

\n
\n\n
\n
\nscaffold-code-location
\n

Create a folder structure with a single Dagster code location, in the current directory. This CLI helps you to scaffold a new Dagster code location within a folder structure that includes multiple Dagster code locations.

\n
\n\n
\n
\nscaffold-repository
\n

(DEPRECATED; Use dagster project scaffold-code-location instead) Create a folder structure with a single Dagster repository, in the current directory. This CLI helps you to scaffold a new Dagster repository within a folder structure that includes multiple Dagster repositories

\n
\n\n
\n
\n

dagster-graphql\u00b6

\n

Run a GraphQL query against the dagster interface to a specified repository or pipeline/job.

\n

Can only use ONE of --workspace/-w, --python-file/-f, --module-name/-m, --grpc-port, --grpc-socket.

\n

Examples:

\n
    \n
  1. dagster-graphql

  2. \n
  3. dagster-graphql -w path/to/workspace.yaml

  4. \n
  5. dagster-graphql -f path/to/file.py -a define_repo

  6. \n
  7. dagster-graphql -m some_module -a define_repo

  8. \n
  9. dagster-graphql -f path/to/file.py -a define_pipeline

  10. \n
  11. dagster-graphql -m some_module -a define_pipeline

  12. \n
\n
dagster-graphql [OPTIONS]\n
\n
\n

Options

\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n
\n
\n-t, --text <text>\u00b6
\n

GraphQL document to execute passed as a string

\n
\n\n
\n
\n-f, --file <file>\u00b6
\n

GraphQL document to execute passed as a file

\n
\n\n
\n
\n-p, --predefined <predefined>\u00b6
\n

GraphQL document to execute, from a predefined set provided by dagster-graphql.

\n
\n
Options:
\n

launchPipelineExecution

\n
\n
\n
\n\n
\n
\n-v, --variables <variables>\u00b6
\n

A JSON encoded string containing the variables for GraphQL execution.

\n
\n\n
\n
\n-r, --remote <remote>\u00b6
\n

A URL for a remote instance running dagster-webserver to send the GraphQL request to.

\n
\n\n
\n
\n-o, --output <output>\u00b6
\n

A file path to store the GraphQL response to. This flag is useful when making pipeline/job execution queries, since pipeline/job execution causes logs to print to stdout and stderr.

\n
\n\n
\n
\n--ephemeral-instance\u00b6
\n

Use an ephemeral DagsterInstance instead of resolving via DAGSTER_HOME

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\n

dagster-webserver\u00b6

\n

Run dagster-webserver. Loads a code location.

\n

Can only use ONE of --workspace/-w, --python-file/-f, --module-name/-m, --grpc-port, --grpc-socket.

\n

Examples:

\n
    \n
  1. dagster-webserver (works if ./workspace.yaml exists)

  2. \n
  3. dagster-webserver -w path/to/workspace.yaml

  4. \n
  5. dagster-webserver -f path/to/file.py

  6. \n
  7. dagster-webserver -f path/to/file.py -d path/to/working_directory

  8. \n
  9. dagster-webserver -m some_module

  10. \n
  11. dagster-webserver -f path/to/file.py -a define_repo

  12. \n
  13. dagster-webserver -m some_module -a define_repo

  14. \n
  15. dagster-webserver -p 3333

  16. \n
\n

Options can also provide arguments via environment variables prefixed with DAGSTER_WEBSERVER.

\n

For example, DAGSTER_WEBSERVER_PORT=3333 dagster-webserver

\n
dagster-webserver [OPTIONS]\n
\n
\n

Options

\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Host to run server on

\n
\n
Default:
\n

127.0.0.1

\n
\n
\n
\n\n
\n
\n-p, --port <port>\u00b6
\n

Port to run server on - defaults to 3000

\n
\n\n
\n
\n-l, --path-prefix <path_prefix>\u00b6
\n

The path prefix where server will be hosted (eg: /dagster-webserver)

\n
\n
Default:
\n

\n
\n
\n\n
\n
\n--db-statement-timeout <db_statement_timeout>\u00b6
\n

The timeout in milliseconds to set on database statements sent to the DagsterInstance. Not respected in all configurations.

\n
\n
Default:
\n

15000

\n
\n
\n
\n\n
\n
\n--db-pool-recycle <db_pool_recycle>\u00b6
\n

The maximum age of a connection to use from the sqlalchemy pool without connection recycling. Set to -1 to disable. Not respected in all configurations.

\n
\n
Default:
\n

3600

\n
\n
\n
\n\n
\n
\n--read-only\u00b6
\n

Start server in read-only mode, where all mutations such as launching runs and turning schedules on/off are turned off.

\n
\n\n
\n
\n--suppress-warnings\u00b6
\n

Filter all warnings when hosting server.

\n
\n\n
\n
\n--uvicorn-log-level, --log-level <uvicorn_log_level>\u00b6
\n

Set the log level for the uvicorn web server.

\n
\n
Default:
\n

warning

\n
\n
Options:
\n

critical | error | warning | info | debug | trace

\n
\n
\n
\n\n
\n
\n--dagster-log-level <dagster_log_level>\u00b6
\n

Set the log level for dagster log events.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--code-server-log-level <code_server_log_level>\u00b6
\n

Set the log level for any code servers spun up by the webserver.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--version\u00b6
\n

Show the version and exit.

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_WEBSERVER_LOG_LEVEL
\n
\n

Provide a default for --dagster-log-level

\n
\n
\n\n
\n
\n

dagster-daemon run\u00b6

\n

Run any daemons configured on the DagsterInstance.

\n
dagster-daemon run [OPTIONS]\n
\n
\n

Options

\n
\n
\n--code-server-log-level <code_server_log_level>\u00b6
\n

Set the log level for any code servers spun up by the daemon.

\n
\n
Default:
\n

warning

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Set the log level for the dagster-daemon process.

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--use-ssl\u00b6
\n

Use a secure channel when connecting to the gRPC server

\n
\n\n
\n
\n--grpc-host <grpc_host>\u00b6
\n

Host to use to connect to gRPC server, defaults to localhost

\n
\n\n
\n
\n--grpc-socket <grpc_socket>\u00b6
\n

Named socket to use to connect to gRPC server

\n
\n\n
\n
\n--grpc-port <grpc_port>\u00b6
\n

Port to use to connect to gRPC server

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module or modules (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file or files (flag can be used multiple times) where dagster definitions reside as top-level symbols/variables and load each file as a code location in the current python environment.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n-w, --workspace <workspace>\u00b6
\n

Path to workspace file. Argument can be provided multiple times.

\n
\n\n
\n
\n--empty-workspace\u00b6
\n

Allow an empty workspace

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_DAEMON_LOG_LEVEL
\n
\n

Provide a default for --log-level

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\n

dagster-daemon wipe\u00b6

\n

Wipe all heartbeats from storage.

\n
dagster-daemon wipe [OPTIONS]\n
\n
\n
\n
\n

dagster-daemon debug heartbeat-dump\u00b6

\n

Log all heartbeat statuses

\n
dagster-daemon debug heartbeat-dump [OPTIONS]\n
\n
\n
\n
\n

dagster api grpc\u00b6

\n

Serve the Dagster inter-process API over GRPC

\n
dagster api grpc [OPTIONS]\n
\n
\n

Options

\n
\n
\n-p, --port <port>\u00b6
\n

Port over which to serve. You must pass one and only one of --port/-p or --socket/-s.

\n
\n\n
\n
\n-s, --socket <socket>\u00b6
\n

Serve over a UDS socket. You must pass one and only one of --port/-p or --socket/-s.

\n
\n\n
\n
\n-h, --host <host>\u00b6
\n

Hostname at which to serve. Default is localhost.

\n
\n\n
\n
\n-n, --max-workers, --max_workers <max_workers>\u00b6
\n

Maximum number of (threaded) workers to use in the GRPC server

\n
\n\n
\n
\n--heartbeat\u00b6
\n

If set, the GRPC server will shut itself down when it fails to receive a heartbeat after a timeout configurable with --heartbeat-timeout.

\n
\n\n
\n
\n--heartbeat-timeout <heartbeat_timeout>\u00b6
\n

Timeout after which to shut down if --heartbeat is set and a heartbeat is not received

\n
\n\n
\n
\n--lazy-load-user-code\u00b6
\n

Wait until the first LoadRepositories call to actually load the repositories, instead of waiting to load them when the server is launched. Useful for surfacing errors when the server is managed directly from the Dagster UI.

\n
\n\n
\n
\n-a, --attribute <attribute>\u00b6
\n

Attribute that is either a 1) repository or job or 2) a function that returns a repository or job

\n
\n\n
\n
\n--package-name <package_name>\u00b6
\n

Specify Python package where repository or job function lives

\n
\n\n
\n
\n-m, --module-name <module_name>\u00b6
\n

Specify module where dagster definitions reside as top-level symbols/variables and load the module as a code location in the current python environment.

\n
\n\n
\n
\n-f, --python-file <python_file>\u00b6
\n

Specify python file where dagster definitions reside as top-level symbols/variables and load the file as a code location in the current python environment.

\n
\n\n
\n
\n-d, --working-directory <working_directory>\u00b6
\n

Specify working directory to use when loading the repository or job

\n
\n\n
\n
\n--use-python-environment-entry-point\u00b6
\n

If this flag is set, the server will signal to clients that they should launch dagster commands using <this server\u2019s python executable> -m dagster, instead of the default dagster entry point. This is useful when there are multiple Python environments running in the same machine, so a single dagster entry point is not enough to uniquely determine the environment.

\n
\n\n
\n
\n--empty-working-directory\u00b6
\n

Indicates that the working directory should be empty and should not be set to the current directory as a default

\n
\n\n
\n
\n--fixed-server-id <fixed_server_id>\u00b6
\n

[INTERNAL] This option should generally not be used by users. Internal param used by dagster to spawn a gRPC server with the specified server id.

\n
\n\n
\n
\n--log-level <log_level>\u00b6
\n

Level at which to log output from the code server process

\n
\n
Default:
\n

info

\n
\n
Options:
\n

critical | error | warning | info | debug

\n
\n
\n
\n\n
\n
\n--container-image <container_image>\u00b6
\n

Container image to use to run code from this server.

\n
\n\n
\n
\n--container-context <container_context>\u00b6
\n

Serialized JSON with configuration for any containers created to run the code from this server.

\n
\n\n
\n
\n--inject-env-vars-from-instance\u00b6
\n

Whether to load env vars from the instance and inject them into the environment.

\n
\n\n
\n
\n--location-name <location_name>\u00b6
\n

Name of the code location this server corresponds to.

\n
\n\n
\n
\n--instance-ref <instance_ref>\u00b6
\n

[INTERNAL] Serialized InstanceRef to use for accessing the instance

\n
\n\n

Environment variables

\n
\n
\nDAGSTER_GRPC_PORT
\n
\n

Provide a default for --port

\n
\n
\n\n
\n
\nDAGSTER_GRPC_SOCKET
\n
\n

Provide a default for --socket

\n
\n
\n\n
\n
\nDAGSTER_GRPC_HOST
\n
\n

Provide a default for --host

\n
\n
\n\n
\n
\nDAGSTER_LAZY_LOAD_USER_CODE
\n
\n

Provide a default for --lazy-load-user-code

\n
\n
\n\n
\n
\nDAGSTER_ATTRIBUTE
\n
\n

Provide a default for --attribute

\n
\n
\n\n
\n
\nDAGSTER_PACKAGE_NAME
\n
\n

Provide a default for --package-name

\n
\n
\n\n
\n
\nDAGSTER_MODULE_NAME
\n
\n

Provide a default for --module-name

\n
\n
\n\n
\n
\nDAGSTER_PYTHON_FILE
\n
\n

Provide a default for --python-file

\n
\n
\n\n
\n
\nDAGSTER_WORKING_DIRECTORY
\n
\n

Provide a default for --working-directory

\n
\n
\n\n
\n
\nDAGSTER_USE_PYTHON_ENVIRONMENT_ENTRY_POINT
\n
\n

Provide a default for --use-python-environment-entry-point

\n
\n
\n\n
\n
\nDAGSTER_EMPTY_WORKING_DIRECTORY
\n
\n

Provide a default for --empty-working-directory

\n
\n
\n\n
\n
\nDAGSTER_CONTAINER_IMAGE
\n
\n

Provide a default for --container-image

\n
\n
\n\n
\n
\nDAGSTER_CONTAINER_CONTEXT
\n
\n

Provide a default for --container-context

\n
\n
\n\n
\n
\nDAGSTER_INJECT_ENV_VARS_FROM_INSTANCE
\n
\n

Provide a default for --inject-env-vars-from-instance

\n
\n
\n\n
\n
\nDAGSTER_LOCATION_NAME
\n
\n

Provide a default for --location-name

\n
\n
\n\n
\n
\nDAGSTER_INSTANCE_REF
\n
\n

Provide a default for --instance-ref

\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/cli", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../config/", "title": "Config"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../asset-checks/", "title": "Asset Checks (Experimental)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/config", "Config", "N", "next"], ["sections/api/apidocs/asset-checks", "Asset Checks (Experimental)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/cli.rst.txt", "title": "Dagster CLI", "toc": "\n"}, "config": {"alabaster_version": "0.7.13", "body": "
\n

Config\u00b6

\n
\n

Pythonic config system\u00b6

\n

The following classes are used as part of the new Pythonic config system. They are used in conjunction with builtin types.

\n
\n
\nclass dagster.Config[source]\u00b6
\n

Base class for Dagster configuration models, used to specify config schema for\nops and assets. Subclasses pydantic.BaseModel.

\n

Example definition:

\n
from pydantic import Field\n\nclass MyAssetConfig(Config):\n    my_str: str = "my_default_string"\n    my_int_list: List[int]\n    my_bool_with_metadata: bool = Field(default=False, description="A bool field")\n
\n
\n

Example usage:

\n
@asset\ndef asset_with_config(config: MyAssetConfig):\n    assert config.my_str == "my_default_string"\n    assert config.my_int_list == [1, 2, 3]\n    assert config.my_bool_with_metadata == False\n\nasset_with_config(MyAssetConfig(my_int_list=[1, 2, 3], my_bool_with_metadata=True))\n
\n
\n
\n\n
\n
\nclass dagster.PermissiveConfig(**config_dict)[source]\u00b6
\n

Subclass of Config that allows arbitrary extra fields. This is useful for\nconfig classes which may have open-ended inputs.

\n

Example definition:

\n
class MyPermissiveOpConfig(PermissiveConfig):\n    my_explicit_parameter: bool\n    my_other_explicit_parameter: str\n
\n
\n

Example usage:

\n
@op\ndef op_with_config(config: MyPermissiveOpConfig):\n    assert config.my_explicit_parameter == True\n    assert config.my_other_explicit_parameter == "foo"\n    assert config.dict().get("my_implicit_parameter") == "bar"\n\nop_with_config(\n    MyPermissiveOpConfig(\n        my_explicit_parameter=True,\n        my_other_explicit_parameter="foo",\n        my_implicit_parameter="bar"\n    )\n)\n
\n
\n
\n\n
\n
\nclass dagster.RunConfig(ops=None, resources=None, loggers=None, execution=None)[source]\u00b6
\n

Container for all the configuration that can be passed to a run. Accepts Pythonic definitions\nfor op and asset config and resources and converts them under the hood to the appropriate config dictionaries.

\n

Example usage:

\n
class MyAssetConfig(Config):\n    a_str: str\n\n@asset\ndef my_asset(config: MyAssetConfig):\n    assert config.a_str == "foo"\n\nmaterialize(\n    [my_asset],\n    run_config=RunConfig(\n        ops={"my_asset": MyAssetConfig(a_str="foo")}\n    )\n)\n
\n
\n
\n\n
\n
\n

Legacy Dagster config types\u00b6

\n

The following types are used as part of the legacy Dagster config system. They are used in conjunction with builtin types.

\n
\n
\nclass dagster.ConfigSchema[source]\u00b6
\n

Placeholder type for config schemas.

\n

Any time that it appears in documentation, it means that any of the following types are\nacceptable:

\n
    \n
  1. A Python scalar type that resolves to a Dagster config type\n(python:int, python:float, python:bool,\nor python:str). For example:

    \n
      \n
    • @op(config_schema=int)

    • \n
    • @op(config_schema=str)

    • \n
    \n
  2. \n
  3. A built-in python collection (python:list, or python:dict).\npython:list is exactly equivalent to Array [\nAny ] and python:dict is equivalent to\nPermissive. For example:

    \n
      \n
    • @op(config_schema=list)

    • \n
    • @op(config_schema=dict)

    • \n
    \n
  4. \n
  5. A Dagster config type:

    \n\n
  6. \n
  7. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules. For example:

    \n
      \n
    • {'some_config': str} is equivalent to Shape({'some_config': str}).

    • \n
    • \n
      {'some_config1': {'some_config2': str}} is equivalent to

      Shape({'some_config1': Shape({'some_config2': str})}).

      \n
      \n
      \n
    • \n
    \n
  8. \n
  9. A bare python list of length one, whose single element will be wrapped in an\nArray and resolved recursively according to the same\nrules. For example:

    \n
      \n
    • [str] is equivalent to Array[str].

    • \n
    • [[str]] is equivalent to Array[Array[str]].

    • \n
    • [{'some_config': str}] is equivalent to Array(Shape({'some_config': str})).

    • \n
    \n
  10. \n
  11. An instance of Field.

  12. \n
\n
\n\n
\n
\nclass dagster.Field(config, default_value=<class 'dagster._config.field_utils.__FieldValueSentinel'>, is_required=None, description=None)[source]\u00b6
\n

Defines the schema for a configuration field.

\n

Fields are used in config schema instead of bare types when one wants to add a description,\na default value, or to mark it as not required.

\n

Config fields are parsed according to their schemas in order to yield values available at\njob execution time through the config system. Config fields can be set on ops, on\nloaders for custom types, and on other pluggable components of the system, such as resources, loggers,\nand executors.

\n
\n
Parameters:
\n
    \n
  • config (Any) \u2013

    The schema for the config. This value can be any of:

    \n
      \n
    1. A Python primitive type that resolves to a Dagster config type\n(python:int, python:float, python:bool,\npython:str, or python:list).

    2. \n
    3. A Dagster config type:

      \n\n
    4. \n
    5. A bare python dictionary, which will be automatically wrapped in\nShape. Values of the dictionary are resolved recursively\naccording to the same rules.

    6. \n
    7. A bare python list of length one which itself is config type.\nBecomes Array with list element as an argument.

    8. \n
    \n

  • \n
  • default_value (Any) \u2013

    A default value for this field, conformant to the schema set by the dagster_type\nargument. If a default value is provided, is_required should be False.

    \n

    Note: for config types that do post processing such as Enum, this value must be\nthe pre processed version, ie use ExampleEnum.VALUE.name instead of\nExampleEnum.VALUE

    \n

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. Defaults to true. If is_required\nis True, no default value should be provided.

  • \n
  • description (str) \u2013 A human-readable description of this config field.

  • \n
\n
\n
\n

Examples

\n
@op(\n    config_schema={\n        'word': Field(str, description='I am a word.'),\n        'repeats': Field(Int, default_value=1, is_required=False),\n    }\n)\ndef repeat_word(context):\n    return context.op_config['word'] * context.op_config['repeats']\n
\n
\n
\n
\nproperty default_provided\u00b6
\n

Was a default value provided.

\n
\n
Returns:
\n

Yes or no

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty default_value\u00b6
\n

The default value for the field.

\n

Raises an exception if no default value was provided.

\n
\n\n
\n
\nproperty description\u00b6
\n

A human-readable description of this config field, if provided.

\n
\n\n
\n
\nproperty is_required\u00b6
\n

Whether a value for this field must be provided at runtime.

\n

Cannot be True if a default value is provided.

\n
\n\n
\n\n
\n
\nclass dagster.Selector(fields, description=None)[source]\u00b6
\n

Define a config field requiring the user to select one option.

\n

Selectors are used when you want to be able to present several different options in config but\nallow only one to be selected. For example, a single input might be read in from either a csv\nfile or a parquet file, but not both at once.

\n

Note that in some other type systems this might be called an \u2018input union\u2019.

\n

Functionally, a selector is like a Dict, except that only one key from the dict can\nbe specified in valid config.

\n
\n
Parameters:
\n

fields (Dict[str, Field]) \u2013 The fields from which the user must select.

\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Selector(\n            {\n                'haw': {'whom': Field(String, default_value='honua', is_required=False)},\n                'cn': {'whom': Field(String, default_value='\u4e16\u754c', is_required=False)},\n                'en': {'whom': Field(String, default_value='world', is_required=False)},\n            }\n        ),\n        is_required=False,\n        default_value={'en': {'whom': 'world'}},\n    )\n)\ndef hello_world_with_default(context):\n    if 'haw' in context.op_config:\n        return 'Aloha {whom}!'.format(whom=context.op_config['haw']['whom'])\n    if 'cn' in context.op_config:\n        return '\u4f60\u597d, {whom}!'.format(whom=context.op_config['cn']['whom'])\n    if 'en' in context.op_config:\n        return 'Hello, {whom}!'.format(whom=context.op_config['en']['whom'])\n
\n
\n
\n\n
\n
\nclass dagster.Permissive(fields=None, description=None)[source]\u00b6
\n

Defines a config dict with a partially specified schema.

\n

A permissive dict allows partial specification of the config schema. Any fields with a\nspecified schema will be type checked. Other fields will be allowed, but will be ignored by\nthe type checker.

\n
\n
Parameters:
\n

fields (Dict[str, Field]) \u2013 The partial specification of the config dict.

\n
\n
\n

Examples:

\n
@op(config_schema=Field(Permissive({'required': Field(String)})))\ndef map_config_op(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n\n
\n
\nclass dagster.Shape(fields, description=None, field_aliases=None)[source]\u00b6
\n

Schema for configuration data with string keys and typed values via Field.

\n

Unlike Permissive, unspecified fields are not allowed and will throw a\nDagsterInvalidConfigError.

\n
\n
Parameters:
\n
    \n
  • fields (Dict[str, Field]) \u2013 The specification of the config dict.

  • \n
  • field_aliases (Dict[str, str]) \u2013 Maps a string key to an alias that can be used instead of the original key. For example,\nan entry {\u201cfoo\u201d: \u201cbar\u201d} means that someone could use \u201cbar\u201d instead of \u201cfoo\u201d as a\ntop level string key.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Map(key_type, inner_type, key_label_name=None)[source]\u00b6
\n

Defines a config dict with arbitrary scalar keys and typed values.

\n

A map can contain arbitrary keys of the specified scalar type, each of which has\ntype checked values. Unlike Shape and Permissive, scalar\nkeys other than strings can be used, and unlike Permissive, all\nvalues are type checked.

\n
\n
Parameters:
\n
    \n
  • key_type (type) \u2013 The type of keys this map can contain. Must be a scalar type.

  • \n
  • inner_type (type) \u2013 The type of the values that this map type can contain.

  • \n
  • key_label_name (string) \u2013 Optional name which describes the role of keys in the map.

  • \n
\n
\n
\n

Examples:

\n
@op(config_schema=Field(Map({str: int})))\ndef partially_specified_config(context) -> List:\n    return sorted(list(context.op_config.items()))\n
\n
\n
\n
\nproperty key_label_name\u00b6
\n

Name which describes the role of keys in the map, if provided.

\n
\n\n
\n\n
\n
\nclass dagster.Array(inner_type)[source]\u00b6
\n

Defines an array (list) configuration type that contains values of type inner_type.

\n
\n
Parameters:
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n
\n
\nproperty description\u00b6
\n

A human-readable description of this Array type.

\n
\n\n
\n\n
\n
\nclass dagster.Noneable(inner_type)[source]\u00b6
\n

Defines a configuration type that is the union of NoneType and the type inner_type.

\n
\n
Parameters:
\n

inner_type (type) \u2013 The type of the values that this configuration type can contain.

\n
\n
\n

Examples:

\n
config_schema={"name": Noneable(str)}\n\nconfig={"name": "Hello"}  # Ok\nconfig={"name": None}     # Ok\nconfig={}                 # Error\n
\n
\n
\n\n
\n
\nclass dagster.Enum(name, enum_values)[source]\u00b6
\n

Defines an enum configuration type that allows one of a defined set of possible values.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the enum configuration type.

  • \n
  • enum_values (List[EnumValue]) \u2013 The set of possible values for the enum configuration type.

  • \n
\n
\n
\n

Examples:

\n
@op(\n    config_schema=Field(\n        Enum(\n            'CowboyType',\n            [\n                EnumValue('good'),\n                EnumValue('bad'),\n                EnumValue('ugly'),\n            ]\n        )\n    )\n)\ndef resolve_standoff(context):\n    # ...\n
\n
\n
\n\n
\n
\nclass dagster.EnumValue(config_value, python_value=None, description=None)[source]\u00b6
\n

Define an entry in an Enum.

\n
\n
Parameters:
\n
    \n
  • config_value (str) \u2013 The string representation of the config to accept when passed.

  • \n
  • python_value (Optional[Any]) \u2013 The python value to convert the enum entry in to. Defaults to the config_value.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the enum entry.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ScalarUnion(scalar_type, non_scalar_schema, _key=None)[source]\u00b6
\n

Defines a configuration type that accepts a scalar value OR a non-scalar value like a\nList, Dict, or Selector.

\n

This allows runtime scalars to be configured without a dictionary with the key value and\ninstead just use the scalar value directly. However this still leaves the option to\nload scalars from a json or pickle file.

\n
\n
Parameters:
\n
    \n
  • scalar_type (type) \u2013 The scalar type of values that this configuration type can hold. For example,\npython:int, python:float, python:bool,\nor python:str.

  • \n
  • non_scalar_schema (ConfigSchema) \u2013 The schema of a non-scalar Dagster configuration type. For example, List,\nDict, or Selector.

  • \n
  • key (Optional[str]) \u2013 The configuration type\u2019s unique key. If not set, then the key will be set to\nScalarUnion.{scalar_type}-{non_scalar_schema}.

  • \n
\n
\n
\n

Examples:

\n
graph:\n  transform_word:\n    inputs:\n      word:\n        value: foobar\n
\n
\n

becomes, optionally,

\n
graph:\n  transform_word:\n    inputs:\n      word: foobar\n
\n
\n
\n\n
\n
\ndagster.StringSource\u00b6
\n

Use this type when you want to read a string config value from an environment variable. The value\npassed to a config field of this type may either be a string literal, or a selector describing\nhow to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, StringSource\n\n@op(config_schema=StringSource)\ndef secret_op(context) -> str:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': 'test_value'}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.IntSource\u00b6
\n

Use this type when you want to read an integer config value from an environment variable. The\nvalue passed to a config field of this type may either be an integer literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables.

\n

Examples:

\n
from dagster import job, op, IntSource\n\n@op(config_schema=IntSource)\ndef secret_int_op(context) -> int:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_int_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': 1234}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_int_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_INT'}}}\n    }\n)\n
\n
\n
\n\n
\n
\ndagster.BoolSource\u00b6
\n

Use this type when you want to read a boolean config value from an environment variable. The\nvalue passed to a config field of this type may either be a boolean literal, or a selector\ndescribing how to look up the value from the executing process\u2019s environment variables. Set the\nvalue of the corresponding environment variable to "" to indicate False.

\n

Examples:

\n
from dagster import job, op, BoolSource\n\n@op(config_schema=BoolSource)\ndef secret_bool_op(context) -> bool:\n    return context.op_config\n\n@job\ndef secret_job():\n    secret_bool_op()\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': False}}\n    }\n)\n\nsecret_job.execute_in_process(\n    run_config={\n        'ops': {'secret_bool_op': {'config': {'env': 'VERY_SECRET_ENV_VARIABLE_BOOL'}}}\n    }\n)\n
\n
\n
\n\n
\n
\n

Config Utilities\u00b6

\n
\n
\nclass dagster.ConfigMapping(config_fn, config_schema=None, receive_processed_config_values=None)[source]\u00b6
\n

Defines a config mapping for a graph (or job).

\n

By specifying a config mapping function, you can override the configuration for the child\nops and graphs contained within a graph.

\n

Config mappings require the configuration schema to be specified as config_schema, which will\nbe exposed as the configuration schema for the graph, as well as a configuration mapping\nfunction, config_fn, which maps the config provided to the graph to the config\nthat will be provided to the child nodes.

\n
\n
Parameters:
\n
    \n
  • config_fn (Callable[[dict], dict]) \u2013 The function that will be called\nto map the graph config to a config appropriate for the child nodes.

  • \n
  • config_schema (ConfigSchema) \u2013 The schema of the graph config.

  • \n
  • receive_processed_config_values (Optional[bool]) \u2013 If true, config values provided to the config_fn\nwill be converted to their dagster types before being passed in. For example, if this\nvalue is true, enum config passed to config_fn will be actual enums, while if false,\nthen enum config passed to config_fn will be strings.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster.configured(configurable, config_schema=None, **kwargs)[source]\u00b6
\n

A decorator that makes it easy to create a function-configured version of an object.

\n

The following definition types can be configured using this function:

\n\n

Using configured may result in config values being displayed in the Dagster UI,\nso it is not recommended to use this API with sensitive values, such as\nsecrets.

\n

If the config that will be supplied to the object is constant, you may alternatively invoke this\nand call the result with a dict of config values to be curried. Examples of both strategies\nbelow.

\n
\n
Parameters:
\n
    \n
  • configurable (ConfigurableDefinition) \u2013 An object that can be configured.

  • \n
  • config_schema (ConfigSchema) \u2013 The config schema that the inputs to the decorated function\nmust satisfy. Alternatively, annotate the config parameter to the decorated function\nwith a subclass of Config and omit this argument.

  • \n
  • **kwargs \u2013 Arbitrary keyword arguments that will be passed to the initializer of the returned\nobject.

  • \n
\n
\n
Returns:
\n

(Callable[[Union[Any, Callable[[Any], Any]]], ConfigurableDefinition])

\n
\n
\n

Examples:

\n
class GreetingConfig(Config):\n    message: str\n\n@op\ndef greeting_op(config: GreetingConfig):\n    print(config.message)\n\nclass HelloConfig(Config):\n    name: str\n\n@configured(greeting_op)\ndef hello_op(config: HelloConfig):\n    return GreetingConfig(message=f"Hello, {config.name}!")\n
\n
\n
dev_s3 = configured(S3Resource, name="dev_s3")({'bucket': 'dev'})\n\n@configured(S3Resource)\ndef dev_s3(_):\n    return {'bucket': 'dev'}\n\n@configured(S3Resource, {'bucket_prefix': str})\ndef dev_s3(config):\n    return {'bucket': config['bucket_prefix'] + 'dev'}\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/config", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../errors/", "title": "Errors"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../cli/", "title": "Dagster CLI"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/errors", "Errors", "N", "next"], ["sections/api/apidocs/cli", "Dagster CLI", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/config.rst.txt", "title": "Config", "toc": "\n"}, "definitions": {"alabaster_version": "0.7.13", "body": "
\n

Definitions\u00b6

\n
\n
\nclass dagster.Definitions(assets=None, schedules=None, sensors=None, jobs=None, resources=None, executor=None, loggers=None, asset_checks=None)[source]\u00b6
\n

A set of definitions explicitly available and loadable by Dagster tools.

\n
\n
Parameters:
\n
    \n
  • assets (Optional[Iterable[Union[AssetsDefinition, SourceAsset, CacheableAssetsDefinition]]]) \u2013 A list of assets. Assets can be created by annotating\na function with @asset or\n@observable_source_asset.\nOr they can be created by directly instantiating AssetsDefinition,\nSourceAsset, or CacheableAssetsDefinition.

  • \n
  • asset_checks (Optional[Iterable[AssetChecksDefinition]]) \u2013 A list of asset checks.

  • \n
  • schedules (Optional[Iterable[Union[ScheduleDefinition, UnresolvedPartitionedAssetScheduleDefinition]]]) \u2013 List of schedules.

  • \n
  • sensors (Optional[Iterable[SensorDefinition]]) \u2013 List of sensors, typically created with @sensor.

  • \n
  • jobs (Optional[Iterable[Union[JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 List of jobs. Typically created with define_asset_job\nor with @job for jobs defined in terms of ops directly.\nJobs created with @job must already have resources bound\nat job creation time. They do not respect the resources argument here.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 Dictionary of resources to bind to assets.\nThe resources dictionary takes raw Python objects,\nnot just instances of ResourceDefinition. If that raw object inherits from\nIOManager, it gets coerced to an IOManagerDefinition.\nAny other object is coerced to a ResourceDefinition.\nThese resources will be automatically bound\nto any assets passed to this Definitions instance using\nwith_resources. Assets passed to Definitions with\nresources already bound using with_resources will\noverride this dictionary.

  • \n
  • executor (Optional[Union[ExecutorDefinition, Executor]]) \u2013 Default executor for jobs. Individual jobs can override this and define their own executors\nby setting the executor on @job or define_asset_job\nexplicitly. This executor will also be used for materializing assets directly\noutside of the context of jobs. If an Executor is passed, it is coerced into\nan ExecutorDefinition.

  • \n
  • loggers (Optional[Mapping[str, LoggerDefinition]]) \u2013 Default loggers for jobs. Individual jobs\ncan define their own loggers by setting them explicitly.

  • \n
\n
\n
\n

Example usage:

\n
defs = Definitions(\n    assets=[asset_one, asset_two],\n    schedules=[a_schedule],\n    sensors=[a_sensor],\n    jobs=[a_job],\n    resources={\n        "a_resource": some_resource,\n    },\n    asset_checks=[asset_one_check_one]\n)\n
\n
\n

Dagster separates user-defined code from system tools such as the web server and\nthe daemon. Rather than loading code directly into process, a tool such as the\nwebserver interacts with user-defined code over a serialization boundary.

\n

These tools must be able to locate and load this code when they start. Via CLI\narguments or config, they specify a Python module to inspect.

\n

A Python module is loadable by Dagster tools if there is a top-level variable\nthat is an instance of Definitions.

\n

Before the introduction of Definitions,\n@repository was the API for organizing definitions.\nDefinitions provides a few conveniences for dealing with resources\nthat do not apply to old-style @repository declarations:

\n\n
\n
\nget_asset_value_loader(instance=None)[source]\u00b6
\n

Returns an object that can load the contents of assets as Python objects.

\n

Invokes load_input on the IOManager associated with the assets. Avoids\nspinning up resources separately for each asset.

\n

Usage:

\n
with defs.get_asset_value_loader() as loader:\n    asset1 = loader.load_asset_value("asset1")\n    asset2 = loader.load_asset_value("asset2")\n
\n
\n
\n\n
\n
\nget_job_def(name)[source]\u00b6
\n

Get a job definition by name. If you passed in an UnresolvedAssetJobDefinition\n(return value of define_asset_job()) it will be resolved to a JobDefinition when returned\nfrom this function.

\n
\n\n
\n
\nget_schedule_def(name)[source]\u00b6
\n

Get a schedule definition by name.

\n
\n\n
\n
\nget_sensor_def(name)[source]\u00b6
\n

Get a sensor definition by name.

\n
\n\n
\n
\nload_asset_value(asset_key, *, python_type=None, instance=None, partition_key=None, metadata=None)[source]\u00b6
\n

Load the contents of an asset as a Python object.

\n

Invokes load_input on the IOManager associated with the asset.

\n

If you want to load the values of multiple assets, it\u2019s more efficient to use\nget_asset_value_loader(), which avoids spinning up\nresources separately for each asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[AssetKey, Sequence[str], str]) \u2013 The key of the asset to load.

  • \n
  • python_type (Optional[Type]) \u2013 The python type to load the asset as. This is what will\nbe returned inside load_input by context.dagster_type.typing_type.

  • \n
  • partition_key (Optional[str]) \u2013 The partition of the asset to load.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 Input metadata to pass to the IOManager\n(is equivalent to setting the metadata argument in In or AssetIn).

  • \n
\n
\n
Returns:
\n

The contents of an asset as a Python object.

\n
\n
\n
\n\n
\n\n
\n
\ndagster.create_repository_using_definitions_args(name, assets=None, schedules=None, sensors=None, jobs=None, resources=None, executor=None, loggers=None, asset_checks=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a named repository using the same arguments as Definitions. In older\nversions of Dagster, repositories were the mechanism for organizing assets, schedules, sensors,\nand jobs. There could be many repositories per code location. This was a complicated ontology but\ngave users a way to organize code locations that contained large numbers of heterogenous definitions.

\n

As a stopgap for those who want to both 1) use the new Definitions API and 2) still\nhave multiple logical groups of assets in the same code location, we have introduced this function.

\n

Example usage:

\n
named_repo = create_repository_using_definitions_args(\n    name="a_repo",\n    assets=[asset_one, asset_two],\n    schedules=[a_schedule],\n    sensors=[a_sensor],\n    jobs=[a_job],\n    resources={\n        "a_resource": some_resource,\n    }\n)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/definitions", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../repositories/", "title": "Repositories"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../partitions/", "title": "Partitions Definitions"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/repositories", "Repositories", "N", "next"], ["sections/api/apidocs/partitions", "Partitions Definitions", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/definitions.rst.txt", "title": "Definitions", "toc": "\n"}, "dynamic": {"alabaster_version": "0.7.13", "body": "
\n

Dynamic Mapping & Collect\u00b6

\n

These APIs provide the means for a simple kind of dynamic orchestration \u2014 where the work to be orchestrated is determined not at job definition time but at runtime, dependent on data that\u2019s observed as part of job execution.

\n
\n
\nclass dagster.DynamicOut(dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, code_version=None)[source]\u00b6
\n

Variant of Out for an output that will dynamically alter the graph at\nruntime.

\n

When using in a composition function such as @graph,\ndynamic outputs must be used with either

\n
    \n
  • map - clone downstream ops for each separate DynamicOut

  • \n
  • collect - gather across all DynamicOut in to a list

  • \n
\n

Uses the same constructor as Out

\n
\n
@op(\n    config_schema={\n        "path": Field(str, default_value=file_relative_path(__file__, "sample"))\n    },\n    out=DynamicOut(str),\n)\ndef files_in_directory(context):\n    path = context.op_config["path"]\n    dirname, _, filenames = next(os.walk(path))\n    for file in filenames:\n        yield DynamicOutput(os.path.join(dirname, file), mapping_key=_clean(file))\n\n@job\ndef process_directory():\n    files = files_in_directory()\n\n    # use map to invoke an op on each dynamic output\n    file_results = files.map(process_file)\n\n    # use collect to gather the results in to a list\n    summarize_directory(file_results.collect())\n
\n
\n
\n
\n\n
\n
\nclass dagster.DynamicOutput(value, mapping_key, output_name='result', metadata=None)[source]\u00b6
\n

Variant of Output used to support\ndynamic mapping & collect. Each DynamicOutput produced by an op represents\none item in a set that can be processed individually with map or gathered\nwith collect.

\n

Each DynamicOutput must have a unique mapping_key to distinguish it within its set.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • mapping_key (str) \u2013 The key that uniquely identifies this dynamic value relative to its peers.\nThis key will be used to identify the downstream ops when mapped, ie\nmapped_op[example_mapping_key]

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding DynamicOut defined on the op.\n(default: \u201cresult\u201d)

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n
\nproperty mapping_key\u00b6
\n

The mapping_key that was set for this DynamicOutput at instantiation.

\n
\n\n
\n
\nproperty output_name\u00b6
\n

Name of the DynamicOut defined on the op that this DynamicOutput is associated with.

\n
\n\n
\n
\nproperty value\u00b6
\n

The value that is returned by the compute function for this DynamicOutput.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/dynamic", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../types/", "title": "Types"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../schedules-sensors/", "title": "Run Requests"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/types", "Types", "N", "next"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/dynamic.rst.txt", "title": "Dynamic Mapping & Collect", "toc": "\n"}, "errors": {"alabaster_version": "0.7.13", "body": "
\n

Errors\u00b6

\n

Core Dagster error classes.

\n

All errors thrown by the Dagster framework inherit from DagsterError. Users\nshould not subclass this base class for their own exceptions.

\n

There is another exception base class, DagsterUserCodeExecutionError, which is\nused by the framework in concert with the user_code_error_boundary().

\n

Dagster uses this construct to wrap user code into which it calls. User code can perform arbitrary\ncomputations and may itself throw exceptions. The error boundary catches these user code-generated\nexceptions, and then reraises them wrapped in a subclass of\nDagsterUserCodeExecutionError.

\n

The wrapped exceptions include additional context for the original exceptions, injected by the\nDagster runtime.

\n
\n
\nexception dagster.DagsterError[source]\u00b6
\n

Base class for all errors thrown by the Dagster framework.

\n

Users should not subclass this base class for their own exceptions.

\n
\n
\nproperty is_user_code_error\u00b6
\n

Returns true if this error is attributable to user code.

\n
\n\n
\n\n
\n
\nexception dagster.DagsterConfigMappingFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates that an unexpected error occurred while executing the body of a config mapping\nfunction defined in a JobDefinition or ~dagster.GraphDefinition during\nconfig parsing.

\n
\n\n
\n
\nexception dagster.DagsterEventLogInvalidForRun(run_id)[source]\u00b6
\n

Raised when the event logs for a historical run are malformed or invalid.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepExecutionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of an execution step.

\n
\n\n
\n
\nexception dagster.DagsterExecutionStepNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when the user specifies execution step keys that do not exist.

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigError(preamble, errors, config_value, *args, **kwargs)[source]\u00b6
\n

Thrown when provided config is invalid (does not type check against the relevant config\nschema).

\n
\n\n
\n
\nexception dagster.DagsterInvalidConfigDefinitionError(original_root, current_value, stack, reason=None, **kwargs)[source]\u00b6
\n

Indicates that you have attempted to construct a config with an invalid value.

\n
\n
Acceptable values for config types are any of:
    \n
  1. \n
    A Python primitive type that resolves to a Dagster config type

    (python:int, python:float, python:bool,\npython:str, or python:list).

    \n
    \n
    \n
  2. \n
  3. \n
    A Dagster config type: Int, Float,

    Bool, String,\nStringSource, Any,\nArray, Noneable, Enum,\nSelector, Shape, or\nPermissive.

    \n
    \n
    \n
  4. \n
  5. \n
    A bare python dictionary, which will be automatically wrapped in

    Shape. Values of the dictionary are resolved recursively\naccording to the same rules.

    \n
    \n
    \n
  6. \n
  7. \n
    A bare python list of length one which itself is config type.

    Becomes Array with list element as an argument.

    \n
    \n
    \n
  8. \n
  9. An instance of Field.

  10. \n
\n
\n
\n
\n\n
\n
\nexception dagster.DagsterInvalidDefinitionError[source]\u00b6
\n

Indicates that the rules for a definition have been violated by the user.

\n
\n\n
\n
\nexception dagster.DagsterInvalidSubsetError[source]\u00b6
\n

Indicates that a subset of a job is invalid because either:\n- One or more ops in the specified subset do not exist on the job.\n- The subset produces an invalid job.

\n
\n\n
\n
\nexception dagster.DagsterInvariantViolationError[source]\u00b6
\n

Indicates the user has violated a well-defined invariant that can only be enforced\nat runtime.

\n
\n\n
\n
\nexception dagster.DagsterResourceFunctionError(*args, **kwargs)[source]\u00b6
\n

Indicates an error occurred while executing the body of the resource_fn in a\nResourceDefinition during resource initialization.

\n
\n\n
\n
\nexception dagster.DagsterRunNotFoundError(*args, **kwargs)[source]\u00b6
\n

Thrown when a run cannot be found in run storage.

\n
\n\n
\n
\nexception dagster.DagsterStepOutputNotFoundError(*args, **kwargs)[source]\u00b6
\n

Indicates that previous step outputs required for an execution step to proceed are not\navailable.

\n
\n\n
\n
\nexception dagster.DagsterSubprocessError(*args, **kwargs)[source]\u00b6
\n

An exception has occurred in one or more of the child processes dagster manages.\nThis error forwards the message and stack trace for all of the collected errors.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckDidNotPass(description=None, metadata=None, dagster_type=None)[source]\u00b6
\n

Indicates that a type check failed.

\n

This is raised when raise_on_error is True in calls to the synchronous job and\ngraph execution APIs (e.g. graph.execute_in_process(), job.execute_in_process() \u2013 typically\nwithin a test), and a DagsterType\u2019s type check fails by returning either\nFalse or an instance of TypeCheck whose success member is False.

\n
\n\n
\n
\nexception dagster.DagsterTypeCheckError(*args, **kwargs)[source]\u00b6
\n

Indicates an error in the op type system at runtime. E.g. an op receives an\nunexpected input, or produces an output that does not match the type of the output definition.

\n
\n\n
\n
\nexception dagster.DagsterUnknownResourceError(resource_name, *args, **kwargs)[source]\u00b6
\n

Indicates that an unknown resource was accessed in the body of an execution step. May often\nhappen by accessing a resource in the compute function of an op without first supplying the\nop with the correct required_resource_keys argument.

\n
\n\n
\n
\nexception dagster.DagsterUnmetExecutorRequirementsError[source]\u00b6
\n

Indicates the resolved executor is incompatible with the state of other systems\nsuch as the DagsterInstance or system storage configuration.

\n
\n\n
\n
\nexception dagster.DagsterUserCodeExecutionError(*args, **kwargs)[source]\u00b6
\n

This is the base class for any exception that is meant to wrap an\npython:Exception thrown by user code. It wraps that existing user code.\nThe original_exc_info argument to the constructor is meant to be a tuple of the type\nreturned by sys.exc_info at the call site of the constructor.

\n

Users should not subclass this base class for their own exceptions and should instead throw\nfreely from user code. User exceptions will be automatically wrapped and rethrown.

\n
\n
\nproperty is_user_code_error\u00b6
\n

Returns true if this error is attributable to user code.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/errors", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../execution/", "title": "Execution"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../config/", "title": "Config"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/execution", "Execution", "N", "next"], ["sections/api/apidocs/config", "Config", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/errors.rst.txt", "title": "Errors", "toc": "\n"}, "execution": {"alabaster_version": "0.7.13", "body": "
\n

Execution\u00b6

\n
\n

Materializing Assets\u00b6

\n
\n
\ndagster.materialize(assets, run_config=None, instance=None, resources=None, partition_key=None, raise_on_error=True, tags=None, selection=None)[source]\u00b6
\n

Executes a single-threaded, in-process run which materializes provided assets.

\n

By default, will materialize assets to the local filesystem.

\n
\n
Parameters:
\n
    \n
  • assets (Sequence[Union[AssetsDefinition, SourceAsset]]) \u2013

    The assets to materialize.

    \n

    Unless you\u2019re using deps or non_argument_deps, you must also include all assets that are\nupstream of the assets that you want to materialize. This is because those upstream\nasset definitions have information that is needed to load their contents while\nmaterializing the downstream assets.

    \n

    You can use the selection argument to distinguish between assets that you want to\nmaterialize and assets that are just present for loading.

    \n

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 The resources needed for execution. Can provide resource instances\ndirectly, or resource definitions. Note that if provided resources\nconflict with resources directly on assets, an error will be thrown.

  • \n
  • run_config (Optional[Any]) \u2013 The run config to use for the run that materializes the assets.

  • \n
  • partition_key \u2013 (Optional[str])\nThe string partition key that specifies the run config to execute. Can only be used\nto select run config for assets with partitioned config.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 Tags for the run.

  • \n
  • selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]) \u2013

    A sub-selection of assets to materialize.

    \n

    If not provided, then all assets will be materialized.

    \n

    If providing a string or sequence of strings,\nhttps://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\nsyntax.

    \n

  • \n
\n
\n
Returns:
\n

The result of the execution.

\n
\n
Return type:
\n

ExecuteInProcessResult

\n
\n
\n

Examples

\n
@asset\ndef asset1():\n    ...\n\n@asset\ndef asset2(asset1):\n    ...\n\n# executes a run that materializes asset1 and then asset2\nmaterialize([asset1, asset2])\n\n# executes a run that materializes just asset2, loading its input from asset1\nmaterialize([asset1, asset2], selection=[asset2])\n
\n
\n
\n\n
\n
\ndagster.materialize_to_memory(assets, run_config=None, instance=None, resources=None, partition_key=None, raise_on_error=True, tags=None, selection=None)[source]\u00b6
\n

Executes a single-threaded, in-process run which materializes provided assets in memory.

\n

Will explicitly use mem_io_manager() for all required io manager\nkeys. If any io managers are directly provided using the resources\nargument, a DagsterInvariantViolationError will be thrown.

\n
\n
Parameters:
\n
    \n
  • assets (Sequence[Union[AssetsDefinition, SourceAsset]]) \u2013 The assets to materialize. Can also provide SourceAsset objects to fill dependencies for asset defs.

  • \n
  • run_config (Optional[Any]) \u2013 The run config to use for the run that materializes the assets.

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 The resources needed for execution. Can provide resource instances\ndirectly, or resource definitions. If provided resources\nconflict with resources directly on assets, an error will be thrown.

  • \n
  • partition_key \u2013 (Optional[str])\nThe string partition key that specifies the run config to execute. Can only be used\nto select run config for assets with partitioned config.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 Tags for the run.

  • \n
  • selection (Optional[Union[str, Sequence[str], Sequence[AssetKey], Sequence[Union[AssetsDefinition, SourceAsset]], AssetSelection]]) \u2013

    A sub-selection of assets to materialize.

    \n

    If not provided, then all assets will be materialized.

    \n

    If providing a string or sequence of strings,\nhttps://docs.dagster.io/concepts/assets/asset-selection-syntax describes the accepted\nsyntax.

    \n

  • \n
\n
\n
Returns:
\n

The result of the execution.

\n
\n
Return type:
\n

ExecuteInProcessResult

\n
\n
\n

Examples

\n
@asset\ndef asset1():\n    ...\n\n@asset\ndef asset2(asset1):\n    ...\n\n# executes a run that materializes asset1 and then asset2\nmaterialize_to_memory([asset1, asset2])\n\n# executes a run that materializes just asset1\nmaterialize_to_memory([asset1, asset2], selection=[asset1])\n
\n
\n
\n\n
\n
\n

Executing Jobs\u00b6

\n
\n
\nclass dagster.JobDefinition(*, graph_def, resource_defs=None, executor_def=None, logger_defs=None, name=None, config=None, description=None, partitions_def=None, tags=None, metadata=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _subset_selection_data=None, asset_layer=None, input_values=None, _was_explicitly_provided_resources=None)[source]
\n

Defines a Dagster job.

\n
\n
\nproperty config_mapping
\n

The config mapping for the job, if it has one.

\n

A config mapping defines a way to map a top-level config schema to run config for the job.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, asset_selection=None, run_id=None, input_values=None, tags=None, resources=None)[source]
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
\n
Parameters:
\n
    \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 The configuration for the run

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key \u2013 (Optional[str])\nThe string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[Sequence[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the job. Input values provided here will override input values that have been provided to the job directly.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty executor_def
\n

Returns the default ExecutorDefinition for the job.

\n

If the user has not specified an executor definition, then this will default to the multi_or_in_process_executor(). If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty has_specified_executor
\n

Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty has_specified_loggers
\n

Returns true if the job explicitly set loggers, and False if loggers were inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty loggers
\n

Returns the set of LoggerDefinition objects specified on the job.

\n

If the user has not specified a mapping of LoggerDefinition objects, then this will default to the colored_console_logger() under the key console. If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty partitioned_config
\n

The partitioned config for the job, if it has one.

\n

A partitioned config defines a way to map partition keys to run config for the job.

\n
\n\n
\n
\nproperty partitions_def
\n

Returns the PartitionsDefinition for the job, if it has one.

\n

A partitions definition defines the set of partition keys the job operates on.

\n
\n\n
\n
\nproperty resource_defs
\n

Returns the set of ResourceDefinition objects specified on the job.

\n

This may not be the complete set of resources required by the job, since those can also be provided on the Definitions object the job may be provided to.

\n
\n\n
\n
\nrun_request_for_partition(partition_key, run_key=None, tags=None, asset_selection=None, run_config=None, current_time=None, dynamic_partitions_store=None)[source]
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0.0. Directly instantiate RunRequest(partition_key=...) instead.\n \n

\n

Creates a RunRequest object for a run that processes the given partition.

\n
\n
Parameters:
\n
    \n
  • partition_key \u2013 The key of the partition to request a run for.

  • \n
  • run_key (Optional[str]) \u2013 A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the launched run.

  • \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Configuration for the run. If the job has\na PartitionedConfig, this value will replace the config\nprovided by it.

  • \n
  • current_time (Optional[datetime]) \u2013 Used to determine which time-partitions exist.\nDefaults to now.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

an object that requests a run to process the given partition.

\n
\n
Return type:
\n

RunRequest

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]
\n

Apply a set of hooks to all op instances within the job.

\n
\n\n
\n
\nwith_top_level_resources(resource_defs)[source]
\n

Apply a set of resources to all op instances within the job.

\n
\n\n
\n\n
\n
\ndagster.execute_job(job, instance, run_config=None, tags=None, raise_on_error=False, op_selection=None, reexecution_options=None, asset_selection=None)[source]\u00b6
\n

Execute a job synchronously.

\n

This API represents dagster\u2019s python entrypoint for out-of-process\nexecution. For most testing purposes, \nexecute_in_process() will be more suitable, but when wanting to run\nexecution using an out-of-process executor (such as dagster.\nmultiprocess_executor), then execute_job is suitable.

\n

execute_job expects a persistent DagsterInstance for\nexecution, meaning the $DAGSTER_HOME environment variable must be set.\nIt also expects a reconstructable pointer to a JobDefinition so\nthat it can be reconstructed in separate processes. This can be done by\nwrapping the JobDefinition in a call to dagster.\nreconstructable().

\n
from dagster import DagsterInstance, execute_job, job, reconstructable\n\n@job\ndef the_job():\n    ...\n\ninstance = DagsterInstance.get()\nresult = execute_job(reconstructable(the_job), instance=instance)\nassert result.success\n
\n
\n

If using the to_job() method to\nconstruct the JobDefinition, then the invocation must be wrapped in a\nmodule-scope function, which can be passed to reconstructable.

\n
from dagster import graph, reconstructable\n\n@graph\ndef the_graph():\n    ...\n\ndef define_job():\n    return the_graph.to_job(...)\n\nresult = execute_job(reconstructable(define_job), ...)\n
\n
\n

Since execute_job is potentially executing outside of the current\nprocess, output objects need to be retrieved by use of the provided job\u2019s\nio managers. Output objects can be retrieved by opening the result of\nexecute_job as a context manager.

\n
from dagster import execute_job\n\nwith execute_job(...) as result:\n    output_obj = result.output_for_node("some_op")\n
\n
\n

execute_job can also be used to reexecute a run, by providing a ReexecutionOptions object.

\n
from dagster import DagsterInstance, ReexecutionOptions, execute_job, reconstructable\n\ninstance = DagsterInstance.get()\n\noptions = ReexecutionOptions.from_failure(run_id=failed_run_id, instance=instance)\nexecute_job(reconstructable(job), instance, reexecution_options=options)\n
\n
\n
\n
Parameters:
\n
    \n
  • job (ReconstructableJob) \u2013 A reconstructable pointer to a JobDefinition.

  • \n
  • instance (DagsterInstance) \u2013 The instance to execute against.

  • \n
  • run_config (Optional[dict]) \u2013 The configuration that parametrizes this run, as a dict.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary key-value pairs that will be added to run logs.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to False.

  • \n
  • op_selection (Optional[List[str]]) \u2013

    A list of op selection queries (including single\nop names) to execute. For example:

    \n
      \n
    • ['some_op']: selects some_op itself.

    • \n
    • ['*some_op']: select some_op and all its ancestors (upstream dependencies).

    • \n
    • ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

    • \n
    \n

  • \n
  • reexecution_options (Optional[ReexecutionOptions]) \u2013 Reexecution options to provide to the run, if this run is\nintended to be a reexecution of a previous run. Cannot be used in\ntandem with the op_selection argument.

  • \n
\n
\n
Returns:
\n

The result of job execution.

\n
\n
Return type:
\n

JobExecutionResult

\n
\n
\n
\n\n
\n
\nclass dagster.ReexecutionOptions(parent_run_id, step_selection=[])[source]\u00b6
\n

Reexecution options for python-based execution in Dagster.

\n
\n
Parameters:
\n
    \n
  • parent_run_id (str) \u2013 The run_id of the run to reexecute.

  • \n
  • step_selection (Sequence[str]) \u2013

    The list of step selections to reexecute. Must be a subset or match of the\nset of steps executed in the original run. For example:

    \n
      \n
    • ['some_op']: selects some_op itself.

    • \n
    • ['*some_op']: select some_op and all its ancestors (upstream dependencies).

    • \n
    • ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.

    • \n
    • ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

    • \n
    \n

  • \n
\n
\n
\n
\n\n
\n
\ndagster.instance_for_test(overrides=None, set_dagster_home=True, temp_dir=None)[source]\u00b6
\n

Creates a persistent DagsterInstance available within a context manager.

\n

When a context manager is opened, if no temp_dir parameter is set, a new\ntemporary directory will be created for the duration of the context\nmanager\u2019s opening. If the set_dagster_home parameter is set to True\n(True by default), the $DAGSTER_HOME environment variable will be\noverridden to be this directory (or the directory passed in by temp_dir)\nfor the duration of the context manager being open.

\n
\n
Parameters:
\n
    \n
  • overrides (Optional[Mapping[str, Any]]) \u2013 Config to provide to instance (config format follows that typically found in an instance.yaml file).

  • \n
  • set_dagster_home (Optional[bool]) \u2013 If set to True, the $DAGSTER_HOME environment variable will be\noverridden to be the directory used by this instance for the\nduration that the context manager is open. Upon the context\nmanager closing, the $DAGSTER_HOME variable will be re-set to the original value. (Defaults to True).

  • \n
  • temp_dir (Optional[str]) \u2013 The directory to use for storing local artifacts produced by the\ninstance. If not set, a temporary directory will be created for\nthe duration of the context manager being open, and all artifacts\nwill be torn down afterward.

  • \n
\n
\n
\n
\n\n
\n
\n

Executing Graphs\u00b6

\n
\n
\nclass dagster.GraphDefinition(name, *, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, node_input_source_assets=None, **kwargs)[source]
\n

Defines a Dagster op graph.

\n

An op graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • node_defs (Optional[Sequence[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[Sequence[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[Sequence[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nalias(name)[source]
\n

Aliases the graph with a new name.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.alias("my_graph_alias")\n
\n
\n
\n
\n
\n\n
\n
\nproperty config_mapping
\n

The config mapping for the graph, if present.

\n

By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None, input_values=None)[source]
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters:
\n
    \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the graph.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty input_mappings
\n

Input mappings for the graph.

\n

An input mapping is a mapping from an input of the graph to an input of a child node.

\n
\n\n
\n
\nproperty name
\n

The name of the graph.

\n
\n\n
\n
\nproperty output_mappings
\n

Output mappings for the graph.

\n

An output mapping is a mapping from an output of the graph to an output of a child node.

\n
\n\n
\n
\ntag(tags)[source]
\n

Attaches the provided tags to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.tag({"my_tag": "my_value"})\n
\n
\n
\n
\n
\n\n
\n
\nproperty tags
\n

The tags associated with the graph.

\n
\n\n
\n
\nto_job(name=None, description=None, resource_defs=None, config=None, tags=None, metadata=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, op_selection=None, partitions_def=None, asset_layer=None, input_values=None, _asset_selection_data=None)[source]
\n

Make this graph into an executable Job by providing remaining components required for execution.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Mapping [str, object]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, which should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagster UI, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Mapping[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Mapping[str, RawMetadataValue]]) \u2013 Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\nKeys must be strings, and values must be python primitive types or one of the provided\nMetadataValue types

  • \n
  • logger_defs (Optional[Mapping[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition\nkeys that can parameterize the job. If this argument is supplied, the config\nargument can\u2019t also be supplied.

  • \n
  • asset_layer (Optional[AssetLayer]) \u2013 Top level information about the assets this job\nwill produce. Generally should not be set manually.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
Returns:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]
\n

Attaches the provided hooks to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_hooks({my_hook})\n
\n
\n
\n
\n
\n\n
\n
\nwith_retry_policy(retry_policy)[source]
\n

Attaches the provided retry policy to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n
\n
\n
\n
\n
\n\n
\n\n
\n
\n

Execution results\u00b6

\n
\n
\nclass dagster.ExecuteInProcessResult(event_list, dagster_run, output_capture, job_def)[source]\u00b6
\n

Result object returned by in-process testing APIs.

\n

Users should not instantiate this object directly. Used for retrieving run success, events, and outputs from execution methods that return this object.

\n

This object is returned by:\n- dagster.GraphDefinition.execute_in_process()\n- dagster.JobDefinition.execute_in_process()\n- dagster.materialize_to_memory()\n- dagster.materialize()

\n
\n
\nproperty all_events\u00b6
\n

All dagster events emitted during execution.

\n
\n
Type:
\n

List[DagsterEvent]

\n
\n
\n
\n\n
\n
\nasset_value(asset_key)[source]\u00b6
\n

Retrieves the value of an asset that was materialized during the execution of the job.

\n
\n
Parameters:
\n

asset_key (CoercibleToAssetKey) \u2013 The key of the asset to retrieve.

\n
\n
Returns:
\n

The value of the retrieved asset.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

The Dagster run that was executed.

\n
\n
Type:
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The job definition that was executed.

\n
\n
Type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\noutput_for_node(node_str, output_name='result')[source]\u00b6
\n

Retrieves output value with a particular name from the in-process run of the job.

\n
\n
Parameters:
\n
    \n
  • node_str (str) \u2013 Name of the op/graph whose output should be retrieved. If the intended\ngraph/op is nested within another graph, the syntax is outer_graph.inner_node.

  • \n
  • output_name (Optional[str]) \u2013 Name of the output on the op/graph to retrieve. Defaults to\nresult, the default output name in dagster.

  • \n
\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Retrieves output of top-level job, if an output is returned.

\n
\n
Parameters:
\n

output_name (Optional[str]) \u2013 The name of the output to retrieve. Defaults to result,\nthe default output name in dagster.

\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The run ID of the executed DagsterRun.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.JobExecutionResult(job_def, reconstruct_context, event_list, dagster_run)[source]\u00b6
\n

Result object returned by dagster.execute_job().

\n

Used for retrieving run success, events, and outputs from execute_job.\nUsers should not directly instantiate this class.

\n

Events and run information can be retrieved off of the object directly. In\norder to access outputs, the JobExecutionResult object needs to be opened\nas a context manager, which will re-initialize the resources from\nexecution.

\n
\n
\nproperty all_events\u00b6
\n

List of all events yielded by the job execution.

\n
\n
Type:
\n

Sequence[DagsterEvent]

\n
\n
\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

The Dagster run that was executed.

\n
\n
Type:
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The job definition that was executed.

\n
\n
Type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\noutput_for_node(node_str, output_name='result')[source]\u00b6
\n

Retrieves output value with a particular name from the run of the job.

\n

In order to use this method, the JobExecutionResult object must be opened as a context manager. If this method is used without opening the context manager, it will result in a DagsterInvariantViolationError.

\n
\n
Parameters:
\n
    \n
  • node_str (str) \u2013 Name of the op/graph whose output should be retrieved. If the intended\ngraph/op is nested within another graph, the syntax is outer_graph.inner_node.

  • \n
  • output_name (Optional[str]) \u2013 Name of the output on the op/graph to retrieve. Defaults to\nresult, the default output name in dagster.

  • \n
\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\noutput_value(output_name='result')[source]\u00b6
\n

Retrieves output of top-level job, if an output is returned.

\n

In order to use this method, the JobExecutionResult object must be opened as a context manager. If this method is used without opening the context manager, it will result in a DagsterInvariantViolationError. If the top-level job has no output, calling this method will also result in a DagsterInvariantViolationError.

\n
\n
Parameters:
\n

output_name (Optional[str]) \u2013 The name of the output to retrieve. Defaults to result,\nthe default output name in dagster.

\n
\n
Returns:
\n

The value of the retrieved output.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the Dagster run that was executed.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEvent(event_type_value, job_name, step_handle=None, node_handle=None, step_kind_value=None, logging_tags=None, event_specific_data=None, message=None, pid=None, step_key=None)[source]\u00b6
\n

Events yielded by op and job execution.

\n

Users should not instantiate this class.

\n
\n
\nevent_type_value\u00b6
\n

Value for a DagsterEventType.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nnode_handle\u00b6
\n
\n
Type:
\n

NodeHandle

\n
\n
\n
\n\n
\n
\nstep_kind_value\u00b6
\n

Value for a StepKind.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nlogging_tags\u00b6
\n
\n
Type:
\n

Dict[str, str]

\n
\n
\n
\n\n
\n
\nevent_specific_data\u00b6
\n

Type must correspond to event_type_value.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nmessage\u00b6
\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\npid\u00b6
\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nstep_key\u00b6
\n

DEPRECATED

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty asset_key\u00b6
\n

For events that correspond to a specific asset_key / partition\n(ASSET_MATERIALIZATION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\nasset key. Otherwise, returns None.

\n
\n
Type:
\n

Optional[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty event_type\u00b6
\n

The type of this event.

\n
\n
Type:
\n

DagsterEventType

\n
\n
\n
\n\n
\n
\nproperty is_asset_materialization_planned\u00b6
\n

If this event is of type ASSET_MATERIALIZATION_PLANNED.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_asset_observation\u00b6
\n

If this event is of type ASSET_OBSERVATION.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_engine_event\u00b6
\n

If this event is of type ENGINE_EVENT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_expectation_result\u00b6
\n

If this event is of type STEP_EXPECTATION_RESULT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_failure\u00b6
\n

If this event represents the failure of a run or step.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_handled_output\u00b6
\n

If this event is of type HANDLED_OUTPUT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_hook_event\u00b6
\n

If this event relates to the execution of a hook.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_loaded_input\u00b6
\n

If this event is of type LOADED_INPUT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_resource_init_failure\u00b6
\n

If this event is of type RESOURCE_INIT_FAILURE.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_event\u00b6
\n

If this event relates to a specific step.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_failure\u00b6
\n

If this event is of type STEP_FAILURE.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_materialization\u00b6
\n

If this event is of type ASSET_MATERIALIZATION.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_restarted\u00b6
\n

If this event is of type STEP_RESTARTED.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_skipped\u00b6
\n

If this event is of type STEP_SKIPPED.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_start\u00b6
\n

If this event is of type STEP_START.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_success\u00b6
\n

If this event is of type STEP_SUCCESS.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_step_up_for_retry\u00b6
\n

If this event is of type STEP_UP_FOR_RETRY.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_successful_output\u00b6
\n

If this event is of type STEP_OUTPUT.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty partition\u00b6
\n

For events that correspond to a specific asset_key / partition\n(ASSET_MATERIALIZATION, ASSET_OBSERVATION, ASSET_MATERIALIZATION_PLANNED), returns that\npartition. Otherwise, returns None.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterEventType(value)[source]\u00b6
\n

The types of events that may be yielded by op and job execution.

\n
\n\n
\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]\u00b6
\n

Create a ReconstructableJob from a\nfunction that returns a JobDefinition,\nor a function decorated with @job.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs,\nsuch as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\n

Executors\u00b6

\n
\n
\ndagster.multi_or_in_process_executor ExecutorDefinition[source]\u00b6
\n

The default executor for a job.

\n

This is the executor available by default on a JobDefinition\nthat does not provide custom executors. This executor has a multiprocessing-enabled mode, and a\nsingle-process mode. By default, multiprocessing mode is enabled. Switching between multiprocess\nmode and in-process mode can be achieved via config.

\n
execution:\n  config:\n    multiprocess:\n\n\nexecution:\n  config:\n    in_process:\n
\n
\n

When using the multiprocess mode, max_concurrent and retries can also be configured.

\n
execution:\n  config:\n    multiprocess:\n      max_concurrent: 4\n      retries:\n        enabled:\n
\n
\n

The max_concurrent arg is optional and tells the execution engine how many processes may run\nconcurrently. By default, or if you set max_concurrent to be 0, this is the return value of\nmultiprocessing.cpu_count().

\n

When using the in_process mode, only retries can be configured.

\n

Execution priority can be configured using the dagster/priority tag via op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\ndagster.in_process_executor ExecutorDefinition[source]\u00b6
\n

The in-process executor executes all steps in a single process.

\n

To select it, include the following top-level fragment in config:

\n
execution:\n  in_process:\n
\n
\n

Execution priority can be configured using the dagster/priority tag via op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\ndagster.multiprocess_executor ExecutorDefinition[source]\u00b6
\n

The multiprocess executor executes each step in an individual process.

\n

Any job that does not specify custom executors will use the multiprocess_executor by default.\nTo configure the multiprocess executor, include a fragment such as the following in your run\nconfig:

\n
execution:\n  config:\n    multiprocess:\n      max_concurrent: 4\n
\n
\n

The max_concurrent arg is optional and tells the execution engine how many processes may run\nconcurrently. By default, or if you set max_concurrent to be None or 0, this is the return value of\nmultiprocessing.cpu_count().

\n

Execution priority can be configured using the dagster/priority tag via op metadata,\nwhere the higher the number the higher the priority. 0 is the default and both positive\nand negative numbers can be used.

\n
\n\n
\n
\n

Contexts\u00b6

\n
\n
\nclass dagster.AssetExecutionContext(step_execution_context)[source]\u00b6
\n
\n\n
\n
\nclass dagster.OpExecutionContext(step_execution_context)[source]\u00b6
\n

The context object that can be made available as the first argument to the function\nused for computing an op or asset.

\n

This context object provides system information such as resources, config, and logging.

\n

To construct an execution context for testing purposes, use dagster.build_op_context().

\n

Example

\n
from dagster import op, OpExecutionContext\n\n@op\ndef hello_world(context: OpExecutionContext):\n    context.log.info("Hello, world!")\n
\n
\n
\n
\nadd_output_metadata(metadata, output_name=None, mapping_key=None)[source]\u00b6
\n

Add metadata to one of the outputs of an op.

\n

This can be invoked multiple times per output in the body of an op. If the same key is\npassed multiple times, the value associated with the last call will be used.

\n
\n
Parameters:
\n
    \n
  • metadata (Mapping[str, Any]) \u2013 The metadata to attach to the output

  • \n
  • output_name (Optional[str]) \u2013 The name of the output to attach metadata to. If there is only one output on the op, then this argument does not need to be provided. The metadata will automatically be attached to the only output.

  • \n
  • mapping_key (Optional[str]) \u2013 The mapping key of the output to attach metadata to. If the\noutput is not dynamic, this argument does not need to be provided.

  • \n
\n
\n
\n

Examples:

\n
from dagster import Out, op\nfrom typing import Tuple\n\n@op\ndef add_metadata(context):\n    context.add_output_metadata({"foo": "bar"})\n    return 5 # Since the default output is called "result", metadata will be attached to the output "result".\n\n@op(out={"a": Out(), "b": Out()})\ndef add_metadata_two_outputs(context) -> Tuple[str, int]:\n    context.add_output_metadata({"foo": "bar"}, output_name="b")\n    context.add_output_metadata({"baz": "bat"}, output_name="a")\n\n    return ("dog", 5)\n
\n
\n
\n\n
\n
\nproperty asset_checks_def\u00b6
\n

The backing AssetChecksDefinition for what is currently executing, errors if not\navailable.

\n
\n
Returns:
\n

AssetChecksDefinition.

\n
\n
\n
\n\n
\n
\nproperty asset_key\u00b6
\n

The AssetKey for the current asset. In a multi_asset, use asset_key_for_output instead.

\n
\n\n
\n
\nasset_key_for_input(input_name)[source]\u00b6
\n

Return the AssetKey for the corresponding input.

\n
\n\n
\n
\nasset_key_for_output(output_name='result')[source]\u00b6
\n

Return the AssetKey for the corresponding output.

\n
\n\n
\n
\nasset_partition_key_for_input(input_name)[source]\u00b6
\n

Returns the partition key of the upstream asset corresponding to the given input.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the partition key for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_key_for_input("upstream_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_for_input("self_dependent_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-20"\n
\n
\n
\n\n
\n
\nasset_partition_key_for_output(output_name='result')[source]\u00b6
\n

Returns the asset partition key for the given output.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the partition key for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_for_output())\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_for_output("first_asset"))\n    context.log.info(context.asset_partition_key_for_output("second_asset"))\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n#   "2023-08-21"\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_for_output())\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n
\n
\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use partition_key_range instead.\n \n

\n

The range of partition keys for the current run.

\n

If run is for a single partition key, return a PartitionKeyRange with the same start and\nend. Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nasset_partition_key_range_for_input(input_name)[source]\u00b6
\n

Return the PartitionKeyRange for the corresponding input. Errors if the asset depends on a\nnon-contiguous chunk of the input.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_key_range_for_input to get the range of partition keys of the input that\nare relevant to that backfill.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the partition key range for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n@asset(\n    ins={\n        "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    },\n    partitions_def=partitions_def,\n)\ndef another_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_key_range_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_range_for_input("self_dependent_asset"))\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-20", end="2023-08-24")\n
\n
\n
\n\n
\n
\nasset_partition_key_range_for_output(output_name='result')[source]\u00b6
\n

Return the PartitionKeyRange for the corresponding output. Errors if the run is not partitioned.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_key_range_for_output to get all of the partitions being materialized\nby the backfill.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the partition key range for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_range_for_output())\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_key_range_for_output("first_asset"))\n    context.log.info(context.asset_partition_key_range_for_output("second_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_key_range_for_output())\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n
\n
\n
\n\n
\n
\nasset_partition_keys_for_input(input_name)[source]\u00b6
\n

Returns a list of the partition keys of the upstream asset corresponding to the\ngiven input.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_keys_for_input to get all of the partition keys of the input that\nare relevant to that backfill.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the partition keys for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n@asset(\n    ins={\n        "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    },\n    partitions_def=partitions_def,\n)\ndef another_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partition_keys_for_input("upstream_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_keys_for_input("self_dependent_asset"))\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-20", "2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24"]\n
\n
\n
\n\n
\n
\nasset_partition_keys_for_output(output_name='result')[source]\u00b6
\n

Returns a list of the partition keys for the given output.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partition_keys_for_output to get all of the partitions being materialized\nby the backfill.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the partition keys for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_keys_for_output())\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partition_keys_for_output("first_asset"))\n    context.log.info(context.asset_partition_keys_for_output("second_asset"))\n\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partition_keys_for_output())\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   ["2023-08-21", "2023-08-22", "2023-08-23", "2023-08-24", "2023-08-25"]\n
\n
\n
\n\n
\n
\nasset_partitions_def_for_input(input_name)[source]\u00b6
\n

The PartitionsDefinition on the upstream asset corresponding to this input.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the PartitionsDefinition for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partitions_def_for_input("upstream_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   DailyPartitionsDefinition("2023-08-20")\n
\n
\n
\n\n
\n
\nasset_partitions_def_for_output(output_name='result')[source]\u00b6
\n

The PartitionsDefinition on the asset corresponding to this output.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the PartitionsDefinition for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_def_for_output())\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   DailyPartitionsDefinition("2023-08-20")\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_def_for_output("first_asset"))\n    context.log.info(context.asset_partitions_def_for_output("second_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   DailyPartitionsDefinition("2023-08-20")\n#   DailyPartitionsDefinition("2023-08-20")\n
\n
\n
\n\n
\n
\nasset_partitions_time_window_for_input(input_name='result')[source]\u00b6
\n

The time window for the partitions of the input asset.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partitions_time_window_for_input to get the time window of the input that\nis relevant to that backfill.

\n

Raises an error if either of the following are true:\n- The input asset has no partitioning.\n- The input asset is not partitioned with a TimeWindowPartitionsDefinition or a\nMultiPartitionsDefinition with one time-partitioned dimension.

\n
\n
Parameters:
\n

input_name (str) \u2013 The name of the input to get the time window for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef upstream_asset():\n    ...\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n\n\n@asset(\n    ins={\n        "upstream_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    },\n    partitions_def=partitions_def,\n)\ndef another_asset(context: AssetExecutionContext, upstream_asset):\n    context.log.info(context.asset_partitions_time_window_for_input("upstream_asset"))\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-21")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-25")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partitions_time_window_for_input("self_dependent_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-21")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-20", "2023-08-25")\n
\n
\n
\n\n
\n
\nasset_partitions_time_window_for_output(output_name='result')[source]\u00b6
\n

The time window for the partitions of the output asset.

\n

If you want to write your asset to support running a backfill of several partitions in a single run,\nyou can use asset_partitions_time_window_for_output to get the TimeWindow of all of the partitions\nbeing materialized by the backfill.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\nMultiPartitionsDefinition with one time-partitioned dimension.

\n
\n
Parameters:
\n

output_name (str) \u2013 For assets defined with the @asset decorator, the name of the output\nwill be automatically provided. For assets defined with @multi_asset, output_name\nshould be the op output associated with the asset key (as determined by AssetOut)\nto get the time window for.

\n
\n
\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef an_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_time_window_for_output())\n\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n\n@multi_asset(\n    outs={\n        "first_asset": AssetOut(key=["my_assets", "first_asset"]),\n        "second_asset": AssetOut(key=["my_assets", "second_asset"])\n    },\n    partitions_def=partitions_def,\n)\ndef a_multi_asset(context: AssetExecutionContext):\n    context.log.info(context.asset_partitions_time_window_for_output("first_asset"))\n    context.log.info(context.asset_partitions_time_window_for_output("second_asset"))\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n#   TimeWindow("2023-08-21", "2023-08-26")\n\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "self_dependent_asset": AssetIn(partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=-1))\n    }\n)\ndef self_dependent_asset(context: AssetExecutionContext, self_dependent_asset):\n    context.log.info(context.asset_partitions_time_window_for_output())\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-26")\n
\n
\n
\n\n
\n
\nproperty assets_def\u00b6
\n

The backing AssetsDefinition for what is currently executing, errors if not available.

\n
\n\n
\n
\nget_asset_provenance(asset_key)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Return the provenance information for the most recent materialization of an asset.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 Key of the asset for which to retrieve provenance.

\n
\n
Returns:
\n

\n
Provenance information for the most recent

materialization of the asset. Returns None if the asset was never materialized or\nthe materialization record is too old to contain provenance information.

\n
\n
\n

\n
\n
Return type:
\n

Optional[DataProvenance]

\n
\n
\n
\n\n
\n
\nget_mapping_key()[source]\u00b6
\n

Which mapping_key this execution is for if downstream of a DynamicOutput, otherwise None.

\n
\n\n
\n
\nget_tag(key)[source]\u00b6
\n

Get a logging tag.

\n
\n
Parameters:
\n

key (str) \u2013 The tag to get.

\n
\n
Returns:
\n

The value of the tag, if present.

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty has_asset_checks_def\u00b6
\n

Return a boolean indicating the presence of a backing AssetChecksDefinition\nfor the current execution.

\n
\n
Returns:
\n

True if there is a backing AssetChecksDefinition for the current execution, otherwise False.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty has_assets_def\u00b6
\n

If there is a backing AssetsDefinition for what is currently executing.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run.

\n
\n\n
\n
\nhas_tag(key)[source]\u00b6
\n

Check if a logging tag is set.

\n
\n
Parameters:
\n

key (str) \u2013 The tag to check.

\n
\n
Returns:
\n

Whether the tag is set.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty instance\u00b6
\n

The current Dagster instance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty job_def\u00b6
\n

The currently executing job.

\n
\n
Type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the currently executing job.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager available in the execution context.

\n
\n
Type:
\n

DagsterLogManager

\n
\n
\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization, AssetObservation, or ExpectationResult from within the body of an op.

\n

Events logged with this method will appear in the list of DagsterEvents, as well as the event log.

\n
\n
Parameters:
\n

event (Union[AssetMaterialization, AssetObservation, ExpectationResult]) \u2013 The event to log.

\n
\n
\n

Examples:

\n
from dagster import op, AssetMaterialization\n\n@op\ndef log_materialization(context):\n    context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty op_config\u00b6
\n

The parsed config specific to this op.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty op_def\u00b6
\n

The current op definition.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\noutput_for_asset_key(asset_key)[source]\u00b6
\n

Return the output name for the corresponding asset key.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run, or if the current run is operating\nover a range of partitions (i.e. a backfill of several partitions executed in a single run).

\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef my_asset(context: AssetExecutionContext):\n    context.log.info(context.partition_key)\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   "2023-08-21"\n
\n
\n
\n\n
\n
\nproperty partition_key_range\u00b6
\n

The range of partition keys for the current run.

\n

If run is for a single partition key, returns a PartitionKeyRange with the same start and\nend. Raises an error if the current run is not a partitioned run.

\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef my_asset(context: AssetExecutionContext):\n    context.log.info(context.partition_key_range)\n\n# running a backfill of the 2023-08-21 through 2023-08-25 partitions of this asset will log:\n#   PartitionKeyRange(start="2023-08-21", end="2023-08-25")\n
\n
\n
\n\n
\n
\nproperty partition_time_window\u00b6
\n

The partition time window for the current run.

\n

Raises an error if the current run is not a partitioned run, or if the job\u2019s partition\ndefinition is not a TimeWindowPartitionsDefinition.

\n

Examples

\n
partitions_def = DailyPartitionsDefinition("2023-08-20")\n\n@asset(\n    partitions_def=partitions_def\n)\ndef my_asset(context: AssetExecutionContext):\n    context.log.info(context.partition_time_window)\n\n# materializing the 2023-08-21 partition of this asset will log:\n#   TimeWindow("2023-08-21", "2023-08-22")\n
\n
\n
\n\n
\n
\nproperty pdb\u00b6
\n

Gives access to pdb debugging from within the op.

\n

Example

\n
@op\ndef debug(context):\n    context.pdb.set_trace()\n
\n
\n
\n
Type:
\n

dagster.utils.forked_pdb.ForkedPdb

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

The currently available resources.

\n
\n
Type:
\n

Resources

\n
\n
\n
\n\n
\n
\nproperty retry_number\u00b6
\n

Which retry attempt is currently executing i.e. 0 for initial attempt, 1 for first retry, etc.

\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run config for the current execution.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the current execution\u2019s run.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty selected_asset_check_keys\u00b6
\n
\n\n
\n
\nproperty selected_asset_keys\u00b6
\n

Get the set of AssetKeys this execution is expected to materialize.

\n
\n\n
\n
\nproperty selected_output_names\u00b6
\n

Get the output names that correspond to the current selection of assets this execution is expected to materialize.

\n
\n\n
\n\n
\n
\ndagster.build_op_context(resources=None, op_config=None, resources_config=None, instance=None, config=None, partition_key=None, partition_key_range=None, mapping_key=None, _assets_def=None)[source]\u00b6
\n

Builds op execution context from provided parameters.

\n

build_op_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_op_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking an op.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • op_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the op.

  • \n
  • resources_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the resources.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
  • mapping_key (Optional[str]) \u2013 A key representing the mapping key from an upstream dynamic\noutput. Can be accessed using context.get_mapping_key().

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
  • partition_key_range (Optional[PartitionKeyRange]) \u2013 Partition key range to execute with.

  • \n
  • _assets_def (Optional[AssetsDefinition]) \u2013 Internal argument that populates the op\u2019s assets\ndefinition, not meant to be populated by users.

  • \n
\n
\n
\n

Examples

\n
context = build_op_context()\nop_to_invoke(context)\n\nwith build_op_context(resources={"foo": context_manager_resource}) as context:\n    op_to_invoke(context)\n
\n
\n
\n\n
\n
\ndagster.build_asset_context(resources=None, resources_config=None, asset_config=None, instance=None, partition_key=None, partition_key_range=None)[source]\u00b6
\n

Builds asset execution context from provided parameters.

\n

build_asset_context can be used as either a function or context manager. If there is a\nprovided resource that is a context manager, then build_asset_context must be used as a\ncontext manager. This function can be used to provide the context argument when directly\ninvoking an asset.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • resources_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the resources.

  • \n
  • asset_config (Optional[Mapping[str, Any]]) \u2013 The config to provide to the asset.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
  • partition_key_range (Optional[PartitionKeyRange]) \u2013 Partition key range to execute with.

  • \n
\n
\n
\n

Examples

\n
context = build_asset_context()\nasset_to_invoke(context)\n\nwith build_asset_context(resources={"foo": context_manager_resource}) as context:\n    asset_to_invoke(context)\n
\n
\n
\n\n
\n
\nclass dagster.TypeCheckContext(run_id, log_manager, scoped_resources_builder, dagster_type)[source]\u00b6
\n

The context object available to a type check function on a DagsterType.

\n
\n
\nproperty log\u00b6
\n

Centralized log dispatch from user code.

\n
\n\n
\n
\nproperty resources\u00b6
\n

An object whose attributes contain the resources available to this op.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of this job run.

\n
\n\n
\n\n
\n
\n

Job configuration\u00b6

\n
\n
\ndagster.validate_run_config(job_def, run_config=None)[source]\u00b6
\n

Function to validate a provided run config blob against a given job.

\n

If validation is successful, this function will return a dictionary representation of the\nvalidated config actually used during execution.

\n
\n
Parameters:
\n
    \n
  • job_def (JobDefinition) \u2013 The job definition to validate run\nconfig against

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 The run config to validate

  • \n
\n
\n
Returns:
\n

A dictionary representation of the validated config.

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n

Run Config Schema\u00b6

\n
\n

The run_config used for jobs has the following schema:

\n
{\n  # configuration for execution, required if executors require config\n  execution: {\n    # the name of one, and only one available executor, typically 'in_process' or 'multiprocess'\n    __executor_name__: {\n      # executor-specific config, if required or permitted\n      config: {\n        ...\n      }\n    }\n  },\n\n  # configuration for loggers, required if loggers require config\n  loggers: {\n    # the name of an available logger\n    __logger_name__: {\n      # logger-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for resources, required if resources require config\n  resources: {\n    # the name of a resource\n    __resource_name__: {\n      # resource-specific config, if required or permitted\n      config: {\n        ...\n      }\n    },\n    ...\n  },\n\n  # configuration for underlying ops, required if ops require config\n  ops: {\n\n    # these keys align with the names of the ops, or their alias in this job\n    __op_name__: {\n\n      # pass any data that was defined via config_field\n      config: ...,\n\n      # configurably specify input values, keyed by input name\n      inputs: {\n        __input_name__: {\n          # if a dagster_type_loader is specified, that schema must be satisfied here;\n          # scalar, built-in types will generally allow their values to be specified directly:\n          value: ...\n        }\n      },\n\n    }\n  },\n\n}\n
\n
\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/execution", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../graphs/", "title": "Graphs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../errors/", "title": "Errors"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/graphs", "Graphs", "N", "next"], ["sections/api/apidocs/errors", "Errors", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/execution.rst.txt", "title": "Execution", "toc": "\n"}, "graphs": {"alabaster_version": "0.7.13", "body": "
\n

Graphs\u00b6

\n

The core of a job is a graph of ops, connected via data dependencies.

\n
\n
\n@dagster.graph(compose_fn=None, *, name=None, description=None, input_defs=None, output_defs=None, ins=None, out=None, tags=None, config=None)[source]\u00b6
\n

Create an op graph with the specified parameters from the decorated composition function.

\n

Using this decorator allows you to build up a dependency graph by writing a\nfunction that invokes ops (or other graphs) and passes the output to subsequent invocations.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the op graph. Must be unique within any RepositoryDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • input_defs (Optional[List[InputDefinition]]) \u2013

    Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit InputDefinitions taking precedence.

    \n

    Uses of inputs in the body of the decorated composition function will determine\nthe InputMappings passed to the underlying\nGraphDefinition.

    \n

  • \n
  • output_defs (Optional[List[OutputDefinition]]) \u2013

    Output definitions for the graph. If not provided explicitly, these will be inferred from typehints.

    \n

    Uses of these outputs in the body of the decorated composition function, as well as the\nreturn value of the decorated function, will be used to infer the appropriate set of\nOutputMappings for the underlying\nGraphDefinition.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
  • ins (Optional[Dict[str, GraphIn]]) \u2013 Information about the inputs that this graph maps. Information provided here\nwill be combined with what can be inferred from the function signature, with these\nexplicit GraphIn taking precedence.

  • \n
  • out \u2013

    Information about the outputs that this graph maps. Information provided here will be\ncombined with what can be inferred from the return type signature if the function does\nnot use yield.

    \n

    To map multiple outputs, return a dictionary from the composition function.

    \n

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.GraphDefinition(name, *, description=None, node_defs=None, dependencies=None, input_mappings=None, output_mappings=None, config=None, tags=None, node_input_source_assets=None, **kwargs)[source]\u00b6
\n

Defines a Dagster op graph.

\n

An op graph is made up of

\n
    \n
  • Nodes, which can either be an op (the functional unit of computation), or another graph.

  • \n
  • Dependencies, which determine how the values produced by nodes as outputs flow from\none node to another. This tells Dagster how to arrange nodes into a directed, acyclic graph\n(DAG) of compute.

  • \n
\n

End users should prefer the @graph decorator. GraphDefinition is generally\nintended to be used by framework authors or for programmatically generated graphs.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the graph. Must be unique within any GraphDefinition\nor JobDefinition containing the graph.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the graph.

  • \n
  • node_defs (Optional[Sequence[NodeDefinition]]) \u2013 The set of ops / graphs used in this graph.

  • \n
  • dependencies (Optional[Dict[Union[str, NodeInvocation], Dict[str, DependencyDefinition]]]) \u2013 A structure that declares the dependencies of each op\u2019s inputs on the outputs of other\nops in the graph. Keys of the top level dict are either the string names of ops in the\ngraph or, in the case of aliased ops, NodeInvocations.\nValues of the top level dict are themselves dicts, which map input names belonging to\nthe op or aliased op to DependencyDefinitions.

  • \n
  • input_mappings (Optional[Sequence[InputMapping]]) \u2013 Defines the inputs to the nested graph, and\nhow they map to the inputs of its constituent ops.

  • \n
  • output_mappings (Optional[Sequence[OutputMapping]]) \u2013 Defines the outputs of the nested graph,\nand how they map from the outputs of its constituent ops.

  • \n
  • config (Optional[ConfigMapping]) \u2013 Defines the config of the graph, and how its schema maps\nto the config of its constituent ops.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for any execution of the graph.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(num):\n    return num + 1\n\ngraph_def = GraphDefinition(\n    name='basic',\n    node_defs=[return_one, add_one],\n    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},\n)\n
\n
\n
\n
\nalias(name)[source]\u00b6
\n

Aliases the graph with a new name.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.alias("my_graph_alias")\n
\n
\n
\n
\n
\n\n
\n
\nproperty config_mapping\u00b6
\n

The config mapping for the graph, if present.

\n

By specifying a config mapping function, you can override the configuration for the child nodes contained within a graph.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, resources=None, raise_on_error=True, op_selection=None, run_id=None, input_values=None)[source]\u00b6
\n

Execute this graph in-process, collecting results in-memory.

\n
\n
Parameters:
\n
    \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Run config to provide to execution. The configuration for the underlying graph\nshould exist under the \u201cops\u201d key.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[List[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the graph.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty input_mappings\u00b6
\n

Input mappings for the graph.

\n

An input mapping is a mapping from an input of the graph to an input of a child node.

\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the graph.

\n
\n\n
\n
\nproperty output_mappings\u00b6
\n

Output mappings for the graph.

\n

An output mapping is a mapping from an output of the graph to an output of a child node.

\n
\n\n
\n
\ntag(tags)[source]\u00b6
\n

Attaches the provided tags to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.tag({"my_tag": "my_value"})\n
\n
\n
\n
\n
\n\n
\n
\nproperty tags\u00b6
\n

The tags associated with the graph.

\n
\n\n
\n
\nto_job(name=None, description=None, resource_defs=None, config=None, tags=None, metadata=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, op_selection=None, partitions_def=None, asset_layer=None, input_values=None, _asset_selection_data=None)[source]\u00b6
\n

Make this graph into an executable Job by providing remaining components required for execution.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Mapping [str, object]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, and the ConfigMapping should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagster UI, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Mapping[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Mapping[str, RawMetadataValue]]) \u2013 Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\nKeys must be strings, and values must be python primitive types or one of the provided\nMetadataValue types

  • \n
  • logger_defs (Optional[Mapping[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multi_or_in_process_executor,\nwhich can be switched between multi-process and in-process modes of execution. The\ndefault mode of execution is multi-process.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition\nkeys that can parameterize the job. If this argument is supplied, the config\nargument can\u2019t also be supplied.

  • \n
  • asset_layer (Optional[AssetLayer]) \u2013 Top level information about the assets this job\nwill produce. Generally should not be set manually.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
Returns:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Attaches the provided hooks to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_hooks({my_hook})\n
\n
\n
\n
\n
\n\n
\n
\nwith_retry_policy(retry_policy)[source]\u00b6
\n

Attaches the provided retry policy to the graph immutably.

\n

Can only be used in the context of a @graph, @job, or @asset_graph decorated function.

\n
\n
Examples:
@job\ndef do_it_all():\n    my_graph.with_retry_policy(RetryPolicy(max_retries=5))\n
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.GraphIn(description=None)[source]\u00b6
\n

Represents information about an input that a graph maps.

\n
\n
Parameters:
\n

description (Optional[str]) \u2013 Human-readable description of the input.

\n
\n
\n
\n\n
\n
\nclass dagster.GraphOut(description=None)[source]\u00b6
\n

Represents information about the outputs that a graph maps.

\n
\n
Parameters:
\n

description (Optional[str]) \u2013 Human-readable description of the output.

\n
\n
\n
\n\n
\n

Explicit dependencies\u00b6

\n
\n
\nclass dagster.DependencyDefinition(node, output='result', description=None)[source]\u00b6
\n

Represents an edge in the DAG of nodes (ops or graphs) forming a job.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job whose keys represent the dependent node and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_b depends on the output named \u2018result\u2019 of\nop_a, and the output named \u2018other_result\u2019 of graph_a, the structure will look as follows:

\n
dependency_structure = {\n    'my_downstream_op': {\n        'input': DependencyDefinition('my_upstream_op', 'result')\n    },\n    'my_downstream_op': {\n        'input': DependencyDefinition('my_upstream_graph', 'result')\n    }\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    node_b(node_a())\n
\n
\n
\n
Parameters:
\n
    \n
  • node (str) \u2013 The name of the node (op or graph) that is depended on, that is, from which the value\npassed between the two nodes originates.

  • \n
  • output (Optional[str]) \u2013 The name of the output that is depended on. (default: \u201cresult\u201d)

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this dependency.

  • \n
\n
\n
\n
\n
\nis_fan_in()[source]\u00b6
\n

Return True if the dependency is fan-in (always False for DependencyDefinition).

\n
\n\n
\n\n
\n
\nclass dagster.MultiDependencyDefinition(dependencies)[source]\u00b6
\n

Represents a fan-in edge in the DAG of op instances forming a job.

\n

This object is used only when an input of type List[T] is assembled by fanning-in multiple\nupstream outputs of type T.

\n

This object is used at the leaves of a dictionary structure that represents the complete\ndependency structure of a job whose keys represent the dependent ops or graphs and dependent\ninput, so this object only contains information about the dependee.

\n

Concretely, if the input named \u2018input\u2019 of op_c depends on the outputs named \u2018result\u2019 of\nop_a and op_b, this structure will look as follows:

\n
dependency_structure = {\n    'op_c': {\n        'input': MultiDependencyDefinition(\n            [\n                DependencyDefinition('op_a', 'result'),\n                DependencyDefinition('op_b', 'result')\n            ]\n        )\n    }\n}\n
\n
\n

In general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
@job\ndef the_job():\n    op_c(op_a(), op_b())\n
\n
\n
\n
Parameters:
\n

dependencies (List[Union[DependencyDefinition, Type[MappedInputPlaceHolder]]]) \u2013 List of\nupstream dependencies fanned in to this input.

\n
\n
\n
\n
\nget_dependencies_and_mappings()[source]\u00b6
\n

Return the combined list of dependencies contained by this object, including both DependencyDefinition and MappedInputPlaceholder objects.

\n
\n\n
\n
\nget_node_dependencies()[source]\u00b6
\n

Return the list of DependencyDefinition contained by this object.

\n
\n\n
\n
\nis_fan_in()[source]\u00b6
\n

Return True if the dependency is fan-in (always True for MultiDependencyDefinition).

\n
\n\n
\n\n
\n
\nclass dagster.NodeInvocation(name, alias=None, tags=None, hook_defs=None, retry_policy=None)[source]\u00b6
\n

Identifies an instance of a node in a graph dependency structure.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the node of which this is an instance.

  • \n
  • alias (Optional[str]) \u2013 Name specific to this instance of the node. Necessary when there are\nmultiple instances of the same node.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Optional tags values to extend or override those\nset on the node definition.

  • \n
  • hook_defs (Optional[AbstractSet[HookDefinition]]) \u2013 A set of hook definitions applied to the\nnode instance.

  • \n
\n
\n
\n

Examples:\nIn general, users should prefer not to construct this class directly or use the\nJobDefinition API that requires instances of this class. Instead, use the\n@job API:

\n
from dagster import job\n\n@job\ndef my_job():\n    other_name = some_op.alias('other_name')\n    some_graph(other_name(some_op))\n
\n
\n
\n\n
\n
\nclass dagster.OutputMapping(graph_output_name, mapped_node_name, mapped_node_output_name, graph_output_description=None, dagster_type=None, from_dynamic_mapping=False)[source]\u00b6
\n

Defines an output mapping for a graph.

\n
\n
Parameters:
\n
    \n
  • graph_output_name (str) \u2013 Name of the output in the graph being mapped to.

  • \n
  • mapped_node_name (str) \u2013 Name of the node (op/graph) that the output is being mapped from.

  • \n
  • mapped_node_output_name (str) \u2013 Name of the output in the node (op/graph) that is being mapped from.

  • \n
  • graph_output_description (Optional[str]) \u2013 A description of the output in the graph being mapped from.

  • \n
  • from_dynamic_mapping (bool) \u2013 Set to true if the node being mapped to is a mapped dynamic node.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Any defined dagster_type should come from the underlying op Output.) The dagster type of the graph\u2019s output being mapped to.

  • \n
\n
\n
\n

Examples

\n
from dagster import OutputMapping, GraphDefinition, op, graph, GraphOut\n\n@op\ndef emit_five():\n    return 5\n\n# The following two graph definitions are equivalent\nGraphDefinition(\n    name="the_graph",\n    node_defs=[emit_five],\n    output_mappings=[\n        OutputMapping(\n            graph_output_name="result", # Default output name\n            mapped_node_name="emit_five",\n            mapped_node_output_name="result"\n        )\n    ]\n)\n\n@graph(out=GraphOut())\ndef the_graph():\n    return emit_five()\n
\n
\n
\n\n
\n
\nclass dagster.InputMapping(graph_input_name, mapped_node_name, mapped_node_input_name, fan_in_index=None, graph_input_description=None, dagster_type=None)[source]\u00b6
\n

Defines an input mapping for a graph.

\n
\n
Parameters:
\n
    \n
  • graph_input_name (str) \u2013 Name of the input in the graph being mapped from.

  • \n
  • mapped_node_name (str) \u2013 Name of the node (op/graph) that the input is being mapped to.

  • \n
  • mapped_node_input_name (str) \u2013 Name of the input in the node (op/graph) that is being mapped to.

  • \n
  • fan_in_index (Optional[int]) \u2013 The index into a fanned-in input, otherwise None.

  • \n
  • graph_input_description (Optional[str]) \u2013 A description of the input in the graph being mapped from.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Any defined dagster_type should come from the upstream op Output.) The dagster type of the graph\u2019s input\nbeing mapped from.

  • \n
\n
\n
\n

Examples

\n
from dagster import InputMapping, GraphDefinition, op, graph\n\n@op\ndef needs_input(x):\n    return x + 1\n\n# The following two graph definitions are equivalent\nGraphDefinition(\n    name="the_graph",\n    node_defs=[needs_input],\n    input_mappings=[\n        InputMapping(\n            graph_input_name="maps_x", mapped_node_name="needs_input",\n            mapped_node_input_name="x"\n        )\n    ]\n)\n\n@graph\ndef the_graph(maps_x):\n    needs_input(maps_x)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/graphs", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../hooks/", "title": "Hooks"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../execution/", "title": "Execution"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/hooks", "Hooks", "N", "next"], ["sections/api/apidocs/execution", "Execution", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/graphs.rst.txt", "title": "Graphs", "toc": "\n"}, "hooks": {"alabaster_version": "0.7.13", "body": "
\n

Hooks\u00b6

\n
\n
\n@dagster.success_hook(hook_fn=None, *, name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step success events with the specified parameters from the decorated function.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@success_hook(required_resource_keys={'slack'})\ndef slack_message_on_success(context):\n    message = 'op {} succeeded'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@success_hook\ndef do_something_on_success(context):\n    do_something()\n
\n
\n
\n\n
\n
\n@dagster.failure_hook(name=None, required_resource_keys=None)[source]\u00b6
\n

Create a hook on step failure events with the specified parameters from the decorated function.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of this hook.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n

Examples

\n
@failure_hook(required_resource_keys={'slack'})\ndef slack_message_on_failure(context):\n    message = 'op {} failed'.format(context.op.name)\n    context.resources.slack.send_message(message)\n\n@failure_hook\ndef do_something_on_failure(context):\n    do_something()\n
\n
\n
\n\n
\n
\nclass dagster.HookDefinition(*, name, hook_fn, required_resource_keys=None, decorated_fn=None)[source]\u00b6
\n

Define a hook which can be triggered during an op execution (e.g. a callback on the step\nexecution failure event during an op execution).

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of this hook.

  • \n
  • hook_fn (Callable) \u2013 The callback function that will be triggered.

  • \n
  • required_resource_keys (Optional[AbstractSet[str]]) \u2013 Keys for the resources required by the\nhook.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.HookContext(step_execution_context, hook_def)[source]\u00b6
\n

The context object available to a hook function on a DagsterEvent.

\n
\n
\nproperty hook_def\u00b6
\n

The hook that the context object belongs to.

\n
\n\n
\n
\nproperty instance\u00b6
\n

The instance configured to run the current job.

\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the job where this hook is being triggered.

\n
\n\n
\n
\nproperty log\u00b6
\n

Centralized log dispatch from user code.

\n
\n\n
\n
\nproperty op_config\u00b6
\n

The parsed config specific to this op.

\n
\n\n
\n
\nproperty op_exception\u00b6
\n

The thrown exception in a failed op.

\n
\n\n
\n
\nproperty op_output_values\u00b6
\n

Computed output values in an op.

\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

Resources required by this hook.

\n
\n\n
\n
\nproperty resources\u00b6
\n

Resources available in the hook context.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the run where this hook is being triggered.

\n
\n\n
\n
\nproperty step_key\u00b6
\n

The key for the step where this hook is being triggered.

\n
\n\n
\n\n
\n
\ndagster.build_hook_context(resources=None, op=None, run_id=None, job_name=None, op_exception=None, instance=None)[source]\u00b6
\n

Builds hook context from provided parameters.

\n

build_hook_context can be used as either a function or a context manager. If there is a\nprovided resource to build_hook_context that is a context manager, then it must be used as a\ncontext manager. This function can be used to provide the context argument to the invocation of\na hook definition.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can\neither be values or resource definitions.

  • \n
  • op (Optional[Union[OpDefinition, PendingNodeInvocation]]) \u2013 The op definition which the\nhook may be associated with.

  • \n
  • run_id (Optional[str]) \u2013 The id of the run in which the hook is invoked (provided for mocking purposes).

  • \n
  • job_name (Optional[str]) \u2013 The name of the job in which the hook is used (provided for mocking purposes).

  • \n
  • op_exception (Optional[Exception]) \u2013 The exception that caused the hook to be triggered.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The Dagster instance configured to run the hook.

  • \n
\n
\n
\n

Examples

\n
context = build_hook_context()\nhook_to_invoke(context)\n\nwith build_hook_context(resources={"foo": context_manager_resource}) as context:\n    hook_to_invoke(context)\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/hooks", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../internals/", "title": "Internals"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../graphs/", "title": "Graphs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/internals", "Internals", "N", "next"], ["sections/api/apidocs/graphs", "Graphs", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/hooks.rst.txt", "title": "Hooks", "toc": "\n"}, "internals": {"alabaster_version": "0.7.13", "body": "
\n

Internals\u00b6

\n

Note that APIs imported from Dagster submodules are not considered stable, and are potentially subject to change in the future.

\n

If you find yourself consulting these docs because you are writing custom components and plug-ins,\nplease get in touch with the core team on our Slack.\nWe\u2019re curious what you\u2019re up to, happy to help, excited for new community contributions, and eager\nto make the system as easy to work with as possible \u2013 including for teams who are looking to\ncustomize it.

\n
\n

Executors (Experimental)\u00b6

\n

APIs for constructing custom executors. This is considered advanced experimental usage. Please note that using Dagster-provided executors is considered stable, common usage.

\n
\n
\n@dagster.executor(name=None, config_schema=None, requirements=None)[source]\u00b6
\n

Define an executor.

\n

The decorated function should accept an InitExecutorContext and return an instance\nof Executor.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.executor_config. If not set, Dagster will accept any config provided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular job execution.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ExecutorDefinition(name, config_schema=None, requirements=None, executor_creation_fn=None, description=None)[source]\u00b6
\n

An executor is responsible for executing the steps of a job.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the executor.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data\navailable in init_context.executor_config. If not set, Dagster will accept any config\nprovided.

  • \n
  • requirements (Optional[List[ExecutorRequirement]]) \u2013 Any requirements that must\nbe met in order for the executor to be usable for a particular job execution.

  • \n
  • executor_creation_fn (Optional[Callable]) \u2013 Should accept an InitExecutorContext\nand return an instance of Executor

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the\nexecutor.

  • \n
  • description (Optional[str]) \u2013 A description of the executor.

  • \n
\n
\n
\n
\n
\nconfigured(config_or_config_fn, name=None, config_schema=None, description=None)[source]\u00b6
\n

Wraps this object in an object of the same type that provides configuration to the inner\nobject.

\n

Using configured may result in config values being displayed in\nthe Dagster UI, so it is not recommended to use this API with sensitive values,\nsuch as secrets.

\n
\n
Parameters:
\n
    \n
  • config_or_config_fn (Union[Any, Callable[[Any], Any]]) \u2013 Either (1) Run configuration\nthat fully satisfies this object\u2019s config schema or (2) A function that accepts run\nconfiguration and returns run configuration that fully satisfies this object\u2019s\nconfig schema. In the latter case, config_schema must be specified. When\npassing a function, it\u2019s easiest to use configured().

  • \n
  • name (Optional[str]) \u2013 Name of the new definition. If not provided, the emitted\ndefinition will inherit the name of the ExecutorDefinition upon which this\nfunction is called.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 If config_or_config_fn is a function, the config\nschema that its input must satisfy. If not set, Dagster will accept any config\nprovided.

  • \n
  • description (Optional[str]) \u2013 Description of the new definition. If not specified,\ninherits the description of the definition being configured.

  • \n
\n
\n
\n

Returns (ConfigurableDefinition): A configured version of this object.

\n
\n\n
\n
\nproperty description\u00b6
\n

Description of executor, if provided.

\n
\n\n
\n
\nproperty executor_creation_fn\u00b6
\n

Callable that takes an InitExecutorContext and returns an instance of\nExecutor.

\n
\n\n
\n
\nproperty name\u00b6
\n

Name of the executor.

\n
\n\n
\n\n
\n
\nclass dagster.InitExecutorContext(job, executor_def, executor_config, instance)[source]\u00b6
\n

Executor-specific initialization context.

\n
\n
\njob\u00b6
\n

The job to be executed.

\n
\n
Type:
\n

IJob

\n
\n
\n
\n\n
\n
\nexecutor_def\u00b6
\n

The definition of the executor currently being\nconstructed.

\n
\n
Type:
\n

ExecutorDefinition

\n
\n
\n
\n\n
\n
\nexecutor_config\u00b6
\n

The parsed config passed to the executor.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The current instance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.Executor[source]\u00b6
\n
\n
\nabstract execute(plan_context, execution_plan)[source]\u00b6
\n

For the given context and execution plan, orchestrate a series of sub plan executions in a way that satisfies the whole plan being executed.

\n
\n
Parameters:
\n
    \n
  • plan_context (PlanOrchestrationContext) \u2013 The plan\u2019s orchestration context.

  • \n
  • execution_plan (ExecutionPlan) \u2013 The plan to execute.

  • \n
\n
\n
Returns:
\n

A stream of dagster events.

\n
\n
\n
\n\n
\n
\nabstract property retries\u00b6
\n

Whether retries are enabled or disabled for this instance of the executor.

\n

Executors should allow this to be controlled via configuration if possible.

\n

Returns: RetryMode

\n
\n\n
\n\n
\n
\n
\n

File Manager (Experimental)\u00b6

\n
\n
\nclass dagster._core.storage.file_manager.FileManager[source]\u00b6
\n

Base class for all file managers in dagster.

\n

The file manager is an interface that can be implemented by resources to provide abstract\naccess to a file system such as local disk, S3, or other cloud storage.

\n

For examples of usage, see the documentation of the concrete file manager implementations.

\n
\n
\nabstract copy_handle_to_local_temp(file_handle)[source]\u00b6
\n

Copy a file represented by a file handle to a temp file.

\n

In an implementation built around an object store such as S3, this method would be expected\nto download the file from S3 to local filesystem in a location assigned by the standard\nlibrary\u2019s python:tempfile module.

\n

Temp files returned by this method are not guaranteed to be reusable across solid\nboundaries. For files that must be available across solid boundaries, use the\nread(),\nread_data(),\nwrite(), and\nwrite_data() methods.

\n
\n
Parameters:
\n

file_handle (FileHandle) \u2013 The handle to the file to make available as a local temp file.

\n
\n
Returns:
\n

Path to the local temp file.

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n
\nabstract delete_local_temp()[source]\u00b6
\n

Delete all local temporary files created by previous calls to\ncopy_handle_to_local_temp().

\n

Should typically only be called by framework implementors.

\n
\n\n
\n
\nabstract read(file_handle, mode='rb')[source]\u00b6
\n

Return a file-like stream for the file handle.

\n

This may incur an expensive network call for file managers backed by object stores\nsuch as S3.

\n
\n
Parameters:
\n
    \n
  • file_handle (FileHandle) \u2013 The file handle to make available as a stream.

  • \n
  • mode (str) \u2013 The mode in which to open the file. Default: "rb".

  • \n
\n
\n
Returns:
\n

A file-like stream.

\n
\n
Return type:
\n

Union[TextIO, BinaryIO]

\n
\n
\n
\n\n
\n
\nabstract read_data(file_handle)[source]\u00b6
\n

Return the bytes for a given file handle. This may incur an expensive network\ncall for file managers backed by object stores such as s3.

\n
\n
Parameters:
\n

file_handle (FileHandle) \u2013 The file handle for which to return bytes.

\n
\n
Returns:
\n

Bytes for a given file handle.

\n
\n
Return type:
\n

bytes

\n
\n
\n
\n\n
\n
\nabstract write(file_obj, mode='wb', ext=None)[source]\u00b6
\n

Write the bytes contained within the given file object into the file manager.

\n
\n
Parameters:
\n
    \n
  • file_obj (Union[TextIO, StringIO]) \u2013 A file-like object.

  • \n
  • mode (Optional[str]) \u2013 The mode in which to write the file into the file manager.\nDefault: "wb".

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns:
\n

A handle to the newly created file.

\n
\n
Return type:
\n

FileHandle

\n
\n
\n
\n\n
\n
\nabstract write_data(data, ext=None)[source]\u00b6
\n

Write raw bytes into the file manager.

\n
\n
Parameters:
\n
    \n
  • data (bytes) \u2013 The bytes to write into the file manager.

  • \n
  • ext (Optional[str]) \u2013 For file managers that support file extensions, the extension with\nwhich to write the file. Default: None.

  • \n
\n
\n
Returns:
\n

A handle to the newly created file.

\n
\n
Return type:
\n

FileHandle

\n
\n
\n
\n\n
\n\n
\n
\ndagster.local_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to a local filesystem.

\n

By default, files will be stored in <local_artifact_storage>/storage/file_manager where\n<local_artifact_storage> can be configured in the dagster.yaml file in $DAGSTER_HOME.

\n

Implements the FileManager API.

\n

Examples

\n
import tempfile\n\nfrom dagster import job, local_file_manager, op\n\n\n@op(required_resource_keys={"file_manager"})\ndef write_files(context):\n    fh_1 = context.resources.file_manager.write_data(b"foo")\n\n    with tempfile.NamedTemporaryFile("w+") as fd:\n        fd.write("bar")\n        fd.seek(0)\n        fh_2 = context.resources.file_manager.write(fd, mode="w", ext=".txt")\n\n    return (fh_1, fh_2)\n\n\n@op(required_resource_keys={"file_manager"})\ndef read_files(context, file_handles):\n    fh_1, fh_2 = file_handles\n    assert context.resources.file_manager.read_data(fh_2) == b"bar"\n    fd = context.resources.file_manager.read(fh_1, mode="r")\n    assert fd.read() == "foo"\n    fd.close()\n\n\n@job(resource_defs={"file_manager": local_file_manager})\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n

Or to specify the file directory:

\n
@job(\n    resource_defs={\n        "file_manager": local_file_manager.configured({"base_dir": "/my/base/dir"})\n    }\n)\ndef files_pipeline():\n    read_files(write_files())\n
\n
\n
\n\n
\n
\nclass dagster.FileHandle[source]\u00b6
\n

A reference to a file as manipulated by a FileManager.

\n

Subclasses may handle files that are resident on the local file system, in an object store, or\nin any arbitrary place where a file can be stored.

\n

This exists to handle the very common case where you wish to write a computation that reads,\ntransforms, and writes files, but where you also want the same code to work in local development\nas well as on a cluster where the files will be stored in a globally available object store\nsuch as S3.

\n
\n
\nabstract property path_desc\u00b6
\n

A representation of the file path for display purposes only.

\n
\n\n
\n\n
\n
\nclass dagster.LocalFileHandle(path)[source]\u00b6
\n

A reference to a file on a local filesystem.

\n
\n
\nproperty path\u00b6
\n

The file\u2019s path.

\n
\n\n
\n
\nproperty path_desc\u00b6
\n

A representation of the file path for display purposes only.

\n
\n\n
\n\n
\n
\n
\n

Instance\u00b6

\n
\n
\nclass dagster.DagsterInstance(instance_type, local_artifact_storage, run_storage, event_storage, run_coordinator, compute_log_manager, run_launcher, scheduler=None, schedule_storage=None, settings=None, secrets_loader=None, ref=None, **_kwargs)[source]\u00b6
\n

Core abstraction for managing Dagster\u2019s access to storage and other resources.

\n

Use DagsterInstance.get() to grab the current DagsterInstance which will load based on\nthe values in the dagster.yaml file in $DAGSTER_HOME.

\n

Alternatively, DagsterInstance.ephemeral() can be used, which provides a set of\ntransient in-memory components.

\n

Configuration of this class should be done by setting values in $DAGSTER_HOME/dagster.yaml.\nFor example, to use Postgres for dagster storage, you can write a dagster.yaml such as the\nfollowing:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n
\n
Parameters:
\n
    \n
  • instance_type (InstanceType) \u2013 Indicates whether the instance is ephemeral or persistent.\nUsers should not attempt to set this value directly or in their dagster.yaml files.

  • \n
  • local_artifact_storage (LocalArtifactStorage) \u2013 The local artifact storage is used to\nconfigure storage for any artifacts that require a local disk, such as schedules, or\nwhen using the filesystem system storage to manage files and intermediates. By default,\nthis will be a dagster._core.storage.root.LocalArtifactStorage. Configurable\nin dagster.yaml using the ConfigurableClass\nmachinery.

  • \n
  • run_storage (RunStorage) \u2013 The run storage is used to store metadata about ongoing and past\npipeline runs. By default, this will be a\ndagster._core.storage.runs.SqliteRunStorage. Configurable in dagster.yaml\nusing the ConfigurableClass machinery.

  • \n
  • event_storage (EventLogStorage) \u2013 Used to store the structured event logs generated by\npipeline runs. By default, this will be a\ndagster._core.storage.event_log.SqliteEventLogStorage. Configurable in\ndagster.yaml using the ConfigurableClass machinery.

  • \n
  • compute_log_manager (Optional[ComputeLogManager]) \u2013 The compute log manager handles stdout\nand stderr logging for op compute functions. By default, this will be a\ndagster._core.storage.local_compute_log_manager.LocalComputeLogManager.\nConfigurable in dagster.yaml using the\nConfigurableClass machinery.

  • \n
  • run_coordinator (Optional[RunCoordinator]) \u2013 A runs coordinator may be used to manage the execution\nof pipeline runs.

  • \n
  • run_launcher (Optional[RunLauncher]) \u2013 Optionally, a run launcher may be used to enable\na Dagster instance to launch pipeline runs, e.g. on a remote Kubernetes cluster, in\naddition to running them locally.

  • \n
  • settings (Optional[Dict]) \u2013 Specifies certain per-instance settings,\nsuch as feature flags. These are set in the dagster.yaml under a set of whitelisted\nkeys.

  • \n
  • ref (Optional[InstanceRef]) \u2013 Used by internal machinery to pass instances across process\nboundaries.

  • \n
\n
\n
\n
\n
\nadd_dynamic_partitions(partitions_def_name, partition_keys)[source]\u00b6
\n

Add partitions to the specified DynamicPartitionsDefinition idempotently.\nDoes not add any partitions that already exist.

\n
\n
Parameters:
\n
    \n
  • partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

  • \n
  • partition_keys (Sequence[str]) \u2013 Partition keys to add.

  • \n
\n
\n
\n
\n\n
\n
\ndelete_dynamic_partition(partitions_def_name, partition_key)[source]\u00b6
\n

Delete a partition for the specified DynamicPartitionsDefinition.\nIf the partition does not exist, exits silently.

\n
\n
Parameters:
\n
    \n
  • partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

  • \n
  • partition_key (str) \u2013 Partition key to delete.

  • \n
\n
\n
\n
\n\n
\n
\ndelete_run(run_id)[source]\u00b6
\n

Delete a run and all events generated by that run from storage.

\n
\n
Parameters:
\n

run_id (str) \u2013 The id of the run to delete.

\n
\n
\n
\n\n
\n
\nstatic ephemeral(tempdir=None, preload=None, settings=None)[source]\u00b6
\n

Create a DagsterInstance suitable for ephemeral execution, useful in test contexts. An\nephemeral instance uses mostly in-memory components. Use local_temp to create a test\ninstance that is fully persistent.

\n
\n
Parameters:
\n
    \n
  • tempdir (Optional[str]) \u2013 The path of a directory to be used for local artifact storage.

  • \n
  • preload (Optional[Sequence[DebugRunPayload]]) \u2013 A sequence of payloads to load into the\ninstance\u2019s run storage. Useful for debugging.

  • \n
  • settings (Optional[Dict]) \u2013 Settings for the instance.

  • \n
\n
\n
Returns:
\n

An ephemeral DagsterInstance.

\n
\n
Return type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nstatic get()[source]\u00b6
\n

Get the current DagsterInstance as specified by the DAGSTER_HOME environment variable.

\n
\n
Returns:
\n

The current DagsterInstance.

\n
\n
Return type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nget_asset_keys(prefix=None, limit=None, cursor=None)[source]\u00b6
\n

Return a filtered subset of asset keys managed by this instance.

\n
\n
Parameters:
\n
    \n
  • prefix (Optional[Sequence[str]]) \u2013 Return only assets having this key prefix.

  • \n
  • limit (Optional[int]) \u2013 Maximum number of keys to return.

  • \n
  • cursor (Optional[str]) \u2013 Cursor to use for pagination.

  • \n
\n
\n
Returns:
\n

List of asset keys.

\n
\n
Return type:
\n

Sequence[AssetKey]

\n
\n
\n
\n\n
\n
\nget_asset_records(asset_keys=None)[source]\u00b6
\n

Return an AssetRecord for each of the given asset keys.

\n
\n
Parameters:
\n

asset_keys (Optional[Sequence[AssetKey]]) \u2013 List of asset keys to retrieve records for.

\n
\n
Returns:
\n

List of asset records.

\n
\n
Return type:
\n

Sequence[AssetRecord]

\n
\n
\n
\n\n
\n
\nget_dynamic_partitions(partitions_def_name)[source]\u00b6
\n

Get the set of partition keys for the specified DynamicPartitionsDefinition.

\n
\n
Parameters:
\n

partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

\n
\n
\n
\n\n
\n
\nget_event_records(event_records_filter, limit=None, ascending=False)[source]\u00b6
\n

Return a list of event records stored in the event log storage.

\n
\n
Parameters:
\n
    \n
  • event_records_filter (Optional[EventRecordsFilter]) \u2013 the filter by which to filter event\nrecords.

  • \n
  • limit (Optional[int]) \u2013 Number of results to get. Defaults to infinite.

  • \n
  • ascending (Optional[bool]) \u2013 Sort the result in ascending order if True, descending\notherwise. Defaults to descending.

  • \n
\n
\n
Returns:
\n

List of event log records stored in the event log storage.

\n
\n
Return type:
\n

List[EventLogRecord]

\n
\n
\n
\n\n
\n
\nget_latest_materialization_code_versions(asset_keys)[source]\u00b6
\n

Returns the code version used for the latest materialization of each of the provided\nassets.

\n
\n
Parameters:
\n

asset_keys (Iterable[AssetKey]) \u2013 The asset keys to find latest materialization code\nversions for.

\n
\n
Returns:
\n

\n
A dictionary with a key for each of the provided asset

keys. The values will be None if the asset has no materializations. If an asset does\nnot have a code version explicitly assigned to its definitions, but was\nmaterialized, Dagster assigns the run ID as its code version.

\n
\n
\n

\n
\n
Return type:
\n

Mapping[AssetKey, Optional[str]]

\n
\n
\n
\n\n
\n
\nget_latest_materialization_event(asset_key)[source]\u00b6
\n

Fetch the latest materialization event for the given asset key.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 Asset key to return materialization for.

\n
\n
Returns:
\n

\n
The latest materialization event for the given asset

key, or None if the asset has not been materialized.

\n
\n
\n

\n
\n
Return type:
\n

Optional[AssetMaterialization]

\n
\n
\n
\n\n
\n
\nget_run_by_id(run_id)[source]\u00b6
\n

Get a DagsterRun matching the provided run_id.

\n
\n
Parameters:
\n

run_id (str) \u2013 The id of the run to retrieve.

\n
\n
Returns:
\n

\n
The run corresponding to the given id. If no run matching the id

is found, return None.

\n
\n
\n

\n
\n
Return type:
\n

Optional[DagsterRun]

\n
\n
\n
\n\n
\n
\nget_run_record_by_id(run_id)[source]\u00b6
\n

Get a RunRecord matching the provided run_id.

\n
\n
Parameters:
\n

run_id (str) \u2013 The id of the run record to retrieve.

\n
\n
Returns:
\n

\n
The run record corresponding to the given id. If no run matching

the id is found, return None.

\n
\n
\n

\n
\n
Return type:
\n

Optional[RunRecord]

\n
\n
\n
\n\n
\n
\nget_run_records(filters=None, limit=None, order_by=None, ascending=False, cursor=None, bucket_by=None)[source]\u00b6
\n

Return a list of run records stored in the run storage, sorted by the given column in given order.

\n
\n
Parameters:
\n
    \n
  • filters (Optional[RunsFilter]) \u2013 the filter by which to filter runs.

  • \n
  • limit (Optional[int]) \u2013 Number of results to get. Defaults to infinite.

  • \n
  • order_by (Optional[str]) \u2013 Name of the column to sort by. Defaults to id.

  • \n
  • ascending (Optional[bool]) \u2013 Sort the result in ascending order if True, descending\notherwise. Defaults to descending.

  • \n
\n
\n
Returns:
\n

List of run records stored in the run storage.

\n
\n
Return type:
\n

List[RunRecord]

\n
\n
\n
\n\n
\n
\nget_status_by_partition(asset_key, partition_keys, partitions_def)[source]\u00b6
\n

Get the current status of provided partition_keys for the provided asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to get per-partition status for.

  • \n
  • partition_keys (Sequence[str]) \u2013 The partitions to get status for.

  • \n
  • partitions_def (PartitionsDefinition) \u2013 The PartitionsDefinition of the asset to get\nper-partition status for.

  • \n
\n
\n
Returns:
\n

status for each partition key

\n
\n
Return type:
\n

Optional[Mapping[str, AssetPartitionStatus]]

\n
\n
\n
\n\n
\n
\nhas_asset_key(asset_key)[source]\u00b6
\n

Return true if this instance manages the given asset key.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 Asset key to check.

\n
\n
\n
\n\n
\n
\nhas_dynamic_partition(partitions_def_name, partition_key)[source]\u00b6
\n

Check if a partition key exists for the DynamicPartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • partitions_def_name (str) \u2013 The name of the DynamicPartitionsDefinition.

  • \n
  • partition_key (str) \u2013 Partition key to check.

  • \n
\n
\n
\n
\n\n
\n
\nstatic local_temp(tempdir=None, overrides=None)[source]\u00b6
\n

Create a DagsterInstance that uses a temporary directory for local storage. This is a\nregular, fully persistent instance. Use ephemeral to get an ephemeral instance with\nin-memory components.

\n
\n
Parameters:
\n
    \n
  • tempdir (Optional[str]) \u2013 The path of a directory to be used for local artifact storage.

  • \n
  • overrides (Optional[DagsterInstanceOverrides]) \u2013 Override settings for the instance.

  • \n
\n
\n
Returns:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nwipe_assets(asset_keys)[source]\u00b6
\n

Wipes asset event history from the event log for the given asset keys.

\n
\n
Parameters:
\n

asset_keys (Sequence[AssetKey]) \u2013 Asset keys to wipe.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster._core.instance.InstanceRef(local_artifact_storage_data, compute_logs_data, scheduler_data, run_coordinator_data, run_launcher_data, settings, run_storage_data, event_storage_data, schedule_storage_data, custom_instance_class_data=None, storage_data=None, secrets_loader_data=None)[source]\u00b6
\n

Serializable representation of a DagsterInstance.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster._serdes.ConfigurableClass[source]\u00b6
\n

Abstract mixin for classes that can be loaded from config.

\n

This supports a powerful plugin pattern which avoids both a) a lengthy, hard-to-synchronize list\nof conditional imports / optional extras_requires in dagster core and b) a magic directory or\nfile in which third parties can place plugin packages. Instead, the intention is to make, e.g.,\nrun storage, pluggable with a config chunk like:

\n
run_storage:\n    module: very_cool_package.run_storage\n    class: SplendidRunStorage\n    config:\n        magic_word: "quux"\n
\n
\n

This same pattern should eventually be viable for other system components, e.g. engines.

\n

The ConfigurableClass mixin provides the necessary hooks for classes to be instantiated from\nan instance of ConfigurableClassData.

\n

Pieces of the Dagster system which we wish to make pluggable in this way should consume a config\ntype such as:

\n
{'module': str, 'class': str, 'config': Field(Permissive())}\n
\n
\n
\n\n
\n
\nclass dagster._serdes.ConfigurableClassData(module_name, class_name, config_yaml)[source]\u00b6
\n

Serializable tuple describing where to find a class and the config fragment that should\nbe used to instantiate it.

\n

Users should not instantiate this class directly.

\n

Classes intended to be serialized in this way should implement the\ndagster._serdes.ConfigurableClass mixin.

\n
\n\n
\n
\nclass dagster._core.storage.root.LocalArtifactStorage(base_dir, inst_data=None)[source]\u00b6
\n
\n\n
\n
\n
\n

Storage\u00b6

\n
\n
\nclass dagster._core.storage.base_storage.DagsterStorage[source]\u00b6
\n

Abstract base class for Dagster persistent storage, for reading and writing data for runs,\nevents, and schedule/sensor state.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagster-webserver and dagster-daemon load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\n
\n

Run storage\u00b6

\n
\n
\nclass dagster.DagsterRun(job_name, run_id=None, run_config=None, asset_selection=None, asset_check_selection=None, op_selection=None, resolved_op_selection=None, step_keys_to_execute=None, status=None, tags=None, root_run_id=None, parent_run_id=None, job_snapshot_id=None, execution_plan_snapshot_id=None, external_job_origin=None, job_code_origin=None, has_repository_load_data=None)[source]\u00b6
\n

Serializable internal representation of a dagster run, as stored in a\nRunStorage.

\n
\n
\nproperty is_failure\u00b6
\n

If this run has failed.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_failure_or_canceled\u00b6
\n

If this run has either failed or was canceled.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_finished\u00b6
\n

If this run has completely finished execution.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_resume_retry\u00b6
\n

If this run was created from retrying another run from the point of failure.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty is_success\u00b6
\n

If this run has successfully finished executing.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterRunStatus(value)[source]\u00b6
\n

The status of run execution.

\n
\n\n
\n
\nclass dagster.RunsFilter(run_ids=None, job_name=None, statuses=None, tags=None, snapshot_id=None, updated_after=None, updated_before=None, created_after=None, created_before=None)[source]\u00b6
\n

Defines a filter across job runs, for use when querying storage directly.

\n

Each field of the RunsFilter represents a logical AND with each other. For\nexample, if you specify job_name and tags, then you will receive only runs\nwith the specified job_name AND the specified tags. If left blank, then\nall values will be permitted for that field.

\n
\n
Parameters:
\n
    \n
  • run_ids (Optional[List[str]]) \u2013 A list of job run_id values.

  • \n
  • job_name (Optional[str]) \u2013 Name of the job to query for. If blank, all job_names will be accepted.

  • \n
  • statuses (Optional[List[DagsterRunStatus]]) \u2013 A list of run statuses to filter by. If blank, all run statuses will be allowed.

  • \n
  • tags (Optional[Dict[str, Union[str, List[str]]]]) \u2013 A dictionary of run tags to query by. All tags specified here must be present for a given run to pass the filter.

  • \n
  • snapshot_id (Optional[str]) \u2013 The ID of the job snapshot to query for. Intended for internal use.

  • \n
  • updated_after (Optional[DateTime]) \u2013 Filter by runs that were last updated after this datetime.

  • \n
  • created_before (Optional[DateTime]) \u2013 Filter by runs that were created before this datetime.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster._core.storage.runs.RunStorage[source]\u00b6
\n

Abstract base class for storing pipeline run history.

\n

Note that run storages using SQL databases as backing stores should implement\nSqlRunStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagster-webserver and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster._core.storage.runs.SqlRunStorage[source]\u00b6
\n

Base class for SQL based run storages.

\n
\n\n
\n
\nclass dagster._core.storage.runs.SqliteRunStorage(conn_string, inst_data=None)[source]\u00b6
\n

SQLite-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default run storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for run storage, you can add a block such as the following to your\ndagster.yaml:

\n
run_storage:\n  module: dagster._core.storage.runs\n  class: SqliteRunStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the run storage where on disk to store the database.

\n
\n\n
\n
\nclass dagster._core.storage.dagster_run.RunRecord(storage_id, dagster_run, create_timestamp, update_timestamp, start_time=None, end_time=None)[source]\u00b6
\n

Internal representation of a run record, as stored in a\nRunStorage.

\n

Users should not invoke this class directly.

\n
\n\n

See also: dagster_postgres.PostgresRunStorage and dagster_mysql.MySQLRunStorage.

\n
\n
\n
\n

Event log storage\u00b6

\n
\n
\nclass dagster.EventLogEntry(error_info, level, user_message, run_id, timestamp, step_key=None, job_name=None, dagster_event=None)[source]\u00b6
\n

Entries in the event log.

\n

Users should not instantiate this object directly. These entries may originate from the logging machinery (DagsterLogManager/context.log), from\nframework events (e.g. EngineEvent), or they may correspond to events yielded by user code\n(e.g. Output).

\n
\n
Parameters:
\n
    \n
  • error_info (Optional[SerializableErrorInfo]) \u2013 Error info for an associated exception, if\nany, as generated by serializable_error_info_from_exc_info and friends.

  • \n
  • level (Union[str, int]) \u2013 The Python log level at which to log this event. Note that\nframework and user code events are also logged to Python logging. This value may be an\ninteger or a (case-insensitive) string member of PYTHON_LOGGING_LEVELS_NAMES.

  • \n
  • user_message (str) \u2013 For log messages, this is the user-generated message.

  • \n
  • run_id (str) \u2013 The id of the run which generated this event.

  • \n
  • timestamp (float) \u2013 The Unix timestamp of this event.

  • \n
  • step_key (Optional[str]) \u2013 The step key for the step which generated this event. Some events\nare generated outside of a step context.

  • \n
  • job_name (Optional[str]) \u2013 The job which generated this event. Some events are\ngenerated outside of a job context.

  • \n
  • dagster_event (Optional[DagsterEvent]) \u2013 For framework and user events, the associated\nstructured event.

  • \n
\n
\n
\n
\n
\nproperty dagster_event_type\u00b6
\n

The type of the DagsterEvent contained by this entry, if any.

\n
\n
Type:
\n

Optional[DagsterEventType]

\n
\n
\n
\n\n
\n
\nget_dagster_event()[source]\u00b6
\n

DagsterEvent: Returns the DagsterEvent contained within this entry. If this entry does not\ncontain a DagsterEvent, an error will be raised.

\n
\n\n
\n
\nproperty is_dagster_event\u00b6
\n

If this entry contains a DagsterEvent.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty message\u00b6
\n

Return the message from the structured DagsterEvent if present, fallback to user_message.

\n
\n\n
\n\n
\n
\nclass dagster.EventLogRecord(storage_id, event_log_entry)[source]\u00b6
\n

Internal representation of an event record, as stored in a\nEventLogStorage.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster.EventRecordsFilter(event_type, asset_key=None, asset_partitions=None, after_cursor=None, before_cursor=None, after_timestamp=None, before_timestamp=None, storage_ids=None, tags=None)[source]\u00b6
\n

Defines a set of filter fields for fetching a set of event log entries or event log records.

\n
\n
Parameters:
\n
    \n
  • event_type (DagsterEventType) \u2013 Filter argument for dagster event type

  • \n
  • asset_key (Optional[AssetKey]) \u2013 Asset key for which to get asset materialization event\nentries / records.

  • \n
  • asset_partitions (Optional[List[str]]) \u2013 Filter parameter such that only asset\nevents with a partition value matching one of the provided values are returned. Only\nvalid when the asset_key parameter is provided.

  • \n
  • after_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that only\nrecords with storage_id greater than the provided value are returned. Using a\nrun-sharded events cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • before_cursor (Optional[Union[int, RunShardedEventsCursor]]) \u2013 Filter parameter such that\nrecords with storage_id less than the provided value are returned. Using a run-sharded\nevents cursor will result in a significant performance gain when run against\na SqliteEventLogStorage implementation (which is run-sharded)

  • \n
  • after_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp greater than the provided value are returned.

  • \n
  • before_timestamp (Optional[float]) \u2013 Filter parameter such that only event records for\nevents with timestamp less than the provided value are returned.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RunShardedEventsCursor(id, run_updated_after)[source]\u00b6
\n

Pairs an id-based event log cursor with a timestamp-based run cursor, for improved\nperformance on run-sharded event log storages (e.g. the default SqliteEventLogStorage). For\nrun-sharded storages, the id field is ignored, since they may not be unique across shards.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.EventLogStorage[source]\u00b6
\n

Abstract base class for storing structured event logs from pipeline runs.

\n

Note that event log storages using SQL databases as backing stores should implement\nSqlEventLogStorage.

\n

Users should not directly instantiate concrete subclasses of this class; they are instantiated\nby internal machinery when dagster-webserver and dagster-graphql load, based on the values in the\ndagster.yaml file in $DAGSTER_HOME. Configuration of concrete subclasses of this class\nshould be done by setting values in that file.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.SqlEventLogStorage[source]\u00b6
\n

Base class for SQL backed event log storages.

\n

Distinguishes between run-based connections and index connections in order to support run-level\nsharding, while maintaining the ability to do cross-run queries

\n
\n\n
\n
\nclass dagster._core.storage.event_log.SqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

This is the default event log storage when none is specified in the dagster.yaml.

\n

To explicitly specify SQLite for event log storage, you can add a block such as the following\nto your dagster.yaml:

\n
event_log_storage:\n  module: dagster._core.storage.event_log\n  class: SqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the databases. To\nimprove concurrent performance, event logs are stored in a separate SQLite database for each\nrun.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.ConsolidatedSqliteEventLogStorage(base_dir, inst_data=None)[source]\u00b6
\n

SQLite-backed consolidated event log storage intended for test cases only.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To explicitly specify the consolidated SQLite for event log storage, you can add a block such as\nthe following to your dagster.yaml:

\n
event_log_storage:\n  module: dagster._core.storage.event_log\n  class: ConsolidatedSqliteEventLogStorage\n  config:\n    base_dir: /path/to/dir\n
\n
\n

The base_dir param tells the event log storage where on disk to store the database.

\n
\n\n
\n
\nclass dagster._core.storage.event_log.AssetRecord(storage_id, asset_entry)[source]\u00b6
\n

Internal representation of an asset record, as stored in a EventLogStorage.

\n

Users should not invoke this class directly.

\n
\n\n

See also: dagster_postgres.PostgresEventLogStorage and dagster_mysql.MySQLEventLogStorage.

\n
\n
\n
\n

Compute log manager\u00b6

\n
\n
\nclass dagster._core.storage.captured_log_manager.CapturedLogManager[source]\u00b6
\n

Abstract base class for capturing the unstructured logs (stdout/stderr) in the current\nprocess, stored / retrieved with a provided log_key.

\n
\n\n
\n
\nclass dagster._core.storage.compute_log_manager.ComputeLogManager[source]\u00b6
\n

Abstract base class for storing unstructured compute logs (stdout/stderr) from the compute\nsteps of pipeline solids.

\n
\n\n
\n
\nclass dagster._core.storage.local_compute_log_manager.LocalComputeLogManager(base_dir, polling_timeout=None, inst_data=None)[source]\u00b6
\n

Stores copies of stdout & stderr for each compute step locally on disk.

\n
\n\n
\n
\nclass dagster._core.storage.noop_compute_log_manager.NoOpComputeLogManager(inst_data=None)[source]\u00b6
\n

When enabled for a Dagster instance, stdout and stderr will not be available for any step.

\n
\n\n

See also: dagster_aws.S3ComputeLogManager.

\n
\n
\n
\n

Run launcher\u00b6

\n
\n
\nclass dagster._core.launcher.RunLauncher[source]\u00b6
\n
\n\n
\n
\nclass dagster._core.launcher.DefaultRunLauncher(inst_data=None)[source]\u00b6
\n

Launches runs against running GRPC servers.

\n
\n\n
\n
\n
\n

Run coordinator\u00b6

\n
\n
\nclass dagster._core.run_coordinator.DefaultRunCoordinator(inst_data=None)[source]\u00b6
\n

Immediately send runs to the run launcher.

\n
\n\n
\n
\ndagster._core.run_coordinator.QueuedRunCoordinator RunCoordinator[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_concurrent_runs (dagster.IntSource, optional):
\n

The maximum number of runs that are allowed to be in progress at once. Defaults to 10. Set to -1 to disable the limit. Set to 0 to stop any runs from launching. Any other negative values are disallowed.

\n
\n
tag_concurrency_limits (Union[List[strict dict], None], optional):
\n

A set of limits that are applied to runs with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key.

\n
\n
dequeue_interval_seconds (dagster.IntSource, optional):
\n

The interval in seconds at which the Dagster Daemon should periodically check the run queue for new runs to launch.

\n
\n
dequeue_use_threads (Bool, optional):
\n

Whether or not to use threads for concurrency when launching dequeued runs.

\n
\n
dequeue_num_workers (dagster.IntSource, optional):
\n

If dequeue_use_threads is true, limit the number of concurrent worker threads.

\n
\n
max_user_code_failure_retries (dagster.IntSource, optional):
\n

If there is an error reaching a Dagster gRPC server while dequeuing the run, how many times to retry the dequeue before failing it. The only run launcher that requires the gRPC server to be running is the DefaultRunLauncher, so setting this will have no effect unless that run launcher is being used.

\n

Default Value: 0

\n
\n
user_code_failure_retry_delay (dagster.IntSource, optional):
\n

If there is an error reaching a Dagster gRPC server while dequeuing the run, how long to wait before retrying any runs from that same code location. The only run launcher that requires the gRPC server to be running is the DefaultRunLauncher, so setting this will have no effect unless that run launcher is being used.

\n

Default Value: 60

\n
\n
\n

Enqueues runs via the run storage, to be dequeued by the Dagster Daemon process. Requires\nthe Dagster Daemon process to be alive in order for runs to be launched.

\n
\n\n
\n
\n
\n

Scheduling\u00b6

\n
\n
\nclass dagster._core.scheduler.Scheduler[source]\u00b6
\n

Abstract base class for a scheduler. This component is responsible for interfacing with\nan external system such as cron to ensure repeated execution according to the schedule.

\n
\n\n
\n
\nclass dagster._core.storage.schedules.ScheduleStorage[source]\u00b6
\n

Abstract class for managing persistence of scheduler artifacts.

\n
\n\n
\n
\nclass dagster._core.storage.schedules.SqlScheduleStorage[source]\u00b6
\n

Base class for SQL backed schedule storage.

\n
\n\n
\n
\nclass dagster._core.storage.schedules.SqliteScheduleStorage(conn_string, inst_data=None)[source]\u00b6
\n

Local SQLite backed schedule storage.

\n
\n\n

See also: dagster_postgres.PostgresScheduleStorage and dagster_mysql.MySQLScheduleStorage.

\n
\n
\n
\n

Exception handling\u00b6

\n
\n
\ndagster._core.errors.user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs)[source]\u00b6
\n

Wraps the execution of user-space code in an error boundary. This places a uniform\npolicy around any user code invoked by the framework. This ensures that all user\nerrors are wrapped in an exception derived from DagsterUserCodeExecutionError,\nand that the original stack trace of the user error is preserved, so that it\ncan be reported without confusing framework code in the stack trace, if a\ntool author wishes to do so.

\n

Example:

\n
with user_code_error_boundary(\n    # Pass a class that inherits from DagsterUserCodeExecutionError\n    DagsterExecutionStepExecutionError,\n    # Pass a function that produces a message\n    lambda: "Error occurred during step execution",\n):\n    call_user_provided_function()\n

\n
\n
\n
\n
\n\n
\n
\n
\n

Step Launchers (Experimental)\u00b6

\n
\n
\nclass dagster.StepLauncher[source]\u00b6
\n

A StepLauncher is responsible for executing steps, either in-process or in an external process.

\n
\n\n
\n
\nclass dagster.StepRunRef(run_config, dagster_run, run_id, retry_mode, step_key, recon_job, known_state)[source]\u00b6
\n

A serializable object that specifies what\u2019s needed to hydrate a step so\nthat it can be executed in a process outside the plan process.

\n

Users should not instantiate this class directly.

\n
\n\n
\n
\nclass dagster.StepExecutionContext(plan_data, execution_data, log_manager, step, output_capture, known_state)[source]\u00b6
\n

Context for the execution of a step. Users should not instantiate this class directly.

\n

This context assumes that user code can be run directly, and thus includes resource and information.

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/internals", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../jobs/", "title": "Jobs"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../hooks/", "title": "Hooks"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/jobs", "Jobs", "N", "next"], ["sections/api/apidocs/hooks", "Hooks", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/internals.rst.txt", "title": "Internals", "toc": "\n"}, "io-managers": {"alabaster_version": "0.7.13", "body": "
\n

IO Managers\u00b6

\n

IO managers are user-provided objects that store op outputs and load them as inputs to downstream\nops.

\n
\n
\nclass dagster.ConfigurableIOManager[source]\u00b6
\n

Base class for Dagster IO managers that utilize structured config.

\n

This class is a subclass of IOManagerDefinition, Config,\nand IOManager. Implementers must provide an implementation of the\nhandle_output() and load_input() methods.

\n

Example definition:

\n
class MyIOManager(ConfigurableIOManager):\n    path_prefix: List[str]\n\n    def _get_path(self, context) -> str:\n        return "/".join(context.asset_key.path)\n\n    def handle_output(self, context, obj):\n        write_csv(self._get_path(context), obj)\n\n    def load_input(self, context):\n        return read_csv(self._get_path(context))\n\ndefs = Definitions(\n    ...,\n    resources={\n        "io_manager": MyIOManager(path_prefix=["my", "prefix"])\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster.ConfigurableIOManagerFactory[source]\u00b6
\n

Base class for Dagster IO managers that utilize structured config. This base class\nis useful for cases in which the returned IO manager is not the same as the class itself\n(e.g. when it is a wrapper around the actual IO manager implementation).

\n

This class is a subclass of both IOManagerDefinition and Config.\nImplementers should provide an implementation of the create_io_manager() method,\nwhich should return an instance of IOManager.

\n

Example definition:

\n
class ExternalIOManager(IOManager):\n\n    def __init__(self, connection):\n        self._connection = connection\n\n    def handle_output(self, context, obj):\n        ...\n\n    def load_input(self, context):\n        ...\n\nclass ConfigurableExternalIOManager(ConfigurableIOManagerFactory):\n    username: str\n    password: str\n\n    def create_io_manager(self, context) -> IOManager:\n        with database.connect(self.username, self.password) as connection:\n            return ExternalIOManager(connection)\n\ndefs = Definitions(\n    ...,\n    resources={\n        "io_manager": ConfigurableExternalIOManager(\n            username="dagster",\n            password=EnvVar("DB_PASSWORD")\n        )\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster.IOManager[source]\u00b6
\n

Base class for user-provided IO managers.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

Extend this class to handle how objects are loaded and stored. Users should implement\nhandle_output to store an object and load_input to retrieve an object.

\n
\n
\nabstract handle_output(context, obj)[source]\u00b6
\n

User-defined method that stores an output of an op.

\n
\n
Parameters:
\n
    \n
  • context (OutputContext) \u2013 The context of the step output that produces this object.

  • \n
  • obj (Any) \u2013 The object, returned by the op, to be stored.

  • \n
\n
\n
\n
\n\n
\n
\nabstract load_input(context)[source]\u00b6
\n

User-defined method that loads an input to an op.

\n
\n
Parameters:
\n

context (InputContext) \u2013 The input context, which describes the input that\u2019s being loaded\nand the upstream output that\u2019s being loaded from.

\n
\n
Returns:
\n

The data object.

\n
\n
Return type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.IOManagerDefinition(resource_fn, config_schema=None, description=None, required_resource_keys=None, version=None, input_config_schema=None, output_config_schema=None)[source]\u00b6
\n

Definition of an IO manager resource.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

An IOManagerDefinition is a ResourceDefinition whose resource_fn returns an\nIOManager.

\n

The easiest way to create an IOManagerDefinition is with the @io_manager\ndecorator.

\n
\n
\nstatic hardcoded_io_manager(value, description=None)[source]\u00b6
\n

A helper function that creates an IOManagerDefinition with a hardcoded IOManager.

\n
\n
Parameters:
\n
    \n
  • value (IOManager) \u2013 A hardcoded IO Manager which helps mock the definition.

  • \n
  • description ([Optional[str]]) \u2013 The description of the IO Manager. Defaults to None.

  • \n
\n
\n
Returns:
\n

A hardcoded resource.

\n
\n
Return type:
\n

[IOManagerDefinition]

\n
\n
\n
\n\n
\n\n
\n
\n@dagster.io_manager(config_schema=None, description=None, output_config_schema=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define an IO manager.

\n

IOManagers are used to store op outputs and load them as inputs to downstream ops.

\n

The decorated function should accept an InitResourceContext and return an\nIOManager.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource config. Configuration\ndata available in init_context.resource_config. If not set, Dagster will accept any\nconfig provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • output_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-output config. If not set,\nno per-output configuration will be allowed.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 The schema for per-input config. If not set,\nDagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the IO\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
\n
\n
\n

Examples:

\n
class MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        write_csv("some/path", obj)\n\n    def load_input(self, context):\n        return read_csv("some/path")\n\n@io_manager\ndef my_io_manager(init_context):\n    return MyIOManager()\n\n@op(out=Out(io_manager_key="my_io_manager_key"))\ndef my_op(_):\n    return do_stuff()\n\n@job(resource_defs={"my_io_manager_key": my_io_manager})\ndef my_job():\n    my_op()\n
\n
\n
\n\n
\n

Input and Output Contexts\u00b6

\n
\n
\nclass dagster.InputContext(*, name=None, job_name=None, op_def=None, config=None, metadata=None, upstream_output=None, dagster_type=None, log_manager=None, resource_config=None, resources=None, step_context=None, asset_key=None, partition_key=None, asset_partitions_subset=None, asset_partitions_def=None, instance=None)[source]\u00b6
\n

The context object available to the load_input method of InputManager.

\n

Users should not instantiate this object directly. In order to construct\nan InputContext for testing an IO Manager\u2019s load_input method, use\ndagster.build_input_context().

\n

Example

\n
from dagster import IOManager, InputContext\n\nclass MyIOManager(IOManager):\n    def load_input(self, context: InputContext):\n        ...\n
\n
\n
\n
\nproperty asset_key\u00b6
\n

The AssetKey of the asset that is being loaded as an input.

\n
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for the input asset.

\n

Raises an error if the input asset has no partitioning, or if the run covers a partition\nrange for the input asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for the input asset.

\n

Raises an error if the input asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partition_keys\u00b6
\n

The partition keys for the input asset.

\n

Raises an error if the input asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_def\u00b6
\n

The PartitionsDefinition on the upstream asset corresponding to this input.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the input asset.

\n

Raises an error if either of the following are true:\n- The input asset has no partitioning.\n- The input asset is not partitioned with a TimeWindowPartitionsDefinition.

\n
\n\n
\n
\nproperty config\u00b6
\n

The config attached to the input that we\u2019re loading.

\n
\n\n
\n
\nproperty dagster_type\u00b6
\n

The type of this input.\nDagster types do not propagate from an upstream output to downstream inputs,\nand this property only captures type information for the input that is either\npassed in explicitly with AssetIn or In, or can be\ninferred from type hints. For an asset input, the Dagster type from the upstream\nasset definition is ignored.

\n
\n\n
\n
\nget_asset_identifier()[source]\u00b6
\n

The sequence of strings making up the AssetKey for the asset being loaded as an input.\nIf the asset is partitioned, the identifier contains the partition key as the final element in the\nsequence. For example, for the asset key AssetKey(["foo", "bar", "baz"]), materialized with\npartition key \u201c2023-06-01\u201d, get_asset_identifier will return ["foo", "bar", "baz", "2023-06-01"].

\n
\n\n
\n
\nget_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep input.

\n

If not using memoization, the unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the input.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the input is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n

If using memoization, the version corresponding to the step output is used in place of\nthe run_id.

\n
\n
Returns:
\n

A list of identifiers, i.e. (run_id or version), step_key, and output_name

\n
\n
Return type:
\n

List[str, \u2026]

\n
\n
\n
\n\n
\n
\nproperty has_asset_key\u00b6
\n

Returns True if an asset is being loaded as input, otherwise returns False. A return value of False\nindicates that an output from an op is being loaded as the input.

\n
\n\n
\n
\nproperty has_asset_partitions\u00b6
\n

Returns True if the asset being loaded as input is partitioned.

\n
\n\n
\n
\nproperty has_input_name\u00b6
\n

If the InputContext is being used to load the result of a run from outside the run,\nthen it won\u2019t have an input name.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run.

\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager to use for this input.

\n
\n\n
\n
\nproperty metadata\u00b6
\n

A dict of metadata that is assigned to the InputDefinition that we\u2019re loading for.\nThis property only contains metadata passed in explicitly with AssetIn\nor In. To access metadata of an upstream asset or operation definition,\nuse the metadata in InputContext.upstream_output.

\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the input that we\u2019re loading.

\n
\n\n
\n
\nproperty op_def\u00b6
\n

The definition of the op that\u2019s loading the input.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nproperty resource_config\u00b6
\n

The config associated with the resource that initializes the InputManager.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources required by the resource that initializes the\ninput manager. If using the @input_manager() decorator, these resources\ncorrespond to those requested with the required_resource_keys parameter.

\n
\n\n
\n
\nproperty upstream_output\u00b6
\n

Info about the output that produced the object we\u2019re loading.

\n
\n\n
\n\n
\n
\nclass dagster.OutputContext(step_key=None, name=None, job_name=None, run_id=None, metadata=None, mapping_key=None, config=None, dagster_type=None, log_manager=None, version=None, resource_config=None, resources=None, step_context=None, op_def=None, asset_info=None, warn_on_step_context_use=False, partition_key=None)[source]\u00b6
\n

The context object that is available to the handle_output method of an IOManager.

\n

Users should not instantiate this object directly. To construct an\nOutputContext for testing an IO Manager\u2019s handle_output method, use\ndagster.build_output_context().

\n

Example

\n
from dagster import IOManager, OutputContext\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context: OutputContext, obj):\n        ...\n
\n
\n
\n
\nadd_output_metadata(metadata)[source]\u00b6
\n

Add a dictionary of metadata to the handled output.

\n

Metadata entries added will show up in the HANDLED_OUTPUT and ASSET_MATERIALIZATION events for the run.

\n
\n
Parameters:
\n

metadata (Mapping[str, RawMetadataValue]) \u2013 A metadata dictionary to log

\n
\n
\n

Examples

\n
from dagster import IOManager\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.add_output_metadata({"foo": "bar"})\n
\n
\n
\n\n
\n
\nproperty asset_key\u00b6
\n

The AssetKey of the asset that is being stored as an output.

\n
\n\n
\n
\nproperty asset_partition_key\u00b6
\n

The partition key for the output asset.

\n

Raises an error if the output asset has no partitioning, or if the run covers a partition\nrange for the output asset.

\n
\n\n
\n
\nproperty asset_partition_key_range\u00b6
\n

The partition key range for the output asset.

\n

Raises an error if the output asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partition_keys\u00b6
\n

The partition keys for the output asset.

\n

Raises an error if the output asset has no partitioning.

\n
\n\n
\n
\nproperty asset_partitions_def\u00b6
\n

The PartitionsDefinition on the asset corresponding to this output.

\n
\n\n
\n
\nproperty asset_partitions_time_window\u00b6
\n

The time window for the partitions of the output asset.

\n

Raises an error if either of the following are true:\n- The output asset has no partitioning.\n- The output asset is not partitioned with a TimeWindowPartitionsDefinition or a\nMultiPartitionsDefinition with one time-partitioned dimension.

\n
\n\n
\n
\nproperty config\u00b6
\n

The configuration for the output.

\n
\n\n
\n
\nproperty dagster_type\u00b6
\n

The type of this output.

\n
\n\n
\n
\nget_asset_identifier()[source]\u00b6
\n

The sequence of strings making up the AssetKey for the asset being stored as an output.\nIf the asset is partitioned, the identifier contains the partition key as the final element in the\nsequence. For example, for the asset key AssetKey(["foo", "bar", "baz"]) materialized with\npartition key \u201c2023-06-01\u201d, get_asset_identifier will return ["foo", "bar", "baz", "2023-06-01"].

\n
\n\n
\n
\nget_identifier()[source]\u00b6
\n

Utility method to get a collection of identifiers that as a whole represent a unique\nstep output.

\n

If not using memoization, the unique identifier collection consists of

\n
    \n
  • \n
    run_id: the id of the run which generates the output.

    Note: This method also handles the re-execution memoization logic. If the step that\ngenerates the output is skipped in the re-execution, the run_id will be the id\nof its parent run.

    \n
    \n
    \n
  • \n
  • step_key: the key for a compute step.

  • \n
  • name: the name of the output. (default: \u2018result\u2019).

  • \n
\n

If using memoization, the version corresponding to the step output is used in place of\nthe run_id.

\n
\n
Returns:
\n

A list of identifiers, i.e. (run_id or version), step_key, and output_name

\n
\n
Return type:
\n

Sequence[str, \u2026]

\n
\n
\n
\n\n
\n
\nproperty has_asset_key\u00b6
\n

Returns True if an asset is being stored, otherwise returns False. A return value of False\nindicates that an output from an op is being stored.

\n
\n\n
\n
\nproperty has_asset_partitions\u00b6
\n

Returns True if the asset being stored is partitioned.

\n
\n\n
\n
\nproperty has_partition_key\u00b6
\n

Whether the current run is a partitioned run.

\n
\n\n
\n
\nproperty log\u00b6
\n

The log manager to use for this output.

\n
\n\n
\n
\nlog_event(event)[source]\u00b6
\n

Log an AssetMaterialization or AssetObservation from within the body of an io manager\u2019s handle_output method.

\n

Events logged with this method will appear in the event log.

\n
\n
Parameters:
\n

event (Union[AssetMaterialization, AssetObservation]) \u2013 The event to log.

\n
\n
\n

Examples

\n
from dagster import IOManager, AssetMaterialization\n\nclass MyIOManager(IOManager):\n    def handle_output(self, context, obj):\n        context.log_event(AssetMaterialization("foo"))\n
\n
\n
\n\n
\n
\nproperty mapping_key\u00b6
\n

The key that identifies a unique mapped output. None for regular outputs.

\n
\n\n
\n
\nproperty metadata\u00b6
\n

A dict of the metadata that is assigned to the OutputDefinition that produced\nthe output.

\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the output that produced the output.

\n
\n\n
\n
\nproperty op_def\u00b6
\n

The definition of the op that produced the output.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key for the current run.

\n

Raises an error if the current run is not a partitioned run.

\n
\n\n
\n
\nproperty resource_config\u00b6
\n

The config associated with the resource that initializes the IOManager.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources required by the output manager, specified by the required_resource_keys\nparameter.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id of the run that produced the output.

\n
\n\n
\n
\nproperty step_key\u00b6
\n

The step_key for the compute step that produced the output.

\n
\n\n
\n
\nproperty version\u00b6
\n

(Experimental) The version of the output.

\n
\n\n
\n\n
\n
\ndagster.build_input_context(name=None, config=None, metadata=None, upstream_output=None, dagster_type=None, resource_config=None, resources=None, op_def=None, step_context=None, asset_key=None, partition_key=None, asset_partition_key_range=None, asset_partitions_def=None, instance=None)[source]\u00b6
\n

Builds input context from provided parameters.

\n

build_input_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_input_context must be used as a\ncontext manager.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the input that we\u2019re loading.

  • \n
  • config (Optional[Any]) \u2013 The config attached to the input that we\u2019re loading.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata that is assigned to the\nInputDefinition that we\u2019re loading for.

  • \n
  • upstream_output (Optional[OutputContext]) \u2013 Info about the output that produced the object\nwe\u2019re loading.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this input.

  • \n
  • resource_config (Optional[Dict[str, Any]]) \u2013 The resource config to make available from the\ninput context. This usually corresponds to the config provided to the resource that\nloads the input manager.

  • \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • asset_key (Optional[Union[AssetKey, Sequence[str], str]]) \u2013 The asset key attached to the InputDefinition.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that\u2019s loading the input.

  • \n
  • step_context (Optional[StepExecutionContext]) \u2013 For internal use.

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
  • asset_partition_key_range (Optional[str]) \u2013 The range of asset partition keys to load.

  • \n
  • asset_partitions_def (Optional[PartitionsDefinition]) \u2013 The PartitionsDefinition of the asset\nbeing loaded.

  • \n
\n
\n
\n

Examples

\n
build_input_context()\n\nwith build_input_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\ndagster.build_output_context(step_key=None, name=None, metadata=None, run_id=None, mapping_key=None, config=None, dagster_type=None, version=None, resource_config=None, resources=None, op_def=None, asset_key=None, partition_key=None)[source]\u00b6
\n

Builds output context from provided parameters.

\n

build_output_context can be used as either a function, or a context manager. If resources\nthat are also context managers are provided, then build_output_context must be used as a\ncontext manager.

\n
\n
Parameters:
\n
    \n
  • step_key (Optional[str]) \u2013 The step_key for the compute step that produced the output.

  • \n
  • name (Optional[str]) \u2013 The name of the output that produced the output.

  • \n
  • metadata (Optional[Mapping[str, Any]]) \u2013 A dict of the metadata that is assigned to the\nOutputDefinition that produced the output.

  • \n
  • mapping_key (Optional[str]) \u2013 The key that identifies a unique mapped output. None for regular outputs.

  • \n
  • config (Optional[Any]) \u2013 The configuration for the output.

  • \n
  • dagster_type (Optional[DagsterType]) \u2013 The type of this output.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of the output.

  • \n
  • resource_config (Optional[Mapping[str, Any]]) \u2013 The resource config to make available from the\noutput context. This usually corresponds to the config provided to the resource that\nloads the output manager.

  • \n
  • resources (Optional[Resources]) \u2013 The resources to make available from the context.\nFor a given key, you can provide either an actual instance of an object, or a resource\ndefinition.

  • \n
  • op_def (Optional[OpDefinition]) \u2013 The definition of the op that produced the output.

  • \n
  • asset_key (Optional[Union[AssetKey, Sequence[str], str]]) \u2013 The asset key corresponding to the\noutput.

  • \n
  • partition_key (Optional[str]) \u2013 String value representing partition key to execute with.

  • \n
\n
\n
\n

Examples

\n
build_output_context()\n\nwith build_output_context(resources={"foo": context_manager_resource}) as context:\n    do_something\n
\n
\n
\n\n
\n
\n

Built-in IO Managers\u00b6

\n
\n
\ndagster.FilesystemIOManager IOManagerDefinition[source]\u00b6
\n

Built-in filesystem IO manager that stores and retrieves values using pickling.

\n

The base directory that the pickle files live inside is determined by:

\n
    \n
  • The IO manager\u2019s \u201cbase_dir\u201d configuration value, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the value for \u201clocal_artifact_storage\u201d in your dagster.yaml\nfile, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the directory that the DAGSTER_HOME environment variable\npoints to, if that environment variable is specified. Otherwise\u2026

  • \n
  • A temporary directory.

  • \n
\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nSo, with a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach an IO manager to a set of assets using the reserved resource key "io_manager".

  2. \n
\n
from dagster import Definitions, asset, FilesystemIOManager\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n    },\n)\n
\n
\n

2. Specify a job-level IO manager using the reserved resource key "io_manager",\nwhich will set the given IO manager on all ops in a job.

\n
from dagster import FilesystemIOManager, job, op\n\n@op\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(\n    resource_defs={\n        "io_manager": FilesystemIOManager(base_dir="/my/base/path")\n    }\n)\ndef job():\n    op_b(op_a())\n
\n
\n

3. Specify IO manager on Out, which allows you to set different IO managers on\ndifferent step outputs.

\n
from dagster import FilesystemIOManager, job, op, Out\n\n@op(out=Out(io_manager_key="my_io_manager"))\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(resource_defs={"my_io_manager": FilesystemIOManager()})\ndef job():\n    op_b(op_a())\n
\n
\n
\n\n
\n
\ndagster.InMemoryIOManager IOManagerDefinition[source]\u00b6
\n

I/O manager that stores and retrieves values in memory. After execution is complete, the values will\nbe garbage-collected. Note that this means that each run will not have access to values from previous runs.

\n
\n\n

The UPathIOManager can be used to easily define filesystem-based IO Managers.

\n
\n
\nclass dagster.UPathIOManager(base_path=None)[source]\u00b6
\n

Abstract IOManager base class compatible with local and cloud storage via universal-pathlib and fsspec.

\n
\n
Features:
    \n
  • handles partitioned assets

  • \n
  • handles loading a single upstream partition

  • \n
  • handles loading multiple upstream partitions (with respect to PartitionMapping)

  • \n
  • supports loading multiple partitions concurrently with async load_from_path method

  • \n
  • the get_metadata method can be customized to add additional metadata to the output

  • \n
  • the allow_missing_partitions metadata value can be set to True to skip missing partitions\n(the default behavior is to raise an error)

  • \n
\n
\n
\n
\n\n
\n
\n

Input Managers (Experimental)\u00b6

\n

Input managers load inputs from either upstream outputs or from provided default values.

\n
\n
\n@dagster.input_manager(config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define an input manager.

\n

Input managers load op inputs, either from upstream outputs or by providing default values.

\n

The decorated function should accept an InputContext and resource config, and return\na loaded object that will be passed into one of the inputs of an op.

\n

The decorator produces an InputManagerDefinition.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the resource-level config. If not\nset, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • input_config_schema (Optional[ConfigSchema]) \u2013 A schema for the input-level config. Each\ninput that uses this input manager can be configured separately using this config.\nIf not set, Dagster will accept any config provided.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by the input\nmanager.

  • \n
  • version (Optional[str]) \u2013 (Experimental) the version of the input manager definition.

  • \n
\n
\n
\n

Examples:

\n
from dagster import input_manager, op, job, In\n\n@input_manager\ndef csv_loader(_):\n    return read_csv("some/path")\n\n@op(ins={"input1": In(input_manager_key="csv_loader_key")})\ndef my_op(_, input1):\n    do_stuff(input1)\n\n@job(resource_defs={"csv_loader_key": csv_loader})\ndef my_job():\n    my_op()\n\n@input_manager(config_schema={"base_dir": str})\ndef csv_loader(context):\n    return read_csv(context.resource_config["base_dir"] + "/some/path")\n\n@input_manager(input_config_schema={"path": str})\ndef csv_loader(context):\n    return read_csv(context.config["path"])\n
\n
\n
\n\n
\n
\nclass dagster.InputManager[source]\u00b6
\n

Base interface for classes that are responsible for loading op inputs.

\n
\n\n
\n
\nclass dagster.InputManagerDefinition(resource_fn, config_schema=None, description=None, input_config_schema=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Definition of an input manager resource.

\n

Input managers load op inputs.

\n

An InputManagerDefinition is a ResourceDefinition whose resource_fn returns an\nInputManager.

\n

The easiest way to create an InputManagerDefinition is with the\n@input_manager decorator.

\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster.fs_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in filesystem IO manager that stores and retrieves values using pickling.

\n

The base directory that the pickle files live inside is determined by:

\n
    \n
  • The IO manager\u2019s \u201cbase_dir\u201d configuration value, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the value for \u201clocal_artifact_storage\u201d in your dagster.yaml\nfile, if specified. Otherwise\u2026

  • \n
  • A \u201cstorage/\u201d directory underneath the directory that the DAGSTER_HOME environment variable\npoints to, if that environment variable is specified. Otherwise\u2026

  • \n
  • A temporary directory.

  • \n
\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nSo, with a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach an IO manager to a set of assets using the reserved resource key "io_manager".

  2. \n
\n
from dagster import Definitions, asset, fs_io_manager\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n    },\n)\n
\n
\n

2. Specify a job-level IO manager using the reserved resource key "io_manager",\nwhich will set the given IO manager on all ops in a job.

\n
from dagster import fs_io_manager, job, op\n\n@op\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(\n    resource_defs={\n        "io_manager": fs_io_manager.configured({"base_dir": "/my/base/path"})\n    }\n)\ndef job():\n    op_b(op_a())\n
\n
\n

3. Specify IO manager on Out, which allows you to set different IO managers on\ndifferent step outputs.

\n
from dagster import fs_io_manager, job, op, Out\n\n@op(out=Out(io_manager_key="my_io_manager"))\ndef op_a():\n    # create df ...\n    return df\n\n@op\ndef op_b(df):\n    return df[:5]\n\n@job(resource_defs={"my_io_manager": fs_io_manager})\ndef job():\n    op_b(op_a())\n
\n
\n
\n\n
\n
\ndagster.mem_io_manager IOManagerDefinition[source]\u00b6
\n

Built-in IO manager that stores and retrieves values in memory.

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/io-managers", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../partitions/", "title": "Partitions Definitions"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../ops/", "title": "Ops"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/partitions", "Partitions Definitions", "N", "next"], ["sections/api/apidocs/ops", "Ops", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/io-managers.rst.txt", "title": "IO Managers", "toc": "\n"}, "jobs": {"alabaster_version": "0.7.13", "body": "
\n

Jobs\u00b6

\n

A Job binds a Graph and the resources it needs to be executable.

\n

Jobs are created by calling GraphDefinition.to_job() on a graph instance, or using the job decorator.

\n
\n
\n@dagster.job(compose_fn=None, *, name=None, description=None, resource_defs=None, config=None, tags=None, metadata=None, logger_defs=None, executor_def=None, hooks=None, op_retry_policy=None, version_strategy=None, partitions_def=None, input_values=None)[source]\u00b6
\n

Creates a job with the specified parameters from the decorated graph/op invocation function.

\n

Using this decorator allows you to build an executable job by writing a function that invokes\nops (or graphs).

\n
\n
Parameters:
\n
    \n
  • compose_fn (Callable[..., Any]) \u2013 The decorated function. The body should contain op or graph invocations. Unlike op\nfunctions, does not accept a context argument.

  • \n
  • name (Optional[str]) \u2013 The name for the Job. Defaults to the name of this graph.

  • \n
  • resource_defs (Optional[Mapping[str, object]]) \u2013 Resources that are required by this graph for execution.\nIf not defined, io_manager will default to filesystem.

  • \n
  • config \u2013

    Describes how the job is parameterized at runtime.

    \n

    If no value is provided, then the schema for the job\u2019s run config is a standard\nformat based on its ops and resources.

    \n

    If a dictionary is provided, then it must conform to the standard config schema, and\nit will be used as the job\u2019s run config for the job whenever the job is executed.\nThe values provided will be viewable and editable in the Dagster UI, so be\ncareful with secrets.

    \n

    If a RunConfig object is provided, then it will be used directly as the run config\nfor the job whenever the job is executed, similar to providing a dictionary.

    \n

    If a ConfigMapping object is provided, then the schema for the job\u2019s run config is\ndetermined by the config mapping, and the ConfigMapping should return\nconfiguration in the standard format to configure the job.

    \n

    If a PartitionedConfig object is provided, then it defines a discrete set of config\nvalues that can parameterize the job, as well as a function for mapping those\nvalues to the base config. The values provided will be viewable and editable in the\nDagster UI, so be careful with secrets.

    \n

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary information that will be attached to the execution of the Job.\nValues that are not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value. These tag values may be overwritten by tag\nvalues provided at invocation time.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary information that will be attached to the JobDefinition and be viewable in the Dagster UI.\nKeys must be strings, and values must be python primitive types or one of the provided\nMetadataValue types

  • \n
  • logger_defs (Optional[Dict[str, LoggerDefinition]]) \u2013 A dictionary of string logger identifiers to their implementations.

  • \n
  • executor_def (Optional[ExecutorDefinition]) \u2013 How this Job will be executed. Defaults to multiprocess_executor.

  • \n
  • op_retry_policy (Optional[RetryPolicy]) \u2013 The default retry policy for all ops in this job.\nOnly used if retry policy is not defined on the op definition or op invocation.

  • \n
  • version_strategy (Optional[VersionStrategy]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use asset versioning instead.) Defines how each op (and optionally, resource) in the job can be versioned. If\nprovided, memoization will be enabled for this job.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines a discrete set of partition keys\nthat can parameterize the job. If this argument is supplied, the config argument\ncan\u2019t also be supplied.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of a job.

  • \n
\n
\n
\n

Examples

\n
@op\ndef return_one():\n    return 1\n\n@op\ndef add_one(in1):\n    return in1 + 1\n\n@job\ndef job1():\n    add_one(return_one())\n
\n
\n
\n\n
\n
\nclass dagster.JobDefinition(*, graph_def, resource_defs=None, executor_def=None, logger_defs=None, name=None, config=None, description=None, partitions_def=None, tags=None, metadata=None, hook_defs=None, op_retry_policy=None, version_strategy=None, _subset_selection_data=None, asset_layer=None, input_values=None, _was_explicitly_provided_resources=None)[source]\u00b6
\n

Defines a Dagster job.

\n
\n
\nproperty config_mapping\u00b6
\n

The config mapping for the job, if it has one.

\n

A config mapping defines a way to map a top-level config schema to run config for the job.

\n
\n\n
\n
\nexecute_in_process(run_config=None, instance=None, partition_key=None, raise_on_error=True, op_selection=None, asset_selection=None, run_id=None, input_values=None, tags=None, resources=None)[source]\u00b6
\n

Execute the Job in-process, gathering results in-memory.

\n

The executor_def on the Job will be ignored, and replaced with the in-process executor.\nIf using the default io_manager, it will switch from filesystem to in-memory.

\n
\n
Parameters:
\n
    \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 The configuration for the run

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The instance to execute against, an ephemeral one will be used if none provided.

  • \n
  • partition_key (Optional[str]) \u2013 The string partition key that specifies the run config to execute. Can only be used\nto select run config for jobs with partitioned config.

  • \n
  • raise_on_error (Optional[bool]) \u2013 Whether or not to raise exceptions when they occur.\nDefaults to True.

  • \n
  • op_selection (Optional[Sequence[str]]) \u2013 A list of op selection queries (including single op\nnames) to execute. For example:\n* ['some_op']: selects some_op itself.\n* ['*some_op']: select some_op and all its ancestors (upstream dependencies).\n* ['*some_op+++']: select some_op, all its ancestors, and its descendants\n(downstream dependencies) within 3 levels down.\n* ['*some_op', 'other_op_a', 'other_op_b+']: select some_op and all its\nancestors, other_op_a itself, and other_op_b and its direct child ops.

  • \n
  • input_values (Optional[Mapping[str, Any]]) \u2013 A dictionary that maps python objects to the top-level inputs of the job. Input values provided here will override input values that have been provided to the job directly.

  • \n
  • resources (Optional[Mapping[str, Any]]) \u2013 The resources needed if any are required. Can provide resource instances directly,\nor resource definitions.

  • \n
\n
\n
Returns:
\n

ExecuteInProcessResult

\n
\n
\n
\n\n
\n
\nproperty executor_def\u00b6
\n

Returns the default ExecutorDefinition for the job.

\n

If the user has not specified an executor definition, then this will default to the multi_or_in_process_executor(). If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty has_specified_executor\u00b6
\n

Returns True if this job has explicitly specified an executor, and False if the executor was inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty has_specified_loggers\u00b6
\n

Returns True if the job explicitly set loggers, and False if loggers were inherited through defaults or the Definitions object the job was provided to.

\n
\n\n
\n
\nproperty loggers\u00b6
\n

Returns the set of LoggerDefinition objects specified on the job.

\n

If the user has not specified a mapping of LoggerDefinition objects, then this will default to the colored_console_logger() under the key console. If a default is specified on the Definitions object the job was provided to, then that will be used instead.

\n
\n\n
\n
\nproperty partitioned_config\u00b6
\n

The partitioned config for the job, if it has one.

\n

A partitioned config defines a way to map partition keys to run config for the job.

\n
\n\n
\n
\nproperty partitions_def\u00b6
\n

Returns the PartitionsDefinition for the job, if it has one.

\n

A partitions definition defines the set of partition keys the job operates on.

\n
\n\n
\n
\nproperty resource_defs\u00b6
\n

Returns the set of ResourceDefinition objects specified on the job.

\n

This may not be the complete set of resources required by the job, since those can also be provided on the Definitions object the job may be provided to.

\n
\n\n
\n
\nrun_request_for_partition(partition_key, run_key=None, tags=None, asset_selection=None, run_config=None, current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0.0. Directly instantiate RunRequest(partition_key=...) instead.\n \n

\n

Creates a RunRequest object for a run that processes the given partition.

\n
\n
Parameters:
\n
    \n
  • partition_key \u2013 The key of the partition to request a run for.

  • \n
  • run_key (Optional[str]) \u2013 A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the launched run.

  • \n
  • run_config (Optional[Mapping[str, Any]]) \u2013 Configuration for the run. If the job has\na PartitionedConfig, this value will replace the config\nprovided by it.

  • \n
  • current_time (Optional[datetime]) \u2013 Used to determine which time-partitions exist.\nDefaults to now.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

an object that requests a run to process the given partition.

\n
\n
Return type:
\n

RunRequest

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Apply a set of hooks to all op instances within the job.

\n
\n\n
\n
\nwith_top_level_resources(resource_defs)[source]\u00b6
\n

Apply a set of resources to all op instances within the job.

\n
\n\n
\n\n
\n

Reconstructable jobs\u00b6

\n
\n
\nclass dagster.reconstructable(target)[source]
\n

Create a ReconstructableJob from a\nfunction that returns a JobDefinition,\nor a function decorated with @job.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or\nin different systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

Passing a job created with GraphDefinition.to_job() to reconstructable()\nrequires you to wrap that job\u2019s definition in a module-scoped function, and pass that function\ninstead:

\n
from dagster import graph, reconstructable\n\n@graph\ndef my_graph():\n    ...\n\ndef define_my_job():\n    return my_graph.to_job()\n\nreconstructable(define_my_job)\n
\n
\n

This function implements a very conservative strategy for reconstruction, so that its behavior\nis easy to predict, but as a consequence it is not able to reconstruct certain kinds of jobs,\nsuch as those defined by lambdas, in nested scopes (e.g., dynamically within a method\ncall), or in interactive environments such as the Python REPL or Jupyter notebooks.

\n

If you need to reconstruct objects constructed in these ways, you should use\nbuild_reconstructable_job() instead, which allows you to\nspecify your own reconstruction strategy.

\n

Examples

\n
from dagster import job, reconstructable\n\n@job\ndef foo_job():\n    ...\n\nreconstructable_foo_job = reconstructable(foo_job)\n\n\n@graph\ndef foo():\n    ...\n\ndef make_bar_job():\n    return foo.to_job()\n\nreconstructable_bar_job = reconstructable(make_bar_job)\n
\n
\n
\n\n
\n
\ndagster.build_reconstructable_job(reconstructor_module_name, reconstructor_function_name, reconstructable_args=None, reconstructable_kwargs=None, reconstructor_working_directory=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a dagster._core.definitions.reconstructable.ReconstructableJob.

\n

When your job must cross process boundaries, e.g., for execution on multiple nodes or in\ndifferent systems (like dagstermill), Dagster must know how to reconstruct the job\non the other side of the process boundary.

\n

This function allows you to use the strategy of your choice for reconstructing jobs, so\nthat you can reconstruct certain kinds of jobs that are not supported by\nreconstructable(), such as those defined by lambdas, in nested scopes (e.g.,\ndynamically within a method call), or in interactive environments such as the Python REPL or\nJupyter notebooks.

\n

If you need to reconstruct jobs constructed in these ways, use this function instead of\nreconstructable().

\n
\n
Parameters:
\n
    \n
  • reconstructor_module_name (str) \u2013 The name of the module containing the function to use to\nreconstruct the job.

  • \n
  • reconstructor_function_name (str) \u2013 The name of the function to use to reconstruct the\njob.

  • \n
  • reconstructable_args (Tuple) \u2013 Args to the function to use to reconstruct the job.\nValues of the tuple must be JSON serializable.

  • \n
  • reconstructable_kwargs (Dict[str, Any]) \u2013 Kwargs to the function to use to reconstruct the\njob. Values of the dict must be JSON serializable.

  • \n
\n
\n
\n

Examples

\n
# module: mymodule\n\nfrom dagster import JobDefinition, job, build_reconstructable_job\n\nclass JobFactory:\n    def make_job(*args, **kwargs):\n\n        @job\n        def _job(...):\n            ...\n\n        return _job\n\ndef reconstruct_job(*args):\n    factory = JobFactory()\n    return factory.make_job(*args)\n\nfactory = JobFactory()\n\nfoo_job_args = (...,...)\n\nfoo_job_kwargs = {...:...}\n\nfoo_job = factory.make_job(*foo_job_args, **foo_job_kwargs)\n\nreconstructable_foo_job = build_reconstructable_job(\n    'mymodule',\n    'reconstruct_job',\n    foo_job_args,\n    foo_job_kwargs,\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/jobs", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../loggers/", "title": "Loggers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../internals/", "title": "Internals"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/loggers", "Loggers", "N", "next"], ["sections/api/apidocs/internals", "Internals", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/jobs.rst.txt", "title": "Jobs", "toc": "\n"}, "libraries": {"dagster-airbyte": {"alabaster_version": "0.7.13", "body": "
\n

Airbyte (dagster-airbyte)\u00b6

\n

This library provides a Dagster integration with Airbyte.

\n

For more information on getting started, see the Airbyte integration guide.

\n
\n

Resources\u00b6

\n
\n
\ndagster_airbyte.AirbyteResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Airbyte API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
request_timeout (dagster.IntSource, optional):
\n

Time (in seconds) after which the requests to Airbyte are declared timed out.

\n

Default Value: 15

\n
\n
cancel_sync_on_run_termination (dagster.BoolSource, optional):
\n

Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may be useful to disable if using Airbyte sources that cannot be cancelled and resumed easily, or if your Dagster deployment may experience runner interruptions that do not impact your Airbyte deployment.

\n

Default Value: True

\n
\n
poll_interval (Float, optional):
\n

Time (in seconds) to wait between checking a sync\u2019s status.

\n

Default Value: 10

\n
\n
host (dagster.StringSource):
\n

The Airbyte server address.

\n
\n
port (dagster.StringSource):
\n

Port used for the Airbyte server.

\n
\n
username (Union[dagster.StringSource, None], optional):
\n

Username if using basic auth.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password if using basic auth.

\n
\n
use_https (dagster.BoolSource, optional):
\n

Whether to use HTTPS to connect to the Airbyte server.

\n

Default Value: False

\n
\n
forward_logs (dagster.BoolSource, optional):
\n

Whether to forward Airbyte logs to the compute log, can be expensive for long-running syncs.

\n

Default Value: True

\n
\n
request_additional_params (dict, optional):
\n

Any additional kwargs to pass to the requests library when making requests to Airbyte.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource allows users to programmatically interface with the Airbyte REST API to launch\nsyncs and monitor their progress.

\n

Examples:

\n
from dagster import job, EnvVar\nfrom dagster_airbyte import AirbyteResource\n\nmy_airbyte_resource = AirbyteResource(\n    host=EnvVar("AIRBYTE_HOST"),\n    port=EnvVar("AIRBYTE_PORT"),\n    # If using basic auth\n    username=EnvVar("AIRBYTE_USERNAME"),\n    password=EnvVar("AIRBYTE_PASSWORD"),\n)\n\nairbyte_assets = build_airbyte_assets(\n    connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",\n    destination_tables=["releases", "tags", "teams"],\n)\n\ndefs = Definitions(\n    assets=[airbyte_assets],\n    resources={"airbyte": my_airbyte_resource},\n)\n
\n
\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_airbyte.load_assets_from_airbyte_instance(airbyte, workspace_id=None, key_prefix=None, create_assets_for_normalization_tables=True, connection_to_group_fn=<function _clean_name>, io_manager_key=None, connection_to_io_manager_key_fn=None, connection_filter=None, connection_to_asset_key_fn=None, connection_to_freshness_policy_fn=None, connection_to_auto_materialize_policy_fn=None)[source]\u00b6
\n

Loads Airbyte connection assets from a configured AirbyteResource instance. This fetches information\nabout defined connections at initialization time, and will error on workspace load if the Airbyte\ninstance is not reachable.

\n
\n
Parameters:
\n
    \n
  • airbyte (ResourceDefinition) \u2013 An AirbyteResource configured with the appropriate connection\ndetails.

  • \n
  • workspace_id (Optional[str]) \u2013 The ID of the Airbyte workspace to load connections from. Only\nrequired if multiple workspaces exist in your instance.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • create_assets_for_normalization_tables (bool) \u2013 If True, assets will be created for tables\ncreated by Airbyte\u2019s normalization feature. If False, only the destination tables\nwill be created. Defaults to True.

  • \n
  • connection_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Airbyte connection name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The I/O manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.

  • \n
  • connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nI/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]) \u2013 Optional function which takes\nin connection metadata and returns False if the connection should be excluded from the output assets.

  • \n
  • connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]) \u2013 Optional function which\ntakes in connection metadata and table name and returns an asset key for the table. If None, the default asset\nkey is based on the table name. Any asset key prefix will be applied to the output of this function.

  • \n
  • connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]) \u2013 Optional function\nwhich takes in connection metadata and returns a freshness policy for the connection\u2019s assets. If None, no freshness policies\nwill be applied to the assets.

  • \n
  • connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]) \u2013 Optional\nfunction which takes in connection metadata and returns an auto materialization policy for the connection\u2019s assets. If None, no\nauto materialization policies will be applied to the assets.

  • \n
\n
\n
\n

Examples:

\n

Loading all Airbyte connections as assets:

\n
from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\nairbyte_instance = airbyte_resource.configured(\n    {\n        "host": "localhost",\n        "port": "8000",\n    }\n)\nairbyte_assets = load_assets_from_airbyte_instance(airbyte_instance)\n
\n
\n

Filtering the set of loaded connections:

\n
from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance\n\nairbyte_instance = airbyte_resource.configured(\n    {\n        "host": "localhost",\n        "port": "8000",\n    }\n)\nairbyte_assets = load_assets_from_airbyte_instance(\n    airbyte_instance,\n    connection_filter=lambda meta: "snowflake" in meta.name,\n)\n
\n
\n
\n\n
\n
\ndagster_airbyte.load_assets_from_airbyte_project(project_dir, workspace_id=None, key_prefix=None, create_assets_for_normalization_tables=True, connection_to_group_fn=<function _clean_name>, io_manager_key=None, connection_to_io_manager_key_fn=None, connection_filter=None, connection_directories=None, connection_to_asset_key_fn=None, connection_to_freshness_policy_fn=None, connection_to_auto_materialize_policy_fn=None)[source]\u00b6
\n

Loads an Airbyte project into a set of Dagster assets.

\n

Point to the root folder of an Airbyte project synced using the Octavia CLI. For\nmore information, see https://github.com/airbytehq/airbyte/tree/master/octavia-cli#octavia-import-all.

\n
\n
Parameters:
\n
    \n
  • project_dir (str) \u2013 The path to the root of your Airbyte project, containing sources, destinations,\nand connections folders.

  • \n
  • workspace_id (Optional[str]) \u2013 The ID of the Airbyte workspace to load connections from. Only\nrequired if multiple workspace state YAML files exist in the project.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • create_assets_for_normalization_tables (bool) \u2013 If True, assets will be created for tables\ncreated by Airbyte\u2019s normalization feature. If False, only the destination tables\nwill be created. Defaults to True.

  • \n
  • connection_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Airbyte connection name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The I/O manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.

  • \n
  • connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nI/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]) \u2013 Optional function which\ntakes in connection metadata and returns False if the connection should be excluded from the output assets.

  • \n
  • connection_directories (Optional[List[str]]) \u2013 Optional list of connection directories to load assets from.\nIf omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter\nif the project has many connections or if the connection yaml files are large.

  • \n
  • connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]) \u2013 Optional function which\ntakes in connection metadata and table name and returns an asset key for the table. If None, the default asset\nkey is based on the table name. Any asset key prefix will be applied to the output of this function.

  • \n
  • connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]) \u2013 Optional function which takes in connection metadata and returns a freshness policy for the connection\u2019s assets.\nIf None, no freshness policies will be applied to the assets.

  • \n
  • connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]) \u2013 Optional function which takes in connection metadata and returns an auto materialization policy for the connection\u2019s assets.\nIf None, no auto materialization policies will be applied to the assets.

  • \n
\n
\n
\n

Examples:

\n

Loading all Airbyte connections as assets:

\n
from dagster_airbyte import load_assets_from_airbyte_project\n\nairbyte_assets = load_assets_from_airbyte_project(\n    project_dir="path/to/airbyte/project",\n)\n
\n
\n

Filtering the set of loaded connections:

\n
from dagster_airbyte import load_assets_from_airbyte_project\n\nairbyte_assets = load_assets_from_airbyte_project(\n    project_dir="path/to/airbyte/project",\n    connection_filter=lambda meta: "snowflake" in meta.name,\n)\n
\n
\n
\n\n
\n
\ndagster_airbyte.build_airbyte_assets(connection_id, destination_tables, asset_key_prefix=None, group_name=None, normalization_tables=None, deps=None, upstream_assets=None, schema_by_table_name=None, freshness_policy=None, stream_to_asset_map=None)[source]\u00b6
\n

Builds a set of assets representing the tables created by an Airbyte sync operation.

\n
\n
Parameters:
\n
    \n
  • connection_id (str) \u2013 The Airbyte Connection ID that this op will sync. You can retrieve this\nvalue from the \u201cConnections\u201d tab of a given connector in the Airbyte UI.

  • \n
  • destination_tables (List[str]) \u2013 The names of the tables that you want to be represented\nin the Dagster asset graph for this sync. This will generally map to the name of the\nstream in Airbyte, unless a stream prefix has been specified in Airbyte.

  • \n
  • normalization_tables (Optional[Mapping[str, List[str]]]) \u2013 If you are using Airbyte\u2019s\nnormalization feature, you may specify a mapping of destination table to a list of\nderived tables that will be created by the normalization process.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([table_name]).

  • \n
  • deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, str, AssetKey]]]) \u2013 A list of assets to add as sources.

  • \n
  • upstream_assets (Optional[Set[AssetKey]]) \u2013 Deprecated, use deps instead. A list of assets to add as sources.

  • \n
  • freshness_policy (Optional[FreshnessPolicy]) \u2013 A freshness policy to apply to the assets

  • \n
  • stream_to_asset_map (Optional[Mapping[str, str]]) \u2013 A mapping of an Airbyte stream name to a Dagster asset.\nThis allows the use of the \u201cprefix\u201d setting in Airbyte with special characters that aren\u2019t valid asset names.

  • \n
\n
\n
\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_airbyte.airbyte_sync_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection_id (dagster.StringSource):
\n

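The Airbyte Connection ID that this op will sync. You can retrieve this value from the \u201cConnections\u201d tab of a given connector in the Airbyte UI. The op outputs a parsed json dictionary representing the details of the Airbyte connector after the sync successfully completes. See the [Airbyte API Docs](https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview) to see detailed information on this response.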

\n
\n
poll_interval (Float, optional):
\n

Time (in seconds) to wait between checking a sync\u2019s status.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time to wait before this operation is timed out. By default, this will never time out.

\n
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the Airbyte sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[dagster.StringSource], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018airbyte\u2019]

\n
\n
\n

Executes an Airbyte job sync for a given connection_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs an AirbyteOutput which contains\nthe job details for a given connection_id.

\n

It requires the use of the airbyte_resource, which allows it to\ncommunicate with the Airbyte API.

\n

Examples

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource, airbyte_sync_op\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n    }\n)\n\nsync_foobar = airbyte_sync_op.configured({"connection_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_simple_airbyte_job():\n    sync_foobar()\n\n@job(resource_defs={"airbyte": my_airbyte_resource})\ndef my_composed_airbyte_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Managed Config\u00b6

\n

The following APIs are used as part of the experimental ingestion-as-code functionality.\nFor more information, see the Airbyte ingestion as code guide.

\n
\n
\nclass dagster_airbyte.AirbyteManagedElementReconciler(airbyte, connections, delete_unmentioned_resources=False)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Reconciles Python-specified Airbyte connections with an Airbyte instance.

\n

Passing the module containing an AirbyteManagedElementReconciler to the dagster-airbyte\nCLI will allow you to check the state of your Python-code-specified Airbyte connections\nagainst an Airbyte instance, and reconcile them if necessary.

\n

This functionality is experimental and subject to change.

\n
\n
\n__init__(airbyte, connections, delete_unmentioned_resources=False)[source]\u00b6
\n

Reconciles Python-specified Airbyte connections with an Airbyte instance.

\n
\n
Parameters:
\n
    \n
  • airbyte (Union[AirbyteResource, ResourceDefinition]) \u2013 The Airbyte resource definition to reconcile against.

  • \n
  • connections (Iterable[AirbyteConnection]) \u2013 The Airbyte connection objects to reconcile.

  • \n
  • delete_unmentioned_resources (bool) \u2013 Whether to delete resources that are not mentioned in\nthe set of connections provided. When True, all Airbyte instance contents are effectively\nmanaged by the reconciler. Defaults to False.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\ndagster_airbyte.load_assets_from_connections(airbyte, connections, key_prefix=None, create_assets_for_normalization_tables=True, connection_to_group_fn=<function _clean_name>, io_manager_key=None, connection_to_io_manager_key_fn=None, connection_to_asset_key_fn=None, connection_to_freshness_policy_fn=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Loads Airbyte connection assets from a configured AirbyteResource instance, checking against a list of AirbyteConnection objects.\nThis method will raise an error on repo load if the passed AirbyteConnection objects are not in sync with the Airbyte instance.

\n
\n
Parameters:
\n
    \n
  • airbyte (Union[AirbyteResource, ResourceDefinition]) \u2013 An AirbyteResource configured with the appropriate connection\ndetails.

  • \n
  • connections (Iterable[AirbyteConnection]) \u2013 A list of AirbyteConnection objects to build assets for.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • create_assets_for_normalization_tables (bool) \u2013 If True, assets will be created for tables\ncreated by Airbyte\u2019s normalization feature. If False, only the destination tables\nwill be created. Defaults to True.

  • \n
  • connection_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Airbyte connection name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.

  • \n
  • connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nIO manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]) \u2013 Optional function which\ntakes in connection metadata and table name and returns an asset key for the table. If None, the default asset\nkey is based on the table name. Any asset key prefix will be applied to the output of this function.

  • \n
  • connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]) \u2013 Optional function which\ntakes in connection metadata and returns a freshness policy for the connection. If None, no freshness policy will be applied.

  • \n
\n
\n
\n

Examples:

\n
from dagster_airbyte import (\n    AirbyteConnection,\n    AirbyteResource,\n    load_assets_from_connections,\n)\n\nairbyte_instance = AirbyteResource(\n        host="localhost",\n        port="8000",\n)\nairbyte_connections = [\n    AirbyteConnection(...),\n    AirbyteConnection(...)\n]\nairbyte_assets = load_assets_from_connections(airbyte_instance, airbyte_connections)\n
\n
\n
\n\n
\n
\nclass dagster_airbyte.AirbyteConnection(name, source, destination, stream_config, normalize_data=None, destination_namespace=AirbyteDestinationNamespace.SAME_AS_SOURCE, prefix=None)[source]\u00b6
\n

A user-defined Airbyte connection, pairing an Airbyte source and destination and configuring\nwhich streams to sync.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The display name of the connection.

  • \n
  • source (AirbyteSource) \u2013 The source to sync from.

  • \n
  • destination (AirbyteDestination) \u2013 The destination to sync to.

  • \n
  • stream_config (Mapping[str, AirbyteSyncMode]) \u2013 A mapping from stream name to\nthe sync mode for that stream, including any additional configuration\nof primary key or cursor field.

  • \n
  • normalize_data (Optional[bool]) \u2013 Whether to normalize the data in the\ndestination.

  • \n
  • destination_namespace (Optional[Union[AirbyteDestinationNamespace, str]]) \u2013 The namespace to sync to in the destination. If set to\nAirbyteDestinationNamespace.SAME_AS_SOURCE, the namespace will be the\nsame as the source namespace. If set to\nAirbyteDestinationNamespace.DESTINATION_DEFAULT, the namespace will be\nthe default namespace for the destination. If set to a string, the\nnamespace will be that string.

  • \n
  • prefix (Optional[str]) \u2013 A prefix to add to the table names in the destination.

  • \n
\n
\n
\n

Example

\n
from dagster_airbyte.managed.generated.sources import FileSource\nfrom dagster_airbyte.managed.generated.destinations import LocalJsonDestination\nfrom dagster_airbyte import AirbyteConnection, AirbyteSyncMode\n\ncereals_csv_source = FileSource(...)\nlocal_json_destination = LocalJsonDestination(...)\n\ncereals_connection = AirbyteConnection(\n    name="download-cereals",\n    source=cereals_csv_source,\n    destination=local_json_destination,\n    stream_config={"cereals": AirbyteSyncMode.full_refresh_overwrite()},\n)\n
\n
\n
\n
\n__init__(name, source, destination, stream_config, normalize_data=None, destination_namespace=AirbyteDestinationNamespace.SAME_AS_SOURCE, prefix=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.AirbyteSource(name, source_type, source_configuration)[source]\u00b6
\n

Represents a user-defined Airbyte source.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The display name of the source.

  • \n
  • source_type (str) \u2013 The type of the source, from Airbyte\u2019s list\nof sources https://airbytehq.github.io/category/sources/.

  • \n
  • source_configuration (Mapping[str, Any]) \u2013 The configuration for the\nsource, as defined by Airbyte\u2019s API.

  • \n
\n
\n
\n
\n
\n__init__(name, source_type, source_configuration)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.AirbyteDestination(name, destination_type, destination_configuration)[source]\u00b6
\n

Represents a user-defined Airbyte destination.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The display name of the destination.

  • \n
  • destination_type (str) \u2013 The type of the destination, from Airbyte\u2019s list\nof destinations https://airbytehq.github.io/category/destinations/.

  • \n
  • destination_configuration (Mapping[str, Any]) \u2013 The configuration for the\ndestination, as defined by Airbyte\u2019s API.

  • \n
\n
\n
\n
\n
\n__init__(name, destination_type, destination_configuration)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.AirbyteSyncMode(json_repr)[source]\u00b6
\n

Represents the sync mode for a given Airbyte stream, which governs how Airbyte reads\nfrom a source and writes to a destination.

\n

For more information, see https://docs.airbyte.com/understanding-airbyte/connections/.

\n
\n
\nclassmethod full_refresh_append()[source]\u00b6
\n

Syncs the entire data stream from the source, appending rows to the destination.

\n

https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-append/

\n
\n\n
\n
\nclassmethod full_refresh_overwrite()[source]\u00b6
\n

Syncs the entire data stream from the source, replaces data in the destination by\noverwriting it.

\n

https://docs.airbyte.com/understanding-airbyte/connections/full-refresh-overwrite

\n
\n\n
\n
\nclassmethod incremental_append(cursor_field=None)[source]\u00b6
\n

Syncs only new records from the source, appending rows to the destination.\nMay optionally specify the cursor field used to determine which records\nare new.

\n

https://docs.airbyte.com/understanding-airbyte/connections/incremental-append/

\n
\n\n
\n
\nclassmethod incremental_append_dedup(cursor_field=None, primary_key=None)[source]\u00b6
\n

Syncs new records from the source, appending to an append-only history\ntable in the destination. Also generates a deduplicated view mirroring the\nsource table. May optionally specify the cursor field used to determine\nwhich records are new, and the primary key used to determine which records\nare duplicates.

\n

https://docs.airbyte.com/understanding-airbyte/connections/incremental-append-dedup/

\n
\n\n
\n\n
\n
\n

Managed Config Generated Sources\u00b6

\n
\n
\nclass dagster_airbyte.managed.generated.sources.StravaSource(name, client_id, client_secret, refresh_token, athlete_id, start_date, auth_type=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, athlete_id, start_date, auth_type=None)[source]\u00b6
\n

Airbyte Source for Strava.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/strava

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • client_id (str) \u2013 The Client ID of your Strava developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Strava developer application.

  • \n
  • refresh_token (str) \u2013 The Refresh Token with the activity: read_all permissions.

  • \n
  • athlete_id (int) \u2013 The Athlete ID of your Strava developer application.

  • \n
  • start_date (str) \u2013 UTC date and time. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AppsflyerSource(name, app_id, api_token, start_date, timezone=None)[source]\u00b6
\n
\n
\n__init__(name, app_id, api_token, start_date, timezone=None)[source]\u00b6
\n

Airbyte Source for Appsflyer.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • app_id (str) \u2013 App identifier as found in AppsFlyer.

  • \n
  • api_token (str) \u2013 Pull API token for authentication. If you change the account admin, the token changes, and you must update scripts with the new token. Get the API token in the Dashboard.

  • \n
  • start_date (str) \u2013 The default value to use if no bookmark exists for an endpoint. Raw Reports historical lookback is limited to 90 days.

  • \n
  • timezone (Optional[str]) \u2013 Time zone in which date times are stored. The project timezone may be found in the App settings in the AppsFlyer console.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleWorkspaceAdminReportsSource(name, credentials_json, email, lookback=None)[source]\u00b6
\n
\n
\n__init__(name, credentials_json, email, lookback=None)[source]\u00b6
\n

Airbyte Source for Google Workspace Admin Reports.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-workspace-admin-reports

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • credentials_json (str) \u2013 The contents of the JSON service account key. See the docs for more information on how to generate this key.

  • \n
  • email (str) \u2013 The email of the user who has permissions to access the Google Workspace Admin APIs.

  • \n
  • lookback (Optional[int]) \u2013 Sets the range of time shown in the report. The maximum value allowed by the Google API is 180 days.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CartSource(name, credentials, start_date)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date)[source]\u00b6
\n

Airbyte Source for Cart.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/cart

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate the data

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass CartSource.CentralAPIRouter(user_name, user_secret, site_id)[source]\u00b6
\n
\n
\n__init__(user_name, user_secret, site_id)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass CartSource.SingleStoreAccessToken(access_token, store_name)[source]\u00b6
\n
\n
\n__init__(access_token, store_name)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LinkedinAdsSource(name, credentials, start_date, account_ids=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, account_ids=None)[source]\u00b6
\n

Airbyte Source for Linkedin Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 UTC date in the format 2020-09-17. Any data before this date will not be replicated.

  • \n
  • account_ids (Optional[List[int]]) \u2013 Specify the account IDs separated by a space, to pull the data from. Leave empty, if you want to pull the data from all associated accounts. See the LinkedIn Ads docs for more info.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass LinkedinAdsSource.OAuth20(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass LinkedinAdsSource.AccessToken(access_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(access_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MongodbSource(name, host, port, database, user, password, auth_source, replica_set=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, user, password, auth_source, replica_set=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Mongodb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 Host of a Mongo database to be replicated.

  • \n
  • port (int) \u2013 Port of a Mongo database to be replicated.

  • \n
  • database (str) \u2013 Database to be replicated.

  • \n
  • user (str) \u2013 User

  • \n
  • password (str) \u2013 Password

  • \n
  • auth_source (str) \u2013 Authentication source where user information is stored. See the Mongo docs for more info.

  • \n
  • replica_set (Optional[str]) \u2013 The name of the set to filter servers by, when connecting to a replica set (Under this condition, the \u2018TLS connection\u2019 value automatically becomes \u2018true\u2019). See the Mongo docs for more info.

  • \n
  • ssl (Optional[bool]) \u2013 If this switch is enabled, TLS connections will be used to connect to MongoDB.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TimelySource(name, account_id, start_date, bearer_token)[source]\u00b6
\n
\n
\n__init__(name, account_id, start_date, bearer_token)[source]\u00b6
\n

Airbyte Source for Timely.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • account_id (str) \u2013 Timely account id

  • \n
  • start_date (str) \u2013 start date

  • \n
  • bearer_token (str) \u2013 Timely bearer token

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.StockTickerApiTutorialSource(name, stock_ticker, api_key)[source]\u00b6
\n
\n
\n__init__(name, stock_ticker, api_key)[source]\u00b6
\n

Airbyte Source for Stock Ticker Api Tutorial.

\n

Documentation can be found at https://polygon.io/docs/stocks/get_v2_aggs_grouped_locale_us_market_stocks__date

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • stock_ticker (str) \u2013 The stock ticker to track

  • \n
  • api_key (str) \u2013 The Polygon.io Stocks API key to use to hit the API.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WrikeSource(name, access_token, wrike_instance, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, wrike_instance, start_date=None)[source]\u00b6
\n

Airbyte Source for Wrike.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • access_token (str) \u2013 Permanent access token. You can find documentation on how to acquire a permanent access token here

  • \n
  • wrike_instance (str) \u2013 Wrike\u2019s instance such as app-us2.wrike.com

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Only comments after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CommercetoolsSource(name, region, host, start_date, project_key, client_id, client_secret)[source]\u00b6
\n
\n
\n__init__(name, region, host, start_date, project_key, client_id, client_secret)[source]\u00b6
\n

Airbyte Source for Commercetools.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/commercetools

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • region (str) \u2013 The region of the platform.

  • \n
  • host (str) \u2013 The cloud provider on which your shop is hosted. See: https://docs.commercetools.com/api/authorization

  • \n
  • start_date (str) \u2013 The date from which you would like to replicate data. Format: YYYY-MM-DD.

  • \n
  • project_key (str) \u2013 The project key

  • \n
  • client_id (str) \u2013 Id of API Client.

  • \n
  • client_secret (str) \u2013 The password or secret of the API Client.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GutendexSource(name, author_year_start=None, author_year_end=None, copyright=None, languages=None, search=None, sort=None, topic=None)[source]\u00b6
\n
\n
\n__init__(name, author_year_start=None, author_year_end=None, copyright=None, languages=None, search=None, sort=None, topic=None)[source]\u00b6
\n

Airbyte Source for Gutendex.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/gutendex

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • author_year_start (Optional[str]) \u2013 (Optional) Defines the minimum birth year of the authors. Books by authors born prior to the start year will not be returned. Supports both positive (CE) and negative (BCE) integer values.

  • \n
  • author_year_end (Optional[str]) \u2013 (Optional) Defines the maximum birth year of the authors. Books by authors born after the end year will not be returned. Supports both positive (CE) and negative (BCE) integer values.

  • \n
  • copyright (Optional[str]) \u2013 (Optional) Use this to find books with a certain copyright status - true for books with existing copyrights, false for books in the public domain in the USA, or null for books with no available copyright information.

  • \n
  • languages (Optional[str]) \u2013 (Optional) Use this to find books in any of a list of languages. They must be comma-separated, two-character language codes.

  • \n
  • search (Optional[str]) \u2013 (Optional) Use this to search author names and book titles with given words. They must be separated by a space (i.e. %20 in URL-encoded format) and are case-insensitive.

  • \n
  • sort (Optional[str]) \u2013 (Optional) Use this to sort books - ascending for Project Gutenberg ID numbers from lowest to highest, descending for IDs highest to lowest, or popular (the default) for most popular to least popular by number of downloads.

  • \n
  • topic (Optional[str]) \u2013 (Optional) Use this to search for a case-insensitive key-phrase in books\u2019 bookshelves or subjects.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.IterableSource(name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Iterable.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/iterable

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Iterable API Key. See the docs for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Iterable, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.QuickbooksSingerSource(name, client_id, client_secret, refresh_token, realm_id, user_agent, start_date, sandbox)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, realm_id, user_agent, start_date, sandbox)[source]\u00b6
\n

Airbyte Source for Quickbooks Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/quickbooks

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • client_id (str) \u2013 Identifies which app is making the request. Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.

  • \n
  • client_secret (str) \u2013 Obtain this value from the Keys tab on the app profile via My Apps on the developer site. There are two versions of this key: development and production.

  • \n
  • refresh_token (str) \u2013 A token used when refreshing the access token.

  • \n
  • realm_id (str) \u2013 Labeled Company ID. The Make API Calls panel is populated with the realm id and the current access token.

  • \n
  • user_agent (str) \u2013 Process and email for API logging purposes. Example: tap-quickbooks .

  • \n
  • start_date (str) \u2013 The default value to use if no bookmark exists for an endpoint (rfc3339 date string). E.g., 2021-03-20T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • sandbox (bool) \u2013 Determines whether to use the sandbox or production environment.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BigcommerceSource(name, start_date, store_hash, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, store_hash, access_token)[source]\u00b6
\n

Airbyte Source for Bigcommerce.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bigcommerce

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you would like to replicate data. Format: YYYY-MM-DD.

  • \n
  • store_hash (str) \u2013 The hash code of the store. For https://api.bigcommerce.com/stores/HASH_CODE/v3/, The store\u2019s hash code is \u2018HASH_CODE\u2019.

  • \n
  • access_token (str) \u2013 Access Token for making authenticated requests.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ShopifySource(name, shop, credentials, start_date)[source]\u00b6
\n
\n
\n__init__(name, shop, credentials, start_date)[source]\u00b6
\n

Airbyte Source for Shopify.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/shopify

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • shop (str) \u2013 The name of your Shopify store found in the URL. For example, if your URL was https://NAME.myshopify.com, then the name would be \u2018NAME\u2019.

  • \n
  • credentials (Union[ShopifySource.APIPassword, ShopifySource.OAuth20]) \u2013 The authorization method to use to retrieve data from Shopify

  • \n
  • start_date (str) \u2013 The date you would like to replicate data from. Format: YYYY-MM-DD. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ShopifySource.APIPassword(api_password)[source]\u00b6
\n
\n
\n__init__(api_password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ShopifySource.OAuth20(client_id=None, client_secret=None, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id=None, client_secret=None, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AppstoreSingerSource(name, key_id, private_key, issuer_id, vendor, start_date)[source]\u00b6
\n
\n
\n__init__(name, key_id, private_key, issuer_id, vendor, start_date)[source]\u00b6
\n

Airbyte Source for Appstore Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/appstore

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • key_id (str) \u2013 Appstore Key ID. See the docs for more information on how to obtain this key.

  • \n
  • private_key (str) \u2013 Appstore Private Key. See the docs for more information on how to obtain this key.

  • \n
  • issuer_id (str) \u2013 Appstore Issuer ID. See the docs for more information on how to obtain this ID.

  • \n
  • vendor (str) \u2013 Appstore Vendor ID. See the docs for more information on how to obtain this ID.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GreenhouseSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Greenhouse.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/greenhouse

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Greenhouse API Key. See the docs for more information on how to generate this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZoomSingerSource(name, jwt)[source]\u00b6
\n
\n
\n__init__(name, jwt)[source]\u00b6
\n

Airbyte Source for Zoom Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zoom

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • jwt (str) \u2013 Zoom JWT Token. See the docs for more information on how to obtain this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TiktokMarketingSource(name, credentials, start_date=None, end_date=None, report_granularity=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date=None, end_date=None, report_granularity=None)[source]\u00b6
\n

Airbyte Source for Tiktok Marketing.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/tiktok-marketing

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • credentials (Union[TiktokMarketingSource.OAuth20, TiktokMarketingSource.SandboxAccessToken]) \u2013 Authentication method

  • \n
  • start_date (Optional[str]) \u2013 The Start Date in format: YYYY-MM-DD. Any data before this date will not be replicated. If this parameter is not set, all data will be replicated.

  • \n
  • end_date (Optional[str]) \u2013 The date until which you\u2019d like to replicate data for all incremental streams, in the format YYYY-MM-DD. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the data till the current date.

  • \n
  • report_granularity (Optional[str]) \u2013 The granularity used for aggregating performance data in reports. See the docs.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass TiktokMarketingSource.OAuth20(app_id, secret, access_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(app_id, secret, access_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass TiktokMarketingSource.SandboxAccessToken(advertiser_id, access_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(advertiser_id, access_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskChatSource(name, start_date, credentials, subdomain=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials, subdomain=None)[source]\u00b6
\n

Airbyte Source for Zendesk Chat.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-chat

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Chat API, in the format YYYY-MM-DDT00:00:00Z.

  • \n
  • subdomain (Optional[str]) \u2013 Required if you access Zendesk Chat from a Zendesk Support subdomain.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskChatSource.OAuth20(client_id=None, client_secret=None, access_token=None, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(client_id=None, client_secret=None, access_token=None, refresh_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskChatSource.AccessToken(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AwsCloudtrailSource(name, aws_key_id, aws_secret_key, aws_region_name, start_date)[source]\u00b6
\n
\n
\n__init__(name, aws_key_id, aws_secret_key, aws_region_name, start_date)[source]\u00b6
\n

Airbyte Source for Aws Cloudtrail.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/aws-cloudtrail

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • aws_key_id (str) \u2013 AWS CloudTrail Access Key ID. See the docs for more information on how to obtain this key.

  • \n
  • aws_secret_key (str) \u2013 AWS CloudTrail Secret Access Key. See the docs for more information on how to obtain this key.

  • \n
  • aws_region_name (str) \u2013 The default AWS Region to use, for example, us-west-1 or us-west-2. When specifying a Region inline during client initialization, this property is named region_name.

  • \n
  • start_date (str) \u2013 The date from which you would like to replicate data. Data in AWS CloudTrail is available for the last 90 days only. Format: YYYY-MM-DD.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OktaSource(name, credentials, domain=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, domain=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Okta.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/okta

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (Optional[str]) \u2013 The Okta domain. See the docs for instructions on how to find it.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format YYYY-MM-DDTHH:MM:SSZ. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass OktaSource.OAuth20(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OktaSource.APIToken(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.InsightlySource(name, token=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, token=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Insightly.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/insightly

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • token (Optional[str]) \u2013 Your Insightly API token.

  • \n
  • start_date (Optional[str]) \u2013 The date from which you\u2019d like to replicate data for Insightly in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only for incremental streams.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LinkedinPagesSource(name, org_id, credentials)[source]\u00b6
\n
\n
\n__init__(name, org_id, credentials)[source]\u00b6
\n

Airbyte Source for Linkedin Pages.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/linkedin-pages/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • org_id (int) \u2013 Specify the Organization ID

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass LinkedinPagesSource.OAuth20(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass LinkedinPagesSource.AccessToken(access_token, auth_method=None)[source]\u00b6
\n
\n
\n__init__(access_token, auth_method=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PersistiqSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Persistiq.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/persistiq

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 PersistIq API Key. See the docs for more information on where to find that key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshcallerSource(name, domain, api_key, start_date, requests_per_minute=None, sync_lag_minutes=None)[source]\u00b6
\n
\n
\n__init__(name, domain, api_key, start_date, requests_per_minute=None, sync_lag_minutes=None)[source]\u00b6
\n

Airbyte Source for Freshcaller.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshcaller

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Used to construct Base URL for the Freshcaller APIs

  • \n
  • api_key (str) \u2013 Freshcaller API Key. See the docs for more information on how to obtain this key.

  • \n
  • requests_per_minute (Optional[int]) \u2013 The number of requests per minute that this source is allowed to use. There is a rate limit of 50 requests per minute per app per account.

  • \n
  • start_date (str) \u2013 UTC date and time. Any data created after this date will be replicated.

  • \n
  • sync_lag_minutes (Optional[int]) \u2013 Lag in minutes for each sync, i.e., at time T, data for the time range [prev_sync_time, T-30] will be fetched

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AppfollowSource(name, ext_id, cid, api_secret, country)[source]\u00b6
\n
\n
\n__init__(name, ext_id, cid, api_secret, country)[source]\u00b6
\n

Airbyte Source for Appfollow.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/appfollow

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • ext_id (str) \u2013 for App Store \u2014 this is a 9-10 digit identification number; for Google Play \u2014 this is the bundle name;

  • \n
  • cid (str) \u2013 client id provided by Appfollow

  • \n
  • api_secret (str) \u2013 api secret provided by Appfollow

  • \n
  • country (str) \u2013 getting data by Country

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FacebookPagesSource(name, access_token, page_id)[source]\u00b6
\n
\n
\n__init__(name, access_token, page_id)[source]\u00b6
\n

Airbyte Source for Facebook Pages.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-pages

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_token (str) \u2013 Facebook Page Access Token

  • \n
  • page_id (str) \u2013 Page ID

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.JiraSource(name, api_token, domain, email, projects=None, start_date=None, additional_fields=None, expand_issue_changelog=None, render_fields=None, enable_experimental_streams=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, domain, email, projects=None, start_date=None, additional_fields=None, expand_issue_changelog=None, render_fields=None, enable_experimental_streams=None)[source]\u00b6
\n

Airbyte Source for Jira.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/jira

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Jira API Token. See the docs for more information on how to generate this key.

  • \n
  • domain (str) \u2013 The Domain for your Jira account, e.g. airbyteio.atlassian.net

  • \n
  • email (str) \u2013 The user email for your Jira account.

  • \n
  • projects (Optional[List[str]]) \u2013 List of Jira project keys to replicate data for.

  • \n
  • start_date (Optional[str]) \u2013 The date from which you\u2019d like to replicate data for Jira in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated. Note that it will be used only in the following incremental streams: issues.

  • \n
  • additional_fields (Optional[List[str]]) \u2013 List of additional fields to include in replicating issues.

  • \n
  • expand_issue_changelog (Optional[bool]) \u2013 Expand the changelog when replicating issues.

  • \n
  • render_fields (Optional[bool]) \u2013 Render issue fields in HTML format in addition to Jira JSON-like format.

  • \n
  • enable_experimental_streams (Optional[bool]) \u2013 Allow the use of experimental streams which rely on undocumented Jira API endpoints. See https://docs.airbyte.com/integrations/sources/jira#experimental-tables for more info.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleSheetsSource(name, spreadsheet_id, credentials, row_batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, spreadsheet_id, credentials, row_batch_size=None)[source]\u00b6
\n

Airbyte Source for Google Sheets.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-sheets

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleSheetsSource.AuthenticateViaGoogleOAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleSheetsSource.ServiceAccountKeyAuthentication(service_account_info)[source]\u00b6
\n
\n
\n__init__(service_account_info)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DockerhubSource(name, docker_username)[source]\u00b6
\n
\n
\n__init__(name, docker_username)[source]\u00b6
\n

Airbyte Source for Dockerhub.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/dockerhub

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.UsCensusSource(name, query_path, api_key, query_params=None)[source]\u00b6
\n
\n
\n__init__(name, query_path, api_key, query_params=None)[source]\u00b6
\n

Airbyte Source for Us Census.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/us-census

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • query_params (Optional[str]) \u2013 The query parameters portion of the GET request, without the api key

  • \n
  • query_path (str) \u2013 The path portion of the GET request

  • \n
  • api_key (str) \u2013 Your API Key. Get your key here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KustomerSingerSource(name, api_token, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_token, start_date)[source]\u00b6
\n

Airbyte Source for Kustomer Singer.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/kustomer

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Kustomer API Token. See the docs on how to obtain this

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate the data

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AzureTableSource(name, storage_account_name, storage_access_key, storage_endpoint_suffix=None)[source]\u00b6
\n
\n
\n__init__(name, storage_account_name, storage_access_key, storage_endpoint_suffix=None)[source]\u00b6
\n

Airbyte Source for Azure Table.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • storage_account_name (str) \u2013 The name of your storage account.

  • \n
  • storage_access_key (str) \u2013 Azure Table Storage Access Key. See the docs for more information on how to obtain this key.

  • \n
  • storage_endpoint_suffix (Optional[str]) \u2013 Azure Table Storage service account URL suffix. See the docs for more information on how to obtain endpoint suffix

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ScaffoldJavaJdbcSource(name, host, port, database, username, replication_method, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, replication_method, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Scaffold Java Jdbc.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/scaffold_java_jdbc

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3)

  • \n
  • replication_method (str) \u2013 Replication method to use for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses the Binlog to detect inserts, updates, and deletes. This needs to be configured on the source database itself.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TidbSource(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Tidb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/tidb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3)

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.QualarooSource(name, token, key, start_date, survey_ids=None)[source]\u00b6
\n
\n
\n__init__(name, token, key, start_date, survey_ids=None)[source]\u00b6
\n

Airbyte Source for Qualaroo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/qualaroo

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • token (str) \u2013 A Qualaroo token. See the docs for instructions on how to generate it.

  • \n
  • key (str) \u2013 A Qualaroo token. See the docs for instructions on how to generate it.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • survey_ids (Optional[List[str]]) \u2013 IDs of the surveys from which you\u2019d like to replicate data. If left empty, data from all surveys to which you have access will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.YahooFinancePriceSource(name, tickers, interval=None, range=None)[source]\u00b6
\n
\n
\n__init__(name, tickers, interval=None, range=None)[source]\u00b6
\n

Airbyte Source for Yahoo Finance Price.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • tickers (str) \u2013 Comma-separated identifiers for the stocks to be queried. Whitespaces are allowed.

  • \n
  • interval (Optional[str]) \u2013 The interval between prices queried.

  • \n
  • range (Optional[str]) \u2013 The range of prices to be queried.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleAnalyticsV4Source(name, credentials, start_date, view_id, custom_reports=None, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, view_id, custom_reports=None, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Google Analytics V4.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-universal-analytics

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Union[GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth, GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication]) \u2013 Credentials for the service

  • \n
  • start_date (str) \u2013 The date in the format YYYY-MM-DD. Any data before this date will not be replicated.

  • \n
  • view_id (str) \u2013 The ID for the Google Analytics View you want to fetch data from. This can be found from the Google Analytics Account Explorer.

  • \n
  • custom_reports (Optional[str]) \u2013 A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.

  • \n
  • window_in_days (Optional[int]) \u2013 The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. The minimum allowed value for this field is 1, and the maximum is 364.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsV4Source.AuthenticateViaGoogleOauth(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsV4Source.ServiceAccountKeyAuthentication(credentials_json, auth_type=None)[source]\u00b6
\n
\n
\n__init__(credentials_json, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.JdbcSource(name, username, jdbc_url, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, username, jdbc_url, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Jdbc.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url (str) \u2013 JDBC formatted URL. See the standard here.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FakerSource(name, count, seed=None, records_per_sync=None, records_per_slice=None)[source]\u00b6
\n
\n
\n__init__(name, count, seed=None, records_per_sync=None, records_per_slice=None)[source]\u00b6
\n

Airbyte Source for Faker.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/faker

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • count (int) \u2013 How many users should be generated in total. This setting does not apply to the purchases or products stream.

  • \n
  • seed (Optional[int]) \u2013 Manually control the faker random seed to return the same values on subsequent runs (leave -1 for random)

  • \n
  • records_per_sync (Optional[int]) \u2013 How many fake records will be returned for each sync, for each stream? By default, it will take 2 syncs to create the requested 1000 records.

  • \n
  • records_per_slice (Optional[int]) \u2013 How many fake records will be in each page (stream slice), before a state message is emitted?

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TplcentralSource(name, url_base, client_id, client_secret, user_login_id=None, user_login=None, tpl_key=None, customer_id=None, facility_id=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, url_base, client_id, client_secret, user_login_id=None, user_login=None, tpl_key=None, customer_id=None, facility_id=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Tplcentral.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/tplcentral

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • user_login_id (Optional[int]) \u2013 User login ID and/or name is required

  • \n
  • user_login (Optional[str]) \u2013 User login ID and/or name is required

  • \n
  • start_date (Optional[str]) \u2013 Date and time together in RFC 3339 format, for example, 2018-11-13T20:20:39+00:00.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ClickhouseSource(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Clickhouse.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/clickhouse

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The host endpoint of the Clickhouse cluster.

  • \n
  • port (int) \u2013 The port of the database.

  • \n
  • database (str) \u2013 The name of the database.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshserviceSource(name, domain_name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, domain_name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Freshservice.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshservice

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain_name (str) \u2013 The name of your Freshservice domain

  • \n
  • api_key (str) \u2013 Freshservice API Key. See here. The key is case sensitive.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZenloopSource(name, api_token, date_from=None, survey_id=None, survey_group_id=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, date_from=None, survey_id=None, survey_group_id=None)[source]\u00b6
\n

Airbyte Source for Zenloop.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zenloop

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Zenloop API Token. You can get the API token in settings page here

  • \n
  • date_from (Optional[str]) \u2013 Zenloop date_from. Format: 2021-10-24T03:30:30Z or 2021-10-24. Leave empty if only data from current data should be synced

  • \n
  • survey_id (Optional[str]) \u2013 Zenloop Survey ID. Can be found here. Leave empty to pull answers from all surveys

  • \n
  • survey_group_id (Optional[str]) \u2013 Zenloop Survey Group ID. Can be found by pulling All Survey Groups via SurveyGroups stream. Leave empty to pull answers from all survey groups

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OracleSource(name, host, port, connection_data, username, encryption, password=None, schemas=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, connection_data, username, encryption, password=None, schemas=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Oracle.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/oracle

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database. Oracle Corporation recommends the following port numbers: 1521 - Default listening port for client connections to the listener. 2484 - Recommended and officially registered listening port for client connections to the listener using TCP/IP with SSL

  • \n
  • connection_data (Union[OracleSource.ServiceName, OracleSource.SystemIDSID]) \u2013 Connect data that will be used for DB connection

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas to sync from. Defaults to user. Case sensitive.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • encryption (Union[OracleSource.Unencrypted, OracleSource.NativeNetworkEncryptionNNE, OracleSource.TLSEncryptedVerifyCertificate]) \u2013 The encryption method which is used when communicating with the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass OracleSource.ServiceName(service_name, connection_type=None)[source]\u00b6
\n
\n
\n__init__(service_name, connection_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.SystemIDSID(sid, connection_type=None)[source]\u00b6
\n
\n
\n__init__(sid, connection_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.NativeNetworkEncryptionNNE(encryption_algorithm=None)[source]\u00b6
\n
\n
\n__init__(encryption_algorithm=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleSource.TLSEncryptedVerifyCertificate(ssl_certificate)[source]\u00b6
\n
\n
\n__init__(ssl_certificate)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KlaviyoSource(name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Klaviyo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/klaviyo

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Klaviyo API Key. See our docs if you need help finding this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleDirectorySource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Google Directory.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-directory

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleDirectorySource.SignInViaGoogleOAuth(client_id, client_secret, refresh_token, credentials_title=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, credentials_title=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleDirectorySource.ServiceAccountKey(credentials_json, email, credentials_title=None)[source]\u00b6
\n
\n
\n__init__(credentials_json, email, credentials_title=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.InstagramSource(name, start_date, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_token)[source]\u00b6
\n

Airbyte Source for Instagram.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/instagram

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for User Insights, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • access_token (str) \u2013 The value of the access token generated. See the docs for more information

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ShortioSource(name, domain_id, secret_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, domain_id, secret_key, start_date)[source]\u00b6
\n

Airbyte Source for Shortio.

\n

Documentation can be found at https://developers.short.io/reference

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • secret_key (str) \u2013 Short.io Secret Key

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SquareSource(name, is_sandbox, credentials, start_date=None, include_deleted_objects=None)[source]\u00b6
\n
\n
\n__init__(name, is_sandbox, credentials, start_date=None, include_deleted_objects=None)[source]\u00b6
\n

Airbyte Source for Square.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/square

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • is_sandbox (bool) \u2013 Determines whether to use the sandbox or production environment.

  • \n
  • start_date (Optional[str]) \u2013 UTC date in the format YYYY-MM-DD. Any data before this date will not be replicated. If not set, all data will be replicated.

  • \n
  • include_deleted_objects (Optional[bool]) \u2013 In some streams there is an option to include deleted objects (Items, Categories, Discounts, Taxes)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SquareSource.OauthAuthentication(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SquareSource.APIKey(api_key)[source]\u00b6
\n
\n
\n__init__(api_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DelightedSource(name, since, api_key)[source]\u00b6
\n
\n
\n__init__(name, since, api_key)[source]\u00b6
\n

Airbyte Source for Delighted.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • since (str) \u2013 The date from which you\u2019d like to replicate the data

  • \n
  • api_key (str) \u2013 A Delighted API key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmazonSqsSource(name, queue_url, region, delete_messages, max_batch_size=None, max_wait_time=None, attributes_to_return=None, visibility_timeout=None, access_key=None, secret_key=None)[source]\u00b6
\n
\n
\n__init__(name, queue_url, region, delete_messages, max_batch_size=None, max_wait_time=None, attributes_to_return=None, visibility_timeout=None, access_key=None, secret_key=None)[source]\u00b6
\n

Airbyte Source for Amazon Sqs.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-sqs

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • queue_url (str) \u2013 URL of the SQS Queue

  • \n
  • region (str) \u2013 AWS Region of the SQS Queue

  • \n
  • delete_messages (bool) \u2013 If Enabled, messages will be deleted from the SQS Queue after being read. If Disabled, messages are left in the queue and can be read more than once. WARNING: Enabling this option can result in data loss in cases of failure, use with caution, see documentation for more detail.

  • \n
  • max_batch_size (Optional[int]) \u2013 Max amount of messages to get in one batch (10 max)

  • \n
  • max_wait_time (Optional[int]) \u2013 Max amount of time in seconds to wait for messages in a single poll (20 max)

  • \n
  • attributes_to_return (Optional[str]) \u2013 Comma-separated list of Message Attribute names to return

  • \n
  • visibility_timeout (Optional[int]) \u2013 Modify the Visibility Timeout of the individual message from the Queue\u2019s default (seconds).

  • \n
  • access_key (Optional[str]) \u2013 The Access Key ID of the AWS IAM Role to use for pulling messages

  • \n
  • secret_key (Optional[str]) \u2013 The Secret Key of the AWS IAM Role to use for pulling messages

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.YoutubeAnalyticsSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Youtube Analytics.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/youtube-analytics

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass YoutubeAnalyticsSource.AuthenticateViaOAuth20(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ScaffoldSourcePythonSource(name, fix_me=None)[source]\u00b6
\n
\n
\n__init__(name, fix_me=None)[source]\u00b6
\n

Airbyte Source for Scaffold Source Python.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • fix_me (Optional[str]) \u2013 describe me

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LookerSource(name, domain, client_id, client_secret, run_look_ids=None)[source]\u00b6
\n
\n
\n__init__(name, domain, client_id, client_secret, run_look_ids=None)[source]\u00b6
\n

Airbyte Source for Looker.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/looker

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Domain for your Looker account, e.g. airbyte.cloud.looker.com,looker.[clientname].com,IP address

  • \n
  • client_id (str) \u2013 The Client ID is the first part of an API3 key that is specific to each Looker user. See the docs for more information on how to generate this key.

  • \n
  • client_secret (str) \u2013 The Client Secret is the second part of an API3 key.

  • \n
  • run_look_ids (Optional[List[str]]) \u2013 The IDs of any Looks to run

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GitlabSource(name, api_url, private_token, start_date, groups=None, projects=None)[source]\u00b6
\n
\n
\n__init__(name, api_url, private_token, start_date, groups=None, projects=None)[source]\u00b6
\n

Airbyte Source for Gitlab.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/gitlab

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_url (str) \u2013 Please enter your basic URL from GitLab instance.

  • \n
  • private_token (str) \u2013 Log into your GitLab account and then generate a personal Access Token.

  • \n
  • groups (Optional[str]) \u2013 Space-delimited list of groups. e.g. airbyte.io.

  • \n
  • projects (Optional[str]) \u2013 Space-delimited list of projects. e.g. airbyte.io/documentation meltano/tap-gitlab.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for GitLab API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ExchangeRatesSource(name, start_date, access_key, base=None, ignore_weekends=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_key, base=None, ignore_weekends=None)[source]\u00b6
\n

Airbyte Source for Exchange Rates.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 Start getting data from that date.

  • \n
  • access_key (str) \u2013 Your API Key. See here. The key is case sensitive.

  • \n
  • base (Optional[str]) \u2013 ISO reference currency. See here. Free plan doesn\u2019t support Source Currency Switching, default base currency is EUR

  • \n
  • ignore_weekends (Optional[bool]) \u2013 Ignore weekends? (Exchanges don\u2019t run on weekends)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmazonAdsSource(name, client_id, client_secret, refresh_token, auth_type=None, region=None, report_wait_timeout=None, report_generation_max_retries=None, start_date=None, profiles=None, state_filter=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, auth_type=None, region=None, report_wait_timeout=None, report_generation_max_retries=None, start_date=None, profiles=None, state_filter=None)[source]\u00b6
\n

Airbyte Source for Amazon Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 The client ID of your Amazon Ads developer application. See the docs for more information.

  • \n
  • client_secret (str) \u2013 The client secret of your Amazon Ads developer application. See the docs for more information.

  • \n
  • refresh_token (str) \u2013 Amazon Ads refresh token. See the docs for more information on how to obtain this token.

  • \n
  • region (Optional[str]) \u2013 Region to pull data from (EU/NA/FE). See docs for more details.

  • \n
  • report_wait_timeout (Optional[int]) \u2013 Timeout duration in minutes for Reports. Default is 60 minutes.

  • \n
  • report_generation_max_retries (Optional[int]) \u2013 Maximum retries Airbyte will attempt for fetching report data. Default is 5.

  • \n
  • start_date (Optional[str]) \u2013 The Start date for collecting reports, should not be more than 60 days in the past. In YYYY-MM-DD format

  • \n
  • profiles (Optional[List[int]]) \u2013 Profile IDs you want to fetch data for. See docs for more details.

  • \n
  • state_filter (Optional[List[str]]) \u2013 Reflects the state of the Display, Product, and Brand Campaign streams as enabled, paused, or archived. If you do not populate this field, it will be ignored completely.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MixpanelSource(name, credentials, project_id=None, attribution_window=None, project_timezone=None, select_properties_by_default=None, start_date=None, end_date=None, region=None, date_window_size=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, project_id=None, attribution_window=None, project_timezone=None, select_properties_by_default=None, start_date=None, end_date=None, region=None, date_window_size=None)[source]\u00b6
\n

Airbyte Source for Mixpanel.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mixpanel

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Union[MixpanelSource.ServiceAccount, MixpanelSource.ProjectSecret]) \u2013 Choose how to authenticate to Mixpanel

  • \n
  • project_id (Optional[int]) \u2013 Your project ID number. See the docs for more information on how to obtain this.

  • \n
  • attribution_window (Optional[int]) \u2013 A period of time for attributing results to ads and the lookback period after those actions occur during which ad results are counted. Default attribution window is 5 days.

  • \n
  • project_timezone (Optional[str]) \u2013 Time zone in which integer date times are stored. The project timezone may be found in the project settings in the Mixpanel console.

  • \n
  • select_properties_by_default (Optional[bool]) \u2013 Setting this config parameter to TRUE ensures that new properties on events and engage records are captured. Otherwise new properties will be ignored.

  • \n
  • start_date (Optional[str]) \u2013 The date in the format YYYY-MM-DD. Any data before this date will not be replicated. If this option is not set, the connector will replicate data from up to one year ago by default.

  • \n
  • end_date (Optional[str]) \u2013 The date in the format YYYY-MM-DD. Any data after this date will not be replicated. Leave empty to always sync to the most recent date.

  • \n
  • region (Optional[str]) \u2013 The region of mixpanel domain instance either US or EU.

  • \n
  • date_window_size (Optional[int]) \u2013 Defines the window size in days that is used to slice through data. You can reduce it if the amount of data in each window is too big for your environment.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MixpanelSource.ServiceAccount(username, secret)[source]\u00b6
\n
\n
\n__init__(username, secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MixpanelSource.ProjectSecret(api_secret)[source]\u00b6
\n
\n
\n__init__(api_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OrbitSource(name, api_token, workspace, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, workspace, start_date=None)[source]\u00b6
\n

Airbyte Source for Orbit.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/orbit

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Authorizes you to work with Orbit workspaces associated with the token.

  • \n
  • workspace (str) \u2013 The unique name of the workspace that your API token is associated with.

  • \n
  • start_date (Optional[str]) \u2013 Date in the format 2022-06-26. Only load members whose last activities are after this date.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmazonSellerPartnerSource(name, lwa_app_id, lwa_client_secret, refresh_token, aws_access_key, aws_secret_key, role_arn, replication_start_date, aws_environment, region, app_id=None, auth_type=None, replication_end_date=None, period_in_days=None, report_options=None, max_wait_seconds=None)[source]\u00b6
\n
\n
\n__init__(name, lwa_app_id, lwa_client_secret, refresh_token, aws_access_key, aws_secret_key, role_arn, replication_start_date, aws_environment, region, app_id=None, auth_type=None, replication_end_date=None, period_in_days=None, report_options=None, max_wait_seconds=None)[source]\u00b6
\n

Airbyte Source for Amazon Seller Partner.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amazon-seller-partner

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • app_id (Optional[str]) \u2013 Your Amazon App ID

  • \n
  • lwa_app_id (str) \u2013 Your Login with Amazon Client ID.

  • \n
  • lwa_client_secret (str) \u2013 Your Login with Amazon Client Secret.

  • \n
  • refresh_token (str) \u2013 The Refresh Token obtained via OAuth flow authorization.

  • \n
  • aws_access_key (str) \u2013 Specifies the AWS access key used as part of the credentials to authenticate the user.

  • \n
  • aws_secret_key (str) \u2013 Specifies the AWS secret key used as part of the credentials to authenticate the user.

  • \n
  • role_arn (str) \u2013 Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. (Needs permission to \u2018Assume Role\u2019 STS).

  • \n
  • replication_start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • replication_end_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data after this date will not be replicated.

  • \n
  • period_in_days (Optional[int]) \u2013 Will be used for stream slicing for initial full_refresh sync when no updated state is present for reports that support sliced incremental sync.

  • \n
  • report_options (Optional[str]) \u2013 Additional information passed to reports. This varies by report type. Must be a valid json string.

  • \n
  • max_wait_seconds (Optional[int]) \u2013 Sometimes a report can take up to 30 minutes to generate. This will set the limit for how long to wait for a successful report.

  • \n
  • aws_environment (str) \u2013 An enumeration.

  • \n
  • region (str) \u2013 An enumeration.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CourierSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Courier.

\n

Documentation can be found at https://docs.airbyte.io/integrations/sources/courier

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Courier API Key to retrieve your data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CloseComSource(name, api_key, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date=None)[source]\u00b6
\n

Airbyte Source for Close Com.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/close-com

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Close.com API key (usually starts with \u2018api\\_\u2019; find yours here).

  • \n
  • start_date (Optional[str]) \u2013 The start date to sync data. Leave blank for full sync. Format: YYYY-MM-DD.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BingAdsSource(name, client_id, refresh_token, developer_token, reports_start_date, auth_method=None, tenant_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, refresh_token, developer_token, reports_start_date, auth_method=None, tenant_id=None, client_secret=None)[source]\u00b6
\n

Airbyte Source for Bing Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bing-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • tenant_id (Optional[str]) \u2013 The Tenant ID of your Microsoft Advertising developer application. Set this to \u201ccommon\u201d unless you know you need a different value.

  • \n
  • client_id (str) \u2013 The Client ID of your Microsoft Advertising developer application.

  • \n
  • client_secret (Optional[str]) \u2013 The Client Secret of your Microsoft Advertising developer application.

  • \n
  • refresh_token (str) \u2013 Refresh Token to renew the expired Access Token.

  • \n
  • developer_token (str) \u2013 Developer token associated with user. See more info in the docs.

  • \n
  • reports_start_date (str) \u2013 The start date from which to begin replicating report data. Any data generated before this date will not be replicated in reports. This is a UTC date in YYYY-MM-DD format.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PrimetricSource(name, client_id, client_secret)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret)[source]\u00b6
\n

Airbyte Source for Primetric.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 The Client ID of your Primetric developer application. The Client ID is visible here.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Primetric developer application. You can manage your client\u2019s credentials here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PivotalTrackerSource(name, api_token)[source]\u00b6
\n
\n
\n__init__(name, api_token)[source]\u00b6
\n

Airbyte Source for Pivotal Tracker.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Pivotal Tracker API token

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ElasticsearchSource(name, endpoint, authenticationMethod)[source]\u00b6
\n
\n
\n__init__(name, endpoint, authenticationMethod)[source]\u00b6
\n

Airbyte Source for Elasticsearch.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/elasticsearch

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass ElasticsearchSource.None_[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchSource.ApiKeySecret(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n
\n__init__(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchSource.UsernamePassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BigquerySource(name, project_id, credentials_json, dataset_id=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, credentials_json, dataset_id=None)[source]\u00b6
\n

Airbyte Source for Bigquery.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bigquery

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target BigQuery dataset.

  • \n
  • dataset_id (Optional[str]) \u2013 The dataset ID to search for tables and views. If you are only loading data from one dataset, setting this option could result in much faster schema discovery.

  • \n
  • credentials_json (str) \u2013 The contents of your Service Account Key JSON file. See the docs for more information on how to obtain this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WoocommerceSource(name, shop, start_date, api_key, api_secret, conversion_window_days=None)[source]\u00b6
\n
\n
\n__init__(name, shop, start_date, api_key, api_secret, conversion_window_days=None)[source]\u00b6
\n

Airbyte Source for Woocommerce.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/woocommerce

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • shop (str) \u2013 The name of the store. For https://EXAMPLE.com, the shop name is \u2018EXAMPLE.com\u2019.

  • \n
  • start_date (str) \u2013 The date you would like to replicate data. Format: YYYY-MM-DD.

  • \n
  • api_key (str) \u2013 The CUSTOMER KEY for API in WooCommerce shop.

  • \n
  • api_secret (str) \u2013 The CUSTOMER SECRET for API in WooCommerce shop.

  • \n
  • conversion_window_days (Optional[int]) \u2013 A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SearchMetricsSource(name, api_key, client_secret, country_code, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, client_secret, country_code, start_date)[source]\u00b6
\n

Airbyte Source for Search Metrics.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/search-metrics

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • country_code (str) \u2013 The region of the S3 staging bucket to use if utilising a copy strategy.

  • \n
  • start_date (str) \u2013 Data generated in SearchMetrics after this date will be replicated. This date must be specified in the format YYYY-MM-DDT00:00:00Z.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TypeformSource(name, start_date, token, form_ids=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, token, form_ids=None)[source]\u00b6
\n

Airbyte Source for Typeform.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/typeform

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date and time in the format: YYYY-MM-DDTHH:mm:ss[Z]. Any data before this date will not be replicated.

  • \n
  • token (str) \u2013 The API Token for a Typeform account.

  • \n
  • form_ids (Optional[List[str]]) \u2013 When this parameter is set, the connector will replicate data only from the input forms. Otherwise, all forms in your Typeform account will be replicated. You can find form IDs in your form URLs. For example, in the URL \u201chttps://mysite.typeform.com/to/u6nXL7\u201d the form_id is u6nXL7. You can find form URLs on Share panel

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WebflowSource(name, site_id, api_key)[source]\u00b6
\n
\n
\n__init__(name, site_id, api_key)[source]\u00b6
\n

Airbyte Source for Webflow.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/webflow

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FireboltSource(name, username, password, database, account=None, host=None, engine=None)[source]\u00b6
\n
\n
\n__init__(name, username, password, database, account=None, host=None, engine=None)[source]\u00b6
\n

Airbyte Source for Firebolt.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/firebolt

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • username (str) \u2013 Firebolt email address you use to login.

  • \n
  • password (str) \u2013 Firebolt password.

  • \n
  • account (Optional[str]) \u2013 Firebolt account to login.

  • \n
  • host (Optional[str]) \u2013 The host name of your Firebolt database.

  • \n
  • database (str) \u2013 The database to connect to.

  • \n
  • engine (Optional[str]) \u2013 Engine name or url to connect to.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FaunaSource(name, domain, port, scheme, secret, collection)[source]\u00b6
\n
\n
\n__init__(name, domain, port, scheme, secret, collection)[source]\u00b6
\n

Airbyte Source for Fauna.

\n

Documentation can be found at https://github.com/fauna/airbyte/blob/source-fauna/docs/integrations/sources/fauna.md

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Domain of Fauna to query. Defaults to db.fauna.com. See the docs.

  • \n
  • port (int) \u2013 Endpoint port.

  • \n
  • scheme (str) \u2013 URL scheme.

  • \n
  • secret (str) \u2013 Fauna secret, used when authenticating with the database.

  • \n
  • collection (FaunaSource.Collection) \u2013 Settings for the Fauna Collection.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass FaunaSource.Disabled[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FaunaSource.Enabled(column)[source]\u00b6
\n
\n
\n__init__(column)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FaunaSource.Collection(page_size, deletions)[source]\u00b6
\n
\n
\n__init__(page_size, deletions)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.IntercomSource(name, start_date, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_token)[source]\u00b6
\n

Airbyte Source for Intercom.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/intercom

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • access_token (str) \u2013 Access token for making authenticated requests. See the Intercom docs for more information.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshsalesSource(name, domain_name, api_key)[source]\u00b6
\n
\n
\n__init__(name, domain_name, api_key)[source]\u00b6
\n

Airbyte Source for Freshsales.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshsales

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain_name (str) \u2013 The Name of your Freshsales domain

  • \n
  • api_key (str) \u2013 Freshsales API Key. See here. The key is case sensitive.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AdjustSource(name, api_token, dimensions, ingest_start, metrics, additional_metrics=None, until_today=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, dimensions, ingest_start, metrics, additional_metrics=None, until_today=None)[source]\u00b6
\n

Airbyte Source for Adjust.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/adjust

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • additional_metrics (Optional[List[str]]) \u2013 Metrics names that are not pre-defined, such as cohort metrics or app specific metrics.

  • \n
  • api_token (str) \u2013 Adjust API key, see https://help.adjust.com/en/article/report-service-api-authentication

  • \n
  • dimensions (List[str]) \u2013 Dimensions allow a user to break down metrics into groups using one or several parameters. For example, the number of installs by date, country and network. See https://help.adjust.com/en/article/reports-endpoint#dimensions for more information about the dimensions.

  • \n
  • ingest_start (str) \u2013 Data ingest start date.

  • \n
  • metrics (List[str]) \u2013 Select at least one metric to query.

  • \n
  • until_today (Optional[bool]) \u2013 Syncs data up until today. Useful when running daily incremental syncs, and duplicates are not desired.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BambooHrSource(name, subdomain, api_key, custom_reports_fields=None, custom_reports_include_default_fields=None)[source]\u00b6
\n
\n
\n__init__(name, subdomain, api_key, custom_reports_fields=None, custom_reports_include_default_fields=None)[source]\u00b6
\n

Airbyte Source for Bamboo Hr.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/bamboo-hr

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • subdomain (str) \u2013 Sub Domain of bamboo hr

  • \n
  • api_key (str) \u2013 Api key of bamboo hr

  • \n
  • custom_reports_fields (Optional[str]) \u2013 Comma-separated list of fields to include in custom reports.

  • \n
  • custom_reports_include_default_fields (Optional[bool]) \u2013 If true, the custom reports endpoint will include the default fields defined here: https://documentation.bamboohr.com/docs/list-of-field-names.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleAdsSource(name, credentials, customer_id, start_date, end_date=None, custom_queries=None, login_customer_id=None, conversion_window_days=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, customer_id, start_date, end_date=None, custom_queries=None, login_customer_id=None, conversion_window_days=None)[source]\u00b6
\n

Airbyte Source for Google Ads.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-ads

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • customer_id (str) \u2013 Comma separated list of (client) customer IDs. Each customer ID must be specified as a 10-digit number without dashes. More instruction on how to find this value in our docs. Metrics streams like AdGroupAdReport cannot be requested for a manager account.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
  • end_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.

  • \n
  • login_customer_id (Optional[str]) \u2013 If your access to the customer account is through a manager account, this field is required and must be set to the customer ID of the manager account (10-digit number without dashes). You can find more information about this field here

  • \n
  • conversion_window_days (Optional[int]) \u2013 A conversion window is the period of time after an ad interaction (such as an ad click or video view) during which a conversion, such as a purchase, is recorded in Google Ads. For more information, see Google\u2019s documentation.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleAdsSource.GoogleCredentials(developer_token, client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n
\n__init__(developer_token, client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleAdsSource.CustomGAQLQueriesEntry(query, table_name)[source]\u00b6
\n
\n
\n__init__(query, table_name)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HellobatonSource(name, api_key, company)[source]\u00b6
\n
\n
\n__init__(name, api_key, company)[source]\u00b6
\n

Airbyte Source for Hellobaton.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 authentication key required to access the api endpoints

  • \n
  • company (str) \u2013 Company name that generates your base api url

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SendgridSource(name, apikey, start_time)[source]\u00b6
\n
\n
\n__init__(name, apikey, start_time)[source]\u00b6
\n

Airbyte Source for Sendgrid.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/sendgrid

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • apikey (str) \u2013 API Key, use admin to generate this key.

  • \n
  • start_time (Union[int, str]) \u2013 Start time in ISO8601 format. Any data before this time point will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MondaySource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Monday.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/monday

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass MondaySource.OAuth20(client_id, client_secret, access_token, subdomain=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token, subdomain=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MondaySource.APIToken(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DixaSource(name, api_token, start_date, batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, api_token, start_date, batch_size=None)[source]\u00b6
\n

Airbyte Source for Dixa.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/dixa

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Dixa API token

  • \n
  • start_date (str) \u2013 The connector pulls records updated from this date onwards.

  • \n
  • batch_size (Optional[int]) \u2013 Number of days to batch into one request. Max 31.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SalesforceSource(name, client_id, client_secret, refresh_token, is_sandbox=None, auth_type=None, start_date=None, streams_criteria=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, is_sandbox=None, auth_type=None, start_date=None, streams_criteria=None)[source]\u00b6
\n

Airbyte Source for Salesforce.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/salesforce

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • is_sandbox (Optional[bool]) \u2013 Toggle if you\u2019re using a Salesforce Sandbox

  • \n
  • client_id (str) \u2013 Enter your Salesforce developer application\u2019s Client ID

  • \n
  • client_secret (str) \u2013 Enter your Salesforce developer application\u2019s Client secret

  • \n
  • refresh_token (str) \u2013 Enter your application\u2019s Salesforce Refresh Token used for Airbyte to access your Salesforce account.

  • \n
  • start_date (Optional[str]) \u2013 Enter the date in the YYYY-MM-DD format. Airbyte will replicate the data added on and after this date. If this field is blank, Airbyte will replicate all data.

  • \n
  • streams_criteria (Optional[List[SalesforceSource.FilterSalesforceObjectsEntry]]) \u2013 Filter streams relevant to you

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SalesforceSource.FilterSalesforceObjectsEntry(criteria, value)[source]\u00b6
\n
\n
\n__init__(criteria, value)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PipedriveSource(name, authorization, replication_start_date)[source]\u00b6
\n
\n
\n__init__(name, authorization, replication_start_date)[source]\u00b6
\n

Airbyte Source for Pipedrive.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/pipedrive

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • authorization (Union[PipedriveSource.SignInViaPipedriveOAuth, PipedriveSource.APIKeyAuthentication]) \u2013 Choose one of the possible authorization methods

  • \n
  • replication_start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. When specified and not None, then stream will behave as incremental

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PipedriveSource.SignInViaPipedriveOAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PipedriveSource.APIKeyAuthentication(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FileSource(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n
\n
\n__init__(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n

Airbyte Source for File.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/file

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass FileSource.HTTPSPublicWeb(user_agent=None)[source]\u00b6
\n
\n
\n__init__(user_agent=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.GCSGoogleCloudStorage(service_account_json=None)[source]\u00b6
\n
\n
\n__init__(service_account_json=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.S3AmazonWebServices(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n
\n__init__(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.AzBlobAzureBlobStorage(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n
\n__init__(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.SSHSecureShell(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.SCPSecureCopyProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.SFTPSecureFileTransferProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSource.LocalFilesystemLimited[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GlassfrogSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Glassfrog.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/glassfrog

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 API key provided by Glassfrog

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ChartmogulSource(name, api_key, start_date, interval)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date, interval)[source]\u00b6
\n

Airbyte Source for Chartmogul.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/chartmogul

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Chartmogul API key

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. When feasible, any data before this date will not be replicated.

  • \n
  • interval (str) \u2013 Some APIs such as Metrics require intervals to cluster data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OrbSource(name, api_key, start_date=None, lookback_window_days=None, string_event_properties_keys=None, numeric_event_properties_keys=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date=None, lookback_window_days=None, string_event_properties_keys=None, numeric_event_properties_keys=None)[source]\u00b6
\n

Airbyte Source for Orb.

\n

Documentation can be found at https://docs.withorb.com/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Orb API Key, issued from the Orb admin console.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2022-03-01T00:00:00Z. Any data with created_at before this date will not be synced.

  • \n
  • lookback_window_days (Optional[int]) \u2013 When set to N, the connector will always refresh resources created within the past N days. By default, updated objects that are not newly created are not incrementally synced.

  • \n
  • string_event_properties_keys (Optional[List[str]]) \u2013 Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.

  • \n
  • numeric_event_properties_keys (Optional[List[str]]) \u2013 Property key names to extract from all events, in order to enrich ledger entries corresponding to an event deduction.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.CockroachdbSource(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Cockroachdb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/cockroachdb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt client/server communications for increased security.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ConfluenceSource(name, api_token, domain_name, email)[source]\u00b6
\n
\n
\n__init__(name, api_token, domain_name, email)[source]\u00b6
\n

Airbyte Source for Confluence.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PlaidSource(name, access_token, api_key, client_id, plaid_env, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, api_key, client_id, plaid_env, start_date=None)[source]\u00b6
\n

Airbyte Source for Plaid.

\n

Documentation can be found at https://plaid.com/docs/api/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • access_token (str) \u2013 The end-user\u2019s Link access token.

  • \n
  • api_key (str) \u2013 The Plaid API key to use to hit the API.

  • \n
  • client_id (str) \u2013 The Plaid client id

  • \n
  • plaid_env (str) \u2013 The Plaid environment

  • \n
  • start_date (Optional[str]) \u2013 The date from which you\u2019d like to replicate data for Plaid in the format YYYY-MM-DD. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SnapchatMarketingSource(name, client_id, client_secret, refresh_token, start_date=None, end_date=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, start_date=None, end_date=None)[source]\u00b6
\n

Airbyte Source for Snapchat Marketing.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/snapchat-marketing

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • client_id (str) \u2013 The Client ID of your Snapchat developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Snapchat developer application.

  • \n
  • refresh_token (str) \u2013 Refresh Token to renew the expired Access Token.

  • \n
  • start_date (Optional[str]) \u2013 Date in the format 2022-01-01. Any data before this date will not be replicated.

  • \n
  • end_date (Optional[str]) \u2013 Date in the format 2017-01-25. Any data after this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MicrosoftTeamsSource(name, period, credentials)[source]\u00b6
\n
\n
\n__init__(name, period, credentials)[source]\u00b6
\n

Airbyte Source for Microsoft Teams.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/microsoft-teams

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass MicrosoftTeamsSource.AuthenticateViaMicrosoftOAuth20(tenant_id, client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(tenant_id, client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MicrosoftTeamsSource.AuthenticateViaMicrosoft(tenant_id, client_id, client_secret, auth_type=None)[source]\u00b6
\n
\n
\n__init__(tenant_id, client_id, client_secret, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LeverHiringSource(name, credentials, start_date, environment=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, environment=None)[source]\u00b6
\n

Airbyte Source for Lever Hiring.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/lever-hiring

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • credentials (LeverHiringSource.OAuthCredentials) \u2013 Choose how to authenticate to Lever Hiring.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Note that it will be used only in the following incremental streams: comments, commits, and issues.

  • \n
  • environment (Optional[str]) \u2013 The environment in which you\u2019d like to replicate data for Lever. This is used to determine which Lever API endpoint to use.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass LeverHiringSource.OAuthCredentials(refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TwilioSource(name, account_sid, auth_token, start_date, lookback_window=None)[source]\u00b6
\n
\n
\n__init__(name, account_sid, auth_token, start_date, lookback_window=None)[source]\u00b6
\n

Airbyte Source for Twilio.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/twilio

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • account_sid (str) \u2013 Twilio account SID

  • \n
  • auth_token (str) \u2013 Twilio Auth Token.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2020-10-01T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • lookback_window (Optional[int]) \u2013 How far into the past to look for records. (in minutes)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.StripeSource(name, account_id, client_secret, start_date, lookback_window_days=None, slice_range=None)[source]\u00b6
\n
\n
\n__init__(name, account_id, client_secret, start_date, lookback_window_days=None, slice_range=None)[source]\u00b6
\n

Airbyte Source for Stripe.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/stripe

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • account_id (str) \u2013 Your Stripe account ID (starts with \u2018acct_\u2019, find yours here).

  • \n
  • client_secret (str) \u2013 Stripe API key (usually starts with \u2018sk_live_\u2019; find yours here).

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Only data generated after this date will be replicated.

  • \n
  • lookback_window_days (Optional[int]) \u2013 When set, the connector will always re-export data from the past N days, where N is the value set here. This is useful if your data is frequently updated after creation. More info here

  • \n
  • slice_range (Optional[int]) \u2013 The time increment used by the connector when requesting data from the Stripe API. The bigger the value is, the fewer requests will be made and the faster the sync will be. On the other hand, the state is persisted less often.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.Db2Source(name, host, port, db, username, password, encryption, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, db, username, password, encryption, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Db2.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/db2

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 Host of the Db2.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • db (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • encryption (Union[Db2Source.Unencrypted, Db2Source.TLSEncryptedVerifyCertificate]) \u2013 Encryption method to use when communicating with the database

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass Db2Source.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass Db2Source.TLSEncryptedVerifyCertificate(ssl_certificate, key_store_password=None)[source]\u00b6
\n
\n
\n__init__(ssl_certificate, key_store_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SlackSource(name, start_date, lookback_window, join_channels, credentials, channel_filter=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, lookback_window, join_channels, credentials, channel_filter=None)[source]\u00b6
\n

Airbyte Source for Slack.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/slack

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • lookback_window (int) \u2013 How far into the past to look for messages in threads.

  • \n
  • join_channels (bool) \u2013 Whether to join all channels or to sync data only from channels the bot is already in. If false, you\u2019ll need to manually add the bot to all the channels from which you\u2019d like to sync messages.

  • \n
  • channel_filter (Optional[List[str]]) \u2013 A channel name list (without leading \u2018#\u2019 char) which limits the channels from which you\u2019d like to sync. Empty list means no filter.

  • \n
  • credentials (Union[SlackSource.DefaultOAuth20Authorization, SlackSource.APITokenCredentials]) \u2013 Choose how to authenticate into Slack

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SlackSource.DefaultOAuth20Authorization(client_id, client_secret, access_token, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token, refresh_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SlackSource.APITokenCredentials(api_token)[source]\u00b6
\n
\n
\n__init__(api_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RechargeSource(name, start_date, access_token)[source]\u00b6
\n
\n
\n__init__(name, start_date, access_token)[source]\u00b6
\n

Airbyte Source for Recharge.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/recharge

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Recharge API, in the format YYYY-MM-DDT00:00:00Z. Any data before this date will not be replicated.

  • \n
  • access_token (str) \u2013 The value of the Access Token generated. See the docs for more information.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OpenweatherSource(name, lat, lon, appid, units=None, lang=None)[source]\u00b6
\n
\n
\n__init__(name, lat, lon, appid, units=None, lang=None)[source]\u00b6
\n

Airbyte Source for Openweather.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • lat (str) \u2013 Latitude for which you want to get weather conditions. (min -90, max 90)

  • \n
  • lon (str) \u2013 Longitude for which you want to get weather conditions. (min -180, max 180)

  • \n
  • appid (str) \u2013 Your OpenWeather API Key. See here. The key is case sensitive.

  • \n
  • units (Optional[str]) \u2013 Units of measurement. standard, metric and imperial units are available. If you do not use the units parameter, standard units will be applied by default.

  • \n
  • lang (Optional[str]) \u2013 You can use lang parameter to get the output in your language. The contents of the description field will be translated. See here for the list of supported languages.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RetentlySource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Retently.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass RetentlySource.AuthenticateViaRetentlyOAuth(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RetentlySource.AuthenticateWithAPIToken(api_key, auth_type=None)[source]\u00b6
\n
\n
\n__init__(api_key, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ScaffoldSourceHttpSource(name, TODO)[source]\u00b6
\n
\n
\n__init__(name, TODO)[source]\u00b6
\n

Airbyte Source for Scaffold Source Http.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • TODO (str) \u2013 describe me

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.YandexMetricaSource(name, auth_token, counter_id, start_date, end_date)[source]\u00b6
\n
\n
\n__init__(name, auth_token, counter_id, start_date, end_date)[source]\u00b6
\n

Airbyte Source for Yandex Metrica.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • auth_token (str) \u2013 Your Yandex Metrica API access token

  • \n
  • counter_id (str) \u2013 Counter ID

  • \n
  • start_date (str) \u2013 UTC date and time in the format YYYY-MM-DD.

  • \n
  • end_date (str) \u2013 UTC date and time in the format YYYY-MM-DD.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TalkdeskExploreSource(name, start_date, auth_url, api_key, timezone=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, auth_url, api_key, timezone=None)[source]\u00b6
\n

Airbyte Source for Talkdesk Explore.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Talkdesk Explore API, in the format YYYY-MM-DDT00:00:00. All data generated after this date will be replicated.

  • \n
  • timezone (Optional[str]) \u2013 Timezone to use when generating reports. Only IANA timezones are supported (https://nodatime.org/TimeZones)

  • \n
  • auth_url (str) \u2013 Talkdesk Auth URL. Only \u2018client_credentials\u2019 auth type supported at the moment.

  • \n
  • api_key (str) \u2013 Talkdesk API key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ChargifySource(name, api_key, domain)[source]\u00b6
\n
\n
\n__init__(name, api_key, domain)[source]\u00b6
\n

Airbyte Source for Chargify.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/chargify

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • api_key (str) \u2013 Chargify API Key.

  • \n
  • domain (str) \u2013 Chargify domain. Normally this domain follows the following format companyname.chargify.com

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RkiCovidSource(name, start_date)[source]\u00b6
\n
\n
\n__init__(name, start_date)[source]\u00b6
\n

Airbyte Source for Rki Covid.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/rki-covid

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • start_date (str) \u2013 UTC date in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PostgresSource(name, host, port, database, username, ssl_mode, replication_method, tunnel_method, schemas=None, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, ssl_mode, replication_method, tunnel_method, schemas=None, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Postgres.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas (case sensitive) to sync from. Defaults to public.

  • \n
  • username (str) \u2013 Username to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (Eg. key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL. When activating SSL, please select one of the connection modes.

  • \n
  • ssl_mode (Union[PostgresSource.Disable, PostgresSource.Allow, PostgresSource.Prefer, PostgresSource.Require, PostgresSource.VerifyCa, PostgresSource.VerifyFull]) \u2013 SSL connection modes. disable - Disables encryption of communication between Airbyte and source database; allow - Enables encryption only when required by the source database; prefer - Allows unencrypted connection only if the source database does not support encryption; require - Always requires encryption. If the source database server does not support encryption, connection will fail; verify-ca - Always requires encryption and verifies that the source database server has a valid SSL certificate; verify-full - This is the most secure mode. Always requires encryption and verifies the identity of the source database server. Read more in the docs.

  • \n
  • replication_method (Union[PostgresSource.Standard, PostgresSource.LogicalReplicationCDC]) \u2013 Replication method for extracting data from the database.

  • \n
  • tunnel_method (Union[PostgresSource.NoTunnel, PostgresSource.SSHKeyAuthentication, PostgresSource.PasswordAuthentication]) \u2013 Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PostgresSource.Disable[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Allow[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Prefer[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Require[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.VerifyCa(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.VerifyFull(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.LogicalReplicationCDC(replication_slot, publication, plugin=None, initial_waiting_seconds=None)[source]\u00b6
\n
\n
\n__init__(replication_slot, publication, plugin=None, initial_waiting_seconds=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.NoTunnel[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.SSHKeyAuthentication(tunnel_host, tunnel_port, tunnel_user, ssh_key)[source]\u00b6
\n
\n
\n__init__(tunnel_host, tunnel_port, tunnel_user, ssh_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresSource.PasswordAuthentication(tunnel_host, tunnel_port, tunnel_user, tunnel_user_password)[source]\u00b6
\n
\n
\n__init__(tunnel_host, tunnel_port, tunnel_user, tunnel_user_password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TrelloSource(name, token, key, start_date, board_ids=None)[source]\u00b6
\n
\n
\n__init__(name, token, key, start_date, board_ids=None)[source]\u00b6
\n

Airbyte Source for Trello.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/trello

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • token (str) \u2013 Trello API token. See the docs for instructions on how to generate it.

  • \n
  • key (str) \u2013 Trello API key. See the docs for instructions on how to generate it.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • board_ids (Optional[List[str]]) \u2013 IDs of the boards to replicate data from. If left empty, data from all boards to which you have access will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PrestashopSource(name, url, access_key)[source]\u00b6
\n
\n
\n__init__(name, url, access_key)[source]\u00b6
\n

Airbyte Source for Prestashop.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • url (str) \u2013 Shop URL without trailing slash (domain name or IP address)

  • \n
  • access_key (str) \u2013 Your PrestaShop access key. See the docs for info on how to obtain this.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PaystackSource(name, secret_key, start_date, lookback_window_days=None)[source]\u00b6
\n
\n
\n__init__(name, secret_key, start_date, lookback_window_days=None)[source]\u00b6
\n

Airbyte Source for Paystack.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/paystack

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • secret_key (str) \u2013 The Paystack API key (usually starts with \u2018sk_live_\u2019; find yours here).

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • lookback_window_days (Optional[int]) \u2013 When set, the connector will always reload data from the past N days, where N is the value set here. This is useful if your data is updated after creation.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.S3Source(name, dataset, path_pattern, format, provider, schema=None)[source]\u00b6
\n
\n
\n__init__(name, dataset, path_pattern, format, provider, schema=None)[source]\u00b6
\n

Airbyte Source for S3.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/s3

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • dataset (str) \u2013 The name of the stream you would like this source to output. Can contain letters, numbers, or underscores.

  • \n
  • path_pattern (str) \u2013 A regular expression which tells the connector which files to replicate. All files which match this pattern will be replicated. Use | to separate multiple patterns. See this page to understand pattern syntax (GLOBSTAR and SPLIT flags are enabled). Use pattern ** to pick up all files.

  • \n
  • format (Union[S3Source.CSV, S3Source.Parquet, S3Source.Avro, S3Source.Jsonl]) \u2013 The format of the files you\u2019d like to replicate

  • \n
  • schema (Optional[str]) \u2013 Optionally provide a schema to enforce, as a valid JSON string. Ensure this is a mapping of { \u201ccolumn\u201d : \u201ctype\u201d }, where types are valid JSON Schema datatypes. Leave as {} to auto-infer the schema.

  • \n
  • provider (S3Source.S3AmazonWebServices) \u2013 Use this to load files from S3 or S3-compatible services

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass S3Source.CSV(filetype=None, delimiter=None, infer_datatypes=None, quote_char=None, escape_char=None, encoding=None, double_quote=None, newlines_in_values=None, additional_reader_options=None, advanced_options=None, block_size=None)[source]\u00b6
\n
\n
\n__init__(filetype=None, delimiter=None, infer_datatypes=None, quote_char=None, escape_char=None, encoding=None, double_quote=None, newlines_in_values=None, additional_reader_options=None, advanced_options=None, block_size=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.Parquet(filetype=None, columns=None, batch_size=None, buffer_size=None)[source]\u00b6
\n
\n
\n__init__(filetype=None, columns=None, batch_size=None, buffer_size=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.Avro(filetype=None)[source]\u00b6
\n
\n
\n__init__(filetype=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.Jsonl(filetype=None, newlines_in_values=None, unexpected_field_behavior=None, block_size=None)[source]\u00b6
\n
\n
\n__init__(filetype=None, newlines_in_values=None, unexpected_field_behavior=None, block_size=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Source.S3AmazonWebServices(bucket, aws_access_key_id=None, aws_secret_access_key=None, path_prefix=None, endpoint=None)[source]\u00b6
\n
\n
\n__init__(bucket, aws_access_key_id=None, aws_secret_access_key=None, path_prefix=None, endpoint=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SnowflakeSource(name, credentials, host, role, warehouse, database, schema, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, host, role, warehouse, database, schema, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Snowflake.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/snowflake

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the source.

  • \n
  • host (str) \u2013 The host domain of the snowflake instance (must include the account, region, cloud environment, and end with snowflakecomputing.com).

  • \n
  • role (str) \u2013 The role you created for Airbyte to access Snowflake.

  • \n
  • warehouse (str) \u2013 The warehouse you created for Airbyte to access data.

  • \n
  • database (str) \u2013 The database you created for Airbyte to access data.

  • \n
  • schema (str) \u2013 The source Snowflake schema tables.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SnowflakeSource.OAuth20(client_id, client_secret, access_token=None, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token=None, refresh_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeSource.UsernameAndPassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AmplitudeSource(name, api_key, secret_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, secret_key, start_date)[source]\u00b6
\n

Airbyte Source for Amplitude.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/amplitude

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Amplitude API Key. See the setup guide for more information on how to obtain this key.

  • \n
  • secret_key (str) \u2013 Amplitude Secret Key. See the setup guide for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PosthogSource(name, start_date, api_key, base_url=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, api_key, base_url=None)[source]\u00b6
\n

Airbyte Source for Posthog.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/posthog

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate the data. Any data before this date will not be replicated.

  • \n
  • api_key (str) \u2013 API Key. See the docs for information on how to generate this key.

  • \n
  • base_url (Optional[str]) \u2013 Base PostHog url. Defaults to PostHog Cloud (https://app.posthog.com).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PaypalTransactionSource(name, start_date, is_sandbox, client_id=None, client_secret=None, refresh_token=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, is_sandbox, client_id=None, client_secret=None, refresh_token=None)[source]\u00b6
\n

Airbyte Source for Paypal Transaction.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/paypal-transactions

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (Optional[str]) \u2013 The Client ID of your Paypal developer application.

  • \n
  • client_secret (Optional[str]) \u2013 The Client Secret of your Paypal developer application.

  • \n
  • refresh_token (Optional[str]) \u2013 The key to refresh the expired access token.

  • \n
  • start_date (str) \u2013 Start date for data extraction, in ISO format. The date must fall between 3 years and 12 hours before the present time.

  • \n
  • is_sandbox (bool) \u2013 Determines whether to use the sandbox or production environment.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MssqlSource(name, host, port, database, username, ssl_method, replication_method, schemas=None, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, ssl_method, replication_method, schemas=None, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Mssql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mssql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The hostname of the database.

  • \n
  • port (int) \u2013 The port of the database.

  • \n
  • database (str) \u2013 The name of the database.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas to sync from. Defaults to user. Case sensitive.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • ssl_method (Union[MssqlSource.Unencrypted, MssqlSource.EncryptedTrustServerCertificate, MssqlSource.EncryptedVerifyCertificate]) \u2013 The encryption method which is used when communicating with the database.

  • \n
  • replication_method (Union[MssqlSource.Standard, MssqlSource.LogicalReplicationCDC]) \u2013 The replication method used for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses {TBC} to detect inserts, updates, and deletes. This needs to be configured on the source database itself.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MssqlSource.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.EncryptedTrustServerCertificate[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.EncryptedVerifyCertificate(hostNameInCertificate=None)[source]\u00b6
\n
\n
\n__init__(hostNameInCertificate=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlSource.LogicalReplicationCDC(data_to_sync=None, snapshot_isolation=None)[source]\u00b6
\n
\n
\n__init__(data_to_sync=None, snapshot_isolation=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZohoCrmSource(name, client_id, client_secret, refresh_token, dc_region, environment, edition, start_datetime=None)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, dc_region, environment, edition, start_datetime=None)[source]\u00b6
\n

Airbyte Source for Zoho Crm.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zoho-crm

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 OAuth2.0 Client ID

  • \n
  • client_secret (str) \u2013 OAuth2.0 Client Secret

  • \n
  • refresh_token (str) \u2013 OAuth2.0 Refresh Token

  • \n
  • dc_region (str) \u2013 Please choose the region of your Data Center location. More information is available at this link.

  • \n
  • environment (str) \u2013 Please choose the environment

  • \n
  • start_datetime (Optional[str]) \u2013 ISO 8601, for instance: YYYY-MM-DD, YYYY-MM-DD HH:MM:SS+HH:MM

  • \n
  • edition (str) \u2013 Choose your Edition of Zoho CRM to determine API Concurrency Limits

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RedshiftSource(name, host, port, database, username, password, schemas=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password, schemas=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Source for Redshift.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/redshift

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com).

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schemas (Optional[List[str]]) \u2013 The list of schemas to sync from. Specify one or more explicitly or keep empty to process all schemas. Schema names are case sensitive.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AsanaSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Asana.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass AsanaSource.PATCredentials(personal_access_token)[source]\u00b6
\n
\n
\n__init__(personal_access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass AsanaSource.OAuthCredentials(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SmartsheetsSource(name, access_token, spreadsheet_id, start_datetime=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, spreadsheet_id, start_datetime=None)[source]\u00b6
\n

Airbyte Source for Smartsheets.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/smartsheets

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_token (str) \u2013 The access token to use for accessing your data from Smartsheets. This access token must be generated by a user with at least read access to the data you\u2019d like to replicate. Generate an access token in the Smartsheets main menu by clicking Account > Apps & Integrations > API Access. See the setup guide for information on how to obtain this token.

  • \n
  • spreadsheet_id (str) \u2013 The spreadsheet ID. Find it by opening the spreadsheet then navigating to File > Properties

  • \n
  • start_datetime (Optional[str]) \u2013 Only rows modified after this date/time will be replicated. This should be an ISO 8601 string, for instance: 2000-01-01T13:00:00

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MailchimpSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Mailchimp.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mailchimp

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass MailchimpSource.OAuth20(access_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(access_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MailchimpSource.APIKey(apikey)[source]\u00b6
\n
\n
\n__init__(apikey)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SentrySource(name, auth_token, organization, project, hostname=None, discover_fields=None)[source]\u00b6
\n
\n
\n__init__(name, auth_token, organization, project, hostname=None, discover_fields=None)[source]\u00b6
\n

Airbyte Source for Sentry.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/sentry

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • auth_token (str) \u2013 Log into Sentry and then create authentication tokens. For self-hosted, you can find or create authentication tokens by visiting \u201c{instance_url_prefix}/settings/account/api/auth-tokens/\u201d.

  • \n
  • hostname (Optional[str]) \u2013 Host name of Sentry API server. For self-hosted, specify your host name here. Otherwise, leave it empty.

  • \n
  • organization (str) \u2013 The slug of the organization the groups belong to.

  • \n
  • project (str) \u2013 The name (slug) of the Project you want to sync.

  • \n
  • discover_fields (Optional[List[str]]) \u2013 Fields to retrieve when fetching discover events

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MailgunSource(name, private_key, domain_region=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, private_key, domain_region=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Mailgun.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mailgun

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • private_key (str) \u2013 Primary account API key to access your Mailgun data.

  • \n
  • domain_region (Optional[str]) \u2013 Domain region code. \u2018EU\u2019 or \u2018US\u2019 are possible values. The default is \u2018US\u2019.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2020-10-01 00:00:00. Any data before this date will not be replicated. If omitted, defaults to 3 days ago.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OnesignalSource(name, user_auth_key, start_date, outcome_names)[source]\u00b6
\n
\n
\n__init__(name, user_auth_key, start_date, outcome_names)[source]\u00b6
\n

Airbyte Source for Onesignal.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/onesignal

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • user_auth_key (str) \u2013 OneSignal User Auth Key, see the docs for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for OneSignal API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • outcome_names (str) \u2013 Comma-separated list of names and the value (sum/count) for the returned outcome data. See the docs for more details

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PythonHttpTutorialSource(name, start_date, base, access_key=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, base, access_key=None)[source]\u00b6
\n

Airbyte Source for Python Http Tutorial.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/exchangeratesapi

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_key (Optional[str]) \u2013 API access key used to retrieve data from the Exchange Rates API.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
  • base (str) \u2013 ISO reference currency. See here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.AirtableSource(name, api_key, base_id, tables)[source]\u00b6
\n
\n
\n__init__(name, api_key, base_id, tables)[source]\u00b6
\n

Airbyte Source for Airtable.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/airtable

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 The API Key for the Airtable account. See the Support Guide for more information on how to obtain this key.

  • \n
  • base_id (str) \u2013 The Base ID to integrate the data from. You can find the Base ID following the link Airtable API, log in to your account, select the base you need and find Base ID in the docs.

  • \n
  • tables (List[str]) \u2013 The list of Tables to integrate.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MongodbV2Source(name, instance_type, database, user=None, password=None, auth_source=None)[source]\u00b6
\n
\n
\n__init__(name, instance_type, database, user=None, password=None, auth_source=None)[source]\u00b6
\n

Airbyte Source for Mongodb V2.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mongodb-v2

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • instance_type (Union[MongodbV2Source.StandaloneMongoDbInstance, MongodbV2Source.ReplicaSet, MongodbV2Source.MongoDBAtlas]) \u2013 The MongoDb instance to connect to. For MongoDB Atlas and Replica Set TLS connection is used by default.

  • \n
  • database (str) \u2013 The database you want to replicate.

  • \n
  • user (Optional[str]) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • auth_source (Optional[str]) \u2013 The authentication source where the user information is stored.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MongodbV2Source.StandaloneMongoDbInstance(instance, host, port, tls=None)[source]\u00b6
\n
\n
\n__init__(instance, host, port, tls=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbV2Source.ReplicaSet(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n
\n__init__(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbV2Source.MongoDBAtlas(instance, cluster_url)[source]\u00b6
\n
\n
\n__init__(instance, cluster_url)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FileSecureSource(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n
\n
\n__init__(name, dataset_name, format, url, provider, reader_options=None)[source]\u00b6
\n

Airbyte Source for File Secure.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/file

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass FileSecureSource.HTTPSPublicWeb(user_agent=None)[source]\u00b6
\n
\n
\n__init__(user_agent=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.GCSGoogleCloudStorage(service_account_json=None)[source]\u00b6
\n
\n
\n__init__(service_account_json=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.S3AmazonWebServices(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n
\n__init__(aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.AzBlobAzureBlobStorage(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n
\n__init__(storage_account, sas_token=None, shared_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.SSHSecureShell(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.SCPSecureCopyProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FileSecureSource.SFTPSecureFileTransferProtocol(user, host, password=None, port=None)[source]\u00b6
\n
\n
\n__init__(user, host, password=None, port=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskSupportSource(name, start_date, subdomain, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, subdomain, credentials)[source]\u00b6
\n

Airbyte Source for Zendesk Support.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-support

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Support API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • subdomain (str) \u2013 This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.

  • \n
  • credentials (Union[ZendeskSupportSource.OAuth20, ZendeskSupportSource.APIToken]) \u2013 Zendesk service provides two authentication methods. Choose between: OAuth2.0 or API token.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskSupportSource.OAuth20(access_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(access_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskSupportSource.APIToken(email, api_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(email, api_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.TempoSource(name, api_token)[source]\u00b6
\n
\n
\n__init__(name, api_token)[source]\u00b6
\n

Airbyte Source for Tempo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_token (str) \u2013 Tempo API Token. Go to Tempo>Settings, scroll down to Data Access and select API integration.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.BraintreeSource(name, merchant_id, public_key, private_key, environment, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, merchant_id, public_key, private_key, environment, start_date=None)[source]\u00b6
\n

Airbyte Source for Braintree.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/braintree

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • merchant_id (str) \u2013 The unique identifier for your entire gateway account. See the docs for more information on how to obtain this ID.

  • \n
  • public_key (str) \u2013 Braintree Public Key. See the docs for more information on how to obtain this key.

  • \n
  • private_key (str) \u2013 Braintree Private Key. See the docs for more information on how to obtain this key.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • environment (str) \u2013 Environment specifies where the data will come from.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SalesloftSource(name, client_id, client_secret, refresh_token, start_date)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, start_date)[source]\u00b6
\n

Airbyte Source for Salesloft.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/salesloft

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 The Client ID of your Salesloft developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Salesloft developer application.

  • \n
  • refresh_token (str) \u2013 The token for obtaining a new access token.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Salesloft API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LinnworksSource(name, application_id, application_secret, token, start_date)[source]\u00b6
\n
\n
\n__init__(name, application_id, application_secret, token, start_date)[source]\u00b6
\n

Airbyte Source for Linnworks.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/linnworks

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • application_id (str) \u2013 Linnworks Application ID

  • \n
  • application_secret (str) \u2013 Linnworks Application Secret

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ChargebeeSource(name, site, site_api_key, start_date, product_catalog)[source]\u00b6
\n
\n
\n__init__(name, site, site_api_key, start_date, product_catalog)[source]\u00b6
\n

Airbyte Source for Chargebee.

\n

Documentation can be found at https://apidocs.chargebee.com/docs/api

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • site (str) \u2013 The site prefix for your Chargebee instance.

  • \n
  • site_api_key (str) \u2013 Chargebee API Key. See the docs for more information on how to obtain this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2021-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • product_catalog (str) \u2013 Product Catalog version of your Chargebee site. Instructions for finding your version are available here, under the API Version section.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleAnalyticsDataApiSource(name, property_id, credentials, date_ranges_start_date, custom_reports=None, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, property_id, credentials, date_ranges_start_date, custom_reports=None, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Google Analytics Data Api.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-analytics-v4

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • property_id (str) \u2013 A Google Analytics GA4 property identifier whose events are tracked. Specified in the URL path and not the body

  • \n
  • credentials (Union[GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth, GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication]) \u2013 Credentials for the service

  • \n
  • date_ranges_start_date (str) \u2013 The start date. One of the values Ndaysago, yesterday, today or in the format YYYY-MM-DD

  • \n
  • custom_reports (Optional[str]) \u2013 A JSON array describing the custom reports you want to sync from Google Analytics. See the docs for more information about the exact format you can use to fill out this field.

  • \n
  • window_in_days (Optional[int]) \u2013 The time increment used by the connector when requesting data from the Google Analytics API. More information is available in the docs. The bigger this value is, the faster the sync will be, but the more likely that sampling will be applied to your data, potentially causing inaccuracies in the returned results. We recommend setting this to 1 unless you have a hard requirement to make the sync faster at the expense of accuracy. The minimum allowed value for this field is 1, and the maximum is 364.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsDataApiSource.AuthenticateViaGoogleOauth(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleAnalyticsDataApiSource.ServiceAccountKeyAuthentication(credentials_json, auth_type=None)[source]\u00b6
\n
\n
\n__init__(credentials_json, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.OutreachSource(name, client_id, client_secret, refresh_token, redirect_uri, start_date)[source]\u00b6
\n
\n
\n__init__(name, client_id, client_secret, refresh_token, redirect_uri, start_date)[source]\u00b6
\n

Airbyte Source for Outreach.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/outreach

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • client_id (str) \u2013 The Client ID of your Outreach developer application.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Outreach developer application.

  • \n
  • refresh_token (str) \u2013 The token for obtaining the new access token.

  • \n
  • redirect_uri (str) \u2013 A Redirect URI is the location where the authorization server sends the user once the app has been successfully authorized and granted an authorization code or access token.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Outreach API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.LemlistSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Lemlist.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/lemlist

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Lemlist API key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ApifyDatasetSource(name, datasetId, clean=None)[source]\u00b6
\n
\n
\n__init__(name, datasetId, clean=None)[source]\u00b6
\n

Airbyte Source for Apify Dataset.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/apify-dataset

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • datasetId (str) \u2013 ID of the dataset you would like to load to Airbyte.

  • \n
  • clean (Optional[bool]) \u2013 If set to true, only clean items will be downloaded from the dataset. See description of what clean means in Apify API docs. If not sure, set clean to false.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.RecurlySource(name, api_key, begin_time=None, end_time=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, begin_time=None, end_time=None)[source]\u00b6
\n

Airbyte Source for Recurly.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/recurly

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Recurly API Key. See the docs for more information on how to generate this key.

  • \n
  • begin_time (Optional[str]) \u2013 ISO8601 timestamp from which the replication from Recurly API will start from.

  • \n
  • end_time (Optional[str]) \u2013 ISO8601 timestamp to which the replication from Recurly API will stop. Records after that date won\u2019t be imported.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskTalkSource(name, subdomain, credentials, start_date)[source]\u00b6
\n
\n
\n__init__(name, subdomain, credentials, start_date)[source]\u00b6
\n

Airbyte Source for Zendesk Talk.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk-talk

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • subdomain (str) \u2013 This is your Zendesk subdomain that can be found in your account URL. For example, in https://{MY_SUBDOMAIN}.zendesk.com/, where MY_SUBDOMAIN is the value of your subdomain.

  • \n
  • credentials (Union[ZendeskTalkSource.APIToken, ZendeskTalkSource.OAuth20]) \u2013 Zendesk service provides two authentication methods. Choose between: OAuth2.0 or API token.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Talk API, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskTalkSource.APIToken(email, api_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(email, api_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskTalkSource.OAuth20(access_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(access_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SftpSource(name, user, host, port, credentials, file_types=None, folder_path=None, file_pattern=None)[source]\u00b6
\n
\n
\n__init__(name, user, host, port, credentials, file_types=None, folder_path=None, file_pattern=None)[source]\u00b6
\n

Airbyte Source for Sftp.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/sftp

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • user (str) \u2013 The server user

  • \n
  • host (str) \u2013 The server host address

  • \n
  • port (int) \u2013 The server port

  • \n
  • credentials (Union[SftpSource.PasswordAuthentication, SftpSource.SSHKeyAuthentication]) \u2013 The server authentication method

  • \n
  • file_types (Optional[str]) \u2013 Comma-separated file types. Currently only \u2018csv\u2019 and \u2018json\u2019 types are supported.

  • \n
  • folder_path (Optional[str]) \u2013 The directory to search files for sync

  • \n
  • file_pattern (Optional[str]) \u2013 The regular expression to specify files for sync in a chosen Folder Path

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SftpSource.PasswordAuthentication(auth_user_password)[source]\u00b6
\n
\n
\n__init__(auth_user_password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SftpSource.SSHKeyAuthentication(auth_ssh_key)[source]\u00b6
\n
\n
\n__init__(auth_ssh_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.WhiskyHunterSource(name)[source]\u00b6
\n
\n
\n__init__(name)[source]\u00b6
\n

Airbyte Source for Whisky Hunter.

\n

Documentation can be found at https://docs.airbyte.io/integrations/sources/whisky-hunter

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FreshdeskSource(name, domain, api_key, requests_per_minute=None, start_date=None)[source]\u00b6
\n
\n
\n__init__(name, domain, api_key, requests_per_minute=None, start_date=None)[source]\u00b6
\n

Airbyte Source for Freshdesk.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/freshdesk

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Freshdesk domain

  • \n
  • api_key (str) \u2013 Freshdesk API Key. See the docs for more information on how to obtain this key.

  • \n
  • requests_per_minute (Optional[int]) \u2013 The number of requests per minute that this source is allowed to use. There is a rate limit of 50 requests per minute per app per account.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time. Any data created after this date will be replicated. If this parameter is not set, all data will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GocardlessSource(name, access_token, gocardless_environment, gocardless_version, start_date)[source]\u00b6
\n
\n
\n__init__(name, access_token, gocardless_environment, gocardless_version, start_date)[source]\u00b6
\n

Airbyte Source for Gocardless.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/gocardless

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_token (str) \u2013 Gocardless API TOKEN

  • \n
  • gocardless_environment (str) \u2013 Environment you are trying to connect to.

  • \n
  • gocardless_version (str) \u2013 GoCardless version. This is a date. You can find the latest here: https://developer.gocardless.com/api-reference/#api-usage-making-requests

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZuoraSource(name, start_date, tenant_endpoint, data_query, client_id, client_secret, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, start_date, tenant_endpoint, data_query, client_id, client_secret, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Zuora.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zuora

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 Start Date in format: YYYY-MM-DD

  • \n
  • window_in_days (Optional[str]) \u2013 The number of days for each data chunk, beginning from start_date. The bigger the value, the faster the fetch. (0.1 - as for a couple of hours, 1 - as for a Day; 364 - as for a Year).

  • \n
  • tenant_endpoint (str) \u2013 Please choose the right endpoint where your Tenant is located. More info by this Link

  • \n
  • data_query (str) \u2013 Choose between Live, or Unlimited - the optimized, replicated database at 12 hours freshness for high volume extraction Link

  • \n
  • client_id (str) \u2013 Your OAuth user Client ID

  • \n
  • client_secret (str) \u2013 Your OAuth user Client Secret

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MarketoSource(name, domain_url, client_id, client_secret, start_date)[source]\u00b6
\n
\n
\n__init__(name, domain_url, client_id, client_secret, start_date)[source]\u00b6
\n

Airbyte Source for Marketo.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/marketo

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain_url (str) \u2013 Your Marketo Base URL. See the docs for info on how to obtain this.

  • \n
  • client_id (str) \u2013 The Client ID of your Marketo developer application. See the docs for info on how to obtain this.

  • \n
  • client_secret (str) \u2013 The Client Secret of your Marketo developer application. See the docs for info on how to obtain this.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.DriftSource(name, credentials)[source]\u00b6
\n
\n
\n__init__(name, credentials)[source]\u00b6
\n

Airbyte Source for Drift.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/drift

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass DriftSource.OAuth20(client_id, client_secret, access_token, refresh_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token, refresh_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass DriftSource.AccessToken(access_token, credentials=None)[source]\u00b6
\n
\n
\n__init__(access_token, credentials=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PokeapiSource(name, pokemon_name)[source]\u00b6
\n
\n
\n__init__(name, pokemon_name)[source]\u00b6
\n

Airbyte Source for Pokeapi.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/pokeapi

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • pokemon_name (str) \u2013 Pokemon requested from the API.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.NetsuiteSource(name, realm, consumer_key, consumer_secret, token_key, token_secret, start_datetime, object_types=None, window_in_days=None)[source]\u00b6
\n
\n
\n__init__(name, realm, consumer_key, consumer_secret, token_key, token_secret, start_datetime, object_types=None, window_in_days=None)[source]\u00b6
\n

Airbyte Source for Netsuite.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • realm (str) \u2013 Netsuite realm e.g. 2344535, as for production or 2344535_SB1, as for the sandbox

  • \n
  • consumer_key (str) \u2013 Consumer key associated with your integration

  • \n
  • consumer_secret (str) \u2013 Consumer secret associated with your integration

  • \n
  • token_key (str) \u2013 Access token key

  • \n
  • token_secret (str) \u2013 Access token secret

  • \n
  • object_types (Optional[List[str]]) \u2013 The API names of the Netsuite objects you want to sync. Setting this speeds up the connection setup process by limiting the number of schemas that need to be retrieved from Netsuite.

  • \n
  • start_datetime (str) \u2013 Starting point for your data replication, in format of \u201cYYYY-MM-DDTHH:mm:ssZ\u201d

  • \n
  • window_in_days (Optional[int]) \u2013 The amount of days used to query the data with date chunks. Set smaller value, if you have lots of data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HubplannerSource(name, api_key)[source]\u00b6
\n
\n
\n__init__(name, api_key)[source]\u00b6
\n

Airbyte Source for Hubplanner.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/hubplanner

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.Dv360Source(name, credentials, partner_id, start_date, end_date=None, filters=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, partner_id, start_date, end_date=None, filters=None)[source]\u00b6
\n

Airbyte Source for Dv 360.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Dv360Source.Oauth2Credentials) \u2013 Oauth2 credentials

  • \n
  • partner_id (int) \u2013 Partner ID

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25. Any data before this date will not be replicated

  • \n
  • end_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25. Any data after this date will not be replicated.

  • \n
  • filters (Optional[List[str]]) \u2013 Filters for the dimensions. Each filter object has 2 keys: \u2018type\u2019 for the name of the dimension to be used, and \u2018value\u2019 for the value of the filter.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass Dv360Source.Oauth2Credentials(access_token, refresh_token, token_uri, client_id, client_secret)[source]\u00b6
\n
\n
\n__init__(access_token, refresh_token, token_uri, client_id, client_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.NotionSource(name, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Notion.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/notion

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00.000Z. Any data before this date will not be replicated.

  • \n
  • credentials (Union[NotionSource.OAuth20, NotionSource.AccessToken]) \u2013 Pick an authentication method.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass NotionSource.OAuth20(client_id, client_secret, access_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass NotionSource.AccessToken(token)[source]\u00b6
\n
\n
\n__init__(token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZendeskSunshineSource(name, subdomain, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, subdomain, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Zendesk Sunshine.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/zendesk_sunshine

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • subdomain (str) \u2013 The subdomain for your Zendesk Account.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for Zendesk Sunshine API, in the format YYYY-MM-DDT00:00:00Z.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ZendeskSunshineSource.OAuth20(client_id, client_secret, access_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ZendeskSunshineSource.APIToken(api_token, email)[source]\u00b6
\n
\n
\n__init__(api_token, email)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PinterestSource(name, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Pinterest.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/pinterest

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 A date in the format YYYY-MM-DD. If you have not set a date, it would be defaulted to latest allowed date by api (914 days from today).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PinterestSource.OAuth20(refresh_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(refresh_token, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PinterestSource.AccessToken(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MetabaseSource(name, instance_api_url, username=None, password=None, session_token=None)[source]\u00b6
\n
\n
\n__init__(name, instance_api_url, username=None, password=None, session_token=None)[source]\u00b6
\n

Airbyte Source for Metabase.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/metabase

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • instance_api_url (str) \u2013 URL to your metabase instance API

  • \n
  • session_token (Optional[str]) \u2013 To generate your session token, you need to run the following command: ` curl -X POST \\\\   -H "Content-Type: application/json" \\\\   -d '{"username": "person@metabase.com", "password": "fakepassword"}' \\\\   http://localhost:3000/api/session ` Then copy the value of the id field returned by a successful call to that API. Note that by default, sessions are good for 14 days and needs to be regenerated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HubspotSource(name, start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, start_date, credentials)[source]\u00b6
\n

Airbyte Source for Hubspot.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/hubspot

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • credentials (Union[HubspotSource.OAuth, HubspotSource.APIKey, HubspotSource.PrivateAPP]) \u2013 Choose how to authenticate to HubSpot.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass HubspotSource.OAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass HubspotSource.APIKey(api_key)[source]\u00b6
\n
\n
\n__init__(api_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass HubspotSource.PrivateAPP(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.HarvestSource(name, account_id, replication_start_date, credentials)[source]\u00b6
\n
\n
\n__init__(name, account_id, replication_start_date, credentials)[source]\u00b6
\n

Airbyte Source for Harvest.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/harvest

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • account_id (str) \u2013 Harvest account ID. Required for all Harvest requests in pair with Personal Access Token

  • \n
  • replication_start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • credentials (Union[HarvestSource.AuthenticateViaHarvestOAuth, HarvestSource.AuthenticateWithPersonalAccessToken]) \u2013 Choose how to authenticate to Harvest.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass HarvestSource.AuthenticateViaHarvestOAuth(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass HarvestSource.AuthenticateWithPersonalAccessToken(api_token, auth_type=None)[source]\u00b6
\n
\n
\n__init__(api_token, auth_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GithubSource(name, credentials, start_date, repository, branch=None, page_size_for_large_streams=None)[source]\u00b6
\n
\n
\n__init__(name, credentials, start_date, repository, branch=None, page_size_for_large_streams=None)[source]\u00b6
\n

Airbyte Source for Github.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/github

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • credentials (Union[GithubSource.OAuthCredentials, GithubSource.PATCredentials]) \u2013 Choose how to authenticate to GitHub

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data from GitHub in the format YYYY-MM-DDT00:00:00Z. For the streams which support this configuration, only data generated on or after the start date will be replicated. This field doesn\u2019t apply to all streams, see the docs for more info

  • \n
  • repository (str) \u2013 Space-delimited list of GitHub organizations/repositories, e.g. airbytehq/airbyte for single repository, airbytehq/* for get all repositories from organization and airbytehq/airbyte airbytehq/another-repo for multiple repositories.

  • \n
  • branch (Optional[str]) \u2013 Space-delimited list of GitHub repository branches to pull commits for, e.g. airbytehq/airbyte/master. If no branches are specified for a repository, the default branch will be pulled.

  • \n
  • page_size_for_large_streams (Optional[int]) \u2013 The Github connector contains several streams with a large amount of data. The page size of such streams depends on the size of your repository. We recommend that you specify values between 10 and 30.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GithubSource.OAuthCredentials(access_token)[source]\u00b6
\n
\n
\n__init__(access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GithubSource.PATCredentials(personal_access_token)[source]\u00b6
\n
\n
\n__init__(personal_access_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.E2eTestSource(name, max_messages, mock_catalog, type=None, seed=None, message_interval_ms=None)[source]\u00b6
\n
\n
\n__init__(name, max_messages, mock_catalog, type=None, seed=None, message_interval_ms=None)[source]\u00b6
\n

Airbyte Source for E2e Test.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/e2e-test

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • max_messages (int) \u2013 Number of records to emit per stream. Min 1. Max 100 billion.

  • \n
  • seed (Optional[int]) \u2013 When the seed is unspecified, the current time millis will be used as the seed. Range: [0, 1000000].

  • \n
  • message_interval_ms (Optional[int]) \u2013 Interval between messages in ms. Min 0 ms. Max 60000 ms (1 minute).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass E2eTestSource.SingleSchema(stream_name, stream_schema, stream_duplication=None)[source]\u00b6
\n
\n
\n__init__(stream_name, stream_schema, stream_duplication=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass E2eTestSource.MultiSchema(stream_schemas)[source]\u00b6
\n
\n
\n__init__(stream_schemas)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MysqlSource(name, host, port, database, username, ssl_mode, replication_method, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, ssl_mode, replication_method, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Source for Mysql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/mysql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The host name of the database.

  • \n
  • port (int) \u2013 The port to connect to.

  • \n
  • database (str) \u2013 The database name.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3). For more information read about JDBC URL parameters.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
  • ssl_mode (Union[MysqlSource.Preferred, MysqlSource.Required, MysqlSource.VerifyCA, MysqlSource.VerifyIdentity]) \u2013 SSL connection modes. preferred - Automatically attempt SSL connection. If the MySQL server does not support SSL, continue with a regular connection. required - Always connect with SSL. If the MySQL server doesn\u2019t support SSL, the connection will not be established. Certificate Authority (CA) and Hostname are not verified. verify-ca - Always connect with SSL. Verifies CA, but allows connection even if Hostname does not match. Verify Identity - Always connect with SSL. Verify both CA and Hostname. Read more in the docs.

  • \n
  • replication_method (Union[MysqlSource.Standard, MysqlSource.LogicalReplicationCDC]) \u2013 Replication method to use for extracting data from the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MysqlSource.Preferred[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.Required[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.VerifyCA(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.VerifyIdentity(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate=None, client_key=None, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MysqlSource.LogicalReplicationCDC(initial_waiting_seconds=None, server_time_zone=None)[source]\u00b6
\n
\n
\n__init__(initial_waiting_seconds=None, server_time_zone=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.MyHoursSource(name, email, password, start_date, logs_batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, email, password, start_date, logs_batch_size=None)[source]\u00b6
\n

Airbyte Source for My Hours.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/my-hours

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • email (str) \u2013 Your My Hours username

  • \n
  • password (str) \u2013 The password associated to the username

  • \n
  • start_date (str) \u2013 Start date for collecting time logs

  • \n
  • logs_batch_size (Optional[int]) \u2013 Pagination size used for retrieving logs in days

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KyribaSource(name, domain, username, password, start_date, end_date=None)[source]\u00b6
\n
\n
\n__init__(name, domain, username, password, start_date, end_date=None)[source]\u00b6
\n

Airbyte Source for Kyriba.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • domain (str) \u2013 Kyriba domain

  • \n
  • username (str) \u2013 Username to be used in basic auth

  • \n
  • password (str) \u2013 Password to be used in basic auth

  • \n
  • start_date (str) \u2013 The date the sync should start from.

  • \n
  • end_date (Optional[str]) \u2013 The date the sync should end. If left empty, the sync will run to the current date.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.GoogleSearchConsoleSource(name, site_urls, start_date, authorization, end_date=None, custom_reports=None)[source]\u00b6
\n
\n
\n__init__(name, site_urls, start_date, authorization, end_date=None, custom_reports=None)[source]\u00b6
\n

Airbyte Source for Google Search Console.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/google-search-console

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • site_urls (List[str]) \u2013 The URLs of the website property attached to your GSC account. Read more here.

  • \n
  • start_date (str) \u2013 UTC date in the format 2017-01-25. Any data before this date will not be replicated.

  • \n
  • end_date (Optional[str]) \u2013 UTC date in the format 2017-01-25. Any data after this date will not be replicated. Must be greater or equal to the start date field.

  • \n
  • custom_reports (Optional[str]) \u2013 A JSON array describing the custom reports you want to sync from Google Search Console. See the docs for more information about the exact format you can use to fill out this field.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleSearchConsoleSource.OAuth(client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token, access_token=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GoogleSearchConsoleSource.ServiceAccountKeyAuthentication(service_account_info, email)[source]\u00b6
\n
\n
\n__init__(service_account_info, email)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FacebookMarketingSource(name, account_id, start_date, access_token, end_date=None, include_deleted=None, fetch_thumbnail_images=None, custom_insights=None, page_size=None, insights_lookback_window=None, max_batch_size=None)[source]\u00b6
\n
\n
\n__init__(name, account_id, start_date, access_token, end_date=None, include_deleted=None, fetch_thumbnail_images=None, custom_insights=None, page_size=None, insights_lookback_window=None, max_batch_size=None)[source]\u00b6
\n

Airbyte Source for Facebook Marketing.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/facebook-marketing

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • account_id (str) \u2013 The Facebook Ad account ID to use when pulling data from the Facebook Marketing API.

  • \n
  • start_date (str) \u2013 The date from which you\u2019d like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated after this date will be replicated.

  • \n
  • end_date (Optional[str]) \u2013 The date until which you\u2019d like to replicate data for all incremental streams, in the format YYYY-MM-DDT00:00:00Z. All data generated between start_date and this date will be replicated. Not setting this option will result in always syncing the latest data.

  • \n
  • access_token (str) \u2013 The value of the access token generated. See the docs for more information

  • \n
  • include_deleted (Optional[bool]) \u2013 Include data from deleted Campaigns, Ads, and AdSets

  • \n
  • fetch_thumbnail_images (Optional[bool]) \u2013 In each Ad Creative, fetch the thumbnail_url and store the result in thumbnail_data_url

  • \n
  • custom_insights (Optional[List[FacebookMarketingSource.InsightConfig]]) \u2013 A list of insights entries; each entry must have a name and can contain fields, breakdowns or action_breakdowns.

  • \n
  • page_size (Optional[int]) \u2013 Page size used when sending requests to Facebook API to specify number of records per page when response has pagination. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.

  • \n
  • insights_lookback_window (Optional[int]) \u2013 The attribution window

  • \n
  • max_batch_size (Optional[int]) \u2013 Maximum batch size used when sending batch requests to Facebook API. Most users do not need to set this field unless they specifically need to tune the connector to address specific issues or use cases.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass FacebookMarketingSource.InsightConfig(name, fields=None, breakdowns=None, action_breakdowns=None, time_increment=None, start_date=None, end_date=None, insights_lookback_window=None)[source]\u00b6
\n
\n
\n__init__(name, fields=None, breakdowns=None, action_breakdowns=None, time_increment=None, start_date=None, end_date=None, insights_lookback_window=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.SurveymonkeySource(name, access_token, start_date, survey_ids=None)[source]\u00b6
\n
\n
\n__init__(name, access_token, start_date, survey_ids=None)[source]\u00b6
\n

Airbyte Source for Surveymonkey.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/surveymonkey

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_token (str) \u2013 Access Token for making authenticated requests. See the docs for information on how to generate this key.

  • \n
  • start_date (str) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated.

  • \n
  • survey_ids (Optional[List[str]]) \u2013 IDs of the surveys from which you\u2019d like to replicate data. If left empty, data from all boards to which you have access will be replicated.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.PardotSource(name, pardot_business_unit_id, client_id, client_secret, refresh_token, start_date=None, is_sandbox=None)[source]\u00b6
\n
\n
\n__init__(name, pardot_business_unit_id, client_id, client_secret, refresh_token, start_date=None, is_sandbox=None)[source]\u00b6
\n

Airbyte Source for Pardot.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • pardot_business_unit_id (str) \u2013 Pardot Business ID, can be found at Setup > Pardot > Pardot Account Setup

  • \n
  • client_id (str) \u2013 The Consumer Key that can be found when viewing your app in Salesforce

  • \n
  • client_secret (str) \u2013 The Consumer Secret that can be found when viewing your app in Salesforce

  • \n
  • refresh_token (str) \u2013 Salesforce Refresh Token used for Airbyte to access your Salesforce account. If you don\u2019t know what this is, follow this guide to retrieve it.

  • \n
  • start_date (Optional[str]) \u2013 UTC date and time in the format 2017-01-25T00:00:00Z. Any data before this date will not be replicated. Leave blank to skip this filter

  • \n
  • is_sandbox (Optional[bool]) \u2013 Whether or not the app is in a Salesforce sandbox. If you do not know what this is, assume it is false.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.FlexportSource(name, api_key, start_date)[source]\u00b6
\n
\n
\n__init__(name, api_key, start_date)[source]\u00b6
\n

Airbyte Source for Flexport.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/flexport

\n
\n
Parameters:
\n

name (str) \u2013 The name of the destination.

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.ZenefitsSource(name, token)[source]\u00b6
\n
\n
\n__init__(name, token)[source]\u00b6
\n

Airbyte Source for Zenefits.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • token (str) \u2013 Use Sync with Zenefits button on the link given on the readme file, and get the token to access the api

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.sources.KafkaSource(name, MessageFormat, bootstrap_servers, subscription, protocol, test_topic=None, group_id=None, max_poll_records=None, polling_time=None, client_id=None, enable_auto_commit=None, auto_commit_interval_ms=None, client_dns_lookup=None, retry_backoff_ms=None, request_timeout_ms=None, receive_buffer_bytes=None, auto_offset_reset=None, repeated_calls=None, max_records_process=None)[source]\u00b6
\n
\n
\n__init__(name, MessageFormat, bootstrap_servers, subscription, protocol, test_topic=None, group_id=None, max_poll_records=None, polling_time=None, client_id=None, enable_auto_commit=None, auto_commit_interval_ms=None, client_dns_lookup=None, retry_backoff_ms=None, request_timeout_ms=None, receive_buffer_bytes=None, auto_offset_reset=None, repeated_calls=None, max_records_process=None)[source]\u00b6
\n

Airbyte Source for Kafka.

\n

Documentation can be found at https://docs.airbyte.com/integrations/sources/kafka

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • MessageFormat (Union[KafkaSource.JSON, KafkaSource.AVRO]) \u2013 The serialization used based on this

  • \n
  • bootstrap_servers (str) \u2013 A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping\u2014this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,\u2026. Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).

  • \n
  • subscription (Union[KafkaSource.ManuallyAssignAListOfPartitions, KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern]) \u2013 You can choose to manually assign a list of partitions, or subscribe to all topics matching specified pattern to get dynamically assigned partitions.

  • \n
  • test_topic (Optional[str]) \u2013 The topic used to test whether Airbyte can consume messages.

  • \n
  • group_id (Optional[str]) \u2013 The Group ID is how you distinguish different consumer groups.

  • \n
  • max_poll_records (Optional[int]) \u2013 The maximum number of records returned in a single call to poll(). Note, that max_poll_records does not impact the underlying fetching behavior. The consumer will cache the records from each fetch request and returns them incrementally from each poll.

  • \n
  • polling_time (Optional[int]) \u2013 Amount of time Kafka connector should try to poll for messages.

  • \n
  • protocol (Union[KafkaSource.PLAINTEXT, KafkaSource.SASLPLAINTEXT, KafkaSource.SASLSSL]) \u2013 The Protocol used to communicate with brokers.

  • \n
  • client_id (Optional[str]) \u2013 An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.

  • \n
  • enable_auto_commit (Optional[bool]) \u2013 If true, the consumer\u2019s offset will be periodically committed in the background.

  • \n
  • auto_commit_interval_ms (Optional[int]) \u2013 The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if enable.auto.commit is set to true.

  • \n
  • client_dns_lookup (Optional[str]) \u2013 Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.

  • \n
  • retry_backoff_ms (Optional[int]) \u2013 The amount of time to wait before attempting to retry a failed request to a given topic partition. This avoids repeatedly sending requests in a tight loop under some failure scenarios.

  • \n
  • request_timeout_ms (Optional[int]) \u2013 The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.

  • \n
  • receive_buffer_bytes (Optional[int]) \u2013 The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.

  • \n
  • auto_offset_reset (Optional[str]) \u2013 What to do when there is no initial offset in Kafka or if the current offset does not exist any more on the server - earliest: automatically reset the offset to the earliest offset, latest: automatically reset the offset to the latest offset, none: throw exception to the consumer if no previous offset is found for the consumer\u2019s group, anything else: throw exception to the consumer.

  • \n
  • repeated_calls (Optional[int]) \u2013 The number of repeated calls to poll() if no messages were received.

  • \n
  • max_records_process (Optional[int]) \u2013 The maximum number of records to be processed per execution.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass KafkaSource.JSON(deserialization_type=None)[source]\u00b6
\n
\n
\n__init__(deserialization_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.AVRO(deserialization_type=None, deserialization_strategy=None, schema_registry_url=None, schema_registry_username=None, schema_registry_password=None)[source]\u00b6
\n
\n
\n__init__(deserialization_type=None, deserialization_strategy=None, schema_registry_url=None, schema_registry_username=None, schema_registry_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.ManuallyAssignAListOfPartitions(topic_partitions)[source]\u00b6
\n
\n
\n__init__(topic_partitions)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.SubscribeToAllTopicsMatchingSpecifiedPattern(topic_pattern)[source]\u00b6
\n
\n
\n__init__(topic_pattern)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.PLAINTEXT(security_protocol)[source]\u00b6
\n
\n
\n__init__(security_protocol)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.SASLPLAINTEXT(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaSource.SASLSSL(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\n

Managed Config Generated Destinations\u00b6

\n
\n
\nclass dagster_airbyte.managed.generated.destinations.DynamodbDestination(name, dynamodb_table_name_prefix, dynamodb_region, access_key_id, secret_access_key, dynamodb_endpoint=None)[source]\u00b6
\n
\n
\n__init__(name, dynamodb_table_name_prefix, dynamodb_region, access_key_id, secret_access_key, dynamodb_endpoint=None)[source]\u00b6
\n

Airbyte Destination for Dynamodb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/dynamodb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • dynamodb_endpoint (Optional[str]) \u2013 This is your DynamoDB endpoint URL (if you are working with AWS DynamoDB, just leave it empty).

  • \n
  • dynamodb_table_name_prefix (str) \u2013 The prefix to use when naming DynamoDB tables.

  • \n
  • dynamodb_region (str) \u2013 The region of the DynamoDB.

  • \n
  • access_key_id (str) \u2013 The access key id to access the DynamoDB. Airbyte requires Read and Write permissions to the DynamoDB.

  • \n
  • secret_access_key (str) \u2013 The corresponding secret to the access key id.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.BigqueryDestination(name, project_id, dataset_location, dataset_id, loading_method, credentials_json=None, transformation_priority=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, dataset_location, dataset_id, loading_method, credentials_json=None, transformation_priority=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n

Airbyte Destination for Bigquery.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target BigQuery dataset. Read more here.

  • \n
  • dataset_location (str) \u2013 The location of the dataset. Warning: Changes made after creation will not be applied. Read more here.

  • \n
  • dataset_id (str) \u2013 The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.

  • \n
  • loading_method (Union[BigqueryDestination.StandardInserts, BigqueryDestination.GCSStaging]) \u2013 Loading method used to select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.

  • \n
  • credentials_json (Optional[str]) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.

  • \n
  • transformation_priority (Optional[str]) \u2013 Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type here. Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don\u2019t count towards your concurrent rate limit. Read more about batch queries here. The default \u201cinteractive\u201d value is used if not set explicitly.

  • \n
  • big_query_client_buffer_size_mb (Optional[int]) \u2013 Google BigQuery client\u2019s chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass BigqueryDestination.StandardInserts[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDestination.HMACKey(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n
\n__init__(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDestination.GCSStaging(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n
\n__init__(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RabbitmqDestination(name, host, routing_key, ssl=None, port=None, virtual_host=None, username=None, password=None, exchange=None)[source]\u00b6
\n
\n
\n__init__(name, host, routing_key, ssl=None, port=None, virtual_host=None, username=None, password=None, exchange=None)[source]\u00b6
\n

Airbyte Destination for Rabbitmq.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/rabbitmq

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • ssl (Optional[bool]) \u2013 SSL enabled.

  • \n
  • host (str) \u2013 The RabbitMQ host name.

  • \n
  • port (Optional[int]) \u2013 The RabbitMQ port.

  • \n
  • virtual_host (Optional[str]) \u2013 The RabbitMQ virtual host name.

  • \n
  • username (Optional[str]) \u2013 The username to connect.

  • \n
  • password (Optional[str]) \u2013 The password to connect.

  • \n
  • exchange (Optional[str]) \u2013 The exchange name.

  • \n
  • routing_key (str) \u2013 The routing key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KvdbDestination(name, bucket_id, secret_key)[source]\u00b6
\n
\n
\n__init__(name, bucket_id, secret_key)[source]\u00b6
\n

Airbyte Destination for Kvdb.

\n

Documentation can be found at https://kvdb.io/docs/api/

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • bucket_id (str) \u2013 The ID of your KVdb bucket.

  • \n
  • secret_key (str) \u2013 Your bucket Secret Key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ClickhouseDestination(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None, ssl=None)[source]\u00b6
\n

Airbyte Destination for Clickhouse.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/clickhouse

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 HTTP port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.AmazonSqsDestination(name, queue_url, region, message_delay=None, access_key=None, secret_key=None, message_body_key=None, message_group_id=None)[source]\u00b6
\n
\n
\n__init__(name, queue_url, region, message_delay=None, access_key=None, secret_key=None, message_body_key=None, message_group_id=None)[source]\u00b6
\n

Airbyte Destination for Amazon Sqs.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/amazon-sqs

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • queue_url (str) \u2013 URL of the SQS Queue

  • \n
  • region (str) \u2013 AWS Region of the SQS Queue

  • \n
  • message_delay (Optional[int]) \u2013 Modify the Message Delay of the individual message from the Queue\u2019s default (seconds).

  • \n
  • access_key (Optional[str]) \u2013 The Access Key ID of the AWS IAM Role to use for sending messages

  • \n
  • secret_key (Optional[str]) \u2013 The Secret Key of the AWS IAM Role to use for sending messages

  • \n
  • message_body_key (Optional[str]) \u2013 Use this property to extract the contents of the named key in the input record to use as the SQS message body. If not set, the entire content of the input record data is used as the message body.

  • \n
  • message_group_id (Optional[str]) \u2013 The tag that specifies that a message belongs to a specific message group. This parameter applies only to, and is REQUIRED by, FIFO queues.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MariadbColumnstoreDestination(name, host, port, database, username, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Mariadb Columnstore.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mariadb-columnstore

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The Hostname of the database.

  • \n
  • port (int) \u2013 The Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 The Username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The Password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KinesisDestination(name, endpoint, region, shardCount, accessKey, privateKey, bufferSize)[source]\u00b6
\n
\n
\n__init__(name, endpoint, region, shardCount, accessKey, privateKey, bufferSize)[source]\u00b6
\n

Airbyte Destination for Kinesis.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/kinesis

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • endpoint (str) \u2013 AWS Kinesis endpoint.

  • \n
  • region (str) \u2013 AWS region. Your account determines the Regions that are available to you.

  • \n
  • shardCount (int) \u2013 Number of shards to which the data should be streamed.

  • \n
  • accessKey (str) \u2013 Generate the AWS Access Key for current user.

  • \n
  • privateKey (str) \u2013 The AWS Private Key - a string of numbers and letters that are unique for each account, also known as a \u201crecovery phrase\u201d.

  • \n
  • bufferSize (int) \u2013 Buffer size for storing kinesis records before being batch streamed.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.AzureBlobStorageDestination(name, azure_blob_storage_account_name, azure_blob_storage_account_key, format, azure_blob_storage_endpoint_domain_name=None, azure_blob_storage_container_name=None, azure_blob_storage_output_buffer_size=None)[source]\u00b6
\n
\n
\n__init__(name, azure_blob_storage_account_name, azure_blob_storage_account_key, format, azure_blob_storage_endpoint_domain_name=None, azure_blob_storage_container_name=None, azure_blob_storage_output_buffer_size=None)[source]\u00b6
\n

Airbyte Destination for Azure Blob Storage.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/azureblobstorage

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • azure_blob_storage_endpoint_domain_name (Optional[str]) \u2013 This is the Azure Blob Storage endpoint domain name. Leave the default value (or leave it empty if running the container from the command line) to use the Microsoft native endpoint from the example.

  • \n
  • azure_blob_storage_container_name (Optional[str]) \u2013 The name of the Azure Blob Storage container. If it does not exist, it will be created automatically. May be left empty, in which case a container named airbytecontainer+timestamp will be created automatically.

  • \n
  • azure_blob_storage_account_name (str) \u2013 The name of the Azure Blob Storage account.

  • \n
  • azure_blob_storage_account_key (str) \u2013 The Azure blob storage account key.

  • \n
  • azure_blob_storage_output_buffer_size (Optional[int]) \u2013 The amount of megabytes to buffer for the output stream to Azure. This will impact memory footprint on workers, but may need adjustment for performance and appropriate block size in Azure.

  • \n
  • format (Union[AzureBlobStorageDestination.CSVCommaSeparatedValues, AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON]) \u2013 Output data format

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass AzureBlobStorageDestination.CSVCommaSeparatedValues(flattening)[source]\u00b6
\n
\n
\n__init__(flattening)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass AzureBlobStorageDestination.JSONLinesNewlineDelimitedJSON[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KafkaDestination(name, bootstrap_servers, topic_pattern, protocol, acks, enable_idempotence, compression_type, batch_size, linger_ms, max_in_flight_requests_per_connection, client_dns_lookup, buffer_memory, max_request_size, retries, socket_connection_setup_timeout_ms, socket_connection_setup_timeout_max_ms, max_block_ms, request_timeout_ms, delivery_timeout_ms, send_buffer_bytes, receive_buffer_bytes, test_topic=None, sync_producer=None, client_id=None)[source]\u00b6
\n
\n
\n__init__(name, bootstrap_servers, topic_pattern, protocol, acks, enable_idempotence, compression_type, batch_size, linger_ms, max_in_flight_requests_per_connection, client_dns_lookup, buffer_memory, max_request_size, retries, socket_connection_setup_timeout_ms, socket_connection_setup_timeout_max_ms, max_block_ms, request_timeout_ms, delivery_timeout_ms, send_buffer_bytes, receive_buffer_bytes, test_topic=None, sync_producer=None, client_id=None)[source]\u00b6
\n

Airbyte Destination for Kafka.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/kafka

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • bootstrap_servers (str) \u2013 A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping\u2014this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form host1:port1,host2:port2,\u2026. Since these servers are just used for the initial connection to discover the full cluster membership (which may change dynamically), this list need not contain the full set of servers (you may want more than one, though, in case a server is down).

  • \n
  • topic_pattern (str) \u2013 Topic pattern in which the records will be sent. You can use patterns like \u2018{namespace}\u2019 and/or \u2018{stream}\u2019 to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.

  • \n
  • test_topic (Optional[str]) \u2013 Topic to test if Airbyte can produce messages.

  • \n
  • sync_producer (Optional[bool]) \u2013 Wait synchronously until the record has been sent to Kafka.

  • \n
  • protocol (Union[KafkaDestination.PLAINTEXT, KafkaDestination.SASLPLAINTEXT, KafkaDestination.SASLSSL]) \u2013 Protocol used to communicate with brokers.

  • \n
  • client_id (Optional[str]) \u2013 An ID string to pass to the server when making requests. The purpose of this is to be able to track the source of requests beyond just ip/port by allowing a logical application name to be included in server-side request logging.

  • \n
  • acks (str) \u2013 The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the durability of records that are sent.

  • \n
  • enable_idempotence (bool) \u2013 When set to \u2018true\u2019, the producer will ensure that exactly one copy of each message is written in the stream. If \u2018false\u2019, producer retries due to broker failures, etc., may write duplicates of the retried message in the stream.

  • \n
  • compression_type (str) \u2013 The compression type for all data generated by the producer.

  • \n
  • batch_size (int) \u2013 The producer will attempt to batch records together into fewer requests whenever multiple records are being sent to the same partition.

  • \n
  • linger_ms (str) \u2013 The producer groups together any records that arrive in between request transmissions into a single batched request.

  • \n
  • max_in_flight_requests_per_connection (int) \u2013 The maximum number of unacknowledged requests the client will send on a single connection before blocking. Can be greater than 1, and the maximum value supported with idempotency is 5.

  • \n
  • client_dns_lookup (str) \u2013 Controls how the client uses DNS lookups. If set to use_all_dns_ips, connect to each returned IP address in sequence until a successful connection is established. After a disconnection, the next IP is used. Once all IPs have been used once, the client resolves the IP(s) from the hostname again. If set to resolve_canonical_bootstrap_servers_only, resolve each bootstrap address into a list of canonical names. After the bootstrap phase, this behaves the same as use_all_dns_ips. If set to default (deprecated), attempt to connect to the first IP address returned by the lookup, even if the lookup returns multiple IP addresses.

  • \n
  • buffer_memory (str) \u2013 The total bytes of memory the producer can use to buffer records waiting to be sent to the server.

  • \n
  • max_request_size (int) \u2013 The maximum size of a request in bytes.

  • \n
  • retries (int) \u2013 Setting a value greater than zero will cause the client to resend any record whose send fails with a potentially transient error.

  • \n
  • socket_connection_setup_timeout_ms (str) \u2013 The amount of time the client will wait for the socket connection to be established.

  • \n
  • socket_connection_setup_timeout_max_ms (str) \u2013 The maximum amount of time the client will wait for the socket connection to be established. The connection setup timeout will increase exponentially for each consecutive connection failure up to this maximum.

  • \n
  • max_block_ms (str) \u2013 The configuration controls how long the KafkaProducer\u2019s send(), partitionsFor(), initTransactions(), sendOffsetsToTransaction(), commitTransaction() and abortTransaction() methods will block.

  • \n
  • request_timeout_ms (int) \u2013 The configuration controls the maximum amount of time the client will wait for the response of a request. If the response is not received before the timeout elapses the client will resend the request if necessary or fail the request if retries are exhausted.

  • \n
  • delivery_timeout_ms (int) \u2013 An upper bound on the time to report success or failure after a call to \u2018send()\u2019 returns.

  • \n
  • send_buffer_bytes (int) \u2013 The size of the TCP send buffer (SO_SNDBUF) to use when sending data. If the value is -1, the OS default will be used.

  • \n
  • receive_buffer_bytes (int) \u2013 The size of the TCP receive buffer (SO_RCVBUF) to use when reading data. If the value is -1, the OS default will be used.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass KafkaDestination.PLAINTEXT(security_protocol)[source]\u00b6
\n
\n
\n__init__(security_protocol)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaDestination.SASLPLAINTEXT(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass KafkaDestination.SASLSSL(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n
\n__init__(security_protocol, sasl_mechanism, sasl_jaas_config)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ElasticsearchDestination(name, endpoint, authenticationMethod, upsert=None)[source]\u00b6
\n
\n
\n__init__(name, endpoint, authenticationMethod, upsert=None)[source]\u00b6
\n

Airbyte Destination for Elasticsearch.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/elasticsearch

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • endpoint (str) \u2013 The full url of the Elasticsearch server

  • \n
  • upsert (Optional[bool]) \u2013 If a primary key identifier is defined in the source, an upsert will be performed using the primary key value as the elasticsearch doc id. Does not support composite primary keys.

  • \n
  • authenticationMethod (Union[ElasticsearchDestination.None_, ElasticsearchDestination.ApiKeySecret, ElasticsearchDestination.UsernamePassword]) \u2013 The type of authentication to be used

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass ElasticsearchDestination.None_[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchDestination.ApiKeySecret(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n
\n__init__(apiKeyId, apiKeySecret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass ElasticsearchDestination.UsernamePassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MysqlDestination(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Mysql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mysql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.SftpJsonDestination(name, host, username, password, destination_path, port=None)[source]\u00b6
\n
\n
\n__init__(name, host, username, password, destination_path, port=None)[source]\u00b6
\n

Airbyte Destination for Sftp Json.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/sftp-json

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the SFTP server.

  • \n
  • port (Optional[int]) \u2013 Port of the SFTP server.

  • \n
  • username (str) \u2013 Username to use to access the SFTP server.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • destination_path (str) \u2013 Path to the directory where json files will be written.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.GcsDestination(name, gcs_bucket_name, gcs_bucket_path, credential, format, gcs_bucket_region=None)[source]\u00b6
\n
\n
\n__init__(name, gcs_bucket_name, gcs_bucket_path, credential, format, gcs_bucket_region=None)[source]\u00b6
\n

Airbyte Destination for Gcs.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/gcs

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • gcs_bucket_name (str) \u2013 You can find the bucket name in the App Engine Admin console Application Settings page, under the label Google Cloud Storage Bucket. Read more here.

  • \n
  • gcs_bucket_path (str) \u2013 GCS bucket path string: the subdirectory under the above bucket to sync the data into.

  • \n
  • gcs_bucket_region (Optional[str]) \u2013 Select a Region of the GCS Bucket. Read more here.

  • \n
  • credential (GcsDestination.HMACKey) \u2013 An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more here.

  • \n
  • format (Union[GcsDestination.AvroApacheAvro, GcsDestination.CSVCommaSeparatedValues, GcsDestination.JSONLinesNewlineDelimitedJSON, GcsDestination.ParquetColumnarStorage]) \u2013 Output data format. One of the following formats must be selected - AVRO format, PARQUET format, CSV format, or JSONL format.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GcsDestination.HMACKey(credential_type, hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n
\n__init__(credential_type, hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.NoCompression(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Deflate(codec, compression_level=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Bzip2(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Xz(codec, compression_level=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Zstandard(codec, compression_level=None, include_checksum=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level=None, include_checksum=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.Snappy(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.AvroApacheAvro(format_type, compression_codec)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.GZIP(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.CSVCommaSeparatedValues(format_type, compression, flattening=None)[source]\u00b6
\n
\n
\n__init__(format_type, compression, flattening=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.JSONLinesNewlineDelimitedJSON(format_type, compression)[source]\u00b6
\n
\n
\n__init__(format_type, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass GcsDestination.ParquetColumnarStorage(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.CassandraDestination(name, keyspace, username, password, address, port, datacenter=None, replication=None)[source]\u00b6
\n
\n
\n__init__(name, keyspace, username, password, address, port, datacenter=None, replication=None)[source]\u00b6
\n

Airbyte Destination for Cassandra.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/cassandra

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • keyspace (str) \u2013 Default Cassandra keyspace to create data in.

  • \n
  • username (str) \u2013 Username to use to access Cassandra.

  • \n
  • password (str) \u2013 Password associated with Cassandra.

  • \n
  • address (str) \u2013 Address to connect to.

  • \n
  • port (int) \u2013 Port of Cassandra.

  • \n
  • datacenter (Optional[str]) \u2013 Datacenter of the cassandra cluster.

  • \n
  • replication (Optional[int]) \u2013 Indicates how many nodes the data should be replicated to.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.FireboltDestination(name, username, password, database, loading_method, account=None, host=None, engine=None)[source]\u00b6
\n
\n
\n__init__(name, username, password, database, loading_method, account=None, host=None, engine=None)[source]\u00b6
\n

Airbyte Destination for Firebolt.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/firebolt

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • username (str) \u2013 Firebolt email address you use to login.

  • \n
  • password (str) \u2013 Firebolt password.

  • \n
  • account (Optional[str]) \u2013 Firebolt account to login.

  • \n
  • host (Optional[str]) \u2013 The host name of your Firebolt database.

  • \n
  • database (str) \u2013 The database to connect to.

  • \n
  • engine (Optional[str]) \u2013 Engine name or url to connect to.

  • \n
  • loading_method (Union[FireboltDestination.SQLInserts, FireboltDestination.ExternalTableViaS3]) \u2013 Loading method used to select the way data will be uploaded to Firebolt

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass FireboltDestination.SQLInserts[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass FireboltDestination.ExternalTableViaS3(s3_bucket, s3_region, aws_key_id, aws_key_secret)[source]\u00b6
\n
\n
\n__init__(s3_bucket, s3_region, aws_key_id, aws_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.GoogleSheetsDestination(name, spreadsheet_id, credentials)[source]\u00b6
\n
\n
\n__init__(name, spreadsheet_id, credentials)[source]\u00b6
\n

Airbyte Destination for Google Sheets.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/google-sheets

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • spreadsheet_id (str) \u2013 The link to your spreadsheet. See this guide for more details.

  • \n
  • credentials (GoogleSheetsDestination.AuthenticationViaGoogleOAuth) \u2013 Google API Credentials for connecting to Google Sheets and Google Drive APIs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass GoogleSheetsDestination.AuthenticationViaGoogleOAuth(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n
\n__init__(client_id, client_secret, refresh_token)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.DatabricksDestination(name, accept_terms, databricks_server_hostname, databricks_http_path, databricks_personal_access_token, data_source, databricks_port=None, database_schema=None, purge_staging_data=None)[source]\u00b6
\n
\n
\n__init__(name, accept_terms, databricks_server_hostname, databricks_http_path, databricks_personal_access_token, data_source, databricks_port=None, database_schema=None, purge_staging_data=None)[source]\u00b6
\n

Airbyte Destination for Databricks.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/databricks

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • accept_terms (bool) \u2013 You must agree to the Databricks JDBC Driver Terms & Conditions to use this connector.

  • \n
  • databricks_server_hostname (str) \u2013 Databricks Cluster Server Hostname.

  • \n
  • databricks_http_path (str) \u2013 Databricks Cluster HTTP Path.

  • \n
  • databricks_port (Optional[str]) \u2013 Databricks Cluster Port.

  • \n
  • databricks_personal_access_token (str) \u2013 Databricks Personal Access Token for making authenticated requests.

  • \n
  • database_schema (Optional[str]) \u2013 The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is \u201cpublic\u201d.

  • \n
  • data_source (Union[DatabricksDestination.AmazonS3, DatabricksDestination.AzureBlobStorage]) \u2013 Storage on which the delta lake is built.

  • \n
  • purge_staging_data (Optional[bool]) \u2013 Defaults to \u2018true\u2019. Switch it to \u2018false\u2019 for debugging purposes.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass DatabricksDestination.AmazonS3(data_source_type, s3_bucket_name, s3_bucket_path, s3_bucket_region, s3_access_key_id, s3_secret_access_key, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(data_source_type, s3_bucket_name, s3_bucket_path, s3_bucket_region, s3_access_key_id, s3_secret_access_key, file_name_pattern=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass DatabricksDestination.AzureBlobStorage(data_source_type, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n
\n__init__(data_source_type, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.BigqueryDenormalizedDestination(name, project_id, dataset_id, loading_method, credentials_json=None, dataset_location=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, dataset_id, loading_method, credentials_json=None, dataset_location=None, big_query_client_buffer_size_mb=None)[source]\u00b6
\n

Airbyte Destination for Bigquery Denormalized.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/bigquery

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target BigQuery dataset. Read more here.

  • \n
  • dataset_id (str) \u2013 The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more here.

  • \n
  • loading_method (Union[BigqueryDenormalizedDestination.StandardInserts, BigqueryDenormalizedDestination.GCSStaging]) \u2013 Loading method used to select the way data will be uploaded to BigQuery. Standard Inserts - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. GCS Staging - Writes large batches of records to a file, uploads the file to GCS, then uses COPY INTO table to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging here.

  • \n
  • credentials_json (Optional[str]) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.

  • \n
  • dataset_location (Optional[str]) \u2013 The location of the dataset. Warning: Changes made after creation will not be applied. The default \u201cUS\u201d value is used if not set explicitly. Read more here.

  • \n
  • big_query_client_buffer_size_mb (Optional[int]) \u2013 Google BigQuery client\u2019s chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more here.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass BigqueryDenormalizedDestination.StandardInserts[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDenormalizedDestination.HMACKey(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n
\n__init__(hmac_key_access_id, hmac_key_secret)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass BigqueryDenormalizedDestination.GCSStaging(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n
\n__init__(credential, gcs_bucket_name, gcs_bucket_path, keep_files_in_gcs_bucket=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.SqliteDestination(name, destination_path)[source]\u00b6
\n
\n
\n__init__(name, destination_path)[source]\u00b6
\n

Airbyte Destination for Sqlite.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/sqlite

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • destination_path (str) \u2013 Path to the sqlite.db file. The file will be placed inside that local mount. For more information check out our docs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MongodbDestination(name, instance_type, database, auth_type)[source]\u00b6
\n
\n
\n__init__(name, instance_type, database, auth_type)[source]\u00b6
\n

Airbyte Destination for Mongodb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mongodb

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n\n
\n
\nclass MongodbDestination.StandaloneMongoDbInstance(instance, host, port, tls=None)[source]\u00b6
\n
\n
\n__init__(instance, host, port, tls=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.ReplicaSet(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n
\n__init__(instance, server_addresses, replica_set=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.MongoDBAtlas(instance, cluster_url)[source]\u00b6
\n
\n
\n__init__(instance, cluster_url)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.None_[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MongodbDestination.LoginPassword(username, password)[source]\u00b6
\n
\n
\n__init__(username, password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RocksetDestination(name, api_key, workspace, api_server=None)[source]\u00b6
\n
\n
\n__init__(name, api_key, workspace, api_server=None)[source]\u00b6
\n

Airbyte Destination for Rockset.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/rockset

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • api_key (str) \u2013 Rockset api key

  • \n
  • workspace (str) \u2013 The Rockset workspace in which collections will be created and written to.

  • \n
  • api_server (Optional[str]) \u2013 Rockset api URL

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.OracleDestination(name, host, port, sid, username, encryption, password=None, jdbc_url_params=None, schema=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, sid, username, encryption, password=None, jdbc_url_params=None, schema=None)[source]\u00b6
\n

Airbyte Destination for Oracle.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/oracle

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The hostname of the database.

  • \n
  • port (int) \u2013 The port of the database.

  • \n
  • sid (str) \u2013 The System Identifier uniquely distinguishes the instance from any other instance on the same computer.

  • \n
  • username (str) \u2013 The username to access the database. This user must have CREATE USER privileges in the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with the username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • schema (Optional[str]) \u2013 The default schema is used as the target schema for all statements issued from the connection that do not explicitly specify a schema name. The usual value for this field is \u201cairbyte\u201d. In Oracle, schemas and users are the same thing, so the \u201cuser\u201d parameter is used as the login credentials and this is used for the default Airbyte message schema.

  • \n
  • encryption (Union[OracleDestination.Unencrypted, OracleDestination.NativeNetworkEncryptionNNE, OracleDestination.TLSEncryptedVerifyCertificate]) \u2013 The encryption method which is used when communicating with the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass OracleDestination.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleDestination.NativeNetworkEncryptionNNE(encryption_algorithm=None)[source]\u00b6
\n
\n
\n__init__(encryption_algorithm=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass OracleDestination.TLSEncryptedVerifyCertificate(ssl_certificate)[source]\u00b6
\n
\n
\n__init__(ssl_certificate)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.CsvDestination(name, destination_path)[source]\u00b6
\n
\n
\n__init__(name, destination_path)[source]\u00b6
\n

Airbyte Destination for Csv.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-csv

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • destination_path (str) \u2013 Path to the directory where csv files will be written. The destination uses the local mount \u201c/local\u201d and any data files will be placed inside that local mount. For more information check out our docs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.S3Destination(name, s3_bucket_name, s3_bucket_path, s3_bucket_region, format, access_key_id=None, secret_access_key=None, s3_endpoint=None, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(name, s3_bucket_name, s3_bucket_path, s3_bucket_region, format, access_key_id=None, secret_access_key=None, s3_endpoint=None, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n

Airbyte Destination for S3.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/s3

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • access_key_id (Optional[str]) \u2013 The access key ID to access the S3 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.

  • \n
  • secret_access_key (Optional[str]) \u2013 The corresponding secret to the access key ID. Read more here

  • \n
  • s3_bucket_name (str) \u2013 The name of the S3 bucket. Read more here.

  • \n
  • s3_bucket_path (str) \u2013 Directory under the S3 bucket where data will be written. Read more here

  • \n
  • s3_bucket_region (str) \u2013 The region of the S3 bucket. See here for all region codes.

  • \n
  • format (Union[S3Destination.AvroApacheAvro, S3Destination.CSVCommaSeparatedValues, S3Destination.JSONLinesNewlineDelimitedJSON, S3Destination.ParquetColumnarStorage]) \u2013 Format of the data output. See here for more details

  • \n
  • s3_endpoint (Optional[str]) \u2013 Your S3 endpoint url. Read more here

  • \n
  • s3_path_format (Optional[str]) \u2013 Format string on how data will be organized inside the S3 bucket directory. Read more here

  • \n
  • file_name_pattern (Optional[str]) \u2013 The pattern allows you to set the file-name format for the S3 staging file(s)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass S3Destination.NoCompression(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Deflate(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Bzip2(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Xz(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Zstandard(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.Snappy(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.AvroApacheAvro(format_type, compression_codec)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.GZIP(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.CSVCommaSeparatedValues(format_type, flattening, compression)[source]\u00b6
\n
\n
\n__init__(format_type, flattening, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.JSONLinesNewlineDelimitedJSON(format_type, compression)[source]\u00b6
\n
\n
\n__init__(format_type, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass S3Destination.ParquetColumnarStorage(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec=None, block_size_mb=None, max_padding_size_mb=None, page_size_kb=None, dictionary_page_size_kb=None, dictionary_encoding=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.AwsDatalakeDestination(name, region, credentials, bucket_name, bucket_prefix, aws_account_id=None, lakeformation_database_name=None)[source]\u00b6
\n
\n
\n__init__(name, region, credentials, bucket_name, bucket_prefix, aws_account_id=None, lakeformation_database_name=None)[source]\u00b6
\n

Airbyte Destination for Aws Datalake.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/aws-datalake

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • aws_account_id (Optional[str]) \u2013 Target AWS account ID.

  • \n
  • region (str) \u2013 Region name

  • \n
  • credentials (Union[AwsDatalakeDestination.IAMRole, AwsDatalakeDestination.IAMUser]) \u2013 Choose How to Authenticate to AWS.

  • \n
  • bucket_name (str) \u2013 Name of the bucket

  • \n
  • bucket_prefix (str) \u2013 S3 prefix

  • \n
  • lakeformation_database_name (Optional[str]) \u2013 Which database to use

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass AwsDatalakeDestination.IAMRole(role_arn)[source]\u00b6
\n
\n
\n__init__(role_arn)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass AwsDatalakeDestination.IAMUser(aws_access_key_id, aws_secret_access_key)[source]\u00b6
\n
\n
\n__init__(aws_access_key_id, aws_secret_access_key)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MssqlDestination(name, host, port, database, schema, username, ssl_method, password=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, schema, username, ssl_method, password=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Mssql.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mssql

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 The host name of the MSSQL database.

  • \n
  • port (int) \u2013 The port of the MSSQL database.

  • \n
  • database (str) \u2013 The name of the MSSQL database.

  • \n
  • schema (str) \u2013 The default schema tables are written to if the source does not specify a namespace. The usual value for this field is \u201cpublic\u201d.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • ssl_method (Union[MssqlDestination.Unencrypted, MssqlDestination.EncryptedTrustServerCertificate, MssqlDestination.EncryptedVerifyCertificate]) \u2013 The encryption method which is used to communicate with the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass MssqlDestination.Unencrypted[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlDestination.EncryptedTrustServerCertificate[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass MssqlDestination.EncryptedVerifyCertificate(hostNameInCertificate=None)[source]\u00b6
\n
\n
\n__init__(hostNameInCertificate=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.PubsubDestination(name, project_id, topic_id, credentials_json)[source]\u00b6
\n
\n
\n__init__(name, project_id, topic_id, credentials_json)[source]\u00b6
\n

Airbyte Destination for Pubsub.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/pubsub

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target PubSub.

  • \n
  • topic_id (str) \u2013 The PubSub topic ID in the given GCP project ID.

  • \n
  • credentials_json (str) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.R2Destination(name, account_id, access_key_id, secret_access_key, s3_bucket_name, s3_bucket_path, format, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(name, account_id, access_key_id, secret_access_key, s3_bucket_name, s3_bucket_path, format, s3_path_format=None, file_name_pattern=None)[source]\u00b6
\n

Airbyte Destination for R2.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/r2

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • account_id (str) \u2013 Cloudflare account ID

  • \n
  • access_key_id (str) \u2013 The access key ID to access the R2 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more here.

  • \n
  • secret_access_key (str) \u2013 The corresponding secret to the access key ID. Read more here

  • \n
  • s3_bucket_name (str) \u2013 The name of the R2 bucket. Read more here.

  • \n
  • s3_bucket_path (str) \u2013 Directory under the R2 bucket where data will be written.

  • \n
  • format (Union[R2Destination.AvroApacheAvro, R2Destination.CSVCommaSeparatedValues, R2Destination.JSONLinesNewlineDelimitedJSON]) \u2013 Format of the data output. See here for more details

  • \n
  • s3_path_format (Optional[str]) \u2013 Format string on how data will be organized inside the R2 bucket directory. Read more here

  • \n
  • file_name_pattern (Optional[str]) \u2013 The pattern allows you to set the file-name format for the R2 staging file(s)

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass R2Destination.NoCompression(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Deflate(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Bzip2(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Xz(codec, compression_level)[source]\u00b6
\n
\n
\n__init__(codec, compression_level)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Zstandard(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n
\n__init__(codec, compression_level, include_checksum=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.Snappy(codec)[source]\u00b6
\n
\n
\n__init__(codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.AvroApacheAvro(format_type, compression_codec)[source]\u00b6
\n
\n
\n__init__(format_type, compression_codec)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.GZIP(compression_type=None)[source]\u00b6
\n
\n
\n__init__(compression_type=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.CSVCommaSeparatedValues(format_type, flattening, compression)[source]\u00b6
\n
\n
\n__init__(format_type, flattening, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass R2Destination.JSONLinesNewlineDelimitedJSON(format_type, compression)[source]\u00b6
\n
\n
\n__init__(format_type, compression)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.JdbcDestination(name, username, jdbc_url, password=None, schema=None)[source]\u00b6
\n
\n
\n__init__(name, username, jdbc_url, password=None, schema=None)[source]\u00b6
\n

Airbyte Destination for Jdbc.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • username (str) \u2013 The username which is used to access the database.

  • \n
  • password (Optional[str]) \u2013 The password associated with this username.

  • \n
  • jdbc_url (str) \u2013 JDBC formatted url. See the standard here.

  • \n
  • schema (Optional[str]) \u2013 If you leave the schema unspecified, JDBC defaults to a schema named \u201cpublic\u201d.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.KeenDestination(name, project_id, api_key, infer_timestamp=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, api_key, infer_timestamp=None)[source]\u00b6
\n

Airbyte Destination for Keen.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/keen

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 To get Keen Project ID, navigate to the Access tab from the left-hand side panel and check the Project Details section.

  • \n
  • api_key (str) \u2013 To get Keen Master API Key, navigate to the Access tab from the left-hand side panel and check the Project Details section.

  • \n
  • infer_timestamp (Optional[bool]) \u2013 Allow connector to guess keen.timestamp value based on the streamed data.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.TidbDestination(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, username, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Tidb.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/tidb

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.FirestoreDestination(name, project_id, credentials_json=None)[source]\u00b6
\n
\n
\n__init__(name, project_id, credentials_json=None)[source]\u00b6
\n

Airbyte Destination for Firestore.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/firestore

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • project_id (str) \u2013 The GCP project ID for the project containing the target Firestore database.

  • \n
  • credentials_json (Optional[str]) \u2013 The contents of the JSON service account key. Check out the docs if you need help generating this key. Default credentials will be used if this field is left empty.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ScyllaDestination(name, keyspace, username, password, address, port, replication=None)[source]\u00b6
\n
\n
\n__init__(name, keyspace, username, password, address, port, replication=None)[source]\u00b6
\n

Airbyte Destination for Scylla.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/scylla

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • keyspace (str) \u2013 Default Scylla keyspace to create data in.

  • \n
  • username (str) \u2013 Username to use to access Scylla.

  • \n
  • password (str) \u2013 Password associated with Scylla.

  • \n
  • address (str) \u2013 Address to connect to.

  • \n
  • port (int) \u2013 Port of Scylla.

  • \n
  • replication (Optional[int]) \u2013 Indicates how many nodes the data should be replicated to.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RedisDestination(name, host, port, username, password, cache_type)[source]\u00b6
\n
\n
\n__init__(name, host, port, username, password, cache_type)[source]\u00b6
\n

Airbyte Destination for Redis.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/redis

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Redis host to connect to.

  • \n
  • port (int) \u2013 Port of Redis.

  • \n
  • username (str) \u2013 Username associated with Redis.

  • \n
  • password (str) \u2013 Password associated with Redis.

  • \n
  • cache_type (str) \u2013 Redis cache type to store data in.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MqttDestination(name, broker_host, broker_port, use_tls, topic_pattern, publisher_sync, connect_timeout, automatic_reconnect, clean_session, message_retained, message_qos, username=None, password=None, topic_test=None, client=None)[source]\u00b6
\n
\n
\n__init__(name, broker_host, broker_port, use_tls, topic_pattern, publisher_sync, connect_timeout, automatic_reconnect, clean_session, message_retained, message_qos, username=None, password=None, topic_test=None, client=None)[source]\u00b6
\n

Airbyte Destination for Mqtt.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/mqtt

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • broker_host (str) \u2013 Host of the broker to connect to.

  • \n
  • broker_port (int) \u2013 Port of the broker.

  • \n
  • use_tls (bool) \u2013 Whether to use TLS encryption on the connection.

  • \n
  • username (Optional[str]) \u2013 User name to use for the connection.

  • \n
  • password (Optional[str]) \u2013 Password to use for the connection.

  • \n
  • topic_pattern (str) \u2013 Topic pattern in which the records will be sent. You can use patterns like \u2018{namespace}\u2019 and/or \u2018{stream}\u2019 to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.

  • \n
  • topic_test (Optional[str]) \u2013 Topic to test if Airbyte can produce messages.

  • \n
  • client (Optional[str]) \u2013 A client identifier that is unique on the server being connected to.

  • \n
  • publisher_sync (bool) \u2013 Wait synchronously until the record has been sent to the broker.

  • \n
  • connect_timeout (int) \u2013 Maximum time interval (in seconds) the client will wait for the network connection to the MQTT server to be established.

  • \n
  • automatic_reconnect (bool) \u2013 Whether the client will automatically attempt to reconnect to the server if the connection is lost.

  • \n
  • clean_session (bool) \u2013 Whether the client and server should remember state across restarts and reconnects.

  • \n
  • message_retained (bool) \u2013 Whether or not the publish message should be retained by the messaging engine.

  • \n
  • message_qos (str) \u2013 Quality of service used for each message to be delivered.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.RedshiftDestination(name, host, port, username, password, database, schema, uploading_method, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, username, password, database, schema, uploading_method, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Redshift.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/redshift

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Host Endpoint of the Redshift Cluster (must include the cluster-id, region and end with .redshift.amazonaws.com)

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (str) \u2013 Password associated with the username.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schema (str) \u2013 The default schema tables are written to if the source does not specify a namespace. Unless specifically configured, the usual value for this field is \u201cpublic\u201d.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
  • uploading_method (Union[RedshiftDestination.Standard, RedshiftDestination.S3Staging]) \u2013 The method how the data will be uploaded to the database.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.Standard[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.NoEncryption[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.AESCBCEnvelopeEncryption(key_encrypting_key=None)[source]\u00b6
\n
\n
\n__init__(key_encrypting_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass RedshiftDestination.S3Staging(s3_bucket_name, s3_bucket_region, access_key_id, secret_access_key, encryption, s3_bucket_path=None, file_name_pattern=None, purge_staging_data=None)[source]\u00b6
\n
\n
\n__init__(s3_bucket_name, s3_bucket_region, access_key_id, secret_access_key, encryption, s3_bucket_path=None, file_name_pattern=None, purge_staging_data=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.PulsarDestination(name, brokers, use_tls, topic_type, topic_tenant, topic_namespace, topic_pattern, compression_type, send_timeout_ms, max_pending_messages, max_pending_messages_across_partitions, batching_enabled, batching_max_messages, batching_max_publish_delay, block_if_queue_full, topic_test=None, producer_name=None, producer_sync=None)[source]\u00b6
\n
\n
\n__init__(name, brokers, use_tls, topic_type, topic_tenant, topic_namespace, topic_pattern, compression_type, send_timeout_ms, max_pending_messages, max_pending_messages_across_partitions, batching_enabled, batching_max_messages, batching_max_publish_delay, block_if_queue_full, topic_test=None, producer_name=None, producer_sync=None)[source]\u00b6
\n

Airbyte Destination for Pulsar.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/pulsar

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • brokers (str) \u2013 A list of host/port pairs to use for establishing the initial connection to the Pulsar cluster.

  • \n
  • use_tls (bool) \u2013 Whether to use TLS encryption on the connection.

  • \n
  • topic_type (str) \u2013 It identifies type of topic. Pulsar supports two kind of topics: persistent and non-persistent. In persistent topic, all messages are durably persisted on disk (that means on multiple disks unless the broker is standalone), whereas non-persistent topic does not persist message into storage disk.

  • \n
  • topic_tenant (str) \u2013 The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters.

  • \n
  • topic_namespace (str) \u2013 The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the namespace level. Each tenant has one or multiple namespaces.

  • \n
  • topic_pattern (str) \u2013 Topic pattern in which the records will be sent. You can use patterns like \u2018{namespace}\u2019 and/or \u2018{stream}\u2019 to send the message to a specific topic based on these values. Notice that the topic name will be transformed to a standard naming convention.

  • \n
  • topic_test (Optional[str]) \u2013 Topic to test if Airbyte can produce messages.

  • \n
  • producer_name (Optional[str]) \u2013 Name for the producer. If not filled, the system will generate a globally unique name which can be accessed with.

  • \n
  • producer_sync (Optional[bool]) \u2013 Wait synchronously until the record has been sent to Pulsar.

  • \n
  • compression_type (str) \u2013 Compression type for the producer.

  • \n
  • send_timeout_ms (int) \u2013 If a message is not acknowledged by a server before the send-timeout expires, an error occurs (in ms).

  • \n
  • max_pending_messages (int) \u2013 The maximum size of a queue holding pending messages.

  • \n
  • max_pending_messages_across_partitions (int) \u2013 The maximum number of pending messages across partitions.

  • \n
  • batching_enabled (bool) \u2013 Control whether automatic batching of messages is enabled for the producer.

  • \n
  • batching_max_messages (int) \u2013 Maximum number of messages permitted in a batch.

  • \n
  • batching_max_publish_delay (int) \u2013 Time period in milliseconds within which the messages sent will be batched.

  • \n
  • block_if_queue_full (bool) \u2013 If the send operation should block when the outgoing message queue is full.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.SnowflakeDestination(name, host, role, warehouse, database, schema, username, credentials, loading_method, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, role, warehouse, database, schema, username, credentials, loading_method, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Snowflake.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/snowflake

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Enter your Snowflake account\u2019s locator (in the format \u2026snowflakecomputing.com)

  • \n
  • role (str) \u2013 Enter the role that you want to use to access Snowflake

  • \n
  • warehouse (str) \u2013 Enter the name of the warehouse that you want to sync data into

  • \n
  • database (str) \u2013 Enter the name of the database you want to sync data into

  • \n
  • schema (str) \u2013 Enter the name of the default schema

  • \n
  • username (str) \u2013 Enter the name of the user you want to use to access the database

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Enter the additional properties to pass to the JDBC URL string when connecting to the database (formatted as key=value pairs separated by the symbol &). Example: key1=value1&key2=value2&key3=value3

  • \n
  • loading_method (Union[SnowflakeDestination.SelectAnotherOption, SnowflakeDestination.RecommendedInternalStaging, SnowflakeDestination.AWSS3Staging, SnowflakeDestination.GoogleCloudStorageStaging, SnowflakeDestination.AzureBlobStorageStaging]) \u2013 Select a data staging method

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.OAuth20(access_token, refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n
\n__init__(access_token, refresh_token, auth_type=None, client_id=None, client_secret=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.KeyPairAuthentication(private_key, auth_type=None, private_key_password=None)[source]\u00b6
\n
\n
\n__init__(private_key, auth_type=None, private_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.UsernameAndPassword(password)[source]\u00b6
\n
\n
\n__init__(password)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.SelectAnotherOption(method)[source]\u00b6
\n
\n
\n__init__(method)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.RecommendedInternalStaging(method)[source]\u00b6
\n
\n
\n__init__(method)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.NoEncryption[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.AESCBCEnvelopeEncryption(key_encrypting_key=None)[source]\u00b6
\n
\n
\n__init__(key_encrypting_key=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.AWSS3Staging(method, s3_bucket_name, access_key_id, secret_access_key, encryption, s3_bucket_region=None, purge_staging_data=None, file_name_pattern=None)[source]\u00b6
\n
\n
\n__init__(method, s3_bucket_name, access_key_id, secret_access_key, encryption, s3_bucket_region=None, purge_staging_data=None, file_name_pattern=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.GoogleCloudStorageStaging(method, project_id, bucket_name, credentials_json)[source]\u00b6
\n
\n
\n__init__(method, project_id, bucket_name, credentials_json)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass SnowflakeDestination.AzureBlobStorageStaging(method, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n
\n__init__(method, azure_blob_storage_account_name, azure_blob_storage_container_name, azure_blob_storage_sas_token, azure_blob_storage_endpoint_domain_name=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.PostgresDestination(name, host, port, database, schema, username, ssl_mode, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n
\n
\n__init__(name, host, port, database, schema, username, ssl_mode, password=None, ssl=None, jdbc_url_params=None)[source]\u00b6
\n

Airbyte Destination for Postgres.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/postgres

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the database.

  • \n
  • port (int) \u2013 Port of the database.

  • \n
  • database (str) \u2013 Name of the database.

  • \n
  • schema (str) \u2013 The default schema tables are written to if the source does not specify a namespace. The usual value for this field is \u201cpublic\u201d.

  • \n
  • username (str) \u2013 Username to use to access the database.

  • \n
  • password (Optional[str]) \u2013 Password associated with the username.

  • \n
  • ssl (Optional[bool]) \u2013 Encrypt data using SSL. When activating SSL, please select one of the connection modes.

  • \n
  • ssl_mode (Union[PostgresDestination.Disable, PostgresDestination.Allow, PostgresDestination.Prefer, PostgresDestination.Require, PostgresDestination.VerifyCa, PostgresDestination.VerifyFull]) \u2013 SSL connection modes. disable - Choose this mode to disable encryption of communication between Airbyte and destination database allow - Choose this mode to enable encryption only when required by the source database prefer - Choose this mode to allow unencrypted connection only if the source database does not support encryption require - Choose this mode to always require encryption. If the source database server does not support encryption, connection will fail verify-ca - Choose this mode to always require encryption and to verify that the source database server has a valid SSL certificate verify-full - This is the most secure mode. Choose this mode to always require encryption and to verify the identity of the source database server See more information - in the docs.

  • \n
  • jdbc_url_params (Optional[str]) \u2013 Additional properties to pass to the JDBC URL string when connecting to the database formatted as \u2018key=value\u2019 pairs separated by the symbol \u2018&\u2019. (example: key1=value1&key2=value2&key3=value3).

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Disable[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Allow[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Prefer[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.Require[source]\u00b6
\n
\n
\n__init__()[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.VerifyCa(ca_certificate, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass PostgresDestination.VerifyFull(ca_certificate, client_certificate, client_key, client_key_password=None)[source]\u00b6
\n
\n
\n__init__(ca_certificate, client_certificate, client_key, client_key_password=None)[source]\u00b6
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.ScaffoldDestinationPythonDestination(name, TODO=None)[source]\u00b6
\n
\n
\n__init__(name, TODO=None)[source]\u00b6
\n

Airbyte Destination for Scaffold Destination Python.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/scaffold-destination-python

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • TODO (Optional[str]) \u2013 FIX ME

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.LocalJsonDestination(name, destination_path)[source]\u00b6
\n
\n
\n__init__(name, destination_path)[source]\u00b6
\n

Airbyte Destination for Local Json.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/local-json

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • destination_path (str) \u2013 Path to the directory where json files will be written. The files will be placed inside that local mount. For more information check out our docs

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_airbyte.managed.generated.destinations.MeilisearchDestination(name, host, api_key=None)[source]\u00b6
\n
\n
\n__init__(name, host, api_key=None)[source]\u00b6
\n

Airbyte Destination for Meilisearch.

\n

Documentation can be found at https://docs.airbyte.com/integrations/destinations/meilisearch

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the destination.

  • \n
  • host (str) \u2013 Hostname of the MeiliSearch instance.

  • \n
  • api_key (Optional[str]) \u2013 MeiliSearch API Key. See the docs for more information on how to obtain this key.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_airbyte.airbyte_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Airbyte API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
request_timeout (dagster.IntSource, optional):
\n

Time (in seconds) after which the requests to Airbyte are declared timed out.

\n

Default Value: 15

\n
\n
cancel_sync_on_run_termination (dagster.BoolSource, optional):
\n

Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may be useful to disable if using Airbyte sources that cannot be cancelled and resumed easily, or if your Dagster deployment may experience runner interruptions that do not impact your Airbyte deployment.

\n

Default Value: True

\n
\n
poll_interval (Float, optional):
\n

Time (in seconds) to wait between checking a sync\u2019s status.

\n

Default Value: 10

\n
\n
host (dagster.StringSource):
\n

The Airbyte server address.

\n
\n
port (dagster.StringSource):
\n

Port used for the Airbyte server.

\n
\n
username (Union[dagster.StringSource, None], optional):
\n

Username if using basic auth.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password if using basic auth.

\n
\n
use_https (dagster.BoolSource, optional):
\n

Whether to use HTTPS to connect to the Airbyte server.

\n

Default Value: False

\n
\n
forward_logs (dagster.BoolSource, optional):
\n

Whether to forward Airbyte logs to the compute log, can be expensive for long-running syncs.

\n

Default Value: True

\n
\n
request_additional_params (dict, optional):
\n

Any additional kwargs to pass to the requests library when making requests to Airbyte.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource allows users to programmatically interface with the Airbyte REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Airbyte REST API, including expected response JSON\nschema, see the Airbyte API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_airbyte import airbyte_resource\n\nmy_airbyte_resource = airbyte_resource.configured(\n    {\n        "host": {"env": "AIRBYTE_HOST"},\n        "port": {"env": "AIRBYTE_PORT"},\n        # If using basic auth\n        "username": {"env": "AIRBYTE_USERNAME"},\n        "password": {"env": "AIRBYTE_PASSWORD"},\n    }\n)\n\n@job(resource_defs={"airbyte":my_airbyte_resource})\ndef my_airbyte_job():\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airbyte", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../../memoization/", "title": "Job-Level Versioning and Memoization (Deprecated)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "N", "next"], ["sections/api/apidocs/memoization", "Job-Level Versioning and Memoization (Deprecated)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airbyte.rst.txt", "title": "Airbyte (dagster-airbyte)", "toc": "\n"}, "dagster-airflow": {"alabaster_version": "0.7.13", "body": "
\n

Airflow (dagster-airflow)\u00b6

\n

This library provides a Dagster integration with Airflow.

\n

For more information on getting started, see the Airflow integration guide.

\n
\n

Run Airflow on Dagster\u00b6

\n
\n
\ndagster_airflow.make_dagster_definitions_from_airflow_dags_path(dag_path, safe_mode=True, connections=None, resource_defs={})[source]\u00b6
\n

Construct a Dagster repository corresponding to Airflow DAGs in dag_path.

\n
\n
Usage:

Create make_dagster_definitions.py:

\n
from dagster_airflow import make_dagster_definitions_from_airflow_dags_path\n\ndef make_definitions_from_dir():\n    return make_dagster_definitions_from_airflow_dags_path(\n        '/path/to/dags/',\n    )\n
\n
\n

Use RepositoryDefinition as usual, for example:\ndagster-webserver -f path/to/make_dagster_repo.py -n make_repo_from_dir

\n
\n
\n
\n
Parameters:
\n
    \n
  • dag_path (str) \u2013 Path to directory or file that contains Airflow Dags

  • \n
  • include_examples (bool) \u2013 True to include Airflow\u2019s example DAGs. (default: False)

  • \n
  • safe_mode (bool) \u2013 True to use Airflow\u2019s default heuristic to find files that contain DAGs\n(ie find files that contain both b\u2019DAG\u2019 and b\u2019airflow\u2019) (default: True)

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

Definitions

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_definitions_from_airflow_dag_bag(dag_bag, connections=None, resource_defs={})[source]\u00b6
\n

Construct a Dagster definition corresponding to Airflow DAGs in DagBag.

\n
\n
Usage:
\n
Create make_dagster_definition.py:

from dagster_airflow import make_dagster_definitions_from_airflow_dag_bag\nfrom airflow_home import my_dag_bag

\n
\n
def make_definition_from_dag_bag():

return make_dagster_definitions_from_airflow_dag_bag(my_dag_bag)

\n
\n
\n
\n
Use Definitions as usual, for example:

dagster-webserver -f path/to/make_dagster_definition.py

\n
\n
\n
\n
\n
\n
Parameters:
\n
    \n
  • dag_bag (DagBag) \u2013 Airflow DagBag Model

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

Definitions

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_schedules_and_jobs_from_airflow_dag_bag(dag_bag, connections=None, resource_defs={})[source]\u00b6
\n

Construct Dagster Schedules and Jobs corresponding to Airflow DagBag.

\n
\n
Parameters:
\n
    \n
  • dag_bag (DagBag) \u2013 Airflow DagBag Model

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

The generated Dagster Schedules\n- List[JobDefinition]: The generated Dagster Jobs

\n
\n
Return type:
\n

    \n
  • List[ScheduleDefinition]

  • \n
\n

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_dagster_job_from_airflow_dag(dag, tags=None, connections=None, resource_defs={})[source]\u00b6
\n

Construct a Dagster job corresponding to a given Airflow DAG.

\n

Tasks in the resulting job will execute the execute() method on the corresponding\nAirflow Operator. Dagster, any dependencies required by Airflow Operators, and the module\ncontaining your DAG definition must be available in the Python environment within which your\nDagster solids execute.

\n

To set Airflow\u2019s execution_date for use with Airflow Operator\u2019s execute() methods,\neither:

\n
    \n
  1. \n
    (Best for ad hoc runs) Execute job directly. This will set execution_date to the

    time (in UTC) of the run.

    \n
    \n
    \n
  2. \n
  3. \n
    Add {'airflow_execution_date': utc_date_string} to the job tags. This will override

    behavior from (1).

    \n
    my_dagster_job = make_dagster_job_from_airflow_dag(\n        dag=dag,\n        tags={'airflow_execution_date': utc_execution_date_str}\n)\nmy_dagster_job.execute_in_process()\n
    \n
    \n
    \n
    \n
  4. \n
  5. \n
    (Recommended) Add {'airflow_execution_date': utc_date_string} to the run tags,

    such as in the Dagster UI. This will override behavior from (1) and (2)

    \n
    \n
    \n
  6. \n
\n

We apply normalized_name() to the dag id and task ids when generating job name and op\nnames to ensure that names conform to Dagster\u2019s naming conventions.

\n
\n
Parameters:
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster job

  • \n
  • tags (Dict[str, Field]) \u2013 Job tags. Optionally include\ntags={\u2018airflow_execution_date\u2019: utc_date_string} to specify execution_date used within\nexecution of Airflow Operators.

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Ephemeral\nAirflow DB, if use_emphemeral_airflow_db is False this will be ignored.

  • \n
\n
\n
Returns:
\n

The generated Dagster job

\n
\n
Return type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.load_assets_from_airflow_dag(dag, task_ids_by_asset_key={}, upstream_dependencies_by_asset_key={}, connections=None)[source]\u00b6
\n

[Experimental] Construct Dagster Assets for a given Airflow DAG.

\n
\n
Parameters:
\n
    \n
  • dag (DAG) \u2013 The Airflow DAG to compile into a Dagster job

  • \n
  • task_ids_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[str]]]) \u2013 A mapping from asset\nkeys to task ids. Used to break up the Airflow Dag into multiple SDAs

  • \n
  • upstream_dependencies_by_asset_key (Optional[Mapping[AssetKey, AbstractSet[AssetKey]]]) \u2013 A\nmapping from upstream asset keys to assets provided in task_ids_by_asset_key. Used to\ndeclare new upstream SDA dependencies.

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
\n
\n
Returns:
\n

List[AssetsDefinition]

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_ephemeral_airflow_db_resource(connections=[], dag_run_config=None)[source]\u00b6
\n

Creates a Dagster resource that provides an ephemeral Airflow database.

\n
\n
Parameters:
\n
    \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
  • dag_run_config (Optional[dict]) \u2013 dag_run configuration to be used when creating a DagRun

  • \n
\n
\n
Returns:
\n

The ephemeral Airflow DB resource

\n
\n
Return type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\ndagster_airflow.make_persistent_airflow_db_resource(uri='', connections=[], dag_run_config={})[source]\u00b6
\n

Creates a Dagster resource that provides a persistent Airflow database.

\n
\n
Usage:
from dagster_airflow import (\n    make_dagster_definitions_from_airflow_dags_path,\n    make_persistent_airflow_db_resource,\n)\npostgres_airflow_db = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"\nairflow_db = make_persistent_airflow_db_resource(uri=postgres_airflow_db)\ndefinitions = make_dagster_definitions_from_airflow_dags_path(\n    '/path/to/dags/',\n    resource_defs={"airflow_db": airflow_db}\n)\n
\n
\n
\n
\n
\n
Parameters:
\n
    \n
  • uri \u2013 SQLAlchemy URI of the Airflow DB to be used

  • \n
  • connections (List[Connection]) \u2013 List of Airflow Connections to be created in the Airflow DB

  • \n
  • dag_run_config (Optional[dict]) \u2013 dag_run configuration to be used when creating a DagRun

  • \n
\n
\n
Returns:
\n

The persistent Airflow DB resource

\n
\n
Return type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\n

Orchestrate Dagster from Airflow\u00b6

\n
\n
\nclass dagster_airflow.DagsterCloudOperator(*args, **kwargs)[source]\u00b6
\n

DagsterCloudOperator.

\n

Uses the dagster cloud graphql api to run and monitor dagster jobs on dagster cloud

\n
\n
Parameters:
\n
    \n
  • repository_name (str) \u2013 the name of the repository to use

  • \n
  • repostitory_location_name (str) \u2013 the name of the repository location to use

  • \n
  • job_name (str) \u2013 the name of the job to run

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 the run config to use for the job run

  • \n
  • dagster_conn_id (Optional[str]) \u2013 the id of the dagster connection, airflow 2.0+ only

  • \n
  • organization_id (Optional[str]) \u2013 the id of the dagster cloud organization

  • \n
  • deployment_name (Optional[str]) \u2013 the name of the dagster cloud deployment

  • \n
  • user_token (Optional[str]) \u2013 the dagster cloud user token to use

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_airflow.DagsterOperator(*args, **kwargs)[source]\u00b6
\n

DagsterOperator.

\n

Uses the dagster graphql api to run and monitor dagster jobs on remote dagster infrastructure

\n
\n
Parameters:
\n
    \n
  • repository_name (str) \u2013 the name of the repository to use

  • \n
  • repostitory_location_name (str) \u2013 the name of the repository location to use

  • \n
  • job_name (str) \u2013 the name of the job to run

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 the run config to use for the job run

  • \n
  • dagster_conn_id (Optional[str]) \u2013 the id of the dagster connection, airflow 2.0+ only

  • \n
  • organization_id (Optional[str]) \u2013 the id of the dagster cloud organization

  • \n
  • deployment_name (Optional[str]) \u2013 the name of the dagster cloud deployment

  • \n
  • user_token (Optional[str]) \u2013 the dagster cloud user token to use

  • \n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-airflow", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-airflow.rst.txt", "title": "Airflow (dagster-airflow)", "toc": "\n"}, "dagster-aws": {"alabaster_version": "0.7.13", "body": "
\n

AWS (dagster-aws)\u00b6

\n

Utilities for interfacing with AWS with Dagster.

\n
\n

S3\u00b6

\n
\n
\ndagster_aws.s3.S3Resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
\n

Resource that gives access to S3.

\n

The underlying S3 session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is an S3 client, an instance of botocore.client.S3.

\n

Example

\n
from dagster import job, op, Definitions\nfrom dagster_aws.s3 import S3Resource\n\n@op\ndef example_s3_op(s3: S3Resource):\n    return s3.get_client().list_objects_v2(\n        Bucket='my-bucket',\n        Prefix='some-key'\n    )\n\n@job\ndef example_job():\n    example_s3_op()\n\ndefs = Definitions(\n    jobs=[example_job],\n    resources={'s3': S3Resource(region_name='us-west-1')}\n)\n
\n
\n
\n\n
\n
\ndagster_aws.s3.S3PickleIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_resource (Union[Any, None], optional):
\n

\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage.

\n

Serializes objects via pickling. Suitable for objects storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
from dagster import asset, Definitions\nfrom dagster_aws.s3 import S3PickleIOManager, S3Resource\n\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": S3PickleIOManager(\n            s3_resource=S3Resource(),\n            s3_bucket="my-cool-bucket",\n            s3_prefix="my-cool-prefix",\n        )\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster_aws.s3.S3ComputeLogManager(bucket, local_dir=None, inst_data=None, prefix='dagster', use_ssl=True, verify=True, verify_cert_path=None, endpoint_url=None, skip_empty_files=False, upload_interval=None, upload_extra_args=None, show_url_only=False, region=None)[source]\u00b6
\n

Logs compute function stdout and stderr to S3.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_aws.s3.compute_log_manager\n  class: S3ComputeLogManager\n  config:\n    bucket: "mycorp-dagster-compute-logs"\n    local_dir: "/tmp/cool"\n    prefix: "dagster-test-"\n    use_ssl: true\n    verify: true\n    verify_cert_path: "/path/to/cert/bundle.pem"\n    endpoint_url: "http://alternate-s3-host.io"\n    skip_empty_files: true\n    upload_interval: 30\n    upload_extra_args:\n      ServerSideEncryption: "AES256"\n    show_url_only: false\n    region: "us-west-1"\n
\n
\n
\n
Parameters:
\n
    \n
  • bucket (str) \u2013 The name of the s3 bucket to which to log.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster._seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • use_ssl (Optional[bool]) \u2013 Whether or not to use SSL. Default True.

  • \n
  • verify (Optional[bool]) \u2013 Whether or not to verify SSL certificates. Default True.

  • \n
  • verify_cert_path (Optional[str]) \u2013 A filename of the CA cert bundle to use. Only used if\nverify set to False.

  • \n
  • endpoint_url (Optional[str]) \u2013 Override for the S3 endpoint url.

  • \n
  • skip_empty_files (Optional[bool]) \u2013 Skip upload of empty log files.

  • \n
  • upload_interval (Optional[int]) \u2013 Interval in seconds to upload partial log files to S3. By default, will only upload when the capture is complete.

  • \n
  • upload_extra_args (Optional[dict]) \u2013 Extra args for S3 file upload.

  • \n
  • show_url_only (Optional[bool]) \u2013 Only show the URL of the log file in the UI, instead of fetching and displaying the full content. Default False.

  • \n
  • region (Optional[str]) \u2013 The region of the S3 bucket. If not specified, will use the default region of the AWS session.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_aws.s3.S3Coordinate DagsterType\u00b6
\n

A dagster.DagsterType intended to make it easier to pass information about files on S3\nfrom op to op. Objects of this type should be dicts with 'bucket' and 'key' keys,\nand may be hydrated from config in the intuitive way, e.g., for an input with the name\ns3_file:

\n
inputs:\n  s3_file:\n    value:\n      bucket: my-bucket\n      key: my-key\n
\n
\n
\n\n
\n

File Manager (Experimental)\u00b6

\n
\n
\nclass dagster_aws.s3.S3FileHandle(s3_bucket, s3_key)[source]\u00b6
\n

A reference to a file on S3.

\n
\n\n
\n
\ndagster_aws.s3.S3FileManagerResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to use for that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n
\n
\n

ECS\u00b6

\n
\n
\ndagster_aws.ecs.EcsRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
task_definition (Union[String, strict dict], optional):
\n

Either the short name of an existing task definition to use when launching new tasks, or a dictionary configuration to use when creating a task definition for the run. If neither is provided, the task definition will be created based on the current task\u2019s task definition.

\n
\n
container_name (dagster.StringSource, optional):
\n

The container name to use when launching new tasks. Defaults to \u2018run\u2019.

\n

Default Value: \u2018run\u2019

\n
\n
secrets (List[Union[String, strict dict]], optional):
\n

An array of AWS Secrets Manager secrets. These secrets will be mounted as environment variables in the container. See https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_Secret.html.

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional):
\n

AWS Secrets Manager secrets with this tag will be mounted as environment variables in the container. Defaults to \u2018dagster\u2019.

\n

Default Value: \u2018dagster\u2019

\n
\n
include_sidecars (Bool, optional):
\n

Whether each run should use the same sidecars as the task that launches it. Defaults to False.

\n

Default Value: False

\n
\n
use_current_ecs_task_config (Bool, optional):
\n

Whether to use the run launcher\u2019s current ECS task in order to determine the cluster and networking configuration for the launched task. Defaults to True. Should only be called if the run launcher is running within an ECS task.

\n

Default Value: True

\n
\n
run_task_kwargs (permissive dict, optional):
\n

Additional arguments to include while running the task. See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs.html#ECS.Client.run_task for the available parameters. The overrides and taskDefinition arguments will always be set by the run launcher.

\n
\nConfig Schema:
\n
cluster (dagster.StringSource, optional):
\n

Name of the ECS cluster to launch ECS tasks in.

\n
\n
\n
\n
env_vars (List[dagster.StringSource], optional):
\n

List of environment variable names to include in the ECS task. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process)

\n
\n
run_resources (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
cpu (String, optional):
\n

The CPU override to use for the launched task.

\n
\n
memory (String, optional):
\n

The memory override to use for the launched task.

\n
\n
ephemeral_storage (Int, optional):
\n

The ephemeral storage, in GiB, to use for the launched task.

\n
\n
\n
\n
run_ecs_tags (List[strict dict], optional):
\n

Additional tags to apply to the launched ECS task.

\n
\n
\n

RunLauncher that starts a task in ECS for each Dagster job run.

\n
\n\n
\n
\n

Redshift\u00b6

\n
\n
\ndagster_aws.redshift.RedshiftClientResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import Definitions, asset, EnvVar\nfrom dagster_aws.redshift import RedshiftClientResource\n\n@asset\ndef example_redshift_asset(context, redshift: RedshiftClientResource):\n    redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = RedshiftClientResource(\n    host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    port=5439,\n    user='dagster',\n    password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n    database='dev',\n)\n\ndefs = Definitions(\n    assets=[example_redshift_asset],\n    resources={'redshift': redshift_configured},\n)\n
\n
\n
\n\n
\n

Testing\u00b6

\n
\n
\ndagster_aws.redshift.FakeRedshiftClientResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import Definitions, asset, EnvVar\nfrom dagster_aws.redshift import RedshiftClientResource\n\n@asset\ndef example_redshift_asset(context, redshift: RedshiftClientResource):\n    redshift.get_client().execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = RedshiftClientResource(\n    host='my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    port=5439,\n    user='dagster',\n    password=EnvVar("DAGSTER_REDSHIFT_PASSWORD"),\n    database='dev',\n)\n\ndefs = Definitions(\n    assets=[example_redshift_asset],\n    resources={'redshift': redshift_configured},\n)\n
\n
\n
\n\n
\n
\n
\n

EMR\u00b6

\n
\n
\ndagster_aws.emr.emr_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_config (permissive dict, optional):
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            
"referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional):
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": 
{\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional):
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional):
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional):
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional):
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the --driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional):
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional):
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional):
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
executor (permissive dict, optional):
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching executor JVMs.

\n
\n
logs (permissive dict, optional):
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional):
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional):
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional):
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional):
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional):
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional):
\n

Execution Behavior: The number of cores to use on each executor. In standalone and Mesos coarse-grained modes, for more detail, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional):
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
extraListeners (dagster.StringSource, optional):
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional):
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional):
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional):
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URLs.

\n
\n
submit (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional):
\n

Application Properties: The deploy mode of Spark driver program, either \u201cclient\u201d or \u201ccluster\u201d, which means to launch driver program locally (\u201cclient\u201d) or remotely (\u201ccluster\u201d) on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional):
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional):
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional):
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Runtime Environment: Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), or it will be displayed before the driver exits. It also can be dumped into disk by sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional):
\n

Runtime Environment: The directory which is used to dump the profile result before the driver exits. The results will be dumped as a separate file for each RDD. They can be loaded by pstats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional):
\n

Runtime Environment: Reuse Python worker or not. If yes, it will use a fixed number of Python workers and does not need to fork() a Python process for every task. It will be very useful if there is a large broadcast, since the broadcast will then not need to be transferred from JVM to Python worker for every task.

\n
\n
\n
\n
\n
\n
files (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional):
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional):
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional):
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional):
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional):
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes that could be scanned at the same time. This is used when putting multiple files into a partition. It is better to overestimate; then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option --repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional):
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional):
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option --repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with --packages or spark.jars.packages.

\n
\n
\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional):
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increases, it might lead to a very large number of inbound connections to one or more nodes, causing the workers to fail under load. By allowing it to limit the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional):
\n

Shuffle Behavior: The remote block will be fetched to disk when size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (eg. 200m) to avoid using too much memory on smaller blocks as well. Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be used when external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional):
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional):
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional):
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional):
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional):
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional):
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional):
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional):
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional):
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait), if those limits are reached the task will fail with fetch failure.

\n
\n
sort (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetching shuffle blocks.

\n
\n
registration (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional):
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional):
\n

Shuffle Behavior: When we fail to register to the external shuffle service, we will retry for maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills are frequent, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
eventLog (permissive dict, optional):
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional):
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional):
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional):
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional):
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional):
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional):
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional):
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional):
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional):
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional):
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional):
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional):
\n

Spark UI: Enable running Spark Master as reverse proxy for worker and application UIs. In this mode, Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution, as worker and application UI will not be accessible directly, you will only be able to access them through spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional):
\n

Spark UI: This is the URL where your proxy is running. This URL is for proxy which is running in front of Spark Master. This is useful when running proxy for authentication e.g. OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional):
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional):
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional):
\n

Spark UI: Comma separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value> For example: spark.ui.filters=com.test.filter1 spark.com.test.filter1.param.name1=foo spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional):
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional):
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional):
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional):
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional):
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional):
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance tuning section in the Spark Streaming programming guide for more details.

\n
\n
receiver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional):
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional):
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional):
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional):
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
broadcast (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional):
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional):
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional):
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional):
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional):
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional):
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
kryo (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option can enforce strictly that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to use unsafe based Kryo serializer. Can be substantially faster by using Unsafe Based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional):
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional):
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional):
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data, however that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer, and allow old objects to be collected. To turn off this periodic reset set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
memory (permissive dict, optional):
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional):
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional):
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional):
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional):
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction, spark.storage.memoryFraction, and spark.storage.unrollFraction.

\n
\n
\n
\n
storage (permissive dict, optional):
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional):
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional):
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional):
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional):
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional):
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional):
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional):
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional):
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional):
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional):
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional):
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional):
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
rpc (permissive dict, optional):
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional):
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional):
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional):
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional):
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional):
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: How long the connection waits for an ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses such as GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional):
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum amount of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional):
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional):
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional):
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional):
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional):
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional):
\n

Scheduling: Capacity for event queue in Spark listener bus, must be greater than 0. Consider increasing value (e.g. 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
blacklist (permissive dict, optional):
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors are marked as blacklisted for a given stage, before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional):
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional):
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional):
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional):
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional):
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional):
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional):
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If --num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional):
\n

Dynamic Allocation: By default, the dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors w.r.t. full parallelism. Defaults to 1.0 to give maximum parallelism. 0.5 will divide the target number of executors by 2. The target number of executors computed by the dynamicAllocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
r (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional):
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing sparkR shell in client modes for driver. Ignored in cluster modes. It is the same as environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for sparkR shell while spark.r.driver.command is used for running R script.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional):
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional):
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional):
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional):
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid stackOverflowError due to long lineage chains after lots of iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional):
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional):
\n

Deploy: The recovery mode setting used to recover submitted Spark jobs in cluster mode when they fail and are relaunched. This is only applicable for cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cluster_id (dagster.StringSource):
\n

Name of the job flow (cluster) on which to execute.

\n
\n
region_name (dagster.StringSource):
\n

The AWS region that the cluster is in.

\n
\n
action_on_failure (String, optional):
\n

The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html

\n

Default Value: \u2018CANCEL_AND_WAIT\u2019

\n
\n
staging_bucket (dagster.StringSource):
\n

S3 bucket to use for passing files between the plan process and EMR process.

\n
\n
staging_prefix (dagster.StringSource, optional):
\n

S3 key prefix inside the staging_bucket to use for files passed between the plan process and EMR process

\n

Default Value: \u2018emr_staging\u2019

\n
\n
wait_for_logs (Bool, optional):
\n

If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.

\n

Default Value: False

\n
\n
local_job_package_path (dagster.StringSource, optional):
\n

Absolute path to the package that contains the job definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the job. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_job_package option, referenced on s3 via the s3_job_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
local_pipeline_package_path (dagster.StringSource, optional):
\n

(legacy) Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.

\n
\n
deploy_local_job_package (Bool, optional):
\n

If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

\n

Default Value: False

\n
\n
deploy_local_pipeline_package (Bool, optional):
\n

(legacy) If set, before every step run, the launcher will zip up all the code in local_pipeline_package_path, upload it to s3, and pass it to spark-submit\u2019s --py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_pipeline_package_path should not also be set.

\n

Default Value: False

\n
\n
s3_job_package_path (dagster.StringSource, optional):
\n

If set, this path will be passed to the \u2013py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_job_package should not be set to True.

\n
\n
s3_pipeline_package_path (dagster.StringSource, optional):
\n

If set, this path will be passed to the \u2013py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.

\n
\n
\n
    \n
  • spark_config:

  • \n
  • cluster_id: Name of the job flow (cluster) on which to execute.

  • \n
  • region_name: The AWS region that the cluster is in.

  • \n
  • action_on_failure: The EMR action to take when the cluster step fails: https://docs.aws.amazon.com/emr/latest/APIReference/API_StepConfig.html

  • \n
  • staging_bucket: S3 bucket to use for passing files between the plan process and EMR process.

  • \n
  • staging_prefix: S3 key prefix inside the staging_bucket to use for files passed between the plan process and EMR process

  • \n
  • wait_for_logs: If set, the system will wait for EMR logs to appear on S3. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime.

  • \n
  • local_job_package_path: Absolute path to the package that contains the job definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the job. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_job_package option, referenced on s3 via the s3_job_package_path option, or installed on the cluster via bootstrap actions.

  • \n
  • local_pipeline_package_path: (legacy) Absolute path to the package that contains the pipeline definition(s) whose steps will execute remotely on EMR. This is a path on the local filesystem of the process executing the pipeline. The expectation is that this package will also be available on the python path of the launched process running the Spark step on EMR, either deployed on step launch via the deploy_local_pipeline_package option, referenced on s3 via the s3_pipeline_package_path option, or installed on the cluster via bootstrap actions.

  • \n
  • deploy_local_job_package: If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s \u2013py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

  • \n
  • deploy_local_pipeline_package: (legacy) If set, before every step run, the launcher will zip up all the code in local_job_package_path, upload it to s3, and pass it to spark-submit\u2019s \u2013py-files option. This gives the remote process access to up-to-date user code. If not set, the assumption is that some other mechanism is used for distributing code to the EMR cluster. If this option is set to True, s3_job_package_path should not also be set.

  • \n
  • s3_job_package_path: If set, this path will be passed to the \u2013py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_job_package should not be set to True.

  • \n
  • s3_pipeline_package_path: If set, this path will be passed to the \u2013py-files option of spark-submit. This should usually be a path to a zip file. If this option is set, deploy_local_pipeline_package should not be set to True.

  • \n
\n
\n\n
\n
\nclass dagster_aws.emr.EmrJobRunner(region, check_cluster_every=30, aws_access_key_id=None, aws_secret_access_key=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster_aws.emr.EmrError[source]\u00b6
\n
\n\n
\n
\ndagster_aws.emr.EmrClusterState = <enum 'EmrClusterState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\ndagster_aws.emr.EmrStepState = <enum 'EmrStepState'>[source]\u00b6
\n

An enumeration.

\n
\n\n
\n
\n

CloudWatch\u00b6

\n
\n
\ndagster_aws.cloudwatch.cloudwatch_logger LoggerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
log_level (String, optional):
\n

Default Value: \u2018INFO\u2019

\n
\n
name (String, optional):
\n

Default Value: \u2018dagster\u2019

\n
\n
log_group_name (String):
\n

The name of the log group

\n
\n
log_stream_name (String):
\n

The name of the log stream

\n
\n
aws_region (dagster.StringSource, optional):
\n

Specifies a custom region for the S3 session. Default is chosen through the ordinary boto3 credential chain.

\n
\n
aws_secret_access_key (dagster.StringSource, optional):
\n

\n
aws_access_key_id (dagster.StringSource, optional):
\n

\n
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\n

SecretsManager\u00b6

\n

Resources which surface SecretsManager secrets for use in Dagster resources and jobs.

\n
\n
\ndagster_aws.secretsmanager.SecretsManagerResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
\n

Resource that gives access to AWS SecretsManager.

\n

The underlying SecretsManager session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is a SecretsManager client, an instance of botocore.client.SecretsManager.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import SecretsManagerResource\n\n@op\ndef example_secretsmanager_op(secretsmanager: SecretsManagerResource):\n    return secretsmanager.get_client().get_secret_value(\n        SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n    )\n\n@job\ndef example_job():\n    example_secretsmanager_op()\n\ndefs = Definitions(\n    jobs=[example_job],\n    resources={\n        'secretsmanager': SecretsManagerResource(\n            region_name='us-west-1'\n        )\n    }\n)\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.SecretsManagerSecretsResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
secrets (List[dagster.StringSource], optional):
\n

An array of AWS Secrets Manager secret ARNs to fetch.

\n

Default Value: []

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional):
\n

AWS Secrets Manager secrets with this tag will be fetched and made available.

\n
\n
\n

Resource that provides a dict which maps selected SecretsManager secrets to\ntheir string values. Also optionally sets chosen secrets as environment variables.

\n

Example

\n
import os\nfrom dagster import build_op_context, job, op, ResourceParam\nfrom dagster_aws.secretsmanager import SecretsManagerSecretsResource\n\n@op\ndef example_secretsmanager_secrets_op(secrets: SecretsManagerSecretsResource):\n    return secrets.fetch_secrets().get("my-secret-name")\n\n@op\ndef example_secretsmanager_secrets_op_2(secrets: SecretsManagerSecretsResource):\n    with secrets.secrets_in_environment():\n        return os.getenv("my-other-secret-name")\n\n@job\ndef example_job():\n    example_secretsmanager_secrets_op()\n    example_secretsmanager_secrets_op_2()\n\ndefs = Definitions(\n    jobs=[example_job],\n    resources={\n        'secrets': SecretsManagerSecretsResource(\n            region_name='us-west-1',\n            secrets_tag="dagster",\n            add_to_environment=True,\n        )\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource, or it will not be initialized\nfor the execution of their compute functions.

\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_aws.s3.ConfigurablePickledObjectS3IOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_resource (Union[Any, None], optional):
\n

\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use S3PickleIOManager instead.\n \n

\n

Renamed to S3PickleIOManager. See S3PickleIOManager for documentation.

\n
\n\n
\n
\ndagster_aws.s3.s3_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
\n

Resource that gives access to S3.

\n

The underlying S3 session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is an S3 client, an instance of botocore.client.S3.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.s3 import s3_resource\n\n@op(required_resource_keys={'s3'})\ndef example_s3_op(context):\n    return context.resources.s3.list_objects_v2(\n        Bucket='my-bucket',\n        Prefix='some-key'\n    )\n\n@job(resource_defs={'s3': s3_resource})\ndef example_job():\n    example_s3_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            's3': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  s3:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the S3 session. Default is chosen\n      # through the ordinary boto credential chain.\n      use_unsigned_session: false\n      # Optional[bool]: Specifies whether to use an unsigned S3 session. Default: False\n      endpoint_url: "http://localhost"\n      # Optional[str]: Specifies a custom endpoint for the S3 session. Default is None.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for S3 session. Default is default\n      # profile as specified in ~/.aws/credentials file\n      use_ssl: true\n      # Optional[bool]: Whether or not to use SSL. By default, SSL is used.\n      verify: None\n      # Optional[str]: Whether or not to verify SSL certificates. By default SSL certificates are verified.\n      # You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore."\n      aws_access_key_id: None\n      # Optional[str]: The access key to use when creating the client.\n      aws_secret_access_key: None\n      # Optional[str]: The secret key to use when creating the client.\n      aws_session_token: None\n      # Optional[str]:  The session token to use when creating the client.\n
\n
\n
\n\n
\n
\ndagster_aws.s3.s3_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
s3_resource (Union[Any, None], optional):
\n

\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using S3 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for S3 and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": s3_pickle_io_manager.configured(\n            {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n        ),\n        "s3": s3_resource,\n    },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_aws.s3 import s3_pickle_io_manager, s3_resource\n\n@job(\n    resource_defs={\n        "io_manager": s3_pickle_io_manager.configured(\n            {"s3_bucket": "my-cool-bucket", "s3_prefix": "my-cool-prefix"}\n        ),\n        "s3": s3_resource,\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\ndagster_aws.s3.s3_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
use_unsigned_session (dagster.BoolSource, optional):
\n

Specifies whether to use an unsigned S3 session.

\n

Default Value: False

\n
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the S3 session.

\n
\n
endpoint_url (Union[dagster.StringSource, None], optional):
\n

Specifies a custom endpoint for the S3 session.

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide.

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session.

\n
\n
use_ssl (dagster.BoolSource, optional):
\n

Whether or not to use SSL. By default, SSL is used.

\n

Default Value: True

\n
\n
verify (Union[dagster.StringSource, None], optional):
\n

Whether or not to verify SSL certificates. By default SSL certificates are verified. You can also specify this argument if you want to use a different CA cert bundle than the one used by botocore.

\n
\n
aws_access_key_id (Union[dagster.StringSource, None], optional):
\n

AWS access key ID to use when creating the boto3 session.

\n
\n
aws_secret_access_key (Union[dagster.StringSource, None], optional):
\n

AWS secret access key to use when creating the boto3 session.

\n
\n
aws_session_token (Union[dagster.StringSource, None], optional):
\n

AWS session token to use when creating the boto3 session.

\n
\n
s3_bucket (dagster.StringSource):
\n

S3 bucket to use for the file manager.

\n
\n
s3_prefix (dagster.StringSource, optional):
\n

Prefix to use for the S3 bucket for this file manager.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to S3.

\n

Implements the FileManager API.

\n
\n\n
\n
\ndagster_aws.redshift.redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n

This resource enables connecting to a Redshift cluster and issuing queries against that\ncluster.

\n

Example

\n
from dagster import build_op_context, op\nfrom dagster_aws.redshift import redshift_resource\n\n@op(required_resource_keys={'redshift'})\ndef example_redshift_op(context):\n    return context.resources.redshift.execute_query('SELECT 1', fetch_results=True)\n\nredshift_configured = redshift_resource.configured({\n    'host': 'my-redshift-cluster.us-east-1.redshift.amazonaws.com',\n    'port': 5439,\n    'user': 'dagster',\n    'password': 'dagster',\n    'database': 'dev',\n})\ncontext = build_op_context(resources={'redshift': redshift_configured})\nassert example_redshift_op(context) == [(1,)]\n
\n
\n
\n\n
\n
\ndagster_aws.redshift.fake_redshift_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Redshift host

\n
\n
port (dagster.IntSource, optional):
\n

Redshift port

\n

Default Value: 5439

\n
\n
user (Union[dagster.StringSource, None], optional):
\n

Username for Redshift connection

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

Password for Redshift connection

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

Whether to autocommit queries

\n
\n
connect_timeout (dagster.IntSource, optional):
\n

Timeout for connection to Redshift cluster. Defaults to 5 seconds.

\n

Default Value: 5

\n
\n
sslmode (dagster.StringSource, optional):
\n

SSL mode to use. See the Redshift documentation for reference: https://docs.aws.amazon.com/redshift/latest/mgmt/connecting-ssl-support.html

\n

Default Value: \u2018require\u2019

\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.secretsmanager_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
\n

Resource that gives access to AWS SecretsManager.

\n

The underlying SecretsManager session is created by calling\nboto3.session.Session(profile_name).\nThe returned resource object is a SecretsManager client, an instance of botocore.client.SecretsManager.

\n

Example

\n
from dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_resource\n\n@op(required_resource_keys={'secretsmanager'})\ndef example_secretsmanager_op(context):\n    return context.resources.secretsmanager.get_secret_value(\n        SecretId='arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf'\n    )\n\n@job(resource_defs={'secretsmanager': secretsmanager_resource})\ndef example_job():\n    example_secretsmanager_op()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secretsmanager': {\n                'config': {\n                    'region_name': 'us-west-1',\n                }\n            }\n        }\n    }\n)\n
\n
\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n
\n
\n
\n\n
\n
\ndagster_aws.secretsmanager.secretsmanager_secrets_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
region_name (Union[dagster.StringSource, None], optional):
\n

Specifies a custom region for the Boto3 session

\n
\n
max_attempts (dagster.IntSource, optional):
\n

This provides Boto3\u2019s retry handler with a value of maximum retry attempts, where the initial call counts toward the max_attempts value that you provide

\n

Default Value: 5

\n
\n
profile_name (Union[dagster.StringSource, None], optional):
\n

Specifies a profile to connect that session

\n
\n
secrets (List[dagster.StringSource], optional):
\n

An array of AWS Secrets Manager secret ARNs to fetch.

\n

Default Value: []

\n
\n
secrets_tag (Union[dagster.StringSource, None], optional):
\n

AWS Secrets Manager secrets with this tag will be fetched and made available.

\n
\n
add_to_environment (Bool, optional):
\n

Whether to add the secrets to the environment. Defaults to False.

\n

Default Value: False

\n
\n
\n

Resource that provides a dict which maps selected SecretsManager secrets to\ntheir string values. Also optionally sets chosen secrets as environment variables.

\n

Example

\n
import os\nfrom dagster import build_op_context, job, op\nfrom dagster_aws.secretsmanager import secretsmanager_secrets_resource\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op(context):\n    return context.resources.secrets.get("my-secret-name")\n\n@op(required_resource_keys={'secrets'})\ndef example_secretsmanager_secrets_op_2(context):\n    return os.getenv("my-other-secret-name")\n\n@job(resource_defs={'secrets': secretsmanager_secrets_resource})\ndef example_job():\n    example_secretsmanager_secrets_op()\n    example_secretsmanager_secrets_op_2()\n\nexample_job.execute_in_process(\n    run_config={\n        'resources': {\n            'secrets': {\n                'config': {\n                    'region_name': 'us-west-1',\n                    'secrets_tag': 'dagster',\n                    'add_to_environment': True,\n                }\n            }\n        }\n    }\n)\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may configure this resource as follows:

\n
resources:\n  secretsmanager:\n    config:\n      region_name: "us-west-1"\n      # Optional[str]: Specifies a custom region for the SecretsManager session. Default is chosen\n      # through the ordinary boto credential chain.\n      profile_name: "dev"\n      # Optional[str]: Specifies a custom profile for SecretsManager session. Default is default\n      # profile as specified in ~/.aws/credentials file\n      secrets: ["arn:aws:secretsmanager:region:aws_account_id:secret:appauthexample-AbCdEf"]\n      # Optional[List[str]]: Specifies a list of secret ARNs to pull from SecretsManager.\n      secrets_tag: "dagster"\n      # Optional[str]: Specifies a tag, all secrets which have the tag set will be pulled\n      # from SecretsManager.\n      add_to_environment: true\n      # Optional[bool]: Whether to set the selected secrets as environment variables. Defaults\n      # to false.\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-aws", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-airflow/", "title": "Airflow (dagster-airflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "N", "next"], ["sections/api/apidocs/libraries/dagster-airflow", "Airflow (dagster-airflow)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-aws.rst.txt", "title": "AWS (dagster-aws)", "toc": "\n"}, "dagster-azure": {"alabaster_version": "0.7.13", "body": "
\n

Azure (dagster-azure)\u00b6

\n

Utilities for using Azure Storage Accounts with Dagster. This is mostly aimed at Azure Data Lake\nStorage Gen 2 (ADLS2) but also contains some utilities for Azure Blob Storage.

\n
\n

Resources\u00b6

\n
\n
\ndagster_azure.adls2.ADLS2Resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource):
\n

The storage account name.

\n
\n
credential (selector):
\n
\nConfig Schema:
\n
sas (strict dict):
\n
\nConfig Schema:
\n
token (dagster.StringSource):
\n

\n
\n
\n
key (strict dict):
\n
\nConfig Schema:
\n
key (dagster.StringSource):
\n

\n
\n
\n
default_azure_credential (strict dict):
\n
\nConfig Schema:
\n
kwargs (dict):
\n

\n
\n
\n
\n
\n
\n

Resource containing clients to access Azure Data Lake Storage Gen2.

\n

Contains a client for both the Data Lake and Blob APIs, to work around the limitations\nof each.

\n
\n\n
\n
\ndagster_azure.adls2.FakeADLS2Resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_name (dagster.StringSource):
\n

\n
storage_account (Union[dagster.StringSource, None], optional):
\n

\n
\n

Stateful mock of an ADLS2Resource for testing.

\n

Wraps a mock.MagicMock. Containers are implemented using an in-memory dict.

\n
\n\n
\n
\nclass dagster_azure.blob.AzureBlobComputeLogManager(storage_account, container, secret_key=None, local_dir=None, inst_data=None, prefix='dagster', upload_interval=None, default_azure_credential=None)[source]\u00b6
\n

Logs op compute function stdout and stderr to Azure Blob Storage.

\n

This is also compatible with Azure Data Lake Storage.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_azure.blob.compute_log_manager\n  class: AzureBlobComputeLogManager\n  config:\n    storage_account: my-storage-account\n    container: my-container\n    credential: sas-token-or-secret-key\n    default_azure_credential:\n      exclude_environment_credential: true\n    prefix: "dagster-test-"\n    local_dir: "/tmp/cool"\n    upload_interval: 30\n
\n
\n
\n
Parameters:
\n
    \n
  • storage_account (str) \u2013 The storage account name to which to log.

  • \n
  • container (str) \u2013 The container (or ADLS2 filesystem) to which to log.

  • \n
  • secret_key (Optional[str]) \u2013 Secret key for the storage account. SAS tokens are not\nsupported because we need a secret key to generate a SAS token for a download URL.

  • \n
  • default_azure_credential (Optional[dict]) \u2013 Use and configure DefaultAzureCredential.\nCannot be used with sas token or secret key config.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster._seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • upload_interval (Optional[int]) \u2013 Interval in seconds to upload partial log files to blob storage. By default, will only upload when the capture is complete.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when newed up from config.

  • \n
\n
\n
\n
\n\n
\n
\n

I/O Manager\u00b6

\n
\n
\ndagster_azure.adls2.ADLS2PickleIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2 (Union[Any, None], optional):
\n

\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name.

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

ADLS Gen2 file system prefix to write to.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return df[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": ADLS2PickleIOManager(\n            adls2_file_system="my-cool-fs",\n            adls2_prefix="my-cool-prefix"\n        ),\n        "adls2": adls2_resource,\n    },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource\n\n@job(\n    resource_defs={\n        "io_manager": ADLS2PickleIOManager(\n            adls2_file_system="my-cool-fs",\n            adls2_prefix="my-cool-prefix"\n        ),\n        "adls2": adls2_resource,\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\n

File Manager (Experimental)\u00b6

\n
\n
\ndagster_azure.adls2.adls2_file_manager ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource):
\n

The storage account name.

\n
\n
credential (selector):
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource):
\n

SAS token for the account.

\n
\n
key (dagster.StringSource):
\n

Shared Access Key for the account.

\n
\n
DefaultAzureCredential (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to ADLS2.

\n

Implements the FileManager API.

\n
\n\n
\n
\nclass dagster_azure.adls2.ADLS2FileHandle(account, file_system, key)[source]\u00b6
\n

A reference to a file on ADLS2.

\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_azure.adls2.ConfigurablePickledObjectADLS2IOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2 (Union[Any, None], optional):
\n

\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name.

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

ADLS Gen2 file system prefix to write to.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use ADLS2PickleIOManager instead.\n \n

\n

Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation.

\n
\n\n
\n
\ndagster_azure.adls2.adls2_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
storage_account (dagster.StringSource):
\n

The storage account name.

\n
\n
credential (selector):
\n

The credentials with which to authenticate.

\n
\nConfig Schema:
\n
sas (dagster.StringSource):
\n

SAS token for the account.

\n
\n
key (dagster.StringSource):
\n

Shared Access Key for the account.

\n
\n
DefaultAzureCredential (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Resource that gives ops access to Azure Data Lake Storage Gen2.

\n

The underlying client is a DataLakeServiceClient.

\n

Attach this resource definition to a JobDefinition in order to make it\navailable to your ops.

\n

Example

\n
from dagster import job, op\nfrom dagster_azure.adls2 import adls2_resource\n\n@op(required_resource_keys={'adls2'})\ndef example_adls2_op(context):\n    return list(context.resources.adls2.adls2_client.list_file_systems())\n\n@job(resource_defs={"adls2": adls2_resource})\ndef my_job():\n    example_adls2_op()\n
\n
\n

Note that your ops must also declare that they require this resource with\nrequired_resource_keys, or it will not be initialized for the execution of their compute\nfunctions.

\n

You may pass credentials to this resource using either a SAS token, a key or by passing the\nDefaultAzureCredential object.

\n
resources:\n  adls2:\n    config:\n      storage_account: my_storage_account\n      # str: The storage account name.\n      credential:\n        sas: my_sas_token\n        # str: the SAS token for the account.\n        key:\n          env: AZURE_DATA_LAKE_STORAGE_KEY\n        # str: The shared access key for the account.\n        DefaultAzureCredential: {}\n        # dict: The keyword arguments used for DefaultAzureCredential\n        # or leave the object empty for no arguments\n        DefaultAzureCredential:\n            exclude_environment_credential: true\n
\n
\n
\n\n
\n
\ndagster_azure.adls2.adls2_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
adls2 (Union[Any, None], optional):
\n

\n
adls2_file_system (dagster.StringSource):
\n

ADLS Gen2 file system name.

\n
\n
adls2_prefix (dagster.StringSource, optional):
\n

ADLS Gen2 file system prefix to write to.

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using Azure Data Lake Storage Gen2 for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for ADLS and the backing\ncontainer.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at \u201c<base_dir>/<asset_key>\u201d. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of \u201c/my/base/path\u201d, an asset with key\nAssetKey([\u201cone\u201d, \u201ctwo\u201d, \u201cthree\u201d]) would be stored in a file called \u201cthree\u201d in a directory\nwith path \u201c/my/base/path/one/two/\u201d.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return df[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": adls2_pickle_io_manager.configured(\n            {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n        ),\n        "adls2": adls2_resource,\n    },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource\n\n@job(\n    resource_defs={\n        "io_manager": adls2_pickle_io_manager.configured(\n            {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"}\n        ),\n        "adls2": adls2_resource,\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-azure", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-aws/", "title": "AWS (dagster-aws)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "N", "next"], ["sections/api/apidocs/libraries/dagster-aws", "AWS (dagster-aws)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-azure.rst.txt", "title": "Azure (dagster-azure)", "toc": "\n"}, "dagster-celery": {"alabaster_version": "0.7.13", "body": "
\n

Celery (dagster-celery)\u00b6

\n
\n

Quickstart\u00b6

\n

To get a local rabbitmq broker started and available via the default\npyamqp://guest@localhost:5672, in the dagster/python_modules/libraries/dagster-celery/\ndirectory run:

\n
docker-compose up\n
\n
\n

To run a celery worker:

\n
celery -A dagster_celery.app worker -l info\n
\n
\n

To start multiple workers in the background, run:

\n
celery multi start w2 -A dagster_celery.app -l info\n
\n
\n

To execute a job using the celery-backed executor, you\u2019ll need to set the job\u2019s executor_def to\nthe celery_executor.

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef my_job():\n    pass\n
\n
\n
\n

Monitoring your Celery tasks\u00b6

\n

We advise using Flower (https://celery.readthedocs.io/en/latest/userguide/monitoring.html#flower-real-time-celery-web-monitor):

\n
celery -A dagster_celery.app flower\n
\n
\n
\n
\n

Customizing the Celery broker, backend, and other app configuration\u00b6

\n

By default this will use amqp://guest:**@localhost:5672// as the Celery broker URL and\nrpc:// as the results backend. In production, you will want to change these values. Pending the\nintroduction of a dagster_celery CLI, that would entail writing a Python module my_module as\nfollows:

\n
from celery import Celery\n\nfrom dagster_celery.tasks import create_task\n\napp = Celery('dagster', broker_url='some://custom@value', ...)\n\nexecute_plan = create_task(app)\n\nif __name__ == '__main__':\n    app.worker_main()\n
\n
\n

You can then run the celery worker using:

\n
celery -A my_module worker --loglevel=info\n
\n
\n

This customization mechanism is used to implement dagster_celery_k8s and dagster_celery_docker, which delegate the execution of steps to ephemeral kubernetes pods and docker containers, respectively.

\n
\n
\n

Celery best practices\u00b6

\n

Celery is a rich and full-featured system. We\u2019ve found the following resources helpful:

\n\n
\n
\n
\n

API\u00b6

\n
\n
\ndagster_celery.celery_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Celery-based executor.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery import celery_executor\n\n@job(executor_def=celery_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n
\n\n
\n
\n

CLI\u00b6

\n

The dagster-celery CLI lets you start, monitor, and terminate workers.

\n
\n

dagster-celery worker start\u00b6

\n

Start a dagster celery worker.

\n
dagster-celery worker start [OPTIONS] [ADDITIONAL_ARGS]...\n
\n
\n

Options

\n
\n
\n-n, --name <name>\u00b6
\n

The name of the worker. Defaults to a unique name prefixed with \u201cdagster-\u201d and ending with the hostname.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the worker. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use.

\n
\n\n
\n
\n-q, --queue <queue>\u00b6
\n

Names of the queues on which this worker should listen for tasks. Provide multiple -q arguments to specify multiple queues. Note that each celery worker may listen on no more than four queues.

\n
\n\n
\n
\n-d, --background\u00b6
\n

Set this flag to run the worker in the background.

\n
\n\n
\n
\n-i, --includes <includes>\u00b6
\n

Python modules the worker should import. Provide multiple -i arguments to specify multiple modules.

\n
\n\n
\n
\n-l, --loglevel <loglevel>\u00b6
\n

Log level for the worker.

\n
\n\n
\n
\n-A, --app <app>\u00b6
\n
\n\n

Arguments

\n
\n
\nADDITIONAL_ARGS\u00b6
\n

Optional argument(s)

\n
\n\n
\n
\n

dagster-celery worker list\u00b6

\n

List running dagster-celery workers. Note that we use the broker to contact the workers.

\n
dagster-celery worker list [OPTIONS]\n
\n
\n

Options

\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to find your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n
\n
\n

dagster-celery worker terminate\u00b6

\n

Shut down dagster-celery workers. Note that we use the broker to send signals to the workers to terminate \u2013 if the broker is not running, this command is a no-op. Provide the argument NAME to terminate a specific worker by name.

\n
dagster-celery worker terminate [OPTIONS] [NAME]\n
\n
\n

Options

\n
\n
\n-a, --all\u00b6
\n

Set this flag to terminate all running workers.

\n
\n\n
\n
\n-y, --config-yaml <config_yaml>\u00b6
\n

Specify the path to a config YAML file with options for the workers you are trying to manage. This is the same config block that you provide to dagster_celery.celery_executor when configuring a job for execution with Celery, with, e.g., the URL of the broker to use. Without this config file, you will not be able to terminate your workers (since the CLI won\u2019t know how to reach the broker).

\n
\n\n

Arguments

\n
\n
\nNAME\u00b6
\n

Optional argument

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-azure/", "title": "Azure (dagster-azure)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-azure", "Azure (dagster-azure)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery.rst.txt", "title": "Celery (dagster-celery)", "toc": "\n"}, "dagster-celery-docker": {"alabaster_version": "0.7.13", "body": "
\n

Orchestration on Celery + Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_docker.celery_docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
docker (strict dict):
\n

The configuration for interacting with docker in the celery worker.

\n
\nConfig Schema:
\n
image (dagster.StringSource, optional):
\n

The docker image to be used for step execution.

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variables names to forward from the celery worker in to the docker container

\n
\n
network (String, optional):
\n

Name of the network this container will be connected to at creation time

\n
\n
container_kwargs (permissive dict, optional):
\n

Additional keyword args for the docker container

\n
\n
\n
\n
\n

Celery-based executor which launches tasks in docker containers.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute jobs\nwith variations on these settings.

\n

To use the celery_docker_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery_docker.executor import celery_docker_executor\n\n@job(executor_def=celery_docker_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    docker:\n      image: 'my_repo.com/image_name:latest'\n      registry:\n        url: 'my_repo.com'\n        username: 'my_user'\n        password: {env: 'DOCKER_PASSWORD'}\n      env_vars: ["DAGSTER_HOME"] # environment vars to pass from celery worker to docker\n      container_kwargs: # keyword args to be passed to the container. example:\n        volumes: ['/home/user1/:/mnt/vol2','/var/www:/mnt/vol1']\n\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_docker_executor is used all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_docker.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-docker", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery/", "title": "Celery (dagster-celery)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery", "Celery (dagster-celery)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-docker.rst.txt", "title": "Orchestration on Celery + Docker", "toc": "\n"}, "dagster-celery-k8s": {"alabaster_version": "0.7.13", "body": "
\n

Orchestration on Celery + Kubernetes\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_celery_k8s.CeleryK8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See: https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
instance_config_map (dagster.StringSource):
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional):
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. Will be mounted and supplied as an environment variable to the Job Pod. Secret must contain the key "postgresql-password" which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional):
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional):
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
run_k8s_config (strict dict, optional):
\n

Raw Kubernetes configuration for launched runs.

\n
\nConfig Schema:
\n
container_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_namespace (dagster.StringSource, optional):
\n

Default Value: \u2018default\u2019

\n
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

In contrast to the K8sRunLauncher, which launches dagster runs as single K8s\nJobs, this run launcher is intended for use in concert with\ndagster_celery_k8s.celery_k8s_job_executor().

\n

With this run launcher, execution is delegated to:

\n
\n
    \n
  1. A run worker Kubernetes Job, which traverses the dagster run execution plan and\nsubmits steps to Celery queues for execution;

  2. \n
  3. The step executions which are submitted to Celery queues are picked up by Celery workers,\nand each step execution spawns a step execution Kubernetes Job. See the implementation\ndefined in dagster_celery_k8s.executor.create_k8s_job_task().

  4. \n
\n
\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_celery_k8s.launcher\n  class: CeleryK8sRunLauncher\n  config:\n    instance_config_map: "dagster-k8s-instance-config-map"\n    dagster_home: "/some/path"\n    postgres_password_secret: "dagster-k8s-pg-password"\n    broker: "some_celery_broker_url"\n    backend: "some_celery_backend_url"\n
\n
\n
\n\n
\n
\ndagster_celery_k8s.celery_k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
broker (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery broker. Default: \u2018pyamqp://guest@{os.getenv(\u2018DAGSTER_CELERY_BROKER_HOST\u2019,\u2019localhost\u2019)}//\u2019.

\n
\n
backend (Union[dagster.StringSource, None], optional):
\n

The URL of the Celery results backend. Default: \u2018rpc://\u2019.

\n

Default Value: \u2018rpc://\u2019

\n
\n
include (List[String], optional):
\n

List of modules every worker should import

\n
\n
config_source (Union[permissive dict, None], optional):
\n

Additional settings for the Celery app.

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See:https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See:https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher within a k8s cluster. If\nTrue, we assume the launcher is running within the target cluster and load config\nusing kubernetes.config.load_incluster_config. Otherwise, we will use the k8s config\nspecified in kubeconfig_file (using kubernetes.config.load_kube_config) or fall\nback to the default kubeconfig. Default: True.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

Path to a kubeconfig file to use, if not using default kubeconfig.

\n
\n
job_namespace (dagster.StringSource, optional):
\n

The namespace into which to launch new jobs. Note that any other Kubernetes resources the Job requires (such as the service account) must be present in this namespace. Default: "default"

\n
\n
repo_location_name (dagster.StringSource, optional):
\n

The repository location name to use for execution.

\n

Default Value: \u2018<<in_process>>\u2019

\n
\n
job_wait_timeout (Float, optional):
\n

Wait this many seconds for a job to complete before marking the run as failed. Defaults to 86400.0 seconds.

\n

Default Value: 86400.0

\n
\n
\n

Celery-based executor which launches tasks as Kubernetes Jobs.

\n

The Celery executor exposes config settings for the underlying Celery app under\nthe config_source key. This config corresponds to the \u201cnew lowercase settings\u201d introduced\nin Celery version 4.0 and the object constructed from config will be passed to the\ncelery.Celery constructor as its config_source argument.\n(See https://docs.celeryq.dev/en/stable/userguide/configuration.html for details.)

\n

The executor also exposes the broker, backend, and include arguments to the\ncelery.Celery constructor.

\n

In the most common case, you may want to modify the broker and backend (e.g., to use\nRedis instead of RabbitMQ). We expect that config_source will be less frequently\nmodified, but that when op executions are especially fast or slow, or when there are\ndifferent requirements around idempotence or retry, it may make sense to execute dagster jobs\nwith variations on these settings.

\n

To use the celery_k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_celery_k8s.executor import celery_k8s_job_executor\n\n\n@job(executor_def=celery_k8s_job_executor)\ndef celery_enabled_job():\n    pass\n
\n
\n

Then you can configure the executor as follows:

\n
execution:\n  config:\n    job_image: 'my_repo.com/image_name:latest'\n    job_namespace: 'some-namespace'\n    broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker\n    backend: 'rpc://' # Optional[str]: The URL of the Celery results backend\n    include: ['my_module'] # Optional[List[str]]: Modules every worker should import\n    config_source: # Dict[str, Any]: Any additional parameters to pass to the\n        #...       # Celery workers. This dict will be passed as the `config_source`\n        #...       # argument of celery.Celery().\n
\n
\n

Note that the YAML you provide here must align with the configuration with which the Celery\nworkers on which you hope to run were started. If, for example, you point the executor at a\ndifferent broker than the one your workers are listening to, the workers will never be able to\npick up tasks for execution.

\n

In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery\ncommands must be invoked with the -A dagster_celery_k8s.app argument.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-celery-k8s", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-census/", "title": "Census (dagster-census)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-docker/", "title": "Orchestration on Celery + Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-census", "Census (dagster-census)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-docker", "Orchestration on Celery + Docker", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-celery-k8s.rst.txt", "title": "Orchestration on Celery + Kubernetes", "toc": "\n"}, "dagster-census": {"alabaster_version": "0.7.13", "body": "
\n

Census (dagster-census)\u00b6

\n

This library provides an integration with Census.

\n
\n
\ndagster_census.census_trigger_sync_op OpDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
sync_id (Int):
\n

Id of the parent sync.

\n
\n
force_full_sync (Bool, optional):
\n

If this trigger request should be a Full Sync. Note that some sync configurations such as Append do not support full syncs.

\n

Default Value: False

\n
\n
poll_interval (Float, optional):
\n

The time (in seconds) to wait between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time to wait before this operation is timed out. By default, this will never time out.

\n

Default Value: None

\n
\n
yield_materializations (Bool, optional):
\n

If True, materializations corresponding to the results of the Census sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[String], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018census\u2019]

\n
\n
\n

Executes a Census sync for a given sync_id and polls until that sync completes, raising\nan error if it is unsuccessful.

\n

It outputs a CensusOutput which contains the details of the Census\nsync after it successfully completes.

\n

It requires the use of the census_resource, which allows it to\ncommunicate with the Census API.

\n

Examples:

\n
from dagster import job\nfrom dagster_census import census_resource, census_sync_op\n\nmy_census_resource = census_resource.configured(\n    {\n        "api_key": {"env": "CENSUS_API_KEY"},\n    }\n)\n\nsync_foobar = census_sync_op.configured({"sync_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"census": my_census_resource})\ndef my_simple_census_job():\n    sync_foobar()\n
\n
\n
\n\n
\n
\ndagster_census.census_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

Census API Key.

\n
\n
request_max_retries (Int, optional):
\n

The maximum number of times requests to the Census API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Census REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

Examples:

\n
from dagster import job\nfrom dagster_census import census_resource\n\nmy_census_resource = census_resource.configured(\n    {\n        "api_key": {"env": "CENSUS_API_KEY"},\n    }\n)\n\n@job(resource_defs={"census":my_census_resource})\ndef my_census_job():\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_census.CensusResource(api_key, request_max_retries=3, request_retry_delay=0.25, log=<Logger dagster.builtin (DEBUG)>)[source]\u00b6
\n

This class exposes methods on top of the Census REST API.

\n
\n\n
\n
\nclass dagster_census.CensusOutput(sync_run, source, destination)[source]\u00b6
\n

Contains recorded information about the state of a Census sync after a sync completes.

\n
\n
\nsync_run\u00b6
\n

The details of the specific sync run.

\n
\n
Type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\nsource\u00b6
\n

Information about the source for the Census sync.

\n
\n
Type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n
\ndestination\u00b6
\n

Information about the destination for the Census sync.

\n
\n
Type:
\n

Dict[str, Any]

\n
\n
\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-census", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-celery-k8s/", "title": "Orchestration on Celery + Kubernetes"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "N", "next"], ["sections/api/apidocs/libraries/dagster-celery-k8s", "Orchestration on Celery + Kubernetes", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-census.rst.txt", "title": "Census (dagster-census)", "toc": "\n"}, "dagster-dask": {"alabaster_version": "0.7.13", "body": "
\n

Dask (dagster-dask)\u00b6

\n

See also the Dask deployment guide.

\n
\n
\ndagster_dask.dask_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
cluster (selector):
\n
\nConfig Schema:
\n
existing (strict dict):
\n

Connect to an existing scheduler.

\n
\nConfig Schema:
\n
address (dagster.StringSource):
\n

\n
\n
\n
local (permissive dict, optional):
\n

Local cluster configuration.

\n
\n
yarn (permissive dict, optional):
\n

YARN cluster configuration.

\n
\n
ssh (permissive dict, optional):
\n

SSH cluster configuration.

\n
\n
pbs (permissive dict, optional):
\n

PBS cluster configuration.

\n
\n
moab (permissive dict, optional):
\n

Moab cluster configuration.

\n
\n
sge (permissive dict, optional):
\n

SGE cluster configuration.

\n
\n
lsf (permissive dict, optional):
\n

LSF cluster configuration.

\n
\n
slurm (permissive dict, optional):
\n

SLURM cluster configuration.

\n
\n
oar (permissive dict, optional):
\n

OAR cluster configuration.

\n
\n
kube (permissive dict, optional):
\n

Kubernetes cluster configuration.

\n
\n
\n
\n
\n

Dask-based executor.

\n

The \u2018cluster\u2019 can be one of the following:\n(\u2018existing\u2019, \u2018local\u2019, \u2018yarn\u2019, \u2018ssh\u2019, \u2018pbs\u2019, \u2018moab\u2019, \u2018sge\u2019, \u2018lsf\u2019, \u2018slurm\u2019, \u2018oar\u2019, \u2018kube\u2019).

\n

If the Dask executor is used without providing executor-specific config, a local Dask cluster\nwill be created (as when calling dask.distributed.Client()\nwith dask.distributed.LocalCluster()).

\n

The Dask executor optionally takes the following config:

\n
cluster:\n    {\n        local?: # takes distributed.LocalCluster parameters\n            {\n                timeout?: 5,  # Timeout duration for initial connection to the scheduler\n                n_workers?: 4  # Number of workers to start\n                threads_per_worker?: 1 # Number of threads per each worker\n            }\n    }\n
\n
\n

To use the dask_executor, set it as the executor_def when defining a job:

\n
from dagster import job\nfrom dagster_dask import dask_executor\n\n@job(executor_def=dask_executor)\ndef dask_enabled_job():\n    pass\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dask", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-census/", "title": "Census (dagster-census)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "N", "next"], ["sections/api/apidocs/libraries/dagster-census", "Census (dagster-census)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dask.rst.txt", "title": "Dask (dagster-dask)", "toc": "\n"}, "dagster-databricks": {"alabaster_version": "0.7.13", "body": "
\n

Databricks (dagster-databricks)\u00b6

\n

The dagster_databricks package provides these main pieces of functionality:

\n\n

Note that, for the databricks_pyspark_step_launcher, either S3 or Azure Data Lake Storage config\nmust be specified for ops to succeed, and the credentials for this storage must also be\nstored as a Databricks Secret and stored in the resource config so that the Databricks cluster can\naccess storage.

\n
\n
\n

APIs\u00b6

\n
\n

Resources\u00b6

\n
\n
\ndagster_databricks.DatabricksClientResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Databricks host, e.g. https://uksouth.azuredatabricks.com

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Databricks access token

\n
\n
oauth_credentials (Union[strict dict, None], optional):
\n

Databricks OAuth credentials for using a service principal. See https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0

\n
\n
workspace_id (Union[dagster.StringSource, None], optional):
\n

DEPRECATED: The Databricks workspace ID, as described in https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids. This is no longer used and will be removed in 0.21.

\n
\n
\n

Resource which provides a Python client for interacting with Databricks within an\nop or asset.

\n
\n\n
\n
\nclass dagster_databricks.DatabricksClient(host, token=None, oauth_client_id=None, oauth_client_secret=None, workspace_id=None)[source]\u00b6
\n

A thin wrapper over the Databricks REST API.

\n
\n
\nproperty api_client\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 0.21.0. Use workspace_client property instead..\n \n

\n

Retrieve a reference to the underlying Databricks API client. For more information,\nsee the Databricks Python API.\nNote: accessing this property will throw an exception if oauth credentials are used to initialize the\nDatabricksClient, because oauth credentials are not supported by the legacy Databricks API client.\nExamples:

\n
from dagster import op\nfrom databricks_cli.jobs.api import JobsApi\nfrom databricks_cli.runs.api import RunsApi\nfrom databricks.sdk import WorkspaceClient\n\n@op(required_resource_keys={"databricks_client"})\ndef op1(context):\n    # Initialize the Databricks Jobs API\n    jobs_client = JobsApi(context.resources.databricks_client.api_client)\n    runs_client = RunsApi(context.resources.databricks_client.api_client)\n    client = context.resources.databricks_client.api_client\n\n    # Example 1: Run a Databricks job with some parameters.\n    jobs_client.run_now(...)\n    client.jobs.run_now(...)\n\n    # Example 2: Trigger a one-time run of a Databricks workload.\n    runs_client.submit_run(...)\n    client.jobs.submit(...)\n\n    # Example 3: Get an existing run.\n    runs_client.get_run(...)\n    client.jobs.get_run(...)\n\n    # Example 4: Cancel a run.\n    runs_client.cancel_run(...)\n    client.jobs.cancel_run(...)\n
\n
\n
\n
Returns:
\n

The authenticated Databricks API client.

\n
\n
Return type:
\n

ApiClient

\n
\n
\n
\n\n
\n
\nproperty client\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 0.21.0. Use workspace_client property instead..\n \n

\n

accessing this property will throw an exception if oauth\ncredentials are used to initialize the DatabricksClient, because oauth credentials are not supported by the\nlegacy Databricks API client.

\n
\n
Type:
\n

Retrieve the legacy Databricks API client. Note

\n
\n
\n
\n\n
\n
\nproperty workspace_client\u00b6
\n

Retrieve a reference to the underlying Databricks Workspace client. For more information,\nsee the Databricks SDK for Python.

\n

Examples:

\n
from dagster import op\nfrom databricks.sdk import WorkspaceClient\n\n@op(required_resource_keys={"databricks_client"})\ndef op1(context):\n    # Initialize the Databricks Jobs API\n    client = context.resources.databricks_client.api_client\n\n    # Example 1: Run a Databricks job with some parameters.\n    client.jobs.run_now(...)\n\n    # Example 2: Trigger a one-time run of a Databricks workload.\n    client.jobs.submit(...)\n\n    # Example 3: Get an existing run.\n    client.jobs.get_run(...)\n\n    # Example 4: Cancel a run.\n    client.jobs.cancel_run(...)\n
\n
\n
\n
Returns:
\n

The authenticated Databricks SDK Workspace Client.

\n
\n
Return type:
\n

WorkspaceClient

\n
\n
\n
\n\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_databricks.create_databricks_run_now_op(databricks_job_id, databricks_job_configuration=None, poll_interval_seconds=10, max_wait_time_seconds=86400, name=None, databricks_resource_key='databricks')[source]\u00b6
\n

Creates an op that launches an existing databricks job.

\n

As config, the op accepts a blob of the form described in Databricks\u2019 Job API:\nhttps://docs.databricks.com/api-explorer/workspace/jobs/runnow. The only required field is\njob_id, which is the ID of the job to be executed. Additional fields can be used to specify\noverride parameters for the Databricks Job.

\n
\n
Parameters:
\n
    \n
  • databricks_job_id (int) \u2013 The ID of the Databricks Job to be executed.

  • \n
  • databricks_job_configuration (dict) \u2013 Configuration for triggering a new job run of a\nDatabricks Job. See https://docs.databricks.com/api-explorer/workspace/jobs/runnow\nfor the full configuration.

  • \n
  • poll_interval_seconds (float) \u2013 How often to poll the Databricks API to check whether the\nDatabricks job has finished running.

  • \n
  • max_wait_time_seconds (float) \u2013 How long to wait for the Databricks job to finish running\nbefore raising an error.

  • \n
  • name (Optional[str]) \u2013 The name of the op. If not provided, the name will be\n_databricks_run_now_op.

  • \n
  • databricks_resource_key (str) \u2013 The name of the resource key used by this op. If not\nprovided, the resource key will be \u201cdatabricks\u201d.

  • \n
\n
\n
Returns:
\n

An op definition to run the Databricks Job.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n

Example

\n
from dagster import job\nfrom dagster_databricks import create_databricks_run_now_op, DatabricksClientResource\n\nDATABRICKS_JOB_ID = 1234\n\n\nrun_now_op = create_databricks_run_now_op(\n    databricks_job_id=DATABRICKS_JOB_ID,\n    databricks_job_configuration={\n        "python_params": [\n            "--input",\n            "schema.db.input_table",\n            "--output",\n            "schema.db.output_table",\n        ],\n    },\n)\n\n@job(\n    resource_defs={\n        "databricks": DatabricksClientResource(\n            host=EnvVar("DATABRICKS_HOST"),\n            token=EnvVar("DATABRICKS_TOKEN")\n        )\n    }\n)\ndef do_stuff():\n    run_now_op()\n
\n
\n
\n\n
\n
\ndagster_databricks.create_databricks_submit_run_op(databricks_job_configuration, poll_interval_seconds=10, max_wait_time_seconds=86400, name=None, databricks_resource_key='databricks')[source]\u00b6
\n

Creates an op that submits a one-time run of a set of tasks on Databricks.

\n

As config, the op accepts a blob of the form described in Databricks\u2019 Job API:\nhttps://docs.databricks.com/api-explorer/workspace/jobs/submit.

\n
\n
Parameters:
\n
    \n
  • databricks_job_configuration (dict) \u2013 Configuration for submitting a one-time run of a set\nof tasks on Databricks. See https://docs.databricks.com/api-explorer/workspace/jobs/submit\nfor the full configuration.

  • \n
  • poll_interval_seconds (float) \u2013 How often to poll the Databricks API to check whether the\nDatabricks job has finished running.

  • \n
  • max_wait_time_seconds (float) \u2013 How long to wait for the Databricks job to finish running\nbefore raising an error.

  • \n
  • name (Optional[str]) \u2013 The name of the op. If not provided, the name will be\n_databricks_submit_run_op.

  • \n
  • databricks_resource_key (str) \u2013 The name of the resource key used by this op. If not\nprovided, the resource key will be \u201cdatabricks\u201d.

  • \n
\n
\n
Returns:
\n

An op definition to submit a one-time run of a set of tasks on Databricks.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n

Example

\n
from dagster import job\nfrom dagster_databricks import create_databricks_submit_run_op, DatabricksClientResource\n\n\nsubmit_run_op = create_databricks_submit_run_op(\n    databricks_job_configuration={\n        "new_cluster": {\n            "spark_version": '2.1.0-db3-scala2.11',\n            "num_workers": 2\n        },\n        "notebook_task": {\n            "notebook_path": "/Users/dagster@example.com/PrepareData",\n        },\n    }\n)\n\n@job(\n    resource_defs={\n        "databricks": DatabricksClientResource(\n            host=EnvVar("DATABRICKS_HOST"),\n            token=EnvVar("DATABRICKS_TOKEN")\n        )\n    }\n)\ndef do_stuff():\n    submit_run_op()\n
\n
\n
\n\n
\n
\n

Step Launcher\u00b6

\n
\n
\ndagster_databricks.databricks_pyspark_step_launcher ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
run_config (strict dict):
\n

Databricks job run configuration

\n
\nConfig Schema:
\n
cluster (selector):
\n
\nConfig Schema:
\n
new (strict dict):
\n
\nConfig Schema:
\n
size (selector):
\n
\nConfig Schema:
\n
autoscale (strict dict):
\n
\nConfig Schema:
\n
min_workers (Int):
\n

The minimum number of workers to which the cluster can scale down when underutilized. It is also the initial number of workers the cluster will have after creation.

\n
\n
max_workers (Int):
\n

The maximum number of workers to which the cluster can scale up when overloaded. max_workers must be strictly greater than min_workers.

\n
\n
\n
\n
num_workers (Int):
\n

If num_workers, number of worker nodes that this cluster should have. A cluster has one Spark Driver and num_workers Executors for a total of num_workers + 1 Spark nodes.

\n
\n
\n
\n
spark_version (String):
\n

The Spark version of the cluster. A list of available Spark versions can be retrieved by using the Runtime versions API call. This field is required.

\n
\n
spark_conf (permissive dict, optional):
\n

An object containing a set of optional, user-specified Spark configuration key-value pairs. You can also pass in a string of extra JVM options to the driver and the executors via spark.driver.extraJavaOptions and spark.executor.extraJavaOptions respectively. Example Spark confs: {\u201cspark.speculation\u201d: true, \u201cspark.streaming.ui.retainedBatches\u201d: 5} or {\u201cspark.driver.extraJavaOptions\u201d: \u201c-verbose:gc -XX:+PrintGCDetails\u201d}

\n
\n
nodes (selector):
\n

The nodes used in the cluster. Either the node types or an instance pool can be specified.

\n
\nConfig Schema:
\n
node_types (strict dict):
\n
\nConfig Schema:
\n
node_type_id (String):
\n

This field encodes, through a single value, the resources available to each of the Spark nodes in this cluster. For example, the Spark nodes can be provisioned and optimized for memory or compute intensive workloads. A list of available node types can be retrieved by using the List node types API call. This field is required.

\n
\n
driver_node_type_id (String, optional):
\n

The node type of the Spark driver. This field is optional; if unset, the driver node type is set as the same value as node_type_id defined above.

\n
\n
\n
\n
instance_pool_id (String, optional):
\n

The optional ID of the instance pool to which the cluster belongs. Refer to the Instance Pools API for details.

\n
\n
\n
\n
aws_attributes (permissive dict, optional):
\n

Attributes related to clusters running on Amazon Web Services. If not specified at cluster creation, a set of default values is used. See aws_attributes at https://docs.databricks.com/dev-tools/api/latest/clusters.html.

\n
\nConfig Schema:
\n
first_on_demand (Int, optional):
\n

The first first_on_demand nodes of the cluster will be placed on on-demand instances. If this value is greater than 0, the cluster driver node will be placed on an on-demand instance. If this value is greater than or equal to the current cluster size, all nodes will be placed on on-demand instances. If this value is less than the current cluster size, first_on_demand nodes will be placed on on-demand instances and the remainder will be placed on availability instances. This value does not affect cluster size and cannot be mutated over the lifetime of a cluster.

\n
\n
availability (AWSAvailability, optional):
\n

Availability type used for all subsequent nodes past the first_on_demand ones. Note: If first_on_demand is zero, this availability type will be used for the entire cluster.

\n
\n
zone_id (String, optional):
\n

Identifier for the availability zone/datacenter in which the cluster resides.

\n
\n
instance_profile_arn (String, optional):
\n

Nodes for this cluster will only be placed on AWS instances with this instance profile.

\n
\n
spot_bid_price_percent (Int, optional):
\n

The max price for AWS spot instances, as a percentage of the corresponding instance type\u2019s on-demand price.

\n
\n
ebs_volume_type (EBSVolumeType, optional):
\n

The type of EBS volumes that will be launched with this cluster.

\n
\n
ebs_volume_count (Int, optional):
\n

The number of volumes launched for each instance. You can choose up to 10 volumes.

\n
\n
ebs_volume_size (Int, optional):
\n

The size of each EBS volume (in GiB) launched for each instance.

\n
\n
ebs_volume_iops (Int, optional):
\n

The number of IOPS per EBS gp3 volume.

\n
\n
ebs_volume_throughput (Int, optional):
\n

The throughput per EBS gp3 volume, in MiB per second.

\n
\n
\n
\n
ssh_public_keys (List[String], optional):
\n

SSH public key contents that will be added to each Spark node in this cluster. The corresponding private keys can be used to login with the user name ubuntu on port 2200. Up to 10 keys can be specified.

\n
\n
custom_tags (List[strict dict], optional):
\n

Additional tags for cluster resources. Databricks tags all cluster resources (e.g., AWS instances and EBS volumes) with these tags in addition to default_tags. Note: - Tags are not supported on legacy node types such as compute-optimized and memory-optimized - Databricks allows at most 45 custom tags. More restrictions may apply if using Azure Databricks; refer to the official docs for further details.

\n
\n
cluster_log_conf (selector, optional):
\n

Recommended! The configuration for delivering Spark logs to a long-term storage destination. Only one destination can be specified for one cluster. If the conf is given, the logs will be delivered to the destination every 5 mins. The destination of driver logs is <destination>/<cluster-id>/driver, while the destination of executor logs is <destination>/<cluster-id>/executor.

\n
\nConfig Schema:
\n
dbfs (strict dict):
\n

DBFS storage information

\n
\nConfig Schema:
\n
destination (String):
\n

DBFS destination, e.g. dbfs:/my/path

\n
\n
\n
\n
s3 (strict dict):
\n

S3 storage information

\n
\nConfig Schema:
\n
destination (String):
\n

S3 destination, e.g. s3://my-bucket/some-prefix. You must configure the cluster with an instance profile and the instance profile must have write access to the destination. You cannot use AWS keys.

\n
\n
region (String):
\n

S3 region, e.g. us-west-2. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
endpoint (String):
\n

S3 endpoint, e.g. https://s3-us-west-2.amazonaws.com. Either region or endpoint must be set. If both are set, endpoint is used.

\n
\n
enable_encryption (Bool, optional):
\n

(Optional) Enable server side encryption, false by default.

\n
\n
encryption_type (String, optional):
\n

(Optional) The encryption type, it could be sse-s3 or sse-kms. It is used only when encryption is enabled and the default type is sse-s3.

\n
\n
kms_key (String, optional):
\n

(Optional) KMS key used if encryption is enabled and encryption type is set to sse-kms.

\n
\n
canned_acl (String, optional):
\n

(Optional) Set canned access control list, e.g. bucket-owner-full-control. If canned_acl is set, the cluster instance profile must have s3:PutObjectAcl permission on the destination bucket and prefix. The full list of possible canned ACLs can be found at https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl. By default only the object owner gets full control. If you are using cross account role for writing data, you may want to set bucket-owner-full-control to make bucket owner able to read the logs.

\n
\n
\n
\n
\n
\n
init_scripts (List[selector], optional):
\n

The configuration for storing init scripts. Any number of scripts can be specified. The scripts are executed sequentially in the order provided. If cluster_log_conf is specified, init script logs are sent to <destination>/<cluster-id>/init_scripts.

\n
\n
spark_env_vars (permissive dict, optional):
\n

An object containing a set of optional, user-specified environment variable key-value pairs. Key-value pair of the form (X,Y) are exported as is (i.e., export X=\u201dY\u201d) while launching the driver and workers. To specify an additional set of SPARK_DAEMON_JAVA_OPTS, we recommend appending them to $SPARK_DAEMON_JAVA_OPTS as shown in the example below. This ensures that all default Databricks managed environmental variables are included as well. Example Spark environment variables: {\u201cSPARK_WORKER_MEMORY\u201d: \u201c28000m\u201d, \u201cSPARK_LOCAL_DIRS\u201d: \u201c/local_disk0\u201d} or {\u201cSPARK_DAEMON_JAVA_OPTS\u201d: \u201c$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\u201d}

\n
\n
enable_elastic_disk (Bool, optional):
\n

Autoscaling Local Storage: when enabled, this cluster dynamically acquires additional disk space when its Spark workers are running low on disk space. This feature requires specific AWS permissions to function correctly - refer to https://docs.databricks.com/clusters/configure.html#autoscaling-local-storage for details.

\n
\n
policy_id (String, optional):
\n

The ID of the cluster policy used to create the cluster if applicable

\n
\n
\n
\n
existing (String):
\n

The ID of an existing cluster that will be used for all runs of this job. When running jobs on an existing cluster, you may need to manually restart the cluster if it stops responding. Databricks suggests running jobs on new clusters for greater reliability.

\n
\n
\n
\n
run_name (String, optional):
\n

An optional name for the run. The default value is Untitled

\n
\n
libraries (List[selector], optional):
\n

An optional list of libraries to be installed on the cluster that will execute the job. By default dagster, dagster-databricks and dagster-pyspark libraries will be included.

\n
\n
install_default_libraries (Bool, optional):
\n

By default, Dagster installs a version of dagster, dagster-databricks, and dagster-pyspark matching the locally-installed versions of those libraries. If you would like to disable this behavior, this value can be set to False.

\n
\n
timeout_seconds (Int, optional):
\n

An optional timeout applied to each run of this job. The default behavior is to have no timeout.

\n
\n
idempotency_token (String, optional):
\n

An optional token that can be used to guarantee the idempotency of job run requests. If an active run with the provided token already exists, the request will not create a new run, but will return the ID of the existing run instead. If you specify the idempotency token, upon failure you can retry until the request succeeds. Databricks guarantees that exactly one run will be launched with that idempotency token. This token should have at most 64 characters.

\n
\n
\n
\n
permissions (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
job_permissions (strict dict, optional):
\n

job permission spec; ref: https://docs.databricks.com/security/access-control/jobs-acl.html#job-permissions

\n
\nConfig Schema:
\n
NO_PERMISSIONS (List[selector], optional):
\n

\n
CAN_VIEW (List[selector], optional):
\n

\n
CAN_MANAGE_RUN (List[selector], optional):
\n

\n
IS_OWNER (List[selector], optional):
\n

\n
CAN_MANAGE (List[selector], optional):
\n

\n
\n
\n
cluster_permissions (strict dict, optional):
\n

cluster permission spec; ref: https://docs.databricks.com/security/access-control/cluster-acl.html#cluster-level-permissions

\n
\nConfig Schema:
\n
NO_PERMISSIONS (List[selector], optional):
\n

\n
CAN_ATTACH_TO (List[selector], optional):
\n

\n
CAN_RESTART (List[selector], optional):
\n

\n
CAN_MANAGE (List[selector], optional):
\n

\n
\n
\n
\n
\n
databricks_host (dagster.StringSource):
\n

Databricks host, e.g. uksouth.azuredatabricks.com

\n
\n
databricks_token (Union[dagster.StringSource, None], optional):
\n

Databricks access token

\n

Default Value: None

\n
\n
oauth_credentials (Union[strict dict, None], optional):
\n

Oauth credentials for interacting with the Databricks REST API via a service principal. See https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0

\n

Default Value: None

\n
\n
env_variables (permissive dict, optional):
\n

Dictionary of arbitrary environment variables to be set on the databricks cluster.

\n
\n
secrets_to_env_variables (List[strict dict], optional):
\n

Databricks secrets to be exported as environment variables. Since runs will execute in the Databricks runtime environment, environment variables (such as those required for a StringSource config variable) will not be accessible to Dagster. These variables must be stored as Databricks secrets and specified here, which will ensure they are re-exported as environment variables accessible to Dagster upon execution.

\n
\n
storage (selector, optional):
\n

Databricks storage configuration for either S3 or ADLS2. If access credentials for your Databricks storage are stored in Databricks secrets, this config indicates the secret scope and the secret keys used to access either S3 or ADLS2.

\n
\nConfig Schema:
\n
s3 (strict dict):
\n

S3 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String):
\n

The Databricks secret scope containing the storage secrets.

\n
\n
access_key_key (String):
\n

The key of a Databricks secret containing the S3 access key ID.

\n
\n
secret_key_key (String):
\n

The key of a Databricks secret containing the S3 secret access key.

\n
\n
\n
\n
adls2 (strict dict):
\n

ADLS2 storage secret configuration

\n
\nConfig Schema:
\n
secret_scope (String):
\n

The Databricks secret scope containing the storage secrets.

\n
\n
storage_account_name (String):
\n

The name of the storage account used to access data.

\n
\n
storage_account_key_key (String):
\n

The key of a Databricks secret containing the storage account secret key.

\n
\n
\n
\n
\n
\n
local_pipeline_package_path (dagster.StringSource, optional):
\n

Absolute path to root python package containing your Dagster code. If you set this value to a directory lower than the root package, and use relative imports in your code (e.g. from .foo import bar), it\u2019s likely you\u2019ll encounter an import error on the remote step. Before every step run, the launcher will zip up the code in this local path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
local_dagster_job_package_path (dagster.StringSource, optional):
\n

Absolute path to root python package containing your Dagster code. If you set this value to a directory lower than the root package, and use relative imports in your code (e.g. from .foo import bar), it\u2019s likely you\u2019ll encounter an import error on the remote step. Before every step run, the launcher will zip up the code in this local path, upload it to DBFS, and unzip it into the Python path of the remote Spark process. This gives the remote process access to up-to-date user code.

\n
\n
staging_prefix (dagster.StringSource, optional):
\n

Directory in DBFS to use for uploaded job code. Must be absolute.

\n

Default Value: \u2018/dagster_staging\u2019

\n
\n
wait_for_logs (Bool, optional):
\n

If set, and if the specified cluster is configured to export logs, the system will wait after job completion for the logs to appear in the configured location. Note that logs are copied every 5 minutes, so enabling this will add several minutes to the job runtime. NOTE: this integration will export stdout/stderr from the remote Databricks process automatically, so this option is not generally necessary.

\n

Default Value: False

\n
\n
max_completion_wait_time_seconds (dagster.IntSource, optional):
\n

If the Databricks job run takes more than this many seconds, then consider it failed and terminate the step.

\n

Default Value: 86400

\n
\n
poll_interval_sec (Float, optional):
\n

How frequently Dagster will poll Databricks to determine the state of the job.

\n

Default Value: 5.0

\n
\n
verbose_logs (Bool, optional):
\n

Determines whether to display debug logs emitted while job is being polled. It can be helpful for Dagster UI performance to set to False when running long-running or fan-out Databricks jobs, to avoid forcing the UI to fetch large amounts of debug logs.

\n

Default Value: True

\n
\n
add_dagster_env_variables (Bool, optional):
\n

Automatically add Dagster system environment variables. This option is only applicable when the code being executed is deployed on Dagster Cloud. It will be ignored when the environment variables provided by Dagster Cloud are not present.

\n

Default Value: True

\n
\n
\n

Resource for running ops as a Databricks Job.

\n

When this resource is used, the op will be executed in Databricks using the \u2018Run Submit\u2019\nAPI. Pipeline code will be zipped up and copied to a directory in DBFS along with the op\u2019s\nexecution context.

\n

Use the \u2018run_config\u2019 configuration to specify the details of the Databricks cluster used, and\nthe \u2018storage\u2019 key to configure persistent storage on that cluster. Storage is accessed by\nsetting the credentials in the Spark context, as documented here for S3 and here for ADLS.

\n
\n\n
\n
\n

Other\u00b6

\n
\n
\nclass dagster_databricks.DatabricksError[source]\u00b6
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_databricks.databricks_client ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
host (dagster.StringSource):
\n

Databricks host, e.g. https://uksouth.azuredatabricks.com

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Databricks access token

\n
\n
oauth_credentials (Union[strict dict, None], optional):
\n

Databricks OAuth credentials for using a service principal. See https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0

\n
\n
workspace_id (Union[dagster.StringSource, None], optional):
\n

DEPRECATED: The Databricks workspace ID, as described in https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids. This is no longer used and will be removed in 0.21.

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-databricks", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dask/", "title": "Dask (dagster-dask)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "N", "next"], ["sections/api/apidocs/libraries/dagster-dask", "Dask (dagster-dask)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-databricks.rst.txt", "title": "Databricks (dagster-databricks)", "toc": "\n"}, "dagster-datadog": {"alabaster_version": "0.7.13", "body": "
\n

Datadog (dagster-datadog)\u00b6

\n

This library provides an integration with Datadog, to support publishing metrics to Datadog from\nwithin Dagster ops.

\n

We use the Python datadogpy library. To use it, you\u2019ll\nfirst need to create a DataDog account and get both API and Application keys.

\n

The integration uses DogStatsD, so you\u2019ll need\nto ensure the datadog agent is running on the host you\u2019re sending metrics from.

\n
\n
\ndagster_datadog.DatadogResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/

\n
\n
app_key (dagster.StringSource):
\n

Datadog application key. See https://docs.datadoghq.com/account_management/api-app-keys/.

\n
\n
\n

This resource is a thin wrapper over the\ndogstatsd library.

\n

As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\nDataDog documentation for how to use this\nresource.

\n

Examples

\n
@op\ndef datadog_op(datadog_client: ResourceParam[DatadogClient]):\n    datadog_client.event('Man down!', 'This server needs assistance.')\n    datadog_client.gauge('users.online', 1001, tags=["protocol:http"])\n    datadog_client.increment('page.views')\n    datadog_client.decrement('page.views')\n    datadog_client.histogram('album.photo.count', 26, tags=["gender:female"])\n    datadog_client.distribution('album.photo.count', 26, tags=["color:blue"])\n    datadog_client.set('visitors.uniques', 999, tags=["browser:ie"])\n    datadog_client.service_check('svc.check_name', datadog_client.WARNING)\n    datadog_client.timing("query.response.time", 1234)\n\n    # Use timed decorator\n    @datadog_client.timed('run_fn')\n    def run_fn():\n        pass\n\n    run_fn()\n\n@job\ndef job_for_datadog_op() -> None:\n    datadog_op()\n\njob_for_datadog_op.execute_in_process(\n    resources={"datadog_client": DatadogResource(api_key="FOO", app_key="BAR")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_datadog.datadog_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

Datadog API key. See https://docs.datadoghq.com/account_management/api-app-keys/

\n
\n
app_key (dagster.StringSource):
\n

Datadog application key. See https://docs.datadoghq.com/account_management/api-app-keys/.

\n
\n
\n

This legacy resource is a thin wrapper over the\ndogstatsd library.

\n

Prefer using DatadogResource.

\n

As such, we directly mirror the public API methods of DogStatsd here; you can refer to the\nDataDog documentation for how to use this\nresource.

\n

Examples

\n
@op(required_resource_keys={'datadog'})\ndef datadog_op(context):\n    dd = context.resources.datadog\n\n    dd.event('Man down!', 'This server needs assistance.')\n    dd.gauge('users.online', 1001, tags=["protocol:http"])\n    dd.increment('page.views')\n    dd.decrement('page.views')\n    dd.histogram('album.photo.count', 26, tags=["gender:female"])\n    dd.distribution('album.photo.count', 26, tags=["color:blue"])\n    dd.set('visitors.uniques', 999, tags=["browser:ie"])\n    dd.service_check('svc.check_name', dd.WARNING)\n    dd.timing("query.response.time", 1234)\n\n    # Use timed decorator\n    @dd.timed('run_fn')\n    def run_fn():\n        pass\n\n    run_fn()\n\n@job(resource_defs={'datadog': datadog_resource})\ndef dd_job():\n    datadog_op()\n\nresult = dd_job.execute_in_process(\n    run_config={'resources': {'datadog': {'config': {'api_key': 'YOUR_KEY', 'app_key': 'YOUR_KEY'}}}}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-datadog", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-datahub/", "title": "Datahub (dagster-datahub)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-databricks/", "title": "Databricks (dagster-databricks)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-datahub", "Datahub (dagster-datahub)", "N", "next"], ["sections/api/apidocs/libraries/dagster-databricks", "Databricks (dagster-databricks)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-datadog.rst.txt", "title": "Datadog (dagster-datadog)", "toc": "\n"}, "dagster-datahub": {"alabaster_version": "0.7.13", "body": "
\n

Datahub (dagster-datahub)\u00b6

\n

This library provides an integration with Datahub, to support pushing metadata to Datahub from\nwithin Dagster ops.

\n
\n

\n
\n

We use the Datahub Python Library. To use it, you\u2019ll\nfirst need to start up a Datahub Instance. Datahub Quickstart Guide.

\n
\n

\n
\n
\n
\ndagster_datahub.DatahubRESTEmitterResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (dagster.StringSource):
\n

Datahub GMS Server

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Personal Access Token

\n
\n
connect_timeout_sec (Union[Float, None], optional):
\n

\n
read_timeout_sec (Union[Float, None], optional):
\n

\n
retry_status_codes (Union[List[dagster.IntSource], None], optional):
\n

\n
retry_methods (Union[List[dagster.StringSource], None], optional):
\n

\n
retry_max_times (Union[dagster.IntSource, None], optional):
\n

\n
extra_headers (Union[dict, None], optional):
\n

\n
ca_certificate_path (Union[dagster.StringSource, None], optional):
\n

\n
server_telemetry_id (Union[dagster.StringSource, None], optional):
\n

\n
disable_ssl_verification (dagster.BoolSource, optional):
\n

Default Value: False

\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n
\ndagster_datahub.DatahubKafkaEmitterResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (strict dict):
\n
\nConfig Schema:
\n
bootstrap (dagster.StringSource):
\n

Kafka Bootstrap Servers. Comma delimited

\n
\n
schema_registry_url (dagster.StringSource):
\n

Schema Registry Location.

\n
\n
schema_registry_config (dict, optional):
\n

Extra Schema Registry Config.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
topic (Union[dagster.StringSource, None], optional):
\n

\n
topic_routes (dict, optional):
\n
\nDefault Value:
{\n    "MetadataChangeEvent": "MetadataChangeEvent_v4",\n    "MetadataChangeProposal": "MetadataChangeProposal_v1"\n}\n
\n
\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_datahub.datahub_rest_emitter ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (dagster.StringSource):
\n

Datahub GMS Server

\n
\n
token (Union[dagster.StringSource, None], optional):
\n

Personal Access Token

\n
\n
connect_timeout_sec (Union[Float, None], optional):
\n

\n
read_timeout_sec (Union[Float, None], optional):
\n

\n
retry_status_codes (Union[List[dagster.IntSource], None], optional):
\n

\n
retry_methods (Union[List[dagster.StringSource], None], optional):
\n

\n
retry_max_times (Union[dagster.IntSource, None], optional):
\n

\n
extra_headers (Union[dict, None], optional):
\n

\n
ca_certificate_path (Union[dagster.StringSource, None], optional):
\n

\n
server_telemetry_id (Union[dagster.StringSource, None], optional):
\n

\n
disable_ssl_verification (dagster.BoolSource, optional):
\n

Default Value: False

\n
\n
\n
\n\n
\n
\ndagster_datahub.datahub_kafka_emitter ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connection (strict dict):
\n
\nConfig Schema:
\n
bootstrap (dagster.StringSource):
\n

Kafka Bootstrap Servers. Comma delimited

\n
\n
schema_registry_url (dagster.StringSource):
\n

Schema Registry Location.

\n
\n
schema_registry_config (dict, optional):
\n

Extra Schema Registry Config.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
topic (Union[dagster.StringSource, None], optional):
\n

\n
topic_routes (dict, optional):
\n
\nDefault Value:
{\n    "MetadataChangeEvent": "MetadataChangeEvent_v4",\n    "MetadataChangeProposal": "MetadataChangeProposal_v1"\n}\n
\n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-datahub", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-datadog/", "title": "Datadog (dagster-datadog)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "N", "next"], ["sections/api/apidocs/libraries/dagster-datadog", "Datadog (dagster-datadog)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-datahub.rst.txt", "title": "Datahub (dagster-datahub)", "toc": "\n"}, "dagster-dbt": {"alabaster_version": "0.7.13", "body": "
\n

dbt (dagster-dbt)\u00b6

\n

Dagster orchestrates dbt alongside other technologies, so you can combine dbt with Spark, Python,\netc. in a single workflow. Dagster\u2019s software-defined asset abstractions make it simple to define\ndata assets that depend on specific dbt models, or to define the computation required to compute\nthe sources that your dbt models depend on.

\n

Related documentation pages: dbt and\ndbt Cloud.

\n
\n

dagster-dbt\u00b6

\n
\n

dagster-dbt project scaffold\u00b6

\n

This command will initialize a new Dagster project and create directories and files that\nload assets from an existing dbt project.

\n
dagster-dbt project scaffold [OPTIONS]\n
\n
\n

Options

\n
\n
\n--project-name <project_name>\u00b6
\n

Required The name of the Dagster project to initialize for your dbt project.

\n
\n\n
\n
\n--dbt-project-dir <dbt_project_dir>\u00b6
\n

The path of your dbt project directory. This path must contain a dbt_project.yml file. By default, this command will assume that the current working directory contains a dbt project, but you can set a different directory by setting this option.

\n
\n\n
\n
\n
\n

dbt Core\u00b6

\n

Here, we provide interfaces to manage dbt projects invoked by the local dbt command line interface\n(dbt CLI).

\n
\n

Assets (dbt Core)\u00b6

\n
\n
\ndagster_dbt.load_assets_from_dbt_project(project_dir, profiles_dir=None, *, select=None, exclude=None, dagster_dbt_translator=None, io_manager_key=None, target_dir=None, key_prefix=None, source_key_prefix=None, op_name=None, runtime_metadata_fn=None, node_info_to_asset_key=<function default_asset_key_fn>, use_build_command=True, partitions_def=None, partition_key_to_vars_fn=None, node_info_to_group_fn=<function default_group_from_dbt_resource_props>, node_info_to_freshness_policy_fn=<function default_freshness_policy_fn>, node_info_to_auto_materialize_policy_fn=<function default_auto_materialize_policy_fn>, node_info_to_definition_metadata_fn=<function default_metadata_from_dbt_resource_props>, display_raw_sql=None, dbt_resource_key='dbt')[source]\u00b6
\n

Loads a set of dbt models from a dbt project into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run or dbt build command.

\n

When searching for more flexibility in defining the computations that materialize your\ndbt assets, we recommend that you use dbt_assets.

\n
\n
Parameters:
\n
    \n
  • project_dir (Optional[str]) \u2013 The directory containing the dbt project to load.

  • \n
  • profiles_dir (Optional[str]) \u2013 The profiles directory to use for loading the DBT project.\nDefaults to a directory called \u201cconfig\u201d inside the project_dir.

  • \n
  • target_dir (Optional[str]) \u2013 The target directory where dbt will place compiled artifacts.\nDefaults to \u201ctarget\u201d underneath the project_dir.

  • \n
  • select (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto include. Defaults to \u201cfqn:*\u201d.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto exclude. Defaults to \u201c\u201d.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 Allows customizing how to map\ndbt models, seeds, etc. to asset keys and asset metadata.

  • \n
  • key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all assets loaded\nfrom the dbt project. Does not apply to input assets. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=\u2026) instead.

  • \n
  • source_key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all input\nassets for the set of assets loaded from the dbt project. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=\u2026) instead.

  • \n
  • op_name (Optional[str]) \u2013 [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\nDeprecated: use the @dbt_assets decorator if you need to customize the op name.

  • \n
  • dbt_resource_key (Optional[str]) \u2013 [Deprecated] The resource key that the dbt resource will be specified at.\nDefaults to \u201cdbt\u201d. Deprecated: use the @dbt_assets decorator if you need to customize\nthe resource key.

  • \n
  • runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]) \u2013 [Deprecated]\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.\nDeprecated: use the @dbt_assets decorator if you need to customize runtime metadata.

  • \n
  • manifest_json (Optional[Mapping[str, Any]]) \u2013 [Deprecated] Use the manifest argument instead.

  • \n
  • selected_unique_ids (Optional[Set[str]]) \u2013 [Deprecated] The set of dbt unique_ids that you want to load\nas assets. Deprecated: use the select argument instead.

  • \n
  • node_info_to_asset_key (Mapping[str, Any] -> AssetKey) \u2013 [Deprecated] A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model. Deprecated: instead,\nprovide a custom DagsterDbtTranslator that overrides node_info_to_asset_key.

  • \n
  • use_build_command (bool) \u2013 Flag indicating if you want to use dbt build as the core computation\nfor this asset. Defaults to True. If set to False, then dbt run will be used, and\nseeds and snapshots won\u2019t be loaded as assets.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 [Deprecated] Defines the set of partition keys that\ncompose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\ndbt assets.

  • \n
  • partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]) \u2013 [Deprecated] A function to translate a given\npartition key (e.g. \u20182022-01-01\u2019) to a dictionary of vars to be passed into the dbt\ninvocation (e.g. {\u201crun_date\u201d: \u201c2022-01-01\u201d}). Deprecated: use the @dbt_assets decorator\nto define partitioned dbt assets.

  • \n
  • node_info_to_group_fn (Dict[str, Any] -> Optional[str]) \u2013 [Deprecated] A function that takes a\ndictionary of dbt node info and returns the group that this node should be assigned to.\nDeprecated: instead, configure dagster groups on a dbt resource\u2019s meta field or assign\ndbt groups.

  • \n
  • node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]) \u2013 [Deprecated] A function\nthat takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\nshould be applied to this node. By default, freshness policies will be created from\nconfig applied to dbt models, i.e.:\ndagster_freshness_policy={\u201cmaximum_lag_minutes\u201d: 60, \u201ccron_schedule\u201d: \u201c0 9 * * *\u201d}\nwill result in that model being assigned\nFreshnessPolicy(maximum_lag_minutes=60, cron_schedule=\u201c0 9 * * *\u201d). Deprecated:\ninstead, configure freshness policies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns an AutoMaterializePolicy\nthat should be applied to this node. By default, AutoMaterializePolicies will be created from\nconfig applied to dbt models, i.e.:\ndagster_auto_materialize_policy={\u201ctype\u201d: \u201clazy\u201d} will result in that model being assigned\nAutoMaterializePolicy.lazy(). Deprecated: instead, configure auto-materialize\npolicies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns a dictionary\nof metadata to be attached to the corresponding definition. This is added to the default\nmetadata assigned to the node, which consists of the node\u2019s schema (if present).\nDeprecated: instead, provide a custom DagsterDbtTranslator that overrides\nnode_info_to_metadata.

  • \n
  • display_raw_sql (Optional[bool]) \u2013 [Deprecated] A flag to indicate if the raw sql associated\nwith each model should be included in the asset description. For large projects, setting\nthis flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\ninstead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_dbt.load_assets_from_dbt_manifest(manifest=None, *, select=None, exclude=None, io_manager_key=None, dagster_dbt_translator=None, key_prefix=None, source_key_prefix=None, selected_unique_ids=None, display_raw_sql=None, dbt_resource_key='dbt', op_name=None, manifest_json=None, use_build_command=True, partitions_def=None, partition_key_to_vars_fn=None, runtime_metadata_fn=None, node_info_to_asset_key=<function default_asset_key_fn>, node_info_to_group_fn=<function default_group_from_dbt_resource_props>, node_info_to_freshness_policy_fn=<function default_freshness_policy_fn>, node_info_to_auto_materialize_policy_fn=<function default_auto_materialize_policy_fn>, node_info_to_definition_metadata_fn=<function default_metadata_from_dbt_resource_props>)[source]\u00b6
\n

Loads a set of dbt models, described in a manifest.json, into Dagster assets.

\n

Creates one Dagster asset for each dbt model. All assets will be re-materialized using a single\ndbt run or dbt build command.

\n

When searching for more flexibility in defining the computations that materialize your\ndbt assets, we recommend that you use dbt_assets.

\n
\n
Parameters:
\n
    \n
  • manifest (Optional[Mapping[str, Any]]) \u2013 The contents of a DBT manifest.json, which contains\na set of models to load into assets.

  • \n
  • select (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto include. Defaults to \u201cfqn:*\u201d.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto exclude. Defaults to \u201c\u201d.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 Allows customizing how to map\ndbt models, seeds, etc. to asset keys and asset metadata.

  • \n
  • key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all assets loaded\nfrom the dbt project. Does not apply to input assets. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(key_prefix=\u2026) instead.

  • \n
  • source_key_prefix (Optional[Union[str, List[str]]]) \u2013 [Deprecated] A key prefix to apply to all input\nassets for the set of assets loaded from the dbt project. Deprecated: use\ndagster_dbt_translator=KeyPrefixDagsterDbtTranslator(source_key_prefix=\u2026) instead.

  • \n
  • op_name (Optional[str]) \u2013 [Deprecated] Sets the name of the underlying Op that will generate the dbt assets.\nDeprecated: use the @dbt_assets decorator if you need to customize the op name.

  • \n
  • dbt_resource_key (Optional[str]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator if you need to customize your resource key.) [Deprecated] The resource key that the dbt resource will be specified at.\nDefaults to \u201cdbt\u201d. Deprecated: use the @dbt_assets decorator if you need to customize\nthe resource key.

  • \n
  • runtime_metadata_fn (Optional[Callable[[OpExecutionContext, Mapping[str, Any]], Mapping[str, Any]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator if you need to customize runtime metadata.) [Deprecated]\nA function that will be run after any of the assets are materialized and returns\nmetadata entries for the asset, to be displayed in the asset catalog for that run.\nDeprecated: use the @dbt_assets decorator if you need to customize runtime metadata.

  • \n
  • selected_unique_ids (Optional[Set[str]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the select parameter instead.) [Deprecated] The set of dbt unique_ids that you want to load\nas assets. Deprecated: use the select argument instead.

  • \n
  • node_info_to_asset_key (Mapping[str, Any] -> AssetKey) \u2013 [Deprecated] A function that takes a dictionary\nof dbt node info and returns the AssetKey that you want to represent that node. By\ndefault, the asset key will simply be the name of the dbt model.

  • \n
  • use_build_command (bool) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator if you need to customize the underlying dbt commands.) Flag indicating if you want to use dbt build as the core computation\nfor this asset. Defaults to True. If set to False, then dbt run will be used, and\nseeds and snapshots won\u2019t be loaded as assets.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator to define partitioned dbt assets.) [Deprecated] Defines the set of partition keys that\ncompose the dbt assets. Deprecated: use the @dbt_assets decorator to define partitioned\ndbt assets.

  • \n
  • partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 0.21. Use the @dbt_assets decorator to define partitioned dbt assets.) [Deprecated] A function to translate a given\npartition key (e.g. \u20182022-01-01\u2019) to a dictionary of vars to be passed into the dbt\ninvocation (e.g. {\u201crun_date\u201d: \u201c2022-01-01\u201d}). Deprecated: use the @dbt_assets decorator\nto define partitioned dbt assets.

  • \n
  • node_info_to_group_fn (Dict[str, Any] -> Optional[str]) \u2013 [Deprecated] A function that takes a\ndictionary of dbt node info and returns the group that this node should be assigned to.\nDeprecated: instead, configure dagster groups on a dbt resource\u2019s meta field or assign\ndbt groups.

  • \n
  • node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]) \u2013 [Deprecated] A function\nthat takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\nshould be applied to this node. By default, freshness policies will be created from\nconfig applied to dbt models, i.e.:\ndagster_freshness_policy={\u201cmaximum_lag_minutes\u201d: 60, \u201ccron_schedule\u201d: \u201c0 9 * * *\u201d}\nwill result in that model being assigned\nFreshnessPolicy(maximum_lag_minutes=60, cron_schedule=\u201c0 9 * * *\u201d). Deprecated:\ninstead, configure freshness policies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns an AutoMaterializePolicy\nthat should be applied to this node. By default, AutoMaterializePolicies will be created from\nconfig applied to dbt models, i.e.:\ndagster_auto_materialize_policy={\u201ctype\u201d: \u201clazy\u201d} will result in that model being assigned\nAutoMaterializePolicy.lazy(). Deprecated: instead, configure auto-materialize\npolicies on a dbt resource\u2019s meta field.

  • \n
  • node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]) \u2013 [Deprecated]\nA function that takes a dictionary of dbt node info and optionally returns a dictionary\nof metadata to be attached to the corresponding definition. This is added to the default\nmetadata assigned to the node, which consists of the node\u2019s schema (if present).\nDeprecated: instead, provide a custom DagsterDbtTranslator that overrides\nnode_info_to_metadata.

  • \n
  • display_raw_sql (Optional[bool]) \u2013 [Deprecated] A flag to indicate if the raw sql associated\nwith each model should be included in the asset description. For large projects, setting\nthis flag to False is advised to reduce the size of the resulting snapshot. Deprecated:\ninstead, provide a custom DagsterDbtTranslator that overrides node_info_to_description.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster_dbt.dbt_assets(*, manifest, select='fqn:*', exclude=None, io_manager_key=None, partitions_def=None, dagster_dbt_translator=<dagster_dbt.dagster_dbt_translator.DagsterDbtTranslator object>)[source]\u00b6
\n

Create a definition for how to compute a set of dbt resources, described by a manifest.json.\nWhen invoking dbt commands using DbtCliResource\u2019s\ncli() method, Dagster events are emitted by calling\nyield from on the event stream returned by stream().

\n
\n
Parameters:
\n
    \n
  • manifest (Union[Mapping[str, Any], str, Path]) \u2013 The contents of a manifest.json file\nor the path to a manifest.json file. A manifest.json contains a representation of a\ndbt project (models, tests, macros, etc). We use this representation to create\ncorresponding Dagster assets.

  • \n
  • select (str) \u2013 A dbt selection string for the models in a project that you want\nto include. Defaults to fqn:*.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string for the models in a project that you want\nto exclude. Defaults to \u201c\u201d.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key that will be set on each of the returned\nassets. When other ops are downstream of the loaded assets, the IOManager specified\nhere determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the dbt assets.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 Allows customizing how to map\ndbt models, seeds, etc. to asset keys and asset metadata.

  • \n
\n
\n
\n

Examples

\n

Running dbt build for a dbt project:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["build"], context=context).stream()\n
\n
\n

Running dbt commands with flags:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["build", "--full-refresh"], context=context).stream()\n
\n
\n

Running dbt commands with --vars:

\n
import json\nfrom pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_vars = {"key": "value"}\n\n    yield from dbt.cli(["build", "--vars", json.dumps(dbt_vars)], context=context).stream()\n
\n
\n

Retrieving dbt artifacts after running a dbt command:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_build_invocation = dbt.cli(["build"], context=context)\n\n    yield from dbt_build_invocation.stream()\n\n    run_results_json = dbt_build_invocation.get_artifact("run_results.json")\n
\n
\n

Running multiple dbt commands for a dbt project:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["run"], context=context).stream()\n    yield from dbt.cli(["test"], context=context).stream()\n
\n
\n

Customizing the Dagster asset metadata inferred from a dbt project using DagsterDbtTranslator:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    ...\n\n\n@dbt_assets(\n    manifest=Path("target", "manifest.json"),\n    dagster_dbt_translator=CustomDagsterDbtTranslator(),\n)\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["build"], context=context).stream()\n
\n
\n

Invoking another Dagster ResourceDefinition alongside dbt:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\nfrom dagster_slack import SlackResource\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, slack: SlackResource):\n    yield from dbt.cli(["build"], context=context).stream()\n\n    slack_client = slack.get_client()\n    slack_client.chat_postMessage(channel="#my-channel", text="dbt build succeeded!")\n
\n
\n

Defining and accessing Dagster Config alongside dbt:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext, Config\nfrom dagster_dbt import DagsterDbtTranslator, DbtCliResource, dbt_assets\n\n\nclass MyDbtConfig(Config):\n    full_refresh: bool\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource, config: MyDbtConfig):\n    dbt_build_args = ["build"]\n    if config.full_refresh:\n        dbt_build_args += ["--full-refresh"]\n\n    yield from dbt.cli(dbt_build_args, context=context).stream()\n
\n
\n

Defining Dagster PartitionsDefinition alongside dbt:

\n
import json\nfrom pathlib import Path\n\nfrom dagster import AssetExecutionContext, DailyPartitionsDefinition\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(\n    manifest=Path("target", "manifest.json"),\n    partitions_def=DailyPartitionsDefinition(start_date="2023-01-01")\n)\ndef partitionshop_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    time_window = context.asset_partitions_time_window_for_output(\n        list(context.selected_output_names)[0]\n    )\n\n    dbt_vars = {\n        "min_date": time_window.start.isoformat(),\n        "max_date": time_window.end.isoformat()\n    }\n    dbt_build_args = ["build", "--vars", json.dumps(dbt_vars)]\n\n    yield from dbt.cli(dbt_build_args, context=context).stream()\n
\n
\n
\n\n
\n
\nclass dagster_dbt.DagsterDbtTranslator[source]\u00b6
\n

Holds a set of methods that derive Dagster asset definition metadata given a representation\nof a dbt resource (models, tests, sources, etc).

\n

This class is exposed so that methods can be overridden to customize how Dagster asset metadata\nis derived.

\n
\n
\nclassmethod get_asset_key(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster asset key that represents that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom asset key for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

The Dagster asset key for the dbt resource.

\n
\n
Return type:
\n

AssetKey

\n
\n
\n

Examples

\n

Adding a prefix to the default asset key generated for each dbt resource:

\n
from typing import Any, Mapping\n\nfrom dagster import AssetKey\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n        return super().get_asset_key(dbt_resource_props).with_prefix("prefix")\n
\n
\n

Adding a prefix to the default asset key generated for each dbt resource, but only for dbt sources:

\n
from typing import Any, Mapping\n\nfrom dagster import AssetKey\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_asset_key(cls, dbt_resource_props: Mapping[str, Any]) -> AssetKey:\n        asset_key = super().get_asset_key(dbt_resource_props)\n\n        if dbt_resource_props["resource_type"] == "source":\n            asset_key = asset_key.with_prefix("my_prefix")\n\n        return asset_key\n
\n
\n
\n\n
\n
\nclassmethod get_auto_materialize_policy(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster dagster.AutoMaterializePolicy for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom auto-materialize policy for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A Dagster auto-materialize policy.

\n
\n
Return type:
\n

Optional[AutoMaterializePolicy]

\n
\n
\n

Examples

\n

Set a custom auto-materialize policy for all dbt resources:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import AutoMaterializePolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n        return AutoMaterializePolicy.eager()\n
\n
\n

Set a custom auto-materialize policy for dbt resources with a specific tag:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import AutoMaterializePolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_auto_materialize_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[AutoMaterializePolicy]:\n        auto_materialize_policy = None\n        if "my_custom_tag" in dbt_resource_props.get("tags", []):\n            auto_materialize_policy = AutoMaterializePolicy.eager()\n\n        return auto_materialize_policy\n
\n
\n
\n\n
\n
\nclassmethod get_description(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster description for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom description for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

The description for the dbt resource.

\n
\n
Return type:
\n

str

\n
\n
\n

Examples

\n
from typing import Any, Mapping\n\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_description(cls, dbt_resource_props: Mapping[str, Any]) -> str:\n        return "custom description"\n
\n
\n
\n\n
\n
\nclassmethod get_freshness_policy(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster dagster.FreshnessPolicy for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom freshness policy for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A Dagster freshness policy.

\n
\n
Return type:
\n

Optional[FreshnessPolicy]

\n
\n
\n

Examples

\n

Set a custom freshness policy for all dbt resources:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import FreshnessPolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n        return FreshnessPolicy(maximum_lag_minutes=60)\n
\n
\n

Set a custom freshness policy for dbt resources with a specific tag:

\n
from typing import Any, Mapping, Optional\n\nfrom dagster import FreshnessPolicy\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_freshness_policy(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[FreshnessPolicy]:\n        freshness_policy = None\n        if "my_custom_tag" in dbt_resource_props.get("tags", []):\n            freshness_policy = FreshnessPolicy(maximum_lag_minutes=60)\n\n        return freshness_policy\n
\n
\n
\n\n
\n
\nclassmethod get_group_name(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster group name for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide a custom group name for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A Dagster group name.

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n

Examples

\n
from typing import Any, Mapping, Optional\n\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_group_name(cls, dbt_resource_props: Mapping[str, Any]) -> Optional[str]:\n        return "custom_group_prefix" + dbt_resource_props.get("config", {}).get("group")\n
\n
\n
\n\n
\n
\nclassmethod get_metadata(dbt_resource_props)[source]\u00b6
\n

A function that takes a dictionary representing properties of a dbt resource, and\nreturns the Dagster metadata for that resource.

\n

Note that a dbt resource is unrelated to Dagster\u2019s resource concept, and simply represents\na model, seed, snapshot or source in a given dbt project. You can learn more about dbt\nresources and the properties available in this dictionary here:\nhttps://docs.getdbt.com/reference/artifacts/manifest-json#resource-details

\n

This method can be overridden to provide custom metadata for a dbt resource.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
Returns:
\n

A dictionary representing the Dagster metadata for the dbt resource.

\n
\n
Return type:
\n

Mapping[str, Any]

\n
\n
\n

Examples

\n
from typing import Any, Mapping\n\nfrom dagster_dbt import DagsterDbtTranslator\n\n\nclass CustomDagsterDbtTranslator(DagsterDbtTranslator):\n    @classmethod\n    def get_metadata(cls, dbt_resource_props: Mapping[str, Any]) -> Mapping[str, Any]:\n        return {"custom": "metadata"}\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtManifestAssetSelection(manifest, select='fqn:*', *, dagster_dbt_translator=None, exclude=None)[source]\u00b6
\n

Defines a selection of assets from a dbt manifest wrapper and a dbt selection string.

\n
\n
Parameters:
\n
    \n
  • manifest (Mapping[str, Any]) \u2013 The dbt manifest blob.

  • \n
  • select (str) \u2013 A dbt selection string to specify a set of dbt resources.

  • \n
  • exclude (Optional[str]) \u2013 A dbt selection string to exclude a set of dbt resources.

  • \n
\n
\n
\n

Examples

\n
import json\nfrom pathlib import Path\n\nfrom dagster_dbt import DbtManifestAssetSelection\n\nmanifest = json.loads(Path("path/to/manifest.json").read_text())\n\n# select the dbt assets that have the tag "foo".\nmy_selection = DbtManifestAssetSelection(manifest=manifest, select="tag:foo")\n
\n
\n
\n\n
\n
\ndagster_dbt.build_dbt_asset_selection(dbt_assets, dbt_select='fqn:*', dbt_exclude=None)[source]\u00b6
\n

Build an asset selection for a dbt selection string.

\n

See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\nmore information.

\n
\n
Parameters:
\n
    \n
  • dbt_select (str) \u2013 A dbt selection string to specify a set of dbt resources.

  • \n
  • dbt_exclude (Optional[str]) \u2013 A dbt selection string to exclude a set of dbt resources.

  • \n
\n
\n
Returns:
\n

An asset selection for the selected dbt nodes.

\n
\n
Return type:
\n

AssetSelection

\n
\n
\n

Examples

\n
from dagster_dbt import dbt_assets, build_dbt_asset_selection\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n# Select the dbt assets that have the tag "foo".\nfoo_selection = build_dbt_asset_selection([all_dbt_assets], dbt_select="tag:foo")\n\n# Select the dbt assets that have the tag "foo" and all Dagster assets downstream\n# of them (dbt-related or otherwise)\nfoo_and_downstream_selection = foo_selection.downstream()\n
\n
\n
\n\n
\n
\ndagster_dbt.build_schedule_from_dbt_selection(dbt_assets, job_name, cron_schedule, dbt_select='fqn:*', dbt_exclude=None, tags=None, config=None, execution_timezone=None)[source]\u00b6
\n

Build a schedule to materialize a specified set of dbt resources from a dbt selection string.

\n

See https://docs.getdbt.com/reference/node-selection/syntax#how-does-selection-work for\nmore information.

\n
\n
Parameters:
\n
    \n
  • job_name (str) \u2013 The name of the job to materialize the dbt resources.

  • \n
  • cron_schedule (str) \u2013 The cron schedule to define the schedule.

  • \n
  • dbt_select (str) \u2013 A dbt selection string to specify a set of dbt resources.

  • \n
  • dbt_exclude (Optional[str]) \u2013 A dbt selection string to exclude a set of dbt resources.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • config (Optional[RunConfig]) \u2013 The config that parameterizes the execution of this schedule.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
\n
\n
Returns:
\n

A definition to materialize the selected dbt resources on a cron schedule.

\n
\n
Return type:
\n

ScheduleDefinition

\n
\n
\n

Examples

\n
from dagster_dbt import dbt_assets, build_schedule_from_dbt_selection\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\ndaily_dbt_assets_schedule = build_schedule_from_dbt_selection(\n    [all_dbt_assets],\n    job_name="all_dbt_assets",\n    cron_schedule="0 0 * * *",\n    dbt_select="fqn:*",\n)\n
\n
\n
\n\n
\n
\ndagster_dbt.get_asset_key_for_model(dbt_assets, model_name)[source]\u00b6
\n

Return the corresponding Dagster asset key for a dbt model.

\n
\n
Parameters:
\n
    \n
  • dbt_assets (AssetsDefinition) \u2013 An AssetsDefinition object produced by\nload_assets_from_dbt_project, load_assets_from_dbt_manifest, or @dbt_assets.

  • \n
  • model_name (str) \u2013 The name of the dbt model.

  • \n
\n
\n
Returns:
\n

The corresponding Dagster asset key.

\n
\n
Return type:
\n

AssetKey

\n
\n
\n

Examples

\n
from dagster import asset\nfrom dagster_dbt import dbt_assets, get_asset_key_for_model\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n\n@asset(deps={get_asset_key_for_model([all_dbt_assets], "customers")})\ndef cleaned_customers():\n    ...\n
\n
\n
\n\n
\n
\ndagster_dbt.get_asset_key_for_source(dbt_assets, source_name)[source]\u00b6
\n

Returns the corresponding Dagster asset key for a dbt source with a singular table.

\n
\n
Parameters:
\n

source_name (str) \u2013 The name of the dbt source.

\n
\n
Raises:
\n

DagsterInvalidInvocationError \u2013 If the source has more than one table.

\n
\n
Returns:
\n

The corresponding Dagster asset key.

\n
\n
Return type:
\n

AssetKey

\n
\n
\n

Examples

\n
from dagster import asset\nfrom dagster_dbt import dbt_assets, get_asset_key_for_source\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n@asset(key=get_asset_key_for_source([all_dbt_assets], "my_source"))\ndef upstream_python_asset():\n    ...\n
\n
\n
\n\n
\n
\ndagster_dbt.get_asset_keys_by_output_name_for_source(dbt_assets, source_name)[source]\u00b6
\n

Returns the corresponding Dagster asset keys for all tables in a dbt source.

\n

This is a convenience method that makes it easy to define a multi-asset that generates\nall the tables for a given dbt source.

\n
\n
Parameters:
\n

source_name (str) \u2013 The name of the dbt source.

\n
\n
Returns:
\n

\n
A mapping of the table name to corresponding Dagster asset key

for all tables in the given dbt source.

\n
\n
\n

\n
\n
Return type:
\n

Mapping[str, AssetKey]

\n
\n
\n

Examples

\n
from dagster import AssetOut, multi_asset\nfrom dagster_dbt import dbt_assets, get_asset_keys_by_output_name_for_source\n\n@dbt_assets(manifest=...)\ndef all_dbt_assets():\n    ...\n\n@multi_asset(\n    outs={\n        name: AssetOut(key=asset_key)\n        for name, asset_key in get_asset_keys_by_output_name_for_source(\n            [all_dbt_assets], "raw_data"\n        ).items()\n    },\n)\ndef upstream_python_asset():\n    ...\n
\n
\n
\n\n
\n
\n

Resources (dbt Core)\u00b6

\n
\n

CLI Resource\u00b6

\n
\n
\nclass dagster_dbt.DbtCliResource(*, project_dir, global_config_flags=[], profiles_dir=None, profile=None, target=None)[source]\u00b6
\n

A resource used to execute dbt CLI commands.

\n
\n
\nproject_dir\u00b6
\n

The path to the dbt project directory. This directory should contain a\ndbt_project.yml. See https://docs.getdbt.com/reference/dbt_project.yml for more\ninformation.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nglobal_config_flags\u00b6
\n

A list of global configuration flags to pass to the dbt CLI\ninvocation. See https://docs.getdbt.com/reference/global-configs for a full list of\nconfiguration options.

\n
\n
Type:
\n

List[str]

\n
\n
\n
\n\n
\n
\nprofiles_dir\u00b6
\n

The path to the directory containing your dbt profiles.yml.\nBy default, the current working directory is used, which is the dbt project directory.\nSee https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\ninformation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nprofile\u00b6
\n

The profile from your dbt profiles.yml to use for execution. See\nhttps://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\ninformation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ntarget\u00b6
\n

The target from your dbt profiles.yml to use for execution. See\nhttps://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles for more\ninformation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n

Examples

\n

Creating a dbt resource with only a reference to project_dir:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n
\n
\n

Creating a dbt resource with a custom profiles_dir:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(\n    project_dir="/path/to/dbt/project",\n    profiles_dir="/path/to/dbt/project/profiles",\n)\n
\n
\n

Creating a dbt resource with a custom profile and target:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(\n    project_dir="/path/to/dbt/project",\n    profiles_dir="/path/to/dbt/project/profiles",\n    profile="jaffle_shop",\n    target="dev",\n)\n
\n
\n

Creating a dbt resource with global configs, e.g. disabling colored logs with --no-use-color:

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(\n    project_dir="/path/to/dbt/project",\n    global_config_flags=["--no-use-color"],\n)\n
\n
\n
\n
\ncli(args, *, raise_on_error=True, manifest=None, dagster_dbt_translator=None, context=None)[source]\u00b6
\n

Create a subprocess to execute a dbt CLI command.

\n
\n
Parameters:
\n
    \n
  • args (List[str]) \u2013 The dbt CLI command to execute.

  • \n
  • raise_on_error (bool) \u2013 Whether to raise an exception if the dbt CLI command fails.

  • \n
  • manifest (Optional[Union[Mapping[str, Any], str, Path]]) \u2013 The dbt manifest blob. If an\nexecution context from within @dbt_assets is provided to the context argument,\nthen the manifest provided to @dbt_assets will be used.

  • \n
  • dagster_dbt_translator (Optional[DagsterDbtTranslator]) \u2013 The translator to link dbt\nnodes to Dagster assets. If an execution context from within @dbt_assets is\nprovided to the context argument, then the dagster_dbt_translator provided to\n@dbt_assets will be used.

  • \n
  • context (Optional[OpExecutionContext]) \u2013 The execution context from within @dbt_assets.

  • \n
\n
\n
Returns:
\n

\n
An invocation instance that can be used to retrieve the output of the

dbt CLI command.

\n
\n
\n

\n
\n
Return type:
\n

DbtCliInvocation

\n
\n
\n

Examples

\n

Streaming Dagster events for dbt asset materializations and observations:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    yield from dbt.cli(["run"], context=context).stream()\n
\n
\n

Retrieving a dbt artifact after streaming the Dagster events:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_run_invocation = dbt.cli(["run"], context=context)\n\n    yield from dbt_run_invocation.stream()\n\n    # Retrieve the `run_results.json` dbt artifact as a dictionary:\n    run_results_json = dbt_run_invocation.get_artifact("run_results.json")\n\n    # Retrieve the `run_results.json` dbt artifact as a file path:\n    run_results_path = dbt_run_invocation.target_path.joinpath("run_results.json")\n
\n
\n

Customizing the asset materialization metadata when streaming the Dagster events:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext, Output\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_cli_invocation = dbt.cli(["run"], context=context)\n\n    for dbt_event in dbt_cli_invocation.stream_raw_events():\n        for dagster_event in dbt_event.to_default_asset_events(manifest=dbt_cli_invocation.manifest):\n            if isinstance(dagster_event, Output):\n                context.add_output_metadata(\n                    metadata={\n                        "my_custom_metadata": "my_custom_metadata_value",\n                    },\n                    output_name=dagster_event.output_name,\n                )\n\n            yield dagster_event\n
\n
\n

Suppressing exceptions from a dbt CLI command when a non-zero exit code is returned:

\n
from pathlib import Path\n\nfrom dagster import AssetExecutionContext\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context: AssetExecutionContext, dbt: DbtCliResource):\n    dbt_run_invocation = dbt.cli(["run"], context=context, raise_on_error=False)\n\n    if dbt_run_invocation.is_successful():\n        yield from dbt_run_invocation.stream()\n    else:\n        ...\n
\n
\n

Invoking a dbt CLI command in a custom asset or op:

\n
import json\n\nfrom dagster import asset, op\nfrom dagster_dbt import DbtCliResource\n\n\n@asset\ndef my_dbt_asset(dbt: DbtCliResource):\n    dbt_macro_args = {"key": "value"}\n    dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n\n\n@op\ndef my_dbt_op(dbt: DbtCliResource):\n    dbt_macro_args = {"key": "value"}\n    dbt.cli(["run-operation", "my-macro", json.dumps(dbt_macro_args)]).wait()\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtCliInvocation(process, manifest, dagster_dbt_translator, project_dir, target_path, raise_on_error)[source]\u00b6
\n

The representation of an invoked dbt command.

\n
\n
Parameters:
\n
    \n
  • process (subprocess.Popen) \u2013 The process running the dbt command.

  • \n
  • manifest (Mapping[str, Any]) \u2013 The dbt manifest blob.

  • \n
  • project_dir (Path) \u2013 The path to the dbt project.

  • \n
  • target_path (Path) \u2013 The path to the dbt target folder.

  • \n
  • raise_on_error (bool) \u2013 Whether to raise an exception if the dbt command fails.

  • \n
\n
\n
\n
\n
\nget_artifact(artifact)[source]\u00b6
\n

Retrieve a dbt artifact from the target path.

\n

See https://docs.getdbt.com/reference/artifacts/dbt-artifacts for more information.

\n
\n
Parameters:
\n

artifact (Union[Literal["manifest.json"], Literal["catalog.json"], Literal["run_results.json"], Literal["sources.json"]]) \u2013 The name of the artifact to retrieve.

\n
\n
Returns:
\n

The artifact as a dictionary.

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
\n

Examples

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\ndbt_cli_invocation = dbt.cli(["run"]).wait()\n\n# Retrieve the run_results.json artifact.\nrun_results = dbt_cli_invocation.get_artifact("run_results.json")\n
\n
\n
\n\n
\n
\nis_successful()[source]\u00b6
\n

Return whether the dbt CLI process completed successfully.

\n
\n
Returns:
\n

True, if the dbt CLI process returns with a zero exit code, and False otherwise.

\n
\n
Return type:
\n

bool

\n
\n
\n

Examples

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\ndbt_cli_invocation = dbt.cli(["run"], raise_on_error=False)\n\nif dbt_cli_invocation.is_successful():\n    ...\n
\n
\n
\n\n
\n
\nstream()[source]\u00b6
\n

Stream the events from the dbt CLI process and convert them to Dagster events.

\n
\n
Returns:
\n

\n
A set of corresponding Dagster events.
    \n
  • Output for refables (e.g. models, seeds, snapshots.)

  • \n
  • AssetObservation for dbt test results that are not enabled as asset checks.

  • \n
  • AssetCheckResult for dbt test results that are enabled as asset checks.

  • \n
\n
\n
\n

\n
\n
Return type:
\n

Iterator[Union[Output, AssetObservation, AssetCheckResult]]

\n
\n
\n

Examples

\n
from pathlib import Path\nfrom dagster_dbt import DbtCliResource, dbt_assets\n\n@dbt_assets(manifest=Path("target", "manifest.json"))\ndef my_dbt_assets(context, dbt: DbtCliResource):\n    yield from dbt.cli(["run"], context=context).stream()\n
\n
\n
\n\n
\n
\nstream_raw_events()[source]\u00b6
\n

Stream the events from the dbt CLI process.

\n
\n
Returns:
\n

An iterator of events from the dbt CLI process.

\n
\n
Return type:
\n

Iterator[DbtCliEventMessage]

\n
\n
\n
\n\n
\n
\nwait()[source]\u00b6
\n

Wait for the dbt CLI process to complete.

\n
\n
Returns:
\n

The current representation of the dbt CLI invocation.

\n
\n
Return type:
\n

DbtCliInvocation

\n
\n
\n

Examples

\n
from dagster_dbt import DbtCliResource\n\ndbt = DbtCliResource(project_dir="/path/to/dbt/project")\n\ndbt_cli_invocation = dbt.cli(["run"]).wait()\n
\n
\n
\n\n
\n\n
\n
\nclass dagster_dbt.DbtCliEventMessage(raw_event)[source]\u00b6
\n

The representation of a dbt CLI event.

\n
\n
Parameters:
\n

raw_event (Dict[str, Any]) \u2013 The raw event dictionary.\nSee https://docs.getdbt.com/reference/events-logging#structured-logging for more\ninformation.

\n
\n
\n
\n
\nto_default_asset_events(manifest, dagster_dbt_translator=<dagster_dbt.dagster_dbt_translator.DagsterDbtTranslator object>)[source]\u00b6
\n

Convert a dbt CLI event to a set of corresponding Dagster events.

\n
\n
Parameters:
\n
    \n
  • manifest (Union[Mapping[str, Any], str, Path]) \u2013 The dbt manifest blob.

  • \n
  • dagster_dbt_translator (DagsterDbtTranslator) \u2013 Optionally, a custom translator for\nlinking dbt nodes to Dagster assets.

  • \n
\n
\n
Returns:
\n

\n
A set of corresponding Dagster events.
    \n
  • Output for refables (e.g. models, seeds, snapshots.)

  • \n
  • AssetObservation for dbt test results that are not enabled as asset checks.

  • \n
  • AssetCheckResult for dbt test results that are enabled as asset checks.

  • \n
\n
\n
\n

\n
\n
Return type:
\n

Iterator[Union[Output, AssetObservation, AssetCheckResult]]

\n
\n
\n
\n\n
\n\n
\n
\n

Deprecated (dbt Core)\u00b6

\n
\n
\nclass dagster_dbt.DbtCliOutput(command, return_code, raw_output, logs, result, docs_url=None)[source]\u00b6
\n

The results of executing a dbt command, along with additional metadata about the dbt CLI\nprocess that was run.

\n

This class is deprecated, because it\u2019s only produced by methods of the DbtCliClientResource class,\nwhich is deprecated in favor of DbtCliResource.

\n

Note that users should not construct instances of this class directly. This class is intended\nto be constructed from the JSON output of dbt commands.

\n
\n
\ncommand\u00b6
\n

The full shell command that was executed.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nreturn_code\u00b6
\n

The return code of the dbt CLI process.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nraw_output\u00b6
\n

The raw output (stdout) of the dbt CLI process.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nlogs\u00b6
\n

List of parsed JSON logs produced by the dbt command.

\n
\n
Type:
\n

List[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\nresult\u00b6
\n

Dictionary containing dbt-reported result information\ncontained in run_results.json. Some dbt commands do not produce results, and will\ntherefore have result = None.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\ndocs_url\u00b6
\n

Hostname where dbt docs are being served for this project.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\ndagster_dbt.dbt_cli_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project_dir (dagster.StringSource, optional):
\n

Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.

\n

Default Value: \u2018.\u2019

\n
\n
profiles_dir (Union[dagster.StringSource, None], optional):
\n

Which directory to look in for the profiles.yml file. Default = $DBT_PROFILES_DIR or $HOME/.dbt

\n
\n
profile (Union[dagster.StringSource, None], optional):
\n

Which profile to load. Overrides setting in dbt_project.yml.

\n
\n
target (Union[dagster.StringSource, None], optional):
\n

Which target to load for the given profile.

\n
\n
vars (Union[dict, None], optional):
\n

Supply variables to the project. This argument overrides variables defined in your dbt_project.yml file. This argument should be a dictionary, e.g. {\u2018my_variable\u2019: \u2018my_value\u2019}

\n
\n
bypass_cache (dagster.BoolSource, optional):
\n

If set, bypass the adapter-level cache of database state

\n

Default Value: False

\n
\n
warn_error (dagster.BoolSource, optional):
\n

If dbt would normally warn, instead raise an exception. Examples include --models that selects nothing, deprecations, configurations with no associated models, invalid test configurations, and missing sources/refs in tests.

\n

Default Value: False

\n
\n
dbt_executable (dagster.StringSource, optional):
\n

Path to the dbt executable. Default is dbt

\n

Default Value: \u2018dbt\u2019

\n
\n
ignore_handled_error (dagster.BoolSource, optional):
\n

When True, will not raise an exception when the dbt CLI returns error code 1. Default is False.

\n

Default Value: False

\n
\n
target_path (dagster.StringSource, optional):
\n

The directory path for target if different from the default target-path in your dbt project configuration file.

\n

Default Value: \u2018target\u2019

\n
\n
docs_url (Union[dagster.StringSource, None], optional):
\n

The url for where dbt docs are being served for this project.

\n
\n
json_log_format (dagster.BoolSource, optional):
\n

When True, dbt will be invoked with the --log-format json flag, allowing Dagster to parse the log messages and emit simpler log messages to the event log.

\n

Default Value: True

\n
\n
capture_logs (dagster.BoolSource, optional):
\n

When True, dbt will be invoked with the --capture-output flag, allowing Dagster to capture the logs and emit them to the event log.

\n

Default Value: True

\n
\n
debug (dagster.BoolSource, optional):
\n

When True, dbt will be invoked with the --debug flag, which will print additional debug information to the console.

\n

Default Value: False

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 0.21. Use DbtCliResource instead..\n \n

\n

This resource issues dbt CLI commands against a configured dbt project. It is deprecated\nin favor of DbtCliResource.

\n
\n\n
\n
\n
\n

Ops (dbt Core)\u00b6

\n

If you\u2019re using asset-based dbt APIs like load_assets_from_dbt_project, you usually will not also use the below op-based APIs.

\n

dagster_dbt provides a set of pre-built ops that work with the CLI. For more advanced use cases,\nwe suggest building your own ops which directly interact with these resources.

\n
\n
\ndagster_dbt.dbt_run_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes. Default: True

\n

Default Value: True

\n
\n
asset_key_prefix (Union[List[dagster.StringSource], None], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

This op executes a dbt run command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_run_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_run_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_compile_op(context)[source]\u00b6
\n

This op executes a dbt compile command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_compile_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_compile_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_ls_op(context)[source]\u00b6
\n

This op executes a dbt ls command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_ls_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_ls_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_test_op(context)[source]\u00b6
\n

This op executes a dbt test command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_test_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_test_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_snapshot_op(context)[source]\u00b6
\n

This op executes a dbt snapshot command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_snapshot_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_snapshot_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_seed_op(context)[source]\u00b6
\n

This op executes a dbt seed command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_seed_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_seed_op()\n
\n
\n
\n\n
\n
\ndagster_dbt.dbt_docs_generate_op(context)[source]\u00b6
\n

This op executes a dbt docs generate command. It requires the use of a dbt resource, which can be\nset to execute this command through the CLI (using the dbt_cli_resource).

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_docs_generate_op, dbt_cli_resource\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    dbt_docs_generate_op()\n
\n
\n
\n\n
\n
\n
\n

dbt Cloud\u00b6

\n

Here, we provide interfaces to manage dbt projects invoked by the hosted dbt Cloud service.

\n
\n

Assets (dbt Cloud)\u00b6

\n
\n
\ndagster_dbt.load_assets_from_dbt_cloud_job(dbt_cloud, job_id, node_info_to_asset_key=<function default_asset_key_fn>, node_info_to_group_fn=<function default_group_from_dbt_resource_props>, node_info_to_freshness_policy_fn=<function default_freshness_policy_fn>, node_info_to_auto_materialize_policy_fn=<function default_auto_materialize_policy_fn>, partitions_def=None, partition_key_to_vars_fn=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Loads a set of dbt models, managed by a dbt Cloud job, into Dagster assets. In order to\ndetermine the set of dbt models, the project is compiled to generate the necessary artifacts\nthat define the dbt models and their dependencies.

\n

One Dagster asset is created for each dbt model.

\n
\n
Parameters:
\n
    \n
  • dbt_cloud (ResourceDefinition) \u2013 The dbt Cloud resource to use to connect to the dbt Cloud API.

  • \n
  • job_id (int) \u2013 The ID of the dbt Cloud job to load assets from.

  • \n
  • node_info_to_asset_key \u2013 (Mapping[str, Any] -> AssetKey): A function that takes a dictionary\nof dbt metadata and returns the AssetKey that you want to represent a given model or\nsource. By default: dbt model -> AssetKey([model_name]) and\ndbt source -> AssetKey([source_name, table_name])

  • \n
  • node_info_to_group_fn (Dict[str, Any] -> Optional[str]) \u2013 A function that takes a\ndictionary of dbt node info and returns the group that this node should be assigned to.

  • \n
  • node_info_to_freshness_policy_fn (Dict[str, Any] -> Optional[FreshnessPolicy]) \u2013 A function\nthat takes a dictionary of dbt node info and optionally returns a FreshnessPolicy that\nshould be applied to this node. By default, freshness policies will be created from\nconfig applied to dbt models, i.e.:\ndagster_freshness_policy={\u201cmaximum_lag_minutes\u201d: 60, \u201ccron_schedule\u201d: \u201c0 9 * * *\u201d}\nwill result in that model being assigned\nFreshnessPolicy(maximum_lag_minutes=60, cron_schedule=\u201d0 9 * * *\u201d)

  • \n
  • node_info_to_auto_materialize_policy_fn (Dict[str, Any] -> Optional[AutoMaterializePolicy]) \u2013 A function that takes a dictionary of dbt node info and optionally returns an AutoMaterializePolicy\nthat should be applied to this node. By default, AutoMaterializePolicies will be created from\nconfig applied to dbt models, i.e.:\ndagster_auto_materialize_policy={\u201ctype\u201d: \u201clazy\u201d} will result in that model being assigned\nAutoMaterializePolicy.lazy()

  • \n
  • node_info_to_definition_metadata_fn (Dict[str, Any] -> Optional[Dict[str, MetadataUserInput]]) \u2013 A function that takes a dictionary of dbt node info and optionally returns a dictionary\nof metadata to be attached to the corresponding definition. This is added to the default\nmetadata assigned to the node, which consists of the node\u2019s schema (if present).

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) Defines the set of partition keys that\ncompose the dbt assets.

  • \n
  • partition_key_to_vars_fn (Optional[str -> Dict[str, Any]]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) A function to translate a given\npartition key (e.g. \u20182022-01-01\u2019) to a dictionary of vars to be passed into the dbt\ninvocation (e.g. {\u201crun_date\u201d: \u201c2022-01-01\u201d})

  • \n
\n
\n
Returns:
\n

A definition for the loaded assets.

\n
\n
Return type:
\n

CacheableAssetsDefinition

\n
\n
\n

Examples

\n
from dagster import repository\nfrom dagster_dbt import dbt_cloud_resource, load_assets_from_dbt_cloud_job\n\nDBT_CLOUD_JOB_ID = 1234\n\ndbt_cloud = dbt_cloud_resource.configured(\n    {\n        "auth_token": {"env": "DBT_CLOUD_API_TOKEN"},\n        "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n    }\n)\n\ndbt_cloud_assets = load_assets_from_dbt_cloud_job(\n    dbt_cloud=dbt_cloud, job_id=DBT_CLOUD_JOB_ID\n)\n\n\n@repository\ndef dbt_cloud_sandbox():\n    return [dbt_cloud_assets]\n
\n
\n
\n\n
\n
\n

Ops (dbt Cloud)\u00b6

\n
\n
\ndagster_dbt.dbt_cloud_run_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_id (dagster.IntSource):
\n

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details page of your job in the dbt Cloud UI. It will be the final number in the url, e.g.: https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (Float, optional):
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time that will be waited before this operation is timed out. By default, this will never time out.

\n
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the dbt operation will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[dagster.StringSource], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018dbt\u2019]

\n
\n
\n

Initiates a run for a dbt Cloud job, then polls until the run completes. If the job\nfails or is otherwise stopped before succeeding, a dagster.Failure exception will be raised,\nand this op will fail.

\n

It requires the use of a \u2018dbt_cloud\u2019 resource, which is used to connect to the dbt Cloud API.

\n

Config Options:

\n
\n
job_id (int)

The integer ID of the relevant dbt Cloud job. You can find this value by going to the details\npage of your job in the dbt Cloud UI. It will be the final number in the url, e.g.:\nhttps://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/

\n
\n
poll_interval (float)

The time (in seconds) that will be waited between successive polls. Defaults to 10.

\n
\n
poll_timeout (float)

The maximum time (in seconds) that will be waited before this operation is timed out. By\ndefault, this will never time out.

\n
\n
yield_materializations (bool)

If True, materializations corresponding to the results of the dbt operation will be\nyielded when the op executes. Defaults to True.

\n
\n
asset_key_prefix (List[str])

If provided and yield_materializations is True, these components will be used to\nprefix the generated asset keys. Defaults to [\u201cdbt\u201d].

\n
\n
\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource, dbt_cloud_run_op\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {"auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"}, "account_id": 77777}\n)\nrun_dbt_nightly_sync = dbt_cloud_run_op.configured(\n    {"job_id": 54321}, name="run_dbt_nightly_sync"\n)\n\n@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\ndef dbt_cloud():\n    run_dbt_nightly_sync()\n
\n
\n
\n\n
\n
\n

Resources (dbt Cloud)\u00b6

\n
\n
\nclass dagster_dbt.DbtCloudClientResource(*, auth_token, account_id, disable_schedule_on_trigger=True, request_max_retries=3, request_retry_delay=0.25, dbt_cloud_host='https://cloud.getdbt.com/')[source]\u00b6
\n

This resource helps interact with dbt Cloud connectors.

\n
\n\n
\n

Deprecated (dbt Cloud)\u00b6

\n
\n
\ndagster_dbt.dbt_cloud_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
auth_token (dagster.StringSource):
\n

dbt Cloud API Token. User tokens can be found in the [dbt Cloud UI](https://cloud.getdbt.com/#/profile/api/), or see the [dbt Cloud Docs](https://docs.getdbt.com/docs/dbt-cloud/dbt-cloud-api/service-tokens) for instructions on creating a Service Account token.

\n
\n
account_id (dagster.IntSource):
\n

dbt Cloud Account ID. This value can be found in the url of a variety of views in the dbt Cloud UI, e.g. https://cloud.getdbt.com/#/accounts/{account_id}/settings/.

\n
\n
disable_schedule_on_trigger (dagster.BoolSource, optional):
\n

Specifies if you would like any job that is triggered using this resource to automatically disable its schedule.

\n

Default Value: True

\n
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the dbt Cloud API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
dbt_cloud_host (dagster.StringSource, optional):
\n

The hostname where dbt cloud is being hosted (e.g. https://my_org.cloud.getdbt.com/).

\n

Default Value: \u2018https://cloud.getdbt.com/\u2019

\n
\n
\n

This resource allows users to programmatically interface with the dbt Cloud Administrative REST\nAPI (v2) to launch jobs and monitor their progress. This currently implements only a subset of\nthe functionality exposed by the API.

\n

For a complete set of documentation on the dbt Cloud Administrative REST API, including expected\nresponse JSON schemas, see the dbt Cloud API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_dbt import dbt_cloud_resource\n\nmy_dbt_cloud_resource = dbt_cloud_resource.configured(\n    {\n        "auth_token": {"env": "DBT_CLOUD_AUTH_TOKEN"},\n        "account_id": {"env": "DBT_CLOUD_ACCOUNT_ID"},\n    }\n)\n\n@job(resource_defs={"dbt_cloud": my_dbt_cloud_resource})\ndef my_dbt_cloud_job():\n    ...\n
\n
\n
\n\n
\n
\n
\n
\n

Types\u00b6

\n
\n
\nclass dagster_dbt.DbtOutput(result)[source]\u00b6
\n

Base class for both DbtCliOutput and DbtRPCOutput. Contains a single field, result, which\nrepresents the dbt-formatted result of the command that was run (if any).

\n

Used internally, should not be instantiated directly by the user.

\n
\n\n
\n
\nclass dagster_dbt.DbtResource(logger=None)[source]\u00b6
\n
\n\n
\n
\n

Errors\u00b6

\n
\n
\nexception dagster_dbt.DagsterDbtError(description=None, metadata=None, allow_retries=None)[source]\u00b6
\n

The base exception of the dagster-dbt library.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliRuntimeError(description, logs=None, raw_output=None, messages=None)[source]\u00b6
\n

Represents an error while executing a dbt CLI command.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliFatalRuntimeError(logs=None, raw_output=None, messages=None)[source]\u00b6
\n

Represents a fatal error in the dbt CLI (return code 2).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliHandledRuntimeError(logs=None, raw_output=None, messages=None)[source]\u00b6
\n

Represents a model error reported by the dbt CLI at runtime (return code 1).

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliOutputsNotFoundError(path)[source]\u00b6
\n

Represents a problem in finding the target/run_results.json artifact when executing a dbt\nCLI command.

\n

For more details on target/run_results.json, see\nhttps://docs.getdbt.com/reference/dbt-artifacts#run_resultsjson.

\n
\n\n
\n
\nexception dagster_dbt.DagsterDbtCliUnexpectedOutputError(invalid_line_nos)[source]\u00b6
\n

Represents an error when parsing the output of a dbt CLI command.

\n
\n
\ninvalid_line_nos\u00b6
\n
\n\n
\n\n
\n
\n

Utils\u00b6

\n
\n
\ndagster_dbt.default_group_from_dbt_resource_props(dbt_resource_props)[source]\u00b6
\n

Get the group name for a dbt node.

\n

If a Dagster group is configured in the metadata for the node, use that.

\n

Otherwise, if a dbt group is configured for the node, use that.

\n
\n\n
\n
\ndagster_dbt.group_from_dbt_resource_props_fallback_to_directory(dbt_resource_props)[source]\u00b6
\n

Get the group name for a dbt node.

\n

Has the same behavior as the default_group_from_dbt_resource_props, except that, if no group can be determined\nfrom config or metadata, falls back to using the subdirectory of the models directory that the\nsource file is in.

\n
\n
Parameters:
\n

dbt_resource_props (Mapping[str, Any]) \u2013 A dictionary representing the dbt resource.

\n
\n
\n

Examples

\n
from dagster_dbt import group_from_dbt_resource_props_fallback_to_directory\n\ndbt_assets = load_assets_from_dbt_manifest(\n    manifest=manifest,\n    node_info_to_group_fn=group_from_dbt_resource_props_fallback_to_directory,\n)\n
\n
\n
\n\n
\n
\ndagster_dbt.default_metadata_from_dbt_resource_props(dbt_resource_props)[source]\u00b6
\n
\n\n
\n
\ndagster_dbt.utils.generate_materializations(dbt_output, asset_key_prefix=None)[source]\u00b6
\n

This function yields dagster.AssetMaterialization events for each model updated by\na dbt command.

\n

Information parsed from a DbtOutput object.

\n

Examples

\n
from dagster import job, op, Output\nfrom dagster_dbt.utils import generate_materializations\nfrom dagster_dbt import dbt_cli_resource\n\n@op(required_resource_keys={"dbt"})\ndef my_custom_dbt_run(context):\n    dbt_output = context.resources.dbt.run()\n    for materialization in generate_materializations(dbt_output):\n        # you can modify the materialization object to add extra metadata, if desired\n        yield materialization\n    yield Output(dbt_output)\n\n@job(resource_defs={"dbt":dbt_cli_resource})\ndef my_dbt_cli_job():\n    my_custom_dbt_run()\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-dbt", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb/", "title": "DuckDB (dagster-duckdb)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-datahub/", "title": "Datahub (dagster-datahub)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb", "DuckDB (dagster-duckdb)", "N", "next"], ["sections/api/apidocs/libraries/dagster-datahub", "Datahub (dagster-datahub)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-dbt.rst.txt", "title": "dbt (dagster-dbt)", "toc": "\n"}, "dagster-docker": {"alabaster_version": "0.7.13", "body": "
\n

Orchestration on Docker\u00b6

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_docker.DockerRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional):
\n

The docker image to be used if the repository does not specify one.

\n
\n
network (dagster.StringSource, optional):
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional):
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional):
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
\n

Launches runs in a Docker container.

\n
\n\n
\n
\ndagster_docker.docker_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource, optional):
\n

The docker image to be used if the repository does not specify one.

\n
\n
network (dagster.StringSource, optional):
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional):
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional):
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
max_concurrent (dagster.IntSource, optional):
\n

Limit on the number of containers that will run concurrently within the scope of a Dagster run. Note that this limit is per run, not global.

\n
\n
tag_concurrency_limits (List[strict dict], optional):
\n

A set of limits that are applied to steps with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key. Note that these limits are per run, not global.

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Executor which launches steps as Docker containers.

\n

To use the docker_executor, set it as the executor_def when defining a job:

\n
from dagster_docker import docker_executor\n\nfrom dagster import job\n\n@job(executor_def=docker_executor)\ndef docker_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    registry: ...\n    network: ...\n    networks: ...\n    container_kwargs: ...\n
\n
\n

If you\u2019re using the DockerRunLauncher, configuration set on the containers created by the run\nlauncher will also be set on the containers that are created for each step.

\n
\n\n
\n

Ops\u00b6

\n
\n
\ndagster_docker.docker_container_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image (dagster.StringSource):
\n

The image in which to run the Docker container.

\n
\n
network (dagster.StringSource, optional):
\n

Name of the network to which to connect the launched container at creation time

\n
\n
registry (strict dict, optional):
\n

Information for using a non local/public docker registry

\n
\nConfig Schema:
\n
url (dagster.StringSource):
\n

\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
\n
\n
env_vars (List[String], optional):
\n

The list of environment variable names to include in the docker container. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the local environment)

\n
\n
container_kwargs (permissive dict, optional):
\n

key-value pairs that can be passed into containers.create. See https://docker-py.readthedocs.io/en/stable/containers.html for the full list of available options.

\n
\n
networks (List[dagster.StringSource], optional):
\n

Names of the networks to which to connect the launched container at creation time

\n
\n
entrypoint (List[String], optional):
\n

The ENTRYPOINT for the Docker container

\n
\n
command (List[String], optional):
\n

The command to run within the launched Docker container.

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An op that runs a Docker container using the docker Python API.

\n

Contrast with the docker_executor, which runs each Dagster op in a Dagster job in its\nown Docker container.

\n
\n
This op may be useful when:
    \n
  • You need to orchestrate a command that isn\u2019t a Dagster op (or isn\u2019t written in Python)

  • \n
  • You want to run the rest of a Dagster job using a specific executor, and only a single\nop in docker.

  • \n
\n
\n
\n

For example:

\n
from dagster_docker import docker_container_op\n\nfrom dagster import job\n\nfirst_op = docker_container_op.configured(\n    {\n        "image": "busybox",\n        "command": ["echo HELLO"],\n    },\n    name="first_op",\n)\nsecond_op = docker_container_op.configured(\n    {\n        "image": "busybox",\n        "command": ["echo GOODBYE"],\n    },\n    name="second_op",\n)\n\n@job\ndef full_job():\n    second_op(first_op())\n
\n
\n

You can create your own op with the same implementation by calling the execute_docker_container function\ninside your own op.

\n
\n\n
\n
\ndagster_docker.execute_docker_container(context, image, entrypoint=None, command=None, networks=None, registry=None, env_vars=None, container_kwargs=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

This function is a utility for executing a Docker container from within a Dagster op.

\n
\n
Parameters:
\n
    \n
  • image (str) \u2013 The image to use for the launched Docker container.

  • \n
  • entrypoint (Optional[Sequence[str]]) \u2013 The ENTRYPOINT to run in the launched Docker\ncontainer. Default: None.

  • \n
  • command (Optional[Sequence[str]]) \u2013 The CMD to run in the launched Docker container.\nDefault: None.

  • \n
  • networks (Optional[Sequence[str]]) \u2013 Names of the Docker networks to which to connect the\nlaunched container. Default: None.

  • \n
  • registry (Optional[Mapping[str, str]]) \u2013 Information for using a non local/public Docker\nregistry. Can have \u201curl\u201d, \u201cusername\u201d, or \u201cpassword\u201d keys.

  • \n
  • env_vars (Optional[Sequence[str]]) \u2013 List of environment variables to include in the launched\ncontainer. Each can be of the form KEY=VALUE or just KEY (in which case the value will be\npulled from the calling environment).

  • \n
  • container_kwargs (Optional[Dict[str, Any]]) \u2013 key-value pairs that can be passed into\ncontainers.create in the Docker Python API. See\nhttps://docker-py.readthedocs.io/en/stable/containers.html for the full list\nof available options.

  • \n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-docker", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "N", "next"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-docker.rst.txt", "title": "Orchestration on Docker", "toc": "\n"}, "dagster-duckdb": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB (dagster-duckdb)\u00b6

\n

This library provides an integration with the DuckDB database.

\n

Related Guides:

\n\n
\n
\ndagster_duckdb.DuckDBIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

Base class for an IO manager definition that reads inputs from and writes outputs to DuckDB.

\n

Examples

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If none\nof these is provided, the schema will default to \u201cpublic\u201d.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame):\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\ndagster_duckdb.DuckDBResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database. Setting database=':memory:' will use an in-memory database.

\n
\n
\n

Resource for interacting with a DuckDB database.

\n

Examples

\n
from dagster import Definitions, asset\nfrom dagster_duckdb import DuckDBResource\n\n@asset\ndef my_table(duckdb: DuckDBResource):\n    with duckdb.get_connection() as conn:\n        conn.execute("SELECT * from MY_SCHEMA.MY_TABLE")\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"duckdb": DuckDBResource(database="path/to/db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb.build_duckdb_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

Builds an IO manager definition that reads inputs from and writes outputs to DuckDB.

\n
\n
Parameters:
\n
    \n
  • type_handlers (Sequence[DbTypeHandler]) \u2013 Each handler defines how to translate between\nDuckDB tables and an in-memory type - e.g. a Pandas DataFrame. If only\none DbTypeHandler is provided, it will be used as the default_load_type.

  • \n
  • default_load_type (Type) \u2013 When an input has no type annotation, load it as this type.

  • \n
\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb import build_duckdb_io_manager\nfrom dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\nduckdb_io_manager = build_duckdb_io_manager([DuckDBPandasTypeHandler()])\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key. For ops, the schema can be\nspecified by including a \u201cschema\u201d entry in output metadata. If none of these is provided, the schema will\ndefault to \u201cpublic\u201d.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame):\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb-pandas/", "title": "DuckDB + Pandas (dagster-duckdb-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-dbt/", "title": "dbt (dagster-dbt)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb-pandas", "DuckDB + Pandas (dagster-duckdb-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-dbt", "dbt (dagster-dbt)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb.rst.txt", "title": "DuckDB (dagster-duckdb)", "toc": "\n"}, "dagster-duckdb-pandas": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB + Pandas (dagster-duckdb-pandas)\u00b6

\n

This library provides an integration with the DuckDB database and Pandas data processing library.

\n

Related guides:

\n\n
\n
\ndagster_duckdb_pandas.DuckDBPandasIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the DuckDBPandasIOManager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pandas import DuckDBPandasIOManager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": DuckDBPandasIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_duckdb_pandas.DuckDBPandasTypeHandler[source]\u00b6
\n

Stores and loads Pandas DataFrames in DuckDB.

\n

To use this type handler, return it from the type_handlers method of an I/O manager that inherits from DuckDBIOManager.

\n

Example

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_pandas import DuckDBPandasTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb_pandas.duckdb_pandas_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to DuckDB. When\nusing the duckdb_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pandas import duckdb_pandas_io_manager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_pandas_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb-pandas", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb-pyspark/", "title": "DuckDB + PySpark (dagster-duckdb-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb/", "title": "DuckDB (dagster-duckdb)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb-pyspark", "DuckDB + PySpark (dagster-duckdb-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb", "DuckDB (dagster-duckdb)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb-pandas.rst.txt", "title": "DuckDB + Pandas (dagster-duckdb-pandas)", "toc": "\n"}, "dagster-duckdb-polars": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB + Polars (dagster-duckdb-polars)\u00b6

\n

This library provides an integration with the DuckDB database and Polars data processing library.

\n

Related guides:

\n\n
\n
\ndagster_duckdb_polars.DuckDBPolarsIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\nusing the DuckDBPolarsIOManager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_polars import DuckDBPolarsIOManager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pl.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": DuckDBPolarsIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pl.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_duckdb_polars.DuckDBPolarsTypeHandler[source]\u00b6
\n

Stores and loads Polars DataFrames in DuckDB.

\n

To use this type handler, return it from the type_handlers method of an I/O manager that inherits from DuckDBIOManager.

\n

Example

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_polars import DuckDBPolarsTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPolarsTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pl.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb_polars.duckdb_polars_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Polars DataFrames to DuckDB. When\nusing the duckdb_polars_io_manager, any inputs and outputs without type annotations will be loaded\nas Polars DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_polars import duckdb_polars_io_manager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pl.DataFrame:  # the name of the asset will be the table name\n    ...\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_polars_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pl.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pl.DataFrame) -> pl.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb-polars", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-embedded-elt/", "title": "embedded-elt (dagster-embedded-elt)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb-pyspark/", "title": "DuckDB + PySpark (dagster-duckdb-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-embedded-elt", "embedded-elt (dagster-embedded-elt)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb-pyspark", "DuckDB + PySpark (dagster-duckdb-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb-polars.rst.txt", "title": "DuckDB + Polars (dagster-duckdb-polars)", "toc": "\n"}, "dagster-duckdb-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

DuckDB + PySpark (dagster-duckdb-pyspark)\u00b6

\n

This library provides an integration with the DuckDB database and PySpark data processing library.

\n

Related guides:

\n\n
\n
\ndagster_duckdb_pyspark.DuckDBPySparkIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pyspark import DuckDBPySparkIOManager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pyspark.sql.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pyspark.sql.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_duckdb_pyspark.DuckDBPySparkTypeHandler[source]\u00b6
\n

Stores PySpark DataFrames in DuckDB.

\n

To use this type handler, return it from the type_handlers method of an I/O manager that inherits from DuckDBIOManager.

\n

Example

\n
from dagster_duckdb import DuckDBIOManager\nfrom dagster_duckdb_pyspark import DuckDBPySparkTypeHandler\n\nclass MyDuckDBIOManager(DuckDBIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [DuckDBPySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in duckdb\n)\ndef my_table() -> pyspark.sql.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_duckdb_pyspark.duckdb_pyspark_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Path to the DuckDB database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When\nusing the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_duckdb_pyspark import duckdb_pyspark_io_manager\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in DuckDB\n)\ndef my_table() -> pyspark.sql.DataFrame:  # the name of the asset will be the table name\n    ...\n\n@repository\ndef my_repo():\n    return with_resources(\n        [my_table],\n        {"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}\n    )\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pyspark.sql.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-duckdb-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-duckdb-polars/", "title": "DuckDB + Polars (dagster-duckdb-polars)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb-pandas/", "title": "DuckDB + Pandas (dagster-duckdb-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-duckdb-polars", "DuckDB + Polars (dagster-duckdb-polars)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb-pandas", "DuckDB + Pandas (dagster-duckdb-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-duckdb-pyspark.rst.txt", "title": "DuckDB + PySpark (dagster-duckdb-pyspark)", "toc": "\n"}, "dagster-embedded-elt": {"alabaster_version": "0.7.13", "body": "
\n

embedded-elt (dagster-embedded-elt)\u00b6

\n

This package provides a framework for building ELT pipelines with Dagster through\nhelpful pre-built assets and resources.

\n

This package currently includes a Sling integration which\nprovides a simple way to sync data between databases and file systems.

\n

Related documentation pages: embedded-elt.

\n
\n

Sling\u00b6

\n
\n

Assets\u00b6

\n
\n
\ndagster_embedded_elt.sling.build_sling_asset(asset_spec, source_stream, target_object, mode=SlingMode.FULL_REFRESH, primary_key=None, update_key=None, source_options=None, target_options=None, sling_resource_key='sling')[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Asset Factory for using Sling to sync data from a source stream to a target object.

\n
\n
Parameters:
\n
    \n
  • asset_spec (AssetSpec) \u2013 The AssetSpec to use to materialize this asset.

  • \n
  • source_stream (str) \u2013 The source stream to sync from. This can be a table, a query, or a path.

  • \n
  • target_object (str) \u2013 The target object to sync to. This can be a table, or a path.

  • \n
  • mode (SlingMode, optional) \u2013 The sync mode to use when syncing. Defaults to SlingMode.FULL_REFRESH.

  • \n
  • primary_key (Optional[Union[str, List[str]]], optional) \u2013 The optional primary key to use when syncing.

  • \n
  • update_key (Optional[Union[str, List[str]]], optional) \u2013 The optional update key to use when syncing.

  • \n
  • source_options (Optional[Dict[str, Any]], optional) \u2013 Any optional Sling source options to use when syncing.

  • \n
  • target_options (Optional[Dict[str, Any]], optional) \u2013 Any optional target options to use when syncing.

  • \n
  • sling_resource_key (str, optional) \u2013 The resource key for the SlingResource. Defaults to \u201csling\u201d.

  • \n
\n
\n
\n

Examples

\n

Creating a Sling asset that syncs from a file to a table:

\n
asset_spec = AssetSpec(key=["main", "dest_tbl"])\nasset_def = build_sling_asset(\n        asset_spec=asset_spec,\n        source_stream="file:///tmp/test.csv",\n        target_object="main.dest_table",\n        mode=SlingMode.INCREMENTAL,\n        primary_key="id"\n)\n
\n
\n

Creating a Sling asset that syncs from a table to a file with a full refresh:

\n
asset_spec = AssetSpec(key="test.csv")\nasset_def = build_sling_asset(\n        asset_spec=asset_spec,\n        source_stream="main.dest_table",\n        target_object="file:///tmp/test.csv",\n        mode=SlingMode.FULL_REFRESH\n)\n
\n
\n
\n\n
\n
\n

Resources\u00b6

\n
\n
\nclass dagster_embedded_elt.sling.SlingResource(*, source_connection, target_connection)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Resource for interacting with the Sling package.

\n

Examples

\n
from dagster_embedded_elt.sling import SlingResource\nsling_resource = SlingResource(\n    source_connection=SlingSourceConnection(\n        type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING")\n    ),\n    target_connection=SlingTargetConnection(\n        type="snowflake",\n        host="host",\n        user="user",\n        database="database",\n        password="password",\n        role="role",\n    ),\n)\n
\n
\n
\n\n
\n
\nclass dagster_embedded_elt.sling.resources.SlingSourceConnection(*, type, connection_string=None, **config_dict)[source]\u00b6
\n

A Sling Source Connection defines the source connection used by SlingResource.

\n

Examples

\n

Creating a Sling Source for a file, such as CSV or JSON:

\n
source = SlingSourceConnection(type="file")\n
\n
\n

Create a Sling Source for a Postgres database, using a connection string:

\n
source = SlingSourceConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\nsource = SlingSourceConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema")\n
\n
\n

Create a Sling Source for a Postgres database, using keyword arguments, as described here:\nhttps://docs.slingdata.io/connections/database-connections/postgres

\n
source = SlingSourceConnection(type="postgres", host="host", user="hunter42", password=EnvVar("POSTGRES_PASSWORD"))\n
\n
\n
\n\n
\n
\nclass dagster_embedded_elt.sling.resources.SlingTargetConnection(*, type, connection_string=None, **config_dict)[source]\u00b6
\n

A Sling Target Connection defines the target connection used by SlingResource.

\n

Examples

\n

Creating a Sling Target for a file, such as CSV or JSON:

\n
source = SlingTargetConnection(type="file")\n
\n
\n

Create a Sling Target for a Postgres database, using a connection string:

\n
source = SlingTargetConnection(type="postgres", connection_string="postgresql://user:password@host:port/schema")\nsource = SlingTargetConnection(type="postgres", connection_string=EnvVar("POSTGRES_CONNECTION_STRING"))\n
\n
\n

Create a Sling Target for a Postgres database, using keyword arguments, as described here:\nhttps://docs.slingdata.io/connections/database-connections/postgres

\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-embedded-elt", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-fivetran/", "title": "Fivetran (dagster-fivetran)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-duckdb-polars/", "title": "DuckDB + Polars (dagster-duckdb-polars)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-fivetran", "Fivetran (dagster-fivetran)", "N", "next"], ["sections/api/apidocs/libraries/dagster-duckdb-polars", "DuckDB + Polars (dagster-duckdb-polars)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-embedded-elt.rst.txt", "title": "embedded-elt (dagster-embedded-elt)", "toc": "\n"}, "dagster-fivetran": {"alabaster_version": "0.7.13", "body": "
\n

Fivetran (dagster-fivetran)\u00b6

\n

This library provides a Dagster integration with Fivetran.

\n
\n

Resources\u00b6

\n
\n
\ndagster_fivetran.FivetranResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

The Fivetran API key to use for this resource.

\n
\n
api_secret (dagster.StringSource):
\n

The Fivetran API secret to use for this resource.

\n
\n
disable_schedule_on_trigger (dagster.BoolSource, optional):
\n

Specifies if you would like any connector that is sync\u2019d using this resource to be automatically taken off its Fivetran schedule.

\n

Default Value: True

\n
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Fivetran API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This class exposes methods on top of the Fivetran REST API.

\n
\n\n
\n
\n

Assets\u00b6

\n
\n
\ndagster_fivetran.load_assets_from_fivetran_instance(fivetran, key_prefix=None, connector_to_group_fn=<function _clean_name>, io_manager_key=None, connector_to_io_manager_key_fn=None, connector_filter=None, connector_to_asset_key_fn=None, poll_interval=10, poll_timeout=None)[source]\u00b6
\n

Loads Fivetran connector assets from a configured FivetranResource instance. This fetches information\nabout defined connectors at initialization time, and will error on workspace load if the Fivetran\ninstance is not reachable.

\n
\n
Parameters:
\n
    \n
  • fivetran (ResourceDefinition) \u2013 A FivetranResource configured with the appropriate connection\ndetails.

  • \n
  • key_prefix (Optional[CoercibleToAssetKeyPrefix]) \u2013 A prefix for the asset keys created.

  • \n
  • connector_to_group_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an asset\ngroup name for a given Fivetran connector name. If None, no groups will be created. Defaults\nto a basic sanitization function.

  • \n
  • io_manager_key (Optional[str]) \u2013 The IO manager key to use for all assets. Defaults to \u201cio_manager\u201d.\nUse this if all assets should be loaded from the same source, otherwise use connector_to_io_manager_key_fn.

  • \n
  • connector_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]) \u2013 Function which returns an\nIO manager key for a given Fivetran connector name. When other ops are downstream of the loaded assets,\nthe IOManager specified determines how the inputs to those ops are loaded. Defaults to \u201cio_manager\u201d.

  • \n
  • connector_filter (Optional[Callable[[FivetranConnectorMetadata], bool]]) \u2013 Optional function which takes\nin connector metadata and returns False if the connector should be excluded from the output assets.

  • \n
  • connector_to_asset_key_fn (Optional[Callable[[FivetranConnectorMetadata, str], AssetKey]]) \u2013 Optional function\nwhich takes in connector metadata and a table name and returns an AssetKey for that table. Defaults to\na function that generates an AssetKey matching the table name, split by \u201c.\u201d.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (Optional[float]) \u2013 The maximum time that will be waited before this operation is\ntimed out. By default, this will never time out.

  • \n
\n
\n
\n

Examples:

\n

Loading all Fivetran connectors as assets:

\n
from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\nfivetran_instance = fivetran_resource.configured(\n    {\n        "api_key": "some_key",\n        "api_secret": "some_secret",\n    }\n)\nfivetran_assets = load_assets_from_fivetran_instance(fivetran_instance)\n
\n
\n

Filtering the set of loaded connectors:

\n
from dagster_fivetran import fivetran_resource, load_assets_from_fivetran_instance\n\nfivetran_instance = fivetran_resource.configured(\n    {\n        "api_key": "some_key",\n        "api_secret": "some_secret",\n    }\n)\nfivetran_assets = load_assets_from_fivetran_instance(\n    fivetran_instance,\n    connector_filter=lambda meta: "snowflake" in meta.name,\n)\n
\n
\n
\n\n
\n
\ndagster_fivetran.build_fivetran_assets(connector_id, destination_tables, poll_interval=10, poll_timeout=None, io_manager_key=None, asset_key_prefix=None, metadata_by_table_name=None, group_name=None, infer_missing_tables=False, op_tags=None)[source]\u00b6
\n

Build a set of assets for a given Fivetran connector.

\n

Returns an AssetsDefinition which connects the specified asset_keys to the computation that\nwill update them. Internally, executes a Fivetran sync for a given connector_id, and\npolls until that sync completes, raising an error if it is unsuccessful. Requires the use of the\nfivetran_resource, which allows it to communicate with the\nFivetran API.

\n
\n
Parameters:
\n
    \n
  • connector_id (str) \u2013 The Fivetran Connector ID that this op will sync. You can retrieve this\nvalue from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

  • \n
  • destination_tables (List[str]) \u2013 schema_name.table_name for each table that you want to be\nrepresented in the Dagster asset graph for this connection.

  • \n
  • poll_interval (float) \u2013 The time (in seconds) that will be waited between successive polls.

  • \n
  • poll_timeout (Optional[float]) \u2013 The maximum time that will be waited before this operation is\ntimed out. By default, this will never time out.

  • \n
  • io_manager_key (Optional[str]) \u2013 The io_manager to be used to handle each of these assets.

  • \n
  • asset_key_prefix (Optional[List[str]]) \u2013 A prefix for the asset keys inside this asset.\nIf left blank, assets will have a key of AssetKey([schema_name, table_name]).

  • \n
  • metadata_by_table_name (Optional[Mapping[str, MetadataUserInput]]) \u2013 A mapping from destination\ntable name to user-supplied metadata that should be associated with the asset for that table.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. This\ngroup name will be applied to all assets produced by this multi_asset.

  • \n
  • infer_missing_tables (bool) \u2013 If True, will create asset materializations for tables specified\nin destination_tables even if they are not present in the Fivetran sync output. This is useful\nin cases where Fivetran does not sync any data for a table and therefore does not include it\nin the sync output API response.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset. Frameworks may expect and\nrequire certain metadata to be attached to an op. Values that are not strings will be\njson encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
\n

Examples:

\n

Basic example:

\n
\n
from dagster import AssetKey, repository, with_resources\n\nfrom dagster_fivetran import fivetran_resource\nfrom dagster_fivetran.assets import build_fivetran_assets\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n
\n
\n
\n

Attaching metadata:

\n
\n
fivetran_assets = build_fivetran_assets(\n    connector_id="foobar",\n    destination_tables=["schema1.table1", "schema2.table2"],\n    metadata_by_table_name={\n        "schema1.table1": {\n            "description": "This is a table that contains foo and bar",\n        },\n        "schema2.table2": {\n            "description": "This is a table that contains baz and quux",\n        },\n    },\n)\n
\n
\n
\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_fivetran.fivetran_sync_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
connector_id (dagster.StringSource):
\n

The Fivetran Connector ID that this op will sync. You can retrieve this value from the \u201cSetup\u201d tab of a given connector in the Fivetran UI.

\n
\n
poll_interval (Float, optional):
\n

The time (in seconds) that will be waited between successive polls.

\n

Default Value: 10

\n
\n
poll_timeout (Union[Float, None], optional):
\n

The maximum time that will be waited before this operation is timed out. By default, this will never time out.

\n
\n
yield_materializations (dagster.BoolSource, optional):
\n

If True, materializations corresponding to the results of the Fivetran sync will be yielded when the op executes.

\n

Default Value: True

\n
\n
asset_key_prefix (List[dagster.StringSource], optional):
\n

If provided and yield_materializations is True, these components will be used to prefix the generated asset keys.

\n

Default Value: [\u2018fivetran\u2019]

\n
\n
\n

Executes a Fivetran sync for a given connector_id, and polls until that sync\ncompletes, raising an error if it is unsuccessful. It outputs a FivetranOutput which contains\nthe details of the Fivetran connector after the sync successfully completes, as well as details\nabout which tables the sync updates.

\n

It requires the use of the fivetran_resource, which allows it to\ncommunicate with the Fivetran API.

\n

Examples

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource, fivetran_sync_op\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\nsync_foobar = fivetran_sync_op.configured({"connector_id": "foobar"}, name="sync_foobar")\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_simple_fivetran_job():\n    sync_foobar()\n\n@job(resource_defs={"fivetran": my_fivetran_resource})\ndef my_composed_fivetran_job():\n    final_foobar_state = sync_foobar(start_after=some_op())\n    other_op(final_foobar_state)\n
\n
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_fivetran.fivetran_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

The Fivetran API key to use for this resource.

\n
\n
api_secret (dagster.StringSource):
\n

The Fivetran API secret to use for this resource.

\n
\n
disable_schedule_on_trigger (dagster.BoolSource, optional):
\n

Specifies if you would like any connector that is sync\u2019d using this resource to be automatically taken off its Fivetran schedule.

\n

Default Value: True

\n
\n
request_max_retries (dagster.IntSource, optional):
\n

The maximum number of times requests to the Fivetran API should be retried before failing.

\n

Default Value: 3

\n
\n
request_retry_delay (Float, optional):
\n

Time (in seconds) to wait between each request retry.

\n

Default Value: 0.25

\n
\n
\n

This resource allows users to programmatically interface with the Fivetran REST API to launch\nsyncs and monitor their progress. This currently implements only a subset of the functionality\nexposed by the API.

\n

For a complete set of documentation on the Fivetran REST API, including expected response JSON\nschemas, see the Fivetran API Docs.

\n

To configure this resource, we recommend using the configured method.

\n

Examples:

\n
from dagster import job\nfrom dagster_fivetran import fivetran_resource\n\nmy_fivetran_resource = fivetran_resource.configured(\n    {\n        "api_key": {"env": "FIVETRAN_API_KEY"},\n        "api_secret": {"env": "FIVETRAN_API_SECRET"},\n    }\n)\n\n@job(resource_defs={"fivetran":my_fivetran_resource})\ndef my_fivetran_job():\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-fivetran", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-embedded-elt/", "title": "embedded-elt (dagster-embedded-elt)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "N", "next"], ["sections/api/apidocs/libraries/dagster-embedded-elt", "embedded-elt (dagster-embedded-elt)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-fivetran.rst.txt", "title": "Fivetran (dagster-fivetran)", "toc": "\n"}, "dagster-gcp": {"alabaster_version": "0.7.13", "body": "
\n

GCP (dagster-gcp)\u00b6

\n
\n

BigQuery\u00b6

\n

Related Guides:

\n\n
\n

BigQuery Resource\u00b6

\n
\n
\ndagster_gcp.BigQueryResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project ID for the project which the client acts on behalf of. Will be passed when creating a dataset / job. If not passed, falls back to the default inferred from the environment.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

Default location for jobs / datasets / tables.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
\n

Resource for interacting with Google BigQuery.

\n

Examples

\n
from dagster import Definitions, asset\nfrom dagster_gcp import BigQueryResource\n\n@asset\ndef my_table(bigquery: BigQueryResource):\n    with bigquery.get_client() as client:\n        client.query("SELECT * FROM my_dataset.my_table")\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "bigquery": BigQueryResource(project="my-project")\n    }\n)\n
\n
\n
\n\n
\n
\n

BigQuery I/O Manager\u00b6

\n
\n
\ndagster_gcp.BigQueryIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

Base class for an I/O manager definition that reads inputs from and writes outputs to BigQuery.

\n

Examples

\n
from dagster_gcp import BigQueryIOManager\nfrom dagster_bigquery_pandas import BigQueryPandasTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MyBigQueryIOManager(BigQueryIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [BigQueryPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the dataset configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset my_table had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset my_dataset will be\nused. For ops, the dataset can be specified by including a schema entry in output metadata. If schema is\nnot provided via config or on the asset/op, public will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the gcp_credentials configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\n

BigQuery Ops\u00b6

\n
\n
\ndagster_gcp.bq_create_dataset(context)[source]\u00b6
\n

BigQuery Create Dataset.

\n

This op encapsulates creating a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_delete_dataset(context)[source]\u00b6
\n

BigQuery Delete Dataset.

\n

This op encapsulates deleting a BigQuery dataset.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.bq_op_for_queries(sql_queries)[source]\u00b6
\n

Executes BigQuery SQL queries.

\n

Expects a BQ client to be provisioned in resources as context.resources.bigquery.

\n
\n\n
\n
\ndagster_gcp.import_df_to_bq(context, df)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_file_to_bq(context, path)[source]\u00b6
\n
\n\n
\n
\ndagster_gcp.import_gcs_paths_to_bq(context, paths)[source]\u00b6
\n
\n\n
\n
\n

Other\u00b6

\n
\n
\nclass dagster_gcp.BigQueryError[source]\u00b6
\n
\n\n
\n
\n
\n

GCS\u00b6

\n
\n

GCS Resource\u00b6

\n
\n
\ndagster_gcp.GCSResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project name

\n
\n
\n

Resource for interacting with Google Cloud Storage.

\n

Example

\n
@asset\ndef my_asset(gcs: GCSResource):\n    with gcs.get_client() as client:\n        # client is a google.cloud.storage.Client\n        ...\n
\n
\n
\n\n
\n
\n

GCS I/O Manager\u00b6

\n
\n
\ndagster_gcp.GCSPickleIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs (Union[Any, None], optional):
\n

\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at <base_dir>/<asset_key>. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of /my/base/path, an asset with key\nAssetKey(["one", "two", "three"]) would be stored in a file called three in a directory\nwith path /my/base/path/one/two/.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import asset, Definitions\nfrom dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n        "io_manager": GCSPickleIOManager(\n            gcs_bucket="my-cool-bucket",\n            gcs_prefix="my-cool-prefix"\n        ),\n        "gcs": GCSResource(project="my-cool-project")\n    }\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_gcp.gcs import GCSPickleIOManager, GCSResource\n\n@job(\n    resource_defs={\n        "io_manager": GCSPickleIOManager(\n            gcs=GCSResource(project="my-cool-project"),\n            gcs_bucket="my-cool-bucket",\n            gcs_prefix="my-cool-prefix"\n        ),\n    }\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\n

File Manager (Experimental)\u00b6

\n
\n
\nclass dagster_gcp.GCSFileHandle(gcs_bucket, gcs_key)[source]\u00b6
\n

A reference to a file on GCS.

\n
\n\n
\n
\ndagster_gcp.GCSFileManagerResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project name

\n
\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

FileManager that provides abstract access to GCS.

\n
\n\n
\n
\n

GCS Compute Log Manager\u00b6

\n
\n
\nclass dagster_gcp.gcs.GCSComputeLogManager(bucket, local_dir=None, inst_data=None, prefix='dagster', json_credentials_envvar=None, upload_interval=None)[source]\u00b6
\n

Logs op compute function stdout and stderr to GCS.

\n

Users should not instantiate this class directly. Instead, use a YAML block in dagster.yaml\nsuch as the following:

\n
compute_logs:\n  module: dagster_gcp.gcs.compute_log_manager\n  class: GCSComputeLogManager\n  config:\n    bucket: "mycorp-dagster-compute-logs"\n    local_dir: "/tmp/cool"\n    prefix: "dagster-test-"\n    upload_interval: 30\n
\n
\n

There are more configuration examples in the instance documentation guide: https://docs.dagster.io/deployment/dagster-instance#compute-log-storage

\n
\n
Parameters:
\n
    \n
  • bucket (str) \u2013 The name of the GCS bucket to which to log.

  • \n
  • local_dir (Optional[str]) \u2013 Path to the local directory in which to stage logs. Default:\ndagster._seven.get_system_temp_directory().

  • \n
  • prefix (Optional[str]) \u2013 Prefix for the log file keys.

  • \n
  • json_credentials_envvar (Optional[str]) \u2013 Environment variable that contains the JSON with a private key\nand other credentials information. If this is set, GOOGLE_APPLICATION_CREDENTIALS will be ignored.\nCan be used when the private key cannot be used as a file.

  • \n
  • upload_interval \u2013 (Optional[int]): Interval in seconds to upload partial log files to GCS. By default, will only upload when the capture is complete.

  • \n
  • inst_data (Optional[ConfigurableClassData]) \u2013 Serializable representation of the compute\nlog manager when instantiated from config.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Dataproc\u00b6

\n
\n

Dataproc Resource\u00b6

\n
\n
\ndagster_gcp.DataprocResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project_id (dagster.StringSource):
\n

Required. Project ID for the project which the client acts on behalf of. Will be passed when creating a dataset/job.

\n
\n
region (dagster.StringSource):
\n

The GCP region.

\n
\n
cluster_name (dagster.StringSource):
\n

Required. The cluster name. Cluster names within a project must be unique. Names of deleted clusters can be reused.

\n
\n
cluster_config_yaml_path (Union[dagster.StringSource, None], optional):
\n

Full path to a YAML file containing cluster configuration. See https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for configuration options. Only one of cluster_config_yaml_path, cluster_config_json_path, or cluster_config_dict may be provided.

\n
\n
cluster_config_json_path (Union[dagster.StringSource, None], optional):
\n

Full path to a JSON file containing cluster configuration. See https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for configuration options. Only one of cluster_config_yaml_path, cluster_config_json_path, or cluster_config_dict may be provided.

\n
\n
cluster_config_dict (Union[dict, None], optional):
\n

Python dictionary containing cluster configuration. See https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig for configuration options. Only one of cluster_config_yaml_path, cluster_config_json_path, or cluster_config_dict may be provided.

\n
\n
\n

Resource for connecting to a Dataproc cluster.

\n

Example

\n
@asset\ndef my_asset(dataproc: DataprocResource):\n    with dataproc.get_client() as client:\n        # client is a dagster_gcp.DataprocClient\n        ...\n
\n
\n
\n\n
\n
\n

Dataproc Ops\u00b6

\n
\n
\ndagster_gcp.dataproc_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_timeout_in_seconds (Int, optional):
\n

Optional. Maximum time in seconds to wait for the job being\ncompleted. Default is set to 1200 seconds (20 minutes).

\n

Default Value: 1200

\n
\n
job_config (strict dict):
\n
\nConfig Schema:
\n
job (strict dict, optional):
\n

A Cloud Dataproc job resource.

\n
\nConfig Schema:
\n
status (strict dict, optional):
\n

Cloud Dataproc job status.

\n
\n
placement (strict dict, optional):
\n

Cloud Dataproc job config.

\n
\nConfig Schema:
\n
clusterName (String, optional):
\n

Required. The name of the cluster where the job will\nbe submitted.

\n
\n
\n
\n
scheduling (strict dict, optional):
\n

Job scheduling options.

\n
\nConfig Schema:
\n
maxFailuresPerHour (Int, optional):
\n

Optional. Maximum number of times per hour a driver\nmay be restarted as a result of driver terminating with non-zero\ncode before job is reported failed. A job may be reported as\nthrashing if driver exits with non-zero code 4 times within 10\nminute window. Maximum value is 10.

\n
\n
\n
\n
pigJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Pig\n(https://pig.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
queryFileUri (String, optional):
\n

The HCFS URI of the script that contains the Pig\nqueries.

\n
\n
queryList (strict dict, optional):
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional):
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Pig Client and Hadoop MapReduce (MR) tasks. Can\ncontain Pig UDFs.

\n
\n
scriptVariables (permissive dict, optional):
\n

Optional. Mapping of query variable names to values\n(equivalent to the Pig command: name=[value]).

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Pig. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site.xml, /etc/pig/conf/pig.properties, and\nclasses in user code.

\n
\n
continueOnFailure (Bool, optional):
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
\n
\n
hiveJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Hive\n(https://hive.apache.org/) queries on YARN.

\n
\nConfig Schema:
\n
continueOnFailure (Bool, optional):
\n

Optional. Whether to continue executing queries if a\nquery fails. The default value is false. Setting to true can be\nuseful when executing independent parallel queries.

\n
\n
queryFileUri (String, optional):
\n

The HCFS URI of the script that contains Hive\nqueries.

\n
\n
queryList (strict dict, optional):
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional):
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATH of the Hive server and Hadoop MapReduce (MR) tasks. Can\ncontain Hive SerDes and UDFs.

\n
\n
scriptVariables (permissive dict, optional):
\n

Optional. Mapping of query variable names to values\n(equivalent to the Hive command: SET name=\u201dvalue\u201d;).

\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names and values,\nused to configure Hive. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/hadoop/conf/*-site.xml, /etc/hive/conf/hive-site.xml,\nand classes in user code.

\n
\n
\n
\n
labels (permissive dict, optional):
\n

Optional. The labels to associate with this job. Label keys must\ncontain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). Label values may be empty, but, if\npresent, must contain 1 to 63 characters, and must conform to RFC 1035\n(https://www.ietf.org/rfc/rfc1035.txt). No more than 32 labels can be associated\nwith a job.

\n
\n
sparkJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Spark\n(http://spark.apache.org/) applications on YARN.

\n
\nConfig Schema:
\n
archiveUris (List[String], optional):
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Spark drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
mainJarFileUri (String, optional):
\n

The HCFS URI of the jar file that contains the main\nclass.

\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Spark driver and tasks.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Spark. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/spark/conf/spark-defaults.conf and classes in user code.

\n
\n
args (List[String], optional):
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as --conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional):
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Spark drivers and distributed tasks. Useful for\nnaively parallel tasks.

\n
\n
mainClass (String, optional):
\n

The name of the driver\u2019s main class. The jar file\nthat contains the class must be in the default CLASSPATH or\nspecified in jar_file_uris.

\n
\n
\n
\n
sparkSqlJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Spark SQL\n(http://spark.apache.org/sql/) queries.

\n
\nConfig Schema:
\n
queryList (strict dict, optional):
\n

A list of queries to run on a cluster.

\n
\nConfig Schema:
\n
queries (List[String], optional):
\n

Required. The queries to execute. You do\nnot need to terminate a query with a semicolon. Multiple\nqueries can be specified in one string by separating\neach with a semicolon. Here is an example of a Cloud\nDataproc API snippet that uses a QueryList to specify a\nHiveJob: \u201chiveJob\u201d: { \u201cqueryList\u201d: { \u201cqueries\u201d: [\n\u201cquery1\u201d, \u201cquery2\u201d, \u201cquery3;query4\u201d, ]\n} }

\n
\n
\n
\n
queryFileUri (String, optional):
\n

The HCFS URI of the script that contains SQL\nqueries.

\n
\n
scriptVariables (permissive dict, optional):
\n

Optional. Mapping of query variable names to values\n(equivalent to the Spark SQL command: SET name=\u201dvalue\u201d;).

\n
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to be added to the\nSpark CLASSPATH.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Spark SQL\u2019s SparkConf. Properties that conflict with\nvalues set by the Cloud Dataproc API may be overwritten.

\n
\n
\n
\n
pysparkJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache PySpark\n(https://spark.apache.org/docs/0.9.0/python-programming-guide.html) applications\non YARN.

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional):
\n

Optional. HCFS URIs of jar files to add to the\nCLASSPATHs of the Python driver and tasks.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure PySpark. Properties that conflict with values set by\nthe Cloud Dataproc API may be overwritten. Can include properties\nset in /etc/spark/conf/spark-defaults.conf and classes in user\ncode.

\n
\n
args (List[String], optional):
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as --conf, that can be set as job\nproperties, since a collision may occur that causes an incorrect job\nsubmission.

\n
\n
fileUris (List[String], optional):
\n

Optional. HCFS URIs of files to be copied to the\nworking directory of Python drivers and distributed tasks. Useful\nfor naively parallel tasks.

\n
\n
pythonFileUris (List[String], optional):
\n

Optional. HCFS file URIs of Python files to pass to\nthe PySpark framework. Supported file types: .py, .egg, and\n.zip.

\n
\n
mainPythonFileUri (String, optional):
\n

Required. The HCFS URI of the main Python file to use\nas the driver. Must be a .py file.

\n
\n
archiveUris (List[String], optional):
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Python drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, and .zip.

\n
\n
\n
\n
reference (strict dict, optional):
\n

Encapsulates the full scoping used to reference a job.

\n
\nConfig Schema:
\n
projectId (String, optional):
\n

Required. The ID of the Google Cloud Platform project\nthat the job belongs to.

\n
\n
jobId (String, optional):
\n

Optional. The job ID, which must be unique within the\nproject. The ID must contain only letters (a-z, A-Z), numbers (0-9),\nunderscores (_), or hyphens (-). The maximum length is 100\ncharacters. If not specified by the caller, the job ID will be\nprovided by the server.

\n
\n
\n
\n
hadoopJob (strict dict, optional):
\n

A Cloud Dataproc job for running Apache Hadoop MapReduce\n(https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html)\njobs on Apache Hadoop YARN\n(https://hadoop.apache.org/docs/r2.7.1/hadoop-yarn/hadoop-yarn-site/YARN.html).

\n
\nConfig Schema:
\n
jarFileUris (List[String], optional):
\n

Optional. Jar file URIs to add to the CLASSPATHs of\nthe Hadoop driver and tasks.

\n
\n
loggingConfig (strict dict, optional):
\n

The runtime logging config of the job.

\n
\nConfig Schema:
\n
driverLogLevels (permissive dict, optional):
\n

The per-package log levels for the\ndriver. This may include \u201croot\u201d package name to\nconfigure rootLogger. Examples: \u2018com.google = FATAL\u2019,\n\u2018root = INFO\u2019, \u2018org.apache = DEBUG\u2019

\n
\n
\n
\n
properties (permissive dict, optional):
\n

Optional. A mapping of property names to values, used\nto configure Hadoop. Properties that conflict with values set by the\nCloud Dataproc API may be overwritten. Can include properties set in\n/etc/hadoop/conf/*-site and classes in user code.

\n
\n
args (List[String], optional):
\n

Optional. The arguments to pass to the driver. Do not\ninclude arguments, such as -libjars or -Dfoo=bar, that can be set as\njob properties, since a collision may occur that causes an incorrect\njob submission.

\n
\n
fileUris (List[String], optional):
\n

Optional. HCFS (Hadoop Compatible Filesystem) URIs of\nfiles to be copied to the working directory of Hadoop drivers and\ndistributed tasks. Useful for naively parallel tasks.

\n
\n
mainClass (String, optional):
\n

The name of the driver\u2019s main class. The jar file\ncontaining the class must be in the default CLASSPATH or specified\nin jar_file_uris.

\n
\n
archiveUris (List[String], optional):
\n

Optional. HCFS URIs of archives to be extracted in\nthe working directory of Hadoop drivers and tasks. Supported file\ntypes: .jar, .tar, .tar.gz, .tgz, or .zip.

\n
\n
mainJarFileUri (String, optional):
\n

The HCFS URI of the jar file containing the main\nclass. Examples:\n\u2018gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar\u2019\n\u2018hdfs:/tmp/test-samples/custom-wordcount.jar\u2019\n\u2018file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar\u2019

\n
\n
\n
\n
\n
\n
projectId (dagster.StringSource):
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource):
\n

\n
\n
\n
job_scoped_cluster (Bool, optional):
\n

Whether to create a cluster or use an existing cluster.

\n

Default Value: True

\n
\n
\n
\n\n
\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_gcp.ConfigurablePickledObjectGCSIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs (Union[Any, None], optional):
\n

\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use GCSPickleIOManager instead.\n \n

\n

Renamed to GCSPickleIOManager. See GCSPickleIOManager for documentation.

\n
\n\n
\n
\ndagster_gcp.bigquery_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project ID for the project which the client acts on behalf of. Will be passed when creating a dataset / job. If not passed, falls back to the default inferred from the environment.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

Default location for jobs / datasets / tables.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
\n
\n\n
\n
\ndagster_gcp.build_bigquery_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_AUTH_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Builds an I/O manager definition that reads inputs from and writes outputs to BigQuery.

\n
\n
Parameters:
\n
    \n
  • type_handlers (Sequence[DbTypeHandler]) \u2013 Each handler defines how to translate between\nslices of BigQuery tables and an in-memory type - e.g. a Pandas DataFrame.\nIf only one DbTypeHandler is provided, it will be used as the default_load_type.

  • \n
  • default_load_type (Type) \u2013 When an input has no type annotation, load it as this type.

  • \n
\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp import build_bigquery_io_manager\nfrom dagster_bigquery_pandas import BigQueryPandasTypeHandler\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\nbigquery_io_manager = build_bigquery_io_manager([BigQueryPandasTypeHandler()])\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": bigquery_io_manager.configured({\n            "project" : {"env": "GCP_PROJECT"}\n        })\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the dataset configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset my_table had the key prefix ["gcp", "bigquery", "my_dataset"], the dataset my_dataset will be\nused. For ops, the dataset can be specified by including a schema entry in output metadata. If schema is\nnot provided via config or on the asset/op, public will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the gcp_credentials configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\ndagster_gcp.gcs_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (Union[dagster.StringSource, None], optional):
\n

Project name

\n
\n
\n
\n\n
\n
\ndagster_gcp.gcs_pickle_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gcs (Union[Any, None], optional):
\n

\n
gcs_bucket (dagster.StringSource):
\n

GCS bucket to store files

\n
\n
gcs_prefix (dagster.StringSource, optional):
\n

Prefix to add to all file paths

\n

Default Value: \u2018dagster\u2019

\n
\n
\n

Persistent IO manager using GCS for storage.

\n

Serializes objects via pickling. Suitable for object storage for distributed executors, so long\nas each execution node has network connectivity and credentials for GCS and the backing bucket.

\n

Assigns each op output to a unique filepath containing run ID, step key, and output name.\nAssigns each asset to a single filesystem path, at <base_dir>/<asset_key>. If the asset key\nhas multiple components, the final component is used as the name of the file, and the preceding\ncomponents as parent directories under the base_dir.

\n

Subsequent materializations of an asset will overwrite previous materializations of that asset.\nWith a base directory of /my/base/path, an asset with key\nAssetKey(["one", "two", "three"]) would be stored in a file called three in a directory\nwith path /my/base/path/one/two/.

\n

Example usage:

\n
    \n
  1. Attach this IO manager to a set of assets.

  2. \n
\n
from dagster import Definitions, asset\nfrom dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n@asset\ndef asset1():\n    # create df ...\n    return df\n\n@asset\ndef asset2(asset1):\n    return asset1[:5]\n\ndefs = Definitions(\n    assets=[asset1, asset2],\n    resources={\n            "io_manager": gcs_pickle_io_manager.configured(\n                {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n            ),\n            "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n        },\n)\n
\n
\n
    \n
  1. Attach this IO manager to your job to make it available to your ops.

  2. \n
\n
from dagster import job\nfrom dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource\n\n@job(\n    resource_defs={\n        "io_manager": gcs_pickle_io_manager.configured(\n            {"gcs_bucket": "my-cool-bucket", "gcs_prefix": "my-cool-prefix"}\n        ),\n        "gcs": gcs_resource.configured({"project": "my-cool-project"}),\n    },\n)\ndef my_job():\n    ...\n
\n
\n
\n\n
\n
\ndagster_gcp.gcs_file_manager ResourceDefinition[source]\u00b6
\n

FileManager that provides abstract access to GCS.

\n

Implements the FileManager API.

\n
\n\n
\n
\ndagster_gcp.dataproc_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
projectId (dagster.StringSource):
\n

Required. Project ID for the project which the client acts on behalf of. Will\nbe passed when creating a dataset / job. If not passed, falls back to the default inferred\nfrom the environment.

\n
\n
region (dagster.StringSource):
\n

\n
clusterName (dagster.StringSource):
\n

Required. The cluster name. Cluster names within a project must be unique.\nNames of deleted clusters can be reused.

\n
\n
cluster_config (strict dict, optional):
\n

The cluster config.

\n
\nConfig Schema:
\n
masterConfig (strict dict, optional):
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional):
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional):
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional):
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional):
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional):
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional):
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional):
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional):
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional):
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional):
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2\nAuto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
secondaryWorkerConfig (strict dict, optional):
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional):
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional):
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional):
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional):
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional):
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional):
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional):
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional):
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional):
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional):
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2\nAuto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
encryptionConfig (strict dict, optional):
\n

Encryption settings for the cluster.

\n
\nConfig Schema:
\n
gcePdKmsKeyName (String, optional):
\n

Optional. The Cloud KMS key name to use for PD disk\nencryption for all instances in the cluster.

\n
\n
\n
\n
securityConfig (strict dict, optional):
\n

Security related configuration, including Kerberos.

\n
\nConfig Schema:
\n
kerberosConfig (strict dict, optional):
\n

Specifies Kerberos related configuration.

\n
\nConfig Schema:
\n
truststorePasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided truststore. For the self-signed certificate,\nthis password is generated by Dataproc.

\n
\n
enableKerberos (Bool, optional):
\n

Optional. Flag to indicate whether to\nKerberize the cluster.

\n
\n
truststoreUri (String, optional):
\n

Optional. The Cloud Storage URI of the\ntruststore file used for SSL encryption. If not\nprovided, Dataproc will provide a self-signed\ncertificate.

\n
\n
crossRealmTrustRealm (String, optional):
\n

Optional. The remote realm the Dataproc\non-cluster KDC will trust, should the user enable cross\nrealm trust.

\n
\n
rootPrincipalPasswordUri (String, optional):
\n

Required. The Cloud Storage URI of a KMS\nencrypted file containing the root principal\npassword.

\n
\n
kmsKeyUri (String, optional):
\n

Required. The uri of the KMS key used to\nencrypt various sensitive files.

\n
\n
crossRealmTrustKdc (String, optional):
\n

Optional. The KDC (IP or hostname) for\nthe remote trusted realm in a cross realm trust\nrelationship.

\n
\n
crossRealmTrustSharedPasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the shared password between\nthe on-cluster Kerberos realm and the remote trusted\nrealm, in a cross realm trust relationship.

\n
\n
tgtLifetimeHours (Int, optional):
\n

Optional. The lifetime of the ticket\ngranting ticket, in hours. If not specified, or user\nspecifies 0, then default value 10 will be used.

\n
\n
keystoreUri (String, optional):
\n

Optional. The Cloud Storage URI of the\nkeystore file used for SSL encryption. If not provided,\nDataproc will provide a self-signed certificate.

\n
\n
keyPasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided key. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
keystorePasswordUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the password to the user\nprovided keystore. For the self-signed certificate, this\npassword is generated by Dataproc.

\n
\n
crossRealmTrustAdminServer (String, optional):
\n

Optional. The admin server (IP or\nhostname) for the remote trusted realm in a cross realm\ntrust relationship.

\n
\n
kdcDbKeyUri (String, optional):
\n

Optional. The Cloud Storage URI of a KMS\nencrypted file containing the master key of the KDC\ndatabase.

\n
\n
\n
\n
\n
\n
initializationActions (List[strict dict], optional):
\n

Optional. Commands to execute on each node after config is\ncompleted. By default, executables are run on master and all worker nodes. You\ncan test a node\u2019s role metadata to run an executable on a master or worker\nnode, as shown below using curl (you can also use wget): ROLE=$(curl -H\nMetadata-Flavor:Google\nhttp://metadata/computeMetadata/v1/instance/attributes/dataproc-role) if [[\n\u201c${ROLE}\u201d == \u2018Master\u2019 ]]; then \u2026 master specific actions \u2026 else \u2026\nworker specific actions \u2026 fi

\n
\n
configBucket (String, optional):
\n

Optional. A Google Cloud Storage bucket used to stage job\ndependencies, config files, and job driver console output. If you do not specify\na staging bucket, Cloud Dataproc will determine a Cloud Storage location (US,\nASIA, or EU) for your cluster\u2019s staging bucket according to the Google Compute\nEngine zone where your cluster is deployed, and then create and manage this\nproject-level, per-location bucket (see Cloud Dataproc staging bucket).

\n
\n
workerConfig (strict dict, optional):
\n

Optional. The config settings for Compute Engine resources in an\ninstance group, such as a master or worker group.

\n
\nConfig Schema:
\n
accelerators (List[strict dict], optional):
\n

Optional. The Compute Engine accelerator\nconfiguration for these instances. Beta Feature: This feature is\nstill under development. It may be changed before final release.

\n
\n
numInstances (Int, optional):
\n

Optional. The number of VM instances in the instance\ngroup. For master instance groups, must be set to 1.

\n
\n
diskConfig (strict dict, optional):
\n

Specifies the config of disk options for a group of\nVM instances.

\n
\nConfig Schema:
\n
numLocalSsds (Int, optional):
\n

Optional. Number of attached SSDs, from 0\nto 4 (default is 0). If SSDs are not attached, the boot\ndisk is used to store runtime logs and HDFS\n(https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html)\ndata. If one or more SSDs are attached, this runtime\nbulk data is spread across them, and the boot disk\ncontains only basic config and installed binaries.

\n
\n
bootDiskSizeGb (Int, optional):
\n

Optional. Size in GB of the boot disk\n(default is 500GB).

\n
\n
bootDiskType (String, optional):
\n

Optional. Type of the boot disk (default\nis \u201cpd-standard\u201d). Valid values: \u201cpd-ssd\u201d (Persistent\nDisk Solid State Drive) or \u201cpd-standard\u201d (Persistent\nDisk Hard Disk Drive).

\n
\n
\n
\n
managedGroupConfig (strict dict, optional):
\n

Specifies the resources used to actively manage an\ninstance group.

\n
\n
isPreemptible (Bool, optional):
\n

Optional. Specifies that this instance group contains\npreemptible instances.

\n
\n
imageUri (String, optional):
\n

Optional. The Compute Engine image resource used for\ncluster instances. It can be specified or may be inferred from\nSoftwareConfig.image_version.

\n
\n
machineTypeUri (String, optional):
\n

Optional. The Compute Engine machine type used for\ncluster instances. A full URL, partial URI, or short name are valid.\nExamples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nprojects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2\nn1-standard-2\nAuto Zone Exception: If you are using the Cloud\nDataproc Auto Zone Placement feature, you must use the short name of\nthe machine type resource, for example, n1-standard-2.

\n
\n
\n
\n
gceClusterConfig (strict dict, optional):
\n

Common config settings for resources of Compute Engine cluster\ninstances, applicable to all instances in the cluster.

\n
\nConfig Schema:
\n
networkUri (String, optional):
\n

Optional. The Compute Engine network to be used for\nmachine communications. Cannot be specified with subnetwork_uri. If\nneither network_uri nor subnetwork_uri is specified, the \u201cdefault\u201d\nnetwork of the project is used, if it exists. Cannot be a \u201cCustom\nSubnet Network\u201d (see Using Subnetworks for more information). A full\nURL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/global/default\nprojects/[project_id]/regions/global/default default

\n
\n
zoneUri (String, optional):
\n

Optional. The zone where the Compute Engine cluster\nwill be located. On a create request, it is required in the \u201cglobal\u201d\nregion. If omitted in a non-global Cloud Dataproc region, the\nservice will pick a zone in the corresponding Compute Engine region.\nOn a get request, zone will always be present. A full URL, partial\nURI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/zones/[zone]\nprojects/[project_id]/zones/[zone] us-central1-f

\n
\n
metadata (permissive dict, optional):
\n

The Compute Engine metadata entries to add to all\ninstances (see Project and instance metadata\n(https://cloud.google.com/compute/docs/storing-retrieving-metadata#project_and_instance_metadata)).

\n
\n
internalIpOnly (Bool, optional):
\n

Optional. If true, all instances in the cluster will\nonly have internal IP addresses. By default, clusters are not\nrestricted to internal IP addresses, and will have ephemeral\nexternal IP addresses assigned to each instance. This\ninternal_ip_only restriction can only be enabled for subnetwork\nenabled networks, and all off-cluster dependencies must be\nconfigured to be accessible without external IP addresses.

\n
\n
serviceAccountScopes (List[String], optional):
\n

Optional. The URIs of service account scopes to be\nincluded in Compute Engine instances. The following base set of\nscopes is always included:\nhttps://www.googleapis.com/auth/cloud.useraccounts.readonly\nhttps://www.googleapis.com/auth/devstorage.read_write\nhttps://www.googleapis.com/auth/logging.write\nIf no scopes are\nspecified, the following defaults are also provided:\nhttps://www.googleapis.com/auth/bigquery\nhttps://www.googleapis.com/auth/bigtable.admin.table\nhttps://www.googleapis.com/auth/bigtable.data\nhttps://www.googleapis.com/auth/devstorage.full_control

\n
\n
tags (List[String], optional):
\n

The Compute Engine tags to add to all instances (see\nTagging instances).

\n
\n
serviceAccount (String, optional):
\n

Optional. The service account of the instances.\nDefaults to the default Compute Engine service account. Custom\nservice accounts need permissions equivalent to the following IAM\nroles: roles/logging.logWriter roles/storage.objectAdmin (see\nhttps://cloud.google.com/compute/docs/access/service-accounts#custom_service_accounts\nfor more information). Example:\n[account_id]@[project_id].iam.gserviceaccount.com

\n
\n
subnetworkUri (String, optional):
\n

Optional. The Compute Engine subnetwork to be used\nfor machine communications. Cannot be specified with network_uri. A\nfull URL, partial URI, or short name are valid. Examples:\nhttps://www.googleapis.com/compute/v1/projects/[project_id]/regions/us-east1/subnetworks/sub0\nprojects/[project_id]/regions/us-east1/subnetworks/sub0 sub0

\n
\n
\n
\n
softwareConfig (strict dict, optional):
\n

Specifies the selection and config of software inside the\ncluster.

\n
\nConfig Schema:
\n
properties (permissive dict, optional):
\n

Optional. The properties to set on daemon config\nfiles. Property keys are specified in prefix:property format, for\nexample core:hadoop.tmp.dir. The following are supported prefixes\nand their mappings: capacity-scheduler: capacity-scheduler.xml core:\ncore-site.xml distcp: distcp-default.xml hdfs: hdfs-site.xml hive:\nhive-site.xml mapred: mapred-site.xml pig: pig.properties spark:\nspark-defaults.conf yarn: yarn-site.xml\nFor more information, see\nCluster properties.

\n
\n
optionalComponents (List[Component], optional):
\n

The set of optional components to activate on the\ncluster.

\n
\n
imageVersion (String, optional):
\n

Optional. The version of software inside the cluster.\nIt must be one of the supported Cloud Dataproc Versions, such as\n\u201c1.2\u201d (including a subminor version, such as \u201c1.2.29\u201d), or the\n\u201cpreview\u201d version. If unspecified, it defaults to the latest Debian\nversion.

\n
\n
\n
\n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-gcp-pandas/", "title": "GCP + Pandas (dagster-gcp-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-docker/", "title": "Orchestration on Docker"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp-pandas", "GCP + Pandas (dagster-gcp-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-docker", "Orchestration on Docker", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp.rst.txt", "title": "GCP (dagster-gcp)", "toc": "\n"}, "dagster-gcp-pandas": {"alabaster_version": "0.7.13", "body": "
\n

GCP + Pandas (dagster-gcp-pandas)\u00b6

\n
\n

Google BigQuery\u00b6

\n

This library provides an integration with the BigQuery database and Pandas data processing library.

\n

Related Guides:

\n\n
\n
\ndagster_gcp_pandas.BigQueryPandasIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pandas import BigQueryPandasIOManager\nfrom dagster import Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": BigQueryPandasIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\nclass dagster_gcp_pandas.BigQueryPandasTypeHandler[source]\u00b6
\n

Plugin for the BigQuery I/O Manager that can store and load Pandas DataFrames as BigQuery tables.

\n

Examples

\n
from dagster_gcp import BigQueryIOManager\nfrom dagster_gcp_pandas import BigQueryPandasTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MyBigQueryIOManager(BigQueryIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [BigQueryPandasTypeHandler()]\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n
\n\n
\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_gcp_pandas.bigquery_pandas_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes pandas DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pandas import bigquery_pandas_io_manager\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": bigquery_pandas_io_manager.configured({\n            "project" : {"env": "GCP_PROJECT"}\n        })\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp-pandas", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-gcp-pyspark/", "title": "GCP + PySpark (dagster-gcp-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp/", "title": "GCP (dagster-gcp)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-gcp-pyspark", "GCP + PySpark (dagster-gcp-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp", "GCP (dagster-gcp)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp-pandas.rst.txt", "title": "GCP + Pandas (dagster-gcp-pandas)", "toc": "\n"}, "dagster-gcp-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

GCP + PySpark (dagster-gcp-pyspark)\u00b6

\n
\n

Google BigQuery\u00b6

\n

This library provides an integration with the BigQuery database and PySpark data processing library.

\n

Related Guides:

\n\n
\n
\ndagster_gcp_pyspark.BigQueryPySparkIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pyspark import BigQueryPySparkIOManager\nfrom dagster import Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": BigQueryPySparkIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n
\nclass dagster_gcp_pyspark.BigQueryPySparkTypeHandler[source]\u00b6
\n

Plugin for the BigQuery I/O Manager that can store and load PySpark DataFrames as BigQuery tables.

\n

Examples

\n
from dagster_gcp import BigQueryIOManager\nfrom dagster_gcp_pyspark import BigQueryPySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MyBigQueryIOManager(BigQueryIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [BigQueryPySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_dataset"]  # my_dataset will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MyBigQueryIOManager(project=EnvVar("GCP_PROJECT"))\n    }\n)\n
\n
\n
\n\n
\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_gcp_pyspark.bigquery_pyspark_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
project (dagster.StringSource):
\n

The GCP project to use.

\n
\n
dataset (Union[dagster.StringSource, None], optional):
\n

Name of the BigQuery dataset to use. If not provided, the last prefix before the asset name will be used.

\n
\n
location (Union[dagster.StringSource, None], optional):
\n

The GCP location. Note: When using PySpark DataFrames, the default location of the project will be used. A custom location can be specified in your SparkSession configuration.

\n
\n
gcp_credentials (Union[dagster.StringSource, None], optional):
\n

GCP authentication credentials. If provided, a temporary file will be created with the credentials and GOOGLE_APPLICATION_CREDENTIALS will be set to the temporary file. To avoid issues with newlines in the keys, you must base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n
temporary_gcs_bucket (Union[dagster.StringSource, None], optional):
\n

When using PySpark DataFrames, optionally specify a temporary GCS bucket to store data. If not provided, data will be directly written to BigQuery.

\n
\n
timeout (Union[Float, None], optional):
\n

When using Pandas DataFrames, optionally specify a timeout for the BigQuery queries (loading and reading from tables).

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to BigQuery.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_gcp_pyspark import bigquery_pyspark_io_manager\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_dataset"]  # will be used as the dataset in BigQuery\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": bigquery_pyspark_io_manager.configured({\n            "project" : {"env": "GCP_PROJECT"}\n        })\n    }\n)\n
\n
\n

You can tell Dagster in which dataset to create tables by setting the \u201cdataset\u201d configuration value.\nIf you do not provide a dataset as configuration to the I/O manager, Dagster will determine a dataset based\non the assets and ops using the I/O Manager. For assets, the dataset will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the dataset. For example,\nif the asset \u201cmy_table\u201d had the key prefix [\u201cgcp\u201d, \u201cbigquery\u201d, \u201cmy_dataset\u201d], the dataset \u201cmy_dataset\u201d will be\nused. For ops, the dataset can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the dataset.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_dataset"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_dataset.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n

If you cannot upload a file to your Dagster deployment, or otherwise cannot\nauthenticate with GCP\nvia a standard method, you can provide a service account key as the \u201cgcp_credentials\u201d configuration.\nDagster will store this key in a temporary file and set GOOGLE_APPLICATION_CREDENTIALS to point to the file.\nAfter the run completes, the file will be deleted, and GOOGLE_APPLICATION_CREDENTIALS will be\nunset. The key must be base64 encoded to avoid issues with newlines in the keys. You can retrieve\nthe base64 encoded key with this shell command: cat $GOOGLE_APPLICATION_CREDENTIALS | base64

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-gcp-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp-pandas/", "title": "GCP + Pandas (dagster-gcp-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp-pandas", "GCP + Pandas (dagster-gcp-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-gcp-pyspark.rst.txt", "title": "GCP + PySpark (dagster-gcp-pyspark)", "toc": "\n"}, "dagster-ge": {"alabaster_version": "0.7.13", "body": "
\n

Great Expectations (dagster-ge)\u00b6

\n
\n
\ndagster_ge.ge_validation_op_factory(name, datasource_name, suite_name, validation_operator_name=None, input_dagster_type=<dagster._core.types.dagster_type.DagsterType object>, batch_kwargs=None)[source]\u00b6
\n

Generates ops for interacting with GE.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 the name of the op

  • \n
  • datasource_name (str) \u2013 the name of your DataSource, see your great_expectations.yml

  • \n
  • suite_name (str) \u2013 the name of your expectation suite, see your great_expectations.yml

  • \n
  • validation_operator_name (Optional[str]) \u2013 what validation operator to run \u2013 defaults to\nNone, which generates an ephemeral validator. If you want to save data docs, use\n\u2018action_list_operator\u2019.\nSee https://legacy.docs.greatexpectations.io/en/0.12.1/reference/core_concepts/validation_operators_and_actions.html#

  • \n
  • input_dagster_type (DagsterType) \u2013 the Dagster type used to type check the input to the op.\nDefaults to dagster_pandas.DataFrame.

  • \n
  • batch_kwargs (Optional[dict]) \u2013 overrides the batch_kwargs parameter when calling the\nge_data_context\u2019s get_batch method. Defaults to {\u201cdataset\u201d: dataset}, where\ndataset is the input to the generated op.

  • \n
\n
\n
Returns:
\n

An op that takes in a set of data and yields both an expectation with relevant metadata\nand an output with all the metadata (for user processing)

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ge", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-gcp-pyspark/", "title": "GCP + PySpark (dagster-gcp-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "N", "next"], ["sections/api/apidocs/libraries/dagster-gcp-pyspark", "GCP + PySpark (dagster-gcp-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ge.rst.txt", "title": "Great Expectations (dagster-ge)", "toc": "\n"}, "dagster-github": {"alabaster_version": "0.7.13", "body": "
\n

GitHub (dagster-github)\u00b6

\n

This library provides an integration with GitHub Apps, to support performing various automation\noperations within your GitHub repositories and with the tighter permission scopes that GitHub Apps\nallow for, compared with using a personal token.

\n

Presently, it provides a thin wrapper around the GitHub v4 GraphQL API.

\n

To use this integration, you\u2019ll first need to create a GitHub App for it.

\n
    \n
  1. Create App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/. You will end up with a private key and App ID, which will be used when configuring the\ndagster-github resource. Note you will need to grant your app the relevant permissions\nfor the API requests you want to make. For example, to post issues it will need read/write access\nfor the issues repository permission. More info on GitHub application permissions can be found\nhere

  2. \n
  3. Install App: Follow the instructions in\nhttps://developer.github.com/apps/quickstart-guides/setting-up-your-development-environment/#step-7-install-the-app-on-your-account

  4. \n
  5. Find your installation_id: You can pull this from the GitHub app administration page,\nhttps://github.com/apps/<app-name>/installations/<installation_id>. Note if your app is\ninstalled more than once you can also programmatically retrieve these IDs.

  6. \n
\n

Sharing your App ID and Installation ID is fine, but make sure that the Private Key for your app is\nstored securely.

\n
\n
\n

Posting Issues\u00b6

\n

Now, you can create issues in GitHub from Dagster with the GitHub resource:

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import GithubResource\n\n\n@op\ndef github_op(github: GithubResource):\n    github.get_client().create_issue(\n        repo_name='dagster',\n        repo_owner='dagster-io',\n        title='Dagster\\'s first github issue',\n        body='this open source thing seems like a pretty good idea',\n    )\n\n@job(resource_defs={\n     'github': GithubResource(\n         github_app_id=os.getenv('GITHUB_APP_ID'),\n         github_app_private_rsa_key=os.getenv('GITHUB_PRIVATE_KEY'),\n         github_installation_id=os.getenv('GITHUB_INSTALLATION_ID')\n )})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process()\n
\n
\n

Run the above code, and you\u2019ll see the issue appear in GitHub:\n

\n

GitHub enterprise users can provide their hostname in the run config. Provide github_hostname\nas part of your github config like below.

\n
GithubResource(\n    github_app_id=os.getenv('GITHUB_APP_ID'),\n    github_app_private_rsa_key=os.getenv('GITHUB_PRIVATE_KEY'),\n    github_installation_id=os.getenv('GITHUB_INSTALLATION_ID'),\n    github_hostname=os.getenv('GITHUB_HOSTNAME'),\n)\n
\n
\n

By provisioning GithubResource as a Dagster resource, you can post to GitHub from\nwithin any asset or op execution.

\n
\n
\n

Executing GraphQL queries\u00b6

\n
import os\n\nfrom dagster import job, op\nfrom dagster_github import GithubResource\n\n\n@op\ndef github_op(github: GithubResource):\n    github.get_client().execute(\n        query="""\n        query get_repo_id($repo_name: String!, $repo_owner: String!) {\n            repository(name: $repo_name, owner: $repo_owner) {\n                id\n            }\n        }\n        """,\n        variables={"repo_name": repo_name, "repo_owner": repo_owner},\n    )\n\n@job(resource_defs={\n     'github': GithubResource(\n         github_app_id=os.getenv('GITHUB_APP_ID'),\n         github_app_private_rsa_key=os.getenv('GITHUB_PRIVATE_KEY'),\n         github_installation_id=os.getenv('GITHUB_INSTALLATION_ID')\n )})\ndef github_job():\n    github_op()\n\ngithub_job.execute_in_process()\n
\n
\n
\n
\ndagster_github.GithubResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
github_app_id (dagster.IntSource):
\n

Github Application ID, for more info see https://developer.github.com/apps/

\n
\n
github_app_private_rsa_key (dagster.StringSource):
\n

Github Application Private RSA key text, for more info see https://developer.github.com/apps/

\n
\n
github_installation_id (Union[dagster.IntSource, None], optional):
\n

Github Application Installation ID, for more info see https://developer.github.com/apps/

\n
\n
github_hostname (Union[dagster.StringSource, None], optional):
\n

Github hostname. Defaults to api.github.com, for more info see https://developer.github.com/apps/

\n
\n
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_github.github_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
github_app_id (dagster.IntSource):
\n

Github Application ID, for more info see https://developer.github.com/apps/

\n
\n
github_app_private_rsa_key (dagster.StringSource):
\n

Github Application Private RSA key text, for more info see https://developer.github.com/apps/

\n
\n
github_installation_id (Union[dagster.IntSource, None], optional):
\n

Github Application Installation ID, for more info see https://developer.github.com/apps/

\n
\n
github_hostname (Union[dagster.StringSource, None], optional):
\n

Github hostname. Defaults to api.github.com, for more info see https://developer.github.com/apps/

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-github", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ge/", "title": "Great Expectations (dagster-ge)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "N", "next"], ["sections/api/apidocs/libraries/dagster-ge", "Great Expectations (dagster-ge)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-github.rst.txt", "title": "GitHub (dagster-github)", "toc": "\n"}, "dagster-graphql": {"alabaster_version": "0.7.13", "body": "
\n

GraphQL (dagster-graphql)\u00b6

\n
\n

Python Client\u00b6

\n
\n
\nclass dagster_graphql.DagsterGraphQLClient(hostname, port_number=None, transport=None, use_https=False, timeout=300, headers=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Official Dagster Python Client for GraphQL.

\n

Utilizes the gql library to dispatch queries over HTTP to a remote Dagster GraphQL Server

\n

As of now, all operations on this client are synchronous.

\n

Intended usage:

\n
client = DagsterGraphQLClient("localhost", port_number=3000)\nstatus = client.get_run_status(**SOME_RUN_ID**)\n
\n
\n
\n
Parameters:
\n
    \n
  • hostname (str) \u2013 Hostname for the Dagster GraphQL API, like localhost or\ndagster.YOUR_ORG_HERE.

  • \n
  • port_number (Optional[int]) \u2013 Port number to connect to on the host.\nDefaults to None.

  • \n
  • transport (Optional[Transport], optional) \u2013 A custom transport to use to connect to the\nGraphQL API with (e.g. for custom auth). Defaults to None.

  • \n
  • use_https (bool, optional) \u2013 Whether to use https in the URL connection string for the\nGraphQL API. Defaults to False.

  • \n
  • timeout (int) \u2013 Number of seconds before requests should time out. Defaults to 300.

  • \n
  • headers (Optional[Dict[str, str]]) \u2013 Additional headers to include in the request. To use\nthis client in Dagster Cloud, set the \u201cDagster-Cloud-Api-Token\u201d header to a user token\ngenerated in the Dagster Cloud UI.

  • \n
\n
\n
Raises:
\n

ConnectionError \u2013 if the client cannot connect to the host.

\n
\n
\n
\n
\nget_run_status(run_id)[source]\u00b6
\n

Get the status of a given Pipeline Run.

\n
\n
Parameters:
\n

run_id (str) \u2013 run id of the requested pipeline run.

\n
\n
Raises:
\n
\n
\n
Returns:
\n

returns a status Enum describing the state of the requested pipeline run

\n
\n
Return type:
\n

DagsterRunStatus

\n
\n
\n
\n\n
\n
\nreload_repository_location(repository_location_name)[source]\u00b6
\n

Reloads a Dagster Repository Location, which reloads all repositories in that repository location.

\n

This is useful in a variety of contexts, including refreshing the Dagster UI without restarting\nthe server.

\n
\n
Parameters:
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns:
\n

Object with information about the result of the reload request

\n
\n
Return type:
\n

ReloadRepositoryLocationInfo

\n
\n
\n
\n\n
\n
\nshutdown_repository_location(repository_location_name)[source]\u00b6
\n

Shuts down the server that is serving metadata for the provided repository location.

\n

This is primarily useful when you want the server to be restarted by the compute environment\nin which it is running (for example, in Kubernetes, the pod in which the server is running\nwill automatically restart when the server is shut down, and the repository metadata will\nbe reloaded)

\n
\n
Parameters:
\n

repository_location_name (str) \u2013 The name of the repository location

\n
\n
Returns:
\n

Object with information about the result of the shutdown request

\n
\n
Return type:
\n

ShutdownRepositoryLocationInfo

\n
\n
\n
\n\n
\n
\nsubmit_job_execution(job_name, repository_location_name=None, repository_name=None, run_config=None, tags=None, op_selection=None)[source]\u00b6
\n

Submits a job with attached configuration for execution.

\n
\n
Parameters:
\n
    \n
  • job_name (str) \u2013 The job\u2019s name

  • \n
  • repository_location_name (Optional[str]) \u2013 The name of the repository location where\nthe job is located. If omitted, the client will try to infer the repository location\nfrom the available options on the Dagster deployment. Defaults to None.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository where the job is located.\nIf omitted, the client will try to infer the repository from the available options\non the Dagster deployment. Defaults to None.

  • \n
  • run_config (Optional[Dict[str, Any]]) \u2013 This is the run config to execute the job with.\nNote that runConfigData is any-typed in the GraphQL type system. This type is used when passing in\nan arbitrary object for run config. However, it must conform to the constraints of the config\nschema for this job. If it does not, the client will throw a DagsterGraphQLClientError with a message of\nJobConfigValidationInvalid. Defaults to None.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 A set of tags to add to the job execution.

  • \n
\n
\n
Raises:
\n
    \n
  • DagsterGraphQLClientError("InvalidStepError", invalid_step_key) \u2013 the job has an invalid step

  • \n
  • DagsterGraphQLClientError("InvalidOutputError", body=error_object) \u2013 some solid has an invalid output within the job.\n The error_object is of type dagster_graphql.InvalidOutputErrorInfo.

  • \n
  • DagsterGraphQLClientError("RunConflict", message) \u2013 a DagsterRunConflict occurred during execution.\n This indicates that a conflicting job run already exists in run storage.

  • \n
  • DagsterGraphQLClientError("PipelineConfigurationInvalid", invalid_step_key) \u2013 the run_config is not in the expected format\n for the job

  • \n
  • DagsterGraphQLClientError("JobNotFoundError", message) \u2013 the requested job does not exist

  • \n
  • DagsterGraphQLClientError("PythonError", message) \u2013 an internal framework error occurred

  • \n
\n
\n
Returns:
\n

run id of the submitted pipeline run

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nexception dagster_graphql.DagsterGraphQLClientError(*args, body=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster_graphql.InvalidOutputErrorInfo(step_key, invalid_output_name)[source]\u00b6
\n

This class gives information about an InvalidOutputError from submitting a pipeline for execution\nfrom GraphQL.

\n
\n
Parameters:
\n
    \n
  • step_key (str) \u2013 key of the step that failed

  • \n
  • invalid_output_name (str) \u2013 the name of the invalid output from the given step

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationInfo(status, failure_type=None, message=None)[source]\u00b6
\n

This class gives information about the result of reloading\na Dagster repository location with a GraphQL mutation.

\n
\n
Parameters:
\n
    \n
  • status (ReloadRepositoryLocationStatus) \u2013 The status of the reload repository location mutation

  • \n
  • failure_type \u2013 (Optional[str], optional): the failure type if status == ReloadRepositoryLocationStatus.FAILURE.\nCan be one of ReloadNotSupported, RepositoryLocationNotFound, or RepositoryLocationLoadFailure. Defaults to None.

  • \n
  • message (Optional[str], optional) \u2013 the failure message/reason if\nstatus == ReloadRepositoryLocationStatus.FAILURE. Defaults to None.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_graphql.ReloadRepositoryLocationStatus(value)[source]\u00b6
\n

This enum describes the status of a GraphQL mutation to reload a Dagster repository location.

\n
\n
Parameters:
\n

Enum (str) \u2013 can be either ReloadRepositoryLocationStatus.SUCCESS\nor ReloadRepositoryLocationStatus.FAILURE.

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-graphql", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-wandb/", "title": "Weights & Biases (dagster-wandb)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagstermill/", "title": "Dagstermill"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-wandb", "Weights & Biases (dagster-wandb)", "N", "next"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-graphql.rst.txt", "title": "GraphQL (dagster-graphql)", "toc": "\n"}, "dagster-k8s": {"alabaster_version": "0.7.13", "body": "
\n

Kubernetes (dagster-k8s)\u00b6

\n

See also the Kubernetes deployment guide.

\n

This library contains utilities for running Dagster with Kubernetes. This includes a Python API\nallowing the webserver to launch runs as Kubernetes Jobs, as well as a Helm chart you can use as the basis\nfor a Dagster deployment on a Kubernetes cluster.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_k8s.K8sRunLauncher RunLauncher[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See:https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See:https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
instance_config_map (dagster.StringSource):
\n

The name of an existing Volume to mount into the pod in order to provide a ConfigMap for the Dagster instance. This Volume should contain a dagster.yaml with appropriate values for run storage, event log storage, etc.

\n
\n
postgres_password_secret (dagster.StringSource, optional):
\n

The name of the Kubernetes Secret where the postgres password can be retrieved. Will be mounted and supplied as an environment variable to the Job Pod. Secret must contain the key "postgresql-password" which will be exposed in the Job environment as the environment variable DAGSTER_PG_PASSWORD.

\n
\n
dagster_home (dagster.StringSource, optional):
\n

The location of DAGSTER_HOME in the Job container; this is where the dagster.yaml file will be mounted from the instance ConfigMap specified here. Defaults to /opt/dagster/dagster_home.

\n

Default Value: \u2018/opt/dagster/dagster_home\u2019

\n
\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
fail_pod_on_run_failure (Bool, optional):
\n

Whether the launched Kubernetes Jobs and Pods should fail if the Dagster run fails

\n
\n
run_k8s_config (strict dict, optional):
\n

Raw Kubernetes configuration for launched runs.

\n
\nConfig Schema:
\n
container_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
job_namespace (dagster.StringSource, optional):
\n

Default Value: \u2018default\u2019

\n
\n
\n

RunLauncher that starts a Kubernetes Job for each Dagster job run.

\n

Encapsulates each run in a separate, isolated invocation of dagster-graphql.

\n

You can configure a Dagster instance to use this RunLauncher by adding a section to your\ndagster.yaml like the following:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    service_account_name: your_service_account\n    job_image: my_project/dagster_image:latest\n    instance_config_map: dagster-instance\n    postgres_password_secret: dagster-postgresql-secret\n
\n
\n
\n\n
\n
\ndagster_k8s.k8s_job_executor ExecutorDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
job_image (Union[dagster.StringSource, None], optional):
\n

Docker image to use for launched Jobs. If this field is empty, the image that was used to originally load the Dagster repository will be used. (Ex: \u201cmycompany.com/dagster-k8s-image:latest\u201d).

\n
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See:https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See:https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
load_incluster_config (Bool, optional):
\n

Whether or not the executor is running within a k8s cluster already. If\nthe job is using the K8sRunLauncher, the default value of this parameter will be\nthe same as the corresponding value on the run launcher.\nIf True, we assume the executor is running within the target cluster and load config\nusing kubernetes.config.load_incluster_config. Otherwise, we will use the k8s config\nspecified in kubeconfig_file (using kubernetes.config.load_kube_config) or fall\nback to the default kubeconfig.

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

Path to a kubeconfig file to use, if not using default kubeconfig. If\nthe job is using the K8sRunLauncher, the default value of this parameter will be\nthe same as the corresponding value on the run launcher.

\n
\n
job_namespace (dagster.StringSource, optional):
\n

\n
retries (selector, optional):
\n

Whether retries are enabled or not. By default, retries are enabled.

\n
\nDefault Value:
{\n    "enabled": {}\n}\n
\n
\n
\nConfig Schema:
\n
enabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
disabled (strict dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
max_concurrent (dagster.IntSource, optional):
\n

Limit on the number of pods that will run concurrently within the scope of a Dagster run. Note that this limit is per run, not global.

\n
\n
tag_concurrency_limits (List[strict dict], optional):
\n

A set of limits that are applied to steps with particular tags. If a value is set, the limit is applied to only that key-value pair. If no value is set, the limit is applied across all values of that key. If the value is set to a dict with applyLimitPerUniqueValue: true, the limit will apply to the number of unique values for that key. Note that these limits are per run, not global.

\n
\n
step_k8s_config (strict dict, optional):
\n

Raw Kubernetes configuration for each step launched by the executor.

\n
\nConfig Schema:
\n
container_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
pod_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_metadata (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
job_spec_config (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
\n
\n
\n

Executor which launches steps as Kubernetes Jobs.

\n

To use the k8s_job_executor, set it as the executor_def when defining a job:

\n
from dagster_k8s import k8s_job_executor\n\nfrom dagster import job\n\n@job(executor_def=k8s_job_executor)\ndef k8s_job():\n    pass\n
\n
\n

Then you can configure the executor with run config as follows:

\n
execution:\n  config:\n    job_namespace: 'some-namespace'\n    image_pull_policy: ...\n    image_pull_secrets: ...\n    service_account_name: ...\n    env_config_maps: ...\n    env_secrets: ...\n    env_vars: ...\n    job_image: ... # leave out if using userDeployments\n    max_concurrent: ...\n
\n
\n

max_concurrent limits the number of pods that will execute concurrently for one run. By default\nthere is no limit; the run will execute with as much parallelism as the DAG allows. Note that this is not a\nglobal limit.

\n

Configuration set on the Kubernetes Jobs and Pods created by the K8sRunLauncher will also be\nset on Kubernetes Jobs and Pods created by the k8s_job_executor.

\n

Configuration set using tags on a @job will only apply to the run level. For configuration\nto apply at each step it must be set using tags for each @op.

\n
\n\n
\n

Ops\u00b6

\n
\n
\ndagster_k8s.k8s_job_op = <dagster._core.definitions.op_definition.OpDefinition object>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
image_pull_policy (Union[dagster.StringSource, None], optional):
\n

Image pull policy to set on launched Pods.

\n
\n
image_pull_secrets (Union[List[strict dict], None], optional):
\n

Specifies that Kubernetes should get the credentials from the Secrets named in this list.

\n
\n
service_account_name (Union[dagster.StringSource, None], optional):
\n

The name of the Kubernetes service account under which to run.

\n
\n
env_config_maps (Union[List[dagster.StringSource], None], optional):
\n

A list of custom ConfigMapEnvSource names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/#define-an-environment-variable-for-a-container

\n
\n
env_secrets (Union[List[dagster.StringSource], None], optional):
\n

A list of custom Secret names from which to draw environment variables (using envFrom) for the Job. Default: []. See:https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
env_vars (Union[List[String], None], optional):
\n

A list of environment variables to inject into the Job. Each can be of the form KEY=VALUE or just KEY (in which case the value will be pulled from the current process). Default: []. See: https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#configure-all-key-value-pairs-in-a-secret-as-container-environment-variables

\n
\n
volume_mounts (List[permissive dict], optional):
\n

A list of volume mounts to include in the job\u2019s container. Default: []. See: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volumemount-v1-core

\n

Default Value: []

\n
\n
volumes (List[permissive dict], optional):
\n

A list of volumes to include in the Job\u2019s Pod. Default: []. For the many possible volume source types that can be included, see: https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#volume-v1-core

\n

Default Value: []

\n
\n
labels (permissive dict, optional):
\n

Labels to apply to all created pods. See: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels

\n
\n
resources (Union[strict dict, None], optional):
\n

Compute resource requirements for the container. See: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/

\n
\n
scheduler_name (Union[dagster.StringSource, None], optional):
\n

Use a custom Kubernetes scheduler for launched Pods. See:https://kubernetes.io/docs/tasks/extend-kubernetes/configure-multiple-schedulers/

\n
\n
security_context (permissive dict, optional):
\n

Security settings for the container. See:https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-capabilities-for-a-container

\n
\n
image (dagster.StringSource):
\n

The image in which to launch the k8s job.

\n
\n
command (List[String], optional):
\n

The command to run in the container within the launched k8s job.

\n
\n
args (List[String], optional):
\n

The args for the command for the container.

\n
\n
namespace (dagster.StringSource, optional):
\n

\n
load_incluster_config (Bool, optional):
\n

Set this value if you are running the launcher\nwithin a k8s cluster. If True, we assume the launcher is running within the target\ncluster and load config using kubernetes.config.load_incluster_config. Otherwise,\nwe will use the k8s config specified in kubeconfig_file (using\nkubernetes.config.load_kube_config) or fall back to the default kubeconfig.

\n

Default Value: True

\n
\n
kubeconfig_file (Union[String, None], optional):
\n

The kubeconfig file from which to load config. Defaults to using the default kubeconfig.

\n

Default Value: None

\n
\n
timeout (Int, optional):
\n

How long to wait for the job to succeed before raising an exception

\n
\n
container_config (permissive dict, optional):
\n

Raw k8s config for the k8s pod\u2019s main container (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#container-v1-core). Keys can be either snake_case or camelCase.

\n
\n
pod_template_spec_metadata (permissive dict, optional):
\n

Raw k8s config for the k8s pod\u2019s metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta). Keys can be either snake_case or camelCase.

\n
\n
pod_spec_config (permissive dict, optional):
\n

Raw k8s config for the k8s pod\u2019s pod spec (https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#PodSpec). Keys can be either snake_case or camelCase.

\n
\n
job_metadata (permissive dict, optional):
\n

Raw k8s config for the k8s job\u2019s metadata (https://kubernetes.io/docs/reference/kubernetes-api/common-definitions/object-meta/#ObjectMeta). Keys can be either snake_case or camelCase.

\n
\n
job_spec_config (permissive dict, optional):
\n

Raw k8s config for the k8s job\u2019s job spec (https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#jobspec-v1-batch). Keys can be either snake_case or camelCase.

\n
\n
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

An op that runs a Kubernetes job using the k8s API.

\n

Contrast with the k8s_job_executor, which runs each Dagster op in a Dagster job in its\nown k8s job.

\n
\n
This op may be useful when:
    \n
  • You need to orchestrate a command that isn\u2019t a Dagster op (or isn\u2019t written in Python)

  • \n
  • You want to run the rest of a Dagster job using a specific executor, and only a single\nop in k8s.

  • \n
\n
\n
\n

For example:

\n
from dagster_k8s import k8s_job_op\n\nfrom dagster import job\n\nfirst_op = k8s_job_op.configured(\n    {\n        "image": "busybox",\n        "command": ["/bin/sh", "-c"],\n        "args": ["echo HELLO"],\n    },\n    name="first_op",\n)\nsecond_op = k8s_job_op.configured(\n    {\n        "image": "busybox",\n        "command": ["/bin/sh", "-c"],\n        "args": ["echo GOODBYE"],\n    },\n    name="second_op",\n)\n\n@job\ndef full_job():\n    second_op(first_op())\n
\n
\n

You can create your own op with the same implementation by calling the execute_k8s_job function\ninside your own op.

\n

The service account that is used to run this job should have the following RBAC permissions:

\n
rules:\n  - apiGroups: ["batch"]\n    resources: ["jobs", "jobs/status"]\n    verbs: ["*"]\n  # The empty arg "" corresponds to the core API group\n  - apiGroups: [""]\n    resources: ["pods", "pods/log", "pods/status"]\n    verbs: ["*"]\n
\n
\n
\n\n
\n
\ndagster_k8s.execute_k8s_job(context, image, command=None, args=None, namespace=None, image_pull_policy=None, image_pull_secrets=None, service_account_name=None, env_config_maps=None, env_secrets=None, env_vars=None, volume_mounts=None, volumes=None, labels=None, resources=None, scheduler_name=None, load_incluster_config=True, kubeconfig_file=None, timeout=None, container_config=None, pod_template_spec_metadata=None, pod_spec_config=None, job_metadata=None, job_spec_config=None, k8s_job_name=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

This function is a utility for executing a Kubernetes job from within a Dagster op.

\n
\n
Parameters:
\n
\n
\n
\n
\n\n
\n

Python API\u00b6

\n

The K8sRunLauncher allows webserver instances to be configured to launch new runs by starting\nper-run Kubernetes Jobs. To configure the K8sRunLauncher, your dagster.yaml should\ninclude a section like:

\n
run_launcher:\n  module: dagster_k8s.launcher\n  class: K8sRunLauncher\n  config:\n    image_pull_secrets:\n    service_account_name: dagster\n    job_image: "my-company.com/image:latest"\n    dagster_home: "/opt/dagster/dagster_home"\n    postgres_password_secret: "dagster-postgresql-secret"\n    image_pull_policy: "IfNotPresent"\n    job_namespace: "dagster"\n    instance_config_map: "dagster-instance"\n    env_config_maps:\n      - "dagster-k8s-job-runner-env"\n    env_secrets:\n      - "dagster-k8s-some-secret"\n
\n
\n
\n
\n

Helm chart\u00b6

\n

For local dev (e.g., on kind or minikube):

\n
helm install \\\n    --set dagsterWebserver.image.repository="dagster.io/buildkite-test-image" \\\n    --set dagsterWebserver.image.tag="py310-latest" \\\n    --set job_runner.image.repository="dagster.io/buildkite-test-image" \\\n    --set job_runner.image.tag="py310-latest" \\\n    --set imagePullPolicy="IfNotPresent" \\\n    dagster \\\n    helm/dagster/\n
\n
\n

Upon installation, the Helm chart will provide instructions for port forwarding\nthe Dagster webserver and Flower (if configured).

\n
\n
\n

Running tests\u00b6

\n

To run the unit tests:

\n
pytest -m "not integration"\n
\n
\n

To run the integration tests, you must have Docker,\nkind,\nand helm installed.

\n

On macOS:

\n
brew install kind\nbrew install helm\n
\n
\n

Docker must be running.

\n

You may experience slow first test runs thanks to image pulls (run pytest -svv --fulltrace for\nvisibility). Building images and loading them to the kind cluster is slow, and there is\nno visibility into the progress of the load.

\n

NOTE: This process is quite slow, as it requires bootstrapping a local kind cluster with\nDocker images and the dagster-k8s Helm chart. For faster development, you can either:

\n
    \n
  1. Keep a warm kind cluster

  2. \n
  3. Use a remote K8s cluster, e.g. via AWS EKS or GCP GKE

  4. \n
\n

Instructions are below.

\n
\n

Faster local development (with kind)\u00b6

\n

You may find that the kind cluster creation, image loading, and Helm chart installation loop\nis too slow for effective local dev.

\n

You may bypass cluster creation and image loading in the following way. First add the --no-cleanup\nflag to your pytest invocation:

\n
pytest --no-cleanup -s -vvv -m "not integration"\n
\n
\n

The tests will run as before, but the kind cluster will be left running after the tests are completed.

\n

For subsequent test runs, you can run:

\n
pytest --kind-cluster="cluster-d9971c84d44d47f382a2928c8c161faa" --existing-helm-namespace="dagster-test-95590a" -s -vvv -m "not integration"\n
\n
\n

This will bypass cluster creation, image loading, and Helm chart installation, for much faster tests.

\n

The kind cluster name and Helm namespace for this command can be found in the logs, or retrieved\nvia the respective CLIs, using kind get clusters and kubectl get namespaces. Note that\nfor kubectl and helm to work correctly with a kind cluster, you should override your\nkubeconfig file location with:

\n
kind get kubeconfig --name kind-test > /tmp/kubeconfig\nexport KUBECONFIG=/tmp/kubeconfig\n
\n
\n
\n
\n

Manual kind cluster setup\u00b6

\n

The test fixtures provided by dagster-k8s automate the process described below, but sometimes\nit\u2019s useful to manually configure a kind cluster and load images onto it.

\n

First, ensure you have a Docker image appropriate for your Python version. Run, from the root of\nthe repo:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6 \\\n    dagster.io.priv/buildkite-test-image:py310-latest\n
\n
\n

In the above invocation, the Python majmin version should be appropriate for your desired tests.

\n

Then run the following commands to create the cluster and load the image. Note that there is no\nfeedback from the loading process.

\n
kind create cluster --name kind-test\nkind load docker-image --name kind-test dagster.io/dagster-docker-buildkite:py310-latest\n
\n
\n

If you are deploying the Helm chart with an in-cluster Postgres (rather than an external database),\nand/or with dagster-celery workers (and a RabbitMQ), you\u2019ll also want to have images present for\nrabbitmq and postgresql:

\n
docker pull docker.io/bitnami/rabbitmq\ndocker pull docker.io/bitnami/postgresql\n\nkind load docker-image --name kind-test docker.io/bitnami/rabbitmq:latest\nkind load docker-image --name kind-test docker.io/bitnami/postgresql:latest\n
\n
\n

Then you can run pytest as follows:

\n
pytest --kind-cluster=kind-test\n
\n
\n
\n
\n
\n

Faster local development (with an existing K8s cluster)\u00b6

\n

If you already have a development K8s cluster available, you can run tests on that cluster vs.\nrunning locally in kind.

\n

For this to work, first build and deploy the test image to a registry available to your cluster.\nFor example, with a private ECR repository:

\n
./python_modules/dagster-test/dagster_test/test_project/build.sh 3.7.6\ndocker tag dagster-docker-buildkite:latest $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n\naws ecr get-login --no-include-email --region us-west-2 | sh\ndocker push $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/dagster-k8s-tests:2020-04-21T21-04-06\n
\n
\n

Then, you can run tests on EKS with:

\n
export DAGSTER_DOCKER_IMAGE_TAG="2020-04-21T21-04-06"\nexport DAGSTER_DOCKER_REPOSITORY="$AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com"\nexport DAGSTER_DOCKER_IMAGE="dagster-k8s-tests"\n\n# First run with --no-cleanup to leave Helm chart in place\npytest --cluster-provider="kubeconfig" --no-cleanup -s -vvv\n\n# Subsequent runs against existing Helm chart\npytest --cluster-provider="kubeconfig" --existing-helm-namespace="dagster-test-<some id>" -s -vvv\n
\n
\n
\n
\n

Validating Helm charts\u00b6

\n

To test / validate Helm charts, you can run:

\n
helm install dagster --dry-run --debug helm/dagster\nhelm lint\n
\n
\n
\n
\n

Enabling GCR access from Minikube\u00b6

\n

To enable GCR access from Minikube:

\n
kubectl create secret docker-registry element-dev-key \\\n    --docker-server=https://gcr.io \\\n    --docker-username=oauth2accesstoken \\\n    --docker-password="$(gcloud auth print-access-token)" \\\n    --docker-email=my@email.com\n
\n
\n
\n
\n

A note about PVCs\u00b6

\n

Both the Postgres and the RabbitMQ Helm charts will store credentials using Persistent Volume\nClaims, which will outlive test invocations and calls to helm uninstall. These must be deleted if\nyou want to change credentials. To view your pvcs, run:

\n
kubectl get pvc\n
\n
\n
\n
\n

Testing Redis\u00b6

\n

The Redis Helm chart installs w/ a randomly-generated password by default; turn this off:

\n
helm install dagredis stable/redis --set usePassword=false\n
\n
\n

Then, to connect to your database from outside the cluster execute the following commands:

\n
kubectl port-forward --namespace default svc/dagredis-master 6379:6379\nredis-cli -h 127.0.0.1 -p 6379\n
\n
\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-k8s", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-github/", "title": "GitHub (dagster-github)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "N", "next"], ["sections/api/apidocs/libraries/dagster-github", "GitHub (dagster-github)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-k8s.rst.txt", "title": "Kubernetes (dagster-k8s)", "toc": "\n"}, "dagster-mlflow": {"alabaster_version": "0.7.13", "body": "
\n

MLflow (dagster-mlflow)\u00b6

\n
\n
\ndagster_mlflow.mlflow_tracking ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
experiment_name (dagster.StringSource):
\n

MLflow experiment name.

\n
\n
mlflow_tracking_uri (Union[dagster.StringSource, None], optional):
\n

MLflow tracking server URI.

\n

Default Value: None

\n
\n
parent_run_id (Union[String, None], optional):
\n

MLflow run ID of the parent run, if this is a nested run.

\n

Default Value: None

\n
\n
env (permissive dict, optional):
\n

Environment variables for mlflow setup.

\n
\nDefault Value:
{}\n
\n
\n
\n
env_to_tag (Union[List[Any], None], optional):
\n

List of environment variables to log as tags in mlflow.

\n

Default Value: None

\n
\n
extra_tags (permissive dict, optional):
\n

Any extra key-value tags to log to mlflow.

\n
\nDefault Value:
{}\n
\n
\n
\n
\n

This resource initializes an MLflow run that\u2019s used for all steps within a Dagster run.

\n

This resource provides access to all of mlflow\u2019s methods as well as the mlflow tracking client\u2019s\nmethods.

\n

Usage:

\n
    \n
  1. Add the mlflow resource to any ops in which you want to invoke mlflow tracking APIs.

  2. \n
  3. Add the end_mlflow_on_run_finished hook to your job to end the MLflow run\nwhen the Dagster run is finished.

  4. \n
\n

Examples

\n
from dagster_mlflow import end_mlflow_on_run_finished, mlflow_tracking\n\n@op(required_resource_keys={"mlflow"})\ndef mlflow_op(context):\n    mlflow.log_params(some_params)\n    mlflow.tracking.MlflowClient().create_registered_model(some_model_name)\n\n@end_mlflow_on_run_finished\n@job(resource_defs={"mlflow": mlflow_tracking})\ndef mlf_example():\n    mlflow_op()\n\n# example using an mlflow instance with s3 storage\nmlf_example.execute_in_process(run_config={\n    "resources": {\n        "mlflow": {\n            "config": {\n                "experiment_name": my_experiment,\n                "mlflow_tracking_uri": "http://localhost:5000",\n\n                # if want to run a nested run, provide parent_run_id\n                "parent_run_id": an_existing_mlflow_run_id,\n\n                # env variables to pass to mlflow\n                "env": {\n                    "MLFLOW_S3_ENDPOINT_URL": my_s3_endpoint,\n                    "AWS_ACCESS_KEY_ID": my_aws_key_id,\n                    "AWS_SECRET_ACCESS_KEY": my_secret,\n                },\n\n                # env variables you want to log as mlflow tags\n                "env_to_tag": ["DOCKER_IMAGE_TAG"],\n\n                # key-value tags to add to your experiment\n                "extra_tags": {"super": "experiment"},\n            }\n        }\n    }\n})\n
\n
\n
\n\n
\n
\ndagster_mlflow.end_mlflow_on_run_finished HookDefinition\u00b6
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mlflow", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-k8s/", "title": "Kubernetes (dagster-k8s)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "N", "next"], ["sections/api/apidocs/libraries/dagster-k8s", "Kubernetes (dagster-k8s)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mlflow.rst.txt", "title": "MLflow (dagster-mlflow)", "toc": "\n"}, "dagster-msteams": {"alabaster_version": "0.7.13", "body": "
\n

Microsoft Teams (dagster-msteams)\u00b6

\n
\n

Resource\u00b6

\n
\n
\ndagster_msteams.MSTeamsResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
hook_url (Union[dagster.StringSource, None], optional):
\n

To send messages to an MS Teams channel, an incoming webhook has to be created. The incoming webhook URL must be given as a part of the resource config to the MSTeamsResource in Dagster. For more information on how to create an incoming webhook, see https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook

\n
\n
http_proxy (Union[dagster.StringSource, None], optional):
\n

HTTP proxy URL

\n
\n
https_proxy (Union[dagster.StringSource, None], optional):
\n

HTTPS proxy URL

\n
\n
timeout (Float, optional):
\n

Timeout for requests to MS Teams

\n

Default Value: 60

\n
\n
verify (dagster.BoolSource, optional):
\n

Whether to verify SSL certificates, defaults to True

\n

Default Value: True

\n
\n
\n

This resource is for connecting to Microsoft Teams.

\n

Provides a dagster_msteams.TeamsClient which can be used to\ninterface with the MS Teams API.

\n

By configuring this resource, you can post messages to MS Teams from any Dagster op,\nasset, schedule, or sensor:

\n

Examples

\n
import os\n\nfrom dagster import op, job, Definitions, EnvVar\nfrom dagster_msteams import Card, MSTeamsResource\n\n\n@op\ndef teams_op(msteams: MSTeamsResource):\n    card = Card()\n    card.add_attachment(text_message="Hello There !!")\n    msteams.get_client().post_message(payload=card.payload)\n\n\n@job\ndef teams_job():\n    teams_op()\n\ndefs = Definitions(\n    jobs=[teams_job],\n    resources={\n        "msteams": MSTeamsResource(\n            hook_url=EnvVar("TEAMS_WEBHOOK_URL")\n        )\n    }\n)\n
\n
\n
\n\n
\n
\n

Sensors\u00b6

\n
\n
\ndagster_msteams.teams_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given MS Teams webhook URL.

\n
\n
Parameters:
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this\nto allow messages to include deeplinks to the specific run that triggered\nthe hook.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this\nto allow messages to include deeplinks to the specific run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_failure(webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op.name} failed!"\n\n@op\ndef a_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    a_op.with_hooks(hook_defs={teams_on_failure("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.teams_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given MS Teams webhook URL.

\n
\n
Parameters:
\n
    \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the\nHookContext and outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this\nto allow messages to include deeplinks to the specific run that triggered\nthe hook.

  • \n
\n
\n
\n

Examples

\n
@teams_on_success(webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op.name} succeeded!"\n\n@op\ndef a_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    a_op.with_hooks(hook_defs={teams_on_success("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_msteams.make_teams_on_run_failure_sensor(hook_url, message_fn=<function _default_failure_message>, http_proxy=None, https_proxy=None, timeout=60, verify=None, name=None, dagit_base_url=None, default_status=DefaultSensorStatus.STOPPED, monitored_jobs=None, monitor_all_repositories=False, webserver_base_url=None)[source]\u00b6
\n

Create a sensor on run failures that will message the given MS Teams webhook URL.

\n
\n
Parameters:
\n
    \n
  • hook_url (str) \u2013 MS Teams incoming webhook URL.

  • \n
  • message_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, job name, and run ID.

  • \n
  • http_proxy \u2013 (Optional[str]): Proxy for requests using http protocol.

  • \n
  • https_proxy \u2013 (Optional[str]): Proxy for requests using https protocol.

  • \n
  • timeout \u2013 (Optional[float]): Connection timeout in seconds. Defaults to 60.

  • \n
  • verify \u2013 (Optional[bool]): Whether to verify the server\u2019s TLS certificate.

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cteams_on_run_failure\u201d.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the failed run.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector]]]) \u2013 Jobs in the current repository that will be monitored by this sensor. Defaults to None,\nwhich means the alert will be sent when any job in the repository matches the requested\nrun_status. To monitor jobs in external repositories, use RepositorySelector and JobSelector.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the failed run.

  • \n
\n
\n
\n

Examples

\n
teams_on_run_failure = make_teams_on_run_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL")\n)\n\n@repository\ndef my_repo():\n    return [my_job + teams_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return "Job {job_name} failed! Error: {error}".format(\n        job_name=context.dagster_run.job_name,\n        error=context.failure_event.message,\n    )\n\nteams_on_run_failure = make_teams_on_run_failure_sensor(\n    hook_url=os.getenv("TEAMS_WEBHOOK_URL"),\n    message_fn=my_message_fn,\n    webserver_base_url="http://localhost:3000",\n)\n
\n
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_msteams.msteams_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
hook_url (Union[dagster.StringSource, None], optional):
\n

To send messages to an MS Teams channel, an incoming webhook has to be created. The incoming webhook URL must be given as a part of the resource config to the MSTeamsResource in Dagster. For more information on how to create an incoming webhook, see https://docs.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook

\n
\n
http_proxy (Union[dagster.StringSource, None], optional):
\n

HTTP proxy URL

\n
\n
https_proxy (Union[dagster.StringSource, None], optional):
\n

HTTPS proxy URL

\n
\n
timeout (Float, optional):
\n

Timeout for requests to MS Teams

\n

Default Value: 60

\n
\n
verify (dagster.BoolSource, optional):
\n

Whether to verify SSL certificates, defaults to True

\n

Default Value: True

\n
\n
\n

This resource is for connecting to Microsoft Teams.

\n

The resource object is a dagster_msteams.TeamsClient.

\n

By configuring this resource, you can post messages to MS Teams from any Dagster op:

\n

Examples

\n
import os\n\nfrom dagster import op, job\nfrom dagster_msteams import Card, msteams_resource\n\n\n@op(required_resource_keys={"msteams"})\ndef teams_op(context):\n    card = Card()\n    card.add_attachment(text_message="Hello There !!")\n    context.resources.msteams.post_message(payload=card.payload)\n\n\n@job(resource_defs={"msteams": msteams_resource})\ndef teams_job():\n    teams_op()\n\n\nteams_job.execute_in_process(\n    {"resources": {"msteams": {"config": {"hook_url": os.getenv("TEAMS_WEBHOOK_URL")}}}}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-msteams", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mlflow/", "title": "MLflow (dagster-mlflow)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mlflow", "MLflow (dagster-mlflow)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-msteams.rst.txt", "title": "Microsoft Teams (dagster-msteams)", "toc": "\n"}, "dagster-mysql": {"alabaster_version": "0.7.13", "body": "
\n

MySQL (dagster-mysql)\u00b6

\n
\n
\nclass dagster_mysql.MySQLEventLogStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n module: dagster_mysql.event_log\n class: MySQLEventLogStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { db_name }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLRunStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n module: dagster_mysql.run_storage\n class: MySQLRunStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { database }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\nclass dagster_mysql.MySQLScheduleStorage(mysql_url, inst_data=None)[source]\u00b6
\n

MySQL-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n module: dagster_mysql.schedule_storage\n class: MySQLScheduleStorage\n config:\n  mysql_db:\n   username: { username }\n   password: { password }\n   hostname: { hostname }\n   db_name: { db_name }\n   port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-mysql", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-msteams/", "title": "Microsoft Teams (dagster-msteams)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "N", "next"], ["sections/api/apidocs/libraries/dagster-msteams", "Microsoft Teams (dagster-msteams)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-mysql.rst.txt", "title": "MySQL (dagster-mysql)", "toc": "\n"}, "dagster-pagerduty": {"alabaster_version": "0.7.13", "body": "
\n

PagerDuty (dagster-pagerduty)\u00b6

\n

This library provides an integration with PagerDuty, to support creating alerts from your Dagster\ncode.

\n

Presently, it provides a thin wrapper on the Events API V2.

\n
\n
\n

Getting Started\u00b6

\n

You can install this library with:

\n
pip install dagster_pagerduty\n
\n
\n

To use this integration, you\u2019ll first need to create an Events API V2 PagerDuty integration on a PagerDuty service. There are instructions\nhere for\ncreating a new PagerDuty service & integration.

\n

Once your Events API V2 integration is set up, you\u2019ll find an Integration Key (also referred to as a\n\u201cRouting Key\u201d) on the Integrations tab for your service. This key is used to authorize events\ncreated from the PagerDuty events API.

\n

Once your service/integration is created, you can provision a PagerDuty resource and issue PagerDuty\nalerts from within your ops.

\n
\n
\ndagster_pagerduty.PagerDutyService ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
routing_key (dagster.StringSource):
\n

The routing key provisions access to your PagerDuty service. You will need to include the integration key for your new integration, as a routing_key in the event payload.

\n
\n
\n

This resource is for posting events to PagerDuty.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_pagerduty.pagerduty_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
routing_key (dagster.StringSource):
\n

The routing key provisions access to your PagerDuty service. You will need to include the integration key for your new integration, as a routing_key in the event payload.

\n
\n
\n

A resource for posting events (alerts) to PagerDuty.

\n

Example

\n
@op\ndef pagerduty_op(pagerduty: PagerDutyService):\n    pagerduty.EventV2_create(\n        summary='alert from dagster',\n        source='localhost',\n        severity='error',\n        event_action='trigger',\n    )\n\n@job(resource_defs={ 'pagerduty': pagerduty_resource })\ndef pagerduty_test():\n    pagerduty_op()\n\npagerduty_test.execute_in_process(\n    run_config={\n        "resources": {\n            'pagerduty': {'config': {'routing_key': '0123456789abcdef0123456789abcdef'}}\n        }\n    }\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pagerduty", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-mysql/", "title": "MySQL (dagster-mysql)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-mysql", "MySQL (dagster-mysql)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pagerduty.rst.txt", "title": "PagerDuty (dagster-pagerduty)", "toc": "\n"}, "dagster-pandas": {"alabaster_version": "0.7.13", "body": "
\n

Pandas (dagster-pandas)\u00b6

\n

The dagster_pandas library provides utilities for using pandas with Dagster and for implementing\nvalidation on pandas DataFrames. A good place to start with dagster_pandas is the validation\nguide.

\n
\n
\ndagster_pandas.create_dagster_pandas_dataframe_type(name, description=None, columns=None, metadata_fn=None, dataframe_constraints=None, loader=None)[source]\u00b6
\n

Constructs a custom pandas dataframe dagster type.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the dagster pandas type.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • columns (Optional[List[PandasColumn]]) \u2013 A list of PandasColumn objects\nwhich express dataframe column schemas and constraints.

  • \n
  • metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]]]]]) \u2013 A callable which takes your dataframe and returns a dict with string label keys and\nMetadataValue values.

  • \n
  • dataframe_constraints (Optional[List[DataFrameConstraint]]) \u2013 A list of objects that inherit from\nDataFrameConstraint. This allows you to express dataframe-level constraints.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader. If None, we will default\nto using dataframe_loader.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.RowCountConstraint(num_allowed_rows, error_tolerance=0)[source]\u00b6
\n

A dataframe constraint that validates the expected count of rows.

\n
\n
Parameters:
\n
    \n
  • num_allowed_rows (int) \u2013 The number of allowed rows in your dataframe.

  • \n
  • error_tolerance (Optional[int]) \u2013 The acceptable threshold if you are not completely certain. Defaults to 0.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.StrictColumnsConstraint(strict_column_list, enforce_ordering=False)[source]\u00b6
\n

A dataframe constraint that validates column existence and ordering.

\n
\n
Parameters:
\n
    \n
  • strict_column_list (List[str]) \u2013 The exact list of columns that your dataframe must have.

  • \n
  • enforce_ordering (Optional[bool]) \u2013 If true, will enforce that the ordering of column names must match.\nDefault is False.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster_pandas.PandasColumn(name, constraints=None, is_required=None)[source]\u00b6
\n

The main API for expressing column level schemas and constraints for your custom dataframe\ntypes.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the column. This must match up with the column name in the dataframe you\nexpect to receive.

  • \n
  • is_required (Optional[bool]) \u2013 Flag indicating the optional/required presence of the column.\nIf the column exists, the validate function will validate the column. Defaults to True.

  • \n
  • constraints (Optional[List[Constraint]]) \u2013 List of constraint objects that indicate the\nvalidation rules for the pandas column.

  • \n
\n
\n
\n
\n\n
\n
\ndagster_pandas.DataFrame = <dagster._core.types.dagster_type.DagsterType object>\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters:
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

    The unique key to identify types programmatically.\nThe key property always has a value. If you omit the key argument\nto the init function, it instead receives the value of name. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to None. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to None. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pandas", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pandera/", "title": "Pandera (dagster-pandera)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pagerduty/", "title": "PagerDuty (dagster-pagerduty)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pandera", "Pandera (dagster-pandera)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pagerduty", "PagerDuty (dagster-pagerduty)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pandas.rst.txt", "title": "Pandas (dagster-pandas)", "toc": "\n"}, "dagster-pandera": {"alabaster_version": "0.7.13", "body": "
\n

Pandera (dagster-pandera)\u00b6

\n

The dagster_pandera library allows Dagster users to use the dataframe validation library Pandera for the validation of Pandas dataframes. See the guide for details.

\n
\n
\ndagster_pandera.pandera_schema_to_dagster_type(schema)[source]\u00b6
\n

Convert a Pandera dataframe schema to a DagsterType.

\n

The generated Dagster type will be given an automatically generated name. The schema\u2019s title\nproperty, name property, or class name (in that order) will be used. If neither title nor\nname is defined, a name of the form DagsterPanderaDataframe<n> is generated.

\n

Additional metadata is also extracted from the Pandera schema and attached to the returned\nDagsterType as a metadata dictionary. The extracted metadata includes:

\n
    \n
  • Descriptions on the schema and constituent columns and checks.

  • \n
  • Data types for each column.

  • \n
  • String representations of all column-wise checks.

  • \n
  • String representations of all row-wise (i.e. \u201cwide\u201d) checks.

  • \n
\n

The returned DagsterType type will call the Pandera schema\u2019s validate() method in its type\ncheck function. Validation is done in lazy mode, i.e. pandera will attempt to validate all\nvalues in the dataframe, rather than stopping on the first error.

\n

If validation fails, the returned TypeCheck object will contain two pieces of metadata:

\n
    \n
  • num_failures total number of validation errors.

  • \n
  • failure_sample a table containing up to the first 10 validation errors.

  • \n
\n
\n
Parameters:
\n

schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]) \u2013

\n
\n
Returns:
\n

Dagster Type constructed from the Pandera schema.

\n
\n
Return type:
\n

DagsterType

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pandera", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pandas/", "title": "Pandas (dagster-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pandas", "Pandas (dagster-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pandera.rst.txt", "title": "Pandera (dagster-pandera)", "toc": "\n"}, "dagster-papertrail": {"alabaster_version": "0.7.13", "body": "
\n

Papertrail (dagster-papertrail)\u00b6

\n

This library provides an integration with Papertrail for logging.

\n

You can easily set up your Dagster job to log to Papertrail. You\u2019ll need an active Papertrail\naccount, and have your papertrail URL and port handy.

\n
\n
\ndagster_papertrail.papertrail_logger LoggerDefinition\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data is available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-papertrail", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pandera/", "title": "Pandera (dagster-pandera)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pandera", "Pandera (dagster-pandera)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-papertrail.rst.txt", "title": "Papertrail (dagster-papertrail)", "toc": "\n"}, "dagster-postgres": {"alabaster_version": "0.7.13", "body": "
\n

PostgreSQL (dagster-postgres)\u00b6

\n
\n
\ndagster_postgres.PostgresEventLogStorage = <class 'dagster_postgres.event_log.event_log.PostgresEventLogStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional):
\n

\n
postgres_db (strict dict, optional):
\n
\nConfig Schema:
\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
hostname (dagster.StringSource):
\n

\n
db_name (dagster.StringSource):
\n

\n
port (dagster.IntSource, optional):
\n

Default Value: 5432

\n
\n
params (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
scheme (dagster.StringSource, optional):
\n

Default Value: \u2018postgresql\u2019

\n
\n
\n
\n
should_autocreate_tables (Bool, optional):
\n

Default Value: True

\n
\n
\n

Postgres-backed event log storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for all of the components of your instance storage, you can add the following\nblock to your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n

If you are configuring the different storage components separately and are specifically\nconfiguring your event log storage to use Postgres, you can add a block such as the following\nto your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
event_log_storage:\n  module: dagster_postgres.event_log\n  class: PostgresEventLogStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresRunStorage = <class 'dagster_postgres.run_storage.run_storage.PostgresRunStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional):
\n

\n
postgres_db (strict dict, optional):
\n
\nConfig Schema:
\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
hostname (dagster.StringSource):
\n

\n
db_name (dagster.StringSource):
\n

\n
port (dagster.IntSource, optional):
\n

Default Value: 5432

\n
\n
params (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
scheme (dagster.StringSource, optional):
\n

Default Value: \u2018postgresql\u2019

\n
\n
\n
\n
should_autocreate_tables (Bool, optional):
\n

Default Value: True

\n
\n
\n

Postgres-backed run storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for all of the components of your instance storage, you can add the following\nblock to your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n

If you are configuring the different storage components separately and are specifically\nconfiguring your run storage to use Postgres, you can add a block such as the following\nto your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
run_storage:\n  module: dagster_postgres.run_storage\n  class: PostgresRunStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n
\ndagster_postgres.PostgresScheduleStorage = <class 'dagster_postgres.schedule_storage.schedule_storage.PostgresScheduleStorage'>[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
postgres_url (dagster.StringSource, optional):
\n

\n
postgres_db (strict dict, optional):
\n
\nConfig Schema:
\n
username (dagster.StringSource):
\n

\n
password (dagster.StringSource):
\n

\n
hostname (dagster.StringSource):
\n

\n
db_name (dagster.StringSource):
\n

\n
port (dagster.IntSource, optional):
\n

Default Value: 5432

\n
\n
params (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\n
scheme (dagster.StringSource, optional):
\n

Default Value: \u2018postgresql\u2019

\n
\n
\n
\n
should_autocreate_tables (Bool, optional):
\n

Default Value: True

\n
\n
\n

Postgres-backed schedule storage.

\n

Users should not directly instantiate this class; it is instantiated by internal machinery when\ndagster-webserver and dagster-graphql load, based on the values in the dagster.yaml file in\n$DAGSTER_HOME. Configuration of this class should be done by setting values in that file.

\n

To use Postgres for all of the components of your instance storage, you can add the following\nblock to your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
storage:\n  postgres:\n    postgres_db:\n      username: my_username\n      password: my_password\n      hostname: my_hostname\n      db_name: my_database\n      port: 5432\n
\n
\n
\n

If you are configuring the different storage components separately and are specifically\nconfiguring your schedule storage to use Postgres, you can add a block such as the following\nto your dagster.yaml:

\n
\n
dagster.yaml\u00b6
\n
schedule_storage:\n  module: dagster_postgres.schedule_storage\n  class: PostgresScheduleStorage\n  config:\n    postgres_db:\n      username: { username }\n      password: { password }\n      hostname: { hostname }\n      db_name: { db_name }\n      port: { port }\n
\n
\n
\n

Note that the fields in this config are StringSource and\nIntSource and can be configured from environment variables.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-postgres", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-papertrail/", "title": "Papertrail (dagster-papertrail)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "N", "next"], ["sections/api/apidocs/libraries/dagster-papertrail", "Papertrail (dagster-papertrail)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-postgres.rst.txt", "title": "PostgreSQL (dagster-postgres)", "toc": "\n"}, "dagster-prometheus": {"alabaster_version": "0.7.13", "body": "
\n

Prometheus (dagster-prometheus)\u00b6

\n
\n
\ndagster_prometheus.PrometheusResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gateway (dagster.StringSource):
\n

The URL for your push gateway. Either of the form \u2018http://pushgateway.local\u2019 or \u2018pushgateway.local\u2019. Scheme defaults to \u2018http\u2019 if none is provided.

\n
\n
timeout (dagster.IntSource, optional):
\n

The timeout is how long delete will attempt to connect before giving up. Defaults to 30s.

\n

Default Value: 30

\n
\n
\n

This resource is used to send metrics to a Prometheus Pushgateway.

\n

Example:

\n
from dagster_prometheus import PrometheusResource\nfrom dagster import Definitions, job, op\n\n@op\ndef example_prometheus_op(prometheus: PrometheusResource):\n    prometheus.push_to_gateway(job="my_job")\n\n@job\ndef my_job():\n    example_prometheus_op()\n\ndefs = Definitions(\n    jobs=[my_job],\n    resources={"prometheus": PrometheusResource(gateway="http://pushgateway.local")},\n)\n
\n
\n
\n\n
\n
\nclass dagster_prometheus.resources.PrometheusClient[source]\u00b6
\n

Integrates with Prometheus via the prometheus_client library.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_prometheus.prometheus_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
gateway (dagster.StringSource):
\n

The URL for your push gateway. Either of the form \u2018http://pushgateway.local\u2019 or \u2018pushgateway.local\u2019. Scheme defaults to \u2018http\u2019 if none is provided.

\n
\n
timeout (dagster.IntSource, optional):
\n

The timeout is how long delete will attempt to connect before giving up. Defaults to 30s.

\n

Default Value: 30

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-prometheus", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-postgres/", "title": "PostgreSQL (dagster-postgres)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-postgres", "PostgreSQL (dagster-postgres)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-prometheus.rst.txt", "title": "Prometheus (dagster-prometheus)", "toc": "\n"}, "dagster-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

Pyspark (dagster-pyspark)\u00b6

\n
\n
\ndagster_pyspark.PySparkResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_config (dict):
\n

\n
\n

This resource provides access to a PySpark Session for executing PySpark code within Dagster.

\n

Example

\n
@op\ndef my_op(pyspark: PySparkResource):\n    spark_session = pyspark.spark_session\n    dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\n\n@job(\n    resource_defs={\n        "pyspark": PySparkResource(\n            spark_config={\n                "spark.executor.memory": "2g"\n            }\n        )\n    }\n)\ndef my_spark_job():\n    my_op()\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_pyspark.pyspark_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
spark_conf (permissive dict, optional):
\n
\nDefault Value:
{\n    "spark": {\n        "app": {},\n        "driver": {\n            "blockManager": {}\n        },\n        "executor": {\n            "pyspark": {},\n            "logs": {\n                "rolling": {\n                    "time": {}\n                }\n            }\n        },\n        "local": {},\n        "submit": {},\n        "log": {},\n        "redaction": {},\n        "python": {\n            "profile": {},\n            "worker": {}\n        },\n        "files": {},\n        "jars": {},\n        "pyspark": {\n            "driver": {}\n        },\n        "reducer": {},\n        "shuffle": {\n            "file": {},\n            "io": {},\n            "service": {\n                "index": {\n                    "cache": {}\n                }\n            },\n            "sort": {},\n            "spill": {},\n            "registration": {}\n        },\n        "eventLog": {\n            "logBlockUpdates": {},\n            "longForm": {},\n            "buffer": {}\n        },\n        "ui": {\n            "dagGraph": {},\n            "liveUpdate": {}\n        },\n        "worker": {\n            "ui": {}\n        },\n        "sql": {\n            "ui": {}\n        },\n        "streaming": {\n            "ui": {},\n            "backpressure": {},\n            "receiver": {\n                "writeAheadLog": {}\n            },\n            "kafka": {},\n            "driver": {\n                "writeAheadLog": {}\n            }\n        },\n        "broadcast": {},\n        "io": {\n            "compression": {\n                "lz4": {},\n                "snappy": {},\n                "zstd": {}\n            }\n        },\n        "kryo": {},\n        "kryoserializer": {\n            "buffer": {}\n        },\n        "rdd": {},\n        "serializer": {},\n        "memory": {\n            "offHeap": {}\n        },\n        "storage": {\n            "replication": {}\n        },\n        "cleaner": {\n            "periodicGC": {},\n            
"referenceTracking": {\n                "blocking": {}\n            }\n        },\n        "default": {},\n        "hadoop": {\n            "mapreduce": {\n                "fileoutputcommitter": {\n                    "algorithm": {}\n                }\n            }\n        },\n        "rpc": {\n            "message": {},\n            "retry": {}\n        },\n        "blockManager": {},\n        "network": {},\n        "port": {},\n        "core": {\n            "connection": {\n                "ack": {\n                    "wait": {}\n                }\n            }\n        },\n        "cores": {},\n        "locality": {\n            "wait": {}\n        },\n        "scheduler": {\n            "revive": {},\n            "listenerbus": {\n                "eventqueue": {}\n            }\n        },\n        "blacklist": {\n            "task": {},\n            "stage": {},\n            "application": {\n                "fetchFailure": {}\n            }\n        },\n        "speculation": {},\n        "task": {\n            "reaper": {}\n        },\n        "stage": {},\n        "dynamicAllocation": {},\n        "r": {\n            "driver": {},\n            "shell": {}\n        },\n        "graphx": {\n            "pregel": {}\n        },\n        "deploy": {\n            "zookeeper": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
spark (permissive dict, optional):
\n
\nDefault Value:
{\n    "app": {},\n    "driver": {\n        "blockManager": {}\n    },\n    "executor": {\n        "pyspark": {},\n        "logs": {\n            "rolling": {\n                "time": {}\n            }\n        }\n    },\n    "local": {},\n    "submit": {},\n    "log": {},\n    "redaction": {},\n    "python": {\n        "profile": {},\n        "worker": {}\n    },\n    "files": {},\n    "jars": {},\n    "pyspark": {\n        "driver": {}\n    },\n    "reducer": {},\n    "shuffle": {\n        "file": {},\n        "io": {},\n        "service": {\n            "index": {\n                "cache": {}\n            }\n        },\n        "sort": {},\n        "spill": {},\n        "registration": {}\n    },\n    "eventLog": {\n        "logBlockUpdates": {},\n        "longForm": {},\n        "buffer": {}\n    },\n    "ui": {\n        "dagGraph": {},\n        "liveUpdate": {}\n    },\n    "worker": {\n        "ui": {}\n    },\n    "sql": {\n        "ui": {}\n    },\n    "streaming": {\n        "ui": {},\n        "backpressure": {},\n        "receiver": {\n            "writeAheadLog": {}\n        },\n        "kafka": {},\n        "driver": {\n            "writeAheadLog": {}\n        }\n    },\n    "broadcast": {},\n    "io": {\n        "compression": {\n            "lz4": {},\n            "snappy": {},\n            "zstd": {}\n        }\n    },\n    "kryo": {},\n    "kryoserializer": {\n        "buffer": {}\n    },\n    "rdd": {},\n    "serializer": {},\n    "memory": {\n        "offHeap": {}\n    },\n    "storage": {\n        "replication": {}\n    },\n    "cleaner": {\n        "periodicGC": {},\n        "referenceTracking": {\n            "blocking": {}\n        }\n    },\n    "default": {},\n    "hadoop": {\n        "mapreduce": {\n            "fileoutputcommitter": {\n                "algorithm": {}\n            }\n        }\n    },\n    "rpc": {\n        "message": {},\n        "retry": {}\n    },\n    "blockManager": {},\n    "network": {},\n    "port": {},\n    "core": 
{\n        "connection": {\n            "ack": {\n                "wait": {}\n            }\n        }\n    },\n    "cores": {},\n    "locality": {\n        "wait": {}\n    },\n    "scheduler": {\n        "revive": {},\n        "listenerbus": {\n            "eventqueue": {}\n        }\n    },\n    "blacklist": {\n        "task": {},\n        "stage": {},\n        "application": {\n            "fetchFailure": {}\n        }\n    },\n    "speculation": {},\n    "task": {\n        "reaper": {}\n    },\n    "stage": {},\n    "dynamicAllocation": {},\n    "r": {\n        "driver": {},\n        "shell": {}\n    },\n    "graphx": {\n        "pregel": {}\n    },\n    "deploy": {\n        "zookeeper": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
app (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
name (dagster.StringSource, optional):
\n

Application Properties: The name of your application. This will appear in the UI and in log data.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "blockManager": {}\n}\n
\n
\n
\nConfig Schema:
\n
cores (dagster.IntSource, optional):
\n

Application Properties: Number of cores to use for the driver process, only in cluster mode.

\n
\n
maxResultSize (dagster.StringSource, optional):
\n

Application Properties: Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors.

\n
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option or in your default properties file.

\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
supervise (Bool, optional):
\n

Application Properties: If true, restarts the driver automatically if it fails with a non-zero exit status. Only has effect in Spark standalone mode or Mesos cluster deploy mode.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of the driver. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-class-path command line option or in your default properties file.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set with spark.driver.memory in the cluster mode and through the --driver-memory command line option in the client mode. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-java-options command line option or in your default properties file.

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching the driver JVM. Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-library-path command line option or in your default properties file.

\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Whether to give user-added jars precedence over Spark\u2019s own jars when loading classes in the driver. This feature can be used to mitigate conflicts between Spark\u2019s dependencies and user dependencies. It is currently an experimental feature. This is used in cluster mode only.

\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Driver-specific port for the block manager to listen on, for cases where it cannot use the same configuration as executors.

\n
\n
\n
\n
bindAddress (dagster.StringSource, optional):
\n

Networking: Hostname or IP address where to bind listening sockets. This config overrides the SPARK_LOCAL_IP environment variable (see below). It also allows a different address from the local one to be advertised to executors or external systems. This is useful, for example, when running containers with bridged networking. For this to properly work, the different ports used by the driver (RPC, block manager and UI) need to be forwarded from the container\u2019s host.

\n
\n
host (dagster.StringSource, optional):
\n

Networking: Hostname or IP address for the driver. This is used for communicating with the executors and the standalone Master.

\n
\n
port (dagster.StringSource, optional):
\n

Networking: Port for the driver to listen on. This is used for communicating with the executors and the standalone Master.

\n
\n
\n
\n
executor (permissive dict, optional):
\n
\nDefault Value:
{\n    "pyspark": {},\n    "logs": {\n        "rolling": {\n            "time": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: Amount of memory to use per executor process, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g).

\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Application Properties: The amount of memory to be allocated to PySpark in each executor, in MiB unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python\u2019s memory use and it is up to the application to avoid exceeding the overhead memory space shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests.

\n
\n
\n
\n
memoryOverhead (dagster.StringSource, optional):
\n

Application Properties: The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). This option is currently supported on YARN and Kubernetes.

\n
\n
extraClassPath (dagster.StringSource, optional):
\n

Runtime Environment: Extra classpath entries to prepend to the classpath of executors. This exists primarily for backwards-compatibility with older versions of Spark. Users typically should not need to set this option.

\n
\n
extraJavaOptions (dagster.StringSource, optional):
\n

Runtime Environment: A string of extra JVM options to pass to executors. For instance, GC settings or other logging. Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. The following symbols, if present will be interpolated: {{APP_ID}} will be replaced by application ID and {{EXECUTOR_ID}} will be replaced by executor ID. For example, to enable verbose gc logging to a file named for the executor ID of the app in /tmp, pass a \u2018value\u2019 of: -verbose:gc -Xloggc:/tmp/{{APP_ID}}-{{EXECUTOR_ID}}.gc

\n
\n
extraLibraryPath (dagster.StringSource, optional):
\n

Runtime Environment: Set a special library path to use when launching executor JVM\u2019s.

\n
\n
logs (permissive dict, optional):
\n
\nDefault Value:
{\n    "rolling": {\n        "time": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
rolling (permissive dict, optional):
\n
\nDefault Value:
{\n    "time": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRetainedFiles (dagster.IntSource, optional):
\n

Runtime Environment: Sets the number of latest rolling log files that are going to be retained by the system. Older log files will be deleted. Disabled by default.

\n
\n
enableCompression (Bool, optional):
\n

Runtime Environment: Enable executor log compression. If it is enabled, the rolled executor logs will be compressed. Disabled by default.

\n
\n
maxSize (dagster.IntSource, optional):
\n

Runtime Environment: Set the max size of the file in bytes by which the executor logs will be rolled over. Rolling is disabled by default. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
strategy (dagster.StringSource, optional):
\n

Runtime Environment: Set the strategy of rolling of executor logs. By default it is disabled. It can be set to \u201ctime\u201d (time-based rolling) or \u201csize\u201d (size-based rolling). For \u201ctime\u201d, use spark.executor.logs.rolling.time.interval to set the rolling interval. For \u201csize\u201d, use spark.executor.logs.rolling.maxSize to set the maximum file size for rolling.

\n
\n
time (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Runtime Environment: Set the time interval by which the executor logs will be rolled over. Rolling is disabled by default. Valid values are daily, hourly, minutely or any interval in seconds. See spark.executor.logs.rolling.maxRetainedFiles for automatic cleaning of old logs.

\n
\n
\n
\n
\n
\n
\n
\n
userClassPathFirst (Bool, optional):
\n

Runtime Environment: (Experimental) Same functionality as spark.driver.userClassPathFirst, but applied to executor instances.

\n
\n
cores (dagster.IntSource, optional):
\n

Execution Behavior: The number of cores to use on each executor. In standalone and Mesos coarse-grained modes, for more detail, see this description.

\n
\n
heartbeatInterval (dagster.StringSource, optional):
\n

Execution Behavior: Interval between each executor\u2019s heartbeats to the driver. Heartbeats let the driver know that the executor is still alive and update it with metrics for in-progress tasks. spark.executor.heartbeatInterval should be significantly less than spark.network.timeout

\n
\n
\n
\n
extraListeners (dagster.StringSource, optional):
\n

Application Properties: A comma-separated list of classes that implement SparkListener; when initializing SparkContext, instances of these classes will be created and registered with Spark\u2019s listener bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor will be called; otherwise, a zero-argument constructor will be called. If no valid constructor can be found, the SparkContext creation will fail with an exception.

\n
\n
local (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
dir (dagster.StringSource, optional):
\n

Application Properties: Directory to use for \u201cscratch\u201d space in Spark, including map output files and RDDs that get stored on disk. This should be on a fast, local disk in your system. It can also be a comma-separated list of multiple directories on different disks. NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone), MESOS_SANDBOX (Mesos) or LOCAL_DIRS (YARN) environment variables set by the cluster manager.

\n
\n
\n
\n
logConf (Bool, optional):
\n

Application Properties: Logs the effective SparkConf as INFO when a SparkContext is started.

\n
\n
master (dagster.StringSource, optional):
\n

Application Properties: The cluster manager to connect to. See the list of allowed master URLs.

\n
\n
submit (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
deployMode (dagster.StringSource, optional):
\n

Application Properties: The deploy mode of the Spark driver program, either \u201cclient\u201d or \u201ccluster\u201d, which means to launch the driver program locally (\u201cclient\u201d) or remotely (\u201ccluster\u201d) on one of the nodes inside the cluster.

\n
\n
pyFiles (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. Globs are allowed.

\n
\n
\n
\n
log (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
callerContext (dagster.StringSource, optional):
\n

Application Properties: Application information that will be written into Yarn RM log/HDFS audit log when running on Yarn/HDFS. Its length depends on the Hadoop configuration hadoop.caller.context.max.size. It should be concise, and typically can have up to 50 characters.

\n
\n
\n
\n
redaction (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
regex (dagster.StringSource, optional):
\n

Runtime Environment: Regex to decide which Spark configuration properties and environment variables in driver and executor environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the environment UI and various logs like YARN and event logs.

\n
\n
\n
\n
python (permissive dict, optional):
\n
\nDefault Value:
{\n    "profile": {},\n    "worker": {}\n}\n
\n
\n
\nConfig Schema:
\n
profile (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Runtime Environment: Enable profiling in Python worker, the profile result will show up by sc.show_profiles(), or it will be displayed before the driver exits. It also can be dumped into disk by sc.dump_profiles(path). If some of the profile results had been displayed manually, they will not be displayed automatically before driver exiting. By default the pyspark.profiler.BasicProfiler will be used, but this can be overridden by passing a profiler class in as a parameter to the SparkContext constructor.

\n
\n
dump (dagster.StringSource, optional):
\n

Runtime Environment: The directory which is used to dump the profile result before driver exiting. The results will be dumped as a separate file for each RDD. They can be loaded by pstats.Stats(). If this is specified, the profile result will not be displayed automatically.

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
memory (dagster.StringSource, optional):
\n

Runtime Environment: Amount of memory to use per python worker process during aggregation, in the same format as JVM memory strings with a size unit suffix (\u201ck\u201d, \u201cm\u201d, \u201cg\u201d or \u201ct\u201d) (e.g. 512m, 2g). If the memory used during aggregation goes above this amount, it will spill the data into disks.

\n
\n
reuse (Bool, optional):
\n

Runtime Environment: Reuse Python worker or not. If yes, it will use a fixed number of Python workers and does not need to fork() a Python process for every task. This is very useful when there is a large broadcast, because the broadcast will then not need to be transferred from the JVM to the Python worker for every task.

\n
\n
\n
\n
\n
\n
files (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed.

\n
\n
fetchTimeout (dagster.StringSource, optional):
\n

Execution Behavior: Communication timeout to use when fetching files added through SparkContext.addFile() from the driver.

\n
\n
useFetchCache (Bool, optional):
\n

Execution Behavior: If set to true (default), file fetching will use a local cache that is shared by executors that belong to the same application, which can improve task launching performance when running many executors on the same host. If set to false, these caching optimizations will be disabled and all executors will fetch their own copies of files. This optimization may be disabled in order to use Spark local directories that reside on NFS filesystems (see SPARK-6313 for more details).

\n
\n
overwrite (Bool, optional):
\n

Execution Behavior: Whether to overwrite files added through SparkContext.addFile() when the target file exists and its contents do not match those of the source.

\n
\n
maxPartitionBytes (dagster.IntSource, optional):
\n

Execution Behavior: The maximum number of bytes to pack into a single partition when reading files.

\n
\n
openCostInBytes (dagster.IntSource, optional):
\n

Execution Behavior: The estimated cost to open a file, measured by the number of bytes that could be scanned in the same time. This is used when putting multiple files into a partition. It is better to overestimate; then the partitions with small files will be faster than partitions with bigger files.

\n
\n
\n
\n
jars (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed.

\n
\n
packages (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. If spark.jars.ivySettings is given artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option --repositories. For more details, see Advanced Dependency Management.

\n
\n
excludes (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of groupId:artifactId, to exclude while resolving the dependencies provided in spark.jars.packages to avoid dependency conflicts.

\n
\n
ivy (dagster.StringSource, optional):
\n

Runtime Environment: Path to specify the Ivy user directory, used for the local Ivy cache and package files from spark.jars.packages. This will override the Ivy property ivy.default.ivy.user.dir which defaults to ~/.ivy2.

\n
\n
ivySettings (dagster.StringSource, optional):
\n

Runtime Environment: Path to an Ivy settings file to customize resolution of jars specified using spark.jars.packages instead of the built-in defaults, such as maven central. Additional repositories given by the command-line option --repositories or spark.jars.repositories will also be included. Useful for allowing Spark to resolve artifacts from behind a firewall e.g. via an in-house artifact server like Artifactory. Details on the settings file format can be found at http://ant.apache.org/ivy/history/latest-milestone/settings.html

\n
\n
repositories (dagster.StringSource, optional):
\n

Runtime Environment: Comma-separated list of additional remote repositories to search for the maven coordinates given with --packages or spark.jars.packages.

\n
\n
\n
\n
pyspark (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {}\n}\n
\n
\n
\nConfig Schema:
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in driver. (default is spark.pyspark.python)

\n
\n
\n
\n
python (dagster.StringSource, optional):
\n

Runtime Environment: Python binary executable to use for PySpark in both driver and executors.

\n
\n
\n
\n
reducer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSizeInFlight (dagster.StringSource, optional):
\n

Shuffle Behavior: Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless otherwise specified. Since each output requires us to create a buffer to receive it, this represents a fixed memory overhead per reduce task, so keep it small unless you have a large amount of memory.

\n
\n
maxReqsInFlight (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote requests to fetch blocks at any given point. When the number of hosts in the cluster increases, it might lead to a very large number of inbound connections to one or more nodes, causing the workers to fail under load. By allowing it to limit the number of fetch requests, this scenario can be mitigated.

\n
\n
maxBlocksInFlightPerAddress (dagster.IntSource, optional):
\n

Shuffle Behavior: This configuration limits the number of remote blocks being fetched per reduce task from a given host port. When a large number of blocks are being requested from a given address in a single fetch or simultaneously, this could crash the serving executor or Node Manager. This is especially useful to reduce the load on the Node Manager when external shuffle is enabled. You can mitigate this issue by setting it to a lower value.

\n
\n
\n
\n
maxRemoteBlockSizeFetchToMem (dagster.IntSource, optional):
\n

Shuffle Behavior: The remote block will be fetched to disk when size of the block is above this threshold in bytes. This is to avoid a giant request that takes too much memory. By default, this is only enabled for blocks > 2GB, as those cannot be fetched directly into memory, no matter what resources are available. But it can be turned down to a much lower value (eg. 200m) to avoid using too much memory on smaller blocks as well. Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be used when external shuffle service is newer than Spark 2.2.

\n
\n
shuffle (permissive dict, optional):
\n
\nDefault Value:
{\n    "file": {},\n    "io": {},\n    "service": {\n        "index": {\n            "cache": {}\n        }\n    },\n    "sort": {},\n    "spill": {},\n    "registration": {}\n}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress map output files. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
file (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
buffer (dagster.StringSource, optional):
\n

Shuffle Behavior: Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise specified. These buffers reduce the number of disk seeks and system calls made in creating intermediate shuffle files.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.

\n
\n
numConnectionsPerPeer (dagster.IntSource, optional):
\n

Shuffle Behavior: (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value.

\n
\n
preferDirectBufs (Bool, optional):
\n

Shuffle Behavior: (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.

\n
\n
retryWait (dagster.StringSource, optional):
\n

Shuffle Behavior: (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds by default, calculated as maxRetries * retryWait.

\n
\n
\n
\n
service (permissive dict, optional):
\n
\nDefault Value:
{\n    "index": {\n        "cache": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Shuffle Behavior: Enables the external shuffle service. This service preserves the shuffle files written by executors so the executors can be safely removed. This must be enabled if spark.dynamicAllocation.enabled is \u201ctrue\u201d. The external shuffle service must be set up in order to enable it. See dynamic allocation configuration and setup documentation for more information.

\n
\n
port (dagster.IntSource, optional):
\n

Shuffle Behavior: Port on which the external shuffle service will run.

\n
\n
index (permissive dict, optional):
\n
\nDefault Value:
{\n    "cache": {}\n}\n
\n
\n
\nConfig Schema:
\n
cache (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
size (dagster.StringSource, optional):
\n

Shuffle Behavior: Cache entries limited to the specified memory footprint in bytes.

\n
\n
\n
\n
\n
\n
\n
\n
maxChunksBeingTransferred (dagster.IntSource, optional):
\n

Shuffle Behavior: The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see spark.shuffle.io.maxRetries and spark.shuffle.io.retryWait), if those limits are reached the task will fail with fetch failure.

\n
\n
sort (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
bypassMergeThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no map-side aggregation and there are at most this many reduce partitions.

\n
\n
\n
\n
spill (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (Bool, optional):
\n

Shuffle Behavior: Whether to compress data spilled during shuffles. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
accurateBlockThreshold (dagster.IntSource, optional):
\n

Shuffle Behavior: Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle block size when fetching shuffle blocks.

\n
\n
registration (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.IntSource, optional):
\n

Shuffle Behavior: Timeout in milliseconds for registration to the external shuffle service.

\n
\n
maxAttempts (dagster.IntSource, optional):
\n

Shuffle Behavior: When we fail to register to the external shuffle service, we will retry for maxAttempts times.

\n
\n
\n
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for aggregation and cogroups during shuffles. At any given time, the collective size of all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will begin to spill to disk. If spills occur often, consider increasing this value at the expense of spark.storage.memoryFraction.

\n
\n
\n
\n
eventLog (permissive dict, optional):
\n
\nDefault Value:
{\n    "logBlockUpdates": {},\n    "longForm": {},\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
logBlockUpdates (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log events for every block update, if spark.eventLog.enabled is true. *Warning*: This will increase the size of the event log considerably.

\n
\n
\n
\n
longForm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: If true, use the long form of call sites in the event log. Otherwise use the short form.

\n
\n
\n
\n
compress (dagster.StringSource, optional):
\n

Spark UI: Whether to compress logged events, if spark.eventLog.enabled is true. Compression will use spark.io.compression.codec.

\n
\n
dir (dagster.StringSource, optional):
\n

Spark UI: Base directory in which Spark events are logged, if spark.eventLog.enabled is true. Within this base directory, Spark creates a sub-directory for each application, and logs the events specific to the application in this directory. Users may want to set this to a unified location like an HDFS directory so history files can be read by the history server.

\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to log Spark events, useful for reconstructing the Web UI after the application has finished.

\n
\n
overwrite (dagster.StringSource, optional):
\n

Spark UI: Whether to overwrite any existing files.

\n
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
kb (dagster.StringSource, optional):
\n

Spark UI: Buffer size to use when writing to output streams, in KiB unless otherwise specified.

\n
\n
\n
\n
\n
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{\n    "dagGraph": {},\n    "liveUpdate": {}\n}\n
\n
\n
\nConfig Schema:
\n
dagGraph (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedRootRDDs (dagster.StringSource, optional):
\n

Spark UI: How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
enabled (dagster.StringSource, optional):
\n

Spark UI: Whether to run the web UI for the Spark application.

\n
\n
killEnabled (dagster.StringSource, optional):
\n

Spark UI: Allows jobs and stages to be killed from the web UI.

\n
\n
liveUpdate (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
period (dagster.StringSource, optional):
\n

Spark UI: How often to update live entities. -1 means \u201cnever update\u201d when replaying applications, meaning only the last write will happen. For live applications, this avoids a few operations that we can live without when rapidly processing incoming task events.

\n
\n
\n
\n
port (dagster.StringSource, optional):
\n

Spark UI: Port for your application\u2019s dashboard, which shows memory and workload data.

\n
\n
retainedJobs (dagster.StringSource, optional):
\n

Spark UI: How many jobs the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedStages (dagster.StringSource, optional):
\n

Spark UI: How many stages the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
retainedTasks (dagster.StringSource, optional):
\n

Spark UI: How many tasks the Spark UI and status APIs remember before garbage collecting. This is a target maximum, and fewer elements may be retained in some circumstances.

\n
\n
reverseProxy (dagster.StringSource, optional):
\n

Spark UI: Enable running Spark Master as reverse proxy for worker and application UIs. In this mode, Spark master will reverse proxy the worker and application UIs to enable access without requiring direct access to their hosts. Use it with caution, as worker and application UI will not be accessible directly, you will only be able to access them through spark master/proxy public URL. This setting affects all the workers and application UIs running in the cluster and must be set on all the workers, drivers and masters.

\n
\n
reverseProxyUrl (dagster.StringSource, optional):
\n

Spark UI: This is the URL where your proxy is running. This URL is for proxy which is running in front of Spark Master. This is useful when running proxy for authentication e.g. OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to reach your proxy.

\n
\n
showConsoleProgress (dagster.StringSource, optional):
\n

Spark UI: Show the progress bar in the console. The progress bar shows the progress of stages that run for longer than 500ms. If multiple stages run at the same time, multiple progress bars will be displayed on the same line.

\n
\n
retainedDeadExecutors (dagster.StringSource, optional):
\n

Spark UI: How many dead executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
filters (dagster.StringSource, optional):
\n

Spark UI: Comma separated list of filter class names to apply to the Spark Web UI. The filter should be a standard javax servlet Filter. Filter parameters can also be specified in the configuration, by setting config entries of the form spark.<class name of filter>.param.<param name>=<value> For example: spark.ui.filters=com.test.filter1 spark.com.test.filter1.param.name1=foo spark.com.test.filter1.param.name2=bar

\n
\n
\n
\n
worker (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutors (dagster.StringSource, optional):
\n

Spark UI: How many finished executors the Spark UI and status APIs remember before garbage collecting.

\n
\n
retainedDrivers (dagster.StringSource, optional):
\n

Spark UI: How many finished drivers the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
sql (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {}\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedExecutions (dagster.StringSource, optional):
\n

Spark UI: How many finished executions the Spark UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
\n
\n
streaming (permissive dict, optional):
\n
\nDefault Value:
{\n    "ui": {},\n    "backpressure": {},\n    "receiver": {\n        "writeAheadLog": {}\n    },\n    "kafka": {},\n    "driver": {\n        "writeAheadLog": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ui (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
retainedBatches (dagster.StringSource, optional):
\n

Spark Streaming: How many batches the Spark Streaming UI and status APIs remember before garbage collecting.

\n
\n
\n
\n
backpressure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Spark Streaming: Enables or disables Spark Streaming\u2019s internal backpressure mechanism (since 1.5). This enables Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values spark.streaming.receiver.maxRate and spark.streaming.kafka.maxRatePerPartition if they are set (see below).

\n
\n
initialRate (dagster.StringSource, optional):
\n

Spark Streaming: This is the initial maximum receiving rate at which each receiver will receive data for the first batch when the backpressure mechanism is enabled.

\n
\n
\n
\n
blockInterval (dagster.StringSource, optional):
\n

Spark Streaming: Interval at which data received by Spark Streaming receivers is chunked into blocks of data before storing them in Spark. Minimum recommended - 50 ms. See the performance tuning section in the Spark Streaming programming guide for more details.

\n
\n
receiver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxRate (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enable (dagster.StringSource, optional):
\n

Spark Streaming: Enable write-ahead logs for receivers. All the input data received through receivers will be saved to write-ahead logs that will allow it to be recovered after driver failures. See the deployment guide in the Spark Streaming programming guide for more details.

\n
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the receivers. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the data WAL on the receivers.

\n
\n
\n
\n
\n
\n
unpersist (dagster.StringSource, optional):
\n

Spark Streaming: Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from Spark\u2019s memory. The raw input data received by Spark Streaming is also automatically cleared. Setting this to false will allow the raw data and persisted RDDs to be accessible outside the streaming application as they will not be cleared automatically. But it comes at the cost of higher memory usage in Spark.

\n
\n
stopGracefullyOnShutdown (dagster.StringSource, optional):
\n

Spark Streaming: If true, Spark shuts down the StreamingContext gracefully on JVM shutdown rather than immediately.

\n
\n
kafka (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API. See the Kafka Integration guide for more details.

\n
\n
minRatePerPartition (dagster.StringSource, optional):
\n

Spark Streaming: Minimum rate (number of records per second) at which data will be read from each Kafka partition when using the new Kafka direct stream API.

\n
\n
maxRetries (dagster.StringSource, optional):
\n

Spark Streaming: Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the new Kafka direct stream API.

\n
\n
\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{\n    "writeAheadLog": {}\n}\n
\n
\n
\nConfig Schema:
\n
writeAheadLog (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
closeFileAfterWrite (dagster.StringSource, optional):
\n

Spark Streaming: Whether to close the file after writing a write-ahead log record on the driver. Set this to \u2018true\u2019 when you want to use S3 (or any file system that does not support flushing) for the metadata WAL on the driver.

\n
\n
\n
\n
\n
\n
\n
\n
broadcast (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress broadcast variables before sending them. Generally a good idea. Compression will use spark.io.compression.codec.

\n
\n
blockSize (dagster.StringSource, optional):
\n

Execution Behavior: Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise specified. Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit.

\n
\n
checksum (dagster.StringSource, optional):
\n

Execution Behavior: Whether to enable checksum for broadcast. If enabled, broadcasts will include a checksum, which can help detect corrupted blocks, at the cost of computing and sending a little more data. It\u2019s possible to disable it if the network has other mechanisms to guarantee data won\u2019t be corrupted during broadcast.

\n
\n
\n
\n
io (permissive dict, optional):
\n
\nDefault Value:
{\n    "compression": {\n        "lz4": {},\n        "snappy": {},\n        "zstd": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
compression (permissive dict, optional):
\n
\nDefault Value:
{\n    "lz4": {},\n    "snappy": {},\n    "zstd": {}\n}\n
\n
\n
\nConfig Schema:
\n
codec (dagster.StringSource, optional):
\n

Compression and Serialization: The codec used to compress internal data such as RDD partitions, event log, broadcast variables and shuffle outputs. By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify the codec, e.g. org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, and org.apache.spark.io.ZStdCompressionCodec.

\n
\n
lz4 (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.

\n
\n
\n
\n
snappy (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
blockSize (dagster.StringSource, optional):
\n

Compression and Serialization: Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.

\n
\n
\n
\n
zstd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
level (dagster.StringSource, optional):
\n

Compression and Serialization: Compression level for Zstd compression codec. Increasing the compression level will result in better compression at the expense of more CPU and memory.

\n
\n
bufferSize (dagster.StringSource, optional):
\n

Compression and Serialization: Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead.

\n
\n
\n
\n
\n
\n
\n
\n
kryo (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
classesToRegister (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of custom class names to register with Kryo. See the tuning guide for more details.

\n
\n
referenceTracking (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to track references to the same object when serializing data with Kryo, which is necessary if your object graphs have loops and useful for efficiency if they contain multiple copies of the same object. Can be disabled to improve performance if you know this is not the case.

\n
\n
registrationRequired (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to require registration with Kryo. If set to \u2018true\u2019, Kryo will throw an exception if an unregistered class is serialized. If set to false (the default), Kryo will write unregistered class names along with each object. Writing class names can cause significant performance overhead, so enabling this option can enforce strictly that a user has not omitted classes from registration.

\n
\n
registrator (dagster.StringSource, optional):
\n

Compression and Serialization: If you use Kryo serialization, give a comma-separated list of classes that register your custom classes with Kryo. This property is useful if you need to register your classes in a custom way, e.g. to specify a custom field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be set to classes that extend KryoRegistrator. See the tuning guide for more details.

\n
\n
unsafe (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to use unsafe based Kryo serializer. Can be substantially faster by using Unsafe Based IO.

\n
\n
\n
\n
kryoserializer (permissive dict, optional):
\n
\nDefault Value:
{\n    "buffer": {}\n}\n
\n
\n
\nConfig Schema:
\n
buffer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Initial size of Kryo\u2019s serialization buffer, in KiB unless otherwise specified. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.

\n
\n
max (dagster.StringSource, optional):
\n

Compression and Serialization: Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a \u201cbuffer limit exceeded\u201d exception inside Kryo.

\n
\n
\n
\n
\n
\n
rdd (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
compress (dagster.StringSource, optional):
\n

Compression and Serialization: Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER in Java and Scala or StorageLevel.MEMORY_ONLY in Python). Can save substantial space at the cost of some extra CPU time. Compression will use spark.io.compression.codec.

\n
\n
\n
\n
serializer (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Compression and Serialization: Class to use for serializing objects that will be sent over the network or need to be cached in serialized form. The default of Java serialization works with any Serializable Java object but is quite slow, so we recommend using org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization when speed is necessary. Can be any subclass of org.apache.spark.Serializer.

\n
\n
objectStreamReset (dagster.StringSource, optional):
\n

Compression and Serialization: When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches objects to prevent writing redundant data, however that stops garbage collection of those objects. By calling \u2018reset\u2019 you flush that info from the serializer, and allow old objects to be collected. To turn off this periodic reset set it to -1. By default it will reset the serializer every 100 objects.

\n
\n
\n
\n
memory (permissive dict, optional):
\n
\nDefault Value:
{\n    "offHeap": {}\n}\n
\n
\n
\nConfig Schema:
\n
fraction (Float, optional):
\n

Memory Management: Fraction of (heap space - 300MB) used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. Leaving this at the default value is recommended. For more detail, including important information about correctly tuning JVM garbage collection when increasing this value, see this description.

\n
\n
storageFraction (Float, optional):
\n

Memory Management: Amount of storage memory immune to eviction, expressed as a fraction of the size of the region set aside by spark.memory.fraction. The higher this is, the less working memory may be available to execution and tasks may spill to disk more often. Leaving this at the default value is recommended. For more detail, see this description.

\n
\n
offHeap (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (Bool, optional):
\n

Memory Management: If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.

\n
\n
size (dagster.IntSource, optional):
\n

Memory Management: The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors\u2019 total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.

\n
\n
\n
\n
useLegacyMode (Bool, optional):
\n

Memory Management: Whether to enable the legacy memory management mode used in Spark 1.5 and before. The legacy mode rigidly partitions the heap space into fixed-size regions, potentially leading to excessive spilling if the application was not tuned. The following deprecated memory fraction configurations are not read unless this is enabled: spark.shuffle.memoryFraction, spark.storage.memoryFraction, and spark.storage.unrollFraction.

\n
\n
\n
\n
storage (permissive dict, optional):
\n
\nDefault Value:
{\n    "replication": {}\n}\n
\n
\n
\nConfig Schema:
\n
memoryFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of Java heap to use for Spark\u2019s memory cache. This should not be larger than the \u201cold\u201d generation of objects in the JVM, which by default is given 0.6 of the heap, but you can increase it if you configure your own old generation size.

\n
\n
unrollFraction (Float, optional):
\n

Memory Management: (deprecated) This is read only if spark.memory.useLegacyMode is enabled. Fraction of spark.storage.memoryFraction to use for unrolling blocks in memory. This is dynamically allocated by dropping existing blocks when there is not enough free storage space to unroll the new block in its entirety.

\n
\n
replication (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
proactive (Bool, optional):
\n

Memory Management: Enables proactive block replication for RDD blocks. Cached RDD block replicas lost due to executor failures are replenished if there are any existing available replicas. This tries to get the replication level of the block to the initial number.

\n
\n
\n
\n
memoryMapThreshold (dagster.StringSource, optional):
\n

Execution Behavior: Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system.

\n
\n
\n
\n
cleaner (permissive dict, optional):
\n
\nDefault Value:
{\n    "periodicGC": {},\n    "referenceTracking": {\n        "blocking": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
periodicGC (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Memory Management: Controls how often to trigger a garbage collection. This context cleaner triggers cleanups only when weak references are garbage collected. In long-running applications with large driver JVMs, where there is little memory pressure on the driver, this may happen very occasionally or not at all. Not cleaning at all may lead to executors running out of disk space after a while.

\n
\n
\n
\n
referenceTracking (permissive dict, optional):
\n
\nDefault Value:
{\n    "blocking": {}\n}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Enables or disables context cleaning.

\n
\n
blocking (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by spark.cleaner.referenceTracking.blocking.shuffle Spark property).

\n
\n
shuffle (Bool, optional):
\n

Memory Management: Controls whether the cleaning thread should block on shuffle cleanup tasks.

\n
\n
\n
\n
cleanCheckpoints (Bool, optional):
\n

Memory Management: Controls whether to clean checkpoint files if the reference is out of scope.

\n
\n
\n
\n
\n
\n
default (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
parallelism (dagster.IntSource, optional):
\n

Execution Behavior: Default number of partitions in RDDs returned by transformations like join, reduceByKey, and parallelize when not set by user.

\n
\n
\n
\n
hadoop (permissive dict, optional):
\n
\nDefault Value:
{\n    "mapreduce": {\n        "fileoutputcommitter": {\n            "algorithm": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
cloneConf (Bool, optional):
\n

Execution Behavior: If set to true, clones a new Hadoop Configuration object for each task. This option should be enabled to work around Configuration thread-safety issues (see SPARK-2546 for more details). This is disabled by default in order to avoid unexpected performance regressions for jobs that are not affected by these issues.

\n
\n
validateOutputSpecs (Bool, optional):
\n

Execution Behavior: If set to true, validates the output specification (e.g. checking if the output directory already exists) used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing output directories. We recommend that users do not disable this except if trying to achieve compatibility with previous versions of Spark. Simply use Hadoop\u2019s FileSystem API to delete output directories by hand. This setting is ignored for jobs generated through Spark Streaming\u2019s StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery.

\n
\n
mapreduce (permissive dict, optional):
\n
\nDefault Value:
{\n    "fileoutputcommitter": {\n        "algorithm": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
fileoutputcommitter (permissive dict, optional):
\n
\nDefault Value:
{\n    "algorithm": {}\n}\n
\n
\n
\nConfig Schema:
\n
algorithm (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
version (dagster.IntSource, optional):
\n

Execution Behavior: The file output committer algorithm version, valid algorithm version number: 1 or 2. Version 2 may have better performance, but version 1 may handle failures better in certain situations, as per MAPREDUCE-4815.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
rpc (permissive dict, optional):
\n
\nDefault Value:
{\n    "message": {},\n    "retry": {}\n}\n
\n
\n
\nConfig Schema:
\n
message (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxSize (dagster.StringSource, optional):
\n

Networking: Maximum message size (in MB) to allow in \u201ccontrol plane\u201d communication; generally only applies to map output size information sent between executors and the driver. Increase this if you are running jobs with many thousands of map and reduce tasks and see messages about the RPC message size.

\n
\n
\n
\n
numRetries (dagster.StringSource, optional):
\n

Networking: Number of times to retry before an RPC task gives up. An RPC task will run at most this number of times.

\n
\n
retry (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
wait (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before retrying.

\n
\n
\n
\n
askTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC ask operation to wait before timing out.

\n
\n
lookupTimeout (dagster.StringSource, optional):
\n

Networking: Duration for an RPC remote endpoint lookup operation to wait before timing out.

\n
\n
\n
\n
blockManager (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
port (dagster.StringSource, optional):
\n

Networking: Port for all block managers to listen on. These exist on both the driver and the executors.

\n
\n
\n
\n
network (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: Default timeout for all network interactions. This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.

\n
\n
\n
\n
port (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxRetries (dagster.StringSource, optional):
\n

Networking: Maximum number of retries when binding to a port before giving up. When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying. This essentially allows it to try a range of ports from the start port specified to port + maxRetries.

\n
\n
\n
\n
core (permissive dict, optional):
\n
\nDefault Value:
{\n    "connection": {\n        "ack": {\n            "wait": {}\n        }\n    }\n}\n
\n
\n
\nConfig Schema:
\n
connection (permissive dict, optional):
\n
\nDefault Value:
{\n    "ack": {\n        "wait": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
ack (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
timeout (dagster.StringSource, optional):
\n

Networking: How long for the connection to wait for ack to occur before timing out and giving up. To avoid unwanted timeouts caused by long pauses such as GC, you can set a larger value.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
cores (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
max (dagster.StringSource, optional):
\n

Scheduling: When running on a standalone deploy cluster or a Mesos cluster in \u201ccoarse-grained\u201d sharing mode, the maximum amount of CPU cores to request for the application from across the cluster (not from each machine). If not set, the default will be spark.deploy.defaultCores on Spark\u2019s standalone cluster manager, or infinite (all available cores) on Mesos.

\n
\n
\n
\n
locality (permissive dict, optional):
\n
\nDefault Value:
{\n    "wait": {}\n}\n
\n
\n
\nConfig Schema:
\n
wait (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: How long to wait to launch a data-local task before giving up and launching it on a less-local node. The same wait will be used to step through multiple locality levels (process-local, node-local, rack-local and then any). It is also possible to customize the waiting time for each level by setting spark.locality.wait.node, etc. You should increase this setting if your tasks are long and see poor locality, but the default usually works well.

\n
\n
node (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for node locality. For example, you can set this to 0 to skip node locality and search immediately for rack locality (if your cluster has rack information).

\n
\n
process (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for process locality. This affects tasks that attempt to access cached data in a particular executor process.

\n
\n
rack (dagster.StringSource, optional):
\n

Scheduling: Customize the locality wait for rack locality.

\n
\n
\n
\n
\n
\n
scheduler (permissive dict, optional):
\n
\nDefault Value:
{\n    "revive": {},\n    "listenerbus": {\n        "eventqueue": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
maxRegisteredResourcesWaitingTime (dagster.StringSource, optional):
\n

Scheduling: Maximum amount of time to wait for resources to register before scheduling begins.

\n
\n
minRegisteredResourcesRatio (dagster.StringSource, optional):
\n

Scheduling: The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarse-grained mode [\u2018spark.cores.max\u2019 value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. Regardless of whether the minimum ratio of resources has been reached, the maximum amount of time it will wait before scheduling begins is controlled by config spark.scheduler.maxRegisteredResourcesWaitingTime.

\n
\n
mode (dagster.StringSource, optional):
\n

Scheduling: The scheduling mode between jobs submitted to the same SparkContext. Can be set to FAIR to use fair sharing instead of queueing jobs one after another. Useful for multi-user services.

\n
\n
revive (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
interval (dagster.StringSource, optional):
\n

Scheduling: The interval length for the scheduler to revive the worker resource offers to run tasks.

\n
\n
\n
\n
listenerbus (permissive dict, optional):
\n
\nDefault Value:
{\n    "eventqueue": {}\n}\n
\n
\n
\nConfig Schema:
\n
eventqueue (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
capacity (dagster.StringSource, optional):
\n

Scheduling: Capacity for event queue in Spark listener bus, must be greater than 0. Consider increasing value (e.g. 20000) if listener events are dropped. Increasing this value may result in the driver using more memory.

\n
\n
\n
\n
\n
\n
\n
\n
blacklist (permissive dict, optional):
\n
\nDefault Value:
{\n    "task": {},\n    "stage": {},\n    "application": {\n        "fetchFailure": {}\n    }\n}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, prevent Spark from scheduling tasks on executors that have been blacklisted due to too many task failures. The blacklisting algorithm can be further controlled by the other \u201cspark.blacklist\u201d configuration options.

\n
\n
timeout (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How long a node or executor is blacklisted for the entire application, before it is unconditionally removed from the blacklist to attempt running new tasks.

\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxTaskAttemptsPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one executor before the executor is blacklisted for that task.

\n
\n
maxTaskAttemptsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) For a given task, how many times it can be retried on one node, before the entire node is blacklisted for that task.

\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, within one stage, before the executor is blacklisted for that stage.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors are marked as blacklisted for a given stage, before the entire node is marked as failed for the stage.

\n
\n
\n
\n
application (permissive dict, optional):
\n
\nDefault Value:
{\n    "fetchFailure": {}\n}\n
\n
\n
\nConfig Schema:
\n
maxFailedTasksPerExecutor (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different tasks must fail on one executor, in successful task sets, before the executor is blacklisted for the entire application. Blacklisted executors will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors may get marked as idle and be reclaimed by the cluster manager.

\n
\n
maxFailedExecutorsPerNode (dagster.StringSource, optional):
\n

Scheduling: (Experimental) How many different executors must be blacklisted for the entire application, before the node is blacklisted for the entire application. Blacklisted nodes will be automatically added back to the pool of available resources after the timeout specified by spark.blacklist.timeout. Note that with dynamic allocation, though, the executors on the node may get marked as idle and be reclaimed by the cluster manager.

\n
\n
fetchFailure (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, Spark will blacklist the executor immediately when a fetch failure happens. If external shuffle service is enabled, then the whole node will be blacklisted.

\n
\n
\n
\n
\n
\n
killBlacklistedExecutors (dagster.StringSource, optional):
\n

Scheduling: (Experimental) If set to \u201ctrue\u201d, allow Spark to automatically kill the executors when they are blacklisted on fetch failure or blacklisted for the entire application, as controlled by spark.blacklist.application.*. Note that, when an entire node is added to the blacklist, all of the executors on that node will be killed.

\n
\n
\n
\n
speculation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
root (dagster.StringSource, optional):
\n

Scheduling: If set to \u201ctrue\u201d, performs speculative execution of tasks. This means if one or more tasks are running slowly in a stage, they will be re-launched.

\n
\n
interval (dagster.StringSource, optional):
\n

Scheduling: How often Spark will check for tasks to speculate.

\n
\n
multiplier (dagster.StringSource, optional):
\n

Scheduling: How many times slower a task is than the median to be considered for speculation.

\n
\n
quantile (dagster.StringSource, optional):
\n

Scheduling: Fraction of tasks which must be complete before speculation is enabled for a particular stage.

\n
\n
\n
\n
task (permissive dict, optional):
\n
\nDefault Value:
{\n    "reaper": {}\n}\n
\n
\n
\nConfig Schema:
\n
cpus (dagster.StringSource, optional):
\n

Scheduling: Number of cores to allocate for each task.

\n
\n
maxFailures (dagster.StringSource, optional):
\n

Scheduling: Number of failures of any particular task before giving up on the job. The total number of failures spread across different tasks will not cause the job to fail; a particular task has to fail this number of attempts. Should be greater than or equal to 1. Number of allowed retries = this value - 1.

\n
\n
reaper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Scheduling: Enables monitoring of killed / interrupted tasks. When set to true, any task which is killed will be monitored by the executor until that task actually finishes executing. See the other spark.task.reaper.* configurations for details on how to control the exact behavior of this monitoring. When set to false (the default), task killing will use an older code path which lacks such monitoring.

\n
\n
pollingInterval (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls the frequency at which executors will poll the status of killed tasks. If a killed task is still running when polled then a warning will be logged and, by default, a thread-dump of the task will be logged (this thread dump can be disabled via the spark.task.reaper.threadDump setting, which is documented below).

\n
\n
threadDump (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting controls whether task thread dumps are logged during periodic polling of killed tasks. Set this to false to disable collection of thread dumps.

\n
\n
killTimeout (dagster.StringSource, optional):
\n

Scheduling: When spark.task.reaper.enabled = true, this setting specifies a timeout after which the executor JVM will kill itself if a killed task has not stopped running. The default value, -1, disables this mechanism and prevents the executor from self-destructing. The purpose of this setting is to act as a safety-net to prevent runaway noncancellable tasks from rendering an executor unusable.

\n
\n
\n
\n
\n
\n
stage (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
maxConsecutiveAttempts (dagster.StringSource, optional):
\n

Scheduling: Number of consecutive stage attempts allowed before a stage is aborted.

\n
\n
\n
\n
dynamicAllocation (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
enabled (dagster.StringSource, optional):
\n

Dynamic Allocation: Whether to use dynamic resource allocation, which scales the number of executors registered with this application up and down based on the workload. For more detail, see the description here. This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, spark.dynamicAllocation.initialExecutors, and spark.dynamicAllocation.executorAllocationRatio.

\n
\n
executorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor has been idle for more than this duration, the executor will be removed. For more detail, see this description.

\n
\n
cachedExecutorIdleTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration, the executor will be removed. For more details, see this description.

\n
\n
initialExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Initial number of executors to run if dynamic allocation is enabled. If --num-executors (or spark.executor.instances) is set and larger than this value, it will be used as the initial number of executors.

\n
\n
maxExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Upper bound for the number of executors if dynamic allocation is enabled.

\n
\n
minExecutors (dagster.StringSource, optional):
\n

Dynamic Allocation: Lower bound for the number of executors if dynamic allocation is enabled.

\n
\n
executorAllocationRatio (dagster.StringSource, optional):
\n

Dynamic Allocation: By default, the dynamic allocation will request enough executors to maximize the parallelism according to the number of tasks to process. While this minimizes the latency of the job, with small tasks this setting can waste a lot of resources due to executor allocation overhead, as some executors might not even do any work. This setting allows you to set a ratio that will be used to reduce the number of executors w.r.t. full parallelism. Defaults to 1.0 to give maximum parallelism. 0.5 will divide the target number of executors by 2. The target number of executors computed by the dynamicAllocation can still be overridden by the spark.dynamicAllocation.minExecutors and spark.dynamicAllocation.maxExecutors settings.

\n
\n
schedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration, new executors will be requested. For more detail, see this description.

\n
\n
sustainedSchedulerBacklogTimeout (dagster.StringSource, optional):
\n

Dynamic Allocation: Same as spark.dynamicAllocation.schedulerBacklogTimeout, but used only for subsequent executor requests. For more detail, see this description.

\n
\n
\n
\n
r (permissive dict, optional):
\n
\nDefault Value:
{\n    "driver": {},\n    "shell": {}\n}\n
\n
\n
\nConfig Schema:
\n
numRBackendThreads (dagster.StringSource, optional):
\n

SparkR: Number of threads used by RBackend to handle RPC calls from SparkR package.

\n
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in cluster modes for both driver and workers.

\n
\n
driver (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing R scripts in client modes for driver. Ignored in cluster modes.

\n
\n
\n
\n
shell (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
command (dagster.StringSource, optional):
\n

SparkR: Executable for executing sparkR shell in client modes for driver. Ignored in cluster modes. It is the same as environment variable SPARKR_DRIVER_R, but takes precedence over it. spark.r.shell.command is used for sparkR shell while spark.r.driver.command is used for running R script.

\n
\n
\n
\n
backendConnectionTimeout (dagster.StringSource, optional):
\n

SparkR: Connection timeout set by R process on its connection to RBackend in seconds.

\n
\n
heartBeatInterval (dagster.StringSource, optional):
\n

SparkR: Interval for heartbeats sent from SparkR backend to R process to prevent connection timeout.

\n
\n
\n
\n
graphx (permissive dict, optional):
\n
\nDefault Value:
{\n    "pregel": {}\n}\n
\n
\n
\nConfig Schema:
\n
pregel (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
checkpointInterval (dagster.StringSource, optional):
\n

GraphX: Checkpoint interval for graph and message in Pregel. It is used to avoid stackOverflowError due to long lineage chains after lots of iterations. The checkpoint is disabled by default.

\n
\n
\n
\n
\n
\n
deploy (permissive dict, optional):
\n
\nDefault Value:
{\n    "zookeeper": {}\n}\n
\n
\n
\nConfig Schema:
\n
recoveryMode (dagster.StringSource, optional):
\n

Deploy: The recovery mode setting to recover submitted Spark jobs with cluster mode when it failed and relaunches. This is only applicable for cluster mode when running with Standalone or Mesos.

\n
\n
zookeeper (permissive dict, optional):
\n
\nDefault Value:
{}\n
\n
\n
\nConfig Schema:
\n
url (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper URL to connect to.

\n
\n
dir (dagster.StringSource, optional):
\n

Deploy: When spark.deploy.recoveryMode is set to ZOOKEEPER, this configuration is used to set the zookeeper directory to store recovery state.

\n
\n
\n
\n
\n
\n
\n
\n
\n
\n
\n

This resource provides access to a PySpark SparkSession for executing PySpark code within Dagster.

\n

Example

\n
@op(required_resource_keys={"pyspark"})\ndef my_op(context):\n    spark_session = context.resources.pyspark.spark_session\n    dataframe = spark_session.read.json("examples/src/main/resources/people.json")\n\nmy_pyspark_resource = pyspark_resource.configured(\n    {"spark_conf": {"spark.executor.memory": "2g"}}\n)\n\n@job(resource_defs={"pyspark": my_pyspark_resource})\ndef my_spark_job():\n    my_op()\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-prometheus/", "title": "Prometheus (dagster-prometheus)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "N", "next"], ["sections/api/apidocs/libraries/dagster-prometheus", "Prometheus (dagster-prometheus)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-pyspark.rst.txt", "title": "Pyspark (dagster-pyspark)", "toc": "\n"}, "dagster-shell": {"alabaster_version": "0.7.13", "body": "
\n

Shell (dagster-shell)\u00b6

\n

The Dagster shell library provides utilities and op factories for executing inline shell scripts or script files.

\n
\n
\n

APIs\u00b6

\n
\n
\ndagster_shell.create_shell_command_op(shell_command, name, description=None, required_resource_keys=None, tags=None)[source]\u00b6
\n

This function is a factory that constructs ops to execute a shell command.

\n

Note that you can only use shell_command_op if you know the command you\u2019d like to execute\nat job construction time. If you\u2019d like to construct shell commands dynamically during\njob execution and pass them between ops, you should use shell_op instead.

\n

The resulting op can take a single start argument that is a\nNothing dependency\nto allow you to run ops before the shell op.

\n

Examples

\n
from dagster import graph\nfrom dagster_shell import create_shell_command_op\n\n\n@graph\ndef my_graph():\n    a = create_shell_command_op('echo "hello, world!"', name="a")\n    a()\n
\n
\n
@op\ndef run_before_shell_op():\n    do_some_work()\n\n@graph\ndef my_graph():\n    my_echo_op = create_shell_command_op("echo hello world!", name="echo_op")\n    my_echo_op(start=run_before_shell_op())\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_command (str) \u2013 The shell command that the constructed op will execute.

  • \n
  • name (str) \u2013 The name of the constructed op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.\nSetting this ensures that resource spin up for the required resources will occur before\nthe shell command is executed.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
\n
\n
Raises:
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns:
\n

Returns the constructed op definition.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.create_shell_script_op(shell_script_path, name='create_shell_script_op', ins=None, **kwargs)[source]\u00b6
\n

This function is a factory which constructs an op that will execute a shell command read\nfrom a script file.

\n

Any kwargs passed to this function will be passed along to the underlying @op decorator. However, note that overriding config or output_defs is not\nsupported.

\n

You might consider using @graph to wrap this op\nin the cases where you\u2019d like to configure the shell op with different config fields.

\n

If no ins are passed then the resulting op can take a single start argument that is a\nNothing dependency\nto allow you to run ops before the shell op.

\n

Examples

\n
from dagster import file_relative_path, graph\nfrom dagster_shell import create_shell_script_op\n\n\n@graph\ndef my_graph():\n    a = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="a")\n    a()\n
\n
\n
@op\ndef run_before_shell_op():\n    do_some_work()\n\n@graph\ndef my_graph():\n    my_echo_op = create_shell_script_op(file_relative_path(__file__, "hello_world.sh"), name="echo_op")\n    my_echo_op(start=run_before_shell_op())\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_script_path (str) \u2013 The script file to execute.

  • \n
  • name (Optional[str]) \u2013 The name of this op. Defaults to \u201ccreate_shell_script_op\u201d.

  • \n
  • ins (Optional[Mapping[str, In]]) \u2013 Ins for the op. Defaults to\na single Nothing input.

  • \n
\n
\n
Raises:
\n

Failure \u2013 Raised when the shell command returns a non-zero exit code.

\n
\n
Returns:
\n

Returns the constructed op definition.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\ndagster_shell.shell_op(context, shell_command, config)[source]\u00b6
\n

This op executes a shell command it receives as input.\nThis op is suitable for uses where the command to execute is generated dynamically by\nupstream ops. If you know the command to execute at job construction time,\nconsider shell_command_op instead.

\n
\n
Parameters:
\n
    \n
  • shell_command \u2013 The shell command to be executed

  • \n
  • config (ShellOpConfig) \u2013 A ShellOpConfig object specifying configuration options

  • \n
\n
\n
\n

Examples

\n
@op\ndef create_shell_command():\n    return "echo hello world!"\n\n@graph\ndef echo_graph():\n    shell_op(create_shell_command())\n
\n
\n
\n\n
\n
\ndagster_shell.execute_shell_command(shell_command, output_logging, log, cwd=None, env=None)\u00b6
\n

This function is a utility for executing shell commands from within a Dagster op (or from Python in general).\nIt can be used to execute shell commands on either op input data, or any data generated within a generic python op.

\n

Internally, it executes a shell script specified by the argument shell_command. The script will be written\nto a temporary file first and invoked via subprocess.Popen(['bash', shell_script_path], ...).

\n

In the Popen invocation, stdout=PIPE, stderr=STDOUT is used, and the combined stdout/stderr\noutput is retrieved.

\n

Examples

\n
from dagster import OpExecutionContext, op\nfrom dagster_shell import execute_shell_command\n\n\n@op\ndef my_shell_op(context: OpExecutionContext, data: str):\n    temp_file = "/tmp/data.txt"\n    with open(temp_file, "w", encoding="utf-8") as temp_file_writer:\n        temp_file_writer.write(data)\n        execute_shell_command(f"cat {temp_file}", output_logging="STREAM", log=context.log)\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_command (str) \u2013 The shell command to execute

  • \n
  • output_logging (str) \u2013 The logging mode to use. Supports STREAM, BUFFER, and NONE.

  • \n
  • log (Union[logging.Logger, DagsterLogManager]) \u2013 Any logger which responds to .info()

  • \n
  • cwd (str, optional) \u2013 Working directory for the shell command to use. Defaults to the\ntemporary path where we store the shell command in a script file.

  • \n
  • env (Dict[str, str], optional) \u2013 Environment dictionary to pass to subprocess.Popen.\nUnused by default.

  • \n
\n
\n
Returns:
\n

A tuple where the first element is the combined stdout/stderr output of running the shell\ncommand and the second element is the return code.

\n
\n
Return type:
\n

Tuple[str, int]

\n
\n
\n
\n\n
\n
\ndagster_shell.execute_shell_script(shell_script_path, output_logging, log, cwd=None, env=None)\u00b6
\n

Execute a shell script file specified by the argument shell_script_path. The script will be\ninvoked via subprocess.Popen(['bash', shell_script_path], ...).

\n

In the Popen invocation, stdout=PIPE, stderr=STDOUT is used, and the combined stdout/stderr\noutput is retrieved.

\n

Examples

\n
from dagster import OpExecutionContext, op\nfrom dagster_shell import execute_shell_script\n\n\n@op\ndef my_shell_op(context: OpExecutionContext, data: str):\n    temp_file = "/tmp/echo_data.sh"\n    with open(temp_file, "w", encoding="utf-8") as temp_file_writer:\n        temp_file_writer.write(f"echo {data}")\n        execute_shell_script(temp_file, output_logging="STREAM", log=context.log)\n
\n
\n
\n
Parameters:
\n
    \n
  • shell_script_path (str) \u2013 The shell script to execute.

  • \n
  • output_logging (str) \u2013 The logging mode to use. Supports STREAM, BUFFER, and NONE.

  • \n
  • log (Union[logging.Logger, DagsterLogManager]) \u2013 Any logger which responds to .info()

  • \n
  • cwd (str, optional) \u2013 Working directory for the shell command to use. Defaults to the\ntemporary path where we store the shell command in a script file.

  • \n
  • env (Dict[str, str], optional) \u2013 Environment dictionary to pass to subprocess.Popen.\nUnused by default.

  • \n
\n
\n
Raises:
\n

Exception \u2013 When an invalid output_logging is selected. Unreachable from op-based\n invocation since the config system will check output_logging against the config\n enum.

\n
\n
Returns:
\n

A tuple where the first element is the combined stdout/stderr output of running the shell\ncommand and the second element is the return code.

\n
\n
Return type:
\n

Tuple[str, int]

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-shell", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-pyspark/", "title": "Pyspark (dagster-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "N", "next"], ["sections/api/apidocs/libraries/dagster-pyspark", "Pyspark (dagster-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-shell.rst.txt", "title": "Shell (dagster-shell)", "toc": "\n"}, "dagster-slack": {"alabaster_version": "0.7.13", "body": "
\n

Slack (dagster-slack)\u00b6

\n

\n
\n

\n
\n

This library provides an integration with Slack, to support posting messages in your company\u2019s Slack workspace.

\n
\n

\n
\n

Presently, it provides a thin wrapper on the Slack client API chat.postMessage.

\n
\n

\n
\n

To use this integration, you\u2019ll first need to create a Slack App for it.

\n
    \n
  1. Create App: Go to https://api.slack.com/apps and click \u201cCreate New App\u201d:

    \n

    \n
  2. \n
  3. Install App: After creating an app, on the left-hand side of the app configuration, click \u201cBot Users\u201d, and then create a bot user. Then, click \u201cInstall App\u201d on the left hand side, and finally \u201cInstall App to Workspace\u201d.

  4. \n
  5. Bot Token: Once finished, this will create a new bot token for your bot/workspace:

    \n

    \n
  6. \n
\n

Copy this bot token and put it somewhere safe; see Safely Storing Credentials for more on this topic.

\n
\n
\ndagster_slack.SlackResource ResourceDefinition[source]\u00b6
\n

This resource is for connecting to Slack.

\n

By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.

\n

Examples

\n
import os\n\nfrom dagster import Definitions, EnvVar, job, op\nfrom dagster_slack import SlackResource\n\n\n@op\ndef slack_op(slack: SlackResource):\n    slack.get_client().chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n@job\ndef slack_job():\n    slack_op()\n\ndefs = Definitions(\n    jobs=[slack_job],\n    resources={\n        "slack": SlackResource(token=EnvVar("MY_SLACK_TOKEN")),\n    },\n)\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_run_failure_sensor(channel, slack_token, text_fn=<function _default_failure_message_text_fn>, blocks_fn=None, name=None, dagit_base_url=None, minimum_interval_seconds=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, webserver_base_url=None)[source]\u00b6
\n

Create a sensor on job failures that will message the given Slack channel.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • text_fn (Optional(Callable[[RunFailureSensorContext], str])) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the message you want to send.\nDefaults to a text message that contains error message, job name, and run ID.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with markdown.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[RunFailureSensorContext], List[Dict]]) \u2013 Function which takes in\nthe RunFailureSensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_run_failure\u201d.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the failed job run.

  • \n
  • minimum_interval_seconds \u2013 (Optional[int]): The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]) \u2013 The jobs in the\ncurrent repository that will be monitored by this failure sensor. Defaults to None, which\nmeans the alert will be sent when any job in the repository fails. To monitor jobs in external repositories, use RepositorySelector and JobSelector

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSensor]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs)\nThe jobs in the current repository that will be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the failed job run.

  • \n
\n
\n
\n

Examples

\n
slack_on_run_failure = make_slack_on_run_failure_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN")\n)\n\n@repository\ndef my_repo():\n    return [my_job, slack_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return (\n        f"Job {context.dagster_run.job_name} failed!"\n        f"Error: {context.failure_event.message}"\n    )\n\nslack_on_run_failure = make_slack_on_run_failure_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    text_fn=my_message_fn,\n    webserver_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\ndagster_slack.make_slack_on_freshness_policy_status_change_sensor(channel, slack_token, asset_selection, warn_after_minutes_overdue=0, notify_when_back_on_time=False, text_fn=<function _default_freshness_message_text_fn>, blocks_fn=None, name=None, dagit_base_url=None, default_status=DefaultSensorStatus.STOPPED, webserver_base_url=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Create a sensor that will message the given Slack channel whenever an asset in the provided\nAssetSelection becomes out of date. Messages are only fired when the state changes, meaning\nonly a single slack message will be sent (when the asset begins to be out of date). If\nnotify_when_back_on_time is set to True, a second slack message will be sent once the asset\nis on time again.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • slack_token (str) \u2013 The slack token.\nTokens are typically either user tokens or bot tokens. More in the Slack API\ndocumentation here: https://api.slack.com/docs/token-types

  • \n
  • asset_selection (AssetSelection) \u2013 The selection of assets which this sensor will monitor.\nAlerts will only be fired for assets that have a FreshnessPolicy defined.

  • \n
  • warn_after_minutes_overdue (float) \u2013 How many minutes past the specified FreshnessPolicy this\nsensor will wait before firing an alert (by default, an alert will be fired as soon as\nthe policy is violated).

  • \n
  • notify_when_back_on_time (bool) \u2013 If a success message should be sent when the asset becomes on\ntime again.

  • \n
  • text_fn (Optional(Callable[[FreshnessPolicySensorContext], str])) \u2013 Function which\ntakes in the FreshnessPolicySensorContext and outputs the message you want to send.\nDefaults to a text message that contains the relevant asset key, and the number of\nminutes past its defined freshness policy it currently is.\nThe usage of the text_fn changes depending on whether you\u2019re using blocks_fn. If you\nare using blocks_fn, this is used as a fallback string to display in notifications. If\nyou aren\u2019t, this is the main body text of the message. It can be formatted as plain text,\nor with markdown.\nSee more details in https://api.slack.com/methods/chat.postMessage#text_usage

  • \n
  • blocks_fn (Callable[[FreshnessPolicySensorContext], List[Dict]]) \u2013 Function which takes in\nthe FreshnessPolicySensorContext and outputs the message blocks you want to send.\nSee information about Blocks in https://api.slack.com/reference/block-kit/blocks

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cslack_on_freshness_policy\u201d.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your Dagit instance. Specify this to allow\nmessages to include deeplinks to the relevant asset page.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from Dagit or via the GraphQL API.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the relevant asset page.

  • \n
\n
\n
\n

Examples

\n
slack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n    "#my_channel",\n    os.getenv("MY_SLACK_TOKEN"),\n)\n
\n
\n
def my_message_fn(context: FreshnessPolicySensorContext) -> str:\n    if context.minutes_overdue == 0:\n        return f"Asset {context.asset_key} is currently on time :)"\n    return (\n        f"Asset {context.asset_key} is currently {context.minutes_overdue} minutes late!!"\n    )\n\nslack_on_freshness_policy = make_slack_on_freshness_policy_status_change_sensor(\n    channel="#my_channel",\n    slack_token=os.getenv("MY_SLACK_TOKEN"),\n    text_fn=my_message_fn,\n    webserver_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_failure HookDefinition[source]\u00b6
\n

Create a hook on step failure events that will message the given Slack channel.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the HookContext\nand outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_failure("#foo", webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} failed!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_failure("#foo", my_message_fn)})\n
\n
\n
\n\n
\n
\ndagster_slack.slack_on_success HookDefinition[source]\u00b6
\n

Create a hook on step success events that will message the given Slack channel.

\n
\n
Parameters:
\n
    \n
  • channel (str) \u2013 The channel to send the message to (e.g. \u201c#my_channel\u201d)

  • \n
  • message_fn (Optional(Callable[[HookContext], str])) \u2013 Function which takes in the HookContext\nand outputs the message you want to send.

  • \n
  • dagit_base_url \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use webserver_base_url instead.) (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your webserver instance. Specify this to allow\nmessages to include deeplinks to the specific run that triggered the hook.

  • \n
\n
\n
\n

Examples

\n
@slack_on_success("#foo", webserver_base_url="http://localhost:3000")\n@job(...)\ndef my_job():\n    pass\n
\n
\n
def my_message_fn(context: HookContext) -> str:\n    return f"Op {context.op} worked!"\n\n@op\ndef an_op(context):\n    pass\n\n@job(...)\ndef my_job():\n    an_op.with_hooks(hook_defs={slack_on_success("#foo", my_message_fn)})\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_slack.slack_resource ResourceDefinition[source]\u00b6
\n

This resource is for connecting to Slack.

\n

The resource object is a slack_sdk.WebClient.

\n

By configuring this Slack resource, you can post messages to Slack from any Dagster op, asset, schedule or sensor.

\n

Examples

\n
import os\n\nfrom dagster import job, op\nfrom dagster_slack import slack_resource\n\n\n@op(required_resource_keys={'slack'})\ndef slack_op(context):\n    context.resources.slack.chat_postMessage(channel='#noise', text=':wave: hey there!')\n\n@job(resource_defs={'slack': slack_resource})\ndef slack_job():\n    slack_op()\n\nslack_job.execute_in_process(\n    run_config={'resources': {'slack': {'config': {'token': os.getenv('SLACK_TOKEN')}}}}\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-slack", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-shell/", "title": "Shell (dagster-shell)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "N", "next"], ["sections/api/apidocs/libraries/dagster-shell", "Shell (dagster-shell)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-slack.rst.txt", "title": "Slack (dagster-slack)", "toc": "\n"}, "dagster-snowflake": {"alabaster_version": "0.7.13", "body": "
\n

Snowflake (dagster-snowflake)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n

Related Guides:

\n\n
\n

I/O Manager\u00b6

\n
\n
\ndagster_snowflake.SnowflakeIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

Base class for an IO manager definition that reads inputs from and writes outputs to Snowflake.

\n

Examples

\n
from dagster_snowflake import SnowflakeIOManager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MySnowflakeIOManager(SnowflakeIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the schema. For example,\nif the asset my_table had the key prefix ["snowflake", "my_schema"], the schema my_schema will be\nused. For ops, the schema can be specified by including a schema entry in output metadata. If schema is not provided\nvia config or on the asset/op, public will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n

Resource\u00b6

\n
\n
\ndagster_snowflake.SnowflakeResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account (Union[dagster.StringSource, None], optional):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the default role to use. After login, you can use USE ROLE to change the role.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the default warehouse to use. After login, you can use USE WAREHOUSE to change the warehouse.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. Alternately, set private_key_path and private_key_password. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

Raw private key password to use. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Raw private key path to use. See the Snowflake documentation for details. Alternately, set the raw private key as private_key.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
client_prefetch_threads (Union[dagster.IntSource, None], optional):
\n

Number of threads used to download the result sets (4 by default). Increasing the value improves fetch performance but requires more memory.

\n
\n
client_session_keep_alive (Union[dagster.BoolSource, None], optional):
\n

False by default. Set this to True to keep the session active indefinitely, even if there is no activity from the user. Make certain to call the close method to terminate the thread properly or the process may hang.

\n
\n
login_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for login. By default, 60 seconds. The login request gives up after the timeout length if the HTTP response is \u201csuccess\u201d.

\n
\n
network_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for all other operations. By default, none/infinite. A general request gives up after the timeout length if the HTTP response is not \u2018success\u2019.

\n
\n
ocsp_response_cache_filename (Union[dagster.StringSource, None], optional):
\n

URI for the OCSP response cache file. By default, the OCSP response cache file is created in the cache directory.

\n
\n
validate_default_parameters (Union[dagster.BoolSource, None], optional):
\n

If True, raise an exception if the warehouse, database, or schema doesn\u2019t exist. Defaults to False.

\n
\n
paramstyle (Union[dagster.StringSource, None], optional):
\n

pyformat by default for client side binding. Specify qmark or numeric to change bind variable formats for server side binding.

\n
\n
timezone (Union[dagster.StringSource, None], optional):
\n

None by default, which honors the Snowflake parameter TIMEZONE. Set to a valid time zone (e.g. America/Los_Angeles) to set the session time zone.

\n
\n
connector (Union[dagster.StringSource, None], optional):
\n

Indicate alternative database connection engine. Permissible option is \u2018sqlalchemy\u2019 otherwise defaults to use the Snowflake Connector for Python.

\n
\n
cache_column_metadata (Union[dagster.StringSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a flag cache_column_metadata=True such that all of column metadata for all tables are \u201ccached\u201d

\n
\n
numpy (Union[dagster.BoolSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. To enable fetching NumPy data types, add numpy=True to the connection parameters.

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

A resource for connecting to the Snowflake data warehouse.

\n

If connector configuration is not set, SnowflakeResource.get_connection() will return a\nsnowflake.connector.Connection\nobject. If connector=\u201csqlalchemy\u201d configuration is set, then SnowflakeResource.get_connection() will\nreturn a SQLAlchemy Connection\nor a SQLAlchemy raw connection.

\n

A simple example of loading data into Snowflake and subsequently querying that data is shown below:

\n

Examples

\n
from dagster import EnvVar, job, op\nfrom dagster_snowflake import SnowflakeResource\n\n@op\ndef get_one(snowflake_resource: SnowflakeResource):\n    with snowflake_resource.get_connection() as conn:\n        # conn is a snowflake.connector.Connection object\n        conn.cursor().execute("SELECT 1")\n\n@job\ndef my_snowflake_job():\n    get_one()\n\nmy_snowflake_job.execute_in_process(\n    resources={\n        'snowflake_resource': SnowflakeResource(\n            account=EnvVar("SNOWFLAKE_ACCOUNT"),\n            user=EnvVar("SNOWFLAKE_USER"),\n            password=EnvVar("SNOWFLAKE_PASSWORD"),\n            database="MY_DATABASE",\n            schema="MY_SCHEMA",\n            warehouse="MY_WAREHOUSE"\n        )\n    }\n)\n
\n
\n
\n\n
\n
\nclass dagster_snowflake.SnowflakeConnection(config, log, snowflake_connection_resource)[source]\u00b6
\n

A connection to Snowflake that can execute queries. In general this class should not be\ndirectly instantiated, but rather used as a resource in an op or asset via the\nsnowflake_resource().

\n

Note that the SnowflakeConnection is only used by the snowflake_resource. The Pythonic SnowflakeResource does\nnot use this SnowflakeConnection class.

\n
\n
\nexecute_queries(sql_queries, parameters=None, fetch_results=False, use_pandas_result=False)[source]\u00b6
\n

Execute multiple queries in Snowflake.

\n
\n
Parameters:
\n
    \n
  • sql_queries (Sequence[str]) \u2013 List of queries to be executed in series

  • \n
  • parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]) \u2013 Parameters to be passed to every query. See the\nSnowflake documentation\nfor more information.

  • \n
  • fetch_results (bool) \u2013 If True, will return the results of the queries as a list. Defaults to False. If True\nand use_pandas_result is also True, results will be returned as Pandas DataFrames.

  • \n
  • use_pandas_result (bool) \u2013 If True, will return the results of the queries as a list of Pandas DataFrames.\nDefaults to False. If fetch_results is False and use_pandas_result is True, an error will be\nraised.

  • \n
\n
\n
Returns:
\n

The results of the queries as a list if fetch_results or use_pandas_result is True,\notherwise returns None

\n
\n
\n

Examples

\n
@op\ndef create_fresh_database(snowflake: SnowflakeResource):\n    queries = ["DROP DATABASE IF EXISTS MY_DATABASE", "CREATE DATABASE MY_DATABASE"]\n    snowflake.execute_queries(\n        sql_queries=queries\n    )\n
\n
\n
\n\n
\n
\nexecute_query(sql, parameters=None, fetch_results=False, use_pandas_result=False)[source]\u00b6
\n

Execute a query in Snowflake.

\n
\n
Parameters:
\n
    \n
  • sql (str) \u2013 the query to be executed

  • \n
  • parameters (Optional[Union[Sequence[Any], Mapping[Any, Any]]]) \u2013 Parameters to be passed to the query. See the\nSnowflake documentation\nfor more information.

  • \n
  • fetch_results (bool) \u2013 If True, will return the result of the query. Defaults to False. If True\nand use_pandas_result is also True, results will be returned as a Pandas DataFrame.

  • \n
  • use_pandas_result (bool) \u2013 If True, will return the result of the query as a Pandas DataFrame.\nDefaults to False. If fetch_results is False and use_pandas_result is True, an error will be\nraised.

  • \n
\n
\n
Returns:
\n

The result of the query if fetch_results or use_pandas_result is True, otherwise returns None

\n
\n
\n

Examples

\n
@op\ndef drop_database(snowflake: SnowflakeResource):\n    snowflake.execute_query(\n        "DROP DATABASE IF EXISTS MY_DATABASE"\n    )\n
\n
\n
\n\n
\n
\nget_connection(raw_conn=True)[source]\u00b6
\n

Gets a connection to Snowflake as a context manager.

\n

If using the execute_query, execute_queries, or load_table_from_local_parquet methods,\nyou do not need to create a connection using this context manager.

\n
\n
Parameters:
\n

raw_conn (bool) \u2013 If using the sqlalchemy connector, you can set raw_conn to True to create a raw\nconnection. Defaults to True.

\n
\n
\n

Examples

\n
@op(\n    required_resource_keys={"snowflake"}\n)\ndef get_query_status(context, query_id):\n    with context.resources.snowflake.get_connection() as conn:\n        # conn is a Snowflake Connection object or a SQLAlchemy Connection if\n        # sqlalchemy is specified as the connector in the Snowflake Resource config\n\n        return conn.get_query_status(query_id)\n
\n
\n
\n\n
\n
\nload_table_from_local_parquet(src, table)[source]\u00b6
\n

Stores the content of a parquet file to a Snowflake table.

\n
\n
Parameters:
\n
    \n
  • src (str) \u2013 the name of the file to store in Snowflake

  • \n
  • table (str) \u2013 the name of the table to store the data. If the table does not exist, it will\nbe created. Otherwise the contents of the table will be replaced with the data in src

  • \n
\n
\n
\n

Examples

\n
import pandas as pd\nimport pyarrow as pa\nimport pyarrow.parquet as pq\n\n@op\ndef write_parquet_file(snowflake: SnowflakeResource):\n    df = pd.DataFrame({"one": [1, 2, 3], "ten": [11, 12, 13]})\n    table = pa.Table.from_pandas(df)\n    pq.write_table(table, "example.parquet")\n    snowflake.load_table_from_local_parquet(\n        src="example.parquet",\n        table="MY_TABLE"\n    )\n
\n
\n
\n\n
\n\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_snowflake.snowflake_op_for_query(sql, parameters=None)[source]\u00b6
\n

This function is an op factory that constructs an op to execute a snowflake query.

\n

Note that you can only use snowflake_op_for_query if you know the query you\u2019d like to\nexecute at graph construction time. If you\u2019d like to execute queries dynamically during\njob execution, you should manually execute those queries in your custom op using the\nsnowflake resource.

\n
\n
Parameters:
\n
    \n
  • sql (str) \u2013 The sql query that will execute against the provided snowflake resource.

  • \n
  • parameters (dict) \u2013 The parameters for the sql query.

  • \n
\n
\n
Returns:
\n

Returns the constructed op definition.

\n
\n
Return type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\n

Legacy\u00b6

\n
\n
\ndagster_snowflake.build_snowflake_io_manager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

Builds an IO manager definition that reads inputs from and writes outputs to Snowflake.

\n
\n
Parameters:
\n
    \n
  • type_handlers (Sequence[DbTypeHandler]) \u2013 Each handler defines how to translate between\nslices of Snowflake tables and an in-memory type - e.g. a Pandas DataFrame. If only\none DbTypeHandler is provided, it will be used as the default_load_type.

  • \n
  • default_load_type (Type) \u2013 When an input has no type annotation, load it as this type.

  • \n
\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake import build_snowflake_io_manager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\nsnowflake_io_manager = build_snowflake_io_manager([SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()])\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": snowflake_io_manager.configured({\n            "database": "my_database",\n            "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n            ...\n        })\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe IO Manager. For assets, the schema will be determined from the asset key,\nas shown in the above example. The final prefix before the asset name will be used as the schema. For example,\nif the asset my_table had the key prefix ["snowflake", "my_schema"], the schema my_schema will be\nused. For ops, the schema can be specified by including a schema entry in output metadata. If schema is not provided\nvia config or on the asset/op, public will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata columns to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\ndagster_snowflake.snowflake_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account (Union[dagster.StringSource, None], optional):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
database (Union[dagster.StringSource, None], optional):
\n

Name of the default database to use. After login, you can use USE DATABASE to change the database.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the default schema to use. After login, you can use USE SCHEMA to change the schema.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the default role to use. After login, you can use USE ROLE to change the role.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the default warehouse to use. After login, you can use USE WAREHOUSE to change the warehouse.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. Alternately, set private_key_path and private_key_password. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

Raw private key password to use. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Raw private key path to use. See the Snowflake documentation for details. Alternately, set the raw private key as private_key.

\n
\n
autocommit (Union[dagster.BoolSource, None], optional):
\n

None by default, which honors the Snowflake parameter AUTOCOMMIT. Set to True or False to enable or disable autocommit mode in the session, respectively.

\n
\n
client_prefetch_threads (Union[dagster.IntSource, None], optional):
\n

Number of threads used to download the result sets (4 by default). Increasing the value improves fetch performance but requires more memory.

\n
\n
client_session_keep_alive (Union[dagster.BoolSource, None], optional):
\n

False by default. Set this to True to keep the session active indefinitely, even if there is no activity from the user. Make certain to call the close method to terminate the thread properly or the process may hang.

\n
\n
login_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for login. By default, 60 seconds. The login request gives up after the timeout length if the HTTP response is \u201csuccess\u201d.

\n
\n
network_timeout (Union[dagster.IntSource, None], optional):
\n

Timeout in seconds for all other operations. By default, none/infinite. A general request gives up after the timeout length if the HTTP response is not \u2018success\u2019.

\n
\n
ocsp_response_cache_filename (Union[dagster.StringSource, None], optional):
\n

URI for the OCSP response cache file. By default, the OCSP response cache file is created in the cache directory.

\n
\n
validate_default_parameters (Union[dagster.BoolSource, None], optional):
\n

If True, raise an exception if the warehouse, database, or schema doesn\u2019t exist. Defaults to False.

\n
\n
paramstyle (Union[dagster.StringSource, None], optional):
\n

pyformat by default for client side binding. Specify qmark or numeric to change bind variable formats for server side binding.

\n
\n
timezone (Union[dagster.StringSource, None], optional):
\n

None by default, which honors the Snowflake parameter TIMEZONE. Set to a valid time zone (e.g. America/Los_Angeles) to set the session time zone.

\n
\n
connector (Union[dagster.StringSource, None], optional):
\n

Indicate alternative database connection engine. Permissible option is \u2018sqlalchemy\u2019 otherwise defaults to use the Snowflake Connector for Python.

\n
\n
cache_column_metadata (Union[dagster.StringSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. Snowflake SQLAlchemy takes a flag cache_column_metadata=True such that all of column metadata for all tables are \u201ccached\u201d

\n
\n
numpy (Union[dagster.BoolSource, None], optional):
\n

Optional parameter when connector is set to sqlalchemy. To enable fetching NumPy data types, add numpy=True to the connection parameters.

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

A resource for connecting to the Snowflake data warehouse. The returned resource object is an\ninstance of SnowflakeConnection.

\n

A simple example of loading data into Snowflake and subsequently querying that data is shown below:

\n

Examples

\n
from dagster import job, op\nfrom dagster_snowflake import snowflake_resource\n\n@op(required_resource_keys={'snowflake'})\ndef get_one(context):\n    context.resources.snowflake.execute_query('SELECT 1')\n\n@job(resource_defs={'snowflake': snowflake_resource})\ndef my_snowflake_job():\n    get_one()\n\nmy_snowflake_job.execute_in_process(\n    run_config={\n        'resources': {\n            'snowflake': {\n                'config': {\n                    'account': {'env': 'SNOWFLAKE_ACCOUNT'},\n                    'user': {'env': 'SNOWFLAKE_USER'},\n                    'password': {'env': 'SNOWFLAKE_PASSWORD'},\n                    'database': {'env': 'SNOWFLAKE_DATABASE'},\n                    'schema': {'env': 'SNOWFLAKE_SCHEMA'},\n                    'warehouse': {'env': 'SNOWFLAKE_WAREHOUSE'},\n                }\n            }\n        }\n    }\n)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-snowflake-pandas/", "title": "Snowflake with Pandas (dagster-snowflake-pandas)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-slack/", "title": "Slack (dagster-slack)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake-pandas", "Snowflake with Pandas (dagster-snowflake-pandas)", "N", "next"], ["sections/api/apidocs/libraries/dagster-slack", "Slack (dagster-slack)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake.rst.txt", "title": "Snowflake (dagster-snowflake)", "toc": "\n"}, "dagster-snowflake-pandas": {"alabaster_version": "0.7.13", "body": "
\n

Snowflake with Pandas (dagster-snowflake-pandas)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse and Pandas data processing library.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n

Related Guides:

\n\n
\n
\ndagster_snowflake_pandas.SnowflakePandasIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the SnowflakePandasIOManager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pandas import SnowflakePandasIOManager\nfrom dagster import asset, Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": SnowflakePandasIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_snowflake_pandas.SnowflakePandasTypeHandler[source]\u00b6
\n

Plugin for the Snowflake I/O Manager that can store and load Pandas DataFrames as Snowflake tables.

\n

Examples

\n
from dagster_snowflake import SnowflakeIOManager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MySnowflakeIOManager(SnowflakeIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), ...)\n    }\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_snowflake_pandas.snowflake_pandas_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes Pandas DataFrames to Snowflake. When\nusing the snowflake_pandas_io_manager, any inputs and outputs without type annotations will be loaded\nas Pandas DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pandas import snowflake_pandas_io_manager\nfrom dagster import asset, Definitions\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": snowflake_pandas_io_manager.configured({\n            "database": "my_database",\n            "account" : {"env": "SNOWFLAKE_ACCOUNT"}\n            ...\n        })\n    }\n)\n
\n
\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> pd.DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: pd.DataFrame) -> pd.DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake-pandas", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-snowflake-pyspark/", "title": "Snowflake with PySpark (dagster-snowflake-pyspark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake/", "title": "Snowflake (dagster-snowflake)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-snowflake-pyspark", "Snowflake with PySpark (dagster-snowflake-pyspark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake", "Snowflake (dagster-snowflake)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake-pandas.rst.txt", "title": "Snowflake with Pandas (dagster-snowflake-pandas)", "toc": "\n"}, "dagster-snowflake-pyspark": {"alabaster_version": "0.7.13", "body": "
\n

Snowflake with PySpark (dagster-snowflake-pyspark)\u00b6

\n

This library provides an integration with the Snowflake data\nwarehouse and PySpark data processing library.

\n

To use this library, you should first ensure that you have an appropriate Snowflake user configured to access\nyour data warehouse.

\n

Related Guides:

\n\n
\n
\ndagster_snowflake_pyspark.SnowflakePySparkIOManager IOManagerDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pyspark import SnowflakePySparkIOManager\nfrom pyspark.sql import DataFrame\nfrom dagster import Definitions, EnvVar\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": SnowflakePySparkIOManager(\n            database="my_database",\n            warehouse="my_warehouse", # required for SnowflakePySparkIOManager\n            account=EnvVar("SNOWFLAKE_ACCOUNT"),\n            password=EnvVar("SNOWFLAKE_PASSWORD"),\n            ...\n        )\n    }\n)\n
\n
\n

Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager

\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: DataFrame) -> DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\nclass dagster_snowflake_pyspark.SnowflakePySparkTypeHandler[source]\u00b6
\n

Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.

\n

Examples

\n
from dagster_snowflake import SnowflakeIOManager\nfrom dagster_snowflake_pandas import SnowflakePandasTypeHandler\nfrom dagster_snowflake_pyspark import SnowflakePySparkTypeHandler\nfrom dagster import Definitions, EnvVar\n\nclass MySnowflakeIOManager(SnowflakeIOManager):\n    @staticmethod\n    def type_handlers() -> Sequence[DbTypeHandler]:\n        return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> pd.DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)\n    }\n)\n
\n
\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_snowflake_pyspark.snowflake_pyspark_io_manager IOManagerDefinition\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
database (dagster.StringSource):
\n

Name of the database to use.

\n
\n
account (dagster.StringSource):
\n

Your Snowflake account name. For more details, see the Snowflake documentation.

\n
\n
user (dagster.StringSource):
\n

User login name.

\n
\n
schema (Union[dagster.StringSource, None], optional):
\n

Name of the schema to use.

\n
\n
password (Union[dagster.StringSource, None], optional):
\n

User password.

\n
\n
warehouse (Union[dagster.StringSource, None], optional):
\n

Name of the warehouse to use.

\n
\n
role (Union[dagster.StringSource, None], optional):
\n

Name of the role to use.

\n
\n
private_key (Union[dagster.StringSource, None], optional):
\n

Raw private key to use. See the Snowflake documentation for details. To avoid issues with newlines in the keys, you can base64 encode the key. You can retrieve the base64 encoded key with this shell command: cat rsa_key.p8 | base64

\n
\n
private_key_path (Union[dagster.StringSource, None], optional):
\n

Path to the private key. See the Snowflake documentation for details.

\n
\n
private_key_password (Union[dagster.StringSource, None], optional):
\n

The password of the private key. See the Snowflake documentation for details. Required for both private_key and private_key_path if the private key is encrypted. For unencrypted keys, this config can be omitted or set to None.

\n
\n
store_timestamps_as_strings (dagster.BoolSource, optional):
\n

If using Pandas DataFrames, whether to convert time data to strings. If True, time data will be converted to strings when storing the DataFrame and converted back to time data when loading the DataFrame. If False, time data without a timezone will be set to UTC timezone to avoid a Snowflake bug. Defaults to False.

\n

Default Value: False

\n
\n
authenticator (Union[dagster.StringSource, None], optional):
\n

Optional parameter to specify the authentication mechanism to use.

\n
\n
\n

An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When\nusing the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded\nas PySpark DataFrames.

\n
\n
Returns:
\n

IOManagerDefinition

\n
\n
\n

Examples

\n
from dagster_snowflake_pyspark import snowflake_pyspark_io_manager\nfrom pyspark.sql import DataFrame\nfrom dagster import Definitions\n\n@asset(\n    key_prefix=["my_schema"]  # will be used as the schema in snowflake\n)\ndef my_table() -> DataFrame:  # the name of the asset will be the table name\n    ...\n\ndefs = Definitions(\n    assets=[my_table],\n    resources={\n        "io_manager": snowflake_pyspark_io_manager.configured({\n            "database": "my_database",\n            "warehouse": "my_warehouse", # required for snowflake_pyspark_io_manager\n            "account" : {"env": "SNOWFLAKE_ACCOUNT"},\n            "password": {"env": "SNOWFLAKE_PASSWORD"},\n            ...\n        })\n    }\n)\n
\n
\n

Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager

\n

If you do not provide a schema, Dagster will determine a schema based on the assets and ops using\nthe I/O Manager. For assets, the schema will be determined from the asset key.\nFor ops, the schema can be specified by including a \u201cschema\u201d entry in output metadata. If \u201cschema\u201d is not provided\nvia config or on the asset/op, \u201cpublic\u201d will be used for the schema.

\n
@op(\n    out={"my_table": Out(metadata={"schema": "my_schema"})}\n)\ndef make_my_table() -> DataFrame:\n    # the returned value will be stored at my_schema.my_table\n    ...\n
\n
\n

To only use specific columns of a table as input to a downstream op or asset, add the metadata \u201ccolumns\u201d to the\nIn or AssetIn.

\n
@asset(\n    ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}\n)\ndef my_table_a(my_table: DataFrame) -> DataFrame:\n    # my_table will just contain the data from column "a"\n    ...\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-snowflake-pyspark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake-pandas/", "title": "Snowflake with Pandas (dagster-snowflake-pandas)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake-pandas", "Snowflake with Pandas (dagster-snowflake-pandas)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-snowflake-pyspark.rst.txt", "title": "Snowflake with PySpark (dagster-snowflake-pyspark)", "toc": "\n"}, "dagster-spark": {"alabaster_version": "0.7.13", "body": "
\n

Spark (dagster-spark)\u00b6

\n
\n
\nclass dagster_spark.SparkOpError[source]\u00b6
\n
\n\n
\n
\ndagster_spark.define_spark_config()[source]\u00b6
\n

Spark configuration.

\n
\n
See the Spark documentation for reference:

https://spark.apache.org/docs/latest/submitting-applications.html

\n
\n
\n
\n\n
\n
\ndagster_spark.create_spark_op(name, main_class, description=None, required_resource_keys=frozenset({'spark'}))[source]\u00b6
\n
\n\n
\n
\ndagster_spark.construct_spark_shell_command(application_jar, main_class, master_url=None, spark_conf=None, deploy_mode=None, application_arguments=None, spark_home=None)[source]\u00b6
\n

Constructs the spark-submit command for a Spark job.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_spark.spark_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-spark", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-snowflake-pyspark/", "title": "Snowflake with PySpark (dagster-snowflake-pyspark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "N", "next"], ["sections/api/apidocs/libraries/dagster-snowflake-pyspark", "Snowflake with PySpark (dagster-snowflake-pyspark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-spark.rst.txt", "title": "Spark (dagster-spark)", "toc": "\n"}, "dagster-ssh": {"alabaster_version": "0.7.13", "body": "
\n

SSH / SFTP (dagster-ssh)\u00b6

\n

This library provides an integration with SSH and SFTP.

\n
\n
\ndagster_ssh.ssh_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
remote_host (dagster.StringSource):
\n

remote host to connect to

\n
\n
remote_port (dagster.IntSource, optional):
\n

port of remote host to connect (Default is paramiko SSH_PORT)

\n

Default Value: 22

\n
\n
username (dagster.StringSource, optional):
\n

username to connect to the remote_host

\n
\n
password (dagster.StringSource, optional):
\n

password of the username to connect to the remote_host

\n
\n
key_file (dagster.StringSource, optional):
\n

key file to use to connect to the remote_host.

\n
\n
key_string (dagster.StringSource, optional):
\n

key string to use to connect to remote_host

\n
\n
timeout (dagster.IntSource, optional):
\n

timeout for the attempt to connect to the remote_host.

\n

Default Value: 10

\n
\n
keepalive_interval (dagster.IntSource, optional):
\n

send a keepalive packet to remote host every keepalive_interval seconds

\n

Default Value: 30

\n
\n
compress (dagster.BoolSource, optional):
\n

Default Value: True

\n
\n
no_host_key_check (dagster.BoolSource, optional):
\n

Default Value: True

\n
\n
allow_host_key_change (dagster.BoolSource, optional):
\n

[Deprecated]

\n

Default Value: False

\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-ssh", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-spark/", "title": "Spark (dagster-spark)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "N", "next"], ["sections/api/apidocs/libraries/dagster-spark", "Spark (dagster-spark)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-ssh.rst.txt", "title": "SSH / SFTP (dagster-ssh)", "toc": "\n"}, "dagster-twilio": {"alabaster_version": "0.7.13", "body": "
\n

Twilio (dagster-twilio)\u00b6

\n

This library provides an integration with Twilio.

\n
\n
\ndagster_twilio.TwilioResource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_sid (dagster.StringSource):
\n

Twilio Account SID, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
auth_token (dagster.StringSource):
\n

Twilio Authentication Token, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
\n

This resource is for connecting to Twilio.

\n
\n\n
\n

Legacy\u00b6

\n
\n
\ndagster_twilio.twilio_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
account_sid (dagster.StringSource):
\n

Twilio Account SID, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
auth_token (dagster.StringSource):
\n

Twilio Authentication Token, created with your Twilio account. This can be found on your Twilio dashboard, see https://www.twilio.com/blog/twilio-access-tokens-python

\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-twilio", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagstermill/", "title": "Dagstermill"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-ssh/", "title": "SSH / SFTP (dagster-ssh)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagstermill", "Dagstermill", "N", "next"], ["sections/api/apidocs/libraries/dagster-ssh", "SSH / SFTP (dagster-ssh)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-twilio.rst.txt", "title": "Twilio (dagster-twilio)", "toc": "\n"}, "dagster-wandb": {"alabaster_version": "0.7.13", "body": "
\n

Weights & Biases (dagster-wandb)\u00b6

\n

This library provides a Dagster integration with Weights & Biases.

\n

Use Dagster and Weights & Biases (W&B) to orchestrate your MLOps pipelines and maintain ML assets.

\n
\n

The integration with W&B makes it easy within Dagster to:

\n\n
\n

Useful links\u00b6

\n

For a complete set of documentation, see Dagster integration on the W&B website.

\n

For full-code examples, see examples/with_wandb in Dagster\u2019s GitHub repo.

\n
\n
\n

Resource\u00b6

\n
\n
\ndagster_wandb.wandb_resource ResourceDefinition[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
api_key (dagster.StringSource):
\n

W&B API key necessary to communicate with the W&B API.

\n
\n
host (String, optional):
\n

API host server you wish to use. Only required if you are using W&B Server.

\n

Default Value: \u2018https://api.wandb.ai\u2019

\n
\n
\n

Dagster resource used to communicate with the W&B API. It\u2019s useful when you want to use the\nwandb client within your ops and assets. It\u2019s a required resource if you are using the W&B IO\nManager.

\n

It automatically authenticates using the provided API key.

\n

For a complete set of documentation, see Dagster integration.

\n

To configure this resource, we recommend using the configured method.

\n

Example:

\n
from dagster import job\nfrom dagster_wandb import wandb_resource\n\nmy_wandb_resource = wandb_resource.configured({"api_key": {"env": "WANDB_API_KEY"}})\n\n@job(resource_defs={"wandb_resource": my_wandb_resource})\ndef my_wandb_job():\n    ...\n
\n
\n
\n\n
\n
\n

I/O Manager\u00b6

\n
\n
\ndagster_wandb.wandb_artifacts_io_manager IOManager[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
run_name (String, optional):
\n

Short display name for this run, which is how you\u2019ll identify this run in the UI. By default, it\u2019s set to a string with the following format dagster-run-[8 first characters of the Dagster Run ID] e.g. dagster-run-7e4df022.

\n
\n
run_id (String, optional):
\n

Unique ID for this run, used for resuming. It must be unique in the project, and if you delete a run you can\u2019t reuse the ID. Use the name field for a short descriptive name, or config for saving hyperparameters to compare across runs. The ID cannot contain the following special characters: /#?%:.. You need to set the Run ID when you are doing experiment tracking inside Dagster to allow the IO Manager to resume the run. By default, it\u2019s set to the Dagster Run ID, e.g. 7e4df022-1bf2-44b5-a383-bb852df4077e.

\n
\n
run_tags (List[String], optional):
\n

A list of strings, which will populate the list of tags on this run in the UI. Tags are useful for organizing runs together, or applying temporary labels like \u2018baseline\u2019 or \u2018production\u2019. It\u2019s easy to add and remove tags in the UI, or filter down to just runs with a specific tag. Any W&B Run used by the integration will have the dagster_wandb tag.

\n
\n
base_dir (String, optional):
\n

Base directory used for local storage and caching. W&B Artifacts and W&B Run logs will be written and read from that directory. By default, it\u2019s using the DAGSTER_HOME directory.

\n
\n
cache_duration_in_minutes (Int, optional):
\n

Defines the amount of time W&B Artifacts and W&B Run logs should be kept in the local storage. Only files and directories that were not opened for that amount of time are removed from the cache. Cache purging happens at the end of an IO Manager execution. You can set it to 0, if you want to disable caching completely. Caching improves speed when an Artifact is reused between jobs running on the same machine. It defaults to 30 days.

\n
\n
\n

Dagster IO Manager to create and consume W&B Artifacts.

\n

It allows any Dagster @op or @asset to create and consume W&B Artifacts natively.

\n

For a complete set of documentation, see Dagster integration.

\n

Example:

\n
@repository\ndef my_repository():\n    return [\n        *with_resources(\n            load_assets_from_current_module(),\n            resource_defs={\n                "wandb_config": make_values_resource(\n                    entity=str,\n                    project=str,\n                ),\n                "wandb_resource": wandb_resource.configured(\n                    {"api_key": {"env": "WANDB_API_KEY"}}\n                ),\n                "wandb_artifacts_manager": wandb_artifacts_io_manager.configured(\n                    {"cache_duration_in_minutes": 60} # only cache files for one hour\n                ),\n            },\n            resource_config_by_key={\n                "wandb_config": {\n                    "config": {\n                        "entity": "my_entity",\n                        "project": "my_project"\n                    }\n                }\n            },\n        ),\n    ]\n\n\n@asset(\n    name="my_artifact",\n    metadata={\n        "wandb_artifact_configuration": {\n            "type": "dataset",\n        }\n    },\n    io_manager_key="wandb_artifacts_manager",\n)\ndef create_dataset():\n    return [1, 2, 3]\n
\n
\n
\n\n
\n

Config\u00b6

\n
\n
\nclass dagster_wandb.WandbArtifactConfiguration[source]\u00b6
\n

W&B Artifacts IO Manager configuration. Useful for type checking.

\n
\n\n
\n
\nclass dagster_wandb.SerializationModule[source]\u00b6
\n

W&B Artifacts IO Manager configuration of the serialization module. Useful for type checking.

\n
\n\n
\n
\n

Errors\u00b6

\n
\n
\nexception dagster_wandb.WandbArtifactsIOManagerError(message='A W&B Artifacts IO Manager error occurred.')[source]\u00b6
\n

Represents an execution error of the W&B Artifacts IO Manager.

\n
\n\n
\n
\n
\n

Ops\u00b6

\n
\n
\ndagster_wandb.run_launch_agent(context)[source]\u00b6
\n

It starts a Launch Agent and runs it as a long running process until stopped manually.

\n

Agents are processes that poll launch queues and execute the jobs (or dispatch them to external\nservices to be executed) in order.

\n

Example:

\n
# config.yaml\n\nresources:\n  wandb_config:\n    config:\n      entity: my_entity\n      project: my_project\nops:\n  run_launch_agent:\n    config:\n      max_jobs: -1\n      queues:\n        - my_dagster_queue\n
\n
\n
from dagster_wandb.launch.ops import run_launch_agent\nfrom dagster_wandb.resources import wandb_resource\n\nfrom dagster import job, make_values_resource\n\n\n@job(\n    resource_defs={\n        "wandb_config": make_values_resource(\n            entity=str,\n            project=str,\n        ),\n        "wandb_resource": wandb_resource.configured(\n            {"api_key": {"env": "WANDB_API_KEY"}}\n        ),\n    },\n)\ndef run_launch_agent_example():\n    run_launch_agent()\n
\n
\n
\n\n
\n
\ndagster_wandb.run_launch_job(context)[source]\u00b6
\n

Executes a Launch job.

\n

A Launch job is assigned to a queue in order to be executed. You can create a queue or use the\ndefault one. Make sure you have an active agent listening to that queue. You can run an agent\ninside your Dagster instance but can also consider using a deployable agent in Kubernetes.

\n

Example:

\n
# config.yaml\n\nresources:\n  wandb_config:\n    config:\n      entity: my_entity\n      project: my_project\nops:\n  my_launched_job:\n    config:\n      entry_point:\n        - python\n        - train.py\n      queue: my_dagster_queue\n      uri: https://github.com/wandb/example-dagster-integration-with-launch\n
\n
\n
from dagster_wandb.launch.ops import run_launch_job\nfrom dagster_wandb.resources import wandb_resource\n\nfrom dagster import job, make_values_resource\n\n\n@job(\n    resource_defs={\n        "wandb_config": make_values_resource(\n            entity=str,\n            project=str,\n        ),\n        "wandb_resource": wandb_resource.configured(\n            {"api_key": {"env": "WANDB_API_KEY"}}\n        ),\n    },\n)\ndef run_launch_job_example():\n    run_launch_job.alias("my_launched_job")() # we rename the job with an alias\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagster-wandb", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": null, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-graphql/", "title": "GraphQL (dagster-graphql)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-graphql", "GraphQL (dagster-graphql)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagster-wandb.rst.txt", "title": "Weights & Biases (dagster-wandb)", "toc": "\n"}, "dagstermill": {"alabaster_version": "0.7.13", "body": "
\n

Dagstermill\u00b6

\n

This library provides an integration with papermill to allow you to run Jupyter notebooks with Dagster.

\n

Related Guides:

\n\n
\n
\ndagstermill.define_dagstermill_asset(name, notebook_path, key_prefix=None, ins=None, deps=None, metadata=None, config_schema=None, required_resource_keys=None, resource_defs=None, description=None, partitions_def=None, op_tags=None, group_name=None, io_manager_key=None, retry_policy=None, save_notebook_on_failure=False, non_argument_deps=None)[source]\u00b6
\n

Creates a Dagster asset for a Jupyter notebook.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name for the asset

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook

  • \n
  • key_prefix (Optional[Union[str, Sequence[str]]]) \u2013 If provided, the asset\u2019s key is the\nconcatenation of the key_prefix and the asset\u2019s name, which defaults to the name of\nthe decorated function. Each item in key_prefix must be a valid name in dagster (ie only\ncontains letters, numbers, and _) and may not contain python reserved keywords.

  • \n
  • ins (Optional[Mapping[str, AssetIn]]) \u2013 A dictionary that maps input names to information\nabout the input.

  • \n
  • deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, AssetKey, str]]]) \u2013 The assets\nthat are upstream dependencies, but do not pass an input value to the notebook.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The configuration schema for the asset\u2019s underlying\nop. If set, Dagster will check that config provided for the op matches this schema and fail\nif it does not. If not set, Dagster will accept any config provided for the op.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of metadata entries for the asset.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by the notebook.

  • \n
  • description (Optional[str]) \u2013 Description of the asset to display in the Dagster UI.

  • \n
  • partitions_def (Optional[PartitionsDefinition]) \u2013 Defines the set of partition keys that\ncompose the asset.

  • \n
  • op_tags (Optional[Dict[str, Any]]) \u2013 A dictionary of tags for the op that computes the asset.\nFrameworks may expect and require certain metadata to be attached to an op. Values that\nare not strings will be json encoded and must meet the criteria that\njson.loads(json.dumps(value)) == value.

  • \n
  • group_name (Optional[str]) \u2013 A string name used to organize multiple assets into groups. If not provided,\nthe name \u201cdefault\u201d is used.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 (Experimental) A mapping of resource keys to resource definitions. These resources\nwill be initialized during execution, and can be accessed from the\ncontext within the notebook.

  • \n
  • io_manager_key (Optional[str]) \u2013 A string key for the IO manager used to store the output notebook.\nIf not provided, the default key output_notebook_io_manager will be used.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for the op that computes the asset.

  • \n
  • save_notebook_on_failure (bool) \u2013 If True and the notebook fails during execution, the failed notebook will be\nwritten to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\nDefaults to False.

  • \n
  • non_argument_deps (Optional[Union[Set[AssetKey], Set[str]]]) \u2013 Deprecated, use deps instead. Set of asset keys that are\nupstream dependencies, but do not pass an input to the asset.

  • \n
\n
\n
\n

Examples

\n
from dagstermill import define_dagstermill_asset\nfrom dagster import asset, AssetIn, AssetKey\nfrom sklearn import datasets\nimport pandas as pd\nimport numpy as np\n\n@asset\ndef iris_dataset():\n    sk_iris = datasets.load_iris()\n    return pd.DataFrame(\n        data=np.c_[sk_iris["data"], sk_iris["target"]],\n        columns=sk_iris["feature_names"] + ["target"],\n    )\n\niris_kmeans_notebook = define_dagstermill_asset(\n    name="iris_kmeans_notebook",\n    notebook_path="/path/to/iris_kmeans.ipynb",\n    ins={\n        "iris": AssetIn(key=AssetKey("iris_dataset"))\n    }\n)\n
\n
\n
\n\n
\n
\ndagstermill.define_dagstermill_op(name, notebook_path, ins=None, outs=None, config_schema=None, required_resource_keys=None, output_notebook_name=None, asset_key_prefix=None, description=None, tags=None, io_manager_key=None, save_notebook_on_failure=False)[source]\u00b6
\n

Wrap a Jupyter notebook in an op.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the op.

  • \n
  • notebook_path (str) \u2013 Path to the backing notebook.

  • \n
  • ins (Optional[Mapping[str, In]]) \u2013 The op\u2019s inputs.

  • \n
  • outs (Optional[Mapping[str, Out]]) \u2013 The op\u2019s outputs. Your notebook should\ncall yield_result() to yield each of these outputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The string names of any required resources.

  • \n
  • output_notebook_name (Optional[str]) \u2013 If set, will be used as the name of an injected output\nof type BufferedIOBase that is the file object of the executed\nnotebook (in addition to the AssetMaterialization that is always\ncreated). It allows the downstream ops to access the executed notebook via a file\nobject.

  • \n
  • asset_key_prefix (Optional[Union[List[str], str]]) \u2013 If set, will be used to prefix the\nasset keys for materialized notebooks.

  • \n
  • description (Optional[str]) \u2013 If set, description used for op.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 If set, additional tags used to annotate op.\nDagster uses the tag keys notebook_path and kind, which cannot be\noverwritten by the user.

  • \n
  • io_manager_key (Optional[str]) \u2013 If using output_notebook_name, you can additionally provide\na string key for the IO manager used to store the output notebook.\nIf not provided, the default key output_notebook_io_manager will be used.

  • \n
  • save_notebook_on_failure (bool) \u2013 If True and the notebook fails during execution, the failed notebook will be\nwritten to the Dagster storage directory. The location of the file will be printed in the Dagster logs.\nDefaults to False.

  • \n
\n
\n
Returns:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\nclass dagstermill.ConfigurableLocalOutputNotebookIOManager(*, base_dir=None, asset_key_prefix=[])[source]\u00b6
\n

Built-in IO Manager for handling output notebooks.

\n
\n\n
\n
\ndagstermill.get_context(op_config=None, resource_defs=None, logger_defs=None, run_config=None)\u00b6
\n

Get a dagstermill execution context for interactive exploration and development.

\n
\n
Parameters:
\n
    \n
  • op_config (Optional[Any]) \u2013 If specified, this value will be made available on the\ncontext as its op_config property.

  • \n
  • resource_defs (Optional[Mapping[str, ResourceDefinition]]) \u2013 Specifies resources to provide to context.

  • \n
  • logger_defs (Optional[Mapping[str, LoggerDefinition]]) \u2013 Specifies loggers to provide to context.

  • \n
  • run_config (Optional[dict]) \u2013 The config dict with which to construct\nthe context.

  • \n
\n
\n
Returns:
\n

DagstermillExecutionContext

\n
\n
\n
\n\n
\n
\ndagstermill.yield_event(dagster_event)\u00b6
\n

Yield a dagster event directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
\n
Parameters:
\n

dagster_event (Union[dagster.AssetMaterialization, dagster.ExpectationResult, dagster.TypeCheck, dagster.Failure, dagster.RetryRequested]) \u2013 An event to yield back to Dagster.

\n
\n
\n
\n\n
\n
\ndagstermill.yield_result(value, output_name='result')\u00b6
\n

Yield a result directly from notebook code.

\n

When called interactively or in development, returns its input.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value to yield.

  • \n
  • output_name (Optional[str]) \u2013 The name of the result to yield (default: 'result').

  • \n
\n
\n
\n
\n\n
\n
\nclass dagstermill.DagstermillExecutionContext(job_context, job_def, resource_keys_to_init, op_name, node_handle, op_config=None)[source]\u00b6
\n

Dagstermill-specific execution context.

\n

Do not initialize directly: use dagstermill.get_context().

\n
\n
\nproperty job_def\u00b6
\n

The job definition for the context.

\n

This will be a dagstermill-specific shim.

\n
\n
Type:
\n

dagster.JobDefinition

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the executing job.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty logging_tags\u00b6
\n

The logging tags for the context.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\nproperty op_config\u00b6
\n

A dynamically-created type whose properties allow access to\nop-specific config.

\n
\n
Type:
\n

collections.namedtuple

\n
\n
\n
\n\n
\n
\nproperty op_def\u00b6
\n

The op definition for the context.

\n

In interactive contexts, this may be a dagstermill-specific shim, depending on whether an\nop definition was passed to dagstermill.get_context.

\n
\n
Type:
\n

dagster.OpDefinition

\n
\n
\n
\n\n
\n
\nproperty run\u00b6
\n

The job run for the context.

\n
\n
Type:
\n

dagster.DagsterRun

\n
\n
\n
\n\n
\n
\nproperty run_config\u00b6
\n

The run_config for the context.

\n
\n
Type:
\n

dict

\n
\n
\n
\n\n
\n
\nproperty run_id\u00b6
\n

The run_id for the context.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagstermill.DagstermillError[source]\u00b6
\n

Base class for errors raised by dagstermill.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/libraries/dagstermill", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dagster-graphql/", "title": "GraphQL (dagster-graphql)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dagster-twilio/", "title": "Twilio (dagster-twilio)"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-graphql", "GraphQL (dagster-graphql)", "N", "next"], ["sections/api/apidocs/libraries/dagster-twilio", "Twilio (dagster-twilio)", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/libraries/dagstermill.rst.txt", "title": "Dagstermill", "toc": "\n"}}, "loggers": {"alabaster_version": "0.7.13", "body": "
\n

Loggers\u00b6

\n
\n

Built-in loggers\u00b6

\n
\n
\ndagster._loggers.colored_console_logger(*args, **kwargs)\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\ndagster._loggers.json_console_logger(*args, **kwargs)\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n\n
\n
\n

Logging from an @op\u00b6

\n
\n
\nclass dagster.DagsterLogManager(dagster_handler, level=0, managed_loggers=None)[source]\u00b6
\n

Centralized dispatch for logging from user code.

\n

Handles the construction of uniform structured log messages and passes them through to the\nunderlying loggers/handlers.

\n

An instance of the log manager is made available to ops as context.log. Users should not\ninitialize instances of the log manager directly. To configure custom loggers, set the\nlogger_defs argument in an @job decorator or when calling the to_job() method on a\nGraphDefinition.

\n

The log manager inherits standard convenience methods like those exposed by the Python standard\nlibrary python:logging module (i.e., within the body of an op,\ncontext.log.{debug, info, warning, warn, error, critical, fatal}).

\n

The underlying integer API can also be called directly using, e.g.\ncontext.log.log(5, msg), and the log manager will delegate to the log method\ndefined on each of the loggers it manages.

\n

User-defined custom log levels are not supported, and calls to, e.g.,\ncontext.log.trace or context.log.notice will result in hard exceptions at runtime.

\n
\n\n
\n
\n

Defining custom loggers\u00b6

\n
\n
\n@dagster.logger(config_schema=None, description=None)[source]\u00b6
\n

Define a logger.

\n

The decorated function should accept an InitLoggerContext and return an instance of\npython:logging.Logger. This function will become the logger_fn of an underlying\nLoggerDefinition.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the logger.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.LoggerDefinition(logger_fn, config_schema=None, description=None)[source]\u00b6
\n

Core class for defining loggers.

\n

Loggers are job-scoped logging handlers, which will be automatically invoked whenever\ndagster messages are logged from within a job.

\n
\n
Parameters:
\n
    \n
  • logger_fn (Callable[[InitLoggerContext], logging.Logger]) \u2013 User-provided function to\ninstantiate the logger. This logger will be automatically invoked whenever the methods\non context.log are called from within job compute logic.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.logger_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this logger.

  • \n
\n
\n
\n
\n
\nproperty config_schema\u00b6
\n

The schema for the logger\u2019s config. Configuration data available in init_context.logger_config.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty description\u00b6
\n

A human-readable description of the logger.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty logger_fn\u00b6
\n

The function that will be invoked to\ninstantiate the logger.

\n
\n
Type:
\n

Callable[[InitLoggerContext], logging.Logger]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.InitLoggerContext(logger_config, logger_def=None, job_def=None, run_id=None)[source]\u00b6
\n

The context object available as the argument to the initialization function of a dagster.LoggerDefinition.

\n

Users should not instantiate this object directly. To construct an\nInitLoggerContext for testing purposes, use dagster.\nbuild_init_logger_context().

\n

Example

\n
from dagster import logger, InitLoggerContext\n\n@logger\ndef hello_world(init_context: InitLoggerContext):\n    ...\n
\n
\n
\n
\nproperty logger_config\u00b6
\n

The configuration data provided by the run config. The\nschema for this data is defined by config_schema on the LoggerDefinition.

\n
\n\n
\n
\nproperty logger_def\u00b6
\n

The logger definition for the logger being constructed.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The ID for this run of the job.

\n
\n\n
\n\n
\n
\ndagster.build_init_logger_context(logger_config=None, job_def=None)[source]\u00b6
\n

Builds logger initialization context from provided parameters.

\n

This function can be used to provide the context argument to the invocation of a logger\ndefinition.

\n

Note that you may only specify one of pipeline_def and job_def.

\n
\n
Parameters:
\n
    \n
  • logger_config (Any) \u2013 The config to provide during initialization of logger.

  • \n
  • job_def (Optional[JobDefinition]) \u2013 The job definition that the logger will be used with.

  • \n
\n
\n
\n

Examples

\n
context = build_init_logger_context()\nlogger_to_init(context)\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/loggers", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../ops/", "title": "Ops"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../jobs/", "title": "Jobs"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/ops", "Ops", "N", "next"], ["sections/api/apidocs/jobs", "Jobs", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/loggers.rst.txt", "title": "Loggers", "toc": "\n"}, "memoization": {"alabaster_version": "0.7.13", "body": "
\n

Job-Level Versioning and Memoization (Deprecated)\u00b6

\n

Dagster has deprecated functionality that allows for job-level code versioning and memoization of previous op outputs based upon that versioning.

\n

This is currently deprecated in favor of asset versioning.

\n
\n

Versioning\u00b6

\n
\n
\nclass dagster.VersionStrategy[source]\u00b6
\n

Abstract class for defining a strategy to version ops and resources.

\n

When subclassing, get_op_version must be implemented, and\nget_resource_version can be optionally implemented.

\n

get_op_version should ingest an OpVersionContext, and get_resource_version should ingest a\nResourceVersionContext. From that, each synthesizes a unique string called\na version, which will\nbe tagged to outputs of that op in the job. Providing a\nVersionStrategy instance to a\njob will enable memoization on that job, such that only steps whose\noutputs do not have an up-to-date version will run.

\n
\n
\nabstract get_op_version(context)[source]\u00b6
\n

Computes a version for an op.

\n
\n
Parameters:
\n

context (OpVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

The version for the op.

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n
\nget_resource_version(context)[source]\u00b6
\n

Computes a version for a resource.

\n
\n
Parameters:
\n

context (ResourceVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

\n
The version for the resource. If None, the resource will not be

memoized.

\n
\n
\n

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SourceHashVersionStrategy[source]\u00b6
\n

VersionStrategy that checks for changes to the source code of ops and resources.

\n

Only checks for changes within the immediate body of the op/resource\u2019s\ndecorated function (or compute function, if the op/resource was\nconstructed directly from a definition).

\n
\n
\nget_op_version(context)[source]\u00b6
\n

Computes a version for an op by hashing its source code.

\n
\n
Parameters:
\n

context (OpVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

The version for the op.

\n
\n
Return type:
\n

str

\n
\n
\n
\n\n
\n
\nget_resource_version(context)[source]\u00b6
\n

Computes a version for a resource by hashing its source code.

\n
\n
Parameters:
\n

context (ResourceVersionContext) \u2013 The context for computing the version.

\n
\n
Returns:
\n

\n
The version for the resource. If None, the resource will not be

memoized.

\n
\n
\n

\n
\n
Return type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.OpVersionContext(op_def, op_config)[source]\u00b6
\n

Provides execution-time information for computing the version for an op.

\n
\n
\nop_def\u00b6
\n

The definition of the op to compute a version for.

\n
\n
Type:
\n

OpDefinition

\n
\n
\n
\n\n
\n
\nop_config\u00b6
\n

The parsed config to be passed to the op during execution.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.ResourceVersionContext(resource_def, resource_config)[source]\u00b6
\n

Provides execution-time information for computing the version for a resource.

\n
\n
\nresource_def\u00b6
\n

The definition of the resource whose version will be computed.

\n
\n
Type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\nresource_config\u00b6
\n

The parsed config to be passed to the resource during execution.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\n

Memoization\u00b6

\n
\n
\nclass dagster.MemoizableIOManager[source]\u00b6
\n

Base class for IO managers enabled to work with memoized execution. Users should implement\nthe load_input and handle_output methods described in the IOManager API, and the\nhas_output method, which returns a boolean representing whether a data object can be found.

\n
\n
\nabstract has_output(context)[source]\u00b6
\n

The user-defined method that returns whether data exists given the metadata.

\n
\n
Parameters:
\n

context (OutputContext) \u2013 The context of the step performing this check.

\n
\n
Returns:
\n

True if there is data present that matches the provided context. False otherwise.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n\n

See also: dagster.IOManager.

\n
\n
\ndagster.MEMOIZED_RUN_TAG\u00b6
\n

Provide this tag to a run to toggle memoization on or off. {MEMOIZED_RUN_TAG: "true"} toggles memoization on, while {MEMOIZED_RUN_TAG: "false"} toggles memoization off.

\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/memoization", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../libraries/dagster-airbyte/", "title": "Airbyte (dagster-airbyte)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../utilities/", "title": "Utilities"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/libraries/dagster-airbyte", "Airbyte (dagster-airbyte)", "N", "next"], ["sections/api/apidocs/utilities", "Utilities", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/memoization.rst.txt", "title": "Job-Level Versioning and Memoization (Deprecated)", "toc": "\n"}, "ops": {"alabaster_version": "0.7.13", "body": "
\n

Ops\u00b6

\n

The foundational unit of computation in Dagster.

\n
\n
\n

Defining ops\u00b6

\n
\n
\n@dagster.op(compute_fn=None, *, name=None, description=None, ins=None, out=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None, code_version=None)[source]\u00b6
\n

Create an op with the specified parameters from the decorated function.

\n

Ins and outs will be inferred from the type signature of the decorated function\nif not explicitly provided.

\n

The decorated function will be used as the op\u2019s compute function. The signature of the\ndecorated function is more flexible than that of the compute_fn in the core API; it may:

\n
    \n
  1. Return a value. This value will be wrapped in an Output and yielded by the compute function.

  2. \n
  3. Return an Output. This output will be yielded by the compute function.

  4. \n
  5. Yield Output or other event objects. Same as default compute behavior.

  6. \n
\n

Note that options 1) and 2) are incompatible with yielding other events \u2013 if you would like\nto decorate a function that yields events, it must also wrap its eventual output in an\nOutput and yield it.

\n

@op supports async def functions as well, including async generators when yielding multiple\nevents or outputs. Note that async ops will generally be run on their own unless using a custom\nExecutor implementation that supports running them together.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 Name of op. Must be unique within any GraphDefinition\nusing the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of this op. If not provided, and\nthe decorated function has a docstring, that docstring will be used as the description.

  • \n
  • ins (Optional[Dict[str, In]]) \u2013 Information about the inputs to the op. Information provided here will be combined\nwith what can be inferred from the function signature.

  • \n
  • out (Optional[Union[Out, Dict[str, Out]]]) \u2013 Information about the op outputs. Information provided here will be combined with\nwhat can be inferred from the return type signature if the function does not use yield.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the op matches this schema and fail if it does not. If not\nset, Dagster will accept any config provided for the op.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Values that are not strings\nwill be json encoded and must meet the criteria that json.loads(json.dumps(value)) == value.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the logic encapsulated by the op. If set,\nthis is used as a default version for all outputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
\n
\n
\n

Examples

\n
@op\ndef hello_world():\n    print('hello')\n\n@op\ndef echo(msg: str) -> str:\n    return msg\n\n@op(\n    ins={'msg': In(str)},\n    out=Out(str)\n)\ndef echo_2(msg): # same as above\n    return msg\n\n@op(\n    out={'word': Out(), 'num': Out()}\n)\ndef multi_out() -> Tuple[str, int]:\n    return 'cool', 4\n
\n
\n
\n\n
\n
\nclass dagster.OpDefinition(compute_fn, name, ins=None, outs=None, description=None, config_schema=None, required_resource_keys=None, tags=None, version=None, retry_policy=None, code_version=None)[source]\u00b6
\n

Defines an op, the functional unit of user-defined computation.

\n

For more details on what an op is, refer to the\nOps Overview.

\n

End users should prefer the @op decorator. OpDefinition is generally intended to be\nused by framework authors or for programmatically generated ops.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 Name of the op. Must be unique within any GraphDefinition or\nJobDefinition that contains the op.

  • \n
  • input_defs (List[InputDefinition]) \u2013 Inputs of the op.

  • \n
  • compute_fn (Callable) \u2013

    The core of the op, the function that performs the actual\ncomputation. The signature of this function is determined by input_defs, and\noptionally, an injected first argument, context, a collection of information\nprovided by the system.

    \n

    This function will be coerced into a generator or an async generator, which must yield\none Output for each of the op\u2019s output_defs, and additionally may\nyield other types of Dagster events, including AssetMaterialization and\nExpectationResult.

    \n

  • \n
  • output_defs (List[OutputDefinition]) \u2013 Outputs of the op.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat the config provided for the op matches this schema and will fail if it does not. If\nnot set, Dagster will accept any config provided for the op.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the op.

  • \n
  • tags (Optional[Dict[str, Any]]) \u2013 Arbitrary metadata for the op. Frameworks may\nexpect and require certain metadata to be attached to an op. Users should generally\nnot set metadata directly. Values that are not strings will be json encoded and must meet\nthe criteria that json.loads(json.dumps(value)) == value.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Set of resource handles required by this op.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code encapsulated by the op. If set,\nthis is used as a default code version for all outputs.

  • \n
  • retry_policy (Optional[RetryPolicy]) \u2013 The retry policy for this op.

  • \n
\n
\n
\n

Examples

\n
def _add_one(_context, inputs):\n    yield Output(inputs["num"] + 1)\n\nOpDefinition(\n    name="add_one",\n    ins={"num": In(int)},\n    outs={"result": Out(int)},\n    compute_fn=_add_one,\n)\n
\n
\n
\n
\nalias(name)[source]\u00b6
\n

Creates a copy of this op with the given name.

\n
\n\n
\n
\nproperty config_schema\u00b6
\n

The config schema for this op.

\n
\n
Type:
\n

IDefinitionConfigSchema

\n
\n
\n
\n\n
\n
\nproperty ins\u00b6
\n

A mapping from input name to the In object that represents that input.

\n
\n
Type:
\n

Mapping[str, In]

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of this op.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty outs\u00b6
\n

A mapping from output name to the Out object that represents that output.

\n
\n
Type:
\n

Mapping[str, Out]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

A set of keys for resources that must be provided to this OpDefinition.

\n
\n
Type:
\n

AbstractSet[str]

\n
\n
\n
\n\n
\n
\nproperty retry_policy\u00b6
\n

The RetryPolicy for this op.

\n
\n
Type:
\n

Optional[RetryPolicy]

\n
\n
\n
\n\n
\n
\ntag(tags)[source]\u00b6
\n

Creates a copy of this op with the given tags.

\n
\n\n
\n
\nproperty tags\u00b6
\n

The tags for this op.

\n
\n
Type:
\n

Mapping[str, str]

\n
\n
\n
\n\n
\n
\nproperty version\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use code_version instead.\n \n

\n

Version of the code encapsulated by the op. If set, this is used as a\ndefault code version for all outputs.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nwith_hooks(hook_defs)[source]\u00b6
\n

Creates a copy of this op with the given hook definitions.

\n
\n\n
\n
\nwith_retry_policy(retry_policy)[source]\u00b6
\n

Creates a copy of this op with the given retry policy.

\n
\n\n
\n\n
\n
\n
\n

Ins & outs\u00b6

\n
\n
\nclass dagster.In(dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, default_value=<class 'dagster._core.definitions.utils.NoValueSentinel'>, metadata=None, asset_key=None, asset_partitions=None, input_manager_key=None)[source]\u00b6
\n

Defines an argument to an op\u2019s compute function.

\n

Inputs may flow from previous op\u2019s outputs, or be stubbed using config. They may optionally\nbe typed using the Dagster type system.

\n
\n
Parameters:
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]) \u2013 The type of this input. Should only be set if the correct type cannot\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the input.

  • \n
  • default_value (Optional[Any]) \u2013 The default value to use if no input is provided.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 A dict of metadata for the input.

  • \n
  • asset_key (Optional[Union[AssetKey, InputContext -> AssetKey]]) \u2013 (Experimental) An AssetKey\n(or function that produces an AssetKey from the InputContext) which should be associated\nwith this In. Used for tracking lineage information through Dagster.

  • \n
  • asset_partitions (Optional[Union[Set[str], InputContext -> Set[str]]]) \u2013 (Experimental) A\nset of partitions of the given asset_key (or a function that produces this set of\npartitions from the InputContext) which should be associated with this In.

  • \n
  • input_manager_key (Optional[str]) \u2013 (Experimental) The resource key for the\nInputManager used for loading this input when it is not connected to an\nupstream output.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Out(dagster_type=<class 'dagster._core.definitions.utils.NoValueSentinel'>, description=None, is_required=True, io_manager_key=None, metadata=None, code_version=None)[source]\u00b6
\n

Defines an output from an op\u2019s compute function.

\n

Ops can have multiple outputs, in which case outputs cannot be anonymous.

\n

Many ops have only one output, in which case the user can provide a single output definition\nthat will be given the default name, \u201cresult\u201d.

\n

Outs may be typed using the Dagster type system.

\n
\n
Parameters:
\n
    \n
  • dagster_type (Optional[Union[Type, DagsterType]]) \u2013 The type of this output. Should only be set if the correct type cannot\nbe inferred directly from the type signature of the decorated function.

  • \n
  • description (Optional[str]) \u2013 Human-readable description of the output.

  • \n
  • is_required (bool) \u2013 Whether the presence of this field is required. (default: True)

  • \n
  • io_manager_key (Optional[str]) \u2013 The resource key of the output manager used for this output.\n(default: \u201cio_manager\u201d).

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 A dict of the metadata for the output.\nFor example, users can provide a file path if the data object will be stored in a\nfilesystem, or provide information of a database table when it is going to load the data\ninto the table.

  • \n
  • code_version (Optional[str]) \u2013 (Experimental) Version of the code that generates this output. In\ngeneral, versions should be set only for code that deterministically produces the same\noutput when given the same inputs.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Execution\u00b6

\n
\n
\nclass dagster.RetryPolicy(max_retries=1, delay=None, backoff=None, jitter=None)[source]\u00b6
\n

A declarative policy for when to request retries when an exception occurs during op execution.

\n
\n
Parameters:
\n
    \n
  • max_retries (int) \u2013 The maximum number of retries to attempt. Defaults to 1.

  • \n
  • delay (Optional[Union[int,float]]) \u2013 The time in seconds to wait between the retry being requested and the next attempt\nbeing started. This unit of time can be modulated as a function of attempt number\nwith backoff and randomly with jitter.

  • \n
  • backoff (Optional[Backoff]) \u2013 A modifier for delay as a function of retry attempt number.

  • \n
  • jitter (Optional[Jitter]) \u2013 A randomizing modifier for delay, applied after backoff calculation.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Backoff(value)[source]\u00b6
\n

A modifier for delay as a function of attempt number.

\n

LINEAR: attempt_num * delay\nEXPONENTIAL: ((2 ^ attempt_num) - 1) * delay

\n
\n\n
\n
\nclass dagster.Jitter(value)[source]\u00b6
\n

A randomizing modifier for delay, applied after backoff calculation.

\n

FULL: between 0 and the calculated delay based on backoff: random() * backoff_delay\nPLUS_MINUS: +/- the delay: backoff_delay + ((2 * (random() * delay)) - delay)

\n
\n\n
\n
\n
\n

Events\u00b6

\n

The objects that can be yielded by the body of ops\u2019 compute functions to communicate with the\nDagster framework.

\n

(Note that Failure and RetryRequested are intended to be raised from ops rather than yielded.)

\n
\n

Event types\u00b6

\n
\n
\nclass dagster.Output(value, output_name='result', metadata=None, data_version=None)[source]\u00b6
\n

Event corresponding to one of an op\u2019s outputs.

\n

Op compute functions must explicitly yield events of this type when they have more than\none output, or when they also yield events of other types, or when defining an op using the\nOpDefinition API directly.

\n

Outputs are values produced by ops that will be consumed by downstream ops in a job.\nThey are type-checked at op boundaries when their corresponding Out\nor the downstream In is typed.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value returned by the compute function.

  • \n
  • output_name (Optional[str]) \u2013 Name of the corresponding out. (default:\n\u201cresult\u201d)

  • \n
  • metadata (Optional[Dict[str, Union[str, float, int, MetadataValue]]]) \u2013 Arbitrary metadata about the output. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
  • data_version (Optional[DataVersion]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) A data version to manually set\nfor the asset.

  • \n
\n
\n
\n
\n
\nproperty data_version\u00b6
\n

A data version that was manually set on the Output.

\n
\n
Type:
\n

Optional[DataVersion]

\n
\n
\n
\n\n
\n
\nproperty output_name\u00b6
\n

Name of the corresponding Out.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty value\u00b6
\n

The value returned by the compute function.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AssetMaterialization(asset_key, description=None, metadata=None, partition=None, tags=None)[source]\u00b6
\n

Event indicating that an op has materialized an asset.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that they have produced a materialized value as a\nside effect of computation. Unlike outputs, asset materializations can not be passed to other\nops, and their persistence is controlled by op logic, rather than by the Dagster\nframework.

\n

Op authors should use these events to organize metadata about the side effects of their\ncomputations, enabling tooling like the Assets dashboard in the Dagster UI.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[str, List[str], AssetKey]) \u2013 A key to identify the materialized asset across\njob runs

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the materialized value.

  • \n
  • partition (Optional[str]) \u2013 The name of the partition\nthat was materialized.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 A mapping containing system-populated tags for the\nmaterialization. Users should not pass values into this argument.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the asset. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n
\nstatic file(path, description=None, asset_key=None)[source]\u00b6
\n

Static constructor for standard materializations corresponding to files on disk.

\n
\n
Parameters:
\n
    \n
  • path (str) \u2013 The path to the file.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the materialization.

  • \n
\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.ExpectationResult(success, label=None, description=None, metadata=None)[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 1.7. Please use AssetCheckResult and @asset_check instead.\n \n

\n

Event corresponding to a data quality test.

\n

Op compute functions may yield events of this type whenever they wish to indicate to the\nDagster framework (and the end user) that a data quality test has produced a (positive or\nnegative) result.

\n
\n
Parameters:
\n
    \n
  • success (bool) \u2013 Whether the expectation passed or not.

  • \n
  • label (Optional[str]) \u2013 Short display name for expectation. Defaults to \u201cresult\u201d.

  • \n
  • description (Optional[str]) \u2013 A longer human-readable description of the expectation.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the expectation result. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TypeCheck(success, description=None, metadata=None)[source]\u00b6
\n

Event corresponding to the result (success or failure) of a type check.

\n

Events of this type should be returned by user-defined type checks when they need to encapsulate\nadditional metadata about a type check\u2019s success or failure. (i.e., when using\nas_dagster_type(), @usable_as_dagster_type, or the underlying\nPythonObjectDagsterType() API.)

\n

Op compute functions should generally avoid yielding events of this type to avoid confusion.

\n
\n
Parameters:
\n
    \n
  • success (bool) \u2013 True if the type check succeeded, False otherwise.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the type check.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the type check. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.Failure(description=None, metadata=None, allow_retries=None)[source]\u00b6
\n

Event indicating op failure.

\n

Raise events of this type from within op compute functions or custom type checks in order to\nindicate an unrecoverable failure in user code to the Dagster machinery and return\nstructured metadata about the failure.

\n
\n
Parameters:
\n
    \n
  • description (Optional[str]) \u2013 A human-readable description of the failure.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata about the failure. Keys are displayed string labels, and values are\none of the following: string, float, int, JSON-serializable dict, JSON-serializable\nlist, and one of the data classes returned by a MetadataValue static method.

  • \n
  • allow_retries (Optional[bool]) \u2013 Whether this Failure should respect the retry policy or bypass it and immediately fail.\nDefaults to True, respecting the retry policy and allowing retries.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RetryRequested(max_retries=1, seconds_to_wait=None)[source]\u00b6
\n

An exception to raise from an op to indicate that it should be retried.

\n
\n
Parameters:
\n
    \n
  • max_retries (Optional[int]) \u2013 The max number of retries this step should attempt before failing

  • \n
  • seconds_to_wait (Optional[Union[float,int]]) \u2013 Seconds to wait before restarting the step after putting the step\ninto the up_for_retry state

  • \n
\n
\n
\n

Example

\n
@op\ndef flakes():\n    try:\n        flakey_operation()\n    except Exception as e:\n        raise RetryRequested(max_retries=3) from e\n
\n
\n
\n\n
\n
\n
\n

Event metadata\u00b6

\n

Dagster uses metadata to communicate arbitrary user-specified metadata about structured\nevents.

\n
\n
\nclass dagster.MetadataValue[source]\u00b6
\n

Utility class to wrap metadata values passed into Dagster events so that they can be\ndisplayed in the Dagster UI and other tooling.

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": "hello",\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n            "num_rows": 0,\n        },\n    )\n
\n
\n
\n
\nstatic asset(asset_key)[source]\u00b6
\n

Static constructor for a metadata value referencing a Dagster asset, by key.

\n

For example:

\n
@op\ndef validate_table(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey("my_table"),\n        metadata={\n            "Related asset": MetadataValue.asset(AssetKey('my_other_table')),\n        },\n    )\n
\n
\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 The asset key referencing the asset.

\n
\n
\n
\n\n
\n
\nstatic bool(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping a bool as\nBoolMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "num rows > 1000": MetadataValue.bool(len(df) > 1000),\n        },\n    )\n
\n
\n
\n
Parameters:
\n

value (bool) \u2013 The bool value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic dagster_run(run_id)[source]\u00b6
\n

Static constructor for a metadata value wrapping a reference to a Dagster run.

\n
\n
Parameters:
\n

run_id (str) \u2013 The ID of the run.

\n
\n
\n
\n\n
\n
\nstatic float(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping a float as\nFloatMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "size (bytes)": MetadataValue.float(calculate_bytes(df)),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

value (float) \u2013 The float value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic int(value)[source]\u00b6
\n

Static constructor for a metadata value wrapping an int as\nIntMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "number of rows": MetadataValue.int(len(df)),\n        },\n    )\n
\n
\n
\n
Parameters:
\n

value (int) \u2013 The int value for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic json(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping a json-serializable list or dict\nas JsonMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not missing_things,\n        label="is_present",\n        metadata={\n            "about my dataset": MetadataValue.json({"missing_columns": missing_things})\n        },\n    )\n
\n
\n
\n
Parameters:
\n

data (Union[Sequence[Any], Mapping[str, Any]]) \u2013 The JSON data for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic md(data)[source]\u00b6
\n

Static constructor for a metadata value wrapping markdown data as\nMarkdownMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, md_str):\n    yield AssetMaterialization(\n        asset_key="info",\n        metadata={\n            'Details': MetadataValue.md(md_str)\n        },\n    )\n
\n
\n
\n
Parameters:
\n

data (str) \u2013 The markdown for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic notebook(path)[source]\u00b6
\n

Static constructor for a metadata value wrapping a notebook path as\nNotebookMetadataValue.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "notebook_path": MetadataValue.notebook("path/to/notebook.ipynb"),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

path (str) \u2013 The path to a notebook for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic null()[source]\u00b6
\n

Static constructor for a metadata value representing null. Can be used as the value type\nfor the metadata parameter for supported events.

\n
\n\n
\n
\nstatic path(path)[source]\u00b6
\n

Static constructor for a metadata value wrapping a path as\nPathMetadataValue.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "filepath": MetadataValue.path("path/to/file"),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

path (str) \u2013 The path for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic python_artifact(python_artifact)[source]\u00b6
\n

Static constructor for a metadata value wrapping a python artifact as\nPythonArtifactMetadataValue. Can be used as the value type for the\nmetadata parameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "class": MetadataValue.python_artifact(MyClass),\n            "function": MetadataValue.python_artifact(my_function),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

python_artifact (Callable) \u2013 The python class or function for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic table(records, schema=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Static constructor for a metadata value wrapping arbitrary tabular data as\nTableMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield ExpectationResult(\n        success=not has_errors,\n        label="is_valid",\n        metadata={\n            "errors": MetadataValue.table(\n                records=[\n                    TableRecord(code="invalid-data-type", row=2, col="name"),\n                ],\n                schema=TableSchema(\n                    columns=[\n                        TableColumn(name="code", type="string"),\n                        TableColumn(name="row", type="int"),\n                        TableColumn(name="col", type="string"),\n                    ]\n                )\n            ),\n        },\n    )\n
\n
\n
\n\n
\n
\nstatic table_schema(schema)[source]\u00b6
\n

Static constructor for a metadata value wrapping a table schema as\nTableSchemaMetadataValue. Can be used as the value type\nfor the metadata parameter for supported events.

\n

Example

\n
schema = TableSchema(\n    columns = [\n        TableColumn(name="id", type="int"),\n        TableColumn(name="status", type="bool"),\n    ]\n)\n\nDagsterType(\n    type_check_fn=some_validation_fn,\n    name='MyTable',\n    metadata={\n        'my_table_schema': MetadataValue.table_schema(schema),\n    }\n)\n
\n
\n
\n
Parameters:
\n

schema (TableSchema) \u2013 The table schema for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic text(text)[source]\u00b6
\n

Static constructor for a metadata value wrapping text as\nTextMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key="my_dataset",\n        metadata={\n            "my_text_label": MetadataValue.text("hello")\n        },\n    )\n
\n
\n
\n
Parameters:
\n

text (str) \u2013 The text string for a metadata entry.

\n
\n
\n
\n\n
\n
\nstatic url(url)[source]\u00b6
\n

Static constructor for a metadata value wrapping a URL as\nUrlMetadataValue. Can be used as the value type for the metadata\nparameter for supported events.

\n

Example

\n
@op\ndef emit_metadata(context):\n    yield AssetMaterialization(\n        asset_key="my_dashboard",\n        metadata={\n            "dashboard_url": MetadataValue.url("http://mycoolsite.com/my_dashboard"),\n        }\n    )\n
\n
\n
\n
Parameters:
\n

url (str) \u2013 The URL for a metadata entry.

\n
\n
\n
\n\n
\n
\nabstract property value\u00b6
\n

The wrapped value.

\n
\n\n
\n\n
\n
\nclass dagster.MetadataEntry(label, description=None, entry_data=None, value=None)[source]\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Please use a dict with MetadataValue values instead.\n \n

\n

A structure for describing metadata for Dagster events.

\n
\n

Note

\n

This class is no longer usable in any Dagster API, and will be completely removed in 2.0.

\n
\n

Lists of objects of this type can be passed as arguments to Dagster events and will be displayed\nin the Dagster UI and other tooling.

\n

Should be yielded from within an IO manager to append metadata for a given input/output event.\nFor other event types, passing a dict with MetadataValue values to the metadata argument\nis preferred.

\n
\n
Parameters:
\n
    \n
  • label (str) \u2013 Short display label for this metadata entry.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of this metadata entry.

  • \n
  • value (MetadataValue) \u2013 Typed metadata entry data. The different types allow\nfor customized display in tools like the Dagster UI.

  • \n
\n
\n
\n
\n\n
\n
\n

Metadata types\u00b6

\n

All metadata types inherit from MetadataValue. The following types are defined:

\n
\n
\nclass dagster.DagsterAssetMetadataValue(asset_key)[source]\u00b6
\n

Representation of a dagster asset.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 The dagster asset key

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped AssetKey.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.DagsterRunMetadataValue(run_id)[source]\u00b6
\n

Representation of a dagster run.

\n
\n
Parameters:
\n

run_id (str) \u2013 The run id

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped run id.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.FloatMetadataValue(value)[source]\u00b6
\n

Container class for float metadata entry data.

\n
\n
Parameters:
\n

value (Optional[float]) \u2013 The float value.

\n
\n
\n
\n\n
\n
\nclass dagster.IntMetadataValue(value)[source]\u00b6
\n

Container class for int metadata entry data.

\n
\n
Parameters:
\n

value (Optional[int]) \u2013 The int value.

\n
\n
\n
\n\n
\n
\nclass dagster.JsonMetadataValue(data)[source]\u00b6
\n

Container class for JSON metadata entry data.

\n
\n
Parameters:
\n

data (Union[Sequence[Any], Dict[str, Any]]) \u2013 The JSON data.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped JSON data.

\n
\n
Type:
\n

Optional[Union[Sequence[Any], Dict[str, Any]]]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MarkdownMetadataValue(md_str)[source]\u00b6
\n

Container class for markdown metadata entry data.

\n
\n
Parameters:
\n

md_str (Optional[str]) \u2013 The markdown as a string.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped markdown as a string.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.PathMetadataValue(path)[source]\u00b6
\n

Container class for path metadata entry data.

\n
\n
Parameters:
\n

path (Optional[str]) \u2013 The path as a string or conforming to os.PathLike.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped path.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.NotebookMetadataValue(path)[source]\u00b6
\n

Container class for notebook metadata entry data.

\n
\n
Parameters:
\n

path (Optional[str]) \u2013 The path to the notebook as a string or conforming to os.PathLike.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped path to the notebook as a string.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.PythonArtifactMetadataValue(module, name)[source]\u00b6
\n

Container class for python artifact metadata entry data.

\n
\n
Parameters:
\n
    \n
  • module (str) \u2013 The module where the python artifact can be found

  • \n
  • name (str) \u2013 The name of the python artifact

  • \n
\n
\n
\n
\n
\nproperty value\u00b6
\n

Identity function.

\n
\n
Type:
\n

PythonArtifactMetadataValue

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TableMetadataValue(records, schema)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Container class for table metadata entry data.

\n
\n
Parameters:
\n
    \n
  • records (List[TableRecord]) \u2013 The data as a list of records (i.e. rows).

  • \n
  • schema (Optional[TableSchema]) \u2013 A schema for the table.

  • \n
\n
\n
\n
\n
\nstatic infer_column_type(value)[source]\u00b6
\n

str: Infer the TableSchema column type that will be used for a value.

\n
\n\n
\n
\nproperty value\u00b6
\n

Identity function.

\n
\n
Type:
\n

TableMetadataValue

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TableSchemaMetadataValue(schema)[source]\u00b6
\n

Representation of a schema for arbitrary tabular data.

\n
\n
Parameters:
\n

schema (TableSchema) \u2013 The dictionary containing the schema representation.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped TableSchema.

\n
\n
Type:
\n

TableSchema

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TextMetadataValue(text)[source]\u00b6
\n

Container class for text metadata entry data.

\n
\n
Parameters:
\n

text (Optional[str]) \u2013 The text data.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped text data.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.UrlMetadataValue(url)[source]\u00b6
\n

Container class for URL metadata entry data.

\n
\n
Parameters:
\n

url (Optional[str]) \u2013 The URL as a string.

\n
\n
\n
\n
\nproperty value\u00b6
\n

The wrapped URL.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\n

Tables\u00b6

\n

These APIs provide the ability to express table schemas (TableSchema) and table rows/records (TableRecord) in Dagster. Currently the only use case for TableSchemas and TableRecords is to wrap them in their corresponding metadata classes TableMetadataValue and TableSchemaMetadataValue for attachment to events or Dagster types.

\n
\n
\nclass dagster.TableRecord(data)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Represents one record in a table. Field keys are arbitrary strings; field values must be\nstrings, integers, floats, or bools.

\n
\n\n
\n
\nclass dagster.TableSchema(columns, constraints=None)[source]\u00b6
\n

Representation of a schema for tabular data.

\n

Schema is composed of two parts:

\n
    \n
  • A required list of columns (TableColumn). Each column specifies a\nname, type, set of constraints, and (optional) description. type\ndefaults to string if unspecified. Column constraints\n(TableColumnConstraints) consist of boolean properties unique and\nnullable, as well as a list of strings other containing string\ndescriptions of all additional constraints (e.g. \u201c<= 5\u201d).

  • \n
  • An optional list of table-level constraints (TableConstraints). A\ntable-level constraint cannot be expressed in terms of a single column,\ne.g. col a > col b. Presently, all table-level constraints must be\nexpressed as strings under the other attribute of a TableConstraints\nobject.

  • \n
\n
# example schema\nTableSchema(\n    constraints = TableConstraints(\n        other = [\n            "foo > bar",\n        ],\n    ),\n    columns = [\n        TableColumn(\n            name = "foo",\n            type = "string",\n            description = "Foo description",\n            constraints = TableColumnConstraints(\n                required = True,\n                other = [\n                    "starts with the letter 'a'",\n                ],\n            ),\n        ),\n        TableColumn(\n            name = "bar",\n            type = "string",\n        ),\n        TableColumn(\n            name = "baz",\n            type = "custom_type",\n            constraints = TableColumnConstraints(\n                unique = True,\n            )\n        ),\n    ],\n)\n
\n
\n
\n
Parameters:
\n
    \n
  • columns (List[TableColumn]) \u2013 The columns of the table.

  • \n
  • constraints (Optional[TableConstraints]) \u2013 The constraints of the table.

  • \n
\n
\n
\n
\n
\nstatic from_name_type_dict(name_type_dict)[source]\u00b6
\n

Constructs a TableSchema from a dictionary whose keys are column names and values are the\nnames of data types of those columns.

\n
\n\n
\n\n
\n
\nclass dagster.TableConstraints(other)[source]\u00b6
\n

Descriptor for \u201ctable-level\u201d constraints. Presently only one property,\nother, is supported. This contains strings describing arbitrary\ntable-level constraints. A table-level constraint is a constraint defined\nin terms of multiple columns (e.g. col_A > col_B) or in terms of rows.

\n
\n
Parameters:
\n

other (List[str]) \u2013 Descriptions of arbitrary table-level constraints.

\n
\n
\n
\n\n
\n
\nclass dagster.TableColumn(name, type='string', description=None, constraints=None)[source]\u00b6
\n

Descriptor for a table column. The only property that must be specified\nby the user is name. If no type is specified, string is assumed. If\nno constraints are specified, the column is assumed to be nullable\n(i.e. nullable = True) and to have no other constraints beyond the data type.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the column.

  • \n
  • type (Optional[str]) \u2013 The type of the column. Can be an arbitrary\nstring. Defaults to \u201cstring\u201d.

  • \n
  • description (Optional[str]) \u2013 Description of this column. Defaults to None.

  • \n
  • constraints (Optional[TableColumnConstraints]) \u2013 Column-level constraints.\nIf unspecified, column is nullable with no constraints.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.TableColumnConstraints(nullable=True, unique=False, other=None)[source]\u00b6
\n

Descriptor for a table column\u2019s constraints. Nullability and uniqueness are specified with\nboolean properties. All other constraints are described using arbitrary strings under the\nother property.

\n
\n
Parameters:
\n
    \n
  • nullable (Optional[bool]) \u2013 If true, this column can hold null values.

  • \n
  • unique (Optional[bool]) \u2013 If true, all values in this column must be unique.

  • \n
  • other (List[str]) \u2013 Descriptions of arbitrary column-level constraints\nnot expressible by the predefined properties.

  • \n
\n
\n
\n
\n\n
\n
\n
\n

Asset key\u00b6

\n

Dagster uses AssetKey to build an index on Materialization events.\nAssets materialized with an AssetKey are highlighted in the Dagster UI on the Assets\ndashboard.

\n
\n
\nclass dagster.AssetKey(path)[source]\u00b6
\n

Object representing the structure of an asset key. Takes in a sanitized string, list of\nstrings, or tuple of strings.

\n

Example usage:

\n
from dagster import op\n\n@op\ndef emit_metadata(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey('flat_asset_key'),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(['parent', 'child', 'grandchild']),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n\n@op\ndef structured_asset_key_2(context, df):\n    yield AssetMaterialization(\n        asset_key=AssetKey(('parent', 'child', 'grandchild')),\n        metadata={"text_metadata": "Text-based metadata for this event"},\n    )\n
\n
\n
\n
Parameters:
\n

path (Sequence[str]) \u2013 String, list of strings, or tuple of strings. A list of strings\nrepresents the hierarchical structure of the asset_key.

\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/ops", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../io-managers/", "title": "IO Managers"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../loggers/", "title": "Loggers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/io-managers", "IO Managers", "N", "next"], ["sections/api/apidocs/loggers", "Loggers", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/ops.rst.txt", "title": "Ops", "toc": "\n"}, "partitions": {"alabaster_version": "0.7.13", "body": "
\n

Partitions Definitions\u00b6

\n
\n
\nclass dagster.PartitionsDefinition[source]\u00b6
\n

Defines a set of partitions, which can be attached to a software-defined asset or job.

\n

Abstract class with implementations for different kinds of partitions.

\n
\n
\nabstract get_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of strings representing the partition keys of the PartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partitions definitions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.HourlyPartitionsDefinition(start_date, end_date=None, minute_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

A set of hourly partitions.

\n

The first partition in the set will start on the start_date at midnight. The last partition\nin the set will end before the current time, unless the end_offset argument is set to a\npositive number. If minute_offset is provided, the start and end times of each partition\nwill be minute_offset past the hour.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
HourlyPartitionsDefinition(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\nHourlyPartitionsDefinition(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.DailyPartitionsDefinition(start_date, end_date=None, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

A set of daily partitions.

\n

The first partition in the set will start at the start_date at midnight. The last partition\nin the set will end before the current time, unless the end_offset argument is set to a\npositive number. If minute_offset and/or hour_offset are used, the start and end times of\neach partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
DailyPartitionsDefinition(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\nDailyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.WeeklyPartitionsDefinition(start_date, end_date=None, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

Defines a set of weekly partitions.

\n

The first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
WeeklyPartitionsDefinition(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\nWeeklyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.MonthlyPartitionsDefinition(start_date, end_date=None, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0)[source]\u00b6
\n

A set of monthly partitions.

\n

The first partition in the set will start at the soonest first of the month after start_date\nat midnight. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and\nend date of each partition will be day_offset. If minute_offset and/or hour_offset are used,\nthe start and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • end_date (Union[datetime.datetime, str, None]) \u2013 The last date (exclusive) in the set of partitions.\nDefault is None. Can provide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
MonthlyPartitionsDefinition(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\nMonthlyPartitionsDefinition(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\nclass dagster.TimeWindowPartitionsDefinition(start, fmt, end=None, schedule_type=None, timezone=None, end_offset=0, minute_offset=None, hour_offset=None, day_offset=None, cron_schedule=None)[source]\u00b6
\n

A set of partitions where each partition corresponds to a time window.

\n

The provided cron_schedule determines the bounds of the time windows. E.g. a cron_schedule of\n\u201c0 0 \\* \\* \\*\u201d will result in daily partitions that start at midnight and end at midnight of the\nfollowing day.

\n

The string partition_key associated with each partition corresponds to the start of the\npartition\u2019s time window.

\n

The first partition in the set will start at the first cron_schedule tick that is equal to\nor after the given start datetime. The last partition in the set will end before the current\ntime, unless the end_offset argument is set to a positive number.

\n
\n
Parameters:
\n
    \n
  • cron_schedule (str) \u2013 Determines the bounds of the time windows.

  • \n
  • start (datetime) \u2013 The first partition in the set will start at the first cron_schedule\ntick that is equal to or after this value.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each time should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end (datetime) \u2013 The last partition (exclusive) in the set.

  • \n
  • fmt (str) \u2013 The date format to use for partition_keys.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
\n
\n
\n
\n
\nproperty day_offset\u00b6
\n

For a weekly or monthly partitions definition, returns the day to \u201csplit\u201d partitions\nby. Each partition will start on this day, and end before this day in the following\nweek/month. Returns 0 if the day_offset parameter is unset in the\nWeeklyPartitionsDefinition, MonthlyPartitionsDefinition, or the provided cron schedule.

\n

For weekly partitions, returns a value between 0 (representing Sunday) and 6 (representing\nSaturday). Providing a value of 1 means that a partition will exist weekly from Monday to\nthe following Sunday.

\n

For monthly partitions, returns a value between 0 (the first day of the month) and 31 (the\nlast possible day of the month).

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nget_cron_schedule(minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None)[source]\u00b6
\n

The schedule executes at the cadence specified by the partitioning, but may overwrite\nthe minute/hour/day offset of the partitioning.

\n

This is useful e.g. if you have partitions that span midnight to midnight but you want to\nschedule a job that runs at 2 am.

\n
\n\n
\n
\nproperty hour_offset\u00b6
\n

Number of hours past 00:00 to \u201csplit\u201d partitions. Defaults to 0.

\n

For example, returns 1 if each partition starts at 01:00.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nproperty minute_offset\u00b6
\n

Number of minutes past the hour to \u201csplit\u201d partitions. Defaults to 0.

\n

For example, returns 15 if each partition starts at 15 minutes past the hour.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nproperty schedule_type\u00b6
\n

An enum representing the partition cadence (hourly, daily,\nweekly, or monthly).

\n
\n
Type:
\n

Optional[ScheduleType]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.TimeWindow(start, end)[source]\u00b6
\n

An interval that is closed at the start and open at the end.

\n
\n
\nstart\u00b6
\n

A pendulum datetime that marks the start of the window.

\n
\n
Type:
\n

datetime

\n
\n
\n
\n\n
\n
\nend\u00b6
\n

A pendulum datetime that marks the end of the window.

\n
\n
Type:
\n

datetime

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.StaticPartitionsDefinition(partition_keys)[source]\u00b6
\n

A statically-defined set of partitions.

\n

Example

\n
from dagster import StaticPartitionsDefinition, asset\n\noceans_partitions_def = StaticPartitionsDefinition(\n    ["arctic", "atlantic", "indian", "pacific", "southern"]\n)\n\n@asset(partitions_def=oceans_partitions_def)\ndef ml_model_for_each_ocean():\n    ...\n
\n
\n
\n
\nget_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of strings representing the partition keys of the PartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partitions definitions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Only applicable to\nDynamicPartitionsDefinitions.

  • \n
\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MultiPartitionsDefinition(partitions_defs)[source]\u00b6
\n

Takes the cross-product of partitions from two partitions definitions.

\n

For example, with a static partitions definition where the partitions are [\u201ca\u201d, \u201cb\u201d, \u201cc\u201d]\nand a daily partitions definition, this partitions definition will have the following\npartitions:

\n

2020-01-01|a\n2020-01-01|b\n2020-01-01|c\n2020-01-02|a\n2020-01-02|b\n\u2026

\n
\n
Parameters:
\n

partitions_defs (Mapping[str, PartitionsDefinition]) \u2013 A mapping of dimension name to partitions definition. The total set of partitions will\nbe the cross-product of the partitions from each PartitionsDefinition.

\n
\n
\n
\n
\npartitions_defs\u00b6
\n

A sequence of PartitionDimensionDefinition objects, each of which contains a dimension\nname and a PartitionsDefinition. The total set of partitions will be the cross-product\nof the partitions from each PartitionsDefinition. This sequence is ordered by\ndimension name, to ensure consistent ordering of the partitions.

\n
\n
Type:
\n

Sequence[PartitionDimensionDefinition]

\n
\n
\n
\n\n
\n
\nget_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of MultiPartitionKeys representing the partition keys of the\nPartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partition dimensions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when a\ndimension is a DynamicPartitionsDefinition with a name defined. Users can pass the\nDagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

Sequence[MultiPartitionKey]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.MultiPartitionKey(keys_by_dimension)[source]\u00b6
\n

A multi-dimensional partition key stores the partition key for each dimension.\nSubclasses the string class to keep partition key type as a string.

\n

Contains additional methods to access the partition key for each dimension.\nCreates a string representation of the partition key for each dimension, separated by a pipe (|).\nOrders the dimensions by name, to ensure consistent string representation.

\n
\n\n
\n
\nclass dagster.DynamicPartitionsDefinition(partition_fn=None, name=None)[source]\u00b6
\n

A partitions definition whose partition keys can be dynamically added and removed.

\n

This is useful for cases where the set of partitions is not known at definition time,\nbut is instead determined at runtime.

\n

Partitions can be added and removed using instance.add_dynamic_partitions and\ninstance.delete_dynamic_partition methods.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the partitions definition.

  • \n
  • partition_fn (Optional[Callable[[Optional[datetime]], Union[Sequence[Partition], Sequence[str]]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Provide partition definition name instead.) A function that returns the current set of partitions. This argument is deprecated and\nwill be removed in 2.0.0.

  • \n
\n
\n
\n

Examples

\n
fruits = DynamicPartitionsDefinition(name="fruits")\n\n@sensor(job=my_job)\ndef my_sensor(context):\n    return SensorResult(\n        run_requests=[RunRequest(partition_key="apple")],\n        dynamic_partitions_requests=[fruits.build_add_request(["apple"])]\n    )\n
\n
\n
\n
\nget_partition_keys(current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns a list of strings representing the partition keys of the\nPartitionsDefinition.

\n
\n
Parameters:
\n
    \n
  • current_time (Optional[datetime]) \u2013 A datetime object representing the current time, only\napplicable to time-based partitions definitions.

  • \n
  • dynamic_partitions_store (Optional[DynamicPartitionsStore]) \u2013 The DynamicPartitionsStore\nobject that is responsible for fetching dynamic partitions. Required when the\npartitions definition is a DynamicPartitionsDefinition with a name defined. Users\ncan pass the DagsterInstance fetched via context.instance to this argument.

  • \n
\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.PartitionKeyRange(start, end)[source]\u00b6
\n

Defines a range of partitions.

\n
\n
\nstart\u00b6
\n

The starting partition key in the range (inclusive).

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nend\u00b6
\n

The ending partition key in the range (inclusive).

\n
\n
Type:
\n

str

\n
\n
\n
\n\n

Examples

\n
partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])\npartition_key_range = PartitionKeyRange(start="a", end="c") # Represents ["a", "b", "c"]\n
\n
\n
\n\n
\n
\n

Partitioned Schedules\u00b6

\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=DefaultScheduleStatus.STOPPED, tags=None)[source]
\n

Creates a schedule from a time window-partitioned job or a job that targets\ntime window-partitioned assets. The job can also be multipartitioned, as long as one\nof the partitions dimensions is time-partitioned.

\n

The schedule executes at the cadence specified by the time partitioning of the job or assets.

\n

Examples

\n
######################################\n# Job that targets partitioned assets\n######################################\n\nfrom dagster import (\n    DailyPartitionsDefinition,\n    asset,\n    build_schedule_from_partitioned_job,\n    define_asset_job,\n)\n\n@asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef asset1():\n    ...\n\nasset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n# The created schedule will fire daily\nasset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\ndefs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n################\n# Non-asset job\n################\n\nfrom dagster import DailyPartitionsDefinition, build_schedule_from_partitioned_job, job\n\n\n@job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef do_stuff_partitioned():\n    ...\n\n# The created schedule will fire daily\ndo_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n    do_stuff_partitioned,\n)\n\ndefs = Definitions(schedules=[do_stuff_partitioned_schedule])\n
\n
\n
\n\n
\n
\n

Partition Mapping\u00b6

\n
\n
\nclass dagster.PartitionMapping[source]\u00b6
\n

Defines a correspondence between the partitions in an asset and the partitions in an asset\nthat it depends on.

\n

Overriding PartitionMapping outside of Dagster is not supported. The abstract methods of this\nclass may change at any time.

\n
\n
\nabstract get_downstream_partitions_for_partitions(upstream_partitions_subset, downstream_partitions_def, current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns the subset of partition keys in the downstream asset that use the data in the given\npartition key subset of the upstream asset.

\n
\n
Parameters:
\n
    \n
  • upstream_partitions_subset (Union[PartitionKeyRange, PartitionsSubset]) \u2013 The\nsubset of partition keys in the upstream asset.

  • \n
  • downstream_partitions_def (PartitionsDefinition) \u2013 The partitions definition for the\ndownstream asset.

  • \n
\n
\n
\n
\n\n
\n
\nabstract get_upstream_mapped_partitions_result_for_partitions(downstream_partitions_subset, upstream_partitions_def, current_time=None, dynamic_partitions_store=None)[source]\u00b6
\n

Returns an UpstreamPartitionsResult object containing the partition keys the downstream\npartitions subset was mapped to in the upstream partitions definition.

\n

Valid upstream partitions will be included in UpstreamPartitionsResult.partitions_subset.\nInvalid upstream partitions will be included in UpstreamPartitionsResult.required_but_nonexistent_partition_keys.

\n

For example, if an upstream asset is time-partitioned and starts in June 2023, and the\ndownstream asset is time-partitioned and starts in May 2023, this function would return an\nUpstreamPartitionsResult(PartitionsSubset(\u201c2023-06-01\u201d), required_but_nonexistent_partition_keys=[\u201c2023-05-01\u201d])\nwhen downstream_partitions_subset contains 2023-05-01 and 2023-06-01.

\n
\n\n
\n\n
\n
\nclass dagster.TimeWindowPartitionMapping(start_offset=0, end_offset=0, allow_nonexistent_upstream_partitions=False)[source]\u00b6
\n

The default mapping between two TimeWindowPartitionsDefinitions.

\n

A partition in the downstream partitions definition is mapped to all partitions in the upstream\nasset whose time windows overlap it.

\n

This means that, if the upstream and downstream partitions definitions share the same time\nperiod, then this mapping is essentially the identity partition mapping - plus conversion of\ndatetime formats.

\n

If the upstream time period is coarser than the downstream time period, then each partition in\nthe downstream asset will map to a single (larger) upstream partition. E.g. if the downstream is\nhourly and the upstream is daily, then each hourly partition in the downstream will map to the\ndaily partition in the upstream that contains that hour.

\n

If the upstream time period is finer than the downstream time period, then each partition in the\ndownstream asset will map to multiple upstream partitions. E.g. if the downstream is daily and\nthe upstream is hourly, then each daily partition in the downstream asset will map to the 24\nhourly partitions in the upstream that occur on that day.

\n
\n
\nstart_offset\u00b6
\n

If not 0, then the starts of the upstream windows are shifted by this\noffset relative to the starts of the downstream windows. For example, if start_offset=-1\nand end_offset=0, then the downstream partition \u201c2022-07-04\u201d would map to the upstream\npartitions \u201c2022-07-03\u201d and \u201c2022-07-04\u201d. Only permitted to be non-zero when the\nupstream and downstream PartitionsDefinitions are the same. Defaults to 0.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nend_offset\u00b6
\n

If not 0, then the ends of the upstream windows are shifted by this\noffset relative to the ends of the downstream windows. For example, if start_offset=0\nand end_offset=1, then the downstream partition \u201c2022-07-04\u201d would map to the upstream\npartitions \u201c2022-07-04\u201d and \u201c2022-07-05\u201d. Only permitted to be non-zero when the\nupstream and downstream PartitionsDefinitions are the same. Defaults to 0.

\n
\n
Type:
\n

int

\n
\n
\n
\n\n
\n
\nallow_nonexistent_upstream_partitions\u00b6
\n

Defaults to false. If true, does not\nraise an error when mapped upstream partitions fall outside the start-end time window of the\npartitions def. For example, if the upstream partitions def starts on \u201c2023-01-01\u201d but\nthe downstream starts on \u201c2022-01-01\u201d, setting this bool to true would return no\npartition keys when get_upstream_partitions_for_partitions is called with \u201c2022-06-01\u201d.\nWhen set to false, would raise an error.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n

Examples

\n
from dagster import DailyPartitionsDefinition, TimeWindowPartitionMapping, AssetIn, asset\n\npartitions_def = DailyPartitionsDefinition(start_date="2020-01-01")\n\n@asset(partitions_def=partitions_def)\ndef asset1():\n    ...\n\n@asset(\n    partitions_def=partitions_def,\n    ins={\n        "asset1": AssetIn(\n            partition_mapping=TimeWindowPartitionMapping(start_offset=-1)\n        )\n    }\n)\ndef asset2(asset1):\n    ...\n
\n
\n
\n\n
\n
\nclass dagster.IdentityPartitionMapping[source]\u00b6
\n

Expects that the upstream and downstream assets are partitioned in the same way, and maps\npartitions in the downstream asset to the same partition in the upstream asset.

\n
\n\n
\n
\nclass dagster.AllPartitionMapping[source]\u00b6
\n

Maps every partition in the downstream asset to every partition in the upstream asset.

\n

Commonly used in the case when the downstream asset is not partitioned, in which the entire\ndownstream asset depends on all partitions of the upstream asset.

\n
\n\n
\n
\nclass dagster.LastPartitionMapping[source]\u00b6
\n

Maps all dependencies to the last partition in the upstream asset.

\n

Commonly used in the case when the downstream asset is not partitioned, in which the entire\ndownstream asset depends on the last partition of the upstream asset.

\n
\n\n
\n
\nclass dagster.StaticPartitionMapping(downstream_partition_keys_by_upstream_partition_key)[source]\u00b6
\n

Define an explicit correspondence between two StaticPartitionsDefinitions.

\n
\n
Parameters:
\n

downstream_partition_keys_by_upstream_partition_key (Dict[str, str | Collection[str]]) \u2013 The single or multi-valued correspondence from upstream keys to downstream keys.

\n
\n
\n
\n\n
\n
\nclass dagster.SpecificPartitionsPartitionMapping(partition_keys)[source]\u00b6
\n

Maps to a specific subset of partitions in the upstream asset.

\n

Example

\n
from dagster import AssetIn, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset\n\n@asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c"]))\ndef upstream():\n    ...\n\n@asset(\n    ins={\n        "upstream": AssetIn(partition_mapping=SpecificPartitionsPartitionMapping(["a"]))\n    }\n)\ndef a_downstream(upstream):\n    ...\n
\n
\n
\n\n
\n
\nclass dagster.MultiToSingleDimensionPartitionMapping(partition_dimension_name=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Defines a correspondence between a single-dimensional partitions definition\nand a MultiPartitionsDefinition. The single-dimensional partitions definition must be\na dimension of the MultiPartitionsDefinition.

\n

This class handles the case where the upstream asset is multipartitioned and the\ndownstream asset is single dimensional, and vice versa.

\n

For a partition key X, this partition mapping assumes that any multi-partition key with\nX in the selected dimension is a dependency.

\n
\n
Parameters:
\n

partition_dimension_name (Optional[str]) \u2013 The name of the partition dimension in the\nMultiPartitionsDefinition that matches the single-dimension partitions definition.

\n
\n
\n
\n\n
\n
\nclass dagster.MultiPartitionMapping(downstream_mappings_by_upstream_dimension)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Defines a correspondence between two MultiPartitionsDefinitions.

\n

Accepts a mapping of upstream dimension name to downstream DimensionPartitionMapping, representing\nthe explicit correspondence between the upstream and downstream MultiPartitions dimensions\nand the partition mapping used to calculate the downstream partitions.

\n

Examples

\n
weekly_abc = MultiPartitionsDefinition(\n    {\n        "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n        "weekly": WeeklyPartitionsDefinition("2023-01-01"),\n    }\n)\ndaily_123 = MultiPartitionsDefinition(\n    {\n        "123": StaticPartitionsDefinition(["1", "2", "3"]),\n        "daily": DailyPartitionsDefinition("2023-01-01"),\n    }\n)\n\nMultiPartitionMapping(\n    {\n        "abc": DimensionPartitionMapping(\n            dimension_name="123",\n            partition_mapping=StaticPartitionMapping({"a": "1", "b": "2", "c": "3"}),\n        ),\n        "weekly": DimensionPartitionMapping(\n            dimension_name="daily",\n            partition_mapping=TimeWindowPartitionMapping(),\n        )\n    }\n)\n
\n
\n

For upstream or downstream dimensions not explicitly defined in the mapping, Dagster will\nassume an AllPartitionMapping, meaning that all upstream partitions in those dimensions\nwill be mapped to all downstream partitions in those dimensions.

\n

Examples

\n
weekly_abc = MultiPartitionsDefinition(\n    {\n        "abc": StaticPartitionsDefinition(["a", "b", "c"]),\n        "daily": DailyPartitionsDefinition("2023-01-01"),\n    }\n)\ndaily_123 = MultiPartitionsDefinition(\n    {\n        "123": StaticPartitionsDefinition(["1", "2", "3"]),\n        "daily": DailyPartitionsDefinition("2023-01-01"),\n    }\n)\n\nMultiPartitionMapping(\n    {\n        "daily": DimensionPartitionMapping(\n            dimension_name="daily",\n            partition_mapping=IdentityPartitionMapping(),\n        )\n    }\n)\n\n# Will map `daily_123` partition key {"123": "1", "daily": "2023-01-01"} to the upstream:\n# {"abc": "a", "daily": "2023-01-01"}\n# {"abc": "b", "daily": "2023-01-01"}\n# {"abc": "c", "daily": "2023-01-01"}\n
\n
\n
\n
Parameters:
\n

downstream_mappings_by_upstream_dimension (Mapping[str, DimensionPartitionMapping]) \u2013 A\nmapping that defines an explicit correspondence between one dimension of the upstream\nMultiPartitionsDefinition and one dimension of the downstream MultiPartitionsDefinition.\nMaps a string representing upstream dimension name to downstream DimensionPartitionMapping,\ncontaining the downstream dimension name and partition mapping.

\n
\n
\n
\n\n
\n
\n

Backfill Policy (Experimental)\u00b6

\n
\n
\nclass dagster.BackfillPolicy(max_partitions_per_run=1)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

A BackfillPolicy specifies how Dagster should attempt to backfill a partitioned asset.

\n

There are two main kinds of backfill policies: single-run and multi-run.

\n

An asset with a single-run backfill policy will take a single run to backfill all of its\npartitions at once.

\n

An asset with a multi-run backfill policy will take multiple runs to backfill all of its\npartitions. Each run will backfill a subset of the partitions. The number of partitions to\nbackfill in each run is controlled by the max_partitions_per_run parameter.

\n

For example:

\n
    \n
  • If an asset has 100 partitions, and the max_partitions_per_run is set to 10, then it will\nbe backfilled in 10 runs; each run will backfill 10 partitions.

  • \n
  • If an asset has 100 partitions, and the max_partitions_per_run is set to 11, then it will\nbe backfilled in 10 runs; the first 9 runs will backfill 11 partitions each, and the last run\nwill backfill the remaining 1 partition.

  • \n
\n

Warning:

\n

Constructing a BackfillPolicy directly is not recommended as the API is subject to change.\nBackfillPolicy.single_run() and BackfillPolicy.multi_run(max_partitions_per_run=x) are the\nrecommended APIs.

\n
\n
\nstatic multi_run(max_partitions_per_run=1)[source]\u00b6
\n

Creates a BackfillPolicy that executes the entire backfill in multiple runs.\nEach run will backfill [max_partitions_per_run] number of partitions.

\n
\n
Parameters:
\n

max_partitions_per_run (Optional[int]) \u2013 The maximum number of partitions in each run of\nthe multiple runs. Defaults to 1.

\n
\n
\n
\n\n
\n
\nstatic single_run()[source]\u00b6
\n

Creates a BackfillPolicy that executes the entire backfill in a single run.

\n
\n\n
\n\n
\n
\n

Partitioned Config\u00b6

\n
\n
\nclass dagster.PartitionedConfig(partitions_def, run_config_for_partition_fn=None, decorated_fn=None, tags_for_partition_fn=None, run_config_for_partition_key_fn=None, tags_for_partition_key_fn=None)[source]\u00b6
\n

Defines a way of configuring a job where the job can be run on one of a discrete set of\npartitions, and each partition corresponds to run configuration for the job.

\n

Setting PartitionedConfig as the config for a job allows you to launch backfills for that job\nand view the run history across partitions.

\n
\n
\nget_partition_keys(current_time=None)[source]\u00b6
\n

Returns a list of partition keys, representing the full set of partitions that\nconfig can be applied to.

\n
\n
Parameters:
\n

current_time (Optional[datetime]) \u2013 A datetime object representing the current time. Only\napplicable to time-based partitions definitions.

\n
\n
Returns:
\n

Sequence[str]

\n
\n
\n
\n\n
\n
\nproperty partitions_def\u00b6
\n

The partitions definition associated with this PartitionedConfig.

\n
\n
Type:
\n

T_PartitionsDefinition

\n
\n
\n
\n\n
\n
\nproperty run_config_for_partition_fn\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use run_config_for_partition_key_fn instead.\n \n

\n

A function that accepts a partition\nand returns a dictionary representing the config to attach to runs for that partition.\nDeprecated as of 1.3.3.

\n
\n
Type:
\n

Optional[Callable[[Partition], Mapping[str, Any]]]

\n
\n
\n
\n\n
\n
\nproperty run_config_for_partition_key_fn\u00b6
\n

A function that accepts a partition key\nand returns a dictionary representing the config to attach to runs for that partition.

\n
\n
Type:
\n

Optional[Callable[[str], Mapping[str, Any]]]

\n
\n
\n
\n\n
\n
\nproperty tags_for_partition_fn\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Use tags_for_partition_key_fn instead.\n \n

\n

A function that\naccepts a partition and returns a dictionary of tags to attach to runs for\nthat partition. Deprecated as of 1.3.3.

\n
\n
Type:
\n

Optional[Callable[[Partition], Mapping[str, str]]]

\n
\n
\n
\n\n
\n
\nproperty tags_for_partition_key_fn\u00b6
\n

A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for\nthat partition.

\n
\n
Type:
\n

Optional[Callable[[str], Mapping[str, str]]]

\n
\n
\n
\n\n
\n\n
\n
\ndagster.static_partitioned_config(partition_keys, tags_for_partition_fn=None, tags_for_partition_key_fn=None)[source]\u00b6
\n

Creates a static partitioned config for a job.

\n

The provided partition_keys is a static list of strings identifying the set of partitions. The\nlist of partitions is static, so while the run config returned by the decorated function may\nchange over time, the list of valid partition keys does not.

\n

This has performance advantages over dynamic_partitioned_config in terms of loading different\npartition views in the Dagster UI.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters:
\n
    \n
  • partition_keys (Sequence[str]) \u2013 A list of valid partition keys, which serve as the range of\nvalues that can be provided to the decorated run config function.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use tags_for_partition_key_fn instead.) A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for that\npartition.

  • \n
  • tags_for_partition_key_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for that\npartition.

  • \n
\n
\n
Returns:
\n

PartitionedConfig

\n
\n
\n
\n\n
\n
\ndagster.dynamic_partitioned_config(partition_fn, tags_for_partition_fn=None, tags_for_partition_key_fn=None)[source]\u00b6
\n

Creates a dynamic partitioned config for a job.

\n

The provided partition_fn returns a list of strings identifying the set of partitions, given\nan optional datetime argument (representing the current time). The list of partitions returned\nmay change over time.

\n

The decorated function takes in a partition key and returns a valid run config for a particular\ntarget job.

\n
\n
Parameters:
\n
    \n
  • partition_fn (Callable[[datetime.datetime], Sequence[str]]) \u2013 A function that generates a\nlist of valid partition keys, which serve as the range of values that can be provided\nto the decorated run config function.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use tags_for_partition_key_fn instead.) A function that\naccepts a partition key and returns a dictionary of tags to attach to runs for that\npartition.

  • \n
\n
\n
Returns:
\n

PartitionedConfig

\n
\n
\n
\n\n
\n
\ndagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\ndagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\ndagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will\nbe Sunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\ndagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]\u00b6
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at midnight on the soonest first of the month after\nstart_date. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and end\ndate of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\nstart and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n", "current_page_name": "sections/api/apidocs/partitions", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../definitions/", "title": "Definitions"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../io-managers/", "title": "IO Managers"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/definitions", "Definitions", "N", "next"], ["sections/api/apidocs/io-managers", "IO Managers", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/partitions.rst.txt", "title": "Partitions Definitions", "toc": "\n"}, "repositories": {"alabaster_version": "0.7.13", "body": "
\n

Repositories\u00b6

\n
\n
\ndagster.repository RepositoryDefinition[source]\u00b6
\n

Create a repository from the decorated function.

\n

The decorated function should take no arguments and its return value should be one of:

\n

1. List[Union[JobDefinition, ScheduleDefinition, SensorDefinition]].\nUse this form when you have no need to lazy load jobs or other definitions. This is the\ntypical use case.

\n
    \n
  1. A dict of the form:

  2. \n
\n
{\n    'jobs': Dict[str, Callable[[], JobDefinition]],\n    'schedules': Dict[str, Callable[[], ScheduleDefinition]],\n    'sensors': Dict[str, Callable[[], SensorDefinition]]\n}\n
\n
\n

This form is intended to allow definitions to be created lazily when accessed by name,\nwhich can be helpful for performance when there are many definitions in a repository, or\nwhen constructing the definitions is costly.

\n

3. A RepositoryData. Return this object if you need fine-grained\ncontrol over the construction and indexing of definitions within the repository, e.g., to\ncreate definitions dynamically from .yaml files in a directory.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the repository. Defaults to the name of the decorated\nfunction.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
  • metadata (Optional[Dict[str, RawMetadataValue]]) \u2013 Arbitrary metadata for the repository.

  • \n
  • top_level_resources (Optional[Mapping[str, ResourceDefinition]]) \u2013 A dict of top-level\nresource keys to definitions, for resources which should be displayed in the UI.

  • \n
\n
\n
\n

Example

\n
######################################################################\n# A simple repository using the first form of the decorated function\n######################################################################\n\n@op(config_schema={n: Field(Int)})\ndef return_n(context):\n    return context.op_config['n']\n\n@job\ndef simple_job():\n    return_n()\n\n@job\ndef some_job():\n    ...\n\n@sensor(job=some_job)\ndef some_sensor():\n    if foo():\n        yield RunRequest(\n            run_key= ...,\n            run_config={\n                'ops': {'return_n': {'config': {'n': bar()}}}\n            }\n        )\n\n@job\ndef my_job():\n    ...\n\nmy_schedule = ScheduleDefinition(cron_schedule="0 0 * * *", job=my_job)\n\n@repository\ndef simple_repository():\n    return [simple_job, some_sensor, my_schedule]\n\n######################################################################\n# A simple repository using the first form of the decorated function\n# and custom metadata that will be displayed in the UI\n######################################################################\n\n...\n\n@repository(\n    name='my_repo',\n    metadata={\n        'team': 'Team A',\n        'repository_version': '1.2.3',\n        'environment': 'production',\n })\ndef simple_repository():\n    return [simple_job, some_sensor, my_schedule]\n\n######################################################################\n# A lazy-loaded repository\n######################################################################\n\ndef make_expensive_job():\n    @job\n    def expensive_job():\n        for i in range(10000):\n            return_n.alias(f'return_n_{i}')()\n\n    return expensive_job\n\ndef make_expensive_schedule():\n    @job\n    def other_expensive_job():\n        for i in range(11000):\n            return_n.alias(f'my_return_n_{i}')()\n\n    return ScheduleDefinition(cron_schedule="0 0 * * *", job=other_expensive_job)\n\n@repository\ndef lazy_loaded_repository():\n    return {\n        'jobs': 
{'expensive_job': make_expensive_job},\n        'schedules': {'expensive_schedule': make_expensive_schedule}\n    }\n\n\n######################################################################\n# A complex repository that lazily constructs jobs from a directory\n# of files in a bespoke YAML format\n######################################################################\n\nclass ComplexRepositoryData(RepositoryData):\n    def __init__(self, yaml_directory):\n        self._yaml_directory = yaml_directory\n\n    def get_all_jobs(self):\n        return [\n            self._construct_job_def_from_yaml_file(\n              self._yaml_file_for_job_name(file_name)\n            )\n            for file_name in os.listdir(self._yaml_directory)\n        ]\n\n    ...\n\n@repository\ndef complex_repository():\n    return ComplexRepositoryData('some_directory')\n
\n
\n
\n\n
\n
\nclass dagster.RepositoryDefinition(name, *, repository_data, description=None, metadata=None, repository_load_data=None)[source]\u00b6
\n

Define a repository that contains a group of definitions.

\n

Users should typically not create objects of this class directly. Instead, use the\n@repository() decorator.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the repository.

  • \n
  • repository_data (RepositoryData) \u2013 Contains the definitions making up the repository.

  • \n
  • description (Optional[str]) \u2013 A string description of the repository.

  • \n
  • metadata (Optional[MetadataMapping]) \u2013 A map of arbitrary metadata for the repository.

  • \n
\n
\n
\n
\n
\nproperty description\u00b6
\n

A human-readable description of the repository.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nget_all_jobs()[source]\u00b6
\n

Return all jobs in the repository as a list.

\n

Note that this will construct any job in the lazily evaluated dictionary that has\nnot yet been constructed.

\n
\n
Returns:
\n

All jobs in the repository.

\n
\n
Return type:
\n

List[JobDefinition]

\n
\n
\n
\n\n
\n
\nget_asset_value_loader(instance=None)[source]\u00b6
\n

Returns an object that can load the contents of assets as Python objects.

\n

Invokes load_input on the IOManager associated with the assets. Avoids\nspinning up resources separately for each asset.

\n

Usage:

\n
with my_repo.get_asset_value_loader() as loader:\n    asset1 = loader.load_asset_value("asset1")\n    asset2 = loader.load_asset_value("asset2")\n
\n
\n
\n\n
\n
\nget_job(name)[source]\u00b6
\n

Get a job by name.

\n

If this job is present in the lazily evaluated dictionary passed to the\nconstructor, but has not yet been constructed, only this job is constructed, and\nwill be cached for future calls.

\n
\n
Parameters:
\n

name (str) \u2013 Name of the job to retrieve.

\n
\n
Returns:
\n

The job definition corresponding to\nthe given name.

\n
\n
Return type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nget_schedule_def(name)[source]\u00b6
\n

Get a schedule definition by name.

\n
\n
Parameters:
\n

name (str) \u2013 The name of the schedule.

\n
\n
Returns:
\n

The schedule definition.

\n
\n
Return type:
\n

ScheduleDefinition

\n
\n
\n
\n\n
\n
\nget_sensor_def(name)[source]\u00b6
\n

Get a sensor definition by name.

\n
\n
Parameters:
\n

name (str) \u2013 The name of the sensor.

\n
\n
Returns:
\n

The sensor definition.

\n
\n
Return type:
\n

SensorDefinition

\n
\n
\n
\n\n
\n
\nhas_job(name)[source]\u00b6
\n

Check if a job with a given name is present in the repository.

\n
\n
Parameters:
\n

name (str) \u2013 The name of the job.

\n
\n
Returns:
\n

bool

\n
\n
\n
\n\n
\n
\nhas_schedule_def(name)[source]\u00b6
\n

bool: Check if a schedule with a given name is present in the repository.

\n
\n\n
\n
\nhas_sensor_def(name)[source]\u00b6
\n

bool: Check if a sensor with a given name is present in the repository.

\n
\n\n
\n
\nproperty job_names\u00b6
\n

Names of all jobs in the repository.

\n
\n
Type:
\n

List[str]

\n
\n
\n
\n\n
\n
\nload_asset_value(asset_key, *, python_type=None, instance=None, partition_key=None, metadata=None, resource_config=None)[source]\u00b6
\n

Load the contents of an asset as a Python object.

\n

Invokes load_input on the IOManager associated with the asset.

\n

If you want to load the values of multiple assets, it\u2019s more efficient to use\nget_asset_value_loader(), which avoids spinning up\nresources separately for each asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (Union[AssetKey, Sequence[str], str]) \u2013 The key of the asset to load.

  • \n
  • python_type (Optional[Type]) \u2013 The python type to load the asset as. This is what will\nbe returned inside load_input by context.dagster_type.typing_type.

  • \n
  • partition_key (Optional[str]) \u2013 The partition of the asset to load.

  • \n
  • metadata (Optional[Dict[str, Any]]) \u2013 Input metadata to pass to the IOManager\n(is equivalent to setting the metadata argument in In or AssetIn).

  • \n
  • resource_config (Optional[Any]) \u2013 A dictionary of resource configurations to be passed\nto the IOManager.

  • \n
\n
\n
Returns:
\n

The contents of an asset as a Python object.

\n
\n
\n
\n\n
\n
\nproperty metadata\u00b6
\n

Arbitrary metadata for the repository.

\n
\n
Type:
\n

Optional[MetadataMapping]

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the repository.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty schedule_defs\u00b6
\n

All schedules in the repository.

\n
\n
Type:
\n

List[ScheduleDefinition]

\n
\n
\n
\n\n
\n
\nproperty sensor_defs\u00b6
\n

All sensors in the repository.

\n
\n
Type:
\n

Sequence[SensorDefinition]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.RepositoryData[source]\u00b6
\n

Users should usually rely on the @repository decorator to create new\nrepositories, which will in turn call the static constructors on this class. However, users may\nsubclass RepositoryData for fine-grained control over access to and lazy creation\nof repository members.

\n
\n
\nabstract get_all_jobs()[source]\u00b6
\n

Return all jobs in the repository as a list.

\n
\n
Returns:
\n

All jobs in the repository.

\n
\n
Return type:
\n

List[JobDefinition]

\n
\n
\n
\n\n
\n
\nget_all_schedules()[source]\u00b6
\n

Return all schedules in the repository as a list.

\n
\n
Returns:
\n

All schedules in the repository.

\n
\n
Return type:
\n

List[ScheduleDefinition]

\n
\n
\n
\n\n
\n
\nget_all_sensors()[source]\u00b6
\n

Sequence[SensorDefinition]: Return all sensors in the repository as a list.

\n
\n\n
\n
\nget_assets_defs_by_key()[source]\u00b6
\n

Mapping[AssetKey, AssetsDefinition]: Get the asset definitions for the repository.

\n
\n\n
\n
\nget_job(job_name)[source]\u00b6
\n

Get a job by name.

\n
\n
Parameters:
\n

job_name (str) \u2013 Name of the job to retrieve.

\n
\n
Returns:
\n

The job definition corresponding to the given name.

\n
\n
Return type:
\n

JobDefinition

\n
\n
\n
\n\n
\n
\nget_job_names()[source]\u00b6
\n

Get the names of all jobs in the repository.

\n
\n
Returns:
\n

List[str]

\n
\n
\n
\n\n
\n
\nget_schedule(schedule_name)[source]\u00b6
\n

Get a schedule by name.

\n
\n
Parameters:
\n

schedule_name (str) \u2013 name of the schedule to retrieve.

\n
\n
Returns:
\n

The schedule definition corresponding to the given name.

\n
\n
Return type:
\n

ScheduleDefinition

\n
\n
\n
\n\n
\n
\nget_schedule_names()[source]\u00b6
\n

Get the names of all schedules in the repository.

\n
\n
Returns:
\n

List[str]

\n
\n
\n
\n\n
\n
\nget_sensor(sensor_name)[source]\u00b6
\n

Get a sensor by name.

\n
\n
Parameters:
\n

sensor_name (str) \u2013 name of the sensor to retrieve.

\n
\n
Returns:
\n

The sensor definition corresponding to the given name.

\n
\n
Return type:
\n

SensorDefinition

\n
\n
\n
\n\n
\n
\nget_sensor_names()[source]\u00b6
\n

Sequence[str]: Get the names of all sensors in the repository.

\n
\n\n
\n
\nget_source_assets_by_key()[source]\u00b6
\n

Mapping[AssetKey, SourceAsset]: Get the source assets for the repository.

\n
\n\n
\n
\nhas_job(job_name)[source]\u00b6
\n

Check if a job with a given name is present in the repository.

\n
\n
Parameters:
\n

job_name (str) \u2013 The name of the job.

\n
\n
Returns:
\n

bool

\n
\n
\n
\n\n
\n
\nhas_schedule(schedule_name)[source]\u00b6
\n

Check if a schedule with a given name is present in the repository.

\n
\n\n
\n
\nhas_sensor(sensor_name)[source]\u00b6
\n

Check if a sensor with a given name is present in the repository.

\n
\n\n
\n\n
\n", "current_page_name": "sections/api/apidocs/repositories", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../resources/", "title": "Resources"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../definitions/", "title": "Definitions"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/resources", "Resources", "N", "next"], ["sections/api/apidocs/definitions", "Definitions", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/repositories.rst.txt", "title": "Repositories", "toc": "\n"}, "resources": {"alabaster_version": "0.7.13", "body": "
\n

Resources\u00b6

\n
\n

Pythonic resource system\u00b6

\n

The following classes are used as part of the new Pythonic resources system.

\n
\n
\nclass dagster.ConfigurableResource[source]\u00b6
\n

Base class for Dagster resources that utilize structured config.

\n

This class is a subclass of both ResourceDefinition and Config.

\n

Example definition:

\n
class WriterResource(ConfigurableResource):\n    prefix: str\n\n    def output(self, text: str) -> None:\n        print(f"{self.prefix}{text}")\n
\n
\n

Example usage:

\n
@asset\ndef asset_that_uses_writer(writer: WriterResource):\n    writer.output("text")\n\ndefs = Definitions(\n    assets=[asset_that_uses_writer],\n    resources={"writer": WriterResource(prefix="a_prefix")},\n)\n
\n
\n
\n\n
\n
\nclass dagster.ResourceDefinition(resource_fn, config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Core class for defining resources.

\n

Resources are scoped ways to make external resources (like database connections) available to\nops and assets during job execution and to clean up after execution resolves.

\n

If resource_fn yields once rather than returning (in the manner of functions decorable with\n@contextlib.contextmanager) then the body of the\nfunction after the yield will be run after execution resolves, allowing users to write their\nown teardown/cleanup logic.

\n

Depending on your executor, resources may be instantiated and cleaned up more than once in a\njob execution.

\n
\n
Parameters:
\n
    \n
  • resource_fn (Callable[[InitResourceContext], Any]) \u2013 User-provided function to instantiate\nthe resource, which will be made available to executions keyed on the\ncontext.resources object.

  • \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. If set, Dagster will check\nthat config provided for the resource matches this schema and fail if it does not. If\nnot set, Dagster will accept any config provided for the resource.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this\nresource. A DagsterInvariantViolationError will be raised during initialization if\ndependencies are cyclic.

  • \n
  • version (Optional[str]) \u2013 \n \n (\n experimental\n )\n \n (This parameter may break in future versions, even between dot releases.) (Experimental) The version of the resource\u2019s definition fn. Two\nwrapped resource functions should only have the same version if they produce the same\nresource definition when provided with the same inputs.

  • \n
\n
\n
\n
\n
\nproperty description\u00b6
\n

A human-readable description of the resource.

\n
\n\n
\n
\nstatic hardcoded_resource(value, description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition with a hardcoded object.

\n
\n
Parameters:
\n
    \n
  • value (Any) \u2013 The value that will be accessible via context.resources.resource_name.

  • \n
  • description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

  • \n
\n
\n
Returns:
\n

A hardcoded resource.

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nstatic mock_resource(description=None)[source]\u00b6
\n

A helper function that creates a ResourceDefinition which wraps a mock.MagicMock.

\n
\n
Parameters:
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns:
\n

\n
A resource that creates the magic methods automatically and helps

you mock existing resources.

\n
\n
\n

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nstatic none_resource(description=None)[source]\u00b6
\n

A helper function that returns a none resource.

\n
\n
Parameters:
\n

description ([Optional[str]]) \u2013 The description of the resource. Defaults to None.

\n
\n
Returns:
\n

A resource that does nothing.

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

A set of the resource keys that this resource depends on. These keys will be made available\nto the resource\u2019s init context during execution, and the resource will not be instantiated\nuntil all required resources are available.

\n
\n\n
\n
\nstatic string_resource(description=None)[source]\u00b6
\n

Creates a ResourceDefinition which takes in a single string as configuration\nand returns this configured string to any ops or assets which depend on it.

\n
\n
Parameters:
\n

description ([Optional[str]]) \u2013 The description of the string resource. Defaults to None.

\n
\n
Returns:
\n

\n
A resource that takes in a single string as configuration and

returns that string.

\n
\n
\n

\n
\n
Return type:
\n

[ResourceDefinition]

\n
\n
\n
\n\n
\n
\nproperty version\u00b6
\n

A string which can be used to identify a particular code version of a resource definition.

\n
\n\n
\n\n
\n
\nclass dagster.InitResourceContext(resource_config, resources, resource_def=None, instance=None, dagster_run=None, log_manager=None)[source]\u00b6
\n

The context object available as the argument to the initialization function of a dagster.ResourceDefinition.

\n

Users should not instantiate this object directly. To construct an InitResourceContext for testing purposes, use dagster.build_init_resource_context().

\n

Example

\n
from dagster import resource, InitResourceContext\n\n@resource\ndef the_resource(init_context: InitResourceContext):\n    init_context.log.info("Hello, world!")\n
\n
\n
\n
\nproperty instance\u00b6
\n

The Dagster instance configured for the current execution context.

\n
\n\n
\n
\nproperty log\u00b6
\n

The Dagster log manager configured for the current execution context.

\n
\n\n
\n
\nproperty log_manager\u00b6
\n

The log manager for this run of the job.

\n
\n\n
\n
\nproperty resource_config\u00b6
\n

The configuration data provided by the run config. The schema\nfor this data is defined by the config_schema argument to\nResourceDefinition.

\n
\n\n
\n
\nproperty resource_def\u00b6
\n

The definition of the resource currently being constructed.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources that are available to the resource that we are initializing.

\n
\n\n
\n
\nproperty run_id\u00b6
\n

The id for this run of the job or pipeline. When initializing resources outside of\nexecution context, this will be None.

\n
\n\n
\n\n
\n
\ndagster.make_values_resource(**kwargs)[source]\u00b6
\n

A helper function that creates a ResourceDefinition to take in user-defined values.

\n
\n

This is useful for sharing values between ops.

\n
\n
\n
Parameters:
\n

**kwargs \u2013 Arbitrary keyword arguments that will be passed to the config schema of the\nreturned resource definition. If not set, Dagster will accept any config provided for\nthe resource.

\n
\n
\n

For example:

\n
@op(required_resource_keys={"globals"})\ndef my_op(context):\n    print(context.resources.globals["my_str_var"])\n\n@job(resource_defs={"globals": make_values_resource(my_str_var=str, my_int_var=int)})\ndef my_job():\n    my_op()\n
\n
\n
\n
Returns:
\n

A resource that passes in user-defined values.

\n
\n
Return type:
\n

ResourceDefinition

\n
\n
\n
\n\n
\n
\ndagster.build_init_resource_context(config=None, resources=None, instance=None)[source]\u00b6
\n

Builds resource initialization context from provided parameters.

\n

build_init_resource_context can be used as either a function or context manager. If there is a\nprovided resource to build_init_resource_context that is a context manager, then it must be\nused as a context manager. This function can be used to provide the context argument to the\ninvocation of a resource.

\n
\n
Parameters:
\n
    \n
  • resources (Optional[Dict[str, Any]]) \u2013 The resources to provide to the context. These can be\neither values or resource definitions.

  • \n
  • config (Optional[Any]) \u2013 The resource config to provide to the context.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured for the context.\nDefaults to DagsterInstance.ephemeral().

  • \n
\n
\n
\n

Examples

\n
context = build_init_resource_context()\nresource_to_init(context)\n\nwith build_init_resource_context(\n    resources={"foo": context_manager_resource}\n) as context:\n    resource_to_init(context)\n
\n
\n
\n\n
\n
\ndagster.build_resources(resources, instance=None, resource_config=None, dagster_run=None, log_manager=None)[source]\u00b6
\n

Context manager that yields resources using provided resource definitions and run config.

\n

This API allows for using resources in an independent context. Resources will be initialized\nwith the provided run config, and optionally, dagster_run. The resulting resources will be\nyielded on a dictionary keyed identically to that provided for resource_defs. Upon exiting the\ncontext, resources will also be torn down safely.

\n
\n
Parameters:
\n
    \n
  • resources (Mapping[str, Any]) \u2013 Resource instances or definitions to build. All\nrequired resource dependencies to a given resource must be contained within this\ndictionary, or the resource build will fail.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to instantiate\nresources on.

  • \n
  • resource_config (Optional[Mapping[str, Any]]) \u2013 A dict representing the config to be\nprovided to each resource during initialization and teardown.

  • \n
  • dagster_run (Optional[PipelineRun]) \u2013 The pipeline run to provide during resource\ninitialization and teardown. If the provided resources require either the dagster_run\nor run_id attributes of the provided context during resource initialization and/or\nteardown, this must be provided, or initialization will fail.

  • \n
  • log_manager (Optional[DagsterLogManager]) \u2013 Log Manager to use during resource\ninitialization. Defaults to system log manager.

  • \n
\n
\n
\n

Examples

\n
from dagster import resource, build_resources\n\n@resource\ndef the_resource():\n    return "foo"\n\nwith build_resources(resources={"from_def": the_resource, "from_val": "bar"}) as resources:\n    assert resources.from_def == "foo"\n    assert resources.from_val == "bar"\n
\n
\n
\n\n
\n
\ndagster.with_resources(definitions, resource_defs, resource_config_by_key=None)[source]\u00b6
\n

Adds dagster resources to copies of resource-requiring dagster definitions.

\n

An error will be thrown if any provided definitions have a conflicting\nresource definition provided for a key provided to resource_defs. Resource\nconfig can be provided, with keys in the config dictionary corresponding to\nthe keys for each resource definition. If any definition has unsatisfied\nresource keys after applying with_resources, an error will be thrown.

\n
\n
Parameters:
\n
    \n
  • definitions (Iterable[ResourceAddable]) \u2013 Dagster definitions to provide resources to.

  • \n
  • resource_defs (Mapping[str, object]) \u2013 Mapping of resource keys to objects to satisfy\nresource requirements of provided dagster definitions.

  • \n
  • resource_config_by_key (Optional[Mapping[str, Any]]) \u2013 Specifies config for provided resources. The key in this dictionary\ncorresponds to configuring the same key in the resource_defs\ndictionary.

  • \n
\n
\n
\n

Examples

\n
from dagster import asset, resource, with_resources\n\n@resource(config_schema={"bar": str})\ndef foo_resource():\n    ...\n\n@asset(required_resource_keys={"foo"})\ndef asset1(context):\n    foo = context.resources.foo\n    ...\n\n@asset(required_resource_keys={"foo"})\ndef asset2(context):\n    foo = context.resources.foo\n    ...\n\nasset1_with_foo, asset2_with_foo = with_resources(\n    [asset1, asset2],\n    resource_defs={"foo": foo_resource},\n    resource_config_by_key={\n        "foo": {\n            "config": {"bar": ...}\n        }\n    }\n)\n
\n
\n
\n\n
\n
\n

Legacy resource system\u00b6

\n

The following classes are used as part of the legacy resource system.

\n
\n
\n@dagster.resource(config_schema=None, description=None, required_resource_keys=None, version=None)[source]\u00b6
\n

Define a resource.

\n

The decorated function should accept an InitResourceContext and return an instance of\nthe resource. This function will become the resource_fn of an underlying\nResourceDefinition.

\n

If the decorated function yields once rather than returning (in the manner of functions\ndecorable with @contextlib.contextmanager) then\nthe body of the function after the yield will be run after execution resolves, allowing users\nto write their own teardown/cleanup logic.

\n
\n
Parameters:
\n
    \n
  • config_schema (Optional[ConfigSchema]) \u2013 The schema for the config. Configuration data available in\ninit_context.resource_config. If not set, Dagster will accept any config provided.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the resource.

  • \n
  • version (Optional[str]) \u2013 (Experimental) The version of a resource function. Two wrapped\nresource functions should only have the same version if they produce the same resource\ndefinition when provided with the same inputs.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Keys for the resources required by this resource.

  • \n
\n
\n
\n
\n\n
\n
\n", "current_page_name": "sections/api/apidocs/resources", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../schedules-sensors/", "title": "Run Requests"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../repositories/", "title": "Repositories"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/schedules-sensors", "Run Requests", "N", "next"], ["sections/api/apidocs/repositories", "Repositories", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/resources.rst.txt", "title": "Resources", "toc": "\n"}, "schedules-sensors": {"alabaster_version": "0.7.13", "body": "
\n

Run Requests\u00b6

\n
\n
\nclass dagster.RunRequest(run_key=None, run_config=None, tags=None, job_name=None, asset_selection=None, stale_assets_only=False, partition_key=None)[source]\u00b6
\n

Represents all the information required to launch a single run. Must be returned by a\nSensorDefinition or ScheduleDefinition\u2019s evaluation function for a run to be launched.

\n
\n
\nrun_key\u00b6
\n

A string key to identify this launched run. For sensors, ensures that\nonly one run is created per run key across all sensor evaluations. For schedules,\nensures that one run is created per tick, across failure recoveries. Passing in a None\nvalue means that a run will always be launched per evaluation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nrun_config (Optional[Mapping[str, Any]])
\n

Configuration for the run. If the job has\na PartitionedConfig, this value will replace the config\nprovided by it.

\n
\n\n
\n
\ntags\u00b6
\n

A dictionary of tags (string key-value pairs) to attach\nto the launched run.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n
\n
\njob_name\u00b6
\n

(Experimental) The name of the job this run request will launch.\nRequired for sensors that target multiple jobs.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nasset_selection\u00b6
\n

A sequence of AssetKeys that should be\nlaunched with this run.

\n
\n
Type:
\n

Optional[Sequence[AssetKey]]

\n
\n
\n
\n\n
\n
\nstale_assets_only\u00b6
\n

Set to true to further narrow the asset\nselection to stale assets. If passed without an asset selection, all stale assets in the\njob will be materialized. If the job does not materialize assets, this flag is ignored.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\npartition_key\u00b6
\n

The partition key for this run request.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SkipReason(skip_message=None)[source]\u00b6
\n

Represents a skipped evaluation, where no runs are requested. May contain a message to indicate\nwhy no runs were requested.

\n
\n
\nskip_message\u00b6
\n

A message displayed in the Dagster UI for why this evaluation resulted\nin no requested runs.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n\n
\n
\n

Schedules\u00b6

\n
\n
\n@dagster.schedule(cron_schedule, *, job_name=None, name=None, tags=None, tags_fn=None, should_execute=None, environment_vars=None, execution_timezone=None, description=None, job=None, default_status=DefaultScheduleStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Creates a schedule following the provided cron schedule and requests runs for the provided job.

\n

The decorated function takes in a ScheduleEvaluationContext as its only\nargument, and does one of the following:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Return a run config dictionary.

  10. \n
  11. Yield a SkipReason or yield one or more RunRequest objects.

  12. \n
\n

Returns a ScheduleDefinition.

\n
\n
Parameters:
\n
    \n
  • cron_schedule (Union[str, Sequence[str]]) \u2013 A valid cron string or sequence of cron strings\nspecifying when the schedule will run, e.g., '45 23 * * 6' for a schedule that runs\nat 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\nthe union of all execution times for the provided cron strings, e.g.,\n['45 23 * * 6', '30 9 * * 0'] for a schedule that runs at 11:45 PM every Saturday and\n9:30 AM every Sunday.

  • \n
  • name (Optional[str]) \u2013 The name of the schedule to create.

  • \n
  • tags (Optional[Dict[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Dict[str, str]]]]) \u2013 A function\nthat generates tags to attach to the schedules runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags and tags_fn.

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs at\nschedule execution time to determine whether a schedule should execute or skip. Takes a\nScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job\nthat should execute when this schedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The set of resource keys required by the schedule.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.ScheduleDefinition(name=None, *, cron_schedule=None, job_name=None, run_config=None, run_config_fn=None, tags=None, tags_fn=None, should_execute=None, environment_vars=None, execution_timezone=None, execution_fn=None, description=None, job=None, default_status=DefaultScheduleStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Define a schedule that targets a job.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the schedule to create. Defaults to the job name plus\n\u201c_schedule\u201d.

  • \n
  • cron_schedule (Union[str, Sequence[str]]) \u2013 A valid cron string or sequence of cron strings\nspecifying when the schedule will run, e.g., '45 23 * * 6' for a schedule that runs\nat 11:45 PM every Saturday. If a sequence is provided, then the schedule will run for\nthe union of all execution times for the provided cron strings, e.g.,\n['45 23 * * 6', '30 9 * * 0'] for a schedule that runs at 11:45 PM every Saturday and\n9:30 AM every Sunday.

  • \n
  • execution_fn (Callable[ScheduleEvaluationContext]) \u2013

    The core evaluation function for the\nschedule, which is run at an interval to determine whether a run should be launched or\nnot. Takes a ScheduleEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • run_config (Optional[Mapping]) \u2013 The config that parameterizes this execution,\nas a dict.

  • \n
  • run_config_fn (Optional[Callable[[ScheduleEvaluationContext], [Mapping]]]) \u2013 A function that\ntakes a ScheduleEvaluationContext object and returns the run configuration that\nparameterizes this execution, as a dict. You may set only one of run_config,\nrun_config_fn, and execution_fn.

  • \n
  • tags (Optional[Mapping[str, str]]) \u2013 A dictionary of tags (string key-value pairs) to attach\nto the scheduled runs.

  • \n
  • tags_fn (Optional[Callable[[ScheduleEvaluationContext], Optional[Mapping[str, str]]]]) \u2013 A\nfunction that generates tags to attach to the schedules runs. Takes a\nScheduleEvaluationContext and returns a dictionary of tags (string\nkey-value pairs). You may set only one of tags, tags_fn, and execution_fn.

  • \n
  • should_execute (Optional[Callable[[ScheduleEvaluationContext], bool]]) \u2013 A function that runs\nat schedule execution time to determine whether a schedule should execute or skip. Takes\na ScheduleEvaluationContext and returns a boolean (True if the\nschedule should execute). Defaults to a function that always returns True.

  • \n
  • execution_timezone (Optional[str]) \u2013 Timezone in which the schedule should run.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the schedule.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job that should execute when this\nschedule runs.

  • \n
  • default_status (DefaultScheduleStatus) \u2013 Whether the schedule starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 The set of resource keys required by the schedule.

  • \n
\n
\n
\n
\n
\nproperty cron_schedule\u00b6
\n

The cron schedule representing when this schedule will be evaluated.

\n
\n
Type:
\n

Union[str, Sequence[str]]

\n
\n
\n
\n\n
\n
\nproperty default_status\u00b6
\n

The default status for this schedule when it is first loaded in\na code location.

\n
\n
Type:
\n

DefaultScheduleStatus

\n
\n
\n
\n\n
\n
\nproperty description\u00b6
\n

A description for this schedule.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty environment_vars\u00b6
\n
\n
\n

\n \n (\n deprecated\n )\n \n This API will be removed in version 2.0. Setting this property no longer has any effect..\n \n

\n

Environment variables to export to the cron schedule.

\n
\n
Type:
\n

Mapping[str, str]

\n
\n
\n
\n\n
\n
\nproperty execution_timezone\u00b6
\n

The timezone in which this schedule will be evaluated.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty job\u00b6
\n

The job that is\ntargeted by this schedule.

\n
\n
Type:
\n

Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the job targeted by this schedule.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of the schedule.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

The set of keys for resources that must be provided to this schedule.

\n
\n
Type:
\n

Set[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.ScheduleEvaluationContext(instance_ref, scheduled_execution_time, repository_name=None, schedule_name=None, resources=None, repository_def=None)[source]\u00b6
\n

The context object available as the first argument to various functions defined on a dagster.ScheduleDefinition.

\n

A ScheduleEvaluationContext object is passed as the first argument to run_config_fn, tags_fn,\nand should_execute.

\n

Users should not instantiate this object directly. To construct a ScheduleEvaluationContext for testing purposes, use dagster.build_schedule_context().

\n

Example

\n
from dagster import schedule, ScheduleEvaluationContext\n\n@schedule\ndef the_schedule(context: ScheduleEvaluationContext):\n    ...\n
\n
\n
\n
\nproperty instance\u00b6
\n

The current DagsterInstance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty resources\u00b6
\n

Mapping of resource key to resource definition to be made available\nduring schedule execution.

\n
\n\n
\n
\nproperty scheduled_execution_time\u00b6
\n

The time in which the execution was scheduled to happen. May differ slightly\nfrom both the actual execution time and the time at which the run config is computed.

\n
\n\n
\n\n
\n
\ndagster.build_schedule_context(instance=None, scheduled_execution_time=None, resources=None, repository_def=None, instance_ref=None)[source]\u00b6
\n

Builds schedule execution context using the provided parameters.

\n

The instance provided to build_schedule_context must be persistent;\nDagsterInstance.ephemeral() will result in an error.

\n
\n
Parameters:
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the schedule.

  • \n
  • scheduled_execution_time (datetime) \u2013 The time in which the execution was scheduled to\nhappen. May differ slightly from both the actual execution time and the time at which\nthe run config is computed.

  • \n
\n
\n
\n

Examples

\n
context = build_schedule_context(instance)\n
\n
\n
\n\n
\n
\ndagster._core.scheduler.DagsterDaemonScheduler Scheduler[source]\u00b6
\n
\n

\n
\n
\nConfig Schema:
\n
max_catchup_runs (dagster.IntSource, optional):
\n

For partitioned schedules, controls the maximum number of past\npartitions for each schedule that will be considered when looking for missing\nruns. Generally this parameter will only come into play if the scheduler\nfalls behind or launches after experiencing downtime. This parameter will not be checked for\nschedules without partition sets (for example, schedules created using the @schedule\ndecorator) - only the most recent execution time will be considered for those schedules.

\n

Note that no matter what this value is, the scheduler will never launch a run from a time\nbefore the schedule was turned on (even if the start_date on the schedule is earlier) - if\nyou want to launch runs for earlier partitions, launch a backfill.

\n

Default Value: 5

\n
\n
max_tick_retries (dagster.IntSource, optional):
\n

For each schedule tick that raises an error, how many times to retry that tick

\n

Default Value: 0

\n
\n
\n

Default scheduler implementation that submits runs from the dagster-daemon\nlong-lived process. Periodically checks each running schedule for execution times that don\u2019t\nhave runs yet and launches them.

\n
\n\n
\n
\n

Partitioned Schedules\u00b6

\n
\n
\ndagster.build_schedule_from_partitioned_job(job, description=None, name=None, minute_of_hour=None, hour_of_day=None, day_of_week=None, day_of_month=None, default_status=DefaultScheduleStatus.STOPPED, tags=None)[source]\u00b6
\n

Creates a schedule from a time window-partitioned job or a job that targets\ntime window-partitioned assets. The job can also be multipartitioned, as long as one\nof the partitions dimensions is time-partitioned.

\n

The schedule executes at the cadence specified by the time partitioning of the job or assets.

\n

Examples

\n
######################################\n# Job that targets partitioned assets\n######################################\n\nfrom dagster import (\n    DailyPartitionsDefinition,\n    asset,\n    build_schedule_from_partitioned_job,\n    define_asset_job,\n)\n\n@asset(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef asset1():\n    ...\n\nasset1_job = define_asset_job("asset1_job", selection=[asset1])\n\n# The created schedule will fire daily\nasset1_job_schedule = build_schedule_from_partitioned_job(asset1_job)\n\ndefs = Definitions(assets=[asset1], schedules=[asset1_job_schedule])\n\n################\n# Non-asset job\n################\n\nfrom dagster import DailyPartitionsDefinition, build_schedule_from_partitioned_job, job\n\n\n@job(partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"))\ndef do_stuff_partitioned():\n    ...\n\n# The created schedule will fire daily\ndo_stuff_partitioned_schedule = build_schedule_from_partitioned_job(\n    do_stuff_partitioned,\n)\n\ndefs = Definitions(schedules=[do_stuff_partitioned_schedule])\n
\n
\n
\n\n
\n
\n@dagster.hourly_partitioned_config(start_date, minute_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of hourly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset is provided, the start and end times of each partition will be\nminute_offset past the hour.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@hourly_partitioned_config(start_date=datetime(2022, 3, 12))\n# creates partitions (2022-03-12-00:00, 2022-03-12-01:00), (2022-03-12-01:00, 2022-03-12-02:00), ...\n\n@hourly_partitioned_config(start_date=datetime(2022, 3, 12), minute_offset=15)\n# creates partitions (2022-03-12-00:15, 2022-03-12-01:15), (2022-03-12-01:15, 2022-03-12-02:15), ...\n
\n
\n
\n\n
\n
\n@dagster.daily_partitioned_config(start_date, minute_offset=0, hour_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of daily partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the bounds\nof the date partition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date at midnight. The last partition in\nthe set will end before the current time, unless the end_offset argument is set to a positive\nnumber. If minute_offset and/or hour_offset are used, the start and end times of each partition\nwill be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions. Can\nprovide in either a datetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@daily_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-12-00:00, 2022-03-13-00:00), (2022-03-13-00:00, 2022-03-14-00:00), ...\n\n@daily_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=16)\n# creates partitions (2022-03-12-16:15, 2022-03-13-16:15), (2022-03-13-16:15, 2022-03-14-16:15), ...\n
\n
\n
\n\n
\n
\n@dagster.weekly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=0, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of weekly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at the start_date. The last partition in the set will\nend before the current time, unless the end_offset argument is set to a positive number. If\nday_offset is provided, the start and end date of each partition will be the day of the week\ncorresponding to day_offset (0 indexed with Sunday as the start of the week). If\nminute_offset and/or hour_offset are used, the start and end times of each partition will be\nhour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nSunday at midnight following start_date. Can provide in either a datetime or string\nformat.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the week to \u201csplit\u201d the partition. Defaults to 0 (Sunday).

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@weekly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-03-13-00:00, 2022-03-20-00:00), (2022-03-20-00:00, 2022-03-27-00:00), ...\n\n@weekly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=6)\n# creates partitions (2022-03-12-03:15, 2022-03-19-03:15), (2022-03-19-03:15, 2022-03-26-03:15), ...\n
\n
\n
\n\n
\n
\n@dagster.monthly_partitioned_config(start_date, minute_offset=0, hour_offset=0, day_offset=1, timezone=None, fmt=None, end_offset=0, tags_for_partition_fn=None)[source]
\n

Defines run config over a set of monthly partitions.

\n

The decorated function should accept a start datetime and end datetime, which represent the date\npartition the config should delineate.

\n

The decorated function should return a run config dictionary.

\n

The resulting object created by this decorator can be provided to the config argument of a Job.\nThe first partition in the set will start at midnight on the soonest first of the month after\nstart_date. The last partition in the set will end before the current time, unless the\nend_offset argument is set to a positive number. If day_offset is provided, the start and end\ndate of each partition will be day_offset. If minute_offset and/or hour_offset are used, the\nstart and end times of each partition will be hour_offset:minute_offset of each day.

\n
\n
Parameters:
\n
    \n
  • start_date (Union[datetime.datetime, str]) \u2013 The first date in the set of partitions will be\nmidnight on the soonest first of the month following start_date. Can provide in either a\ndatetime or string format.

  • \n
  • minute_offset (int) \u2013 Number of minutes past the hour to \u201csplit\u201d the partition. Defaults\nto 0.

  • \n
  • hour_offset (int) \u2013 Number of hours past 00:00 to \u201csplit\u201d the partition. Defaults to 0.

  • \n
  • day_offset (int) \u2013 Day of the month to \u201csplit\u201d the partition. Defaults to 1.

  • \n
  • timezone (Optional[str]) \u2013 The timezone in which each date should exist.\nSupported strings for timezones are the ones provided by the\nIANA time zone database <https://www.iana.org/time-zones> - e.g. \u201cAmerica/Los_Angeles\u201d.

  • \n
  • fmt (Optional[str]) \u2013 The date format to use. Defaults to %Y-%m-%d.

  • \n
  • end_offset (int) \u2013 Extends the partition set by a number of partitions equal to the value\npassed. If end_offset is 0 (the default), the last partition ends before the current\ntime. If end_offset is 1, the second-to-last partition ends before the current time,\nand so on.

  • \n
  • tags_for_partition_fn (Optional[Callable[[str], Mapping[str, str]]]) \u2013 A function that\naccepts a partition time window and returns a dictionary of tags to attach to runs for\nthat partition.

  • \n
\n
\n
\n
@monthly_partitioned_config(start_date="2022-03-12")\n# creates partitions (2022-04-01-00:00, 2022-05-01-00:00), (2022-05-01-00:00, 2022-06-01-00:00), ...\n\n@monthly_partitioned_config(start_date="2022-03-12", minute_offset=15, hour_offset=3, day_offset=5)\n# creates partitions (2022-04-05-03:15, 2022-05-05-03:15), (2022-05-05-03:15, 2022-06-05-03:15), ...\n
\n
\n
\n\n
\n
\n

Sensors\u00b6

\n
\n
\n@dagster.sensor(job_name=None, *, name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, asset_selection=None, required_resource_keys=None)[source]\u00b6
\n

Creates a sensor where the decorated function is used as the sensor\u2019s evaluation function.

\n

The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Yield a SkipReason or yield one or more RunRequest objects.

  10. \n
\n

Takes a SensorEvaluationContext.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • asset_selection (AssetSelection) \u2013 (Experimental) an asset selection to launch a run for if\nthe sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.SensorDefinition(name=None, *, evaluation_fn=None, job_name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, asset_selection=None, required_resource_keys=None)[source]\u00b6
\n

Define a sensor that initiates a set of runs based on some external state.

\n
\n
Parameters:
\n
    \n
  • evaluation_fn (Callable[[SensorEvaluationContext]]) \u2013

    The core evaluation function for the\nsensor, which is run at an interval to determine whether a run should be launched or\nnot. Takes a SensorEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • name (Optional[str]) \u2013 The name of the sensor to create. Defaults to name of evaluation_fn

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[GraphDefinition, JobDefinition, UnresolvedAssetJob]) \u2013 The job to execute when this sensor fires.

  • \n
  • jobs (Optional[Sequence[GraphDefinition, JobDefinition, UnresolvedAssetJob]]) \u2013 (experimental) A list of jobs to execute when this sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • asset_selection (AssetSelection) \u2013 (Experimental) an asset selection to launch a run for if\nthe sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n
\nproperty default_status\u00b6
\n

The default status for this sensor when it is first loaded in\na code location.

\n
\n
Type:
\n

DefaultSensorStatus

\n
\n
\n
\n\n
\n
\nproperty description\u00b6
\n

A description for this sensor.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty job\u00b6
\n

The job that is\ntargeted by this sensor.

\n
\n
Type:
\n

Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]

\n
\n
\n
\n\n
\n
\nproperty job_name\u00b6
\n

The name of the job that is targeted by this sensor.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty jobs\u00b6
\n

A list of jobs\nthat are targeted by this sensor.

\n
\n
Type:
\n

List[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]

\n
\n
\n
\n\n
\n
\nproperty minimum_interval_seconds\u00b6
\n

The minimum number of seconds between sequential evaluations of this sensor.

\n
\n
Type:
\n

Optional[int]

\n
\n
\n
\n\n
\n
\nproperty name\u00b6
\n

The name of this sensor.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

The set of keys for resources that must be provided to this sensor.

\n
\n
Type:
\n

Set[str]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.SensorEvaluationContext(instance_ref, last_completion_time, last_run_key, cursor, repository_name, repository_def=None, instance=None, sensor_name=None, resources=None, definitions=None)[source]
\n

The context object available as the argument to the evaluation function of a dagster.SensorDefinition.

\n

Users should not instantiate this object directly. To construct a\nSensorEvaluationContext for testing purposes, use dagster.\nbuild_sensor_context().

\n
\n
\ninstance_ref
\n

The serialized instance configured to run the sensor

\n
\n
Type:
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\ncursor
\n

The cursor, passed back from the last sensor evaluation via\nthe cursor attribute of SkipReason and RunRequest

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nlast_completion_time
\n

DEPRECATED The last time that the sensor was evaluated (UTC).

\n
\n
Type:
\n

float

\n
\n
\n
\n\n
\n
\nlast_run_key
\n

DEPRECATED The run key of the RunRequest most recently created by this\nsensor. Use the preferred cursor attribute instead.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nrepository_name
\n

The name of the repository that the sensor belongs to.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nrepository_def
\n

The repository that\nthe sensor belongs to. If needed by the sensor, top-level resource definitions will be\npulled from this repository. You can provide either this or definitions.

\n
\n
Type:
\n

Optional[RepositoryDefinition]

\n
\n
\n
\n\n
\n
\ninstance
\n

The deserialized instance can also be passed in\ndirectly (primarily useful in testing contexts).

\n
\n
Type:
\n

Optional[DagsterInstance]

\n
\n
\n
\n\n
\n
\ndefinitions
\n

Definitions object that the sensor is defined in.\nIf needed by the sensor, top-level resource definitions will be pulled from these\ndefinitions. You can provide either this or repository_def.

\n
\n
Type:
\n

Optional[Definitions]

\n
\n
\n
\n\n
\n
\nresources
\n

A dict of resource keys to resource\ndefinitions to be made available during sensor execution.

\n
\n
Type:
\n

Optional[Dict[str, Any]]

\n
\n
\n
\n\n

Example

\n
from dagster import sensor, SensorEvaluationContext\n\n@sensor\ndef the_sensor(context: SensorEvaluationContext):\n    ...\n
\n
\n
\n
\nproperty cursor
\n

The cursor value for this sensor, which was set in an earlier sensor evaluation.

\n
\n\n
\n
\nproperty instance
\n

The current DagsterInstance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n
\nproperty last_completion_time
\n

Timestamp representing the last time this sensor completed an evaluation.

\n
\n
Type:
\n

Optional[float]

\n
\n
\n
\n\n
\n
\nproperty last_run_key
\n

The run key supplied to the most recent RunRequest produced by this sensor.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty repository_def
\n

The RepositoryDefinition that this sensor resides in.

\n
\n
Type:
\n

Optional[RepositoryDefinition]

\n
\n
\n
\n\n
\n
\nproperty repository_name
\n

The name of the repository that this sensor resides in.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty resources
\n

A mapping from resource key to instantiated resources for this sensor.

\n
\n
Type:
\n

Resources

\n
\n
\n
\n\n
\n
\nupdate_cursor(cursor)[source]
\n

Updates the cursor value for this sensor, which will be provided on the context for the\nnext sensor evaluation.

\n

This can be used to keep track of progress and avoid duplicate work across sensor\nevaluations.

\n
\n
Parameters:
\n

cursor (Optional[str]) \u2013

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_sensor_context(instance=None, cursor=None, repository_name=None, repository_def=None, sensor_name=None, resources=None, definitions=None, instance_ref=None)[source]\u00b6
\n

Builds sensor execution context using the provided parameters.

\n

This function can be used to provide a context to the invocation of a sensor definition. If\nprovided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\nerror.

\n
\n
Parameters:
\n
    \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the sensor.

  • \n
  • cursor (Optional[str]) \u2013 A cursor value to provide to the evaluation of the sensor.

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository that the sensor belongs to.

  • \n
  • repository_def (Optional[RepositoryDefinition]) \u2013 The repository that the sensor belongs to.\nIf needed by the sensor top-level resource definitions will be pulled from this repository.\nYou can provide either this or definitions.

  • \n
  • resources (Optional[Mapping[str, ResourceDefinition]]) \u2013 A set of resource definitions\nto provide to the sensor. If passed, these will override any resource definitions\nprovided by the repository.

  • \n
  • definitions (Optional[Definitions]) \u2013 Definitions object that the sensor is defined in.\nIf needed by the sensor, top-level resource definitions will be pulled from these\ndefinitions. You can provide either this or repository_def.

  • \n
\n
\n
\n

Examples

\n
context = build_sensor_context()\nmy_sensor(context)\n
\n
\n
\n\n
\n
\n@dagster.asset_sensor(asset_key, *, job_name=None, name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Creates an asset sensor where the decorated function is used as the asset sensor\u2019s evaluation\nfunction.

\n

If the asset has been materialized multiple times since the last sensor tick, the\nevaluation function will only be invoked once, with the latest materialization.

\n

The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Yield a SkipReason or yield one or more RunRequest objects.

  10. \n
\n

Takes a SensorEvaluationContext and an EventLogEntry corresponding to an\nAssetMaterialization event.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The\njob to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n

Example

\n
from dagster import AssetKey, EventLogEntry, RunRequest, SensorEvaluationContext, asset_sensor\n\n\n@asset_sensor(asset_key=AssetKey("my_table"), job=my_job)\ndef my_asset_sensor(context: SensorEvaluationContext, asset_event: EventLogEntry):\n    return RunRequest(\n        run_key=context.cursor,\n        run_config={\n            "ops": {\n                "read_materialization": {\n                    "config": {\n                        "asset_key": asset_event.dagster_event.asset_key.path,\n                    }\n                }\n            }\n        },\n    )\n
\n
\n
\n\n
\n
\nclass dagster.AssetSensorDefinition(name, asset_key, job_name, asset_materialization_fn, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Define an asset sensor that initiates a set of runs based on the materialization of a given\nasset.

\n

If the asset has been materialized multiple times since the last sensor tick, the\nevaluation function will only be invoked once, with the latest materialization.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor to create.

  • \n
  • asset_key (AssetKey) \u2013 The asset_key this sensor monitors.

  • \n
  • asset_materialization_fn (Callable[[SensorEvaluationContext, EventLogEntry], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]) \u2013

    The core\nevaluation function for the sensor, which is run at an interval to determine whether a\nrun should be launched or not. Takes a SensorEvaluationContext and\nan EventLogEntry corresponding to an AssetMaterialization event.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job\nobject to target with this sensor.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n
\n
\nproperty asset_key\u00b6
\n

The key of the asset targeted by this sensor.

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n\n
\n
\n@dagster.freshness_policy_sensor(asset_selection, *, name=None, minimum_interval_seconds=None, description=None, default_status=DefaultSensorStatus.STOPPED)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Define a sensor that reacts to the status of a given set of asset freshness policies, where the\ndecorated function will be evaluated on every tick for each asset in the selection that has a\nFreshnessPolicy defined.

\n

Note: returning or yielding a value from the annotated function will result in an error.

\n

Takes a FreshnessPolicySensorContext.

\n
\n
Parameters:
\n
    \n
  • asset_selection (AssetSelection) \u2013 The asset selection monitored by the sensor.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]) \u2013 The core\nevaluation function for the sensor. Takes a FreshnessPolicySensorContext.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.FreshnessPolicySensorDefinition(name, asset_selection, freshness_policy_sensor_fn, minimum_interval_seconds=None, description=None, default_status=DefaultSensorStatus.STOPPED, required_resource_keys=None)[source]\u00b6
\n

Define a sensor that reacts to the status of a given set of asset freshness policies,\nwhere the decorated function will be evaluated on every sensor tick.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • freshness_policy_sensor_fn (Callable[[FreshnessPolicySensorContext], None]) \u2013 The core\nevaluation function for the sensor. Takes a FreshnessPolicySensorContext.

  • \n
  • asset_selection (AssetSelection) \u2013 The asset selection monitored by the sensor.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.FreshnessPolicySensorContext(sensor_name, asset_key, freshness_policy, minutes_overdue, previous_minutes_overdue, instance, resources=None)[source]
\n

The context object available to a decorated function of freshness_policy_sensor.

\n
\n
\nsensor_name
\n

the name of the sensor.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nasset_key
\n

the key of the asset being monitored

\n
\n
Type:
\n

AssetKey

\n
\n
\n
\n\n
\n
\nfreshness_policy
\n

the freshness policy of the asset being monitored

\n
\n
Type:
\n

FreshnessPolicy

\n
\n
\n
\n\n
\n
\nminutes_overdue
\n
\n
Type:
\n

Optional[float]

\n
\n
\n
\n\n
\n
\nprevious_minutes_overdue
\n

the minutes_overdue value for this asset on the\nprevious sensor tick.

\n
\n
Type:
\n

Optional[float]

\n
\n
\n
\n\n
\n
\ninstance
\n

the current instance.

\n
\n
Type:
\n

DagsterInstance

\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_freshness_policy_sensor_context(sensor_name, asset_key, freshness_policy, minutes_overdue, previous_minutes_overdue=None, instance=None, resources=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Builds freshness policy sensor context from provided parameters.

\n

This function can be used to provide the context argument when directly invoking a function\ndecorated with @freshness_policy_sensor, such as when writing unit tests.

\n
\n
Parameters:
\n
    \n
  • sensor_name (str) \u2013 The name of the sensor the context is being constructed for.

  • \n
  • asset_key (AssetKey) \u2013 The AssetKey for the monitored asset

  • \n
  • freshness_policy (FreshnessPolicy) \u2013 The FreshnessPolicy for the monitored asset

  • \n
  • minutes_overdue (Optional[float]) \u2013 How overdue the monitored asset currently is

  • \n
  • previous_minutes_overdue (Optional[float]) \u2013 How overdue the monitored asset was on the\nprevious tick.

  • \n
  • instance (DagsterInstance) \u2013 The dagster instance configured for the context.

  • \n
\n
\n
\n

Examples

\n
context = build_freshness_policy_sensor_context(\n    sensor_name="freshness_policy_sensor_to_invoke",\n    asset_key=AssetKey("some_asset"),\n    freshness_policy=FreshnessPolicy(maximum_lag_minutes=30),\n    minutes_overdue=10.0,\n)\nfreshness_policy_sensor_to_invoke(context)\n
\n
\n
\n\n
\n
\n@dagster.multi_asset_sensor(monitored_assets, *, job_name=None, name=None, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, request_assets=None, required_resource_keys=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Creates an asset sensor that can monitor multiple assets.

\n

The decorated function is used as the asset sensor\u2019s evaluation\nfunction. The decorated function may:

\n
    \n
  1. Return a RunRequest object.

  2. \n
  3. Return a list of RunRequest objects.

  4. \n
  5. Return a SkipReason object, providing a descriptive message of why no runs were requested.

  6. \n
  7. Return nothing (skipping without providing a reason)

  8. \n
  9. Yield a SkipReason or yield one or more RunRequest objects.

  10. \n
\n

Takes a MultiAssetSensorEvaluationContext.

\n
\n
Parameters:
\n
    \n
  • monitored_assets (Union[Sequence[AssetKey], AssetSelection]) \u2013 The assets this\nsensor monitors. If an AssetSelection object is provided, it will only apply to assets\nwithin the Definitions that this sensor is part of.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated\nfunction.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The\njob to be executed when the sensor fires.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_assets (Optional[AssetSelection]) \u2013 (Experimental) an asset selection to launch a run\nfor if the sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.MultiAssetSensorDefinition(name, monitored_assets, job_name, asset_materialization_fn, minimum_interval_seconds=None, description=None, job=None, jobs=None, default_status=DefaultSensorStatus.STOPPED, request_assets=None, required_resource_keys=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Define an asset sensor that initiates a set of runs based on the materialization of a list of\nassets.

\n

Users should not instantiate this object directly. To construct a\nMultiAssetSensorDefinition, use dagster.\nmulti_asset_sensor().

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor to create.

  • \n
  • asset_keys (Sequence[AssetKey]) \u2013 The asset_keys this sensor monitors.

  • \n
  • asset_materialization_fn (Callable[[MultiAssetSensorEvaluationContext], Union[Iterator[Union[RunRequest, SkipReason]], RunRequest, SkipReason]]) \u2013

    The core\nevaluation function for the sensor, which is run at an interval to determine whether a\nrun should be launched or not. Takes a MultiAssetSensorEvaluationContext.

    \n

    This function must return a generator, which must yield either a single SkipReason\nor one or more RunRequest objects.

    \n

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job\nobject to target with this sensor.

  • \n
  • jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental) A list of jobs to be executed when the sensor fires.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_assets (Optional[AssetSelection]) \u2013 (Experimental) an asset selection to launch a run\nfor if the sensor condition is met. This can be provided instead of specifying a job.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.MultiAssetSensorEvaluationContext(instance_ref, last_completion_time, last_run_key, cursor, repository_name, repository_def, monitored_assets, instance=None, resource_defs=None, definitions=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

The context object available as the argument to the evaluation function of a\ndagster.MultiAssetSensorDefinition.

\n

Users should not instantiate this object directly. To construct a\nMultiAssetSensorEvaluationContext for testing purposes, use dagster.\nbuild_multi_asset_sensor_context().

\n

The MultiAssetSensorEvaluationContext contains a cursor object that tracks the state of\nconsumed event logs for each monitored asset. For each asset, the cursor stores the storage ID\nof the latest materialization that has been marked as \u201cconsumed\u201d (via a call to advance_cursor)\nin a latest_consumed_event_id field.

\n

For each monitored asset, the cursor will store the latest unconsumed event ID for up to 25\npartitions. Each event ID must be before the latest_consumed_event_id field for the asset.

\n

Events that have not been marked as consumed via advance_cursor will be returned in future ticks until they\nare marked as consumed.

\n

To update the cursor to the latest materialization and clear the unconsumed events, call\nadvance_all_cursors.

\n
\n
\nmonitored_assets\u00b6
\n

The assets monitored\nby the sensor. If an AssetSelection object is provided, it will only apply to assets\nwithin the Definitions that this sensor is part of.

\n
\n
Type:
\n

Union[Sequence[AssetKey], AssetSelection]

\n
\n
\n
\n\n
\n
\nrepository_def\u00b6
\n

The repository that the sensor belongs to.\nIf needed by the sensor, top-level resource definitions will be pulled from this repository.\nYou can provide either this or definitions.

\n
\n
Type:
\n

Optional[RepositoryDefinition]

\n
\n
\n
\n\n
\n
\ninstance_ref\u00b6
\n

The serialized instance configured to run the sensor.

\n
\n
Type:
\n

Optional[InstanceRef]

\n
\n
\n
\n\n
\n
\ncursor\u00b6
\n

The cursor, passed back from the last sensor evaluation via\nthe cursor attribute of SkipReason and RunRequest. Must be a dictionary of asset key\nstrings to a stringified tuple of (latest_event_partition, latest_event_storage_id,\ntrailing_unconsumed_partitioned_event_ids).

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nlast_completion_time\u00b6
\n

DEPRECATED The last time that the sensor was evaluated (UTC).

\n
\n
Type:
\n

float

\n
\n
\n
\n\n
\n
\nlast_run_key\u00b6
\n

DEPRECATED The run key of the RunRequest most recently created by this\nsensor. Use the preferred cursor attribute instead.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\nrepository_name\u00b6
\n

The name of the repository that the sensor belongs to.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ninstance\u00b6
\n

The deserialized instance can also be passed in\ndirectly (primarily useful in testing contexts).

\n
\n
Type:
\n

Optional[DagsterInstance]

\n
\n
\n
\n\n
\n
\ndefinitions\u00b6
\n

Definitions object that the sensor is defined in.\nIf needed by the sensor, top-level resource definitions will be pulled from these\ndefinitions. You can provide either this or repository_def.

\n
\n
Type:
\n

Optional[Definitions]

\n
\n
\n
\n\n

Example

\n
from dagster import AssetKey, multi_asset_sensor, MultiAssetSensorEvaluationContext\n\n@multi_asset_sensor(monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")])\ndef the_sensor(context: MultiAssetSensorEvaluationContext):\n    ...\n
\n
\n
\n
\nadvance_all_cursors()[source]\u00b6
\n

Updates the cursor to the most recent materialization event for all assets monitored by\nthe multi_asset_sensor.

\n

Marks all materialization events as consumed by the sensor, including unconsumed events.

\n
\n\n
\n
\nadvance_cursor(materialization_records_by_key)[source]\u00b6
\n

Marks the provided materialization records as having been consumed by the sensor.

\n

At the end of the tick, the cursor will be updated to advance past all materializations\nrecords provided via advance_cursor. In the next tick, records that have been consumed\nwill no longer be returned.

\n

Passing a partitioned materialization record into this function will mark prior materializations\nwith the same asset key and partition as having been consumed.

\n
\n
Parameters:
\n

materialization_records_by_key (Mapping[AssetKey, Optional[EventLogRecord]]) \u2013 Mapping of\nAssetKeys to EventLogRecord or None. If an EventLogRecord is provided, the cursor\nfor the AssetKey will be updated and future calls to fetch asset materialization events\nwill not fetch this event again. If None is provided, the cursor for the AssetKey\nwill not be updated.

\n
\n
\n
\n\n
\n
\nall_partitions_materialized(asset_key, partitions=None)[source]\u00b6
\n

A utility method to check if a provided list of partitions have been materialized\nfor a particular asset. This method ignores the cursor and checks all materializations\nfor the asset.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to check partitions for.

  • \n
  • partitions (Optional[Sequence[str]]) \u2013 A list of partitions to check. If not provided,\nall partitions for the asset will be checked.

  • \n
\n
\n
Returns:
\n

True if all selected partitions have been materialized, False otherwise.

\n
\n
Return type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty asset_keys\u00b6
\n

The asset keys which are monitored by this sensor.

\n
\n
Type:
\n

Sequence[AssetKey]

\n
\n
\n
\n\n
\n
\nproperty assets_defs_by_key\u00b6
\n

A mapping from AssetKey to the\nAssetsDefinition object which produces it. If a given asset is monitored by this sensor, but\nis not produced within the same code location as this sensor, then the value will be None.

\n
\n
Type:
\n

Mapping[AssetKey, Optional[AssetsDefinition]]

\n
\n
\n
\n\n
\n
\nget_cursor_partition(asset_key)[source]\u00b6
\n

A utility method to get the current partition the cursor is on.

\n
\n\n
\n
\nget_downstream_partition_keys(partition_key, from_asset_key, to_asset_key)[source]\u00b6
\n

Converts a partition key from one asset to the corresponding partition key in a downstream\nasset. Uses the existing partition mapping between the upstream asset and the downstream\nasset if it exists, otherwise, uses the default partition mapping.

\n
\n
Parameters:
\n
    \n
  • partition_key (str) \u2013 The partition key to convert.

  • \n
  • from_asset_key (AssetKey) \u2013 The asset key of the upstream asset, which the provided\npartition key belongs to.

  • \n
  • to_asset_key (AssetKey) \u2013 The asset key of the downstream asset. The provided partition\nkey will be mapped to partitions within this asset.

  • \n
\n
\n
Returns:
\n

\n
A list of the corresponding downstream partitions in to_asset_key that partition_key maps to.

\n
\n
\n

\n
\n
Return type:
\n

Sequence[str]

\n
\n
\n
\n\n
\n
\nget_trailing_unconsumed_events(asset_key)[source]\u00b6
\n

Fetches the unconsumed events for a given asset key. Returns only events\nbefore the latest consumed event ID for the given asset. To mark an event as consumed,\npass the event to advance_cursor. Returns events in ascending order by storage ID.

\n
\n
Parameters:
\n

asset_key (AssetKey) \u2013 The asset key to get unconsumed events for.

\n
\n
Returns:
\n

The unconsumed events for the given asset key.

\n
\n
Return type:
\n

Sequence[EventLogRecord]

\n
\n
\n
\n\n
\n
\nlatest_materialization_records_by_key(asset_keys=None)[source]\u00b6
\n

Fetches the most recent materialization event record for each asset in asset_keys.\nOnly fetches events after the latest consumed event ID for the given asset key.

\n
\n
Parameters:
\n

asset_keys (Optional[Sequence[AssetKey]]) \u2013 list of asset keys to fetch events for. If\nnot specified, the latest materialization will be fetched for all assets the\nmulti_asset_sensor monitors.

\n
\n
\n
\n
Returns: Mapping of AssetKey to EventLogRecord where the EventLogRecord is the latest\nmaterialization event for the asset. If there is no materialization event for the asset,\nthe value in the mapping will be None.

\n
\n
\n
\n\n
\n
\nlatest_materialization_records_by_partition(asset_key, after_cursor_partition=False)[source]\u00b6
\n

Given an asset, returns a mapping of partition key to the latest materialization event\nfor that partition. Fetches only materializations that have not been marked as \u201cconsumed\u201d\nvia a call to advance_cursor.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to fetch events for.

  • \n
  • after_cursor_partition (Optional[bool]) \u2013 If True, only materializations with partitions\nafter the cursor\u2019s current partition will be returned. By default, set to False.

  • \n
\n
\n
Returns:
\n

Mapping of partition key to EventLogRecord, where the\nEventLogRecord is the most recent materialization event for the partition.\nThe mapping preserves the order that the materializations occurred.

\n
\n
Return type:
\n

Mapping[str, EventLogRecord]

\n
\n
\n

Example

\n
@asset(partitions_def=DailyPartitionsDefinition("2022-07-01"))\ndef july_asset():\n    return 1\n\n@multi_asset_sensor(monitored_assets=[july_asset.key])\ndef my_sensor(context):\n    context.latest_materialization_records_by_partition(july_asset.key)\n\n# After materializing july_asset for 2022-07-05, latest_materialization_records_by_partition\n# returns {"2022-07-05": EventLogRecord(...)}\n
\n
\n
\n\n
\n
\nlatest_materialization_records_by_partition_and_asset()[source]\u00b6
\n

Finds the most recent unconsumed materialization for each partition for each asset\nmonitored by the sensor. Aggregates all materializations into a mapping of partition key\nto a mapping of asset key to the materialization event for that partition.

\n

For example, if the sensor monitors two partitioned assets A and B that are materialized\nfor partition_x after the cursor, this function returns:

\n
\n
{\n    "partition_x": {asset_a.key: EventLogRecord(...), asset_b.key: EventLogRecord(...)}\n}\n
\n
\n
\n

This method can only be called when all monitored assets are partitioned and share\nthe same partition definition.

\n
\n\n
\n
\nmaterialization_records_for_key(asset_key, limit=None)[source]\u00b6
\n

Fetches asset materialization event records for asset_key, with the earliest event first.

\n

Only fetches events after the latest consumed event ID for the given asset key.

\n
\n
Parameters:
\n
    \n
  • asset_key (AssetKey) \u2013 The asset to fetch materialization events for

  • \n
  • limit (Optional[int]) \u2013 The number of events to fetch

  • \n
\n
\n
\n
\n\n
\n\n
\n
\ndagster.build_multi_asset_sensor_context(*, monitored_assets, repository_def=None, instance=None, cursor=None, repository_name=None, cursor_from_latest_materializations=False, resources=None, definitions=None)[source]\u00b6
\n
\n
\n

\n \n (\n experimental\n )\n \n This API may break in future versions, even between dot releases.\n \n

\n

Builds multi asset sensor execution context for testing purposes using the provided parameters.

\n

This function can be used to provide a context to the invocation of a multi asset sensor definition. If\nprovided, the dagster instance must be persistent; DagsterInstance.ephemeral() will result in an\nerror.

\n
\n
Parameters:
\n
    \n
  • monitored_assets (Union[Sequence[AssetKey], AssetSelection]) \u2013 The assets monitored\nby the sensor. If an AssetSelection object is provided, it will only apply to assets\nwithin the Definitions that this sensor is part of.

  • \n
  • repository_def (RepositoryDefinition) \u2013 RepositoryDefinition object that\nthe sensor is defined in. Must provide definitions if this is not provided.

  • \n
  • instance (Optional[DagsterInstance]) \u2013 The dagster instance configured to run the sensor.

  • \n
  • cursor (Optional[str]) \u2013 A string cursor to provide to the evaluation of the sensor. Must be\na dictionary of asset key strings to ints that has been converted to a json string

  • \n
  • repository_name (Optional[str]) \u2013 The name of the repository that the sensor belongs to.

  • \n
  • cursor_from_latest_materializations (bool) \u2013 If True, the cursor will be set to the latest\nmaterialization for each monitored asset. By default, set to False.

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 The resource definitions\nto provide to the sensor.

  • \n
  • definitions (Optional[Definitions]) \u2013 Definitions object that the sensor is defined in.\nMust provide repository_def if this is not provided.

  • \n
\n
\n
\n

Examples

\n
with instance_for_test() as instance:\n    context = build_multi_asset_sensor_context(\n        monitored_assets=[AssetKey("asset_1"), AssetKey("asset_2")],\n        instance=instance,\n    )\n    my_asset_sensor(context)\n
\n
\n
\n\n
\n
\nclass dagster.RunStatusSensorDefinition(name, run_status, run_status_sensor_fn, minimum_interval_seconds=None, description=None, monitored_jobs=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, request_job=None, request_jobs=None, required_resource_keys=None)[source]\u00b6
\n

Define a sensor that reacts to a given status of job execution, where the decorated\nfunction will be evaluated when a run is at the given status.

\n
\n
Parameters:
\n
    \n
  • name (str) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • run_status (DagsterRunStatus) \u2013 The status of a run which will be\nmonitored by the sensor.

  • \n
  • run_status_sensor_fn (Callable[[RunStatusSensorContext], Union[SkipReason, DagsterRunReaction]]) \u2013 The core\nevaluation function for the sensor. Takes a RunStatusSensorContext.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, JobSelector, RepositorySelector, CodeLocationSelector]]]) \u2013 The jobs in the current repository that will be monitored by this sensor. Defaults to\nNone, which means the alert will be sent when any job in the repository matches the requested run_status.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_job (Optional[Union[GraphDefinition, JobDefinition]]) \u2013 The job a RunRequest should\nexecute if yielded from the sensor.

  • \n
  • request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition]]]) \u2013 (experimental)\nA list of jobs to be executed if RunRequests are yielded from the sensor.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.RunStatusSensorContext(sensor_name, dagster_run, dagster_event, instance, context=None, resource_defs=None, logger=None, partition_key=None, _resources=None, _cm_scope_entered=False)[source]\u00b6
\n

The context object available to a decorated function of run_status_sensor.

\n
\n
\nproperty dagster_event\u00b6
\n

The event associated with the job run status.

\n
\n\n
\n
\nproperty dagster_run\u00b6
\n

The run of the job.

\n
\n\n
\n
\nproperty instance\u00b6
\n

The current instance.

\n
\n\n
\n
\nproperty log\u00b6
\n

The logger for the current sensor evaluation.

\n
\n\n
\n
\nproperty partition_key\u00b6
\n

The partition key of the relevant run.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty sensor_name\u00b6
\n

The name of the sensor.

\n
\n\n
\n\n
\n
\nclass dagster.RunFailureSensorContext(sensor_name, dagster_run, dagster_event, instance, context=None, resource_defs=None, logger=None, partition_key=None, _resources=None, _cm_scope_entered=False)[source]\u00b6
\n

The context object available to a decorated function of run_failure_sensor.

\n
\n
\nsensor_name\u00b6
\n

the name of the sensor.

\n
\n
Type:
\n

str

\n
\n
\n
\n\n
\n
\ndagster_run\u00b6
\n

the failed run.

\n
\n
Type:
\n

DagsterRun

\n
\n
\n
\n\n
\n
\nproperty failure_event\u00b6
\n

The run failure event.

\n

If the run failed because of an error inside a step, get_step_failure_events will have more\ndetails on the step failure.

\n
\n\n
\n
\nget_step_failure_events()[source]\u00b6
\n

The step failure event for each step in the run that failed.

\n

Examples

\n
error_strings_by_step_key = {\n    # includes the stack trace\n    event.step_key: event.event_specific_data.error.to_string()\n    for event in context.get_step_failure_events()\n}\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.JobSelector(location_name, repository_name=None, job_name=None)[source]\u00b6
\n
\n\n
\n
\nclass dagster.RepositorySelector(location_name, repository_name)[source]\u00b6
\n
\n\n
\n
\ndagster.build_run_status_sensor_context(sensor_name, dagster_event, dagster_instance, dagster_run, context=None, resources=None, partition_key=None)[source]\u00b6
\n

Builds run status sensor context from provided parameters.

\n

This function can be used to provide the context argument when directly invoking a function\ndecorated with @run_status_sensor or @run_failure_sensor, such as when writing unit tests.

\n
\n
Parameters:
\n
    \n
  • sensor_name (str) \u2013 The name of the sensor the context is being constructed for.

  • \n
  • dagster_event (DagsterEvent) \u2013 A DagsterEvent with the same event type as the one that\ntriggers the run_status_sensor

  • \n
  • dagster_instance (DagsterInstance) \u2013 The dagster instance configured for the context.

  • \n
  • dagster_run (DagsterRun) \u2013 DagsterRun object from running a job

  • \n
  • resources (Optional[Mapping[str, object]]) \u2013 A dictionary of resources to be made available\nto the sensor.

  • \n
\n
\n
\n

Examples

\n
instance = DagsterInstance.ephemeral()\nresult = my_job.execute_in_process(instance=instance)\n\ndagster_run = result.dagster_run\ndagster_event = result.get_job_success_event() # or get_job_failure_event()\n\ncontext = build_run_status_sensor_context(\n    sensor_name="run_status_sensor_to_invoke",\n    dagster_instance=instance,\n    dagster_run=dagster_run,\n    dagster_event=dagster_event,\n)\nrun_status_sensor_to_invoke(context)\n
\n
\n
\n\n
\n
\n@dagster.run_status_sensor(run_status, name=None, minimum_interval_seconds=None, description=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, request_job=None, request_jobs=None)[source]\u00b6
\n

Creates a sensor that reacts to a given status of job execution, where the decorated\nfunction will be run when a job is at the given status.

\n

Takes a RunStatusSensorContext.

\n
\n
Parameters:
\n
    \n
  • run_status (DagsterRunStatus) \u2013 The status of run execution which will be\nmonitored by the sensor.

  • \n
  • name (Optional[str]) \u2013 The name of the sensor. Defaults to the name of the decorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 Jobs in the current repository that will be monitored by this sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository matches the requested run_status. Jobs in external repositories can be monitored by using\nRepositorySelector or JobSelector.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the Dagster instance.\nIf set to True, an error will be raised if you also specify monitored_jobs or job_selection.\nDefaults to False.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs) Jobs in the current repository that will be\nmonitored by this sensor. Defaults to None, which means the alert will be sent when\nany job in the repository matches the requested run_status.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job that should be\nexecuted if a RunRequest is yielded from the sensor.

  • \n
  • request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental)\nA list of jobs to be executed if RunRequests are yielded from the sensor.

  • \n
\n
\n
\n
\n\n
\n
\n@dagster.run_failure_sensor(name=None, minimum_interval_seconds=None, description=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED, request_job=None, request_jobs=None)[source]\u00b6
\n

Creates a sensor that reacts to job failure events, where the decorated function will be\nrun when a run fails.

\n

Takes a RunFailureSensorContext.

\n
\n
Parameters:
\n
    \n
  • name (Optional[str]) \u2013 The name of the job failure sensor. Defaults to the name of the\ndecorated function.

  • \n
  • minimum_interval_seconds (Optional[int]) \u2013 The minimum number of seconds that will elapse\nbetween sensor evaluations.

  • \n
  • description (Optional[str]) \u2013 A human-readable description of the sensor.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, UnresolvedAssetJobDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 The jobs in the current repository that will be monitored by this failure sensor.\nDefaults to None, which means the alert will be sent when any job in the current\nrepository fails.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector, CodeLocationSelector]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs) The jobs in the current repository that will be\nmonitored by this failure sensor. Defaults to None, which means the alert will be sent\nwhen any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
  • request_job (Optional[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]) \u2013 The job a RunRequest should\nexecute if yielded from the sensor.

  • \n
  • request_jobs (Optional[Sequence[Union[GraphDefinition, JobDefinition, UnresolvedAssetJobDefinition]]]) \u2013 (experimental)\nA list of jobs to be executed if RunRequests are yielded from the sensor.

  • \n
\n
\n
\n
\n\n
\n
\nclass dagster.SensorResult(run_requests=None, skip_reason=None, cursor=None, dynamic_partitions_requests=None, asset_events=None)[source]\u00b6
\n

The result of a sensor evaluation.

\n
\n
\nrun_requests\u00b6
\n

A list\nof run requests to be executed.

\n
\n
Type:
\n

Optional[Sequence[RunRequest]]

\n
\n
\n
\n\n
\n
\nskip_reason\u00b6
\n

A skip message indicating why sensor\nevaluation was skipped.

\n
\n
Type:
\n

Optional[Union[str, SkipReason]]

\n
\n
\n
\n\n
\n
\ncursor\u00b6
\n

The cursor value for this sensor, which will be provided on the\ncontext for the next sensor evaluation.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\ndynamic_partitions_requests\u00b6
\n

A list of dynamic partition requests to request dynamic\npartition addition and deletion. Run requests will be evaluated using the state of the\npartitions with these changes applied. Type: Optional[Sequence[Union[DeleteDynamicPartitionsRequest, AddDynamicPartitionsRequest]]].

\n
\n\n
\n
\nasset_events\u00b6
\n

(Experimental) A\nlist of materializations, observations, and asset check evaluations that the system\nwill persist on your behalf at the end of sensor evaluation. These events will not\nbe associated with any particular run, but will be queryable and viewable in the asset catalog.

\n
\n
Type:
\n

Optional[Sequence[Union[AssetObservation, AssetMaterialization, AssetCheckEvaluation]]]

\n
\n
\n
\n\n
\n\n
\n
\nclass dagster.AddDynamicPartitionsRequest(partitions_def_name, partition_keys)[source]\u00b6
\n

A request to add partitions to a dynamic partitions definition, to be evaluated by a sensor or schedule.

\n
\n\n
\n
\nclass dagster.DeleteDynamicPartitionsRequest(partitions_def_name, partition_keys)[source]\u00b6
\n

A request to delete partitions from a dynamic partitions definition, to be evaluated by a sensor or schedule.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/schedules-sensors", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../resources/", "title": "Resources"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "N", "next"], ["sections/api/apidocs/resources", "Resources", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/schedules-sensors.rst.txt", "title": "Run Requests", "toc": "\n"}, "types": {"alabaster_version": "0.7.13", "body": "
\n

Types\u00b6

\n

Dagster includes facilities for typing the input and output values of ops (\u201cruntime\u201d types).

\n
\n

Built-in types\u00b6

\n
\n
\ndagster.Nothing\u00b6
\n

Use this type only for inputs and outputs, in order to establish an execution dependency without\ncommunicating a value. Inputs of this type will not be passed to the op compute function, so\nit is necessary to use the explicit In API to define them rather than\nthe Python 3 type hint syntax.

\n

All values are considered to be instances of Nothing.

\n

Examples:

\n
@op\ndef wait(_) -> Nothing:\n    time.sleep(1)\n    return\n\n@op(\n    ins={"ready": In(dagster_type=Nothing)},\n)\ndef done(_) -> str:\n    return 'done'\n\n@job\ndef nothing_job():\n    done(wait())\n\n# Any value will pass the type check for Nothing\n@op\ndef wait_int(_) -> Int:\n    time.sleep(1)\n    return 1\n\n@job\ndef nothing_int_job():\n    done(wait_int())\n
\n
\n
\n\n
\n
\n

Making New Types\u00b6

\n
\n
\nclass dagster.DagsterType(type_check_fn, key=None, name=None, is_builtin=False, description=None, loader=None, required_resource_keys=None, kind=DagsterTypeKind.REGULAR, typing_type=typing.Any, metadata=None)[source]\u00b6
\n

Define a type in dagster. These can be used in the inputs and outputs of ops.

\n
\n
Parameters:
\n
    \n
  • type_check_fn (Callable[[TypeCheckContext, Any], [Union[bool, TypeCheck]]]) \u2013 The function that defines the type check. It takes the value flowing\nthrough the input or output of the op. If it passes, return either\nTrue or a TypeCheck with success set to True. If it fails,\nreturn either False or a TypeCheck with success set to False.\nThe first argument must be named context (or, if unused, _, _context, or context_).\nUse required_resource_keys for access to resources.

  • \n
  • key (Optional[str]) \u2013

    The unique key to identify types programmatically.\nThe key property always has a value. If you omit the key argument\nto the init function, it instead receives the value of name. If\nneither key nor name is provided, a CheckError is thrown.

    \n

    In the case of a generic type such as List or Optional, this is\ngenerated programmatically based on the type parameters.

    \n

    For most use cases, name should be set and the key argument should\nnot be specified.

    \n

  • \n
  • name (Optional[str]) \u2013 A unique name given by a user. If key is None, key\nbecomes this value. Name is not given in a case where the user does\nnot specify a unique name for this type, such as a generic class.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
  • required_resource_keys (Optional[Set[str]]) \u2013 Resource keys required by the type_check_fn.

  • \n
  • is_builtin (bool) \u2013 Defaults to False. This is used by tools to display or\nfilter built-in types (such as String, Int) to visually distinguish\nthem from user-defined types. Meant for internal use.

  • \n
  • kind (DagsterTypeKind) \u2013 Defaults to DagsterTypeKind.REGULAR. This is used to determine the kind of runtime type\nfor InputDefinition and OutputDefinition type checking.

  • \n
  • typing_type \u2013 Defaults to typing.Any. A valid python typing type (e.g. Optional[List[int]]) for the\nvalue contained within the DagsterType. Meant for internal use.

  • \n
\n
\n
\n
\n
\nproperty description\u00b6
\n

Description of the type, or None if not provided.

\n
\n
Type:
\n

Optional[str]

\n
\n
\n
\n\n
\n
\nproperty display_name\u00b6
\n

Either the name or key (if name is None) of the type, overridden in many subclasses.

\n
\n\n
\n
\nproperty has_unique_name\u00b6
\n

Whether the type has a unique name.

\n
\n
Type:
\n

bool

\n
\n
\n
\n\n
\n
\nproperty loader\u00b6
\n

Loader for this type, if any.

\n
\n
Type:
\n

Optional[DagsterTypeLoader]

\n
\n
\n
\n\n
\n
\nproperty required_resource_keys\u00b6
\n

Set of resource keys required by the type check function.

\n
\n
Type:
\n

AbstractSet[str]

\n
\n
\n
\n\n
\n
\ntype_check(context, value)[source]\u00b6
\n

Type check the value against the type.

\n
\n
Parameters:
\n
    \n
  • context (TypeCheckContext) \u2013 The context of the type check.

  • \n
  • value (Any) \u2013 The value to check.

  • \n
\n
\n
Returns:
\n

The result of the type check.

\n
\n
Return type:
\n

TypeCheck

\n
\n
\n
\n\n
\n
\nproperty typing_type\u00b6
\n

The python typing type for this type.

\n
\n
Type:
\n

Any

\n
\n
\n
\n\n
\n
\nproperty unique_name\u00b6
\n

The unique name of this type. Can be None if the type is not unique, such as container types.

\n
\n\n
\n\n
\n
\ndagster.PythonObjectDagsterType(python_type, key=None, name=None, **kwargs)[source]\u00b6
\n

Define a type in dagster whose typecheck is an isinstance check.

\n

Specifically, the type can either be a single python type (e.g. int),\nor a tuple of types (e.g. (int, float)) which is treated as a union.

\n

Examples

\n
ntype = PythonObjectDagsterType(python_type=int)\nassert ntype.name == 'int'\nassert_success(ntype, 1)\nassert_failure(ntype, 'a')\n
\n
\n
ntype = PythonObjectDagsterType(python_type=(int, float))\nassert ntype.name == 'Union[int, float]'\nassert_success(ntype, 1)\nassert_success(ntype, 1.5)\nassert_failure(ntype, 'a')\n
\n
\n
\n
Parameters:
\n
    \n
  • python_type (Union[Type, Tuple[Type, ...]]) \u2013 The dagster typecheck function calls isinstance on\nthis type.

  • \n
  • name (Optional[str]) \u2013 Name the type. Defaults to the name of python_type.

  • \n
  • key (Optional[str]) \u2013 Key of the type. Defaults to name.

  • \n
  • description (Optional[str]) \u2013 A markdown-formatted string, displayed in tooling.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.dagster_type_loader(config_schema, required_resource_keys=None, loader_version=None, external_version_fn=None)[source]\u00b6
\n

Create a Dagster type loader that maps config data to a runtime value.

\n

The decorated function should take the execution context and parsed config value and return the\nappropriate runtime value.

\n
\n
Parameters:
\n
    \n
  • config_schema (ConfigSchema) \u2013 The schema for the config that\u2019s passed to the decorated\nfunction.

  • \n
  • loader_version (str) \u2013 (Experimental) The version of the decorated compute function. Two\nloading functions should have the same version if and only if they deterministically\nproduce the same outputs when provided the same inputs.

  • \n
  • external_version_fn (Callable) \u2013 (Experimental) A function that takes in the same parameters as the loader\nfunction (config_value) and returns a representation of the version of the external\nasset (str). Two external assets with identical versions are treated as identical to one\nanother.

  • \n
\n
\n
\n

Examples

\n
@dagster_type_loader(Permissive())\ndef load_dict(_context, value):\n    return value\n
\n
\n
\n\n
\n
\nclass dagster.DagsterTypeLoader[source]\u00b6
\n

Dagster type loaders are used to load unconnected inputs of the dagster type they are attached\nto.

\n

The recommended way to define a type loader is with the\n@dagster_type_loader decorator.

\n
\n\n
\n
\nclass dagster.DagsterTypeLoaderContext(plan_data, execution_data, log_manager, step, output_capture, known_state)[source]\u00b6
\n

The context object provided to a @dagster_type_loader-decorated function during execution.

\n

Users should not construct this object directly.

\n
\n
\nproperty job_def\u00b6
\n

The underlying job definition being executed.

\n
\n\n
\n
\nproperty op_def\u00b6
\n

The op for which type loading is occurring.

\n
\n\n
\n
\nproperty resources\u00b6
\n

The resources available to the type loader, specified by the required_resource_keys argument of the decorator.

\n
\n\n
\n\n
\n
\ndagster.usable_as_dagster_type(name=None, description=None, loader=None)[source]\u00b6
\n

Decorate a Python class to make it usable as a Dagster Type.

\n

This is intended to make it straightforward to annotate existing business logic classes to\nmake them dagster types whose typecheck is an isinstance check against that python class.

\n
\n
Parameters:
\n
    \n
  • python_type (cls) \u2013 The python type to make usable as a Dagster type.

  • \n
  • name (Optional[str]) \u2013 Name of the new Dagster type. If None, the name (__name__) of\nthe python_type will be used.

  • \n
  • description (Optional[str]) \u2013 A user-readable description of the type.

  • \n
  • loader (Optional[DagsterTypeLoader]) \u2013 An instance of a class that\ninherits from DagsterTypeLoader and can map config data to a value of\nthis type. Specify this argument if you will need to shim values of this type using the\nconfig machinery. As a rule, you should use the\n@dagster_type_loader decorator to construct\nthese arguments.

  • \n
\n
\n
\n

Examples

\n
# dagster_aws.s3.file_manager.S3FileHandle\n@usable_as_dagster_type\nclass S3FileHandle(FileHandle):\n    def __init__(self, s3_bucket, s3_key):\n        self._s3_bucket = check.str_param(s3_bucket, 's3_bucket')\n        self._s3_key = check.str_param(s3_key, 's3_key')\n\n    @property\n    def s3_bucket(self):\n        return self._s3_bucket\n\n    @property\n    def s3_key(self):\n        return self._s3_key\n\n    @property\n    def path_desc(self):\n        return self.s3_path\n\n    @property\n    def s3_path(self):\n        return 's3://{bucket}/{key}'.format(bucket=self.s3_bucket, key=self.s3_key)\n
\n
\n
\n\n
\n
\ndagster.make_python_type_usable_as_dagster_type(python_type, dagster_type)[source]\u00b6
\n

Take any existing python type and map it to a dagster type (generally created with\nDagsterType). This can only be called once\non a given python type.

\n
\n\n
\n

Testing Types\u00b6

\n
\n
\ndagster.check_dagster_type(dagster_type, value)[source]\u00b6
\n

Test a custom Dagster type.

\n
\n
Parameters:
\n
    \n
  • dagster_type (Any) \u2013 The Dagster type to test. Should be one of the\nbuilt-in types, a dagster type explicitly constructed with\nas_dagster_type(), @usable_as_dagster_type, or\nPythonObjectDagsterType(), or a Python type.

  • \n
  • value (Any) \u2013 The runtime value to test.

  • \n
\n
\n
Returns:
\n

The result of the type check.

\n
\n
Return type:
\n

TypeCheck

\n
\n
\n

Examples

\n
assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success\n
\n
\n
\n\n
\n
\n
\n", "current_page_name": "sections/api/apidocs/types", "customsidebar": null, "display_toc": true, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../utilities/", "title": "Utilities"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../dynamic/", "title": "Dynamic Mapping & Collect"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/utilities", "Utilities", "N", "next"], ["sections/api/apidocs/dynamic", "Dynamic Mapping & Collect", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/types.rst.txt", "title": "Types", "toc": "\n"}, "utilities": {"alabaster_version": "0.7.13", "body": "
\n

Utilities\u00b6

\n
\n
\ndagster.file_relative_path(dunderfile, relative_path)[source]\u00b6
\n

Get a path relative to the currently executing Python file.

\n

This function is useful when one needs to load a file that is relative to the position of\nthe current file. (Such as when you encode a configuration file path in a source file and want\nit to be runnable from any current working directory.)

\n
\n
Parameters:
\n
    \n
  • dunderfile (str) \u2013 Should always be __file__.

  • \n
  • relative_path (str) \u2013 Path to get relative to the currently executing file.

  • \n
\n
\n
\n

Examples:

\n
file_relative_path(__file__, 'path/relative/to/file')\n
\n
\n
\n\n
\n
\ndagster.config_from_files(config_files)[source]\u00b6
\n

Constructs run config from YAML files.

\n
\n
Parameters:
\n

config_files (List[str]) \u2013 List of paths or glob patterns for yaml files\nto load and parse as the run config.

\n
\n
Returns:
\n

A run config dictionary constructed from provided YAML files.

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
Raises:
\n
    \n
  • FileNotFoundError \u2013 When a config file produces no results

  • \n
  • DagsterInvariantViolationError \u2013 When one of the YAML files is invalid and has a parse\n error.

  • \n
\n
\n
\n
\n\n
\n
\ndagster.config_from_pkg_resources(pkg_resource_defs)[source]\u00b6
\n

Load a run config from a package resource, using pkg_resources.resource_string().

\n

Example

\n
config_from_pkg_resources(\n    pkg_resource_defs=[\n        ('dagster_examples.airline_demo.environments', 'local_base.yaml'),\n        ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),\n    ],\n)\n
\n
\n
\n
Parameters:
\n

pkg_resource_defs (List[(str, str)]) \u2013 List of pkg_resource modules/files to\nload as the run config.

\n
\n
Returns:
\n

A run config dictionary constructed from the provided yaml strings

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
Raises:
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\ndagster.config_from_yaml_strings(yaml_strings)[source]\u00b6
\n

Static constructor for run configs from YAML strings.

\n
\n
Parameters:
\n

yaml_strings (List[str]) \u2013 List of yaml strings to parse as the run config.

\n
\n
Returns:
\n

A run config dictionary constructed from the provided yaml strings

\n
\n
Return type:
\n

Dict[str, Any]

\n
\n
Raises:
\n

DagsterInvariantViolationError \u2013 When one of the YAML documents is invalid and has a\n parse error.

\n
\n
\n
\n\n
\n
\ndagster.get_dagster_logger(name=None)[source]\u00b6
\n

Creates a python logger whose output messages will be captured and converted into Dagster log\nmessages. This means they will have structured information such as the step_key, run_id, etc.\nembedded into them, and will show up in the Dagster event log.

\n

This can be used as a more convenient alternative to context.log in most cases. If log level\nis not set explicitly, defaults to DEBUG.

\n
\n
Parameters:
\n

name (Optional[str]) \u2013 If supplied, will create a logger with the name \u201cdagster.builtin.{name}\u201d,\nwith properties inherited from the base Dagster logger. If omitted, the returned logger\nwill be named \u201cdagster.builtin\u201d.

\n
\n
Returns:
\n

A logger whose output will be captured by Dagster.

\n
\n
Return type:
\n

logging.Logger

\n
\n
\n

Example

\n
from dagster import get_dagster_logger, op\n\n@op\ndef hello_op():\n    log = get_dagster_logger()\n    for i in range(5):\n        # do something\n        log.info(f"Did {i+1} things!")\n
\n
\n
\n\n
\n
\nclass dagster.ExperimentalWarning[source]\u00b6
\n
\n\n
\n
\ndagster.make_email_on_run_failure_sensor(email_from, email_password, email_to, email_body_fn=<function _default_failure_email_body>, email_subject_fn=<function _default_failure_email_subject>, smtp_host='smtp.gmail.com', smtp_type='SSL', smtp_port=None, name=None, webserver_base_url=None, monitored_jobs=None, job_selection=None, monitor_all_repositories=False, default_status=DefaultSensorStatus.STOPPED)[source]\u00b6
\n

Create a job failure sensor that sends email via the SMTP protocol.

\n
\n
Parameters:
\n
    \n
  • email_from (str) \u2013 The sender email address to send the message from.

  • \n
  • email_password (str) \u2013 The password of the sender.

  • \n
  • email_to (List[str]) \u2013 The recipient email addresses to send the message to.

  • \n
  • email_body_fn (Optional[Callable[[RunFailureSensorContext], str]]) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the email body you want to send.\nDefaults to plain text that contains the error message, job name, and run ID.

  • \n
  • email_subject_fn (Optional[Callable[[RunFailureSensorContext], str]]) \u2013 Function which\ntakes in the RunFailureSensorContext and outputs the email subject you want to send.\nDefaults to \u201cDagster Run Failed: <job_name>\u201d.

  • \n
  • smtp_host (str) \u2013 The hostname of the SMTP server. Defaults to \u201csmtp.gmail.com\u201d.

  • \n
  • smtp_type (str) \u2013 The protocol; either \u201cSSL\u201d or \u201cSTARTTLS\u201d. Defaults to SSL.

  • \n
  • smtp_port (Optional[int]) \u2013 The SMTP port. Defaults to 465 for SSL, 587 for STARTTLS.

  • \n
  • name \u2013 (Optional[str]): The name of the sensor. Defaults to \u201cemail_on_job_failure\u201d.

  • \n
  • webserver_base_url \u2013 (Optional[str]): The base url of your dagster-webserver instance. Specify this to allow\nmessages to include deeplinks to the failed run.

  • \n
  • monitored_jobs (Optional[List[Union[JobDefinition, GraphDefinition, RepositorySelector, JobSelector]]]) \u2013 The jobs that will be monitored by this failure sensor. Defaults to None, which means the alert will\nbe sent when any job in the repository fails. To monitor jobs in external repositories,\nuse RepositorySelector and JobSelector.

  • \n
  • monitor_all_repositories (bool) \u2013 If set to True, the sensor will monitor all runs in the\nDagster instance. If set to True, an error will be raised if you also specify\nmonitored_jobs or job_selection. Defaults to False.

  • \n
  • job_selection (Optional[List[Union[JobDefinition, GraphDefinition, JobDefinition, RepositorySelector, JobSelector]]]) \u2013 \n \n (\n deprecated\n )\n \n (This parameter will be removed in version 2.0. Use monitored_jobs instead.) (deprecated in favor of monitored_jobs) The jobs that will be monitored by this failure\nsensor. Defaults to None, which means the alert will be sent when any job in the repository fails.

  • \n
  • default_status (DefaultSensorStatus) \u2013 Whether the sensor starts as running or not. The default\nstatus can be overridden from the Dagster UI or via the GraphQL API.

  • \n
\n
\n
\n

Examples

\n
email_on_run_failure = make_email_on_run_failure_sensor(\n    email_from="no-reply@example.com",\n    email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n    email_to=["xxx@example.com"],\n)\n\n@repository\ndef my_repo():\n    return [my_job, email_on_run_failure]\n
\n
\n
def my_message_fn(context: RunFailureSensorContext) -> str:\n    return (\n        f"Job {context.pipeline_run.job_name} failed!"\n        f"Error: {context.failure_event.message}"\n    )\n\nemail_on_run_failure = make_email_on_run_failure_sensor(\n    email_from="no-reply@example.com",\n    email_password=os.getenv("ALERT_EMAIL_PASSWORD"),\n    email_to=["xxx@example.com"],\n    email_body_fn=my_message_fn,\n    email_subject_fn=lambda _: "Dagster Alert",\n    webserver_base_url="http://mycoolsite.com",\n)\n
\n
\n
\n\n
\n
\nclass dagster._utils.forked_pdb.ForkedPdb(completekey='tab', stdin=None, stdout=None, skip=None, nosigint=False, readrc=True)[source]\u00b6
\n

A pdb subclass that may be used from a forked multiprocessing child.

\n

Examples:

\n
from dagster._utils.forked_pdb import ForkedPdb\n\n@solid\ndef complex_solid(_):\n    # some complicated stuff\n\n    ForkedPdb().set_trace()\n\n    # some other complicated stuff\n
\n
\n

You can initiate pipeline execution via the webserver and use the pdb debugger to examine/step through\nexecution at the breakpoint.

\n
\n\n
\n", "current_page_name": "sections/api/apidocs/utilities", "customsidebar": null, "display_toc": false, "favicon_url": null, "logo_url": null, "meta": {}, "metatags": "\n", "next": {"link": "../memoization/", "title": "Job-Level Versioning and Memoization (Deprecated)"}, "page_source_suffix": ".rst", "parents": [], "prev": {"link": "../types/", "title": "Types"}, "rellinks": [["genindex", "General Index", "I", "index"], ["py-modindex", "Python Module Index", "", "modules"], ["sections/api/apidocs/memoization", "Job-Level Versioning and Memoization (Deprecated)", "N", "next"], ["sections/api/apidocs/types", "Types", "P", "previous"]], "sidebars": ["about.html", "navigation.html", "relations.html", "searchbox.html", "donate.html"], "sourcename": "sections/api/apidocs/utilities.rst.txt", "title": "Utilities", "toc": "\n"}}}} \ No newline at end of file diff --git a/docs/next/public/objects.inv b/docs/next/public/objects.inv index 56191a765f2c3512f2cd53096a70de523c53460c..480a5f3171c52eb15e05582cdb6a642cb67f763c 100644 GIT binary patch delta 25270 zcmV)yK$5@G#sSpE0kCZWe^dN!Xd5*LFCdEwd-mOOfX2CCn}Wa;=oX}M5Cef}9+bDw z5KLVY?9NTyp$~cY0|X_0{{|)n_%U7TKovj$cAw~*=swx7@4MoX0<%~lFr~sijIm2I zaEN{wL#Y?%0U*oiFH>j3R21^Fk3L3z9R1^0&4)S4<`Bgs?UB7se=t$TD)|3EX4bToy zRaNZHs0$vk{Fi{}c-o^uryU9iEhTUsC6+Q^4$CaClolwH*voVk*p4c+Sv+1N1W;P3 zUGR*=Ey~VGBvpw{O=gHd+vDx2Nd+ke=@Hye=y^JK1lelue~uv9>)sK#5YW2N7{*e^ zj-c7?*%3%vT{{A1+8%ui2t@(=}tsU@@{%1XlN7aQ1h)n(nPma*(*r3ANfAvjnAq$zC$ zs#$Qcs<7^|qsEVR60KUaMn#(Kp`v~h-=npZ<2$Ijom$kZYvvHaW8IZdZ#zUt8-OQw zEK1wN{y!I+cz9&6%W(JEHYy%(*aLnlX~)GhT`#db@$jF9m8kwpffml)Iwv|)Oh;+O93hFwc^Mnu;AEo`^Kz@SahuC zi4Hz09R?BjnGG2w>#L#V|2E8q&I4s-GA&9H5t}du=8*7kP+uVW8Nzr=nh2>JDcYQH zL!N1Ee~_R-WjAOHBWzLc{_Gn7lo=lM-GBW|R~jafBIvP2iChBiqaPG7yVyZ}V*QUv zw5p@h!Mde?>B=A*IsaLWDb+I1Ak?NFq)v5?hA>+><71hSpm$jYN!oxP0ibgvLo4X` zO);`P5WJLB<{;aVsU%S0PDObP_61dYy4wSIe+;$RK^Z`R|0Uz+=nw@v>G>9U>}qAy zo?499%$Hb^ijtg~_S{}n-=(piB($lr5rLWq?okeV@nv_9Nn(TK7$qprR{6~q><^Sb 
z3Tlw7-eG15{NP7%nE{K4CqD^e)E7kkjI^{V^#tLwb>reWKqNLw>}svYvv|-rgOSmf zf7H4y2GKb?Z#JwAHU$6z}?rosK*Y{RI{uVVQ(h zkna;zb0!kwv?xJuCw@E*v+=`l7t-Sroq443DFaFP5X7JWeFobEQqvrgkRV0FevtXG zHN>OL)Vn;^t{#TvGr(pbfLUl4dBz-u;pw<+4&gECwdMU`8NbV85yHBsYmEq%e;B-6 z8HLtSX*$dbZrA4MV^NaDTl@9X$#H9Ku}%9eFS4SG=s zL2(L61#gGOoB=L9vE1-m4jW~|fBa+-5KqXRf&AcD3RGbrKZ$2&HKGe-z7~NHt&R*3 z-j+0EhM5`Tmr3-! z1hx?4dgrJikp5Pv31G6aLbg;+cM_(d$c&Zb>dyq4)r24in{T5K6|YC1e^zp)C4oR% zD_4?RjK^b1L1fP+D`1i7P05{oDzGq}1GQc*Hv|47c~0PKQuTwZB=sN3xjID-vnN`n z&v=KA9rPR8@tCOvQhQ@i+j%4fO=i(~c~hLSvIa!_t8I-V`6`*GCl)oI`ie=K+kIRkVJ@nD%jw(~QHSnukTtVw$_sTd-Q@R-$)|MIg$X1hNs z3Ng~(9cO=R=J3@te2hRGYIQ(JNR$B%K8pMfS@9(bCvVUFm7{*65f9fqZQsn1)gVZh*&363R@TsgJgUT95G?d=4)ek#r$#(5ZY;H@fY>h61 z#X3K>B^LhCjs>f|tX4!pJZEM{7daJ-clzfR@yO~d=dDE(_THe>%FG6d0)IlLkX=O} zW*ZE?A?GI~e+Lp1P!YEc(-4Z^t!)Su!=nWhnC^o5@TECcGu>|mjvrLz79jN(Gbe!+ zR(__iq(^%bK>VI@O(=yXT!>I+mKRtooUg4RHuL}icOYY8pf`<3))eaFRi6F8^M`P4Rv58JuBue=+Z(-yRRr%RE~hA!>LO%uI{&S3W%B_VvZ$0A>#cJLxqOY22AA5y@6m4vcrDYX{M;9p$%v%HhqF zCK|m3HDHTbzV@gt@7iD-*nEu7p;#G3Q#`q0f30X{^FA?M^+5=RUeoaN8 z+*T?nJLa7PEUCXxe9Rl~JW@EnI|Z$>b|i&Wpq4QPFCgBvng|W#dc(fiuDL#vogH%m ze_T6T+62Jn+t~!5wzjejLz``56W|(eVMCboo)w8*Y!jOke$AojLD3$B!4c61KQF9J zoj>5?{N_7oIxu7#jlm8Y5uE9+96I}{&@mBCdv;N(?nz#|gVmZ%?ZGN;u#y91^T-E% z$j^75zuy1P-RhEzjT#f0?h64w|28ZH`0X$o*ZhzelZ)>~K$vls)4+ z$_$IQ5XPX0^G>Q2ZcUy-(YY{r64Ur9oX<3SC^h%SSjNYg!HP9Y8r*R=_Fi&z2(}_@B&InIx<(J za(S0AZIhUj-6>vZf8><*aW6w^ z&7;a78f~GfA0m*GJjx%CQa_e6PR6e3Y#42m7&?j}!1Vy+H*hfy3lRt}=|Hz_PHQMgLgZ z6mI(@s2Q5nY1~N&3(;b}_|1V-wophQsU5r(0M?7|TT-0v`mZgvyl7%b_#OP<=(g_m8hQpIm?{hu)+eWGQee4_G{=C zv0}bh%<^R-MQWKvBGoW05Gn3>%0iLrGR_pq8Cr$rJPUTG1d?hx)Ryies6CpRF0O8!lh zV67(;x2sp-5-EB7e;Lcy>^-)IK=A;l0!6sD3#fF^N(B~=GL!PWmph@ZfLtcC6c*#X z-vZ2>>;NZqjrM%>5%U5)vV&>5W26f-xD_~WAGkNzX(ObpeV*Vd6pi0F-i#ci zvriA$UDf8ctB|a2Na~}jcNXQbo5_l{p(RhlS|B3Jpq9z-28o&DBcQMt6C3;IX5TvZ z=ocr$9bY{NgYowx$j(uoM=yGN*qY#y)q@a(9OJuYf2a6NjWY~pVkUz6b^kYdyK^!0 zfn61^ut)9*)H7&bQ@IN}2d3Pf>P^bGtj{~)G?l#`^(FB@bO;#7__hIwysive?~|Sb 
zo9?`}jVwIBWsyMYUI$ggW=hv0Zk2d5!Oo+ZWQtip=g~_CUM$yH&v1C!!ES~lp1|y9 z3<7I2f9>RZ2x@SuPF57rXQ+=m;2QTlYtY!sbZuOziqNvtk&3Wsf$S~4HYcox!|A+e1Pbl(@NYwYQ4?e( zD%<(>7$moX=z%M5KhUG;)US3)?D>yF!1>20mbvU;pd4)jKlZaf5*aa~Zo`%dv=^s7 z;wH#Ydh-*0hUp=osIHjLNjlBNp)@#CeF6-_m2Z3!iL&GFWJHND9s({sPC%hLoQ&QPECt=lWxv}Zu z%}{6d8LZZrU3g11i~!bJ{UX4g#nNZ=;ar|~q?>@UI2yO1JQdEuHJ5GXs~^cje@Tb- z3RrB2+{_i2j}$@SqZFb0xt;j}G~i72_8M)Gkf&OQQG(Ykyal}`?Yi;@seI1@7rtB7YF&uH<24tqOHZpRK$_eH(Spo(s6xdx-d=8GF$HK# znDONlJQ$54U|S!};Zhrx>hGB~e-_re)yx3#lb@g1I>ENc`~n;=`imQbkYwT*ru^wj z&1*i!DQe4DAfi|B18P3Lz-C;;M7&08MsQm@($o&TIb`rmhGWDgyD-wayM;rvk#Yhg6GNlw!BkZj z!bDDV27lG(nYHTK2EzD|7YAj2x~l=R)=CXS;!8{5>kjW>`4ORa6Yelp`@Gc*@xgZA z!NY-#E#_b#$uSd$jmCZ)e|8i47f!tHfuac#a_vZW`9rT$F~M{~ZS(x`IY$Y;!FqO{ zdV`^?yMjR;6#fP~-O;F1tDuHnZo4>!9?5!eHvBjBK}~%j{ee%O$zQBEc!c?nEH-Q# zy9fqNqqPy`Y~#q8M~{C2@ND^dHnuge?R+{e8m5%L(^ab zpgUC2#d-VwU|C=sXdG5_zE&2U3p-BIUz3zJdKGelpgn0XjZT8~q3OMyQP9pqo9XGbOmCSDTvP1HG^g#jcd^XneDra)xY$%xK~9US8n+hC6SwDn zToSiEUL>dnY6-~ze}N%?sQ~k-u^|)Mq(jwRcJ#)kvOVe(nQLacII=qO6a-x8)-^`D z5-^A+{}!OWM^so4Q6uRpL0>JXz$nuSLXgrxw;+`y?b6|pf73)!Lf^hmG->V=2^Ltg zoZWI`slR*gj`t`7F(7k@jVI*1b5>a47gi+j9M zSvRLI=PEH4(&e%-EDOlr>&xz^*<7RNpPR?s(>c;wgfL2iGL)EIoTpNzi##EQfQ(7m z`@!=e!dDWCo7;6evFlFsG`+}RL6zP)3j=x zsjdo7@wt6bst#QjLW27ClKAH+==ydZ%+&OCE?>36e@hVF{Xi9eLG;P?Nm~)^hnFB_ zr{5H*s!$RHvi57?%NuRzP^T14C$I#aV76L{sR=hqLS|~)otr!MOr>t&_6RhYqX#pl zuq!L7%S!XSMe?g*_&n)T+GDU+@EyhM@~E#*VKb0MVl~vygJx0cE<*AbASGZS69lSTa zA#Dd9V~ydFZ4<4CAIK9GZi>Ot_aMd;RJ$!V%-tSslf|n>Qi(c@qax#}Oe~1#p-4&i z7(?I41Ofml;`_viwijHA^w%63nYtfdiuSXge?4QDz9!US3^{uC;VvmM)n0NttP-78 zEY)K-apkoXtr$a!e#G~x6m>tm9IcFhQ#k=6NK#V9_BqP+O8Wrv#G{+_NiXXxa_UrITqW`xELTu=cc$9Iv(@xsN`rJdDmLEPCL_+>TNT&0mh7H$&Wq zv7|_8QDIt{>{?n{83wG`5;xJJJkAs2rVDs+*>96@^Y5#5Jfmr35v*6yV{Y*i)!NNf7)Re-(wy zX2|wb9}Oe{FZ2d0zwjrq5S3vkEoj_W1_l3FZr?R`?xJGoVvKWFp0W8}G5-{g8i($T z@uWO)58OOsCaZZmepQ3EM7f>;@2sE^H0&E(VFRiFW{QgQrUCbnv3p2jPlkO zj){C1%Wey5f=+QUuQi+NFzofQe|zq4b#~>A;L2*d1u-bVEb=CxWfh8i1#76`U}Txa zZkRNRZ?NwhSVIE*?bl)n+9 
z(l_Q@RVVB9;dB+*oyr_W9o5H<6XBf(@`sW}Wbimr#ABC(=wzHe+_;Qxe}1-_jn#55 zcZHbPEtzZ=f)6~(h`sH{%&*OWlN*KoXDt9V-mYUDMOGGkj1zkk-Mx~-?z{eAO~xmM z%7ASIPg!KXxkZi2#p2}XAe@G_Pbjb16gPsWq3$!P15Y;PjR9)t6S;ZO%V%cn_6#(E zsi9KlxVwtN5xg;V?A-n?f3HsUe@%z(?eSzz^fwDx9T+7MQ&$KnNf`iB5+vdIPf61- zo{}s(Vx}a3EURv^g~ZX!e-2C=k@w!ep>KKrJ^)nhK5U)Z`N*y>zQeCzD)1u!)aU5n zEES{cY-${V9oJ({EyaSj<3GUTeS*5QQTP$!U6^L9Lc|Fb)eK+Df1jWQ3yk0@DEb16 zkFZQa%Yxdl<9e7iX>MvNQ*%Y5zzUiyD^cK|!5Uh2*p_=V7O;q0;0+A6hMlXzM2Gh+Qe}c+{U5r7p>J5X-FFJ@BIEYad zFW`o_GH0h?KzWW zSkzWZJMru_wtb8)Bi)mSMlDg|<3iquylI}AC32VI_Y#yK>@$$-uWS(TX{r!6<&>4f z@2ah~e_+(9bL)dMOsYLGlct!sBd$x*!qrd225=Nw*@hyt9yc;0%Dl{{A#ujOi%I8+ z4qwQ@W!jt4^f|~Q46)-XYFB-!rxeLoL(?+Mz)y=3UuVhJ0ypfTs2VULqvN+W7ky@% zio52Xn+-N;#6>G%<80gd%31?X1|q{uHu*?he}trLLeU-YY7<676I2x&ov6YyE2MDP zj7S5G55U7XLNy%-x8LKa0M#Un)k4(}&J2&@Fh9fmRw5lSVX~`N?W5UAQp#X-Ey(F& zPfN8v)~L+B`XrI+1_NZS<;MN^hGgd#34D-vLImw{BQ}5U3K^G zoJ6k~%E^n6%od#oobra{s*5p1OeWiFX694T_HCgL>o?qX-PP@f7b^p zxr{Zd%09f5P_7t=B#)(^9n}JcA>n1}7bm{09J)4CsiV*V>5`K*{LX!oy@h=9tpEeO z)#M+!1(6G6Lda9GRU;I?7~<(?I(GorbD3>@^}5r)i*WzWVPXw52U&=0!e6;|6SJMM zT2;yrVB39(9w^H(^h%l$$ttC2f6xG`^m6`LR3vnLwb1Jw4={U1xhf6!n~&t!52`3i;}1Nt7Jv6(t$=7*+W3U-aR0lsIhus*)~F!|GQjo2RmDwYK!{-N5u_ zIcbRc9=SdcUGtMcnph{?mr)!tSEfO;kz)#V;PPgQHgr#h0=|@La;<~{e<<7b*9XFb zLRKZI;0dcH4zYIiqkzfn5fq`~NdX14!P~JZp1_Z-?KC1|Njuq!ZvV3w{f^tK-QRF8 zgG%qa5W#WxsT-wp-+PJeWT`uNvjHD6cy5kFqncmRx^gVGtwOK-yJ13n&nH{ROH9EsZT z7y9{`oXFNKemn`ExNP5uW*Uds8AU5^2elKkDY&8A?1^mVNkz7(i251K542@<)vjAf zVFT4T;5Mde9uYcFUsa^(*KG31LP7@cG0YdR!{bPUyg$&@u}~u&e{ybkJGn1zsG&RJ z%Y<3RmmhNtM3%VCmI+_^*X{-08TFwl1lMWN1v(LjEtbm&w1R*>l=_1!b1u<$30}LNhP)U@2T~4KMYBXJiN4 zGAmeA-5Y@7#V&qje^qk>uHkgf*?$H*a7K0e4j-RDz2B;ym#MR1Ck}Hj9%U|ZC zukYtJdV2(&>_pA$-kCw~pSv~MprG=1Pvfho!5PXO%N@c4XB$n?nYy8}T;C z3(YqLMYXQIW`RIj!h)$x9Eo#kKG>kMX*Kqyqdz6T?R2?~e_>qQ^eUQ(gzt^rEnrqZ z+kmpBiTHQ7T$=MTAi3M1dvCRVC$3|Bwk|9my@721O1!vPHrZ=boNP=z?ffX)U+|%F z`u7GV2@O42%HwFC`J{w0ERSJH5^Q#KzH~ReRZNfS7WI7bF`W4#!!gVr8@fCh-o?aQ 
z2;N7X1d+_ie;&JZv&Zv^tKrAm2tD*|jXi_SDERCa+Fp|2GYn^o1%uxl$~Z=K44LfM z`DqH0aPBCzIpGV)B)6>^(nd)@z3`xc(LyopQ|?04@}|~_+27Vk#bUv#;ByqQ9qnLR zL}v??jDz;$r3Rd8G`(?l#Tap?&X?juo&CQN{t}cyf883-VO2UJ3^zxR0PKe}+=NYo znQyMo{y#9;e$QuYs5K|8t`x^#K)sIMme>;1`QSf|%W$tjSDAeH2BKeC$Q?$sP z*#OUJ>@$=q4ks;z8mg(xpH?MiUFp;t)2_o-9Z@T@nKLzbOFInr4$E0zfir8|p=cLT zkOMypPVP)HBx7@(MyT7l1wqDN1G^qgt4p}e7np$ET1-X6tj|Vx1q1JN#ER+2oMpZ|%{f4Mabk+2qftYg(* zPSjd#b_))%UjVl@Vj|XJlVASl{?V;Lf3SkJSi{Ylj9A24Y_g^uF)Gs5V!tmCdqPDW zpWJyjqa|f6_IdQP?is+(xyyvk-{l$o0M?f3XO6Hi5~}>DINr0934wQQ~ML&6J!KB@}&h`}1{y z%;VlnyqYyx$;IxqAsK73%M(cBDDbz7?}8lFvL*}Bvo|qHIGVTj_neCrw+0)OwkG>I zlk&ZAxT~+j?#>E%{ylbRh7SGaE(x+rir)K{m3grR*$c>S0KQET&$5D*f6H4bx;7UlWj*$BCgkF_wz{mu4qr1rh#ijHd;FU(9_gzuw6}+t*CBAR&-Zyk7`+wg*i^f5!|Q7I%y|EM!%-c>>?dC|g6$VMcEQH0Z%0|w#l>5ANxCqqacW8+?_cNE4=I=kgrhe~6NmSjWkkGLeLp z*kkb%WnKD}*um9#vH=w$~mE&o_6;3y5I~nl-@{OIuS4qF%zPY{tu3m7TUfxp6f=s%2dka@2&6NLiJAoN0l< za}Hcq+dTo!Y_a{xf1Mdx+-~PM$E2(yZJf+;3KXow8ta(jl+v^ki@BNO6lz$BMI6j= z_64lO7HgX0R8Y1O>$#lc98t0o>#V}ln2e;A*vrYBut>s6?6LTqaF>21c5pQ(JfLDF zR&k~x@DB$V^S2Q`UJ$)QZwxY$9NOy=$G%JBbokv_;!vb+f8BNn6u7fS0$&fEtH+l% zhd{zw=Fye3ttt^W*MDtYO%hM)S(&9e*NUylXkuyWvLE#ec@x7jT%QF8#jVSRzk$mm z3g7noxd!_Qg{{kW4%ZAP^{mTM#DDC+{&NjRVp7&+pC=H&^BE)|2=5bm=hTX-vSD@W zvf>uRi1|8!f2hjHzOJTN3S_OzZq;3R*GcHe`>+MqIAz6xj)e>;hL8XYe9CiomIU-S34-B=(! 
zK-`RKQZ9mPFA#1{v3Uz^xf+_6*bEgE{dF+i8yi4}76)8dhDNa+;HO$I_dG3Uqm zI#f7Lip}HoH|I8KYjAB57xb?=nQ|{OKZ+MGBYkowHo$8)-<*t(I?OI&|FeJIL-=X& z)}hVoee^;VhPWI5+dBD!v+rwP}7IU2-gAnE&m3eLE^+RVkwU8 zWw_9cSJ zmXA6|>muH_`+qGR=bF;>Jr9b4oF>n*w>gZ`yS0N0au7$y6MC|ENof;1iF@>8@dcxX zf73~fQHKO%r*iS0P>0w_+)DlSvFV*G2#?BeoER?X@BRh8f#o;JwrDq>%eaWP!3muG6-aQN4Alg+Q-kmt+*`zj z0A%zc3Uc6Q!D;dG(I>~-3x5NbEeNtDe{ScNRkD(#S?WgjNeOGQM|D7GxkF1TYSv;U z7czbXCqFrYC-ijjnOFh23mbg&1F(ni`Lmxrg95cQOWR>Uz~(;5@q7fHuZ@l?le40P zaw+8z{hqtZFC4iW8$2&RgU^w=iF3L9j&6>0a|7d;lDX!0Z2szQ*3gX|Dt6aaf4EB= z8iA3!wt)-!2n|O;P-cs--gcQc*Z*MeKV3IKs}r<_RNDMf@t)3_9uF-7U|i7>?I07?$AI2HS4g_%_IGO1Jlc61`8O#*rlCB zde&j7kKdRy{L{D0Px1rE)?l(xe_`ve-S!jvegd&mTeXC&!ZxlnxabkBs4uk2UeJ2_ zKWpsytf0@cs@@J2^8f=s#^3V`3VeQ`*E%?{i&%s*7!$e>4_TQr2J}cM=jmzTn-z*gx-e8aXObLKPeC9v|>$7>;QTw986*mUf55Wq4iegl`KZ*Q`pa&=a9ujkMI%W;vd1gx>NN@!bA60Ww4$_iYQ zEuBfZjWWanmv<>Dr0*x$>IAJOsW#i_nA|5#{c!D|?vl~89*a2>f3hW~)ou_ATz;F+ zpoEQ+EHfx%Rmroq)=fj=y6ovr%n5+P(c}xyJts~7=R}vMpt;B-PRQv_A z=REb1fx>f?JOR8e@2Dt_NO5AH41mOm2iq>Qlh~I9zH9f#;&E^Bx`_EFKYMZDsi#HT zt@7EA3pYj5lAI?~e@on7QE_?%Sq{+{-ia4DWohp1=RyF8W4QMVm@Hn@I?PUDU+&Y` zPaIqy+GTbW`#gn(gQz#Oo9oB>MW?Ss?9Pq9^bCW5+`SDRv*-o=eEs<7$`FkL_ttPA zfcf8v7oX$q5Ic%{;U6>q>>%zuv75M~_X6l3>O8NLn4{NRf837P=5-PCU&DQ{bgUar zCozt)#T_d(oGxPgn}2v-JjxvD=K9a4ukM%~n|$-@_UjinvB%7=WB=JtNyM1tH z2}JJP2558?e`L`fe1q3<{0*j2kbm}z;Dp_DrXYw85eOgs0wE`k45gsZwKZMP$No!n zfs6M?cv>g%CfoNxhF#1rKhG#)cVdMLDjCanMmbg&v3>yOI9h%ct3~T3-Y+2g6lD&4 zUH7ORp7Wg6vWmoUA|gtjK^P?t&U^K!-P}?&Dq|H%f3p0IRf7`NVGoBQocMtkkh?7_ z1434kFz(8WK*U;X;zmaDinp;jC@(y*lek0Z;L-%1)kUm7`%#R>+2?fWhNH#B$VRXKD0W_LN_L<*j-!Uj|5aX*o`4L0j_P~g8uRe ze|=hh;+3a$6K@x7?`1u;<7JJtBPZ%H0|$_0i`EJvde?S%0^iHXnHef}*H&=QR$f-p zR&qL67w_7Q6)N_1SmB;*!YwW?ZBc1(Z4n3bUqF$UQL*@nb(_~o%pnF}Jl-6un_Zkk z3S6hb@@oMicIQ@b!2k8RSo};-!|7lxe@sB`*6z#a{XVy23%FoU06u{3FFt$j zzHGFJ+1X@QmlDaSSwZqNHv`r`e=O`9UQ*B+Y~@743oJ-qCvf0rLB#IF3b7xBi$}g8 zbrNUPp8CsQrQV`-6Ysn@X3IMtYV&Tc;~2iWWp{7=&25xpzwyrO!1z~S3CFu1Wz?*| 
zN?+i&GD$$}%rKRJ71-htMgJ9Mp|c*SjGQ&tiELbTXBR@h0z0sstIiC;fB08miS722 z%aPl(m^IjF@%_aE0#;xPw(jUVvqdFfHMaOz#<62-U}POsr+{r< zH!+Wb=RJhaxtrT3#QZKSfAIu@lb;RnU6{0S9FK`XG83kA>74SZRC>8s7_BIetlmX349 zxVe7Gpd(tBmfqZK|8x*-#OgHGzveKB9mJdPI*s|GpM5WvbU)W+e|8%C?N8Td{~wtA z1@`Q~OYXoD0u@fJ;)2_cL4JVf^Wrsrhk5fcic!es@z*_eteoE^MRr-KtTm*VGYv6Z zez?^kb`tl%KOA5j{_4N@%^mj(nn1|!(h^hwp8a!Wi>?Cq)}S+@ZtcNqaAFbQXTj++ znH^tzLz#fwjSZ6Uf0!*lhHUdXiaBZ_9mKn4brS1?gIciWbiV6pQi|I&alE%>LWpDA zxL`lQG&(F_N$|AzY)F+?S0KT8GAw_BOGxkD4sHZw2;M~7{DNM} z86}JF{2LJ9#uo7P5c|)|Pv*9X9rPi@0#=X+n?G`pCH@&)C!vFW=CqhK*vN&1@1>tD zUr;#K#qSRnkh`}0=Btkis90ikK2U>zWMQU2=eEEg37 z9I2ByN6TN=+Hksw@pA#6osA!b^vjQ+MrEufSvFfg3HR`a6W439iq>N-Cqm9(`O^qI ztCLu#ae$Yem+vWenVrNw197~3L&>o^iS=s~FTYsBf3rG?^(6%dn4K4ocZ=6W%>T%5 zK$4e>PqVjqUBvt^us_7`b?Jz=Xr09S6y?vCPbGHixp?Gz+)gY2VF1r(;9!GzkJ?H6 zvuOF%6pqwIoc~|{1mVXh@sj{NX3_E~KOn%3Es_xYpxa3O z9BkzkfAe<__bxtQR+6=zbaSEPe?Wrg^KMT5;wMIW)NZ`CkIGm@k}N;tG$>&m_HZX+ zTPFD_N`RxX(x!KCZ|1m;)g+9w5+jze9=o^{lBO^#oE037-DSBE$yiIOI4m^+6|1p| zI~{nn)lq?IsNIzofreEihO=_gm#_|dxDf#kf6?+AtJ}PeVjjcgU5U16ox~fS;LY-j z8!cKl@rGwNUL(8APGVoeZ1LGWj&%MwXCQz{5XBMQ*LqEV{!D+pi@_Nr#S?pnzYf4t zm(i6l#WgItxTMHVEc^{j5~^elbsV~+o?DO?QR2@fh=-R{)Xl*5#4u!89hW5d48wVp ze^l5lEqIl1b4$P9#IOv%rns;{&3i2{T(3Cs9CN|P&<}SA4ix$Bkim2H!w$WOxejGQ z^QaxVv;Pcs)x!|=I8YsgF?-y*y<7f#m-^I==$s1vhWo{=4$>2<2Qxoz$o9Jo^0v6 zSoEu31Sjs}E{4Y=?HYuB;VUH3Yu-s7L;-?)a~I@KW%A&oL_gUBK4+V|GqQV$A8iZN z<+6)>mt}et{|M%1KfAbFg zknUo;88u^`!#5vMC0V%h3=I)VIzzsxFAj5a}FUhhee803L*^uhL+(9Rbt+%cOsX5?SfDouvMe@5jd!;{Tr zsaC@?SZF*|I+-Hb6||4xeshgK{)|3Z>AVfaoluGT0W$Q4#+Zs`qcs3A$SxcDX9s?O zpk(tEYFU5+wC%t^QHW<0?bbJACXeC>37`Kf-`tRozx|ilECu62fP+NMM=}R#94`SE zK0maf$B4d`1xiYdY)Vlke~~v~du7BgiY(fr%wwOaschyXxg^0ZD`$%f-@4dK5&FlV zLY~G(rZ|!=5;hP_tx{CFhmGVZ(~MicCxNbMiOY;7at7N_hkpLN!T%Uk0yucvra1gQ zLtTJ;b6bC;lRz4ou0fM#*&wuCP!pyZp5!hr^KvfD_X@M;JoN*xe{@XsyQfNIx)a|L zBxs1`7l3<$>)TxdUs<)HzAmIinwR@?R1oeNd8JCDr^P5!;N>-GPw`7j>dyHK6>-JM zdhX^}EY0m01KJMrVHwn*BleQ@JPN(cj^oKk$@j50s|q9n*+8@l)U(H6z<|E0jRajHAgrT^fSin 
zTjnQuthp0Q8hdjD1>Kdkd#3JMRkvs=QlWnT?u_~-*yUyPe-GHDFph!+h|$13$fC3w zxX(*=K(*m10eQYy^o|)U(;fDHJU~)b{u*g-eT5RzwFsC}vZ?iSW>Y9Opii@q9FSfkrInx$$?I|2lVi zZ(jY#c6w+}e``)+;u-ie>g-r;9(jBma93vWYy>ep=Cgt0`~qv^&qD!nR2R(%UzAz0 z3)r4@yBr47yAr5)mdkQMs-cfD-1{*y3toQR9b$iuOjNU_8N`cx^E2eTFv{ko!Hd%~ zvlM=I1on#rEW`e!qNQy6*-xSa8UkPAF2xqa-P1hAe+1T>0NX~`mr)$zPpn&&^G48^ zY#ipOHr>;$_Ly-FF0wnETdSGMLY^9(MDFb9ff;Dz>|Xvg+HE#f(}tB(BI@%R>)=4; zRM=0MXe>qON}9NvW=z~)Lqk7tpLazwG8>;cFp(kx?ntUieQvP;Vo=lrFq!WihDD`q zM&Nppf5jFoJg0b*koL;W5HRVTSI2kl!^gRO%rk3gyUmPDD{NLE2hjTxtBfUXX3azl z)J25WcDBm_2<<0ep7?2gf`xtL0%r@msBET)POsJdjVgh6cnv(Cfv=ikTYwVaDePQn zp~XsLBBYs?Hn}omOPq*kTHf?xjxBvcrfGrme;ZEGViJo*wv>sfCM3?KM?~IQjM*Jv zwmZWR#1>0?pTXpS^fX1j+h5oW8d;r~iE^xZ-W~in-uuDx4p03@8RCK2#Ugvpvq94n z{2T7)CaU^FbMvsNAB!~g(8=z&NjPXKizv=(M>X6EBpF&-YftG`?pyN5XlJ!(`!+*o ze>=njm=u1LkW=+@;2Uhx9LA=gVAqXi5YP3mJmoh)vb!vy&kve<@qvtYip9N>PRwHh`xqN6p}U>2$kj72vYP z-z;Pa3M;mjh|*53kYH_pwURN^Sw(qc#15XeBvsQl5dh~s2tyF=KwNC{I7*J2E&kVF zEvbU^5MZ{Sx#spK%k2NI|MP93mc|n>Jys< zy>1hmv{kjmP5Q1v{E!uXt)^of(Q@uM201QBHoR60;)@(sjB^?kvB3!Sb+vqm3{9kmymXQ<^F&npo& z(L~D#orU}`#VE)TqA4C3VV<5cMLUc~!^*VSu;IYw2zKPGCRvIj9VOpjiT2p>Hg${& zd`|fn^2eZ)O~MPI=^v8@MTDLC>3`{aTx}Tm1%LhhLEEtjgOleV%lLRl85+VDrPgsN zbe&{NAdjp`Kc^I_?kFWK;3jSuCkV5HaW>WN_I~VVskrg?vn$w zorRxB&Uvbn?SZA*l3d%mq}XH&c#l65)q$r@DOKMLs-oK}B=gfu>Ut9pB7ccfxzK{P z0g4G>av9fxyip=ek=-0WqfAEC2*u6?P5l@4oC#vkYL5zYS*X_w5h9yAK^%qEnV{(b zo?~)}21r8oafkY3xF3LTu*pv#ROg^LLK;R4zoB^NsJHQi`+^2DKd8(umY0*_hhZf|w*5^S(+YrU! 
zcJTNP;y!AJ17MHU4GRi4j9 z64bck5%QGDrmZz9W-|81iiQ~avZ~cZ{wY}yJmBm~Ch+tc_K}2XCcg%@gPEKHfARwJ%?5AUZ5CFOI0{+B~ukf`(3!i2+UU zw41G2C@R?nBG^mT?s^%kKcFZZ1_E}njGRZ<6^4TY;2Jo%8%zw&3V`7H^7auu@$!G3LE2#)=o`f|iXJp(U$x1>SN>clzSBu^9F0 zG8Fq_UKIx01b^g7464N`4t8{*`fjwCK1s&y${#C{o(`j2sTZkgBF}Ax)LiACG(kg_ z&6cS3@UnVfyuGQ07LFIEh|Tu4C~|in*vZkF+^vl@GFI^nr0FDwFfy!P-vl$>5ygoW zcJ(K@tz{uO`rZ>;xk#6OAR}4Xq9u^0moQwE=)3seuYalTaTt-Q2riwZD}!Rkt`D&= zpc>?2BSS)@J?I%i55WQ713=NK(^tz3rbF{VV0wN-oW!0g{7Amxc=OW+{cjSP{M3$0 z@fyVu+B zNfIWomPN7<* zfZ_Xmx6NnGj>9x=wSyJKtpGoY1t>OnBNS@wwSO)s)MKkIU0Hw*uasfm2L*FizW^l~ zsyF?mW%!m$K6dl%GEB?b5fetCMY4baOO=-a=OSs~oEDcIzU~?KGk($^t(v<3Fb{& z<0m`(L|YjT*E|ssM+vzT2P7}M7wqJXER9vZZ)!!OZ)vNfs2)P~Fsg@9y)O6oTjD6u z-%uTP;9}4`G!V?RamcNQA~~`D4?c+Jhr2Q-!X9LDl6mWQ!Co=5bXP=6{r zuR?|ejY$5fzC4EF@!0LBB1va5$IuqUJ=2y$G1_9dXCf)7A3UR)r>%v5s4a$pwAJtr zg>qolsWR{reGMF3^F&B|DNcCjNECBsXrAdSAQ&Ex%RduI$*~jNjuq%LZ53@u{+UPs zDo$10Q3VxcFgfF63g~})_8dk@BY$rCRc*}(y0R1p?Gb3Nr9$FWQ8b6!RC}%Ia62l~ zf^Iyo-CGOJAsn3(CMxQ;ynfeB%+l{9Sz(jud$rh`UY14fp{^BT9_q|kQ6S$_LP`$N z4-k5&9ecrwLput(phD&o@UvnMd`oHGz>Twc_(}>`+Ged`#q3r8)0|LDFn`{!v%;*} zbtI~|-kc>7qy0m~De9eBp_c3RVFj&jm>uqko*iL5dNFuGErlh<3Qov&_J5EX24uM% zzGRSRw+Ii{TeZS-7W1_Fp;m|{dbXIe>dBgO4G?hsN;~a`n1}0(8dKzA3W*+_6?(0n zBhoihmqFY_`*ZgF!a0>6=6}qGr#V~l^PJUBGG`r{&RNIg_Dg*LVvt=}E9S*fUeF-{ zp69;!XUTT7zp77hMKbz~-ep42#jy(1+sd$A{4||g38C_?C#B#Eyzn(&Nw|!na8l8N znTAT5POm5l;xYvJGJ2v(J*YdW&^a~L6Hn@q?|+$UNhY;?FG0DGn1An6Vy;(G%GXl1 zqm>k(wG^CWC57huO3KS+9Tood?d|=y`>$WFA9mMIzigKg|{v%0^q(x zJbP}g=~{!2VMzf6rJdNFD>}<*CGg_E$?sE1vKeSk_=c>t^WNY1nuYZt-xU8*x3{tGoh6a?1@@js z_7dcQe>lK6L>Y^JjIy(CpB}*owFpRil#?2(M<$bpDgs-4oI3Dgy+2(;N_C{s+qS!_ zHKbHO6}>%Y2YqX&W*h8Pm-qyNB2>sAQx7;;B9HEfILT3b*_AYj4U+%%al~4A|%h`SgPVHyjSF0(` zZXB^Wfj0;#oV}~ww%n?>VP#tODung*i-b~~yo zb8O9q)wx&5VRQDWVj7-$;&{^&ID^cO?QCqWsr#@ZcH?%k;yVc8?M$jw9HJSu4wrGS zqQ%vjOY3l1m`H1}^=8o;K3iR}y6zORKyswq3IP-nSeq*oRlH)x3{W2jp&dRkiKRYg z#qys*mVXSKp`{uy6A~DoRI1>`dDzAzV#Uy(L2P(l{4By=5^^g{FJcamqUK(G) 
zp3S#e92t&)3de@c`;7~M?go$g9sNBXJsQDQkALPJ(NtLTtL0&A7g%N>+k~+`lC@jr@G)#v9rVRL@2b1_>iKoF6^W|lV5-HEUS3gToqn=yh?c<91 zZOoc*g|9bn(&FkB&1H+cjaik3wLh=IbAQ>LS>YPv5k8x3?yH!`pSLR+&TFN^yp_jk z-lj61w;n&w+YF}jHW9BD^F48An@fz`U`iaulKXM(zH(T3nhJ7D$n{nb7G^KZ>7 z2m-%w^$DKAzoG7kAaHu|svq88vAYf^4EM`OkVn#PwnOH?&w^7ocKF%*t1bNvUw@MR zZ^9T82hm<|0>N|6M`H5j_gB}%xAYAS`u6Te3bF_fDg5}|G}i~}nVJ#51$=0UglZ{y z!+-O5C{2x$mOwVx!b5=?WZ>PhMI%C(k70kvt1#6<6UQkLYZTuug2xb))S!|x8BJWe zu9)BKtDA@WL8K3?;9a3-S9lnr>wmKEstY7RNzz!Aj|6H#Ofyv`Q{cftlnfYxeR)*h zs)+-&AY_y8e9Ya9yBjN_+*5A{!3E1)t_%Tas0Is$dvU_^f<24hzHboN6B3BT!xkN`Hb*TB}B~ zH^&XG`e4$;uTiRd)RF`hz)f?rwI|IzV^^K;(|mOwQF}`Ns`q4om0l7s+1irV-m^Oe z#-KiMh!esoCcC*n0UFg&r=uIk&0CNMuR@6@^ST`V*dy9{Yc`Umz2B@}5Q6zS^F9t(X}!~H@Y$kU@PrZzbV z+fe_CzDg3qM2iyzlTOGc2)*hj*JSjn;CEd>IR%o1qRUK5LG@{h36B|q*8Px5j3~I* z4*qKwa7w}1FWAkQIDaQdPa<$4S_su7!siz3*bG7=B?I-<$ap8EMh@>F(0u+RfSr_r zdcIY2s&T^;bwV+I%aag;t5e46NBY|P(K>^^gYt43M?r*!iy#F_2$DdqH^wO5RN=i= z_&X8ms1E7cvLa>y#-ezFP-4E3y!HO-6aF)T^Pxc-7fA8m(0|uQP$Sf!WQiY7E3Dtd zv%&1+1SV?k&MDr*6T{)kATG&D*DNXz(DpzWy9VO82C8n5PE)O>xdDua9y`G3Q{x=}NNS1r1U$Z&*jDh!X8 zTwm3xBqvJnF*H=)U$GNu3M0yvCIaLkAa=CPtAXxo3VM&~#Hh2cu8yL5hU_b>zPGa^ z)%dw4lF5?1PW`mxlTaz;*qomt9ScRL^TZ_e2xP1P;FsQyjG^)GA0)>z1nUiXc>+*# zutW(A$A3wEPdt8Xzr0h_@WCGfk}%J(zBlZH*Nw70-Lhlv zF_9EwY`i@zK)M)>2#_dL!vbj846mfuB}RlWEH(^hl}R|dbRjM52ojL_1$q#Iyucka z$*FE>ywJ6%rxca3Qa?8DCEp|C9vjEH$v40AEFiukGhoGD z4a`&ZNkPdWhOc_HFb+v-VFK~C^UFI$4Ilg=AYJPaeQ($YuN!4+x?zU8;;0owIj~+t z#JBTZ@y6Ab&!N{*Ab0;&=wCVdTb`apP&GPukXB*CW-o_PB#0+SmJZEsNivkXZm5`g zX@9}5xlX=)#ajsf+H&dsn!i)v{D65|f%R9x<{lp%BV9oZ%kb4N`g=0v1#uWR z_~T#rqlyR+-!NR}idb4;f4gJUD{m0cUc|%Lq2lSp^`n1)(Avo8M_qm=7*o|CJMUFF{p~Myk z?rgUc$g+K(|0~x|a};diTiqS83*>pn*|}*J?4M*C5`AcvQ8SV18ek6tSJ9!spF6BK9IIjkzL!z( zY@Tm8nBVJ<@=}t#)a(1;fVkUD0N%ErzfjtnUr`;B}*HO}Am<5s`X{w=Z-O zZeP5=!a?71Pu?kTR$Iy5Vi2+jteHaiem#msF4ZGYY;zd}C(FwE(xC1_PEQ%cOE|nR zsIYv4b>T^ms{YrPgnv-H8s+BUzHUfVq;zwSmJ%L>`>d;@_Eo~}Q?w$icT3~d7^RZ* 
zZs{@5i*&Y(X;Luj0;#Vj3jfblUG$W>VFt`hK@h4ErbG|Y3DFSMl-{8Bx3V*6{;Fh)>+GdA{zYLus;?tEA}^Iz)FK-A2sr< zx~uGi%j;^ZzNX9R+Uw}m0(xr3t2%8luxxLzT0gi8bm`Z1o>k=_t5fGPmOKruEem8*A3M znv7@LQPCUKG7K2|nMt-F;rVPq0pAMWd#9z2%NC>HMz*B!c+>joiH$XDTrFe^78Rpx zamiCZdp5`y6oAhdmo0&OD}M8xoBUOY9GD&6n zZo_1tt2S__@wCzenww42T;RM1VTcBgAl8{lSG1+ zw)@*VRev2O1)&zCR|skehvDcL>Z(3@J@eZ%p+QC4NY$!pYYDo;(uQAgU|KVqmF|Y$ zpkyC|RHHbdQ17pPxp{1I0DAIHfm45s*#xEv`XjF94Gl#futEBRN)A&itvJW5EyX;IYCSbnElMYiY#%Bn8fQIHj8)?ns?`_N*NSUQzISTU z%|yC#6D7DiwpHcR^VB=6!4rAzMY?M5y$|rHQ)ObURw{I?k*~|KHqB}rYcemASWu0E z)#wyNvqs;bvDe>YO?WjnS08o!Jsbh{M}H3d0y_ssL72y~rl`tq0yu&XM}h=~ko3#} zuvc+>e-e-Daj~|gKPvRL?9zb9j#2T6-}p+OEAn@*s`wsX;m$*(IlaKNb+54a9(xsP z=PrxiA1XdY-OPO5*c+*)f4xo$*|D=p&A{a8RSuLsh19Q0l;@>k+A=7QK|M0$dw=p( zS{8Q$mtyv(A>Y^e=WXYgcZwQ5R9bpKdS+t!-c&ii8)a*{=~Ykosz~e%-$*OFB%MLA zt6y)(H(6DBej$iMh{i2JV-)nmI|b?w{#61|x5nSYvbaiLG`n!r2ZV>AJObqrsB0el z@z0xFNty1}DLsc2m=YxZo-DW=gMaMG4+4jrt9lN1>^6!W#On zBLC&)sYU*oy`d75gHPNB#xOwRju2na4pThTxO)4Fs-r7ELCt?eD7MB( zvN487F+SCBDk7c-svdnw4Wh?ajW+5yG*&8+?MZ4>IWGrAnd+;hkR*Yt%b6*A1nnnYc9HH4P6g&2c1gbJQ(u!yWo65<{N8H`6^o_J6dHI+xd z41^}_5QFh3408{NJv2s2YJXn*I0}6dwkiJxvKl=pnTiTzOaDr1 zxa3+xRuT^7@ynJU_K~OWl(;`o1sw=ITM$ie$RDpAK_i_hp1lprLdQR49rpdzBmQ`c zVczzBRoC+22nyyG7qtUG?_g!4?hS+UD!5-5{FZ)K*Jn^$ak8}Pt42hVbOM{3YO6nb z{hVScu{AdbRU3o4+ka;ix6ZIlDXADm(|mM?N@wE_qp+YCWWJ`sP)KSq4Dq(}i>ihX z=@5{%*|5Gh?1R^hvIX5K$coI5VlDGkfw2W^ptt?sHS~N)#}PFRS}T6jeD(y^uoWMe z7sm`}m1Y+7J(-wBpIU8M^{cXy57h_0`K{6PtXe63(_ni&3xC*&MF!8HH~}s9bxmMo z(+22m|91^NMlD|>Ota~0WTfK}GuGF9_5_Jy%dFZP66?)~kKO%(s@wD|Y=1M``c?@@ zhhk!;sk_^;qA@mHegd&hb3($szuJ=EZ(C2*xb+t*s08(;3QC)cl$+=vf4sI`&CC<~ zfljF+;ZV|$;D2vhPt~|hN)r{KSC%MwF4AwJh5Yf_)-^Ly0$&fY|E#kmr4EY{s)l*n z`&C_w{)nUt!7C$cE#0nvH-NTSqm0&7t(oV#lJa)k)2X;y65l)_OJJQbSTB_qNFnXk zy@%1yP~9E+y&(F6E#)brPC;K(H&=C=@4f+SMWCCpI)5FCeu6KMR>qh6G_|Pd1wV}@ z8~MgGzi6*uatajJF&l29DR)kQ_N03wB_C@VP4zVyxZj}LX_Iv;PF7iV%9~mlGp++o z>OET5iS(Oj>C{-XmNGI@e$q*v{Dcx``{@e(`|%sM;~)K?Lj3{LzrV^4McT&Dvk_AD z2L0n4aQGa-7Jnb8;GNEqp%f4z>O4bAn6 
zdU`q=yh*tCSAT)%cq;O%|8@0qR7bPRt@mZ06)%bD6Z+nJ ze4-717)ATAFQd5dqU4H)pNI@2cvFS-s<1zIz{aUVaEQnjK6(k6j)DrpqY$-_TQT&w z9)At~D#(K@KGsC`e*4MQ{{vH1E*w>|x!g}Y6#Ai$D3)*rXBhp{To*T%f4FzwU;PUA z-UHs+eDx<1fgWKxe3jd04)JZ{oB7=&|A)U3WK1Am+K{}+w~Y&Enc9cf{H7-G8Uk{K z9ru>Ud90oxj$L~zsON)n0;V}RYbsK5A%7bShzYCqdMpE>cis=T%IH_>^6Bas$n=>p z8g=jF;Q-fF&|&ntUXklu5Q4fQA?M*afMFVxW0C@8+QP?&*JX)FD6bB6*8-U$llAe; zG0`A*P4Ed0HF$!8E%buuX#zJChE zZ2!3hGEAnBTSvRS7_0F2vAkgR_BZlJ44V2`5z(`~Q^mh|6>sOMgV91oK2K46z?X&| zaHlARafF7&${rjz4R0zikAk&5H~d^NJu)T<=M9|D`Xd@prKw!i%WJDYdsX<_VrV=? zp2%xJO_}tdRwx35>~7aa1;QK~-+vCs$b%CIOSv34q@=;%#cZpm@_O>Cu*rJ5g3!Q) z15wG=8AxGHuKkqQzQw=%9*r%VZ`0r(dQ9W-Y;nD51(qS^o$ifCiM{!L%CBDiW;LPK z4Ad`ZXmpl}Q$@yR)M^hF9>aJLLQe^zN1&p>Oj3o?6ZJR%KP5}W^&zK2+JDnwT?Re= zl)+eP_d>kK2elB-_j350RxgltWzwfNuT%Q!G8ChX#Yj&SLhxz?9RnE}r3tPadHM4u zB3_wuCQWB+ET@sT-jCf_Lf&X5IAr^96u6n(py7UQ!Hf}}{b$^eC*W0;l7{q9$g1l| zp}ruf$#_=B8WlOKV~p(n7c2%c&8EmQb-N6{2nLl$iFB{ud{6F;As;6&KeW-u6*u^A NnqUpn{|Dm^^whdY56u7o delta 25268 zcmV)sK$ySO#sSjC0kCZWe{=8xvZ%0biM$K~fD+3V?3UAMoDH@q2uy))K`I9^5SV5{ zdHW2()K$Um+*BU=kas^oP~!J*U{Zh|)5Q){0R&+8iN1;MlMVa6D=sN8ixmP>D(%A< zyEFrb=!Y?sdU+lIvYh@hl{QR8AwT=*W8}xtKYrDIn4@eCQB2Yvf7$BjHAFXA7S9VA3UR&#nu1=7=j(DA#-MVGczDN! 
zV&lj0bZsMtv!lICf7*et14>@80%cJZh1wlpwzDWOgs(|nWWYaLfg^?7jWoM90$}jXLd6Kxip}^C+>D0drVpfu*!SnZ#bEtH5?tq0Qp)8Xs=n)>_LFmiwTaS0xvAA(H3L~$US82w?5&kKHR zP&?%c6_X4ItB9)nKUEV+K?Z)Z%ga5R%&`MWrV(?L$o6ESID;evNpRU!9S%#lbtRDR z_7~YoUJCn=f5G$anEC0+P9CCwFtvnsL|I8V{bB>VuDYyS)iRd7td!tZE(8Z_fHbA8 zKs75aRu$G=cGURMPNG$d)~HCcJyg_h;(N4~a(o9>w^NIHbTQPzX#?=& zjzwvk*#GBZ6AzCJb{XzI+eXFX4ST>(CGEJFrVGZ|fAHOSTop%ou3r^Tivy;dB#1Qr}yZr_*{5sQxX zJkh~NrNbZsKeHjDWPLTX{@;e#(0QP&Or}LiB4QK9z#I}D4(ba;KSLOANfRNJBSpIt zZpbsOe;pDusO%1nVT3K}-Jg8}fHK2_zWcAA=}N;SQUpD=D3MFRee{C@W*0lCPptnj ziB@$~I#{>#FI^dABj-P>F{N7O8HC!@gVd?c(GX@UXM8LZ67(+1AW0kWBLH-cWM~B) zzbQtx2ZEQ9${b`nGL-}>+^Hy!!M>nsPj`C&e~+OyJ17GP@V{jI937%yCq3UHk6o>d z+Ea@WoB0weQc;pq)1KRl>bo@dlY}-^HX=~-z&*-gFTU*dF-dHY9HRu~*($%;g8hN= zM?npe)mzLgfgk)RE;C>e@#H6AjQWD8pOKa}rJf*swr*TJ2Z+Q*iCwMrcoq*DXD~AQ zf0A0a1p%E&>$$(_#wI+ncY%6wR;vmEOy;gO9gEm_kZNio*8F6Kx$SyXqsoFys zTdpephI{#?6dAx69#O@9+!g4dHPADPJL;5L!y|K&wC=TrSr3eCB@KGjnmCd-W!Jjo z0iz_e+q-#y$NK~!M&U=8J<;)~Q7;*Te|}@#I+HGK_3IYJyZ&OQBhN#BfyGByCZQGN z`vldTiNrW9O3>ShACJRq{4m^w^t?o89%+2aKoUL#F(^Qv!8U=^G>0T4NYSt#WPWT7 z@hCI(E|0aVhhg~)uo(zo7TQIgF^6GzIxd?-c#L{&d4E{O@A6oLu(Qr`f1GJaAduF| zmE;!V@mNw2*|W(CSY&!ra%Z0kEKKJ>t(VKqfd5FI6Zo1`{U9qz{YP@HPLadxiI(Xz z-r-{hJ;z1}f!&EUF{;%i^C-SFo~*oJ@EjVls}biq!#Lm+`DJ)RiDvRiQtV|ETy$jw9#@!TclS!@d-Wb$&9!WuyS#%y=dm62ci;8*=){0 z2BC3O#fLJjMw}%Jf1X0l09`{oSZ0vz{0t)2yE-Ln(%wuehR7m3X7%I0{49~#?vIK> zjP!TM*&mxZd^HUpBM^sL9S{-{Wq^Z^BELgce2GH2iTPE7fRl(C0N3e<6pqz4p( zPxaO=odVq;Y*N2C?QrZ2GB1Y9FX-dBb-IJ>UspYb@ez$We@)L!e5}r#862bRVA%FI z@(Ug*8je;w7^t@t`MKU8wF^bF9e*}_Dr?A~vc?e&rFU%g!;V_AUAq#S+fplAqsw5i z&W~-0g@3eT!D=t76;Tk+nc2}rP6gwg{<%dwvO3FoYte+gHz>6-vq7T3pO7hJR}qNW z27_;$PH@>QZm!!zqJ~zwiXN`-8x%FC z2P%-Qe+AT8!FQRTNUp`{QslFtOZ#!tL@%N<*u6$c2w&~7$QNiiyso2vJE+Z8qKI_Z z4v|@}uZwLTwRfJzjv*S@_7OFE=yb+Tdd)-{ccw~2vK5sB<6Y_6L9}Z}`K_OFcr&Gm zMsGn4*kYEiJ*vyQHW&vsALDZMs-@^Ts=m6wdEXL947CNnsVJWsJcKh(U6iVOlGpBFwPsU$u!19QR^c++!G^Z&-jiq z!{RN3F(~4^lWK))lc!L0ZcLuUG`T+zgB@FByT9nR$X4G(a#ZPL{Xge8$%p%yL^{7x@lH 
z+*i5&+>~JBhc9@!n%spElvyryQ>Qgn%D4m}z3QWyM&WKc3724Rxi&PsfKrl<%oV9z z-epYNB<5sy%9tx``yZH}5HI}Ee+a6~ZCRmh?jlw`oot1_$oweAC(X+Q4MnhHtdN_V znpuF-RxE35yG(LZnK2`(+)>7Y=QIW+UGcj8g3NfjN_JRffl=rNlzG48s;Zgu)U=Kz zi@1WCou7oVm);rc8WMzOW0y&%GelK9#M{^#C!6!<(hQ_9$GjKmt)(_Of2Dog%aB_0 zs4|E~Td3-X2;?M>@&}~UkL8S$v1>XTM%yHYj$#OKJplO)T+9X@)@IDxXx0WsvwKn* zW@)dB%2f}eEZ^XkTrhWwk6&Xq*w&2y1!da%DJ2A6Prkuc!+}Bhge;4=2`Ovb5ZpT5C*Hp&gBDw8);~=uD4C5%UEG%fzKbAIy z+dc_uh9-3ycM`%vw3sh`b0C#16cR{k2X6&{^`g1#G}pjdc^`>f{ZXCL<`#IgmiI;Q zWG07rbIlF+I}DBq?!)3rTYkKk0x>xJ8IBU z-6?s<=1|LAt#HjS6Snd}3N?7k_EtI2DvxTuU336ccDN&tDtTx~2~gJCsXVBXe-kBG z>&e9J>Q%TzN*;g4fATeZkF6n4Jiw_y5$^2*Djl>^fyJZDq&)BCPN*v&m&q)J#dz3EI&W^}VAq;Mi>>{%)G!{-%w*jsBYG}h+cGl1bp8?jMyM`fLOa2=A@N<=WQ_l)LH0k>wq9f3}n*{gc#XoQIhZJO9HC6u#|*U=&?;8n)rj@H^Mws z9RaN^6g{L3e@H^3Js*9iRI1e`snJNMS1LIvZ8Hh$&}EZ9pQgE5p|Nr02k< zJFjgc3(s#^B#^q-K^3u?(zS?NCEiT1^JpfSViwSO^wNPB%XQW>9G-Tto8gEjF#8#U zz}ieZfB7DQ8l0+=6-D$J>f;W$#y!s(H1;xG8&~b#XB`5C8=w{8?H0z4bG|w4(iF~F z;9KpN>xea>`aYhgWJHgkItMNa_uPEO2v^ zN)<*X7t5lbVemW)Z~_xB3$Xr47I}`m$ncWIe;o1^=7uLgZSBGk)i zkX{>J!f6qj`^OmK<3@v)%*QgGh4f}fY@-9+_r>ZOdpcWteJ#TFmNzY8TlBTTq2`4j zf7gR|n8dd6tD}q-Ah);|0m3^{Hg0NfK{|e0FuKYnN-?PAl?^d;17b5U@&4ugI)2a%PCO1K}AoCrnP;rg7mm6730ooE~ zd^rUVMxzMW)<<)=)P|+{duEM=fAwxPGeG?0=O?yKuq`sb0LP2|;>I8(nK*_if4Wlh znvZdc+A(s zkrSQ4U-fxrt$MbBFh1nPL7AWKYQU_uQp1q=(h~T(!+ThMMCjdwJB-ynZ#6@Fu-$j? zaA0GLIT%QC%*0`%u^)%se}w*p6R&%qXo7@XJJMbL(Cbu8Fr84_Jb!%7QG#!r`vL2ib|4n^RQ(s7b;FD+a7b^}PVg4hF4co>p zf z4pnq<-o8Ir78nN_hZUW#l||>mj+6A)B&CgBg`6O0PufePlVE*ldT(bGwDZuWyH!@L zW{~F@KD4Z!GSb}5e@r>kTV?~-6niqwX*=#+EOR*@eVi>WHdR%S)8eYet%dW%?YSS9 z#BGlk395lwLNdT#V8~x8zfOP<59by|JlmkNQOBnwc(+td2Yd0T;S;jghVd z45G=u1*q>46&6I)NV-bUR|_gI%Cv$Iq%_biNF_ke?PPDvc}+tUKL9RV~oDlEI}ujt(IbH!i|!Unc8;e=8ip6sav=`0!`-V!Hg+6 zpkdM}0eN1vXdkNwy6t-xK!sh<=j*^(?fp$*k4Y!BDlhaG{J^AP=mLzf<0}Yx@=xfy z3OBjJf3TtM1RrB2_e)W&_qvh1m!Y^Ak+EBOs^s|cPJ~5*01(zcIHHSu~?uVD6{p@GYf7qq33AGqQj-GwEONvajm)s7kM5h%? 
z_1H~Zc`ZdN#*m^P@x3ZV-48EEE92i(PQVC~l$5c3j&i-yK7c&&=w>~6dJ&AAh~Z6n za_qhEQnbU{9i&)$!KFwGY$@^fTzN~gka~o|4Y{h^JczmsLjn5K-7U8AtSqD|q3o`T zf1%PePKYO(c0+UNWE#r;g!%}qJ*^|ht1U?GqfaXjqcaMN9{4f0qm)APmm}!S5cgp$ zDNF(i_j3sXC*1J{c-9(3K(*4v{5UzUs`lNdlV; zl-t3_`089$+1Fm?90Qb-G;)!KijY2xexKYMzc?)u1g=u4lkID`*4_`vzCofGU8QP=dFd@*9d&_MEonqPq~Iy!C}+ zBHzWb+k%>)Q(Vkz&E`4`dwuMlfBRdVU3nw8vf6Gz3<@xdya{Mog(6?U8frKgS!S^t zCJpWukTko%BajYv12e@;{i+EjmL`35>!pVf(?sN`hh9C6B1l^jEzKnkqmBvXZ^Wqd zjX77<$$EV_T}5`MGKWz|^|9kbc&CBljCol?5N;#NI@AujH`%u0L3l@kyaF zU>m_x7MX8uQKNFPI5|29r=jf=%4;^ojo@jh`;6+qlTCSJfExNlZeH~AnHjr115IFR zsFXSGuA*=RZ%iFKx4+A)e^dQm)1iBNJed>y%|ccOMoGlf6+%i<2EddANqGKK(lm^x zB+HJNDG4CUs+(*faWwOv1Jg$2z4veETi(A909CsWTc>tDvg?cQ@GF=K{0IQ`IXXB? z#ppVl8b@Hq^_WvjvEc3a5Ab-Ope}6`euQ`zrWvacaY98k!`JdBe`vu1Be)8RzQE!m zER)c(pf>Ed9%fCNo0`hhT+t}7f+ovK6!>SbhL#<+Rqj+%GZV&=A613Pl0y;O32;rt!gb8w*VwazFksae@sL`2FPM~tkYWZSj$Lm z5g%@;{Zlm~(zJpldl8O|g!*u(G`Y}+RtsX9m!7sHqUkwHY4+YZ z%|6C7eQUF?e_6!a7}M1G1^s;Hdod5t4;2i6eHV(mLv0{wDdTqqC@w~5lsYD3*x9W@ zvG0Kl)6-927bQE^hpt?w(KvwnQheuewmt_ETRFk&Gv z+wA=u)IJrA95M`Z)b(HWs!8S(;dGg+I|G}+baRKGe==bgW00(R!{G9Z4q^rlVpIkB zBO<%J^XeDB>7het862o%0c4lPD8wJM9OgbdIYFFS@2kU&SV)D zwUyFNJbR68AEV1i_vE2bOO*JykT)W4nx|%o+@<)v1SJUj4CMMN8w7lsD#T4WW##a@ zYO5_6e|75I`rr(cYER6hDJJfS>yor^^%JoH9EDc4p$M(Vjm(HLFY{?goU!j>(s`o8 z7jkf!_NFv_4)O>??6`{BRbT2UMe^0qv+_@Gy>0O$Wm5_c$s*H3?(2P&I@z!=pIN&oIB0NJmVV?CMqfXm*m6G8kP8a=O^l zlJf8IPAUzDMXnB57z6_wWH5=T4m2dJf_*Cqt%Pa{OZ*dAEFr4?J25e!6!cn ze`7sGTySYRs}1)MI?7$?Wp^Udb_GOc3jBCifO9&1QH&XJNRG5DNSFaXT!Ld)-TgZ! 
z(QAfs@**U&MdtyhykWWOVhj zGYAW6;L7iu)qX7+FXi-nmTgefEtm@z)t|FKS97fPdP!aLj?&}4Gd-P09}Za1OiUYe zN)Oki>?j*Z17mS*+#!AS6kk0lW;Hi(@ZaP48Bm;OCLuX{kV8AshLd4h63*@Qe}PIa zW6i3v4=*K@D+VITW9er{wSZwrc$xaeiEk^1t_@Y{D0D!&Vu}zyNPG z`A2R+Fcq2*odkc>0;n9RT)RW?Nsq?)2{>+`n^}SOd*L79yMQSFYW}Y-g-i zl`;g_c3+|g%5n_7l4eA*N+}vNe}F2zoPQP-30+?;^m@kw%$`xMO2hr;Bl-9k?k5e4 zgQEUA=o*$nzWP}bB}aNiNrpW}6@L5|eK;K@4x7KKq>IzA`qjzisVrNqE&Y2pFuhq$ z8lt{Ot`9`l{A7?O)(Q7z6o<@}Y0zxsm_i-6yqTg6-IJk!FQuAXE1>|&f42Sgf$*S^ zRY@v%!m5cwtX=&mU~+o|MW}dEKml#=c5I3#@MCK`jmTKiPPU@k|13toJHv)z=sT;nsYmbXk$xa zkEbl9GQ&VK$77|>N3{$le}Q@N&oMr3jN4FSNW42i2Bh-i)k4UCj|bn4x3)4zqIUd+ zetsq=vUQ6ePr@fI+c%<_#vyh_(aPIF?Zj*fZs<09BAasC_O zKs64yjj5VPgbvhK6>0i4n|!j6kO6!Q^9AhiIMN{R4|H`b)JTV%e;eLT?u#30=#Ka@ zVV3db$6N!EC2q52!dL#adx3XGeP{~7by{?RPQ+o0_i2{ic5c*`q}|EFRW?oboHk!UnPqr+p82iN%nLnO3YS~MOa0&(*}=BV z3KmuO2B3Jci(gsQf82m;IGuC$pTQ2CQQf}7$0tzlx2osmtJ8rxi$JI3N+a6xmpSR{ z`?-zY9ziENQS-WYX3+cRZjCl5sQlg2_$q2}hH}Sphw#AJMpJaAZm2BRx)xf4{wUZ& z^G!ift!uAYAdr@@U@8+w;+&cfHt1|xjlJpUPswjPU2bC-e-}5sie@6=dt-MCnAOiV zpsZ;k{@pE?=DZ9@?l$P&Tdm)T>lmM{3(H4uAlttZFK(7i_8Ju@8&gj^Kg#wOe5joM zy@5$WLr<3SINE1EDWMF@V_1>|n;o4m-A!*5)1$gYJzsnbXTHdA4710EE>DJcG4U3H z_faQ7By+OIe=gnZ@qFTH_^~!Z4}Du>&tNkOKD&jsmn8TM!`WiN;5Uadj!_*$COdY1 znt~*pJ4$U%_yRJ?ZL5Z~Q4&xuJZNCFP)z%jyAZX!sdZxZw>46+Sg|-4_0_ zwJmXF!q%FSn@^vwZ(3K8uSwNvt%gz+dt(jn6V&t<`O2)#$Wv)XHq;Oby=B4#T~}a@JSi%o=wn+C>!P zz|Vq{JCh8_*j%R(>UM5Hknz{Ru1C}A5^nPaCSbP~QxP%ivk_jwz&oAsZAGs#ZYUk; ze_EgATnYLZWnd5E!d(?5P_ViCLHDnJ!nXZVf{uti>MdSoN0^ zwHBM*fVz^#p#h_%?{m;bqcbZZc-e_$=vaI+>O7O@tatZ7G#inO)Z?+e79P*KMx zcizotNm+}19{ucl35pvSm*>RK+|05{XmuVJ#N(&7s4>A(G(C9-k4E zAMWYwf#W5&Q7!H)1fn={y)Q{Df5M$jU~+W2b*(P|6{};EINC@vC1*tmMIYV%d|e>( zxHl88W=&Rdv3qSu#+vN%1kyMP{O#hqAV;;V$wKt(O^gzb=I#AG=VHaJ!3L$R$$rkH zd@mgC>g%w(vqGMKj~$w!L%+F8g6xu__r7IiUTi`30jnF?q z79B2YG+|HKdaUkM|K;?Z%a^+qOIuI+ZN5EhS5svxs#}p2-PPNpT2^EsS4tjXUK}&vaB5zlU~?VE@O3RZ z)|57z+bEZEjlYSES|ZN<%%vqd{LU=#^e=Z-XmPter#L2M9ckm_l%hbv zO02PtGm280R$?(XClrMmR$>tc=M(z^R$_}aoy1d6wi4^PJcl=;e`F=radM_iBw;1? 
zSo}m;mwqL7aCM$+K*dU|;!K6V-$%vyyTkE7Ux7PofY9YW^e(+iJLIS6P&hS3SHb4` zZhQ0d&0X>WVwi$vO>o81)|7&%m#`|E@p4vWr|nN}T+NSaS(k+zHQ^&tR%IV&T43;; z1DDlyPk=L9Y=3fRe})#f+d0lLDeFiZCv%(v1uL<}I_5Z~G_Ay9Zss_J8dhQv2XmZ# z0V}b^n&vnal&!>iF6TH$l&r)$tMD`?BWWe}axy0@lCTncEIudPrC*61T+Im&s91?r zoT&)>!vV(pZG?{(MDNfWgN!7H_PWHe@6tFOes`8Q6scRce;on^?yQl(*F)#(@ukfn zkg%3{bR})8O2p0eUt3p`#FKhfW~t7#Vrw#*SlYVmNBu(H#IOw4XTd>n>$2f*;PQyV zxBY&u!G1zv>$080HN#0g>#`K_AN#NWT!WFAly%wX2?X$b21y9Q`-I*(wW6wQSlzm; zxCJp{zD^*je=@SKt0|TOS?jV}bywbX5<2oeCM#=Qc5^1?dmI%m2TomnXO{RDxpd*z zVPDb;IV`gP)e49V^B337{j5s0^2{0@}n*f8w4-hYP+5J_pO!{5@_r7Dx{e zI~Z9$`YmEdanBF&#X;TICUzBf>{NS;hMsN=&}DaGg&h5~c;p*WI)!qRfe=s3`EkAu z6^@f)^LYKuxlP&{TwBBi{cBF9+>6YQ;>F8IpWKNJ@Y>BcC*z|Ivy0gO?4S1#ep_~b#5_4Vm=kExI*B)`XpGG9K^cbMJ89_HW% zo|lh1&+8=SFnG?OADqJd;*sxCyNN#?ffNwN`SKR%k-M=$a)?WiEI$P7GP{X=iD0tj zqt4N~i1+ROUrWcirgVMJgQ6g($+PTj4x{vL?cjnO#L@ADo-AHc+Qd%c9{pH+!KmSM ze-dNVApzN`T)ZdLA$AgXROKx`HN=r_oJ@WU2ymPL%k6w0A$M&97xXVb{ont&co}F& zH(z~hdM69QqcR*Ph70<;e}Qjc`AxDd+Rf)OE~0I40%v~(5}YSPH9_svAiM_m7I7f} z8NG;t9QavqTD*Mp$?^8W-@s)Hf^3P~f4OCqtR!icy3u`7!dmQ69S~aX(2|OpwOGl8 zj32?tPmbUTJzabzRzU8;1|R(Z>>+&q>}SuQKrPMEb{G(_xleLDAA#pbYq8#-L)0&e-ejA zVC1fC;DSCv!%+~F+2X6WUFOa8KiK1g#;JHox3#``&&Wno6i=6_)z+?c3w_ zvE0W|@E6!SG*Mf~Dr~d;iJq%PdbS{Yi2{&2G>|~eI;?c_NWb5}^zxX&0tPU4X(y4M zby(`-Hzp1L^eyv~`~b2wm~2$oe>!Zp{lvbXKVMmg3kx?)!o3>jAE(@3+QY-wOMCtM2Ko zppUnz-krmKT?Y0rE|%}x@w`rA9)lk#T>b=ao7YLqGmxK9vl}~D<>^v8e~W(z@@FTp zZ-|}6J@;b=i<=#0cd@5kDL3)wxm{S`m#A>CC7)+?=(}oEhU?UD=Dlh}gmas?;g5Y6 zC8TC`5^EgGLk1V$Bh?{x5_cH*S%?~@#oNR#vzyrC**XWli#o)cn=&up`Iqb8AAT%&wHIlj*65}#fH1b2mBfN^j&f~ zY4rlvmZBRr9eW)FuuO{Iz-8&%n{23DomJiI`SbsBTx2T&Yb>o2+E$c=t1Y9l0@q|q zXHssX46(rFU5X0n`$@JsL90os%{DqF_eoPfTsx?{WHhbEV$OtYf5~aJ8^i*a-{vzY zVdEsr3<_CQ@~o|O(~!6>d%6>I0-$g-`NDG#^y-`xK{;9LNVC*f5GfI zPkm&d@Ej#i0I$nCDvBdgoY*G=AaUZsw#)1!_GN+Z+WoP3+*`aZV*bg`UL1JpY0-A8 zeD>qQO_8)D=gHI(fA?2ZoE|}zLo|kW;ss7wntS`X5CGyB?)?HLix;&Hvy<4D`!x0w z2iJ#onH|MGPhsI8>J9DY`tg3z=_?VtbK@^P!yq7cZ-d7ydO<&5KR&uLM5DmHH5>?F 
z{&(WV=eRq>j^bYU$IL%Fh&xa0Chq9H06K^|&+8=S=rtF&exR=w zjH7IE$4U*Six~grAD$PFGDo_({`2XpJ7&iw-~77$`o&G`F|+H~fA*8okK@aAj@tI* z`XS1TuZKrJDAKY02 zkvq2m8XW~$f3yeR;B_2-gJ~4xpZy{@VK<#A2%~XzMu~&-UOj3zw^WVFSVfX7e}7}upoDeU!=VT#e&7Y6r*wWIbAy99pc4zGl=96qjeH*Q22X)`CZa&UMDe+f(+*HP<#Xa zdGRJ+f5hJ1q+7qUQdw(BwLyW)wn0^;RMAJ4ma%;4XMZ|O@z$5gma zjn3ICXJ%>3Sb<$!D1b@mFTTB!XT>9gg(q)>^scS%V09Tb9w)?vK= z{FkGcSH$%D?a$W*Vqb@f6(q;z38XRV+HDu@e>ei@qF=vdWnSPe9>{J$RzwFfOj@NV#|hQQ`x9i*;j$ic z_4K&5)B`*s9WkpGG3m|ijT<8grMR{atq!x$jY%qY*H-u=0aXrmV+c-wYg@RWzkEWU zf0mzktwStJ=wH=G)EWTph=5-Qth`|?+H^=H`7w3=y z*J-f)T7Zb%xfLAne|;_%KNHk&I#>%6e~`Pi`|^3e&+XU(F4z-*51{*tj~_e4PU4<| z|R^5t%If?TIiwQc2#e|D8x zoC5dOxS&wy``hjPtt)FZ3Y=Tx+rxGZi#hU|oLkJvT>X#&=hkq-zhA!T**lGIe^fY# zapdsy{nxU{fFGO|k9LdKMa&=lEL`59EzjyE)|b?ezh!=s`+^j!@a(_@ z9Ium@gDj8oBKp2~)Z4^P;vQjFq7Lp0=n%V^R~-?tg2Y&UhG{^+8f@W2LJGnR4X>9k z8!cjXHrdssL^5htkUY)Jfb|axfBS})6to6gIg#)J3)0sK9Qaufu{*Is>__3^k#9(y z#2K}x{_se^3I3ayqoJdhOchf-CKWi8|Bz@vHFeT0~@C|Ex7En+8ezaX0AQ&FVf$^vH5@*53@6u7X)@1-C68C={cV4K%X z%%k9W58-p}=Jp9OzY9w|e}UlSXUD}CLE5~IVqRv;-|5++brbI}KShUP`H1tpZem`b zp^=Lu=XtyS602VqUd~bw)@g}@ZWB%x8-^(T4&vludf5v|M)AiZ^2PS`k zJv;D{JFtX6g;T4z;PzvXA0Ya?c#Yp--h7N=6ta2zb&nk@=XXhwT~;b<4JqbKLkyQ6 zZgq&A#69p22N;LH`Y(QS$NhpP5c0dU1XX}%|6JLktH8ZA=!~dad+-{ZSOoZ4aQaMU z#~0sFCLniXgCsm=f6I>{+q{lqj#@|u@vd2&#QNZ%7OXj)?|Pb);xKZz1_ZdV1$;fk{`2yaxou(xeF(9D6(qvuj~rx)e+Jh{=%AlDEoKciav|Y+>1WFq z6pnTA`@;p~t}Va$>Z5|{Y(PGt#JP1`@E0(6{ubdQ=htcEYDe*y3O82C{1<
ST* zVjZQxL9A<5eLkw5^4GOCoNi+LT)<~%<3}O=@*}8G8LLT_&DKxCJ^bOs^_r}r^;pY^kTY2RGy>1+ zB-Uvh;HBr~d&*sAC$Y~!953Hca;#2b{Tju~FV^s^e@}_5bG5-tf4>5dQI^r!_C-FW-`SaydiQRfG9{C=(6AM5X!1Eb6*dX4cb`t+A zT7ET!BXtqy|JOf3_%TZSBmj?Dw0z1B2ykPIBm_U`Hd23iJM_q%*Z`JAxd+#YA72Uw zTY1I&f8E2qiw~HUWUVLNTqyY;kl^{eo0GrziIE<)8?Wu7GFFi!%g;CsN?3e^yNMP4!FlWTLY0)cK`qBOQrzsuKx&%{KuS z{puINiTk*V;qgej2BBa03Q6>ucajHDfFR%81^H8%JoqTlPxgS%+2-zy>|WwW+X8jD z>>}S~nO?;|g83`lKjDkBY~yYVklHqZ*bJT;@St4>`2;31=WIZQyu0+0r+)Ule?vc{ zyV!0<&6wx#%|}#8R&K;^;)ryPn;O|B(=sdvTpEhVLaEOj&hVMRn zjDqJFgw*Nd_NV)=bQqJ_W84081J7w$fCqTwz-`n53>{`69!UV3nm?kU+b7hX>)lQ{ z3d@Y}WIbKCKhe~jB@yk*4QRH&f9>v$j&&+7kg=~;N&hJ`u<^q< z|JUaF_F?-kv&s~sO%RyZdlDoDd0-rUustTU^Ti5x%;t?5`PZ~clcBItf4RxFqfb^kZ$oh>RHA->485T-rlQ$s4L}UC%ZC2hfgd0! z*?fgs7N7uaJ1|fb;u%G|_05>cqc}ps=l{w#H{|1Q|7A8y!MG6MAW`#?%t0E*OTdNC z4{hi%qOWCvl2Rj^Qj|&Le@)n48S#rEi}on<*k@`gn>k4?NwCYx+2X>tF7{G{{xPVK zr?HVKj--o(4FprG6qW8_BYDa+m_x>9#5MMYWiFyT=PVAJJPe8n6`68Po+B{0Km*Sds0E2_`x` z4j18~9i{vWEIz_A3FjqSBu0NE;75wUKuF`}fNc{cFMb?_yEsY!YHf4P5zYktj4}I` z`AHsY?!=PD-W)+ecV+FKsk>IyEt-l{sNcUkqrM4tc^Ul!e|9O1qhJAIG;j~HD6IzW z^O7A*q+4EZjMvUzFn;`Gcc zg`XXP{UQO&us^A2DcgSbljwkkz}L7-u?2DWG>c|>sIBw5i}+n zhdHWE_jId0W}JhI><;JFYNoP~r$#4{J3D${1{yiLmw%0Rn@!cUVda#F`n<+EI8Zqi z_ERPrOA)$~Chn#g6ZhB9&`;dwUD1rp#%B&pq=QE8hI zxL#zje+3KADc&Tcy>c@IOnT?l@m>4yac&>;%v#!RGb7Uqn-$0b^uEL@V~LwtGZ6!I z5uvr6?Q#G@`w5sQewv?PVIR4`*}^U=n<=8xYjuC4O5hz{1J7sRtESi%paggdJ6Bq0 zvC^0bX{M!3uFTjHCnB1bH@%o+OP`QwTHySKe^a!W#A1;xWn!uciF4@@k+&9Ob_ba4 z&M*YA#nRqqFgYMSO_A^R7dC@NRwrhn9IKvp2S1MYe(=1*Q~yzhcwlz1$lmj8(DVfV zhWojRs{YX2JZ$R6B27JXvO8`P4w}j$iu2l04YvYGhL+aaQ@WM=mi#f=SuNVW&CuBn zfAIh&g&!s4RQ(+I2Aed8u_-9nb>pCEe-Tj`Jcs+uE&6l&v%&cpiZg<3uxDo&noXjS zFGeR?hBe%=>i^U`PVr(amg39vY`%Y;8N^5h2EM@DP1#>D>;u>yuq7j`ofTt@* z&ES3MF!rE`F? 
z_StCV?4aX-Uv9Rxqks=Apy7XDISNRrk#~qOP-Ax?L*T~l zCWe45OWjAu?>O7BS(bq!(K@8@?hAEl)#AZRS z+r%bqRc&#TzN-*FWQAX==@>_}oI8#|jti0vuN8y%B8L^@oCZZ~FhYG@EgvES6{~@g zHe!$7(9O$`wQT`SBC_4Mb{WLBocW?NzVKuh9`Gu|{=3Bj&X9?TXTm17_KQ<)n}491 zi>ew6kf5s@+PVrBP5w!U^0ZVb8DKO5Hs(o3?ZxI9YI(-9KO4&%|VGA%Z2IIuZ_9XYE>mf}c9$v0S{J$AfJ9isxD zQ~rhgG3aEI@Iq+%$D~0KVP}4N`hOl*8wP&CUw?nlc5K4n@-P42E+>V?3gqrzMk>h(f|$mUKEM`3j)XnKI> zm|UU(l8}Aep*|Vz2jClQ@)HQvIVg^hhEWumA3R6N(GoWL*FWc*&)aYKuYRA80g2Jc z{V(KWZ6QC3%M9#tkmOKvOMlZiCH-sA(;S7bBf}yx7d-PPU<>||qiW;xW&8UBIkWZv zk6To>^qfc#ut9uO*USaIWoH^^z$g9=@PsAzY1~|%Dyc+TbXC!z1^KP@dC<@{L~*!X zJ@ui|N=Wa)cskjT*9C7hrg+pHmc?cBh(Fz;Pg{_^L^^dqjn&R94Sz-b$)KVcB%xc= z5Z1?xIjd0x?$0hUR65^v1LN|nv+-aA`3b(>z%s!b8*g(4$pCc)k?i+?Y|>_!-Sj77GOH_b=w%hwu+4hzhSBdWSKkL-h>p;KgHKodOe zW@{D-LHlEcCxdu~36zMb{;1z^Zr}!KnvW)}%rsB4L9L0H_AKfkADc2~bL$3%6HSp< z=+!)YW)~g}>f*Sn`N)?Ya|)$>mX@GQvQUS7_DQM%d@C-qa(~d6Sf-%RHmwt*QTAqw z|JBc-1Vn^O??c5O$q=&R0g){u?1Gb@9Lmf$Q-)yT-MmvCOJO!G%Kyn+bRM$+5c`u8x;>Ee3W#dI?$?9Bzx17?QzIbgcM!mWW z#lDzVg~2ufd4CdvYH^B#9bKrt8!e_!l5xB8$4aE9!zfqkMXH*}bDJSGSNSJR(2!-b zC2BpqtR5I|Z>ph%{l74C~i7!HjoAabksC z{Yh?XSxAn)_rz8%(xo5BNLIFJ3FPS|3>PK(F8=pxs(*VNMr102ODE~dpxCkNLo5ua z2D#YCkPvAPdWO(LaKQHfP;~0_)iQ(W(0mY>p5G8Bv8M_@l5aTP{Io&;n?xo*wWCtJ zMsWl;-^d^RUKWKBLeoD6nnl>JD4*{)Ti#X3XNafYG)fu462bq4|Gj38-0Ndqnqwz9 zmZpnhvVZMskP8pJ=`BM$#F1!DjL5)i@6Q+nXT@zS$vst;>Fgv{A(Jx<#f(sNHPNj= z%s-ZVcCsn3OVpJ}{;e-XFzb>I<`7DvU(xh|&ruw=4s_y8Pchv{_5~JT4`I7gs8%Up z_&(ol^I5awFpXR7U`25&z>i`9iVfZfg<5;93x5jr*s4oc7NEl`W!U#Y!Q9m^K#7Lx zO@C<_zU7jS-F&+Y)3SENgi&abEMUM=<=a!^lC3}5idd(fdt~4gL>y@Wh342qjNkS) zDX5Dr8M574lAqZWa+PkDBGk0Fxkig-5L;=ow%$tw;*eemWYG!~wYk~F02a?17g2JR z7k}Z0*x!SA&`6}uhy(qMgLo(mdGc@UjuX1MLjtEhck=!X4eGI=PpGz}X1>~zCsmw1 z6x2IrY2elOb$C^usky;pnmLH0I#EY3n+uc!E|PwNd6U-o z$qql!R>s3MPejB~Lhi%?$;<8qJ9#5ZW0milTG8lR+A1lkhfqC?>S0u`%RT;g4E#i20|(bU5fWdD6W%!z#he+MXZi{VhR5Ub&qPvk>_oR?1^P@|MH`ZTCK7;( zQx$hqK}8u%&iI%D`X8S?hf&gqn}2>)TQh>LEX6^41lnt@#4 
z8_#R^)`D{gN9Tlziux_D-*pqS^gBsb*kt-%E%v6DWs!TRYlWDHIx|)j$oG_xl0)%6VTrMV6SAHCAEbr>S#F0f z8RXe5!o&4et?-=1Jgt7H6{3lrE#|CxvgTX^1YEz;PWvI|;X0$n6uFo}qDN|5I52OoPED=PUVL=^MB!K&X)WQh{ij6S1xnGkewtOE76GHe$=P3KlZsJ!b*Dfj{}e9c!9E~6-%RJ35G zp^~Q4D@ua63_-q(o@i1J>P{+jPEGa1lX~R)U#41;NiE+?P%b3q`+t;}>y?!9wUq5> zB?V|L1t(caq4~a&@^V>6g@1i}d;jhJ>zC_?-SyKi+hs)g+0p9~56L7u84|w$xGxdU zo||jB*5G4U(t<5`Bkdw1&0m?RF6*+-m+Q}W3t9-}c*58S3w>}#A+s{Ex#eXHRk-Pe z3xF>qAbqV`2K{kY|9@bhrk)n`IzNi$3%PKXSDt&wGHcIVAnU8oeQc@qXFizSh2S2t z%u+BHXvRg#K8xZqWdmSsp^`6fLri272wT*$GuzIwiopkzG%I{L%2?K7-h~}tZ?-^L z;uYN6ToV|N)lTQd5L7Ur9~oW0EtEmrHfYBDF;TKC{9gDmz!o2;4!l_JPuGxA9clEo?e1y~ zDb-I!Z_n95ZGYfZd#2!Jo{l0?G}%KW+~`|WSm>Q0!qWh_c}ENI$Rbe(pqf2S+s`FR#&X9JB2Ke94WU#0EGnB=E_7Bub43d)W<<+hfhpmsn1!l z{HKs51Ak{|sRqo11jZ+oDtK`owlRrVG4y8;8=e3bD%`17^j;e3`k%vY$-^VQ~XzIr^JuP)8ztKVj<+g|3YYF@3E##gXs z^KBMKh9jWDv0?Lm<3gaj!J~dhe~(9xMzGbRd4ESV71sP}c^KOTmKn%4VXO~jd(cvY zSuT{_;cORJWrzTupjzvp%I%Ah3;ecS>5JsQzL z)I5OT62!Kw-bEec##h7uav;veVeOWHKD1bc%}I_%!0&=_AxOzo%*&=UaR;6L>&#UlUc7JD9xW;&d&t{wZD(3O$?MjC8TIn!v<#C#~ zsf_2X$ItUNgXz3Y#H+=8Pu$t&5+gU55{I$me%$=ujsR7(A^6b_SUr7zb&vo2Tk{Hn zz%N{Vf@kn=s5>GEoL;=@hxb?Pt^*3g{W22dk+hrbkU8+P;M9#Be)j%qOMkscxSLoRl9){?;?0>uJ0!dJkG*;y!fm#sLOqIzLcyJIU1BPH<9@V#M z;y^738KxJFEukFC%QT0sKi@iB7igLiUxA~@hJ$dyr#aY`WL2IjSO;dB58un6$c2M& z!KXQLc9Xg)Q5~)eBtc29Ayy@%q1qsmWNafoD_@Pn!mvE2nn(8tl-GfhpnsFrs*&u? 
zaf7Qqm^ATgl`s9( zs1F?Cgm8+T=C=kUJX`G*A@Y#Xpur zQ64D6t$<-v8J(?)f+|FQL4V=T3Zg22N)7{f6yl1dkgi@to=;{Z?p@SkR5B*ftAl(G7ezV?2!&YUB#`TkF^V@;c&`=y zPJ}wDL%O!Ch*^NKD4rmcm~SL+y}$Z||BT>#Xwb$5QoJ|x^?wo62sJ2K;>Xhp>o@Ui zF#9-xiJH4}iudru@OV|-#yYLNVS2*=$nsbPaMK5!(m^kI!vS(s;mO^v#i<~{iH0ip zT^DdtL47GMxnUmHH);~9@LnDMt_L|SqN@9qXQ5kC7)pEo`K|$%n0#(Tk%jRpZB2BM zZEy^zE$t(wsDCa2R8yxgtH(*HL*0+gg#C6Cg4n~GjtIT7@D&SKCGR4Pc1eMD-7j9dBoFpqiT3f=bP-Cdt!kv=6 z#WW8MjZy*9GJX|kJ8E=Tm3E27t9chSUtg=rAfH73uz#;^)QsR&i*6z^9O0V^!{a5_ zS9L1Mi4uGa4b}Ho>_nQvh_a=L0C@g=nlqo|%C`wFY??JP+( zey)jRvLvrlKP~wrR7yEE=ch==Lec3wF-biF87lzzrS~IaX#D#J$*~NI#M{mJ=?tE4TP_`$4AFVR}jN8eD#a|o=kZ`9L5d) z_!s`DA_4@tK06M4ce7REJ7)%bDDd6qClx-k39ua9mVW&_Hn}bqRFEFtQC~>~@QwbJ z=KiP5v<7xK9IBj2@VBj}@07Sd_#y~BI}lB8$RDpAK?^#q*Ctf$y!8oD9=(TGMme1- zjl6n(Tj?1&nK);#iw@a$^C*Y5iQs%UIx0VsjMs?Tc+| zpq!W$TUC)Y5Z6;iN~*Hau1l@dh<}ydJu?1Pn)xVbV3gvuT8!v5h&Li0RjsjEisJBt zr;LVK%e;VRy5FlffvSY;Gl(jp|9ypTngvPtzv%*7aUzGaQsvEWBdQyI55ZT3hw^)j zp03Jac6joHPfzEq;RXwK&O-SUC-5b2hvV zgNYkXPiyO-v8LAU!S;SZ6>T9rYF0@oB(0Ay#M{m-6&hrZJ2mOq+a6f3!Q}9 z7w@ld(6`)^cM6=mkva-H3sJoEUQwH%84lfKU zEZ<;Vc+#V)|Mevy6o0Qqxp}y+8xj>M-Q1(4ga_e1>*}a|m9YC1tqAMg(s(sSsU*Ey zdJOa;oh@UU6wJCn>g$QZ|8rFrJ!Ni~0W(t&gsOxo(StMt0omARrCw_AxVq`KL!w?k z^<2kXEI(oH$o|^HhPFoBt+Z(Lb4=&RWtOL@A{l`^03a*#Tt$4g?ef7k~nl-K_ z0s4e|vA;4{W$OCaBh-+ZU1kjooP@({j}Nm*>(w12;Q16G?m&KB}W6(NI6QklNn zFd68o4cuuwtu%q=X45nmIPXCiqQN7Gb!HOz;P0Q2d{x1=uJ-! 
zwxoFc0m4d_&Pl?SMBUcBEe)g8k`$WumK38625mhxR4qz%s{vMqx3{Dj4D?Fm2sSEWXEHh1$8z z;`fJ&Pf<5BUpMwfs_9>^lR|dvY*I5Yd3u!trB5OC>=Na9X_&SQ%41NE4EdgXm4BAS z-NB`p{b|Veb^dwV`Q@FWh7XmN9*~}yn7%hv&hJLqnr?d86TU1GyTdoq%C1RgQ0(&8 z8}dz7m7Zq^;u4~9Owbqw{qRnK`h$O!K-8`A_pmIk(ihDx9Q6X>VJMG4c?9a32Y>wY zCRb9XyLn2_B?YDgiN7aHCC4DU@_&N>YlKMzN^T8 zxp``le`ar}#N^-;$AK{n(6}SSSG2hjdzM|^r%1==9AJGa3XTTsX@%!)e<3|ZS zLg;aX9_I7>#H;x5>mK8~`Y12+H%_KSOFTzOI9s$a_*5K;0F?-xKzzOmPJgDhKYxxwpM-77zk#d<&ui*}{OZq!rl}XCg`Q(gAIUV}`>W3n zZBN7W`8zfK4@^cEMax)B*&F)T>%`PlFZ$O%>vaEENMx}n$RNIM9%U^I^|tedsuq*+ zRwYVrz*Px2*D)%@F*4Iu!+Yd`*G&uS1~r;tt7X?Hs2P(i$$g z){vEiLwWqN<%fOb={qIv4^%-1LeCaN(;M=~Ye&#XXNqTU!?MuvP+5n4fAxqz-eQ=y zyM| zdfWe9Lyu9**9g;W`WhMOc*KnLHJ?2}qS!L4_J+iIGvZ@+zo63CbQ#Ee=g$gP`eW`-d<|5@LI>;ZdZC5k%#D1Vt zsz^ALG$i=j)_+qqZj;hPMd+0!N}h}Kn`j|_ytZ}CjFiCFL+n56Y)PrZqJ*kp-u8Y~ z*P=fn=|b?z$XZLc>)#EaE!HTbbyaKTxvr$V9rtu9?v}(iPskEjXAIU$q9$K$Cip z)^#HNCR#c*7Oka>jFg{rk|#f*#MyqjLjQjJ#_jk=Kd4ZD!1V90@JH}FQu$hG`YQgYL- z4O3v)t)W_UdlO89Vo!o{8ucO=v33uFN+@%1D1YOtI7*(4;!klrit`5tswU^PE?IkF z$ug11usw?Gwd(>)Mfx1+%oO5K@2fkv@fQWQ4_#(O>k#fYJjk>TctpiZV)}%>_a2{U z!yiV`KJ3dVF1#qY;^8ME!wB9~VZAEs&mFLF>JS_vvW1UcL#Cskg77FrE#y`VJ+4QC zzkdqyAd8PRk-gu3a`pefRFw-ym258e6Ay)c=p%|HoWU7J|1=lIjpZNi-S=0&!oBx^ zw>Dq>iA11Bm=0g&_L)O`+jwVwH_89uZv+_=$d@)GFY;~U1X`x{;WfXh3A~1YTw=$) z<#9f%XNY6h-U{mZpqzkdPR^Q&lw8Qh0)Jw{s=Xe|KuyEZ0}U@Z(haQdFo)aP?6756d&-lp$FV4 zN?{zKVX?9Y2TsGA3e2NmZO;uqS4@wLNy2#pC$#>E22^P(SM~DR>d#&kzP1<|Pmw3` z8cm1j14-2M#G|FnBTB>Z!b*{3>j+o~|G?aN$5y zvULVhn3Ib?CAM$zFTY1)%jVlO_=g_TcsyHNZ(4z6hzQgMCA>5%qxSbvv6k3VHF zmfF1#@9{w`#PhuzKBv_Sq+OZx>CNkuzPb#>C}T0w6NM1G8bQZEhDK?ED@R`byorcc z=A22>*&54fB()v-oJ&gvK=y8(uQOe(V}vP|7CgRg=?o?z%n`6kw3Cs^|^l`-v{+lLP L!}R|FrJeR{I?_NW